conn_linux.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662
  1. //+build linux
  2. package netlink
  3. import (
  4. "math"
  5. "os"
  6. "runtime"
  7. "sync"
  8. "syscall"
  9. "time"
  10. "unsafe"
  11. "golang.org/x/net/bpf"
  12. "golang.org/x/sys/unix"
  13. )
  14. var _ Socket = &conn{}
  15. var _ deadlineSetter = &conn{}
// A conn is the Linux implementation of a netlink sockets connection.
//
// All conn methods must wrap system call errors with os.NewSyscallError to
// enable more intelligible error messages in OpError.
type conn struct {
	// s is the socket every conn method delegates its operations to.
	s socket
	// sa is the netlink address that was passed to Bind when the conn
	// was created.
	sa *unix.SockaddrNetlink
}
// A socket is an interface over socket system calls.
//
// It exists so that conn can be driven either by the system call backed
// sysSocket or by a mocked implementation in tests.
type socket interface {
	Bind(sa unix.Sockaddr) error
	Close() error
	FD() int
	File() *os.File
	Getsockname() (unix.Sockaddr, error)
	Recvmsg(p, oob []byte, flags int) (n int, oobn int, recvflags int, from unix.Sockaddr, err error)
	Sendmsg(p, oob []byte, to unix.Sockaddr, flags int) error
	SetDeadline(t time.Time) error
	SetReadDeadline(t time.Time) error
	SetWriteDeadline(t time.Time) error
	SetSockoptSockFprog(level, opt int, fprog *unix.SockFprog) error
	SetSockoptInt(level, opt, value int) error
	GetSockoptInt(level, opt int) (int, error)
}
  40. // dial is the entry point for Dial. dial opens a netlink socket using
  41. // system calls, and returns its PID.
  42. func dial(family int, config *Config) (*conn, uint32, error) {
  43. // Prepare sysSocket's internal loop and create the socket.
  44. //
  45. // The conditional is inverted because a zero value of false is desired
  46. // if no config, but it's easier to interpret within this code when the
  47. // value is inverted.
  48. if config == nil {
  49. config = &Config{}
  50. }
  51. // The caller has indicated it wants the netlink socket to be created
  52. // inside another network namespace.
  53. if config.NetNS != 0 {
  54. runtime.LockOSThread()
  55. defer runtime.UnlockOSThread()
  56. // Retrieve and store the calling OS thread's network namespace so
  57. // the thread can be reassigned to it after creating a socket in another
  58. // network namespace.
  59. threadNS, err := threadNetNS()
  60. if err != nil {
  61. return nil, 0, err
  62. }
  63. // Always close the netns handle created above.
  64. defer threadNS.Close()
  65. // Assign the current OS thread the goroutine is locked to to the given
  66. // network namespace.
  67. if err := threadNS.Set(config.NetNS); err != nil {
  68. return nil, 0, err
  69. }
  70. // Thread's namespace has been successfully set. Return the thread
  71. // back to its original namespace after attempting to create the
  72. // netlink socket.
  73. defer threadNS.Restore()
  74. }
  75. sock := &sysSocket{}
  76. if err := sock.Socket(family); err != nil {
  77. return nil, 0, os.NewSyscallError("socket", err)
  78. }
  79. return bind(sock, config)
  80. }
  81. // bind binds a connection to netlink using the input socket, which may be
  82. // a system call implementation or a mocked one for tests.
  83. func bind(s socket, config *Config) (*conn, uint32, error) {
  84. if config == nil {
  85. config = &Config{}
  86. }
  87. addr := &unix.SockaddrNetlink{
  88. Family: unix.AF_NETLINK,
  89. Groups: config.Groups,
  90. }
  91. // Socket must be closed in the event of any system call errors, to avoid
  92. // leaking file descriptors.
  93. if err := s.Bind(addr); err != nil {
  94. _ = s.Close()
  95. return nil, 0, os.NewSyscallError("bind", err)
  96. }
  97. sa, err := s.Getsockname()
  98. if err != nil {
  99. _ = s.Close()
  100. return nil, 0, os.NewSyscallError("getsockname", err)
  101. }
  102. pid := sa.(*unix.SockaddrNetlink).Pid
  103. return &conn{
  104. s: s,
  105. sa: addr,
  106. }, pid, nil
  107. }
  108. // SendMessages serializes multiple Messages and sends them to netlink.
  109. func (c *conn) SendMessages(messages []Message) error {
  110. var buf []byte
  111. for _, m := range messages {
  112. b, err := m.MarshalBinary()
  113. if err != nil {
  114. return err
  115. }
  116. buf = append(buf, b...)
  117. }
  118. addr := &unix.SockaddrNetlink{
  119. Family: unix.AF_NETLINK,
  120. }
  121. return os.NewSyscallError("sendmsg", c.s.Sendmsg(buf, nil, addr, 0))
  122. }
  123. // Send sends a single Message to netlink.
  124. func (c *conn) Send(m Message) error {
  125. b, err := m.MarshalBinary()
  126. if err != nil {
  127. return err
  128. }
  129. addr := &unix.SockaddrNetlink{
  130. Family: unix.AF_NETLINK,
  131. }
  132. return os.NewSyscallError("sendmsg", c.s.Sendmsg(b, nil, addr, 0))
  133. }
  134. // Receive receives one or more Messages from netlink.
  135. func (c *conn) Receive() ([]Message, error) {
  136. b := make([]byte, os.Getpagesize())
  137. for {
  138. // Peek at the buffer to see how many bytes are available.
  139. //
  140. // TODO(mdlayher): deal with OOB message data if available, such as
  141. // when PacketInfo ConnOption is true.
  142. n, _, _, _, err := c.s.Recvmsg(b, nil, unix.MSG_PEEK)
  143. if err != nil {
  144. return nil, os.NewSyscallError("recvmsg", err)
  145. }
  146. // Break when we can read all messages
  147. if n < len(b) {
  148. break
  149. }
  150. // Double in size if not enough bytes
  151. b = make([]byte, len(b)*2)
  152. }
  153. // Read out all available messages
  154. n, _, _, _, err := c.s.Recvmsg(b, nil, 0)
  155. if err != nil {
  156. return nil, os.NewSyscallError("recvmsg", err)
  157. }
  158. n = nlmsgAlign(n)
  159. raw, err := syscall.ParseNetlinkMessage(b[:n])
  160. if err != nil {
  161. return nil, err
  162. }
  163. msgs := make([]Message, 0, len(raw))
  164. for _, r := range raw {
  165. m := Message{
  166. Header: sysToHeader(r.Header),
  167. Data: r.Data,
  168. }
  169. msgs = append(msgs, m)
  170. }
  171. return msgs, nil
  172. }
  173. // Close closes the connection.
  174. func (c *conn) Close() error {
  175. return os.NewSyscallError("close", c.s.Close())
  176. }
  177. // FD retrieves the file descriptor of the Conn.
  178. func (c *conn) FD() int {
  179. return c.s.FD()
  180. }
  181. // File retrieves the *os.File associated with the Conn.
  182. func (c *conn) File() *os.File {
  183. return c.s.File()
  184. }
  185. // JoinGroup joins a multicast group by ID.
  186. func (c *conn) JoinGroup(group uint32) error {
  187. return os.NewSyscallError("setsockopt", c.s.SetSockoptInt(
  188. unix.SOL_NETLINK,
  189. unix.NETLINK_ADD_MEMBERSHIP,
  190. int(group),
  191. ))
  192. }
  193. // LeaveGroup leaves a multicast group by ID.
  194. func (c *conn) LeaveGroup(group uint32) error {
  195. return os.NewSyscallError("setsockopt", c.s.SetSockoptInt(
  196. unix.SOL_NETLINK,
  197. unix.NETLINK_DROP_MEMBERSHIP,
  198. int(group),
  199. ))
  200. }
  201. // SetBPF attaches an assembled BPF program to a conn.
  202. func (c *conn) SetBPF(filter []bpf.RawInstruction) error {
  203. prog := unix.SockFprog{
  204. Len: uint16(len(filter)),
  205. Filter: (*unix.SockFilter)(unsafe.Pointer(&filter[0])),
  206. }
  207. return os.NewSyscallError("setsockopt", c.s.SetSockoptSockFprog(
  208. unix.SOL_SOCKET,
  209. unix.SO_ATTACH_FILTER,
  210. &prog,
  211. ))
  212. }
  213. // RemoveBPF removes a BPF filter from a conn.
  214. func (c *conn) RemoveBPF() error {
  215. // 0 argument is ignored by SO_DETACH_FILTER.
  216. return os.NewSyscallError("setsockopt", c.s.SetSockoptInt(
  217. unix.SOL_SOCKET,
  218. unix.SO_DETACH_FILTER,
  219. 0,
  220. ))
  221. }
  222. // SetOption enables or disables a netlink socket option for the Conn.
  223. func (c *conn) SetOption(option ConnOption, enable bool) error {
  224. o, ok := linuxOption(option)
  225. if !ok {
  226. // Return the typical Linux error for an unknown ConnOption.
  227. return os.NewSyscallError("setsockopt", unix.ENOPROTOOPT)
  228. }
  229. var v int
  230. if enable {
  231. v = 1
  232. }
  233. return os.NewSyscallError("setsockopt", c.s.SetSockoptInt(
  234. unix.SOL_NETLINK,
  235. o,
  236. v,
  237. ))
  238. }
  239. func (c *conn) SetDeadline(t time.Time) error {
  240. return c.s.SetDeadline(t)
  241. }
  242. func (c *conn) SetReadDeadline(t time.Time) error {
  243. return c.s.SetReadDeadline(t)
  244. }
  245. func (c *conn) SetWriteDeadline(t time.Time) error {
  246. return c.s.SetWriteDeadline(t)
  247. }
  248. // SetReadBuffer sets the size of the operating system's receive buffer
  249. // associated with the Conn.
  250. func (c *conn) SetReadBuffer(bytes int) error {
  251. // First try SO_RCVBUFFORCE. Given necessary permissions this syscall ignores limits.
  252. err := os.NewSyscallError("setsockopt", c.s.SetSockoptInt(
  253. unix.SOL_SOCKET,
  254. unix.SO_RCVBUFFORCE,
  255. bytes,
  256. ))
  257. if err != nil {
  258. // If SO_SNDBUFFORCE fails, try SO_RCVBUF
  259. err = os.NewSyscallError("setsockopt", c.s.SetSockoptInt(
  260. unix.SOL_SOCKET,
  261. unix.SO_RCVBUF,
  262. bytes,
  263. ))
  264. }
  265. return err
  266. }
  267. // SetReadBuffer sets the size of the operating system's transmit buffer
  268. // associated with the Conn.
  269. func (c *conn) SetWriteBuffer(bytes int) error {
  270. // First try SO_SNDBUFFORCE. Given necessary permissions this syscall ignores limits.
  271. err := os.NewSyscallError("setsockopt", c.s.SetSockoptInt(
  272. unix.SOL_SOCKET,
  273. unix.SO_SNDBUFFORCE,
  274. bytes,
  275. ))
  276. if err != nil {
  277. // If SO_SNDBUFFORCE fails, try SO_SNDBUF
  278. err = os.NewSyscallError("setsockopt", c.s.SetSockoptInt(
  279. unix.SOL_SOCKET,
  280. unix.SO_SNDBUF,
  281. bytes,
  282. ))
  283. }
  284. return err
  285. }
  286. // GetReadBuffer retrieves the size of the operating system's receive buffer
  287. // associated with the Conn.
  288. func (c *conn) GetReadBuffer() (int, error) {
  289. value, err := c.s.GetSockoptInt(
  290. unix.SOL_SOCKET,
  291. unix.SO_RCVBUF,
  292. )
  293. if err != nil {
  294. return 0, os.NewSyscallError("getsockopt", err)
  295. }
  296. return value, nil
  297. }
  298. // GetWriteBuffer retrieves the size of the operating system's transmit buffer
  299. // associated with the Conn.
  300. func (c *conn) GetWriteBuffer() (int, error) {
  301. value, err := c.s.GetSockoptInt(
  302. unix.SOL_SOCKET,
  303. unix.SO_SNDBUF,
  304. )
  305. if err != nil {
  306. return 0, os.NewSyscallError("getsockopt", err)
  307. }
  308. return value, nil
  309. }
  310. // linuxOption converts a ConnOption to its Linux value.
  311. func linuxOption(o ConnOption) (int, bool) {
  312. switch o {
  313. case PacketInfo:
  314. return unix.NETLINK_PKTINFO, true
  315. case BroadcastError:
  316. return unix.NETLINK_BROADCAST_ERROR, true
  317. case NoENOBUFS:
  318. return unix.NETLINK_NO_ENOBUFS, true
  319. case ListenAllNSID:
  320. return unix.NETLINK_LISTEN_ALL_NSID, true
  321. case CapAcknowledge:
  322. return unix.NETLINK_CAP_ACK, true
  323. case ExtendedAcknowledge:
  324. return unix.NETLINK_EXT_ACK, true
  325. default:
  326. return 0, false
  327. }
  328. }
  329. // sysToHeader converts a syscall.NlMsghdr to a Header.
  330. func sysToHeader(r syscall.NlMsghdr) Header {
  331. // NB: the memory layout of Header and syscall.NlMsgHdr must be
  332. // exactly the same for this unsafe cast to work
  333. return *(*Header)(unsafe.Pointer(&r))
  334. }
  335. // newError converts an error number from netlink into the appropriate
  336. // system call error for Linux.
  337. func newError(errno int) error {
  338. return syscall.Errno(errno)
  339. }
  340. var _ socket = &sysSocket{}
// A sysSocket is a socket which uses system calls for socket operations.
type sysSocket struct {
	// mu is held for reading by read, write and control while they run and
	// for writing by Close, which keeps those operations from overlapping
	// with Close.
	mu sync.RWMutex
	// fd is the file wrapping the netlink socket descriptor; it is assigned
	// by Socket.
	fd *os.File
	// closed is set by Close; once set, read, write and control return EBADF.
	closed bool
}
  347. // read executes f, a read function, against the associated file descriptor.
  348. func (s *sysSocket) read(f func(fd int) bool) error {
  349. s.mu.RLock()
  350. defer s.mu.RUnlock()
  351. if s.closed {
  352. return syscall.EBADF
  353. }
  354. return fdread(s.fd, f)
  355. }
  356. // write executes f, a write function, against the associated file descriptor.
  357. func (s *sysSocket) write(f func(fd int) bool) error {
  358. s.mu.RLock()
  359. defer s.mu.RUnlock()
  360. if s.closed {
  361. return syscall.EBADF
  362. }
  363. return fdwrite(s.fd, f)
  364. }
  365. // control executes f, a control function, against the associated file descriptor.
  366. func (s *sysSocket) control(f func(fd int)) error {
  367. s.mu.RLock()
  368. defer s.mu.RUnlock()
  369. if s.closed {
  370. return syscall.EBADF
  371. }
  372. return fdcontrol(s.fd, f)
  373. }
  374. func (s *sysSocket) Socket(family int) error {
  375. // Mirror what the standard library does when creating file
  376. // descriptors: avoid racing a fork/exec with the creation
  377. // of new file descriptors, so that child processes do not
  378. // inherit netlink socket file descriptors unexpectedly.
  379. //
  380. // On Linux, SOCK_CLOEXEC was introduced in 2.6.27. OTOH,
  381. // Go supports Linux 2.6.23 and above. If we get EINVAL on
  382. // the first try, it may be that we are running on a kernel
  383. // older than 2.6.27. In that case, take syscall.ForkLock
  384. // and try again without SOCK_CLOEXEC.
  385. //
  386. // SOCK_NONBLOCK was also added in 2.6.27, but we don't
  387. // use SOCK_NONBLOCK here for now, not until we remove support
  388. // for Go 1.11, since we still support the old blocking file
  389. // descriptor behavior.
  390. //
  391. // For a more thorough explanation, see similar work in the
  392. // Go tree: func sysSocket in net/sock_cloexec.go, as well
  393. // as the detailed comment in syscall/exec_unix.go.
  394. //
  395. // TODO(acln): update this to mirror net.sysSocket completely:
  396. // use SOCK_NONBLOCK as well, and remove the separate
  397. // setBlockingMode step once Go 1.11 support is removed and
  398. // we switch to using entirely non-blocking file descriptors.
  399. fd, err := unix.Socket(
  400. unix.AF_NETLINK,
  401. unix.SOCK_RAW|unix.SOCK_CLOEXEC,
  402. family,
  403. )
  404. if err == unix.EINVAL {
  405. syscall.ForkLock.RLock()
  406. fd, err = unix.Socket(
  407. unix.AF_NETLINK,
  408. unix.SOCK_RAW,
  409. family,
  410. )
  411. if err == nil {
  412. unix.CloseOnExec(fd)
  413. }
  414. syscall.ForkLock.RUnlock()
  415. }
  416. if err := setBlockingMode(fd); err != nil {
  417. return err
  418. }
  419. // When using Go 1.12+, the setBlockingMode call we just did puts the
  420. // file descriptor into non-blocking mode. In that case, os.NewFile
  421. // registers the file descriptor with the runtime poller, which is
  422. // then used for all subsequent operations.
  423. //
  424. // See also: https://golang.org/pkg/os/#NewFile
  425. s.fd = os.NewFile(uintptr(fd), "netlink")
  426. return nil
  427. }
  428. func (s *sysSocket) Bind(sa unix.Sockaddr) error {
  429. var err error
  430. doErr := s.control(func(fd int) {
  431. err = unix.Bind(fd, sa)
  432. })
  433. if doErr != nil {
  434. return doErr
  435. }
  436. return err
  437. }
  438. func (s *sysSocket) Close() error {
  439. // Be sure to acquire a write lock because we need to stop any other
  440. // goroutines from sending system call requests after close.
  441. // Any invocation of do() after this write lock unlocks is guaranteed
  442. // to find s.done being true.
  443. s.mu.Lock()
  444. defer s.mu.Unlock()
  445. // Close the socket from the main thread, this operation has no risk
  446. // of routing data to the wrong socket.
  447. err := s.fd.Close()
  448. s.closed = true
  449. return err
  450. }
  451. func (s *sysSocket) FD() int { return int(s.fd.Fd()) }
  452. func (s *sysSocket) File() *os.File { return s.fd }
  453. func (s *sysSocket) Getsockname() (unix.Sockaddr, error) {
  454. var (
  455. sa unix.Sockaddr
  456. err error
  457. )
  458. doErr := s.control(func(fd int) {
  459. sa, err = unix.Getsockname(fd)
  460. })
  461. if doErr != nil {
  462. return nil, doErr
  463. }
  464. return sa, err
  465. }
  466. func (s *sysSocket) Recvmsg(p, oob []byte, flags int) (int, int, int, unix.Sockaddr, error) {
  467. var (
  468. n, oobn, recvflags int
  469. from unix.Sockaddr
  470. err error
  471. )
  472. doErr := s.read(func(fd int) bool {
  473. n, oobn, recvflags, from, err = unix.Recvmsg(fd, p, oob, flags)
  474. // Check for readiness.
  475. return ready(err)
  476. })
  477. if doErr != nil {
  478. return 0, 0, 0, nil, doErr
  479. }
  480. return n, oobn, recvflags, from, err
  481. }
  482. func (s *sysSocket) Sendmsg(p, oob []byte, to unix.Sockaddr, flags int) error {
  483. var err error
  484. doErr := s.write(func(fd int) bool {
  485. err = unix.Sendmsg(fd, p, oob, to, flags)
  486. // Check for readiness.
  487. return ready(err)
  488. })
  489. if doErr != nil {
  490. return doErr
  491. }
  492. return err
  493. }
  494. func (s *sysSocket) SetDeadline(t time.Time) error {
  495. return s.fd.SetDeadline(t)
  496. }
  497. func (s *sysSocket) SetReadDeadline(t time.Time) error {
  498. return s.fd.SetReadDeadline(t)
  499. }
  500. func (s *sysSocket) SetWriteDeadline(t time.Time) error {
  501. return s.fd.SetWriteDeadline(t)
  502. }
  503. func (s *sysSocket) SetSockoptInt(level, opt, value int) error {
  504. // Value must be in range of a C integer.
  505. if value < math.MinInt32 || value > math.MaxInt32 {
  506. return unix.EINVAL
  507. }
  508. var err error
  509. doErr := s.control(func(fd int) {
  510. err = unix.SetsockoptInt(fd, level, opt, value)
  511. })
  512. if doErr != nil {
  513. return doErr
  514. }
  515. return err
  516. }
  517. func (s *sysSocket) GetSockoptInt(level, opt int) (int, error) {
  518. var (
  519. value int
  520. err error
  521. )
  522. doErr := s.control(func(fd int) {
  523. value, err = unix.GetsockoptInt(fd, level, opt)
  524. })
  525. if doErr != nil {
  526. return 0, doErr
  527. }
  528. return value, err
  529. }
  530. func (s *sysSocket) SetSockoptSockFprog(level, opt int, fprog *unix.SockFprog) error {
  531. var err error
  532. doErr := s.control(func(fd int) {
  533. err = unix.SetsockoptSockFprog(fd, level, opt, fprog)
  534. })
  535. if doErr != nil {
  536. return doErr
  537. }
  538. return err
  539. }
  540. // ready indicates readiness based on the value of err.
  541. func ready(err error) bool {
  542. // When a socket is in non-blocking mode, we might see
  543. // EAGAIN. In that case, return false to let the poller wait for readiness.
  544. // See the source code for internal/poll.FD.RawRead for more details.
  545. //
  546. // Starting in Go 1.14, goroutines are asynchronously preemptible. The 1.14
  547. // release notes indicate that applications should expect to see EINTR more
  548. // often on slow system calls (like recvmsg while waiting for input), so
  549. // we must handle that case as well.
  550. //
  551. // If the socket is in blocking mode, EAGAIN should never occur.
  552. switch err {
  553. case syscall.EAGAIN, syscall.EINTR:
  554. // Not ready.
  555. return false
  556. default:
  557. // Ready whether there was error or no error.
  558. return true
  559. }
  560. }