conn.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594
  1. package netlink
  2. import (
  3. "errors"
  4. "math/rand"
  5. "os"
  6. "sync"
  7. "sync/atomic"
  8. "syscall"
  9. "time"
  10. "golang.org/x/net/bpf"
  11. )
  12. // A Conn is a connection to netlink. A Conn can be used to send and
  13. // receives messages to and from netlink.
  14. //
  15. // A Conn is safe for concurrent use, but to avoid contention in
  16. // high-throughput applications, the caller should almost certainly create a
  17. // pool of Conns and distribute them among workers.
  18. //
  19. // A Conn is capable of manipulating netlink subsystems from within a specific
  20. // Linux network namespace, but special care must be taken when doing so. See
  21. // the documentation of Config for details.
  22. type Conn struct {
  23. // sock is the operating system-specific implementation of
  24. // a netlink sockets connection.
  25. sock Socket
  26. // seq is an atomically incremented integer used to provide sequence
  27. // numbers when Conn.Send is called.
  28. seq *uint32
  29. // pid is the PID assigned by netlink.
  30. pid uint32
  31. // d provides debugging capabilities for a Conn if not nil.
  32. d *debugger
  33. // mu serializes access to the netlink socket for the request/response
  34. // transaction within Execute.
  35. mu sync.RWMutex
  36. }
  37. // A Socket is an operating-system specific implementation of netlink
  38. // sockets used by Conn.
  39. type Socket interface {
  40. Close() error
  41. Send(m Message) error
  42. SendMessages(m []Message) error
  43. Receive() ([]Message, error)
  44. }
  45. // Dial dials a connection to netlink, using the specified netlink family.
  46. // Config specifies optional configuration for Conn. If config is nil, a default
  47. // configuration will be used.
  48. func Dial(family int, config *Config) (*Conn, error) {
  49. // Use OS-specific dial() to create Socket
  50. c, pid, err := dial(family, config)
  51. if err != nil {
  52. return nil, err
  53. }
  54. return NewConn(c, pid), nil
  55. }
  56. // NewConn creates a Conn using the specified Socket and PID for netlink
  57. // communications.
  58. //
  59. // NewConn is primarily useful for tests. Most applications should use
  60. // Dial instead.
  61. func NewConn(sock Socket, pid uint32) *Conn {
  62. // Seed the sequence number using a random number generator.
  63. r := rand.New(rand.NewSource(time.Now().UnixNano()))
  64. seq := r.Uint32()
  65. // Configure a debugger if arguments are set.
  66. var d *debugger
  67. if len(debugArgs) > 0 {
  68. d = newDebugger(debugArgs)
  69. }
  70. return &Conn{
  71. sock: sock,
  72. seq: &seq,
  73. pid: pid,
  74. d: d,
  75. }
  76. }
  77. // debug executes fn with the debugger if the debugger is not nil.
  78. func (c *Conn) debug(fn func(d *debugger)) {
  79. if c.d == nil {
  80. return
  81. }
  82. fn(c.d)
  83. }
  84. // Close closes the connection.
  85. //
  86. // Due to a bug https://github.com/mdlayher/netlink/issues/162, Close currently
  87. // cannot unblock concurrent calls to Send or Receive. As a stop-gap measure,
  88. // call SetDeadline with a time in the past such as time.Unix(1, 0) in a
  89. // different goroutine to unblock a long-running Receive loop. The intent is to
  90. // fix this issue in v1.2.0.
  91. func (c *Conn) Close() error {
  92. // Close does not acquire a lock because it must be able to interrupt any
  93. // blocked system calls, such as when Receive is waiting on a multicast
  94. // group message.
  95. //
  96. // We rely on the kernel to deal with concurrent operations to the netlink
  97. // socket itself.
  98. return newOpError("close", c.sock.Close())
  99. }
  100. // Execute sends a single Message to netlink using Send, receives one or more
  101. // replies using Receive, and then checks the validity of the replies against
  102. // the request using Validate.
  103. //
  104. // Execute acquires a lock for the duration of the function call which blocks
  105. // concurrent calls to Send, SendMessages, and Receive, in order to ensure
  106. // consistency between netlink request/reply messages.
  107. //
  108. // See the documentation of Send, Receive, and Validate for details about
  109. // each function.
  110. func (c *Conn) Execute(message Message) ([]Message, error) {
  111. // Acquire the write lock and invoke the internal implementations of Send
  112. // and Receive which require the lock already be held.
  113. c.mu.Lock()
  114. defer c.mu.Unlock()
  115. req, err := c.lockedSend(message)
  116. if err != nil {
  117. return nil, err
  118. }
  119. replies, err := c.lockedReceive()
  120. if err != nil {
  121. return nil, err
  122. }
  123. if err := Validate(req, replies); err != nil {
  124. return nil, err
  125. }
  126. return replies, nil
  127. }
  128. // SendMessages sends multiple Messages to netlink. The handling of
  129. // a Header's Length, Sequence and PID fields is the same as when
  130. // calling Send.
  131. func (c *Conn) SendMessages(messages []Message) ([]Message, error) {
  132. // Wait for any concurrent calls to Execute to finish before proceeding.
  133. c.mu.RLock()
  134. defer c.mu.RUnlock()
  135. for idx, m := range messages {
  136. ml := nlmsgLength(len(m.Data))
  137. // TODO(mdlayher): fine-tune this limit.
  138. if ml > (1024 * 32) {
  139. return nil, errors.New("netlink message data too large")
  140. }
  141. c.fixMsg(&messages[idx], ml)
  142. }
  143. c.debug(func(d *debugger) {
  144. for _, m := range messages {
  145. d.debugf(1, "send msgs: %+v", m)
  146. }
  147. })
  148. if err := c.sock.SendMessages(messages); err != nil {
  149. c.debug(func(d *debugger) {
  150. d.debugf(1, "send msgs: err: %v", err)
  151. })
  152. return nil, newOpError("send-messages", err)
  153. }
  154. return messages, nil
  155. }
  156. // Send sends a single Message to netlink. In most cases, a Header's Length,
  157. // Sequence, and PID fields should be set to 0, so they can be populated
  158. // automatically before the Message is sent. On success, Send returns a copy
  159. // of the Message with all parameters populated, for later validation.
  160. //
  161. // If Header.Length is 0, it will be automatically populated using the
  162. // correct length for the Message, including its payload.
  163. //
  164. // If Header.Sequence is 0, it will be automatically populated using the
  165. // next sequence number for this connection.
  166. //
  167. // If Header.PID is 0, it will be automatically populated using a PID
  168. // assigned by netlink.
  169. func (c *Conn) Send(message Message) (Message, error) {
  170. // Wait for any concurrent calls to Execute to finish before proceeding.
  171. c.mu.RLock()
  172. defer c.mu.RUnlock()
  173. return c.lockedSend(message)
  174. }
  175. // lockedSend implements Send, but must be called with c.mu acquired for reading.
  176. // We rely on the kernel to deal with concurrent reads and writes to the netlink
  177. // socket itself.
  178. func (c *Conn) lockedSend(message Message) (Message, error) {
  179. ml := nlmsgLength(len(message.Data))
  180. // TODO(mdlayher): fine-tune this limit.
  181. if ml > (1024 * 32) {
  182. return Message{}, errors.New("netlink message data too large")
  183. }
  184. c.fixMsg(&message, ml)
  185. c.debug(func(d *debugger) {
  186. d.debugf(1, "send: %+v", message)
  187. })
  188. if err := c.sock.Send(message); err != nil {
  189. c.debug(func(d *debugger) {
  190. d.debugf(1, "send: err: %v", err)
  191. })
  192. return Message{}, newOpError("send", err)
  193. }
  194. return message, nil
  195. }
  196. // Receive receives one or more messages from netlink. Multi-part messages are
  197. // handled transparently and returned as a single slice of Messages, with the
  198. // final empty "multi-part done" message removed.
  199. //
  200. // If any of the messages indicate a netlink error, that error will be returned.
  201. func (c *Conn) Receive() ([]Message, error) {
  202. // Wait for any concurrent calls to Execute to finish before proceeding.
  203. c.mu.RLock()
  204. defer c.mu.RUnlock()
  205. return c.lockedReceive()
  206. }
  207. // lockedReceive implements Receive, but must be called with c.mu acquired for reading.
  208. // We rely on the kernel to deal with concurrent reads and writes to the netlink
  209. // socket itself.
  210. func (c *Conn) lockedReceive() ([]Message, error) {
  211. msgs, err := c.receive()
  212. if err != nil {
  213. c.debug(func(d *debugger) {
  214. d.debugf(1, "recv: err: %v", err)
  215. })
  216. return nil, err
  217. }
  218. c.debug(func(d *debugger) {
  219. for _, m := range msgs {
  220. d.debugf(1, "recv: %+v", m)
  221. }
  222. })
  223. // When using nltest, it's possible for zero messages to be returned by receive.
  224. if len(msgs) == 0 {
  225. return msgs, nil
  226. }
  227. // Trim the final message with multi-part done indicator if
  228. // present.
  229. if m := msgs[len(msgs)-1]; m.Header.Flags&Multi != 0 && m.Header.Type == Done {
  230. return msgs[:len(msgs)-1], nil
  231. }
  232. return msgs, nil
  233. }
  234. // receive is the internal implementation of Conn.Receive, which can be called
  235. // recursively to handle multi-part messages.
  236. func (c *Conn) receive() ([]Message, error) {
  237. // NB: All non-nil errors returned from this function *must* be of type
  238. // OpError in order to maintain the appropriate contract with callers of
  239. // this package.
  240. //
  241. // This contract also applies to functions called within this function,
  242. // such as checkMessage.
  243. var res []Message
  244. for {
  245. msgs, err := c.sock.Receive()
  246. if err != nil {
  247. return nil, newOpError("receive", err)
  248. }
  249. // If this message is multi-part, we will need to perform an recursive call
  250. // to continue draining the socket
  251. var multi bool
  252. for _, m := range msgs {
  253. if err := checkMessage(m); err != nil {
  254. return nil, err
  255. }
  256. // Does this message indicate a multi-part message?
  257. if m.Header.Flags&Multi == 0 {
  258. // No, check the next messages.
  259. continue
  260. }
  261. // Does this message indicate the last message in a series of
  262. // multi-part messages from a single read?
  263. multi = m.Header.Type != Done
  264. }
  265. res = append(res, msgs...)
  266. if !multi {
  267. // No more messages coming.
  268. return res, nil
  269. }
  270. }
  271. }
  272. // A groupJoinLeaver is a Socket that supports joining and leaving
  273. // netlink multicast groups.
  274. type groupJoinLeaver interface {
  275. Socket
  276. JoinGroup(group uint32) error
  277. LeaveGroup(group uint32) error
  278. }
  279. // JoinGroup joins a netlink multicast group by its ID.
  280. func (c *Conn) JoinGroup(group uint32) error {
  281. conn, ok := c.sock.(groupJoinLeaver)
  282. if !ok {
  283. return notSupported("join-group")
  284. }
  285. return newOpError("join-group", conn.JoinGroup(group))
  286. }
  287. // LeaveGroup leaves a netlink multicast group by its ID.
  288. func (c *Conn) LeaveGroup(group uint32) error {
  289. conn, ok := c.sock.(groupJoinLeaver)
  290. if !ok {
  291. return notSupported("leave-group")
  292. }
  293. return newOpError("leave-group", conn.LeaveGroup(group))
  294. }
  295. // A bpfSetter is a Socket that supports setting and removing BPF filters.
  296. type bpfSetter interface {
  297. Socket
  298. bpf.Setter
  299. RemoveBPF() error
  300. }
  301. // SetBPF attaches an assembled BPF program to a Conn.
  302. func (c *Conn) SetBPF(filter []bpf.RawInstruction) error {
  303. conn, ok := c.sock.(bpfSetter)
  304. if !ok {
  305. return notSupported("set-bpf")
  306. }
  307. return newOpError("set-bpf", conn.SetBPF(filter))
  308. }
  309. // RemoveBPF removes a BPF filter from a Conn.
  310. func (c *Conn) RemoveBPF() error {
  311. conn, ok := c.sock.(bpfSetter)
  312. if !ok {
  313. return notSupported("remove-bpf")
  314. }
  315. return newOpError("remove-bpf", conn.RemoveBPF())
  316. }
  317. // A deadlineSetter is a Socket that supports setting deadlines.
  318. type deadlineSetter interface {
  319. Socket
  320. SetDeadline(time.Time) error
  321. SetReadDeadline(time.Time) error
  322. SetWriteDeadline(time.Time) error
  323. }
  324. // SetDeadline sets the read and write deadlines associated with the connection.
  325. //
  326. // Deadline functionality is only supported on Go 1.12+. Calling this function
  327. // on older versions of Go will result in an error.
  328. func (c *Conn) SetDeadline(t time.Time) error {
  329. conn, ok := c.sock.(deadlineSetter)
  330. if !ok {
  331. return notSupported("set-deadline")
  332. }
  333. return newOpError("set-deadline", conn.SetDeadline(t))
  334. }
  335. // SetReadDeadline sets the read deadline associated with the connection.
  336. //
  337. // Deadline functionality is only supported on Go 1.12+. Calling this function
  338. // on older versions of Go will result in an error.
  339. func (c *Conn) SetReadDeadline(t time.Time) error {
  340. conn, ok := c.sock.(deadlineSetter)
  341. if !ok {
  342. return notSupported("set-read-deadline")
  343. }
  344. return newOpError("set-read-deadline", conn.SetReadDeadline(t))
  345. }
  346. // SetWriteDeadline sets the write deadline associated with the connection.
  347. //
  348. // Deadline functionality is only supported on Go 1.12+. Calling this function
  349. // on older versions of Go will result in an error.
  350. func (c *Conn) SetWriteDeadline(t time.Time) error {
  351. conn, ok := c.sock.(deadlineSetter)
  352. if !ok {
  353. return notSupported("set-write-deadline")
  354. }
  355. return newOpError("set-write-deadline", conn.SetWriteDeadline(t))
  356. }
  357. // A ConnOption is a boolean option that may be set for a Conn.
  358. type ConnOption int
  359. // Possible ConnOption values. These constants are equivalent to the Linux
  360. // setsockopt boolean options for netlink sockets.
  361. const (
  362. PacketInfo ConnOption = iota
  363. BroadcastError
  364. NoENOBUFS
  365. ListenAllNSID
  366. CapAcknowledge
  367. ExtendedAcknowledge
  368. )
  369. // An optionSetter is a Socket that supports setting netlink options.
  370. type optionSetter interface {
  371. Socket
  372. SetOption(option ConnOption, enable bool) error
  373. }
  374. // SetOption enables or disables a netlink socket option for the Conn.
  375. func (c *Conn) SetOption(option ConnOption, enable bool) error {
  376. conn, ok := c.sock.(optionSetter)
  377. if !ok {
  378. return notSupported("set-option")
  379. }
  380. return newOpError("set-option", conn.SetOption(option, enable))
  381. }
  382. // A bufferSetter is a Socket that supports setting connection buffer sizes.
  383. type bufferSetter interface {
  384. Socket
  385. SetReadBuffer(bytes int) error
  386. SetWriteBuffer(bytes int) error
  387. }
  388. // SetReadBuffer sets the size of the operating system's receive buffer
  389. // associated with the Conn.
  390. func (c *Conn) SetReadBuffer(bytes int) error {
  391. conn, ok := c.sock.(bufferSetter)
  392. if !ok {
  393. return notSupported("set-read-buffer")
  394. }
  395. return newOpError("set-read-buffer", conn.SetReadBuffer(bytes))
  396. }
  397. // SetWriteBuffer sets the size of the operating system's transmit buffer
  398. // associated with the Conn.
  399. func (c *Conn) SetWriteBuffer(bytes int) error {
  400. conn, ok := c.sock.(bufferSetter)
  401. if !ok {
  402. return notSupported("set-write-buffer")
  403. }
  404. return newOpError("set-write-buffer", conn.SetWriteBuffer(bytes))
  405. }
  406. // A filer is a Socket that supports retrieving its associated *os.File.
  407. type filer interface {
  408. Socket
  409. File() *os.File
  410. }
  411. var _ syscall.Conn = &Conn{}
  412. // TODO(mdlayher): mutex or similar to enforce syscall.RawConn contract of
  413. // FD remaining valid for duration of calls?
  414. // SyscallConn returns a raw network connection. This implements the
  415. // syscall.Conn interface.
  416. //
  417. // On Go 1.12+, all methods of the returned syscall.RawConn are supported and
  418. // the Conn is integrated with the runtime network poller. On versions of Go
  419. // prior to Go 1.12, only the Control method of the returned syscall.RawConn
  420. // is implemented.
  421. //
  422. // SyscallConn is intended for advanced use cases, such as getting and setting
  423. // arbitrary socket options using the netlink socket's file descriptor.
  424. //
  425. // Once invoked, it is the caller's responsibility to ensure that operations
  426. // performed using Conn and the syscall.RawConn do not conflict with
  427. // each other.
  428. func (c *Conn) SyscallConn() (syscall.RawConn, error) {
  429. fc, ok := c.sock.(filer)
  430. if !ok {
  431. return nil, notSupported("syscall-conn")
  432. }
  433. return newRawConn(fc.File())
  434. }
  435. // fixMsg updates the fields of m using the logic specified in Send.
  436. func (c *Conn) fixMsg(m *Message, ml int) {
  437. if m.Header.Length == 0 {
  438. m.Header.Length = uint32(nlmsgAlign(ml))
  439. }
  440. if m.Header.Sequence == 0 {
  441. m.Header.Sequence = c.nextSequence()
  442. }
  443. if m.Header.PID == 0 {
  444. m.Header.PID = c.pid
  445. }
  446. }
  447. // nextSequence atomically increments Conn's sequence number and returns
  448. // the incremented value.
  449. func (c *Conn) nextSequence() uint32 {
  450. return atomic.AddUint32(c.seq, 1)
  451. }
  452. // Validate validates one or more reply Messages against a request Message,
  453. // ensuring that they contain matching sequence numbers and PIDs.
  454. func Validate(request Message, replies []Message) error {
  455. for _, m := range replies {
  456. // Check for mismatched sequence, unless:
  457. // - request had no sequence, meaning we are probably validating
  458. // a multicast reply
  459. if m.Header.Sequence != request.Header.Sequence && request.Header.Sequence != 0 {
  460. return newOpError("validate", errMismatchedSequence)
  461. }
  462. // Check for mismatched PID, unless:
  463. // - request had no PID, meaning we are either:
  464. // - validating a multicast reply
  465. // - netlink has not yet assigned us a PID
  466. // - response had no PID, meaning it's from the kernel as a multicast reply
  467. if m.Header.PID != request.Header.PID && request.Header.PID != 0 && m.Header.PID != 0 {
  468. return newOpError("validate", errMismatchedPID)
  469. }
  470. }
  471. return nil
  472. }
  473. // Config contains options for a Conn.
  474. type Config struct {
  475. // Groups is a bitmask which specifies multicast groups. If set to 0,
  476. // no multicast group subscriptions will be made.
  477. Groups uint32
  478. // NetNS specifies the network namespace the Conn will operate in.
  479. //
  480. // If set (non-zero), Conn will enter the specified network namespace and
  481. // an error will occur in Dial if the operation fails.
  482. //
  483. // If not set (zero), a best-effort attempt will be made to enter the
  484. // network namespace of the calling thread: this means that any changes made
  485. // to the calling thread's network namespace will also be reflected in Conn.
  486. // If this operation fails (due to lack of permissions or because network
  487. // namespaces are disabled by kernel configuration), Dial will not return
  488. // an error, and the Conn will operate in the default network namespace of
  489. // the process. This enables non-privileged use of Conn in applications
  490. // which do not require elevated privileges.
  491. //
  492. // Entering a network namespace is a privileged operation (root or
  493. // CAP_SYS_ADMIN are required), and most applications should leave this set
  494. // to 0.
  495. NetNS int
  496. // DisableNSLockThread is deprecated and has no effect.
  497. DisableNSLockThread bool
  498. }