conn.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593
  1. package netlink
  2. import (
  3. "math/rand"
  4. "sync"
  5. "sync/atomic"
  6. "syscall"
  7. "time"
  8. "golang.org/x/net/bpf"
  9. )
  10. // A Conn is a connection to netlink. A Conn can be used to send and
  11. // receives messages to and from netlink.
  12. //
  13. // A Conn is safe for concurrent use, but to avoid contention in
  14. // high-throughput applications, the caller should almost certainly create a
  15. // pool of Conns and distribute them among workers.
  16. //
  17. // A Conn is capable of manipulating netlink subsystems from within a specific
  18. // Linux network namespace, but special care must be taken when doing so. See
  19. // the documentation of Config for details.
  20. type Conn struct {
  21. // Atomics must come first.
  22. //
  23. // seq is an atomically incremented integer used to provide sequence
  24. // numbers when Conn.Send is called.
  25. seq uint32
  26. // mu serializes access to the netlink socket for the request/response
  27. // transaction within Execute.
  28. mu sync.RWMutex
  29. // sock is the operating system-specific implementation of
  30. // a netlink sockets connection.
  31. sock Socket
  32. // pid is the PID assigned by netlink.
  33. pid uint32
  34. // d provides debugging capabilities for a Conn if not nil.
  35. d *debugger
  36. }
  37. // A Socket is an operating-system specific implementation of netlink
  38. // sockets used by Conn.
  39. //
  40. // Deprecated: the intent of Socket was to provide an abstraction layer for
  41. // testing, but this abstraction is awkward to use properly and disables much of
  42. // the functionality of the Conn type. Do not use.
  43. type Socket interface {
  44. Close() error
  45. Send(m Message) error
  46. SendMessages(m []Message) error
  47. Receive() ([]Message, error)
  48. }
  49. // Dial dials a connection to netlink, using the specified netlink family.
  50. // Config specifies optional configuration for Conn. If config is nil, a default
  51. // configuration will be used.
  52. func Dial(family int, config *Config) (*Conn, error) {
  53. // TODO(mdlayher): plumb in netlink.OpError wrapping?
  54. // Use OS-specific dial() to create Socket.
  55. c, pid, err := dial(family, config)
  56. if err != nil {
  57. return nil, err
  58. }
  59. return NewConn(c, pid), nil
  60. }
  61. // NewConn creates a Conn using the specified Socket and PID for netlink
  62. // communications.
  63. //
  64. // NewConn is primarily useful for tests. Most applications should use
  65. // Dial instead.
  66. func NewConn(sock Socket, pid uint32) *Conn {
  67. // Seed the sequence number using a random number generator.
  68. r := rand.New(rand.NewSource(time.Now().UnixNano()))
  69. seq := r.Uint32()
  70. // Configure a debugger if arguments are set.
  71. var d *debugger
  72. if len(debugArgs) > 0 {
  73. d = newDebugger(debugArgs)
  74. }
  75. return &Conn{
  76. seq: seq,
  77. sock: sock,
  78. pid: pid,
  79. d: d,
  80. }
  81. }
  82. // debug executes fn with the debugger if the debugger is not nil.
  83. func (c *Conn) debug(fn func(d *debugger)) {
  84. if c.d == nil {
  85. return
  86. }
  87. fn(c.d)
  88. }
  89. // Close closes the connection and unblocks any pending read operations.
  90. func (c *Conn) Close() error {
  91. // Close does not acquire a lock because it must be able to interrupt any
  92. // blocked system calls, such as when Receive is waiting on a multicast
  93. // group message.
  94. //
  95. // We rely on the kernel to deal with concurrent operations to the netlink
  96. // socket itself.
  97. return newOpError("close", c.sock.Close())
  98. }
  99. // Execute sends a single Message to netlink using Send, receives one or more
  100. // replies using Receive, and then checks the validity of the replies against
  101. // the request using Validate.
  102. //
  103. // Execute acquires a lock for the duration of the function call which blocks
  104. // concurrent calls to Send, SendMessages, and Receive, in order to ensure
  105. // consistency between netlink request/reply messages.
  106. //
  107. // See the documentation of Send, Receive, and Validate for details about
  108. // each function.
  109. func (c *Conn) Execute(m Message) ([]Message, error) {
  110. // Acquire the write lock and invoke the internal implementations of Send
  111. // and Receive which require the lock already be held.
  112. c.mu.Lock()
  113. defer c.mu.Unlock()
  114. req, err := c.lockedSend(m)
  115. if err != nil {
  116. return nil, err
  117. }
  118. res, err := c.lockedReceive()
  119. if err != nil {
  120. return nil, err
  121. }
  122. if err := Validate(req, res); err != nil {
  123. return nil, err
  124. }
  125. return res, nil
  126. }
  127. // SendMessages sends multiple Messages to netlink. The handling of
  128. // a Header's Length, Sequence and PID fields is the same as when
  129. // calling Send.
  130. func (c *Conn) SendMessages(msgs []Message) ([]Message, error) {
  131. // Wait for any concurrent calls to Execute to finish before proceeding.
  132. c.mu.RLock()
  133. defer c.mu.RUnlock()
  134. for i := range msgs {
  135. c.fixMsg(&msgs[i], nlmsgLength(len(msgs[i].Data)))
  136. }
  137. c.debug(func(d *debugger) {
  138. for _, m := range msgs {
  139. d.debugf(1, "send msgs: %+v", m)
  140. }
  141. })
  142. if err := c.sock.SendMessages(msgs); err != nil {
  143. c.debug(func(d *debugger) {
  144. d.debugf(1, "send msgs: err: %v", err)
  145. })
  146. return nil, newOpError("send-messages", err)
  147. }
  148. return msgs, nil
  149. }
  150. // Send sends a single Message to netlink. In most cases, a Header's Length,
  151. // Sequence, and PID fields should be set to 0, so they can be populated
  152. // automatically before the Message is sent. On success, Send returns a copy
  153. // of the Message with all parameters populated, for later validation.
  154. //
  155. // If Header.Length is 0, it will be automatically populated using the
  156. // correct length for the Message, including its payload.
  157. //
  158. // If Header.Sequence is 0, it will be automatically populated using the
  159. // next sequence number for this connection.
  160. //
  161. // If Header.PID is 0, it will be automatically populated using a PID
  162. // assigned by netlink.
  163. func (c *Conn) Send(m Message) (Message, error) {
  164. // Wait for any concurrent calls to Execute to finish before proceeding.
  165. c.mu.RLock()
  166. defer c.mu.RUnlock()
  167. return c.lockedSend(m)
  168. }
  169. // lockedSend implements Send, but must be called with c.mu acquired for reading.
  170. // We rely on the kernel to deal with concurrent reads and writes to the netlink
  171. // socket itself.
  172. func (c *Conn) lockedSend(m Message) (Message, error) {
  173. c.fixMsg(&m, nlmsgLength(len(m.Data)))
  174. c.debug(func(d *debugger) {
  175. d.debugf(1, "send: %+v", m)
  176. })
  177. if err := c.sock.Send(m); err != nil {
  178. c.debug(func(d *debugger) {
  179. d.debugf(1, "send: err: %v", err)
  180. })
  181. return Message{}, newOpError("send", err)
  182. }
  183. return m, nil
  184. }
  185. // Receive receives one or more messages from netlink. Multi-part messages are
  186. // handled transparently and returned as a single slice of Messages, with the
  187. // final empty "multi-part done" message removed.
  188. //
  189. // If any of the messages indicate a netlink error, that error will be returned.
  190. func (c *Conn) Receive() ([]Message, error) {
  191. // Wait for any concurrent calls to Execute to finish before proceeding.
  192. c.mu.RLock()
  193. defer c.mu.RUnlock()
  194. return c.lockedReceive()
  195. }
  196. // lockedReceive implements Receive, but must be called with c.mu acquired for reading.
  197. // We rely on the kernel to deal with concurrent reads and writes to the netlink
  198. // socket itself.
  199. func (c *Conn) lockedReceive() ([]Message, error) {
  200. msgs, err := c.receive()
  201. if err != nil {
  202. c.debug(func(d *debugger) {
  203. d.debugf(1, "recv: err: %v", err)
  204. })
  205. return nil, err
  206. }
  207. c.debug(func(d *debugger) {
  208. for _, m := range msgs {
  209. d.debugf(1, "recv: %+v", m)
  210. }
  211. })
  212. // When using nltest, it's possible for zero messages to be returned by receive.
  213. if len(msgs) == 0 {
  214. return msgs, nil
  215. }
  216. // Trim the final message with multi-part done indicator if
  217. // present.
  218. if m := msgs[len(msgs)-1]; m.Header.Flags&Multi != 0 && m.Header.Type == Done {
  219. return msgs[:len(msgs)-1], nil
  220. }
  221. return msgs, nil
  222. }
  223. // receive is the internal implementation of Conn.Receive, which can be called
  224. // recursively to handle multi-part messages.
  225. func (c *Conn) receive() ([]Message, error) {
  226. // NB: All non-nil errors returned from this function *must* be of type
  227. // OpError in order to maintain the appropriate contract with callers of
  228. // this package.
  229. //
  230. // This contract also applies to functions called within this function,
  231. // such as checkMessage.
  232. var res []Message
  233. for {
  234. msgs, err := c.sock.Receive()
  235. if err != nil {
  236. return nil, newOpError("receive", err)
  237. }
  238. // If this message is multi-part, we will need to continue looping to
  239. // drain all the messages from the socket.
  240. var multi bool
  241. for _, m := range msgs {
  242. if err := checkMessage(m); err != nil {
  243. return nil, err
  244. }
  245. // Does this message indicate a multi-part message?
  246. if m.Header.Flags&Multi == 0 {
  247. // No, check the next messages.
  248. continue
  249. }
  250. // Does this message indicate the last message in a series of
  251. // multi-part messages from a single read?
  252. multi = m.Header.Type != Done
  253. }
  254. res = append(res, msgs...)
  255. if !multi {
  256. // No more messages coming.
  257. return res, nil
  258. }
  259. }
  260. }
  261. // A groupJoinLeaver is a Socket that supports joining and leaving
  262. // netlink multicast groups.
  263. type groupJoinLeaver interface {
  264. Socket
  265. JoinGroup(group uint32) error
  266. LeaveGroup(group uint32) error
  267. }
  268. // JoinGroup joins a netlink multicast group by its ID.
  269. func (c *Conn) JoinGroup(group uint32) error {
  270. conn, ok := c.sock.(groupJoinLeaver)
  271. if !ok {
  272. return notSupported("join-group")
  273. }
  274. return newOpError("join-group", conn.JoinGroup(group))
  275. }
  276. // LeaveGroup leaves a netlink multicast group by its ID.
  277. func (c *Conn) LeaveGroup(group uint32) error {
  278. conn, ok := c.sock.(groupJoinLeaver)
  279. if !ok {
  280. return notSupported("leave-group")
  281. }
  282. return newOpError("leave-group", conn.LeaveGroup(group))
  283. }
  284. // A bpfSetter is a Socket that supports setting and removing BPF filters.
  285. type bpfSetter interface {
  286. Socket
  287. bpf.Setter
  288. RemoveBPF() error
  289. }
  290. // SetBPF attaches an assembled BPF program to a Conn.
  291. func (c *Conn) SetBPF(filter []bpf.RawInstruction) error {
  292. conn, ok := c.sock.(bpfSetter)
  293. if !ok {
  294. return notSupported("set-bpf")
  295. }
  296. return newOpError("set-bpf", conn.SetBPF(filter))
  297. }
  298. // RemoveBPF removes a BPF filter from a Conn.
  299. func (c *Conn) RemoveBPF() error {
  300. conn, ok := c.sock.(bpfSetter)
  301. if !ok {
  302. return notSupported("remove-bpf")
  303. }
  304. return newOpError("remove-bpf", conn.RemoveBPF())
  305. }
  306. // A deadlineSetter is a Socket that supports setting deadlines.
  307. type deadlineSetter interface {
  308. Socket
  309. SetDeadline(time.Time) error
  310. SetReadDeadline(time.Time) error
  311. SetWriteDeadline(time.Time) error
  312. }
  313. // SetDeadline sets the read and write deadlines associated with the connection.
  314. func (c *Conn) SetDeadline(t time.Time) error {
  315. conn, ok := c.sock.(deadlineSetter)
  316. if !ok {
  317. return notSupported("set-deadline")
  318. }
  319. return newOpError("set-deadline", conn.SetDeadline(t))
  320. }
  321. // SetReadDeadline sets the read deadline associated with the connection.
  322. func (c *Conn) SetReadDeadline(t time.Time) error {
  323. conn, ok := c.sock.(deadlineSetter)
  324. if !ok {
  325. return notSupported("set-read-deadline")
  326. }
  327. return newOpError("set-read-deadline", conn.SetReadDeadline(t))
  328. }
  329. // SetWriteDeadline sets the write deadline associated with the connection.
  330. func (c *Conn) SetWriteDeadline(t time.Time) error {
  331. conn, ok := c.sock.(deadlineSetter)
  332. if !ok {
  333. return notSupported("set-write-deadline")
  334. }
  335. return newOpError("set-write-deadline", conn.SetWriteDeadline(t))
  336. }
  337. // A ConnOption is a boolean option that may be set for a Conn.
  338. type ConnOption int
  339. // Possible ConnOption values. These constants are equivalent to the Linux
  340. // setsockopt boolean options for netlink sockets.
  341. const (
  342. PacketInfo ConnOption = iota
  343. BroadcastError
  344. NoENOBUFS
  345. ListenAllNSID
  346. CapAcknowledge
  347. ExtendedAcknowledge
  348. GetStrictCheck
  349. )
  350. // An optionSetter is a Socket that supports setting netlink options.
  351. type optionSetter interface {
  352. Socket
  353. SetOption(option ConnOption, enable bool) error
  354. }
  355. // SetOption enables or disables a netlink socket option for the Conn.
  356. func (c *Conn) SetOption(option ConnOption, enable bool) error {
  357. conn, ok := c.sock.(optionSetter)
  358. if !ok {
  359. return notSupported("set-option")
  360. }
  361. return newOpError("set-option", conn.SetOption(option, enable))
  362. }
  363. // A bufferSetter is a Socket that supports setting connection buffer sizes.
  364. type bufferSetter interface {
  365. Socket
  366. SetReadBuffer(bytes int) error
  367. SetWriteBuffer(bytes int) error
  368. }
  369. // SetReadBuffer sets the size of the operating system's receive buffer
  370. // associated with the Conn.
  371. func (c *Conn) SetReadBuffer(bytes int) error {
  372. conn, ok := c.sock.(bufferSetter)
  373. if !ok {
  374. return notSupported("set-read-buffer")
  375. }
  376. return newOpError("set-read-buffer", conn.SetReadBuffer(bytes))
  377. }
  378. // SetWriteBuffer sets the size of the operating system's transmit buffer
  379. // associated with the Conn.
  380. func (c *Conn) SetWriteBuffer(bytes int) error {
  381. conn, ok := c.sock.(bufferSetter)
  382. if !ok {
  383. return notSupported("set-write-buffer")
  384. }
  385. return newOpError("set-write-buffer", conn.SetWriteBuffer(bytes))
  386. }
  387. // A syscallConner is a Socket that supports syscall.Conn.
  388. type syscallConner interface {
  389. Socket
  390. syscall.Conn
  391. }
  392. var _ syscall.Conn = &Conn{}
  393. // SyscallConn returns a raw network connection. This implements the
  394. // syscall.Conn interface.
  395. //
  396. // SyscallConn is intended for advanced use cases, such as getting and setting
  397. // arbitrary socket options using the netlink socket's file descriptor.
  398. //
  399. // Once invoked, it is the caller's responsibility to ensure that operations
  400. // performed using Conn and the syscall.RawConn do not conflict with
  401. // each other.
  402. func (c *Conn) SyscallConn() (syscall.RawConn, error) {
  403. sc, ok := c.sock.(syscallConner)
  404. if !ok {
  405. return nil, notSupported("syscall-conn")
  406. }
  407. // TODO(mdlayher): mutex or similar to enforce syscall.RawConn contract of
  408. // FD remaining valid for duration of calls?
  409. return sc.SyscallConn()
  410. }
  411. // fixMsg updates the fields of m using the logic specified in Send.
  412. func (c *Conn) fixMsg(m *Message, ml int) {
  413. if m.Header.Length == 0 {
  414. m.Header.Length = uint32(nlmsgAlign(ml))
  415. }
  416. if m.Header.Sequence == 0 {
  417. m.Header.Sequence = c.nextSequence()
  418. }
  419. if m.Header.PID == 0 {
  420. m.Header.PID = c.pid
  421. }
  422. }
  423. // nextSequence atomically increments Conn's sequence number and returns
  424. // the incremented value.
  425. func (c *Conn) nextSequence() uint32 {
  426. return atomic.AddUint32(&c.seq, 1)
  427. }
  428. // Validate validates one or more reply Messages against a request Message,
  429. // ensuring that they contain matching sequence numbers and PIDs.
  430. func Validate(request Message, replies []Message) error {
  431. for _, m := range replies {
  432. // Check for mismatched sequence, unless:
  433. // - request had no sequence, meaning we are probably validating
  434. // a multicast reply
  435. if m.Header.Sequence != request.Header.Sequence && request.Header.Sequence != 0 {
  436. return newOpError("validate", errMismatchedSequence)
  437. }
  438. // Check for mismatched PID, unless:
  439. // - request had no PID, meaning we are either:
  440. // - validating a multicast reply
  441. // - netlink has not yet assigned us a PID
  442. // - response had no PID, meaning it's from the kernel as a multicast reply
  443. if m.Header.PID != request.Header.PID && request.Header.PID != 0 && m.Header.PID != 0 {
  444. return newOpError("validate", errMismatchedPID)
  445. }
  446. }
  447. return nil
  448. }
  449. // Config contains options for a Conn.
  450. type Config struct {
  451. // Groups is a bitmask which specifies multicast groups. If set to 0,
  452. // no multicast group subscriptions will be made.
  453. Groups uint32
  454. // NetNS specifies the network namespace the Conn will operate in.
  455. //
  456. // If set (non-zero), Conn will enter the specified network namespace and
  457. // an error will occur in Dial if the operation fails.
  458. //
  459. // If not set (zero), a best-effort attempt will be made to enter the
  460. // network namespace of the calling thread: this means that any changes made
  461. // to the calling thread's network namespace will also be reflected in Conn.
  462. // If this operation fails (due to lack of permissions or because network
  463. // namespaces are disabled by kernel configuration), Dial will not return
  464. // an error, and the Conn will operate in the default network namespace of
  465. // the process. This enables non-privileged use of Conn in applications
  466. // which do not require elevated privileges.
  467. //
  468. // Entering a network namespace is a privileged operation (root or
  469. // CAP_SYS_ADMIN are required), and most applications should leave this set
  470. // to 0.
  471. NetNS int
  472. // DisableNSLockThread is a no-op.
  473. //
  474. // Deprecated: internal changes have made this option obsolete and it has no
  475. // effect. Do not use.
  476. DisableNSLockThread bool
  477. // PID specifies the port ID used to bind the netlink socket. If set to 0,
  478. // the kernel will assign a port ID on the caller's behalf.
  479. //
  480. // Most callers should leave this field set to 0. This option is intended
  481. // for advanced use cases where the kernel expects a fixed unicast address
  482. // destination for netlink messages.
  483. PID uint32
  484. // Strict applies a more strict default set of options to the Conn,
  485. // including:
  486. // - ExtendedAcknowledge: true
  487. // - provides more useful error messages when supported by the kernel
  488. // - GetStrictCheck: true
  489. // - more strictly enforces request validation for some families such
  490. // as rtnetlink which were historically misused
  491. //
  492. // If any of the options specified by Strict cannot be configured due to an
  493. // outdated kernel or similar, an error will be returned.
  494. //
  495. // When possible, setting Strict to true is recommended for applications
  496. // running on modern Linux kernels.
  497. Strict bool
  498. }