netmon.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593
  1. // Copyright (c) Tailscale Inc & AUTHORS
  2. // SPDX-License-Identifier: BSD-3-Clause
  3. // Package netmon provides facilities for monitoring network
  4. // interface and route changes. It primarily exists to know when
  5. // portable devices move between different networks.
  6. package netmon
  7. import (
  8. "encoding/json"
  9. "errors"
  10. "net/netip"
  11. "runtime"
  12. "sync"
  13. "time"
  14. "tailscale.com/net/interfaces"
  15. "tailscale.com/types/logger"
  16. "tailscale.com/util/clientmetric"
  17. "tailscale.com/util/set"
  18. )
  19. // pollWallTimeInterval is how often we check the time to check
  20. // for big jumps in wall (non-monotonic) time as a backup mechanism
  21. // to get notified of a sleeping device waking back up.
  22. // Usually there are also minor network change events on wake that let
  23. // us check the wall time sooner than this.
  24. const pollWallTimeInterval = 15 * time.Second
  25. // message represents a message returned from an osMon.
  26. type message interface {
  27. // Ignore is whether we should ignore this message.
  28. ignore() bool
  29. }
  30. // osMon is the interface that each operating system-specific
  31. // implementation of the link monitor must implement.
  32. type osMon interface {
  33. Close() error
  34. // Receive returns a new network interface change message. It
  35. // should block until there's either something to return, or
  36. // until the osMon is closed. After a Close, the returned
  37. // error is ignored.
  38. Receive() (message, error)
  39. // IsInterestingInterface reports whether the provided interface should
  40. // be considered for network change events.
  41. IsInterestingInterface(iface string) bool
  42. }
  43. // Monitor represents a monitoring instance.
  44. type Monitor struct {
  45. logf logger.Logf
  46. om osMon // nil means not supported on this platform
  47. change chan bool // send false to wake poller, true to also force ChangeDeltas be sent
  48. stop chan struct{} // closed on Stop
  49. // Things that must be set early, before use,
  50. // and not change at runtime.
  51. tsIfName string // tailscale interface name, if known/set ("tailscale0", "utun3", ...)
  52. mu sync.Mutex // guards all following fields
  53. cbs set.HandleSet[ChangeFunc]
  54. ruleDelCB set.HandleSet[RuleDeleteCallback]
  55. ifState *interfaces.State
  56. gwValid bool // whether gw and gwSelfIP are valid
  57. gw netip.Addr // our gateway's IP
  58. gwSelfIP netip.Addr // our own IP address (that corresponds to gw)
  59. started bool
  60. closed bool
  61. goroutines sync.WaitGroup
  62. wallTimer *time.Timer // nil until Started; re-armed AfterFunc per tick
  63. lastWall time.Time
  64. timeJumped bool // whether we need to send a changed=true after a big time jump
  65. }
  66. // ChangeFunc is a callback function registered with Monitor that's called when the
  67. // network changed.
  68. type ChangeFunc func(*ChangeDelta)
  69. // ChangeDelta describes the difference between two network states.
  70. type ChangeDelta struct {
  71. // Monitor is the network monitor that sent this delta.
  72. Monitor *Monitor
  73. // Old is the old interface state, if known.
  74. // It's nil if the old state is unknown.
  75. // Do not mutate it.
  76. Old *interfaces.State
  77. // New is the new network state.
  78. // It is always non-nil.
  79. // Do not mutate it.
  80. New *interfaces.State
  81. // Major is our legacy boolean of whether the network changed in some major
  82. // way.
  83. //
  84. // Deprecated: do not remove. As of 2023-08-23 we're in a renewed effort to
  85. // remove it and ask specific qustions of ChangeDelta instead. Look at Old
  86. // and New (or add methods to ChangeDelta) instead of using Major.
  87. Major bool
  88. // TimeJumped is whether there was a big jump in wall time since the last
  89. // time we checked. This is a hint that a mobile sleeping device might have
  90. // come out of sleep.
  91. TimeJumped bool
  92. // TODO(bradfitz): add some lazy cached fields here as needed with methods
  93. // on *ChangeDelta to let callers ask specific questions
  94. }
  95. // New instantiates and starts a monitoring instance.
  96. // The returned monitor is inactive until it's started by the Start method.
  97. // Use RegisterChangeCallback to get notified of network changes.
  98. func New(logf logger.Logf) (*Monitor, error) {
  99. logf = logger.WithPrefix(logf, "monitor: ")
  100. m := &Monitor{
  101. logf: logf,
  102. change: make(chan bool, 1),
  103. stop: make(chan struct{}),
  104. lastWall: wallTime(),
  105. }
  106. st, err := m.interfaceStateUncached()
  107. if err != nil {
  108. return nil, err
  109. }
  110. m.ifState = st
  111. m.om, err = newOSMon(logf, m)
  112. if err != nil {
  113. return nil, err
  114. }
  115. if m.om == nil {
  116. return nil, errors.New("newOSMon returned nil, nil")
  117. }
  118. return m, nil
  119. }
  120. // InterfaceState returns the latest snapshot of the machine's network
  121. // interfaces.
  122. //
  123. // The returned value is owned by Mon; it must not be modified.
  124. func (m *Monitor) InterfaceState() *interfaces.State {
  125. m.mu.Lock()
  126. defer m.mu.Unlock()
  127. return m.ifState
  128. }
  129. func (m *Monitor) interfaceStateUncached() (*interfaces.State, error) {
  130. return interfaces.GetState()
  131. }
  132. // SetTailscaleInterfaceName sets the name of the Tailscale interface. For
  133. // example, "tailscale0", "tun0", "utun3", etc.
  134. //
  135. // This must be called only early in tailscaled startup before the monitor is
  136. // used.
  137. func (m *Monitor) SetTailscaleInterfaceName(ifName string) {
  138. m.tsIfName = ifName
  139. }
  140. // GatewayAndSelfIP returns the current network's default gateway, and
  141. // the machine's default IP for that gateway.
  142. //
  143. // It's the same as interfaces.LikelyHomeRouterIP, but it caches the
  144. // result until the monitor detects a network change.
  145. func (m *Monitor) GatewayAndSelfIP() (gw, myIP netip.Addr, ok bool) {
  146. m.mu.Lock()
  147. defer m.mu.Unlock()
  148. if m.gwValid {
  149. return m.gw, m.gwSelfIP, true
  150. }
  151. gw, myIP, ok = interfaces.LikelyHomeRouterIP()
  152. changed := false
  153. if ok {
  154. changed = m.gw != gw || m.gwSelfIP != myIP
  155. m.gw, m.gwSelfIP = gw, myIP
  156. m.gwValid = true
  157. }
  158. if changed {
  159. m.logf("gateway and self IP changed: gw=%v self=%v", m.gw, m.gwSelfIP)
  160. }
  161. return gw, myIP, ok
  162. }
  163. // RegisterChangeCallback adds callback to the set of parties to be
  164. // notified (in their own goroutine) when the network state changes.
  165. // To remove this callback, call unregister (or close the monitor).
  166. func (m *Monitor) RegisterChangeCallback(callback ChangeFunc) (unregister func()) {
  167. m.mu.Lock()
  168. defer m.mu.Unlock()
  169. handle := m.cbs.Add(callback)
  170. return func() {
  171. m.mu.Lock()
  172. defer m.mu.Unlock()
  173. delete(m.cbs, handle)
  174. }
  175. }
  176. // RuleDeleteCallback is a callback when a Linux IP policy routing
  177. // rule is deleted. The table is the table number (52, 253, 354) and
  178. // priority is the priority order number (for Tailscale rules
  179. // currently: 5210, 5230, 5250, 5270)
  180. type RuleDeleteCallback func(table uint8, priority uint32)
  181. // RegisterRuleDeleteCallback adds callback to the set of parties to be
  182. // notified (in their own goroutine) when a Linux ip rule is deleted.
  183. // To remove this callback, call unregister (or close the monitor).
  184. func (m *Monitor) RegisterRuleDeleteCallback(callback RuleDeleteCallback) (unregister func()) {
  185. m.mu.Lock()
  186. defer m.mu.Unlock()
  187. handle := m.ruleDelCB.Add(callback)
  188. return func() {
  189. m.mu.Lock()
  190. defer m.mu.Unlock()
  191. delete(m.ruleDelCB, handle)
  192. }
  193. }
  194. // Start starts the monitor.
  195. // A monitor can only be started & closed once.
  196. func (m *Monitor) Start() {
  197. m.mu.Lock()
  198. defer m.mu.Unlock()
  199. if m.started || m.closed {
  200. return
  201. }
  202. m.started = true
  203. if shouldMonitorTimeJump {
  204. m.wallTimer = time.AfterFunc(pollWallTimeInterval, m.pollWallTime)
  205. }
  206. if m.om == nil {
  207. return
  208. }
  209. m.goroutines.Add(2)
  210. go m.pump()
  211. go m.debounce()
  212. }
  213. // Close closes the monitor.
  214. func (m *Monitor) Close() error {
  215. m.mu.Lock()
  216. if m.closed {
  217. m.mu.Unlock()
  218. return nil
  219. }
  220. m.closed = true
  221. close(m.stop)
  222. if m.wallTimer != nil {
  223. m.wallTimer.Stop()
  224. }
  225. var err error
  226. if m.om != nil {
  227. err = m.om.Close()
  228. }
  229. started := m.started
  230. m.mu.Unlock()
  231. if started {
  232. m.goroutines.Wait()
  233. }
  234. return err
  235. }
  236. // InjectEvent forces the monitor to pretend there was a network
  237. // change and re-check the state of the network. Any registered
  238. // ChangeFunc callbacks will be called within the event coalescing
  239. // period (under a fraction of a second).
  240. func (m *Monitor) InjectEvent() {
  241. select {
  242. case m.change <- true:
  243. default:
  244. // Another change signal is already
  245. // buffered. Debounce will wake up soon
  246. // enough.
  247. }
  248. }
  249. // Poll forces the monitor to pretend there was a network
  250. // change and re-check the state of the network.
  251. //
  252. // This is like InjectEvent but only fires ChangeFunc callbacks
  253. // if the network state differed at all.
  254. func (m *Monitor) Poll() {
  255. select {
  256. case m.change <- false:
  257. default:
  258. }
  259. }
  260. func (m *Monitor) stopped() bool {
  261. select {
  262. case <-m.stop:
  263. return true
  264. default:
  265. return false
  266. }
  267. }
  268. // pump continuously retrieves messages from the connection, notifying
  269. // the change channel of changes, and stopping when a stop is issued.
  270. func (m *Monitor) pump() {
  271. defer m.goroutines.Done()
  272. for !m.stopped() {
  273. msg, err := m.om.Receive()
  274. if err != nil {
  275. if m.stopped() {
  276. return
  277. }
  278. // Keep retrying while we're not closed.
  279. m.logf("error from link monitor: %v", err)
  280. time.Sleep(time.Second)
  281. continue
  282. }
  283. if rdm, ok := msg.(ipRuleDeletedMessage); ok {
  284. m.notifyRuleDeleted(rdm)
  285. continue
  286. }
  287. if msg.ignore() {
  288. continue
  289. }
  290. m.Poll()
  291. }
  292. }
  293. func (m *Monitor) notifyRuleDeleted(rdm ipRuleDeletedMessage) {
  294. m.mu.Lock()
  295. defer m.mu.Unlock()
  296. for _, cb := range m.ruleDelCB {
  297. go cb(rdm.table, rdm.priority)
  298. }
  299. }
  300. // isInterestingInterface reports whether the provided interface should be
  301. // considered when checking for network state changes.
  302. // The ips parameter should be the IPs of the provided interface.
  303. func (m *Monitor) isInterestingInterface(i interfaces.Interface, ips []netip.Prefix) bool {
  304. if !m.om.IsInterestingInterface(i.Name) {
  305. return false
  306. }
  307. return true
  308. }
  309. // debounce calls the callback function with a delay between events
  310. // and exits when a stop is issued.
  311. func (m *Monitor) debounce() {
  312. defer m.goroutines.Done()
  313. for {
  314. var forceCallbacks bool
  315. select {
  316. case <-m.stop:
  317. return
  318. case forceCallbacks = <-m.change:
  319. }
  320. if newState, err := m.interfaceStateUncached(); err != nil {
  321. m.logf("interfaces.State: %v", err)
  322. } else {
  323. m.handlePotentialChange(newState, forceCallbacks)
  324. }
  325. select {
  326. case <-m.stop:
  327. return
  328. case <-time.After(250 * time.Millisecond):
  329. }
  330. }
  331. }
  332. var (
  333. metricChangeEq = clientmetric.NewCounter("netmon_link_change_eq")
  334. metricChange = clientmetric.NewCounter("netmon_link_change")
  335. metricChangeTimeJump = clientmetric.NewCounter("netmon_link_change_timejump")
  336. metricChangeMajor = clientmetric.NewCounter("netmon_link_change_major")
  337. )
  338. // handlePotentialChange considers whether newState is different enough to wake
  339. // up callers and updates the monitor's state if so.
  340. //
  341. // If forceCallbacks is true, they're always notified.
  342. func (m *Monitor) handlePotentialChange(newState *interfaces.State, forceCallbacks bool) {
  343. m.mu.Lock()
  344. defer m.mu.Unlock()
  345. oldState := m.ifState
  346. timeJumped := shouldMonitorTimeJump && m.checkWallTimeAdvanceLocked()
  347. if !timeJumped && !forceCallbacks && oldState.Equal(newState) {
  348. // Exactly equal. Nothing to do.
  349. metricChangeEq.Add(1)
  350. return
  351. }
  352. delta := &ChangeDelta{
  353. Monitor: m,
  354. Old: oldState,
  355. New: newState,
  356. TimeJumped: timeJumped,
  357. }
  358. delta.Major = m.IsMajorChangeFrom(oldState, newState)
  359. if delta.Major {
  360. m.gwValid = false
  361. m.ifState = newState
  362. if s1, s2 := oldState.String(), delta.New.String(); s1 == s2 {
  363. m.logf("[unexpected] network state changed, but stringification didn't: %v", s1)
  364. m.logf("[unexpected] old: %s", jsonSummary(oldState))
  365. m.logf("[unexpected] new: %s", jsonSummary(newState))
  366. }
  367. }
  368. // See if we have a queued or new time jump signal.
  369. if timeJumped {
  370. m.resetTimeJumpedLocked()
  371. if !delta.Major {
  372. // Only log if it wasn't an interesting change.
  373. m.logf("time jumped (probably wake from sleep); synthesizing major change event")
  374. delta.Major = true
  375. }
  376. }
  377. metricChange.Add(1)
  378. if delta.Major {
  379. metricChangeMajor.Add(1)
  380. }
  381. if delta.TimeJumped {
  382. metricChangeTimeJump.Add(1)
  383. }
  384. for _, cb := range m.cbs {
  385. go cb(delta)
  386. }
  387. }
  388. // IsMajorChangeFrom reports whether the transition from s1 to s2 is
  389. // a "major" change, where major roughly means it's worth tearing down
  390. // a bunch of connections and rebinding.
  391. //
  392. // TODO(bradiftz): tigten this definition.
  393. func (m *Monitor) IsMajorChangeFrom(s1, s2 *interfaces.State) bool {
  394. if s1 == nil && s2 == nil {
  395. return false
  396. }
  397. if s1 == nil || s2 == nil {
  398. return true
  399. }
  400. if s1.HaveV6 != s2.HaveV6 ||
  401. s1.HaveV4 != s2.HaveV4 ||
  402. s1.IsExpensive != s2.IsExpensive ||
  403. s1.DefaultRouteInterface != s2.DefaultRouteInterface ||
  404. s1.HTTPProxy != s2.HTTPProxy ||
  405. s1.PAC != s2.PAC {
  406. return true
  407. }
  408. for iname, i := range s1.Interface {
  409. if iname == m.tsIfName {
  410. // Ignore changes in the Tailscale interface itself.
  411. continue
  412. }
  413. ips := s1.InterfaceIPs[iname]
  414. if !m.isInterestingInterface(i, ips) {
  415. continue
  416. }
  417. i2, ok := s2.Interface[iname]
  418. if !ok {
  419. return true
  420. }
  421. ips2, ok := s2.InterfaceIPs[iname]
  422. if !ok {
  423. return true
  424. }
  425. if !i.Equal(i2) || !prefixesMajorEqual(ips, ips2) {
  426. return true
  427. }
  428. }
  429. // Iterate over s2 in case there is a field in s2 that doesn't exist in s1
  430. for iname, i := range s2.Interface {
  431. if iname == m.tsIfName {
  432. // Ignore changes in the Tailscale interface itself.
  433. continue
  434. }
  435. ips := s2.InterfaceIPs[iname]
  436. if !m.isInterestingInterface(i, ips) {
  437. continue
  438. }
  439. i1, ok := s1.Interface[iname]
  440. if !ok {
  441. return true
  442. }
  443. ips1, ok := s1.InterfaceIPs[iname]
  444. if !ok {
  445. return true
  446. }
  447. if !i.Equal(i1) || !prefixesMajorEqual(ips, ips1) {
  448. return true
  449. }
  450. }
  451. return false
  452. }
  453. // prefixesMajorEqual reports whether a and b are equal after ignoring
  454. // boring things like link-local, loopback, and multicast addresses.
  455. func prefixesMajorEqual(a, b []netip.Prefix) bool {
  456. // trim returns a subslice of p with link local unicast,
  457. // loopback, and multicast prefixes removed from the front.
  458. trim := func(p []netip.Prefix) []netip.Prefix {
  459. for len(p) > 0 {
  460. a := p[0].Addr()
  461. if a.IsLinkLocalUnicast() || a.IsLoopback() || a.IsMulticast() {
  462. p = p[1:]
  463. continue
  464. }
  465. break
  466. }
  467. return p
  468. }
  469. for {
  470. a = trim(a)
  471. b = trim(b)
  472. if len(a) == 0 || len(b) == 0 {
  473. return len(a) == 0 && len(b) == 0
  474. }
  475. if a[0] != b[0] {
  476. return false
  477. }
  478. a, b = a[1:], b[1:]
  479. }
  480. }
  481. func jsonSummary(x any) any {
  482. j, err := json.Marshal(x)
  483. if err != nil {
  484. return err
  485. }
  486. return j
  487. }
  488. func wallTime() time.Time {
  489. // From time package's docs: "The canonical way to strip a
  490. // monotonic clock reading is to use t = t.Round(0)."
  491. return time.Now().Round(0)
  492. }
  493. func (m *Monitor) pollWallTime() {
  494. m.mu.Lock()
  495. defer m.mu.Unlock()
  496. if m.closed {
  497. return
  498. }
  499. if m.checkWallTimeAdvanceLocked() {
  500. m.InjectEvent()
  501. }
  502. m.wallTimer.Reset(pollWallTimeInterval)
  503. }
  504. // shouldMonitorTimeJump is whether we keep a regular periodic timer running in
  505. // the background watching for jumps in wall time.
  506. //
  507. // We don't do this on mobile platforms for battery reasons, and because these
  508. // platforms don't really sleep in the same way.
  509. const shouldMonitorTimeJump = runtime.GOOS != "android" && runtime.GOOS != "ios"
  510. // checkWallTimeAdvanceLocked reports whether wall time jumped more than 150% of
  511. // pollWallTimeInterval, indicating we probably just came out of sleep. Once a
  512. // time jump is detected it must be reset by calling resetTimeJumpedLocked.
  513. func (m *Monitor) checkWallTimeAdvanceLocked() bool {
  514. if !shouldMonitorTimeJump {
  515. panic("unreachable") // if callers are correct
  516. }
  517. now := wallTime()
  518. if now.Sub(m.lastWall) > pollWallTimeInterval*3/2 {
  519. m.timeJumped = true // it is reset by debounce.
  520. }
  521. m.lastWall = now
  522. return m.timeJumped
  523. }
  524. // resetTimeJumpedLocked consumes the signal set by checkWallTimeAdvanceLocked.
  525. func (m *Monitor) resetTimeJumpedLocked() {
  526. m.timeJumped = false
  527. }
  528. type ipRuleDeletedMessage struct {
  529. table uint8
  530. priority uint32
  531. }
  532. func (ipRuleDeletedMessage) ignore() bool { return true }