controller.go 40 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128
  1. /*
  2. * Copyright (c) 2015, Psiphon Inc.
  3. * All rights reserved.
  4. *
  5. * This program is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation, either version 3 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. *
  18. */
  19. // Package psiphon implements the core tunnel functionality of a Psiphon client.
  20. // The main function is RunForever, which runs a Controller that obtains lists of
  21. // servers, establishes tunnel connections, and runs local proxies through which
  22. // tunneled traffic may be sent.
  23. package psiphon
  24. import (
  25. "errors"
  26. "math/rand"
  27. "net"
  28. "sync"
  29. "time"
  30. )
  31. // Controller is a tunnel lifecycle coordinator. It manages lists of servers to
  32. // connect to; establishes and monitors tunnels; and runs local proxies which
  33. // route traffic through the tunnels.
  34. type Controller struct {
  35. config *Config
  36. sessionId string
  37. componentFailureSignal chan struct{}
  38. shutdownBroadcast chan struct{}
  39. runWaitGroup *sync.WaitGroup
  40. establishedTunnels chan *Tunnel
  41. failedTunnels chan *Tunnel
  42. tunnelMutex sync.Mutex
  43. establishedOnce bool
  44. tunnels []*Tunnel
  45. nextTunnel int
  46. startedConnectedReporter bool
  47. isEstablishing bool
  48. establishWaitGroup *sync.WaitGroup
  49. stopEstablishingBroadcast chan struct{}
  50. candidateServerEntries chan *candidateServerEntry
  51. establishPendingConns *Conns
  52. untunneledPendingConns *Conns
  53. untunneledDialConfig *DialConfig
  54. splitTunnelClassifier *SplitTunnelClassifier
  55. signalFetchRemoteServerList chan struct{}
  56. signalDownloadUpgrade chan string
  57. impairedProtocolClassification map[string]int
  58. signalReportConnected chan struct{}
  59. serverAffinityDoneBroadcast chan struct{}
  60. }
  61. type candidateServerEntry struct {
  62. serverEntry *ServerEntry
  63. isServerAffinityCandidate bool
  64. }
  65. // NewController initializes a new controller.
  66. func NewController(config *Config) (controller *Controller, err error) {
  67. // Needed by regen, at least
  68. rand.Seed(int64(time.Now().Nanosecond()))
  69. // Supply a default HostNameTransformer
  70. if config.HostNameTransformer == nil {
  71. config.HostNameTransformer = &IdentityHostNameTransformer{}
  72. }
  73. // Generate a session ID for the Psiphon server API. This session ID is
  74. // used across all tunnels established by the controller.
  75. sessionId, err := MakeSessionId()
  76. if err != nil {
  77. return nil, ContextError(err)
  78. }
  79. NoticeSessionId(sessionId)
  80. // untunneledPendingConns may be used to interrupt the fetch remote server list
  81. // request and other untunneled connection establishments. BindToDevice may be
  82. // used to exclude these requests and connection from VPN routing.
  83. // TODO: fetch remote server list and untunneled upgrade download should remove
  84. // their completed conns from untunneledPendingConns.
  85. untunneledPendingConns := new(Conns)
  86. untunneledDialConfig := &DialConfig{
  87. UpstreamProxyUrl: config.UpstreamProxyUrl,
  88. PendingConns: untunneledPendingConns,
  89. DeviceBinder: config.DeviceBinder,
  90. DnsServerGetter: config.DnsServerGetter,
  91. UseIndistinguishableTLS: config.UseIndistinguishableTLS,
  92. TrustedCACertificatesFilename: config.TrustedCACertificatesFilename,
  93. DeviceRegion: config.DeviceRegion,
  94. }
  95. controller = &Controller{
  96. config: config,
  97. sessionId: sessionId,
  98. // componentFailureSignal receives a signal from a component (including socks and
  99. // http local proxies) if they unexpectedly fail. Senders should not block.
  100. // A buffer allows at least one stop signal to be sent before there is a receiver.
  101. componentFailureSignal: make(chan struct{}, 1),
  102. shutdownBroadcast: make(chan struct{}),
  103. runWaitGroup: new(sync.WaitGroup),
  104. // establishedTunnels and failedTunnels buffer sizes are large enough to
  105. // receive full pools of tunnels without blocking. Senders should not block.
  106. establishedTunnels: make(chan *Tunnel, config.TunnelPoolSize),
  107. failedTunnels: make(chan *Tunnel, config.TunnelPoolSize),
  108. tunnels: make([]*Tunnel, 0),
  109. establishedOnce: false,
  110. startedConnectedReporter: false,
  111. isEstablishing: false,
  112. establishPendingConns: new(Conns),
  113. untunneledPendingConns: untunneledPendingConns,
  114. untunneledDialConfig: untunneledDialConfig,
  115. impairedProtocolClassification: make(map[string]int),
  116. // TODO: Add a buffer of 1 so we don't miss a signal while receiver is
  117. // starting? Trade-off is potential back-to-back fetch remotes. As-is,
  118. // establish will eventually signal another fetch remote.
  119. signalFetchRemoteServerList: make(chan struct{}),
  120. signalDownloadUpgrade: make(chan string),
  121. signalReportConnected: make(chan struct{}),
  122. }
  123. controller.splitTunnelClassifier = NewSplitTunnelClassifier(config, controller)
  124. return controller, nil
  125. }
  126. // Run executes the controller. It launches components and then monitors
  127. // for a shutdown signal; after receiving the signal it shuts down the
  128. // controller.
  129. // The components include:
  130. // - the periodic remote server list fetcher
  131. // - the connected reporter
  132. // - the tunnel manager
  133. // - a local SOCKS proxy that port forwards through the pool of tunnels
  134. // - a local HTTP proxy that port forwards through the pool of tunnels
  135. func (controller *Controller) Run(shutdownBroadcast <-chan struct{}) {
  136. ReportAvailableRegions()
  137. // Start components
  138. listenIP, err := GetInterfaceIPAddress(controller.config.ListenInterface)
  139. if err != nil {
  140. NoticeError("error getting listener IP: %s", err)
  141. return
  142. }
  143. socksProxy, err := NewSocksProxy(controller.config, controller, listenIP)
  144. if err != nil {
  145. NoticeAlert("error initializing local SOCKS proxy: %s", err)
  146. return
  147. }
  148. defer socksProxy.Close()
  149. httpProxy, err := NewHttpProxy(
  150. controller.config, controller.untunneledDialConfig, controller, listenIP)
  151. if err != nil {
  152. NoticeAlert("error initializing local HTTP proxy: %s", err)
  153. return
  154. }
  155. defer httpProxy.Close()
  156. if !controller.config.DisableRemoteServerListFetcher {
  157. controller.runWaitGroup.Add(1)
  158. go controller.remoteServerListFetcher()
  159. }
  160. if controller.config.UpgradeDownloadUrl != "" &&
  161. controller.config.UpgradeDownloadFilename != "" {
  162. controller.runWaitGroup.Add(1)
  163. go controller.upgradeDownloader()
  164. }
  165. /// Note: the connected reporter isn't started until a tunnel is
  166. // established
  167. controller.runWaitGroup.Add(1)
  168. go controller.runTunnels()
  169. if *controller.config.EstablishTunnelTimeoutSeconds != 0 {
  170. controller.runWaitGroup.Add(1)
  171. go controller.establishTunnelWatcher()
  172. }
  173. // Wait while running
  174. select {
  175. case <-shutdownBroadcast:
  176. NoticeInfo("controller shutdown by request")
  177. case <-controller.componentFailureSignal:
  178. NoticeAlert("controller shutdown due to component failure")
  179. }
  180. close(controller.shutdownBroadcast)
  181. // Interrupts and stops establish workers blocking on
  182. // tunnel establishment network operations.
  183. controller.establishPendingConns.CloseAll()
  184. // Interrupts and stops workers blocking on untunneled
  185. // network operations. This includes fetch remote server
  186. // list and untunneled uprade download.
  187. // Note: this doesn't interrupt the final, untunneled status
  188. // requests started in operateTunnel after shutdownBroadcast.
  189. // This is by design -- we want to give these requests a short
  190. // timer period to succeed and deliver stats. These particular
  191. // requests opt out of untunneledPendingConns and use the
  192. // PSIPHON_API_SHUTDOWN_SERVER_TIMEOUT timeout (see
  193. // doUntunneledStatusRequest).
  194. controller.untunneledPendingConns.CloseAll()
  195. // Now with all workers signaled to stop and with all
  196. // blocking network operations interrupted, wait for
  197. // all workers to terminate.
  198. controller.runWaitGroup.Wait()
  199. controller.splitTunnelClassifier.Shutdown()
  200. NoticeInfo("exiting controller")
  201. NoticeExiting()
  202. }
  203. // SignalComponentFailure notifies the controller that an associated component has failed.
  204. // This will terminate the controller.
  205. func (controller *Controller) SignalComponentFailure() {
  206. select {
  207. case controller.componentFailureSignal <- *new(struct{}):
  208. default:
  209. }
  210. }
  211. // remoteServerListFetcher fetches an out-of-band list of server entries
  212. // for more tunnel candidates. It fetches when signalled, with retries
  213. // on failure.
  214. func (controller *Controller) remoteServerListFetcher() {
  215. defer controller.runWaitGroup.Done()
  216. var lastFetchTime time.Time
  217. fetcherLoop:
  218. for {
  219. // Wait for a signal before fetching
  220. select {
  221. case <-controller.signalFetchRemoteServerList:
  222. case <-controller.shutdownBroadcast:
  223. break fetcherLoop
  224. }
  225. // Skip fetch entirely (i.e., send no request at all, even when ETag would save
  226. // on response size) when a recent fetch was successful
  227. if time.Now().Before(lastFetchTime.Add(FETCH_REMOTE_SERVER_LIST_STALE_PERIOD)) {
  228. continue
  229. }
  230. retryLoop:
  231. for {
  232. // Don't attempt to fetch while there is no network connectivity,
  233. // to avoid alert notice noise.
  234. if !WaitForNetworkConnectivity(
  235. controller.config.NetworkConnectivityChecker,
  236. controller.shutdownBroadcast) {
  237. break fetcherLoop
  238. }
  239. err := FetchRemoteServerList(
  240. controller.config, controller.untunneledDialConfig)
  241. if err == nil {
  242. lastFetchTime = time.Now()
  243. break retryLoop
  244. }
  245. NoticeAlert("failed to fetch remote server list: %s", err)
  246. timeout := time.After(FETCH_REMOTE_SERVER_LIST_RETRY_PERIOD)
  247. select {
  248. case <-timeout:
  249. case <-controller.shutdownBroadcast:
  250. break fetcherLoop
  251. }
  252. }
  253. }
  254. NoticeInfo("exiting remote server list fetcher")
  255. }
  256. // establishTunnelWatcher terminates the controller if a tunnel
  257. // has not been established in the configured time period. This
  258. // is regardless of how many tunnels are presently active -- meaning
  259. // that if an active tunnel was established and lost the controller
  260. // is left running (to re-establish).
  261. func (controller *Controller) establishTunnelWatcher() {
  262. defer controller.runWaitGroup.Done()
  263. timeout := time.After(
  264. time.Duration(*controller.config.EstablishTunnelTimeoutSeconds) * time.Second)
  265. select {
  266. case <-timeout:
  267. if !controller.hasEstablishedOnce() {
  268. NoticeAlert("failed to establish tunnel before timeout")
  269. controller.SignalComponentFailure()
  270. }
  271. case <-controller.shutdownBroadcast:
  272. }
  273. NoticeInfo("exiting establish tunnel watcher")
  274. }
  275. // connectedReporter sends periodic "connected" requests to the Psiphon API.
  276. // These requests are for server-side unique user stats calculation. See the
  277. // comment in DoConnectedRequest for a description of the request mechanism.
  278. // To ensure we don't over- or under-count unique users, only one connected
  279. // request is made across all simultaneous multi-tunnels; and the connected
  280. // request is repeated periodically for very long-lived tunnels.
  281. // The signalReportConnected mechanism is used to trigger another connected
  282. // request immediately after a reconnect.
  283. func (controller *Controller) connectedReporter() {
  284. defer controller.runWaitGroup.Done()
  285. loop:
  286. for {
  287. // Pick any active tunnel and make the next connected request. No error
  288. // is logged if there's no active tunnel, as that's not an unexpected condition.
  289. reported := false
  290. tunnel := controller.getNextActiveTunnel()
  291. if tunnel != nil {
  292. err := tunnel.serverContext.DoConnectedRequest()
  293. if err == nil {
  294. reported = true
  295. } else {
  296. NoticeAlert("failed to make connected request: %s", err)
  297. }
  298. }
  299. // Schedule the next connected request and wait.
  300. var duration time.Duration
  301. if reported {
  302. duration = PSIPHON_API_CONNECTED_REQUEST_PERIOD
  303. } else {
  304. duration = PSIPHON_API_CONNECTED_REQUEST_RETRY_PERIOD
  305. }
  306. timeout := time.After(duration)
  307. select {
  308. case <-controller.signalReportConnected:
  309. case <-timeout:
  310. // Make another connected request
  311. case <-controller.shutdownBroadcast:
  312. break loop
  313. }
  314. }
  315. NoticeInfo("exiting connected reporter")
  316. }
  317. func (controller *Controller) startOrSignalConnectedReporter() {
  318. // session is nil when DisableApi is set
  319. if controller.config.DisableApi {
  320. return
  321. }
  322. // Start the connected reporter after the first tunnel is established.
  323. // Concurrency note: only the runTunnels goroutine may access startedConnectedReporter.
  324. if !controller.startedConnectedReporter {
  325. controller.startedConnectedReporter = true
  326. controller.runWaitGroup.Add(1)
  327. go controller.connectedReporter()
  328. } else {
  329. select {
  330. case controller.signalReportConnected <- *new(struct{}):
  331. default:
  332. }
  333. }
  334. }
  335. // upgradeDownloader makes periodic attemps to complete a client upgrade
  336. // download. DownloadUpgrade() is resumable, so each attempt has potential for
  337. // getting closer to completion, even in conditions where the download or
  338. // tunnel is repeatedly interrupted.
  339. // An upgrade download is triggered by either a handshake response indicating
  340. // that a new version is available; or after failing to connect, in which case
  341. // it's useful to check, out-of-band, for an upgrade with new circumvention
  342. // capabilities.
  343. // Once the download operation completes successfully, the downloader exits
  344. // and is not run again: either there is not a newer version, or the upgrade
  345. // has been downloaded and is ready to be applied.
  346. // We're assuming that the upgrade will be applied and the entire system
  347. // restarted before another upgrade is to be downloaded.
  348. //
  349. // TODO: refactor upgrade downloader and remote server list fetcher to use
  350. // common code (including the resumable download routines).
  351. //
  352. func (controller *Controller) upgradeDownloader() {
  353. defer controller.runWaitGroup.Done()
  354. var lastDownloadTime time.Time
  355. downloadLoop:
  356. for {
  357. // Wait for a signal before downloading
  358. var handshakeVersion string
  359. select {
  360. case handshakeVersion = <-controller.signalDownloadUpgrade:
  361. case <-controller.shutdownBroadcast:
  362. break downloadLoop
  363. }
  364. // Unless handshake is explicitly advertizing a new version, skip
  365. // checking entirely when a recent download was successful.
  366. if handshakeVersion == "" &&
  367. time.Now().Before(lastDownloadTime.Add(DOWNLOAD_UPGRADE_STALE_PERIOD)) {
  368. continue
  369. }
  370. retryLoop:
  371. for {
  372. // Don't attempt to download while there is no network connectivity,
  373. // to avoid alert notice noise.
  374. if !WaitForNetworkConnectivity(
  375. controller.config.NetworkConnectivityChecker,
  376. controller.shutdownBroadcast) {
  377. break downloadLoop
  378. }
  379. // Pick any active tunnel and make the next download attempt. If there's
  380. // no active tunnel, the untunneledDialConfig will be used.
  381. tunnel := controller.getNextActiveTunnel()
  382. err := DownloadUpgrade(
  383. controller.config,
  384. handshakeVersion,
  385. tunnel,
  386. controller.untunneledDialConfig)
  387. if err == nil {
  388. lastDownloadTime = time.Now()
  389. break retryLoop
  390. }
  391. NoticeAlert("failed to download upgrade: %s", err)
  392. timeout := time.After(DOWNLOAD_UPGRADE_RETRY_PERIOD)
  393. select {
  394. case <-timeout:
  395. case <-controller.shutdownBroadcast:
  396. break downloadLoop
  397. }
  398. }
  399. }
  400. NoticeInfo("exiting upgrade downloader")
  401. }
  402. // runTunnels is the controller tunnel management main loop. It starts and stops
  403. // establishing tunnels based on the target tunnel pool size and the current size
  404. // of the pool. Tunnels are established asynchronously using worker goroutines.
  405. //
  406. // When there are no server entries for the target region/protocol, the
  407. // establishCandidateGenerator will yield no candidates and wait before
  408. // trying again. In the meantime, a remote server entry fetch may supply
  409. // valid candidates.
  410. //
  411. // When a tunnel is established, it's added to the active pool. The tunnel's
  412. // operateTunnel goroutine monitors the tunnel.
  413. //
  414. // When a tunnel fails, it's removed from the pool and the establish process is
  415. // restarted to fill the pool.
  416. func (controller *Controller) runTunnels() {
  417. defer controller.runWaitGroup.Done()
  418. // Start running
  419. controller.startEstablishing()
  420. loop:
  421. for {
  422. select {
  423. case failedTunnel := <-controller.failedTunnels:
  424. NoticeAlert("tunnel failed: %s", failedTunnel.serverEntry.IpAddress)
  425. controller.terminateTunnel(failedTunnel)
  426. // Note: we make this extra check to ensure the shutdown signal takes priority
  427. // and that we do not start establishing. Critically, startEstablishing() calls
  428. // establishPendingConns.Reset() which clears the closed flag in
  429. // establishPendingConns; this causes the pendingConns.Add() within
  430. // interruptibleTCPDial to succeed instead of aborting, and the result
  431. // is that it's possible for establish goroutines to run all the way through
  432. // NewServerContext before being discarded... delaying shutdown.
  433. select {
  434. case <-controller.shutdownBroadcast:
  435. break loop
  436. default:
  437. }
  438. controller.classifyImpairedProtocol(failedTunnel)
  439. // Concurrency note: only this goroutine may call startEstablishing/stopEstablishing
  440. // and access isEstablishing.
  441. if !controller.isEstablishing {
  442. controller.startEstablishing()
  443. }
  444. case establishedTunnel := <-controller.establishedTunnels:
  445. if controller.isImpairedProtocol(establishedTunnel.protocol) {
  446. NoticeAlert("established tunnel with impaired protocol: %s", establishedTunnel.protocol)
  447. // Protocol was classified as impaired while this tunnel
  448. // established, so discard.
  449. controller.discardTunnel(establishedTunnel)
  450. // Reset establish generator to stop producing tunnels
  451. // with impaired protocols.
  452. if controller.isEstablishing {
  453. controller.stopEstablishing()
  454. controller.startEstablishing()
  455. }
  456. break
  457. }
  458. tunnelCount, registered := controller.registerTunnel(establishedTunnel)
  459. if !registered {
  460. // Already fully established, so discard.
  461. controller.discardTunnel(establishedTunnel)
  462. break
  463. }
  464. NoticeActiveTunnel(establishedTunnel.serverEntry.IpAddress, establishedTunnel.protocol)
  465. if tunnelCount == 1 {
  466. // The split tunnel classifier is started once the first tunnel is
  467. // established. This first tunnel is passed in to be used to make
  468. // the routes data request.
  469. // A long-running controller may run while the host device is present
  470. // in different regions. In this case, we want the split tunnel logic
  471. // to switch to routes for new regions and not classify traffic based
  472. // on routes installed for older regions.
  473. // We assume that when regions change, the host network will also
  474. // change, and so all tunnels will fail and be re-established. Under
  475. // that assumption, the classifier will be re-Start()-ed here when
  476. // the region has changed.
  477. controller.splitTunnelClassifier.Start(establishedTunnel)
  478. // Signal a connected request on each 1st tunnel establishment. For
  479. // multi-tunnels, the session is connected as long as at least one
  480. // tunnel is established.
  481. controller.startOrSignalConnectedReporter()
  482. // If the handshake indicated that a new client version is available,
  483. // trigger an upgrade download.
  484. // Note: serverContext is nil when DisableApi is set
  485. if establishedTunnel.serverContext != nil &&
  486. establishedTunnel.serverContext.clientUpgradeVersion != "" {
  487. handshakeVersion := establishedTunnel.serverContext.clientUpgradeVersion
  488. select {
  489. case controller.signalDownloadUpgrade <- handshakeVersion:
  490. default:
  491. }
  492. }
  493. }
  494. // TODO: design issue -- might not be enough server entries with region/caps to ever fill tunnel slots;
  495. // possible solution is establish target MIN(CountServerEntries(region, protocol), TunnelPoolSize)
  496. if controller.isFullyEstablished() {
  497. controller.stopEstablishing()
  498. }
  499. case <-controller.shutdownBroadcast:
  500. break loop
  501. }
  502. }
  503. // Stop running
  504. controller.stopEstablishing()
  505. controller.terminateAllTunnels()
  506. // Drain tunnel channels
  507. close(controller.establishedTunnels)
  508. for tunnel := range controller.establishedTunnels {
  509. controller.discardTunnel(tunnel)
  510. }
  511. close(controller.failedTunnels)
  512. for tunnel := range controller.failedTunnels {
  513. controller.discardTunnel(tunnel)
  514. }
  515. NoticeInfo("exiting run tunnels")
  516. }
  517. // classifyImpairedProtocol tracks "impaired" protocol classifications for failed
  518. // tunnels. A protocol is classified as impaired if a tunnel using that protocol
  519. // fails, repeatedly, shortly after the start of the connection. During tunnel
  520. // establishment, impaired protocols are briefly skipped.
  521. //
  522. // One purpose of this measure is to defend against an attack where the adversary,
  523. // for example, tags an OSSH TCP connection as an "unidentified" protocol; allows
  524. // it to connect; but then kills the underlying TCP connection after a short time.
  525. // Since OSSH has less latency than other protocols that may bypass an "unidentified"
  526. // filter, these other protocols might never be selected for use.
  527. //
  528. // Concurrency note: only the runTunnels() goroutine may call classifyImpairedProtocol
  529. func (controller *Controller) classifyImpairedProtocol(failedTunnel *Tunnel) {
  530. if failedTunnel.startTime.Add(IMPAIRED_PROTOCOL_CLASSIFICATION_DURATION).After(time.Now()) {
  531. controller.impairedProtocolClassification[failedTunnel.protocol] += 1
  532. } else {
  533. controller.impairedProtocolClassification[failedTunnel.protocol] = 0
  534. }
  535. if len(controller.getImpairedProtocols()) == len(SupportedTunnelProtocols) {
  536. // Reset classification if all protocols are classified as impaired as
  537. // the network situation (or attack) may not be protocol-specific.
  538. // TODO: compare against count of distinct supported protocols for
  539. // current known server entries.
  540. controller.impairedProtocolClassification = make(map[string]int)
  541. }
  542. }
  543. // getImpairedProtocols returns a list of protocols that have sufficient
  544. // classifications to be considered impaired protocols.
  545. //
  546. // Concurrency note: only the runTunnels() goroutine may call getImpairedProtocols
  547. func (controller *Controller) getImpairedProtocols() []string {
  548. NoticeImpairedProtocolClassification(controller.impairedProtocolClassification)
  549. impairedProtocols := make([]string, 0)
  550. for protocol, count := range controller.impairedProtocolClassification {
  551. if count >= IMPAIRED_PROTOCOL_CLASSIFICATION_THRESHOLD {
  552. impairedProtocols = append(impairedProtocols, protocol)
  553. }
  554. }
  555. return impairedProtocols
  556. }
  557. // isImpairedProtocol checks if the specified protocol is classified as impaired.
  558. //
  559. // Concurrency note: only the runTunnels() goroutine may call isImpairedProtocol
  560. func (controller *Controller) isImpairedProtocol(protocol string) bool {
  561. count, ok := controller.impairedProtocolClassification[protocol]
  562. return ok && count >= IMPAIRED_PROTOCOL_CLASSIFICATION_THRESHOLD
  563. }
  564. // SignalTunnelFailure implements the TunnelOwner interface. This function
  565. // is called by Tunnel.operateTunnel when the tunnel has detected that it
  566. // has failed. The Controller will signal runTunnels to create a new
  567. // tunnel and/or remove the tunnel from the list of active tunnels.
  568. func (controller *Controller) SignalTunnelFailure(tunnel *Tunnel) {
  569. // Don't block. Assumes the receiver has a buffer large enough for
  570. // the typical number of operated tunnels. In case there's no room,
  571. // terminate the tunnel (runTunnels won't get a signal in this case,
  572. // but the tunnel will be removed from the list of active tunnels).
  573. select {
  574. case controller.failedTunnels <- tunnel:
  575. default:
  576. controller.terminateTunnel(tunnel)
  577. }
  578. }
  579. // discardTunnel disposes of a successful connection that is no longer required.
  580. func (controller *Controller) discardTunnel(tunnel *Tunnel) {
  581. NoticeInfo("discard tunnel: %s", tunnel.serverEntry.IpAddress)
  582. // TODO: not calling PromoteServerEntry, since that would rank the
  583. // discarded tunnel before fully active tunnels. Can a discarded tunnel
  584. // be promoted (since it connects), but with lower rank than all active
  585. // tunnels?
  586. tunnel.Close(true)
  587. }
  588. // registerTunnel adds the connected tunnel to the pool of active tunnels
  589. // which are candidates for port forwarding. Returns true if the pool has an
  590. // empty slot and false if the pool is full (caller should discard the tunnel).
  591. func (controller *Controller) registerTunnel(tunnel *Tunnel) (int, bool) {
  592. controller.tunnelMutex.Lock()
  593. defer controller.tunnelMutex.Unlock()
  594. if len(controller.tunnels) >= controller.config.TunnelPoolSize {
  595. return len(controller.tunnels), false
  596. }
  597. // Perform a final check just in case we've established
  598. // a duplicate connection.
  599. for _, activeTunnel := range controller.tunnels {
  600. if activeTunnel.serverEntry.IpAddress == tunnel.serverEntry.IpAddress {
  601. NoticeAlert("duplicate tunnel: %s", tunnel.serverEntry.IpAddress)
  602. return len(controller.tunnels), false
  603. }
  604. }
  605. controller.establishedOnce = true
  606. controller.tunnels = append(controller.tunnels, tunnel)
  607. NoticeTunnels(len(controller.tunnels))
  608. // Promote this successful tunnel to first rank so it's one
  609. // of the first candidates next time establish runs.
  610. // Connecting to a TargetServerEntry does not change the
  611. // ranking.
  612. if controller.config.TargetServerEntry == "" {
  613. PromoteServerEntry(tunnel.serverEntry.IpAddress)
  614. }
  615. return len(controller.tunnels), true
  616. }
  617. // hasEstablishedOnce indicates if at least one active tunnel has
  618. // been established up to this point. This is regardeless of how many
  619. // tunnels are presently active.
  620. func (controller *Controller) hasEstablishedOnce() bool {
  621. controller.tunnelMutex.Lock()
  622. defer controller.tunnelMutex.Unlock()
  623. return controller.establishedOnce
  624. }
  625. // isFullyEstablished indicates if the pool of active tunnels is full.
  626. func (controller *Controller) isFullyEstablished() bool {
  627. controller.tunnelMutex.Lock()
  628. defer controller.tunnelMutex.Unlock()
  629. return len(controller.tunnels) >= controller.config.TunnelPoolSize
  630. }
  631. // terminateTunnel removes a tunnel from the pool of active tunnels
  632. // and closes the tunnel. The next-tunnel state used by getNextActiveTunnel
  633. // is adjusted as required.
  634. func (controller *Controller) terminateTunnel(tunnel *Tunnel) {
  635. controller.tunnelMutex.Lock()
  636. defer controller.tunnelMutex.Unlock()
  637. for index, activeTunnel := range controller.tunnels {
  638. if tunnel == activeTunnel {
  639. controller.tunnels = append(
  640. controller.tunnels[:index], controller.tunnels[index+1:]...)
  641. if controller.nextTunnel > index {
  642. controller.nextTunnel--
  643. }
  644. if controller.nextTunnel >= len(controller.tunnels) {
  645. controller.nextTunnel = 0
  646. }
  647. activeTunnel.Close(false)
  648. NoticeTunnels(len(controller.tunnels))
  649. break
  650. }
  651. }
  652. }
  653. // terminateAllTunnels empties the tunnel pool, closing all active tunnels.
  654. // This is used when shutting down the controller.
  655. func (controller *Controller) terminateAllTunnels() {
  656. controller.tunnelMutex.Lock()
  657. defer controller.tunnelMutex.Unlock()
  658. // Closing all tunnels in parallel. In an orderly shutdown, each tunnel
  659. // may take a few seconds to send a final status request. We only want
  660. // to wait as long as the single slowest tunnel.
  661. closeWaitGroup := new(sync.WaitGroup)
  662. closeWaitGroup.Add(len(controller.tunnels))
  663. for _, activeTunnel := range controller.tunnels {
  664. tunnel := activeTunnel
  665. go func() {
  666. defer closeWaitGroup.Done()
  667. tunnel.Close(false)
  668. }()
  669. }
  670. closeWaitGroup.Wait()
  671. controller.tunnels = make([]*Tunnel, 0)
  672. controller.nextTunnel = 0
  673. NoticeTunnels(len(controller.tunnels))
  674. }
  675. // getNextActiveTunnel returns the next tunnel from the pool of active
  676. // tunnels. Currently, tunnel selection order is simple round-robin.
  677. func (controller *Controller) getNextActiveTunnel() (tunnel *Tunnel) {
  678. controller.tunnelMutex.Lock()
  679. defer controller.tunnelMutex.Unlock()
  680. for i := len(controller.tunnels); i > 0; i-- {
  681. tunnel = controller.tunnels[controller.nextTunnel]
  682. controller.nextTunnel =
  683. (controller.nextTunnel + 1) % len(controller.tunnels)
  684. return tunnel
  685. }
  686. return nil
  687. }
  688. // isActiveTunnelServerEntry is used to check if there's already
  689. // an existing tunnel to a candidate server.
  690. func (controller *Controller) isActiveTunnelServerEntry(serverEntry *ServerEntry) bool {
  691. controller.tunnelMutex.Lock()
  692. defer controller.tunnelMutex.Unlock()
  693. for _, activeTunnel := range controller.tunnels {
  694. if activeTunnel.serverEntry.IpAddress == serverEntry.IpAddress {
  695. return true
  696. }
  697. }
  698. return false
  699. }
  700. // Dial selects an active tunnel and establishes a port forward
  701. // connection through the selected tunnel. Failure to connect is considered
  702. // a port foward failure, for the purpose of monitoring tunnel health.
  703. func (controller *Controller) Dial(
  704. remoteAddr string, alwaysTunnel bool, downstreamConn net.Conn) (conn net.Conn, err error) {
  705. tunnel := controller.getNextActiveTunnel()
  706. if tunnel == nil {
  707. return nil, ContextError(errors.New("no active tunnels"))
  708. }
  709. // Perform split tunnel classification when feature is enabled, and if the remote
  710. // address is classified as untunneled, dial directly.
  711. if !alwaysTunnel && controller.config.SplitTunnelDnsServer != "" {
  712. host, _, err := net.SplitHostPort(remoteAddr)
  713. if err != nil {
  714. return nil, ContextError(err)
  715. }
  716. // Note: a possible optimization, when split tunnel is active and IsUntunneled performs
  717. // a DNS resolution in order to make its classification, is to reuse that IP address in
  718. // the following Dials so they do not need to make their own resolutions. However, the
  719. // way this is currently implemented ensures that, e.g., DNS geo load balancing occurs
  720. // relative to the outbound network.
  721. if controller.splitTunnelClassifier.IsUntunneled(host) {
  722. // TODO: track downstreamConn and close it when the DialTCP conn closes, as with tunnel.Dial conns?
  723. return DialTCP(remoteAddr, controller.untunneledDialConfig)
  724. }
  725. }
  726. tunneledConn, err := tunnel.Dial(remoteAddr, alwaysTunnel, downstreamConn)
  727. if err != nil {
  728. return nil, ContextError(err)
  729. }
  730. return tunneledConn, nil
  731. }
  732. // startEstablishing creates a pool of worker goroutines which will
  733. // attempt to establish tunnels to candidate servers. The candidates
  734. // are generated by another goroutine.
  735. func (controller *Controller) startEstablishing() {
  736. if controller.isEstablishing {
  737. return
  738. }
  739. NoticeInfo("start establishing")
  740. controller.isEstablishing = true
  741. controller.establishWaitGroup = new(sync.WaitGroup)
  742. controller.stopEstablishingBroadcast = make(chan struct{})
  743. controller.candidateServerEntries = make(chan *candidateServerEntry)
  744. controller.establishPendingConns.Reset()
  745. // The server affinity mechanism attempts to favor the previously
  746. // used server when reconnecting. This is beneficial for user
  747. // applications which expect consistency in user IP address (for
  748. // example, a web site which prompts for additional user
  749. // authentication when the IP address changes).
  750. //
  751. // Only the very first server, as determined by
  752. // datastore.PromoteServerEntry(), is the server affinity candidate.
  753. // Concurrent connections attempts to many servers are launched
  754. // without delay, in case the affinity server connection fails.
  755. // While the affinity server connection is outstanding, when any
  756. // other connection is established, there is a short grace period
  757. // delay before delivering the established tunnel; this allows some
  758. // time for the affinity server connection to succeed first.
  759. // When the affinity server connection fails, any other established
  760. // tunnel is registered without delay.
  761. //
  762. // Note: the establishTunnelWorker that receives the affinity
  763. // candidate is solely resonsible for closing
  764. // controller.serverAffinityDoneBroadcast.
  765. //
  766. // Note: if config.EgressRegion or config.TunnelProtocol has changed
  767. // since the top server was promoted, the first server may not actually
  768. // be the last connected server.
  769. // TODO: should not favor the first server in this case
  770. controller.serverAffinityDoneBroadcast = make(chan struct{})
  771. for i := 0; i < controller.config.ConnectionWorkerPoolSize; i++ {
  772. controller.establishWaitGroup.Add(1)
  773. go controller.establishTunnelWorker()
  774. }
  775. controller.establishWaitGroup.Add(1)
  776. go controller.establishCandidateGenerator(
  777. controller.getImpairedProtocols())
  778. }
  779. // stopEstablishing signals the establish goroutines to stop and waits
  780. // for the group to halt. pendingConns is used to interrupt any worker
  781. // blocked on a socket connect.
  782. func (controller *Controller) stopEstablishing() {
  783. if !controller.isEstablishing {
  784. return
  785. }
  786. NoticeInfo("stop establishing")
  787. close(controller.stopEstablishingBroadcast)
  788. // Note: interruptibleTCPClose doesn't really interrupt socket connects
  789. // and may leave goroutines running for a time after the Wait call.
  790. controller.establishPendingConns.CloseAll()
  791. // Note: establishCandidateGenerator closes controller.candidateServerEntries
  792. // (as it may be sending to that channel).
  793. controller.establishWaitGroup.Wait()
  794. controller.isEstablishing = false
  795. controller.establishWaitGroup = nil
  796. controller.stopEstablishingBroadcast = nil
  797. controller.candidateServerEntries = nil
  798. controller.serverAffinityDoneBroadcast = nil
  799. }
  800. // establishCandidateGenerator populates the candidate queue with server entries
  801. // from the data store. Server entries are iterated in rank order, so that promoted
  802. // servers with higher rank are priority candidates.
  803. func (controller *Controller) establishCandidateGenerator(impairedProtocols []string) {
  804. defer controller.establishWaitGroup.Done()
  805. defer close(controller.candidateServerEntries)
  806. iterator, err := NewServerEntryIterator(controller.config)
  807. if err != nil {
  808. NoticeAlert("failed to iterate over candidates: %s", err)
  809. controller.SignalComponentFailure()
  810. return
  811. }
  812. defer iterator.Close()
  813. isServerAffinityCandidate := true
  814. // TODO: reconcile server affinity scheme with multi-tunnel mode
  815. if controller.config.TunnelPoolSize > 1 {
  816. isServerAffinityCandidate = false
  817. close(controller.serverAffinityDoneBroadcast)
  818. }
  819. loop:
  820. // Repeat until stopped
  821. for i := 0; ; i++ {
  822. if !WaitForNetworkConnectivity(
  823. controller.config.NetworkConnectivityChecker,
  824. controller.stopEstablishingBroadcast,
  825. controller.shutdownBroadcast) {
  826. break loop
  827. }
  828. // Send each iterator server entry to the establish workers
  829. startTime := time.Now()
  830. for {
  831. serverEntry, err := iterator.Next()
  832. if err != nil {
  833. NoticeAlert("failed to get next candidate: %s", err)
  834. controller.SignalComponentFailure()
  835. break loop
  836. }
  837. if serverEntry == nil {
  838. // Completed this iteration
  839. break
  840. }
  841. // Disable impaired protocols. This is only done for the
  842. // first iteration of the ESTABLISH_TUNNEL_WORK_TIME
  843. // loop since (a) one iteration should be sufficient to
  844. // evade the attack; (b) there's a good chance of false
  845. // positives (such as short tunnel durations due to network
  846. // hopping on a mobile device).
  847. // Impaired protocols logic is not applied when
  848. // config.TunnelProtocol is specified.
  849. // The edited serverEntry is temporary copy which is not
  850. // stored or reused.
  851. if i == 0 && controller.config.TunnelProtocol == "" {
  852. serverEntry.DisableImpairedProtocols(impairedProtocols)
  853. if len(serverEntry.GetSupportedProtocols()) == 0 {
  854. // Skip this server entry, as it has no supported
  855. // protocols after disabling the impaired ones
  856. // TODO: modify ServerEntryIterator to skip these?
  857. continue
  858. }
  859. }
  860. // Note: there must be only one server affinity candidate, as it
  861. // closes the serverAffinityDoneBroadcast channel.
  862. candidate := &candidateServerEntry{serverEntry, isServerAffinityCandidate}
  863. isServerAffinityCandidate = false
  864. // TODO: here we could generate multiple candidates from the
  865. // server entry when there are many MeekFrontingAddresses.
  866. select {
  867. case controller.candidateServerEntries <- candidate:
  868. case <-controller.stopEstablishingBroadcast:
  869. break loop
  870. case <-controller.shutdownBroadcast:
  871. break loop
  872. }
  873. if time.Now().After(startTime.Add(ESTABLISH_TUNNEL_WORK_TIME)) {
  874. // Start over, after a brief pause, with a new shuffle of the server
  875. // entries, and potentially some newly fetched server entries.
  876. break
  877. }
  878. }
  879. // Free up resources now, but don't reset until after the pause.
  880. iterator.Close()
  881. // Trigger a fetch remote server list, since we may have failed to
  882. // connect with all known servers. Don't block sending signal, since
  883. // this signal may have already been sent.
  884. // Don't wait for fetch remote to succeed, since it may fail and
  885. // enter a retry loop and we're better off trying more known servers.
  886. // TODO: synchronize the fetch response, so it can be incorporated
  887. // into the server entry iterator as soon as available.
  888. select {
  889. case controller.signalFetchRemoteServerList <- *new(struct{}):
  890. default:
  891. }
  892. // Trigger an out-of-band upgrade availability check and download.
  893. // Since we may have failed to connect, we may benefit from upgrading
  894. // to a new client version with new circumvention capabilities.
  895. select {
  896. case controller.signalDownloadUpgrade <- "":
  897. default:
  898. }
  899. // After a complete iteration of candidate servers, pause before iterating again.
  900. // This helps avoid some busy wait loop conditions, and also allows some time for
  901. // network conditions to change. Also allows for fetch remote to complete,
  902. // in typical conditions (it isn't strictly necessary to wait for this, there will
  903. // be more rounds if required).
  904. timeout := time.After(ESTABLISH_TUNNEL_PAUSE_PERIOD)
  905. select {
  906. case <-timeout:
  907. // Retry iterating
  908. case <-controller.stopEstablishingBroadcast:
  909. break loop
  910. case <-controller.shutdownBroadcast:
  911. break loop
  912. }
  913. iterator.Reset()
  914. }
  915. NoticeInfo("stopped candidate generator")
  916. }
  917. // establishTunnelWorker pulls candidates from the candidate queue, establishes
  918. // a connection to the tunnel server, and delivers the established tunnel to a channel.
  919. func (controller *Controller) establishTunnelWorker() {
  920. defer controller.establishWaitGroup.Done()
  921. loop:
  922. for candidateServerEntry := range controller.candidateServerEntries {
  923. // Note: don't receive from candidateServerEntries and stopEstablishingBroadcast
  924. // in the same select, since we want to prioritize receiving the stop signal
  925. if controller.isStopEstablishingBroadcast() {
  926. break loop
  927. }
  928. // There may already be a tunnel to this candidate. If so, skip it.
  929. if controller.isActiveTunnelServerEntry(candidateServerEntry.serverEntry) {
  930. continue
  931. }
  932. tunnel, err := EstablishTunnel(
  933. controller.config,
  934. controller.untunneledDialConfig,
  935. controller.sessionId,
  936. controller.establishPendingConns,
  937. candidateServerEntry.serverEntry,
  938. controller) // TunnelOwner
  939. if err != nil {
  940. // Unblock other candidates immediately when
  941. // server affinity candidate fails.
  942. if candidateServerEntry.isServerAffinityCandidate {
  943. close(controller.serverAffinityDoneBroadcast)
  944. }
  945. // Before emitting error, check if establish interrupted, in which
  946. // case the error is noise.
  947. if controller.isStopEstablishingBroadcast() {
  948. break loop
  949. }
  950. NoticeInfo("failed to connect to %s: %s", candidateServerEntry.serverEntry.IpAddress, err)
  951. continue
  952. }
  953. // Block for server affinity grace period before delivering.
  954. if !candidateServerEntry.isServerAffinityCandidate {
  955. timer := time.NewTimer(ESTABLISH_TUNNEL_SERVER_AFFINITY_GRACE_PERIOD)
  956. select {
  957. case <-timer.C:
  958. case <-controller.serverAffinityDoneBroadcast:
  959. case <-controller.stopEstablishingBroadcast:
  960. }
  961. }
  962. // Deliver established tunnel.
  963. // Don't block. Assumes the receiver has a buffer large enough for
  964. // the number of desired tunnels. If there's no room, the tunnel must
  965. // not be required so it's discarded.
  966. select {
  967. case controller.establishedTunnels <- tunnel:
  968. default:
  969. controller.discardTunnel(tunnel)
  970. }
  971. // Unblock other candidates only after delivering when
  972. // server affinity candidate succeeds.
  973. if candidateServerEntry.isServerAffinityCandidate {
  974. close(controller.serverAffinityDoneBroadcast)
  975. }
  976. }
  977. NoticeInfo("stopped establish worker")
  978. }
  979. func (controller *Controller) isStopEstablishingBroadcast() bool {
  980. select {
  981. case <-controller.stopEstablishingBroadcast:
  982. return true
  983. default:
  984. }
  985. return false
  986. }