Jelajahi Sumber

Fix: timeout tunnel.Activate

- Underlying SendAPIRequest for handshake had
  no explicit timeout, leaving tunnel-core
  hanging when Activate was pending for a dead
  tunnel.

- Move goroutine for handshake into Activate,
  to simplify controller.runTunnels code.
Rod Hynes 8 tahun lalu
induk
melakukan
fd9bcb7d74
2 mengubah file dengan 57 tambahan dan 26 penghapusan
  1. 3 21
      psiphon/controller.go
  2. 54 5
      psiphon/tunnel.go

+ 3 - 21
psiphon/controller.go

@@ -677,6 +677,8 @@ loop:
 
 			active, outstanding := controller.numTunnels()
 
+			// discardTunnel will be true here when already fully established.
+
 			discardTunnel := (outstanding <= 0)
 			isFirstTunnel := (active == 0)
 			isLastTunnel := (outstanding == 1)
@@ -687,26 +689,7 @@ loop:
 					controller.stopEstablishing()
 				}
 
-				// Call connectedTunnel.Activate in a goroutine, as it blocks on a network
-				// operation and would block shutdown. If the shutdown signal is received,
-				// discard the tunnel, which will interrupt the handshake request that may
-				// be blocking Activate.
-
-				activatedTunnelResult := make(chan error)
-
-				go func() {
-					activatedTunnelResult <- connectedTunnel.Activate(controller)
-				}()
-
-				var err error
-				select {
-				case err = <-activatedTunnelResult:
-				case <-controller.shutdownBroadcast:
-					controller.discardTunnel(connectedTunnel)
-					// Await the interrupted goroutine.
-					<-activatedTunnelResult
-					break loop
-				}
+				err := connectedTunnel.Activate(controller, controller.shutdownBroadcast)
 
 				if err != nil {
 					NoticeAlert("failed to activate %s: %s", connectedTunnel.serverEntry.IpAddress, err)
@@ -729,7 +712,6 @@ loop:
 			}
 
 			if discardTunnel {
-				// Already fully established, so discard.
 				controller.discardTunnel(connectedTunnel)
 
 				// Clear the reference to this discarded tunnel and immediately run

+ 54 - 5
psiphon/tunnel.go

@@ -186,7 +186,9 @@ func ConnectTunnel(
 // Activate completes the tunnel establishment, performing the handshake
 // request and starting operateTunnel, the worker that monitors the tunnel
 // and handles periodic management.
-func (tunnel *Tunnel) Activate(tunnelOwner TunnelOwner) error {
+func (tunnel *Tunnel) Activate(
+	tunnelOwner TunnelOwner,
+	shutdownBroadcast chan struct{}) error {
 
 	// Create a new Psiphon API server context for this tunnel. This includes
 	// performing a handshake request. If the handshake fails, this activation
@@ -194,13 +196,60 @@ func (tunnel *Tunnel) Activate(tunnelOwner TunnelOwner) error {
 	var serverContext *ServerContext
 	if !tunnel.config.DisableApi {
 		NoticeInfo("starting server context for %s", tunnel.serverEntry.IpAddress)
-		var err error
-		serverContext, err = NewServerContext(tunnel)
-		if err != nil {
+
+		// Call NewServerContext in a goroutine, as it blocks on a network operation,
+		// the handshake request, and would block shutdown. If the shutdown signal is
+		// received, close the tunnel, which will interrupt the handshake request
+		// that may be blocking NewServerContext.
+		//
+		// Timeout after SshKeepAliveProbeTimeoutSeconds. NewServerContext may not
+		// return if the tunnel network connection is unstable during the handshake
+		// request. At this point, there is no operateTunnel monitor that will detect
+		// this condition with SSH keep alives. SshKeepAliveProbeTimeoutSeconds is
+		// used as the timeout deadline as it represents the time after which we wish
+		// to restart establishing when a tunnel has died.
+
+		type newServerContextResult struct {
+			serverContext *ServerContext
+			err           error
+		}
+
+		resultChannel := make(chan newServerContextResult)
+
+		go func() {
+			serverContext, err := NewServerContext(tunnel)
+			resultChannel <- newServerContextResult{
+				serverContext: serverContext,
+				err:           err,
+			}
+		}()
+
+		timer := time.NewTimer(
+			time.Second *
+				time.Duration(
+					*tunnel.config.TunnelSshKeepAliveProbeTimeoutSeconds))
+
+		var result newServerContextResult
+		select {
+		case result = <-resultChannel:
+		case <-timer.C:
+			result.err = errors.New("timed out")
+			// Interrupt the Activate goroutine and await its completion.
+			tunnel.Close(true)
+			<-resultChannel
+		case <-shutdownBroadcast:
+			result.err = errors.New("shutdown")
+			tunnel.Close(true)
+			<-resultChannel
+		}
+
+		if result.err != nil {
 			return common.ContextError(
 				fmt.Errorf("error starting server context for %s: %s",
-					tunnel.serverEntry.IpAddress, err))
+					tunnel.serverEntry.IpAddress, result.err))
 		}
+
+		serverContext = result.serverContext
 	}
 
 	tunnel.mutex.Lock()