Browse Source

Add Conjure registration caching logic

- Don't replace a cache entry when there is
  an existing one with the same key.

- Delete a cache entry when a tunnel fails
  later in its lifecycle.

- Log the reason when setting a cache entry.
Rod Hynes 4 years ago
parent
commit
06616d2ea2

+ 59 - 4
psiphon/common/refraction/refraction.go

@@ -419,7 +419,9 @@ func dial(
 
 	if useConjure && (conjureCachedRegistration != nil || conjureRecordRegistrar != nil) {
 
-		if err == nil || ctx.Err() == context.Canceled {
+		isCanceled := (err != nil && ctx.Err() == context.Canceled)
+
+		if err == nil || isCanceled {
 
 			registration := conjureCachedRegistration
 			if registration == nil {
@@ -437,14 +439,16 @@ func dial(
 				// connection is established.
 				registration.TcpDialer = nil
 
-				conjureRegistrationCache.put(conjureConfig, registration)
+				conjureRegistrationCache.put(conjureConfig, registration, isCanceled)
 			}
 
 		} else if conjureCachedRegistration != nil {
 
 			conjureConfig.Logger.WithTraceFields(
 				common.LogFields{
-					"diagnosticID": conjureConfig.DiagnosticID}).Info(
+					"diagnosticID": conjureConfig.DiagnosticID,
+					"reason":       "phantom dial failed",
+				}).Info(
 				"drop cached registration")
 		}
 	}
@@ -471,6 +475,10 @@ func dial(
 	return refractionConn, nil
 }
 
+func DeleteCachedConjureRegistration(config *ConjureConfig) {
+	conjureRegistrationCache.delete(config)
+}
+
 type registrationCache struct {
 	mutex sync.Mutex
 	TTL   time.Duration
@@ -488,7 +496,8 @@ func newRegistrationCache() *registrationCache {
 
 func (c *registrationCache) put(
 	config *ConjureConfig,
-	registration *refraction_networking_client.ConjureReg) {
+	registration *refraction_networking_client.ConjureReg,
+	isCanceled bool) {
 
 	c.mutex.Lock()
 	defer c.mutex.Unlock()
@@ -502,10 +511,37 @@ func (c *registrationCache) put(
 		c.TTL = config.RegistrationCacheTTL
 	}
 
+	// Drop the cached registration if another entry is found under the same key.
+	// Since the dial pops its entry out of the cache, finding an existing entry
+	// implies that another tunnel establishment candidate with the same key has
+	// successfully registered and connected (or canceled) in the meantime.
+	// Prefer that newer cached registration.
+	//
+	// For Psiphon, one scenario resulting in this condition is that the first
+	// dial to a given server, using a cached registration, is delayed long
+	// enough that a new candidate for the same server has been started and
+	// outpaced the first candidate.
+	_, found := c.cache.Get(config.RegistrationCacheKey)
+	if found {
+		config.Logger.WithTraceFields(
+			common.LogFields{
+				"diagnosticID": config.DiagnosticID,
+				"reason":       "existing entry found",
+			}).Info(
+			"drop cached registration")
+		return
+	}
+
+	reason := "connected"
+	if isCanceled {
+		reason = "canceled"
+	}
+
 	config.Logger.WithTraceFields(
 		common.LogFields{
 			"diagnosticID": config.DiagnosticID,
 			"cacheSize":    c.cache.ItemCount(),
+			"reason":       reason,
 		}).Info(
 		"put cached registration")
 
@@ -545,6 +581,25 @@ func (c *registrationCache) pop(
 	return nil
 }
 
+func (c *registrationCache) delete(config *ConjureConfig) {
+
+	c.mutex.Lock()
+	defer c.mutex.Unlock()
+
+	_, found := c.cache.Get(config.RegistrationCacheKey)
+
+	config.Logger.WithTraceFields(
+		common.LogFields{
+			"diagnosticID": config.DiagnosticID,
+			"found":        found,
+		}).Info(
+		"delete cached registration")
+
+	if found {
+		c.cache.Delete(config.RegistrationCacheKey)
+	}
+}
+
 var conjureRegistrationCache = newRegistrationCache()
 
 type cachedRegistrar struct {

+ 3 - 0
psiphon/common/refraction/refraction_disabled.go

@@ -53,3 +53,6 @@ func DialTapDance(_ context.Context, _ bool, _ string, _ common.NetDialer, _ str
 func DialConjure(_ context.Context, _ bool, _ string, _ common.NetDialer, _ string, _ *ConjureConfig) (net.Conn, error) {
 	return nil, errors.TraceNew("operation is not enabled")
 }
+
+func DeleteCachedConjureRegistration(_ *ConjureConfig) {
+}

+ 39 - 3
psiphon/tunnel.go

@@ -87,6 +87,7 @@ type Tunnel struct {
 	isClosed                       bool
 	dialParams                     *DialParameters
 	livenessTestMetrics            *livenessTestMetrics
+	extraFailureAction             func()
 	serverContext                  *ServerContext
 	monitoringStartTime            time.Time
 	conn                           *common.BurstMonitoredConn
@@ -156,6 +157,7 @@ func ConnectTunnel(
 		config:              config,
 		dialParams:          dialParams,
 		livenessTestMetrics: dialResult.livenessTestMetrics,
+		extraFailureAction:  dialResult.extraFailureAction,
 		monitoringStartTime: dialResult.monitoringStartTime,
 		conn:                dialResult.monitoredConn,
 		sshClient:           dialResult.sshClient,
@@ -179,7 +181,7 @@ func (tunnel *Tunnel) Activate(
 	activationSucceeded := false
 	baseCtx := ctx
 	defer func() {
-		if !activationSucceeded && baseCtx.Err() == nil {
+		if !activationSucceeded && baseCtx.Err() != context.Canceled {
 			tunnel.dialParams.Failed(tunnel.config)
 			_ = RecordFailedTunnelStat(
 				tunnel.config,
@@ -188,6 +190,9 @@ func (tunnel *Tunnel) Activate(
 				-1,
 				-1,
 				retErr)
+			if tunnel.extraFailureAction != nil {
+				tunnel.extraFailureAction()
+			}
 		}
 	}()
 
@@ -653,6 +658,7 @@ type dialResult struct {
 	sshClient           *ssh.Client
 	sshRequests         <-chan *ssh.Request
 	livenessTestMetrics *livenessTestMetrics
+	extraFailureAction  func()
 }
 
 // dialTunnel is a helper that builds the transport layers and establishes the
@@ -689,14 +695,17 @@ func dialTunnel(
 	// parameters are cleared, no longer to be retried, if the tunnel fails to
 	// connect.
 	//
+	//
+	//
 	// Limitation: dials that fail to connect due to the server being in a
 	// load-limiting state are not distinguished and excepted from this
 	// logic.
 	dialSucceeded := false
 	baseCtx := ctx
 	var failedTunnelLivenessTestMetrics *livenessTestMetrics
+	var extraFailureAction func()
 	defer func() {
-		if !dialSucceeded && baseCtx.Err() == nil {
+		if !dialSucceeded && baseCtx.Err() != context.Canceled {
 			dialParams.Failed(config)
 			_ = RecordFailedTunnelStat(
 				config,
@@ -705,6 +714,9 @@ func dialTunnel(
 				-1,
 				-1,
 				retErr)
+			if extraFailureAction != nil {
+				extraFailureAction()
+			}
 		}
 	}()
 
@@ -821,6 +833,25 @@ func dialTunnel(
 			Logger:               NoticeCommonLogger(),
 		}
 
+		// Set extraFailureAction, which is invoked whenever the tunnel fails (i.e.,
+		// where RecordFailedTunnelStat is invoked). The action will remove any
+		// cached registration. When refraction.DialConjure succeeds, the underlying
+		// registration is cached. After refraction.DialConjure returns, it no
+		// longer modifies the cached state of that registration, assuming that it
+		// remains valid and effective. However adversarial impact on a given
+		// phantom IP may not become evident until after the initial TCP connection
+		// establishment and handshake performed by refraction.DialConjure. For
+		// example, it may be that the phantom dial is targeted for severe
+		// throttling which begins or is only evident later in the flow. Scheduling
+		// a call to DeleteCachedConjureRegistration allows us to invalidate the
+		// cached registration for a tunnel that fails later in its lifecycle.
+		//
+		// Note that extraFailureAction will retain a reference to conjureConfig for
+		// the lifetime of the tunnel.
+		extraFailureAction = func() {
+			refraction.DeleteCachedConjureRegistration(conjureConfig)
+		}
+
 		if dialParams.ConjureAPIRegistration {
 
 			// Use MeekConn to domain front Conjure API registration.
@@ -1137,7 +1168,9 @@ func dialTunnel(
 			monitoredConn:       monitoredConn,
 			sshClient:           result.sshClient,
 			sshRequests:         result.sshRequests,
-			livenessTestMetrics: result.livenessTestMetrics},
+			livenessTestMetrics: result.livenessTestMetrics,
+			extraFailureAction:  extraFailureAction,
+		},
 		nil
 }
 
@@ -1723,6 +1756,9 @@ loop:
 				bytesUp,
 				bytesDown,
 				err)
+			if tunnel.extraFailureAction != nil {
+				tunnel.extraFailureAction()
+			}
 
 			// SSHKeepAliveResetOnFailureProbability is set when a late-lifecycle
 			// impaired protocol attack is suspected. With the given probability, reset