Browse Source

Do not set timeout for packetman netlink I/O

- Avoids phenomenon of "orphaned" packets
  filling the queue.

- Less overhead from timeouts/re-reading.
Rod Hynes 5 years ago
parent
commit
7acfd1e25b
1 changed files with 16 additions and 23 deletions
  1. 16 23
      psiphon/common/packetman/packetman_linux.go

+ 16 - 23
psiphon/common/packetman/packetman_linux.go

@@ -43,9 +43,8 @@ func IsSupported() bool {
 }
 
 const (
-	netlinkSocketIOTimeout = 10 * time.Millisecond
-	defaultSocketMark      = 0x70736970 // "PSIP"
-	appliedSpecCacheTTL    = 1 * time.Minute
+	defaultSocketMark   = 0x70736970 // "PSIP"
+	appliedSpecCacheTTL = 1 * time.Minute
 )
 
 // Manipulator is a SYN-ACK packet manipulator.
@@ -200,9 +199,13 @@ func (m *Manipulator) Start() (retErr error) {
 	// via https://github.com/florianl/go-nfqueue/issues/3.
 	maxQueueLen := uint32(1024)
 
-	// Note: runContext alone is not sufficient to interrupt the
-	// nfqueue.socketCallback goroutine spawned by nfqueue.Register; timeouts
-	// must be set. See comment in Manipulator.Stop.
+	// Timeout note: on a small subset of production servers, we have found that
+	// setting a non-zero read timeout results in occasional "orphaned" packets
+	// which remain in the queue but are not delivered to handleInterceptedPacket
+	// for a verdict. This phenomenon leads to a stall in nfqueue processing once
+	// the queue fills up with packers apparently awaiting a verdict. The shorter
+	// the timeout, the faster that orphaned packets accumulate. With no timeout,
+	// and reads in blocking mode, this phenomenon does not occur.
 
 	m.nfqueue, err = nfqueue.Open(
 		&nfqueue.Config{
@@ -211,8 +214,8 @@ func (m *Manipulator) Start() (retErr error) {
 			MaxQueueLen:  maxQueueLen,
 			Copymode:     nfqueue.NfQnlCopyPacket,
 			Logger:       newNfqueueLogger(m.config.Logger),
-			ReadTimeout:  netlinkSocketIOTimeout,
-			WriteTimeout: netlinkSocketIOTimeout,
+			ReadTimeout:  0,
+			WriteTimeout: 0,
 		})
 	if err != nil {
 		return errors.Trace(err)
@@ -252,23 +255,13 @@ func (m *Manipulator) Stop() {
 		return
 	}
 
-	m.stopRunning()
+	// Call stopRunning before interrupting the blocked read; this ensures that
+	// the nfqueue socketCallback loop will exit after the read is interrupted.
 
-	// stopRunning will cancel the context passed into nfqueue.Register. The
-	// goroutine spawned by Register, nfqueue.socketCallback, polls the context
-	// after a read timeout:
-	// https://github.com/florianl/go-nfqueue/blob/1e38df738c06deffbac08da8fec4b7c28a69b918/nfqueue_gteq_1.12.go#L138-L146
-	//
-	// There's no stop synchronization exposed by nfqueue. Calling nfqueue.Close
-	// while socketCallback is still running can result in errors such as
-	// "nfqueuenfqueue_gteq_1.12.go:134: Could not unbind from queue: netlink
-	// send: sendmsg: bad file descriptor".
-	//
-	// To avoid invalid file descriptor operations and spurious error messages,
-	// sleep for two polling periods, which should be sufficient, in most cases,
-	// for socketCallback to poll the context and exit.
+	m.stopRunning()
 
-	time.Sleep(2 * netlinkSocketIOTimeout)
+	// Interrupt a blocked read.
+	m.nfqueue.Con.SetDeadline(time.Unix(0, 1))
 
 	m.nfqueue.Close()