Browse Source

Merge pull request #491 from rod-hynes/master

Obfuscated QUIC and new metrics
Rod Hynes 7 years ago
parent
commit
6ecbbd4d5f
47 changed files with 5101 additions and 450 deletions
  1. 1 0
      README.md
  2. 9 0
      psiphon/TCPConn.go
  3. 2 0
      psiphon/TCPConn_bind.go
  4. 2 0
      psiphon/UDPConn_bind.go
  5. 141 22
      psiphon/common/fragmentor/fragmentor.go
  6. 26 9
      psiphon/common/fragmentor/fragmentor_test.go
  7. 8 0
      psiphon/common/logger.go
  8. 58 38
      psiphon/common/obfuscator/obfuscatedSshConn.go
  9. 17 8
      psiphon/common/obfuscator/obfuscator.go
  10. 2 2
      psiphon/common/obfuscator/obfuscator_test.go
  11. 16 8
      psiphon/common/parameters/clientParameters.go
  12. 5 3
      psiphon/common/protocol/protocol.go
  13. 384 0
      psiphon/common/quic/obfuscator.go
  14. 112 7
      psiphon/common/quic/quic.go
  15. 8 2
      psiphon/common/quic/quic_test.go
  16. 27 35
      psiphon/common/tactics/tactics.go
  17. 2 1
      psiphon/controller.go
  18. 0 107
      psiphon/fragmentor.go
  19. 6 16
      psiphon/meekConn.go
  20. 5 0
      psiphon/net.go
  21. 26 0
      psiphon/notice.go
  22. 78 38
      psiphon/server/api.go
  23. 0 8
      psiphon/server/log.go
  24. 12 10
      psiphon/server/meek.go
  25. 64 7
      psiphon/server/server_test.go
  26. 85 29
      psiphon/server/tunnelServer.go
  27. 26 0
      psiphon/serverApi.go
  28. 21 10
      psiphon/tlsDialer.go
  29. 51 30
      psiphon/tunnel.go
  30. 0 6
      vendor/github.com/Psiphon-Labs/tls-tris/13.go
  31. 1 0
      vendor/github.com/Psiphon-Labs/tls-tris/alert.go
  32. 13 17
      vendor/github.com/Psiphon-Labs/tls-tris/common.go
  33. 13 7
      vendor/github.com/Psiphon-Labs/tls-tris/conn.go
  34. 23 6
      vendor/github.com/Psiphon-Labs/tls-tris/handshake_client.go
  35. 40 3
      vendor/github.com/Psiphon-Labs/tls-tris/handshake_messages.go
  36. 14 8
      vendor/github.com/Psiphon-Labs/tls-tris/handshake_server.go
  37. 16 8
      vendor/github.com/Psiphon-Labs/tls-tris/prf.go
  38. 10 2
      vendor/github.com/Psiphon-Labs/tls-tris/ticket.go
  39. 122 0
      vendor/github.com/Yawning/chacha20/LICENSE
  40. 14 0
      vendor/github.com/Yawning/chacha20/README.md
  41. 273 0
      vendor/github.com/Yawning/chacha20/chacha20.go
  42. 95 0
      vendor/github.com/Yawning/chacha20/chacha20_amd64.go
  43. 1295 0
      vendor/github.com/Yawning/chacha20/chacha20_amd64.py
  44. 1180 0
      vendor/github.com/Yawning/chacha20/chacha20_amd64.s
  45. 394 0
      vendor/github.com/Yawning/chacha20/chacha20_ref.go
  46. 395 0
      vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go
  47. 9 3
      vendor/vendor.json

+ 1 - 0
README.md

@@ -101,6 +101,7 @@ Psiphon Tunnel Core uses:
 * [utls](https://github.com/refraction-networking/utls)
 * [quic-go](https://github.com/lucas-clemente/quic-go)
 * [tls-tris](https://github.com/cloudflare/tls-tris)
+* [chacha20](https://github.com/Yawning/chacha20)
 
 Licensing
 --------------------------------------------------------------------------------

+ 9 - 0
psiphon/TCPConn.go

@@ -27,6 +27,7 @@ import (
 	"sync/atomic"
 
 	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common"
+	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/fragmentor"
 	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/upstreamproxy"
 )
 
@@ -79,6 +80,14 @@ func DialTCP(
 			config.ResolvedIPCallback(ipAddress)
 		}
 	}
+
+	if config.FragmentorConfig.IsFragmenting() {
+		conn = fragmentor.NewConn(
+			config.FragmentorConfig,
+			func(message string) { NoticeInfo(message) },
+			conn)
+	}
+
 	return conn, nil
 }
 

+ 2 - 0
psiphon/TCPConn_bind.go

@@ -41,6 +41,8 @@ import (
 // The sequence of syscalls in this implementation is taken from:
 // https://github.com/golang/go/issues/6966
 // (originally: https://code.google.com/p/go/issues/detail?id=6966)
+//
+// TODO: use https://golang.org/pkg/net/#Dialer.Control, introduced in Go 1.11?
 func tcpDial(ctx context.Context, addr string, config *DialConfig) (net.Conn, error) {
 
 	// Get the remote IP and port, resolving a domain name if necessary

+ 2 - 0
psiphon/UDPConn_bind.go

@@ -32,6 +32,8 @@ import (
 
 func newUDPConn(domain int, config *DialConfig) (net.PacketConn, error) {
 
+	// TODO: use https://golang.org/pkg/net/#Dialer.Control, introduced in Go 1.11?
+
 	socketFD, err := syscall.Socket(domain, syscall.SOCK_DGRAM, 0)
 	if err != nil {
 		return nil, common.ContextError(err)

+ 141 - 22
psiphon/common/fragmentor/fragmentor.go

@@ -29,6 +29,7 @@ import (
 	"time"
 
 	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common"
+	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/parameters"
 )
 
 const (
@@ -36,6 +37,88 @@ const (
 	MAX_FRAGMENTOR_ITERATIONS_PER_NOTICE = 5
 )
 
+// Config specifies a fragmentor configuration. NewUpstreamConfig and
+// NewDownstreamConfig will generate configurations based on the given
+// client parameters.
+type Config struct {
+	isUpstream      bool
+	bytesToFragment int
+	minWriteBytes   int
+	maxWriteBytes   int
+	minDelay        time.Duration
+	maxDelay        time.Duration
+}
+
+// NewUpstreamConfig creates a new Config; may return nil.
+func NewUpstreamConfig(
+	p *parameters.ClientParametersSnapshot, tunnelProtocol string) *Config {
+	return newConfig(p, true, tunnelProtocol)
+}
+
+// NewDownstreamConfig creates a new Config; may return nil.
+func NewDownstreamConfig(
+	p *parameters.ClientParametersSnapshot, tunnelProtocol string) *Config {
+	return newConfig(p, false, tunnelProtocol)
+}
+
+func newConfig(
+	p *parameters.ClientParametersSnapshot,
+	isUpstream bool,
+	tunnelProtocol string) *Config {
+
+	probability := parameters.FragmentorProbability
+	limitProtocols := parameters.FragmentorLimitProtocols
+	minTotalBytes := parameters.FragmentorMinTotalBytes
+	maxTotalBytes := parameters.FragmentorMaxTotalBytes
+	minWriteBytes := parameters.FragmentorMinWriteBytes
+	maxWriteBytes := parameters.FragmentorMaxWriteBytes
+	minDelay := parameters.FragmentorMinDelay
+	maxDelay := parameters.FragmentorMaxDelay
+
+	if !isUpstream {
+		probability = parameters.FragmentorDownstreamProbability
+		limitProtocols = parameters.FragmentorDownstreamLimitProtocols
+		minTotalBytes = parameters.FragmentorDownstreamMinTotalBytes
+		maxTotalBytes = parameters.FragmentorDownstreamMaxTotalBytes
+		minWriteBytes = parameters.FragmentorDownstreamMinWriteBytes
+		maxWriteBytes = parameters.FragmentorDownstreamMaxWriteBytes
+		minDelay = parameters.FragmentorDownstreamMinDelay
+		maxDelay = parameters.FragmentorDownstreamMaxDelay
+	}
+
+	coinFlip := p.WeightedCoinFlip(probability)
+	tunnelProtocols := p.TunnelProtocols(limitProtocols)
+
+	if !coinFlip || (len(tunnelProtocols) > 0 && !common.Contains(tunnelProtocols, tunnelProtocol)) {
+		return nil
+	}
+
+	bytesToFragment, err := common.MakeSecureRandomRange(
+		p.Int(minTotalBytes), p.Int(maxTotalBytes))
+	if err != nil {
+		bytesToFragment = 0
+	}
+
+	if bytesToFragment == 0 {
+		return nil
+	}
+
+	return &Config{
+		isUpstream:      isUpstream,
+		bytesToFragment: bytesToFragment,
+		minWriteBytes:   p.Int(minWriteBytes),
+		maxWriteBytes:   p.Int(maxWriteBytes),
+		minDelay:        p.Duration(minDelay),
+		maxDelay:        p.Duration(maxDelay),
+	}
+}
+
+// IsFragmenting indicates whether the fragmentor configuration results in any
+// fragmentation; config may be nil.
+func (config *Config) IsFragmenting() bool {
+	return config != nil && config.bytesToFragment > 0
+}
+
 // Conn implements simple fragmentation of application-level messages/packets
 // into multiple TCP packets by splitting writes into smaller sizes and adding
 // delays between writes.
@@ -46,39 +129,61 @@ const (
 // portion of a TCP flow.
 type Conn struct {
 	net.Conn
+	config          *Config
 	noticeEmitter   func(string)
 	runCtx          context.Context
 	stopRunning     context.CancelFunc
 	isClosed        int32
 	writeMutex      sync.Mutex
 	numNotices      int
-	bytesToFragment int
 	bytesFragmented int
-	minWriteBytes   int
-	maxWriteBytes   int
-	minDelay        time.Duration
-	maxDelay        time.Duration
+	maxBytesWritten int
+	minBytesWritten int
+	minDelayed      time.Duration
+	maxDelayed      time.Duration
 }
 
 // NewConn creates a new Conn.
 func NewConn(
-	conn net.Conn,
+	config *Config,
 	noticeEmitter func(string),
-	bytesToFragment, minWriteBytes, maxWriteBytes int,
-	minDelay, maxDelay time.Duration) *Conn {
+	conn net.Conn) *Conn {
 
 	runCtx, stopRunning := context.WithCancel(context.Background())
 	return &Conn{
-		Conn:            conn,
-		noticeEmitter:   noticeEmitter,
-		runCtx:          runCtx,
-		stopRunning:     stopRunning,
-		bytesToFragment: bytesToFragment,
-		minWriteBytes:   minWriteBytes,
-		maxWriteBytes:   maxWriteBytes,
-		minDelay:        minDelay,
-		maxDelay:        maxDelay,
+		Conn:          conn,
+		config:        config,
+		noticeEmitter: noticeEmitter,
+		runCtx:        runCtx,
+		stopRunning:   stopRunning,
+	}
+}
+
+// GetMetrics implements the common.MetricsSource interface.
+func (c *Conn) GetMetrics() common.LogFields {
+	c.writeMutex.Lock()
+	defer c.writeMutex.Unlock()
+
+	logFields := make(common.LogFields)
+
+	if c.bytesFragmented == 0 {
+		return logFields
 	}
+
+	var prefix string
+	if c.config.isUpstream {
+		prefix = "upstream_"
+	} else {
+		prefix = "downstream_"
+	}
+
+	logFields[prefix+"bytes_fragmented"] = c.bytesFragmented
+	logFields[prefix+"min_bytes_written"] = c.minBytesWritten
+	logFields[prefix+"max_bytes_written"] = c.maxBytesWritten
+	logFields[prefix+"min_delayed"] = int(c.minDelayed / time.Microsecond)
+	logFields[prefix+"max_delayed"] = int(c.maxDelayed / time.Microsecond)
+
+	return logFields
 }
 
 func (c *Conn) Write(buffer []byte) (int, error) {
@@ -86,7 +191,7 @@ func (c *Conn) Write(buffer []byte) (int, error) {
 	c.writeMutex.Lock()
 	defer c.writeMutex.Unlock()
 
-	if c.bytesFragmented >= c.bytesToFragment {
+	if c.bytesFragmented >= c.config.bytesToFragment {
 		return c.Conn.Write(buffer)
 	}
 
@@ -112,9 +217,9 @@ func (c *Conn) Write(buffer []byte) (int, error) {
 	for iterations := 0; len(buffer) > 0; iterations += 1 {
 
 		delay, err := common.MakeSecureRandomPeriod(
-			c.minDelay, c.maxDelay)
+			c.config.minDelay, c.config.maxDelay)
 		if err != nil {
-			delay = c.minDelay
+			delay = c.config.minDelay
 		}
 
 		timer := time.NewTimer(delay)
@@ -130,12 +235,12 @@ func (c *Conn) Write(buffer []byte) (int, error) {
 			return totalBytesWritten, err
 		}
 
-		minWriteBytes := c.minWriteBytes
+		minWriteBytes := c.config.minWriteBytes
 		if minWriteBytes > len(buffer) {
 			minWriteBytes = len(buffer)
 		}
 
-		maxWriteBytes := c.maxWriteBytes
+		maxWriteBytes := c.config.maxWriteBytes
 		if maxWriteBytes > len(buffer) {
 			maxWriteBytes = len(buffer)
 		}
@@ -155,6 +260,20 @@ func (c *Conn) Write(buffer []byte) (int, error) {
 			return totalBytesWritten, err
 		}
 
+		if c.minBytesWritten == 0 || c.minBytesWritten > bytesWritten {
+			c.minBytesWritten = bytesWritten
+		}
+		if c.maxBytesWritten < bytesWritten {
+			c.maxBytesWritten = bytesWritten
+		}
+
+		if c.minDelayed == 0 || c.minDelayed > delay {
+			c.minDelayed = delay
+		}
+		if c.maxDelayed < delay {
+			c.maxDelayed = delay
+		}
+
 		if emitNotice {
 			if iterations < MAX_FRAGMENTOR_ITERATIONS_PER_NOTICE {
 				fmt.Fprintf(&notice, " [%s] %d", delay, bytesWritten)

+ 26 - 9
psiphon/common/fragmentor/fragmentor_test.go

@@ -29,6 +29,8 @@ import (
 	"time"
 
 	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common"
+	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/parameters"
+	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/protocol"
 	"golang.org/x/sync/errgroup"
 )
 
@@ -55,6 +57,24 @@ func TestFragmentor(t *testing.T) {
 	minDelay := 2 * time.Millisecond
 	maxDelay := 2 * time.Millisecond
 
+	clientParameters, err := parameters.NewClientParameters(nil)
+	if err != nil {
+		t.Fatalf("parameters.NewClientParameters failed: %s", err)
+	}
+	_, err = clientParameters.Set("", false, map[string]interface{}{
+		"FragmentorProbability":    1.0,
+		"FragmentorLimitProtocols": protocol.TunnelProtocols{},
+		"FragmentorMinTotalBytes":  bytesFragmented,
+		"FragmentorMaxTotalBytes":  bytesFragmented,
+		"FragmentorMinWriteBytes":  minWriteBytes,
+		"FragmentorMaxWriteBytes":  maxWriteBytes,
+		"FragmentorMinDelay":       minDelay,
+		"FragmentorMaxDelay":       maxDelay,
+	})
+	if err != nil {
+		t.Fatalf("ClientParameters.Set failed: %s", err)
+	}
+
 	testGroup, testCtx := errgroup.WithContext(context.Background())
 
 	testGroup.Go(func() error {
@@ -86,19 +106,16 @@ func TestFragmentor(t *testing.T) {
 		if err != nil {
 			return common.ContextError(err)
 		}
-		conn = NewConn(
-			conn,
+		fragConn := NewConn(
+			NewUpstreamConfig(clientParameters.Get(), ""),
 			func(message string) { t.Logf(message) },
-			bytesFragmented,
-			minWriteBytes,
-			maxWriteBytes,
-			minDelay,
-			maxDelay)
-		defer conn.Close()
-		_, err = conn.Write(data)
+			conn)
+		defer fragConn.Close()
+		_, err = fragConn.Write(data)
 		if err != nil {
 			return common.ContextError(err)
 		}
+		t.Logf("%+v", fragConn.GetMetrics())
 		return nil
 	})
 

+ 8 - 0
psiphon/common/logger.go

@@ -42,3 +42,11 @@ type LogContext interface {
 // LogFields is type-compatible with psiphon/server.LogFields
 // and logrus.LogFields.
 type LogFields map[string]interface{}
+
+// MetricsSource is an object that provides metrics to be logged.
+type MetricsSource interface {
+
+	// GetMetrics returns a LogFields populated with
+	// metrics from the MetricsSource.
+	GetMetrics() LogFields
+}

+ 58 - 38
psiphon/common/obfuscator/obfuscatedSshConn.go

@@ -38,11 +38,11 @@ const (
 	SSH_PADDING_MULTIPLE       = 16  // Default cipher block size
 )
 
-// ObfuscatedSshConn wraps a Conn and applies the obfuscated SSH protocol
+// ObfuscatedSSHConn wraps a Conn and applies the obfuscated SSH protocol
 // to the traffic on the connection:
 // https://github.com/brl/obfuscated-openssh/blob/master/README.obfuscation
 //
-// ObfuscatedSshConn is used to add obfuscation to golang's stock ssh
+// ObfuscatedSSHConn is used to add obfuscation to golang's stock "ssh"
 // client and server without modification to that standard library code.
 // The underlying connection must be used for SSH traffic. This code
 // injects the obfuscated seed message, applies obfuscated stream cipher
@@ -54,28 +54,29 @@ const (
 // no synchronization of access to the read/writeBuffers, so concurrent
 // calls to one of Read or Write will result in undefined behavior.
 //
-type ObfuscatedSshConn struct {
+type ObfuscatedSSHConn struct {
 	net.Conn
-	mode            ObfuscatedSshConnMode
+	mode            ObfuscatedSSHConnMode
 	obfuscator      *Obfuscator
 	readDeobfuscate func([]byte)
 	writeObfuscate  func([]byte)
-	readState       ObfuscatedSshReadState
-	writeState      ObfuscatedSshWriteState
+	readState       ObfuscatedSSHReadState
+	writeState      ObfuscatedSSHWriteState
 	readBuffer      *bytes.Buffer
 	writeBuffer     *bytes.Buffer
 	transformBuffer *bytes.Buffer
 	legacyPadding   bool
+	paddingLength   int
 }
 
-type ObfuscatedSshConnMode int
+type ObfuscatedSSHConnMode int
 
 const (
 	OBFUSCATION_CONN_MODE_CLIENT = iota
 	OBFUSCATION_CONN_MODE_SERVER
 )
 
-type ObfuscatedSshReadState int
+type ObfuscatedSSHReadState int
 
 const (
 	OBFUSCATION_READ_STATE_IDENTIFICATION_LINES = iota
@@ -84,7 +85,7 @@ const (
 	OBFUSCATION_READ_STATE_FINISHED
 )
 
-type ObfuscatedSshWriteState int
+type ObfuscatedSSHWriteState int
 
 const (
 	OBFUSCATION_WRITE_STATE_CLIENT_SEND_SEED_MESSAGE = iota
@@ -94,28 +95,27 @@ const (
 	OBFUSCATION_WRITE_STATE_FINISHED
 )
 
-// NewObfuscatedSshConn creates a new ObfuscatedSshConn.
+// NewObfuscatedSSHConn creates a new ObfuscatedSSHConn.
 // The underlying conn must be used for SSH traffic and must have
 // transferred no traffic.
 //
-// In client mode, NewObfuscatedSshConn does not block or initiate network
+// In client mode, NewObfuscatedSSHConn does not block or initiate network
 // I/O. The obfuscation seed message is sent when Write() is first called.
 //
-// In server mode, NewObfuscatedSshConn cannot completely initialize itself
+// In server mode, NewObfuscatedSSHConn cannot completely initialize itself
 // without the seed message from the client to derive obfuscation keys. So
-// NewObfuscatedSshConn blocks on reading the client seed message from the
+// NewObfuscatedSSHConn blocks on reading the client seed message from the
 // underlying conn.
-//
-func NewObfuscatedSshConn(
-	mode ObfuscatedSshConnMode,
+func NewObfuscatedSSHConn(
+	mode ObfuscatedSSHConnMode,
 	conn net.Conn,
 	obfuscationKeyword string,
-	minPadding, maxPadding *int) (*ObfuscatedSshConn, error) {
+	minPadding, maxPadding *int) (*ObfuscatedSSHConn, error) {
 
 	var err error
 	var obfuscator *Obfuscator
 	var readDeobfuscate, writeObfuscate func([]byte)
-	var writeState ObfuscatedSshWriteState
+	var writeState ObfuscatedSSHWriteState
 
 	if mode == OBFUSCATION_CONN_MODE_CLIENT {
 		obfuscator, err = NewClientObfuscator(
@@ -143,7 +143,7 @@ func NewObfuscatedSshConn(
 		writeState = OBFUSCATION_WRITE_STATE_SERVER_SEND_IDENTIFICATION_LINE_PADDING
 	}
 
-	return &ObfuscatedSshConn{
+	return &ObfuscatedSSHConn{
 		Conn:            conn,
 		mode:            mode,
 		obfuscator:      obfuscator,
@@ -154,12 +154,29 @@ func NewObfuscatedSshConn(
 		readBuffer:      new(bytes.Buffer),
 		writeBuffer:     new(bytes.Buffer),
 		transformBuffer: new(bytes.Buffer),
+		paddingLength:   -1,
 	}, nil
 }
 
+// GetMetrics implements the common.MetricsSource interface.
+func (conn *ObfuscatedSSHConn) GetMetrics() common.LogFields {
+	logFields := make(common.LogFields)
+	if conn.mode == OBFUSCATION_CONN_MODE_CLIENT {
+		paddingLength := conn.obfuscator.GetPaddingLength()
+		if paddingLength != -1 {
+			logFields["upstream_ossh_padding"] = paddingLength
+		}
+	} else {
+		if conn.paddingLength != -1 {
+			logFields["downstream_ossh_padding"] = conn.paddingLength
+		}
+	}
+	return logFields
+}
+
 // Read wraps standard Read, transparently applying the obfuscation
 // transformations.
-func (conn *ObfuscatedSshConn) Read(buffer []byte) (int, error) {
+func (conn *ObfuscatedSSHConn) Read(buffer []byte) (int, error) {
 	if conn.readState == OBFUSCATION_READ_STATE_FINISHED {
 		return conn.Conn.Read(buffer)
 	}
@@ -172,7 +189,7 @@ func (conn *ObfuscatedSshConn) Read(buffer []byte) (int, error) {
 
 // Write wraps standard Write, transparently applying the obfuscation
 // transformations.
-func (conn *ObfuscatedSshConn) Write(buffer []byte) (int, error) {
+func (conn *ObfuscatedSSHConn) Write(buffer []byte) (int, error) {
 	if conn.writeState == OBFUSCATION_WRITE_STATE_FINISHED {
 		return conn.Conn.Write(buffer)
 	}
@@ -224,7 +241,7 @@ func (conn *ObfuscatedSshConn) Write(buffer []byte) (int, error) {
 // State OBFUSCATION_READ_STATE_FLUSH: after SSH_MSG_NEWKEYS, no more
 // packets are read by this function, but bytes from the SSH_MSG_NEWKEYS
 // packet may need to be buffered due to partial reading.
-func (conn *ObfuscatedSshConn) readAndTransform(buffer []byte) (int, error) {
+func (conn *ObfuscatedSSHConn) readAndTransform(buffer []byte) (int, error) {
 
 	nextState := conn.readState
 
@@ -233,7 +250,7 @@ func (conn *ObfuscatedSshConn) readAndTransform(buffer []byte) (int, error) {
 		// TODO: only client should accept multiple lines?
 		if conn.readBuffer.Len() == 0 {
 			for {
-				err := readSshIdentificationLine(
+				err := readSSHIdentificationLine(
 					conn.Conn, conn.readDeobfuscate, conn.readBuffer)
 				if err != nil {
 					return 0, common.ContextError(err)
@@ -252,7 +269,7 @@ func (conn *ObfuscatedSshConn) readAndTransform(buffer []byte) (int, error) {
 
 	case OBFUSCATION_READ_STATE_KEX_PACKETS:
 		if conn.readBuffer.Len() == 0 {
-			isMsgNewKeys, err := readSshPacket(
+			isMsgNewKeys, err := readSSHPacket(
 				conn.Conn, conn.readDeobfuscate, conn.readBuffer)
 			if err != nil {
 				return 0, common.ContextError(err)
@@ -306,7 +323,7 @@ func (conn *ObfuscatedSshConn) readAndTransform(buffer []byte) (int, error) {
 // will ignore (http://tools.ietf.org/html/rfc4253#section-4.2).
 //
 // State OBFUSCATION_WRITE_STATE_IDENTIFICATION_LINE: before
-// packets are sent, the ssh peer sends an identification line terminated by CRLF:
+// packets are sent, the SSH peer sends an identification line terminated by CRLF:
 // http://www.ietf.org/rfc/rfc4253.txt sec 4.2.
 // In this state, the CRLF terminator is used to parse message boundaries.
 //
@@ -326,7 +343,7 @@ func (conn *ObfuscatedSshConn) readAndTransform(buffer []byte) (int, error) {
 // padding during the KEX phase as a partial defense against traffic analysis.
 // (The transformer can do this since only the payload and not the padding of
 // these packets is authenticated in the "exchange hash").
-func (conn *ObfuscatedSshConn) transformAndWrite(buffer []byte) error {
+func (conn *ObfuscatedSSHConn) transformAndWrite(buffer []byte) error {
 
 	// The seed message (client) and identification line padding (server)
 	// are injected before any standard SSH traffic.
@@ -341,6 +358,7 @@ func (conn *ObfuscatedSshConn) transformAndWrite(buffer []byte) error {
 		if err != nil {
 			return common.ContextError(err)
 		}
+		conn.paddingLength = len(padding)
 		conn.writeObfuscate(padding)
 		_, err = conn.Conn.Write(padding)
 		if err != nil {
@@ -359,14 +377,14 @@ func (conn *ObfuscatedSshConn) transformAndWrite(buffer []byte) error {
 
 	switch conn.writeState {
 	case OBFUSCATION_WRITE_STATE_IDENTIFICATION_LINE:
-		hasIdentificationLine := extractSshIdentificationLine(
+		hasIdentificationLine := extractSSHIdentificationLine(
 			conn.writeBuffer, conn.transformBuffer)
 		if hasIdentificationLine {
 			conn.writeState = OBFUSCATION_WRITE_STATE_KEX_PACKETS
 		}
 
 	case OBFUSCATION_WRITE_STATE_KEX_PACKETS:
-		hasMsgNewKeys, err := extractSshPackets(
+		hasMsgNewKeys, err := extractSSHPackets(
 			conn.legacyPadding, conn.writeBuffer, conn.transformBuffer)
 		if err != nil {
 			return common.ContextError(err)
@@ -403,7 +421,7 @@ func (conn *ObfuscatedSshConn) transformAndWrite(buffer []byte) error {
 	return nil
 }
 
-func readSshIdentificationLine(
+func readSSHIdentificationLine(
 	conn net.Conn,
 	deobfuscate func([]byte),
 	readBuffer *bytes.Buffer) error {
@@ -430,7 +448,7 @@ func readSshIdentificationLine(
 	return nil
 }
 
-func readSshPacket(
+func readSSHPacket(
 	conn net.Conn,
 	deobfuscate func([]byte),
 	readBuffer *bytes.Buffer) (bool, error) {
@@ -449,7 +467,7 @@ func readSshPacket(
 	prefix := readBuffer.Bytes()[prefixOffset : prefixOffset+SSH_PACKET_PREFIX_LENGTH]
 	deobfuscate(prefix)
 
-	_, _, payloadLength, messageLength, err := getSshPacketPrefix(prefix)
+	_, _, payloadLength, messageLength, err := getSSHPacketPrefix(prefix)
 	if err != nil {
 		return false, common.ContextError(err)
 	}
@@ -480,11 +498,13 @@ func readSshPacket(
 // From the original patch to sshd.c:
 // https://bitbucket.org/psiphon/psiphon-circumvention-system/commits/f40865ce624b680be840dc2432283c8137bd896d
 func makeServerIdentificationLinePadding() ([]byte, error) {
+
 	paddingLength, err := common.MakeSecureRandomInt(OBFUSCATE_MAX_PADDING - 2) // 2 = CRLF
 	if err != nil {
 		return nil, common.ContextError(err)
 	}
 	paddingLength += 2
+
 	padding := make([]byte, paddingLength)
 
 	// For backwards compatibility with some clients, send no more than 512 characters
@@ -513,7 +533,7 @@ func makeServerIdentificationLinePadding() ([]byte, error) {
 	return padding, nil
 }
 
-func extractSshIdentificationLine(writeBuffer, transformBuffer *bytes.Buffer) bool {
+func extractSSHIdentificationLine(writeBuffer, transformBuffer *bytes.Buffer) bool {
 	index := bytes.Index(writeBuffer.Bytes(), []byte("\r\n"))
 	if index != -1 {
 		lineLength := index + 2 // + 2 for \r\n
@@ -523,13 +543,13 @@ func extractSshIdentificationLine(writeBuffer, transformBuffer *bytes.Buffer) bo
 	return false
 }
 
-func extractSshPackets(
+func extractSSHPackets(
 	legacyPadding bool, writeBuffer, transformBuffer *bytes.Buffer) (bool, error) {
 
 	hasMsgNewKeys := false
 	for writeBuffer.Len() >= SSH_PACKET_PREFIX_LENGTH {
 
-		packetLength, paddingLength, payloadLength, messageLength, err := getSshPacketPrefix(
+		packetLength, paddingLength, payloadLength, messageLength, err := getSSHPacketPrefix(
 			writeBuffer.Bytes()[:SSH_PACKET_PREFIX_LENGTH])
 		if err != nil {
 			return false, common.ContextError(err)
@@ -593,7 +613,7 @@ func extractSshPackets(
 			return false, common.ContextError(err)
 		}
 
-		setSshPacketPrefix(
+		setSSHPacketPrefix(
 			transformedPacket,
 			packetLength+extraPaddingLength,
 			paddingLength+extraPaddingLength)
@@ -604,12 +624,12 @@ func extractSshPackets(
 	return hasMsgNewKeys, nil
 }
 
-func getSshPacketPrefix(buffer []byte) (int, int, int, int, error) {
+func getSSHPacketPrefix(buffer []byte) (int, int, int, int, error) {
 
 	packetLength := int(binary.BigEndian.Uint32(buffer[0 : SSH_PACKET_PREFIX_LENGTH-1]))
 
 	if packetLength < 1 || packetLength > SSH_MAX_PACKET_LENGTH {
-		return 0, 0, 0, 0, common.ContextError(errors.New("invalid ssh packet length"))
+		return 0, 0, 0, 0, common.ContextError(errors.New("invalid SSH packet length"))
 	}
 
 	paddingLength := int(buffer[SSH_PACKET_PREFIX_LENGTH-1])
@@ -619,7 +639,7 @@ func getSshPacketPrefix(buffer []byte) (int, int, int, int, error) {
 	return packetLength, paddingLength, payloadLength, messageLength, nil
 }
 
-func setSshPacketPrefix(buffer []byte, packetLength, paddingLength int) {
+func setSSHPacketPrefix(buffer []byte, packetLength, paddingLength int) {
 	binary.BigEndian.PutUint32(buffer, uint32(packetLength))
 	buffer[SSH_PACKET_PREFIX_LENGTH-1] = byte(paddingLength)
 }

+ 17 - 8
psiphon/common/obfuscator/obfuscator.go

@@ -45,6 +45,7 @@ const (
 // https://github.com/brl/obfuscated-openssh/blob/master/README.obfuscation
 type Obfuscator struct {
 	seedMessage          []byte
+	paddingLength        int
 	clientToServerCipher *rc4.Cipher
 	serverToClientCipher *rc4.Cipher
 }
@@ -88,13 +89,14 @@ func NewClientObfuscator(
 		maxPadding = *config.MaxPadding
 	}
 
-	seedMessage, err := makeSeedMessage(minPadding, maxPadding, seed, clientToServerCipher)
+	seedMessage, paddingLength, err := makeSeedMessage(minPadding, maxPadding, seed, clientToServerCipher)
 	if err != nil {
 		return nil, common.ContextError(err)
 	}
 
 	return &Obfuscator{
 		seedMessage:          seedMessage,
+		paddingLength:        paddingLength,
 		clientToServerCipher: clientToServerCipher,
 		serverToClientCipher: serverToClientCipher}, nil
 }
@@ -111,10 +113,17 @@ func NewServerObfuscator(
 	}
 
 	return &Obfuscator{
+		paddingLength:        -1,
 		clientToServerCipher: clientToServerCipher,
 		serverToClientCipher: serverToClientCipher}, nil
 }
 
+// GetPaddingLength returns the client seed message padding length. Only valid
+// for obfuscators created by NewClientObfuscator; NewServerObfuscator sets -1.
+func (obfuscator *Obfuscator) GetPaddingLength() int {
+	return obfuscator.paddingLength
+}
+
 // SendSeedMessage returns the seed message created in NewClientObfuscator,
 // removing the reference so that it may be garbage collected.
 func (obfuscator *Obfuscator) SendSeedMessage() []byte {
@@ -176,31 +185,31 @@ func deriveKey(seed, keyword, iv []byte) ([]byte, error) {
 	return digest[0:OBFUSCATE_KEY_LENGTH], nil
 }
 
-func makeSeedMessage(minPadding, maxPadding int, seed []byte, clientToServerCipher *rc4.Cipher) ([]byte, error) {
+func makeSeedMessage(minPadding, maxPadding int, seed []byte, clientToServerCipher *rc4.Cipher) ([]byte, int, error) {
 	padding, err := common.MakeSecureRandomPadding(minPadding, maxPadding)
 	if err != nil {
-		return nil, common.ContextError(err)
+		return nil, 0, common.ContextError(err)
 	}
 	buffer := new(bytes.Buffer)
 	err = binary.Write(buffer, binary.BigEndian, seed)
 	if err != nil {
-		return nil, common.ContextError(err)
+		return nil, 0, common.ContextError(err)
 	}
 	err = binary.Write(buffer, binary.BigEndian, uint32(OBFUSCATE_MAGIC_VALUE))
 	if err != nil {
-		return nil, common.ContextError(err)
+		return nil, 0, common.ContextError(err)
 	}
 	err = binary.Write(buffer, binary.BigEndian, uint32(len(padding)))
 	if err != nil {
-		return nil, common.ContextError(err)
+		return nil, 0, common.ContextError(err)
 	}
 	err = binary.Write(buffer, binary.BigEndian, padding)
 	if err != nil {
-		return nil, common.ContextError(err)
+		return nil, 0, common.ContextError(err)
 	}
 	seedMessage := buffer.Bytes()
 	clientToServerCipher.XORKeyStream(seedMessage[len(seed):], seedMessage[len(seed):])
-	return seedMessage, nil
+	return seedMessage, len(padding), nil
 }
 
 func readSeedMessage(

+ 2 - 2
psiphon/common/obfuscator/obfuscator_test.go

@@ -113,7 +113,7 @@ func TestObfuscatedSSHConn(t *testing.T) {
 		conn, err := listener.Accept()
 
 		if err == nil {
-			conn, err = NewObfuscatedSshConn(
+			conn, err = NewObfuscatedSSHConn(
 				OBFUSCATION_CONN_MODE_SERVER, conn, keyword, nil, nil)
 		}
 
@@ -139,7 +139,7 @@ func TestObfuscatedSSHConn(t *testing.T) {
 		conn, err := net.DialTimeout("tcp", serverAddress, 5*time.Second)
 
 		if err == nil {
-			conn, err = NewObfuscatedSshConn(
+			conn, err = NewObfuscatedSSHConn(
 				OBFUSCATION_CONN_MODE_CLIENT, conn, keyword, nil, nil)
 		}
 

+ 16 - 8
psiphon/common/parameters/clientParameters.go

@@ -181,6 +181,7 @@ const (
 
 const (
 	useNetworkLatencyMultiplier = 1
+	serverSideOnly              = 2
 )
 
 // defaultClientParameters specifies the type, default value, and minimum
@@ -249,14 +250,14 @@ var defaultClientParameters = map[string]struct {
 	FragmentorMaxWriteBytes:            {value: 1500, minimum: 1},
 	FragmentorMinDelay:                 {value: time.Duration(0), minimum: time.Duration(0)},
 	FragmentorMaxDelay:                 {value: 10 * time.Millisecond, minimum: time.Duration(0)},
-	FragmentorDownstreamProbability:    {value: 0.5, minimum: 0.0},
-	FragmentorDownstreamLimitProtocols: {value: protocol.TunnelProtocols{}},
-	FragmentorDownstreamMinTotalBytes:  {value: 0, minimum: 0},
-	FragmentorDownstreamMaxTotalBytes:  {value: 0, minimum: 0},
-	FragmentorDownstreamMinWriteBytes:  {value: 1, minimum: 1},
-	FragmentorDownstreamMaxWriteBytes:  {value: 1500, minimum: 1},
-	FragmentorDownstreamMinDelay:       {value: time.Duration(0), minimum: time.Duration(0)},
-	FragmentorDownstreamMaxDelay:       {value: 10 * time.Millisecond, minimum: time.Duration(0)},
+	FragmentorDownstreamProbability:    {value: 0.5, minimum: 0.0, flags: serverSideOnly},
+	FragmentorDownstreamLimitProtocols: {value: protocol.TunnelProtocols{}, flags: serverSideOnly},
+	FragmentorDownstreamMinTotalBytes:  {value: 0, minimum: 0, flags: serverSideOnly},
+	FragmentorDownstreamMaxTotalBytes:  {value: 0, minimum: 0, flags: serverSideOnly},
+	FragmentorDownstreamMinWriteBytes:  {value: 1, minimum: 1, flags: serverSideOnly},
+	FragmentorDownstreamMaxWriteBytes:  {value: 1500, minimum: 1, flags: serverSideOnly},
+	FragmentorDownstreamMinDelay:       {value: time.Duration(0), minimum: time.Duration(0), flags: serverSideOnly},
+	FragmentorDownstreamMaxDelay:       {value: 10 * time.Millisecond, minimum: time.Duration(0), flags: serverSideOnly},
 
 	// The Psiphon server will reject obfuscated SSH seed messages with
 	// padding greater than OBFUSCATE_MAX_PADDING.
@@ -356,6 +357,13 @@ var defaultClientParameters = map[string]struct {
 	PickUserAgentProbability:     {value: 0.5, minimum: 0.0},
 }
 
+// IsServerSideOnly indicates if the parameter specified by name is used
+// server-side only.
+func IsServerSideOnly(name string) bool {
+	defaultParameter, ok := defaultClientParameters[name]
+	return ok && (defaultParameter.flags&serverSideOnly) != 0
+}
+
 // ClientParameters is a set of client parameters. To use the parameters, call
 // Get. To apply new values to the parameters, call Set.
 type ClientParameters struct {

+ 5 - 3
psiphon/common/protocol/protocol.go

@@ -230,15 +230,17 @@ func (profiles TLSProfiles) PruneInvalid() TLSProfiles {
 }
 
 const (
-	QUIC_VERSION_GQUIC39 = "gQUICv39"
-	QUIC_VERSION_GQUIC43 = "gQUICv43"
-	QUIC_VERSION_GQUIC44 = "gQUICv44"
+	QUIC_VERSION_GQUIC39    = "gQUICv39"
+	QUIC_VERSION_GQUIC43    = "gQUICv43"
+	QUIC_VERSION_GQUIC44    = "gQUICv44"
+	QUIC_VERSION_OBFUSCATED = "OBFUSCATED"
 )
 
 var SupportedQUICVersions = QUICVersions{
 	QUIC_VERSION_GQUIC39,
 	QUIC_VERSION_GQUIC43,
 	QUIC_VERSION_GQUIC44,
+	QUIC_VERSION_OBFUSCATED,
 }
 
 type QUICVersions []string

+ 384 - 0
psiphon/common/quic/obfuscator.go

@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2018, Psiphon Inc.
+ * All rights reserved.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+package quic
+
+import (
+	"crypto/rand"
+	"crypto/sha256"
+	"fmt"
+	"io"
+	"net"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/Psiphon-Labs/goarista/monotime"
+	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common"
+	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/crypto/hkdf"
+	"github.com/Yawning/chacha20"
+)
+
+const (
+	MAX_QUIC_IPV4_PACKET_SIZE            = 1252
+	MAX_QUIC_IPV6_PACKET_SIZE            = 1232
+	MAX_OBFUSCATED_QUIC_IPV4_PACKET_SIZE = 1372
+	MAX_OBFUSCATED_QUIC_IPV6_PACKET_SIZE = 1352
+	MAX_PADDING_SIZE                     = 64
+	NONCE_SIZE                           = 12
+)
+
+// ObfuscatedPacketConn wraps a QUIC net.PacketConn with an obfuscation layer
+// that obscures QUIC packets, adding random padding and producing uniformly
+// random payload.
+//
+// The crypto performed by ObfuscatedPacketConn is purely for obfuscation to
+// frustrate wire-speed DPI and does not add privacy/security. The small
+// nonce space and single key per server is not cryptographically secure.
+//
+// A server-side ObfuscatedPacketConn performs simple QUIC DPI to distinguish
+// between obfuscated and non-obfuscated peer flows and responds accordingly.
+//
+// The header and padding added by ObfuscatedPacketConn on top of the QUIC
+// payload will increase UDP packets beyond the QUIC max of 1280 bytes,
+// introducing some risk of fragmentation and/or dropped packets.
+type ObfuscatedPacketConn struct {
+	net.PacketConn
+	isServer       bool
+	isClosed       int32
+	runWaitGroup   *sync.WaitGroup
+	stopBroadcast  chan struct{}
+	obfuscationKey [32]byte
+	peerModesMutex sync.Mutex
+	peerModes      map[string]*peerMode
+}
+
+type peerMode struct {
+	isObfuscated   bool
+	lastPacketTime monotime.Time
+}
+
+func (p *peerMode) isStale() bool {
+	return monotime.Since(p.lastPacketTime) >= SERVER_IDLE_TIMEOUT
+}
+
+// NewObfuscatedPacketConnPacketConn creates a new ObfuscatedPacketConn.
+func NewObfuscatedPacketConnPacketConn(
+	conn net.PacketConn,
+	isServer bool,
+	obfuscationKey string) (*ObfuscatedPacketConn, error) {
+
+	packetConn := &ObfuscatedPacketConn{
+		PacketConn: conn,
+		isServer:   isServer,
+		peerModes:  make(map[string]*peerMode),
+	}
+
+	secret := []byte(obfuscationKey)
+	salt := []byte("quic-obfuscation-key")
+	_, err := io.ReadFull(
+		hkdf.New(sha256.New, secret, salt, nil), packetConn.obfuscationKey[:])
+	if err != nil {
+		return nil, common.ContextError(err)
+	}
+
+	if isServer {
+
+		packetConn.runWaitGroup = new(sync.WaitGroup)
+		packetConn.stopBroadcast = make(chan struct{}, 1)
+
+		// Reap stale peer mode information to reclaim memory.
+
+		packetConn.runWaitGroup.Add(1)
+		go func() {
+			defer packetConn.runWaitGroup.Done()
+
+			ticker := time.NewTicker(SERVER_IDLE_TIMEOUT / 2)
+			defer ticker.Stop()
+			for {
+				select {
+				case <-ticker.C:
+					packetConn.peerModesMutex.Lock()
+					for address, mode := range packetConn.peerModes {
+						if mode.isStale() {
+							delete(packetConn.peerModes, address)
+						}
+					}
+					packetConn.peerModesMutex.Unlock()
+				case <-packetConn.stopBroadcast:
+					return
+				}
+			}
+		}()
+	}
+
+	return packetConn, nil
+}
+
+func (conn *ObfuscatedPacketConn) Close() error {
+
+	// Ensure the stop channel is closed only once.
+	if !atomic.CompareAndSwapInt32(&conn.isClosed, 0, 1) {
+		return nil
+	}
+
+	if conn.isServer {
+		close(conn.stopBroadcast)
+		conn.runWaitGroup.Wait()
+	}
+
+	return conn.PacketConn.Close()
+}
+
+type temporaryNetError struct {
+	err error
+}
+
+func newTemporaryNetError(err error) *temporaryNetError {
+	return &temporaryNetError{err: err}
+}
+
+func (e *temporaryNetError) Timeout() bool {
+	return false
+}
+
+func (e *temporaryNetError) Temporary() bool {
+	return true
+}
+
+func (e *temporaryNetError) Error() string {
+	return e.err.Error()
+}
+
+func (conn *ObfuscatedPacketConn) ReadFrom(p []byte) (int, net.Addr, error) {
+
+	n, addr, err := conn.PacketConn.ReadFrom(p)
+
+	if n > 0 {
+
+		isObfuscated := true
+
+		if conn.isServer {
+
+			// The server handles both plain and obfuscated QUIC packets.
+			// isQUIC performs DPI to determine whether the packet appears to
+			// be QUIC, in which case deobfuscation is not performed. Not all
+			// plain QUIC packets will pass the DPI test, but the initial
+			// packet(s) in a flow are expected to match; so the server
+			// records a peer "mode", referenced by peer address to know when
+			// to skip deobfuscation for later packets.
+			//
+			// It's possible for clients to redial QUIC connections,
+			// transitioning from obfuscated to plain, using the same source
+			// address (IP and port). This is more likely when many clients
+			// are behind NAT. If a packet appears to be QUIC, this will reset
+			// any existing peer "mode" to plain. The obfuscator checks that
+			// its obfuscated packets don't pass the QUIC DPI test.
+			//
+			// TODO: delete peerMode when a packet is a client connection
+			// termination QUIC packet? Will reclaim peerMode memory faster
+			// than relying on reaper.
+
+			isQUIC := isQUIC(p[:n])
+
+			conn.peerModesMutex.Lock()
+			address := addr.String()
+			mode, ok := conn.peerModes[address]
+			if !ok {
+				mode = &peerMode{isObfuscated: !isQUIC}
+				conn.peerModes[address] = mode
+			} else if mode.isStale() {
+				mode.isObfuscated = !isQUIC
+			} else if mode.isObfuscated && isQUIC {
+				mode.isObfuscated = false
+			}
+			isObfuscated = mode.isObfuscated
+			mode.lastPacketTime = monotime.Now()
+			conn.peerModesMutex.Unlock()
+
+		}
+
+		if isObfuscated {
+
+			// We can use p as a scratch buffer for deobfuscation, and this
+			// avoids allocating a buffer.
+
+			if n < (NONCE_SIZE + 1) {
+				return n, addr, newTemporaryNetError(common.ContextError(
+					fmt.Errorf("unexpected obfuscated QUIC packet length: %d", n)))
+			}
+
+			cipher, err := chacha20.NewCipher(conn.obfuscationKey[:], p[0:NONCE_SIZE])
+			if err != nil {
+				return n, addr, common.ContextError(err)
+			}
+			cipher.XORKeyStream(p[NONCE_SIZE:], p[NONCE_SIZE:])
+
+			paddingLen := int(p[NONCE_SIZE])
+			if paddingLen > MAX_PADDING_SIZE || paddingLen > n-(NONCE_SIZE+1) {
+				return n, addr, newTemporaryNetError(common.ContextError(
+					fmt.Errorf("unexpected padding length: %d, %d", paddingLen, n)))
+			}
+
+			n -= (NONCE_SIZE + 1) + paddingLen
+			copy(p[0:n], p[(NONCE_SIZE+1)+paddingLen:n+(NONCE_SIZE+1)+paddingLen])
+		}
+	}
+
+	return n, addr, err
+}
+
+type obfuscatorBuffer struct {
+	buffer [MAX_OBFUSCATED_QUIC_IPV4_PACKET_SIZE]byte
+}
+
+var obfuscatorBufferPool = &sync.Pool{
+	New: func() interface{} {
+		return new(obfuscatorBuffer)
+	},
+}
+
+func getMaxPacketSizes(addr net.Addr) (int, int) {
+	if udpAddr, ok := addr.(*net.UDPAddr); ok && udpAddr.IP.To4() == nil {
+		return MAX_QUIC_IPV6_PACKET_SIZE, MAX_OBFUSCATED_QUIC_IPV6_PACKET_SIZE
+	}
+	return MAX_QUIC_IPV4_PACKET_SIZE, MAX_OBFUSCATED_QUIC_IPV4_PACKET_SIZE
+}
+
+// WriteTo sends p to addr through the wrapped PacketConn and reports len(p)
+// as the byte count. Clients always obfuscate; a server obfuscates only for
+// peers whose recorded mode is obfuscated.
+func (conn *ObfuscatedPacketConn) WriteTo(p []byte, addr net.Addr) (int, error) {
+
+	n := len(p)
+
+	isObfuscated := true
+
+	if conn.isServer {
+		conn.peerModesMutex.Lock()
+		address := addr.String()
+		mode, ok := conn.peerModes[address]
+		isObfuscated = ok && mode.isObfuscated
+		conn.peerModesMutex.Unlock()
+	}
+
+	if isObfuscated {
+
+		maxQUICPacketSize, maxObfuscatedPacketSize := getMaxPacketSizes(addr)
+
+		if n > maxQUICPacketSize {
+			return 0, newTemporaryNetError(common.ContextError(
+				fmt.Errorf("unexpected QUIC packet length: %d", n)))
+		}
+
+		// Obfuscated QUIC padding results in packets that exceed the
+		// QUIC max packet size of 1280.
+		maxPaddingSize := maxObfuscatedPacketSize - (n + (NONCE_SIZE + 1))
+		if maxPaddingSize < 0 {
+			maxPaddingSize = 0
+		}
+		if maxPaddingSize > MAX_PADDING_SIZE {
+			maxPaddingSize = MAX_PADDING_SIZE
+		}
+
+		// Note: escape analysis showed a local array escaping to the heap,
+		// so use a buffer pool instead to avoid heap allocation per packet.
+		b := obfuscatorBufferPool.Get().(*obfuscatorBuffer)
+		buffer := b.buffer[:]
+		defer obfuscatorBufferPool.Put(b)
+
+		for {
+			_, err := rand.Read(buffer[0:NONCE_SIZE])
+			if err != nil {
+				return 0, common.ContextError(err)
+			}
+
+			paddingLen, err := common.MakeSecureRandomRange(0, maxPaddingSize)
+			if err != nil {
+				return 0, common.ContextError(err)
+			}
+
+			buffer[NONCE_SIZE] = uint8(paddingLen)
+			_, err = rand.Read(buffer[(NONCE_SIZE + 1) : (NONCE_SIZE+1)+paddingLen])
+			if err != nil {
+				return 0, common.ContextError(err)
+			}
+
+			copy(buffer[(NONCE_SIZE+1)+paddingLen:], p)
+			dataLen := (NONCE_SIZE + 1) + paddingLen + n
+
+			cipher, err := chacha20.NewCipher(conn.obfuscationKey[:], buffer[0:NONCE_SIZE])
+			if err != nil {
+				return 0, common.ContextError(err)
+			}
+			cipher.XORKeyStream(buffer[NONCE_SIZE:dataLen], buffer[NONCE_SIZE:dataLen])
+
+			// Don't send an obfuscated packet that looks like QUIC, or the
+			// peer will not treat it as obfuscated. Test only the bytes being
+			// sent, as the pooled buffer holds stale data beyond dataLen; on
+			// a match, retry with a fresh nonce and padding.
+			if !isQUIC(buffer[:dataLen]) {
+				p = buffer[:dataLen]
+				break
+			}
+		}
+	}
+
+	_, err := conn.PacketConn.WriteTo(p, addr)
+
+	return n, err
+}
+
+func isQUIC(buffer []byte) bool {
+
+	// As this function is called for every packet, it needs to be fast.
+	//
+	// In all currently supported versions, the first client packet contains
+	// the "CHLO" tag at one of the following offsets. The offset can vary for
+	// a single version.
+	//
+	// Note that v44 does not include the "QUIC version" header field in its
+	// first client packet.
+	//
+	// As QUIC header parsing is complex, with many cases, we are not
+	// presently doing that, although this might improve accuracy as we should
+	// be able to identify the precise offset of "CHLO" based on header
+	// values.
+
+	if (len(buffer) >= 33 &&
+		buffer[29] == 'C' &&
+		buffer[30] == 'H' &&
+		buffer[31] == 'L' &&
+		buffer[32] == 'O') ||
+		(len(buffer) >= 35 &&
+			buffer[31] == 'C' &&
+			buffer[32] == 'H' &&
+			buffer[33] == 'L' &&
+			buffer[34] == 'O') ||
+		(len(buffer) >= 38 &&
+			buffer[34] == 'C' &&
+			buffer[35] == 'H' &&
+			buffer[36] == 'L' &&
+			buffer[37] == 'O') {
+
+		return true
+	}
+
+	return false
+}

+ 112 - 7
psiphon/common/quic/quic.go

@@ -68,10 +68,14 @@ var serverIdleTimeout = SERVER_IDLE_TIMEOUT
 // Listener is a net.Listener.
 type Listener struct {
 	quic_go.Listener
+	logger common.Logger
 }
 
 // Listen creates a new Listener.
-func Listen(addr string) (*Listener, error) {
+func Listen(
+	logger common.Logger,
+	address string,
+	obfuscationKey string) (*Listener, error) {
 
 	certificate, privateKey, err := common.GenerateWebServerCertificate(
 		common.GenerateHostName())
@@ -97,8 +101,29 @@ func Listen(addr string) (*Listener, error) {
 		KeepAlive:             true,
 	}
 
-	quicListener, err := quic_go.ListenAddr(
-		addr, tlsConfig, quicConfig)
+	addr, err := net.ResolveUDPAddr("udp", address)
+	if err != nil {
+		return nil, common.ContextError(err)
+	}
+
+	udpConn, err := net.ListenUDP("udp", addr)
+	if err != nil {
+		return nil, common.ContextError(err)
+	}
+
+	var packetConn net.PacketConn
+	packetConn, err = NewObfuscatedPacketConnPacketConn(
+		udpConn, true, obfuscationKey)
+	if err != nil {
+		return nil, common.ContextError(err)
+	}
+
+	// This wrapping must be outermost to ensure that all
+	// ReadFrom errors are intercepted and logged.
+	packetConn = newLoggingPacketConn(logger, packetConn)
+
+	quicListener, err := quic_go.Listen(
+		packetConn, tlsConfig, quicConfig)
 	if err != nil {
 		return nil, common.ContextError(err)
 	}
@@ -126,9 +151,10 @@ func (listener *Listener) Accept() (net.Conn, error) {
 }
 
 var supportedVersionNumbers = map[string]quic_go.VersionNumber{
-	protocol.QUIC_VERSION_GQUIC39: quic_go.VersionGQUIC39,
-	protocol.QUIC_VERSION_GQUIC43: quic_go.VersionGQUIC43,
-	protocol.QUIC_VERSION_GQUIC44: quic_go.VersionGQUIC44,
+	protocol.QUIC_VERSION_GQUIC39:    quic_go.VersionGQUIC39,
+	protocol.QUIC_VERSION_GQUIC43:    quic_go.VersionGQUIC43,
+	protocol.QUIC_VERSION_GQUIC44:    quic_go.VersionGQUIC44,
+	protocol.QUIC_VERSION_OBFUSCATED: quic_go.VersionGQUIC43,
 }
 
 // Dial establishes a new QUIC session and stream to the server specified by
@@ -145,7 +171,8 @@ func Dial(
 	packetConn net.PacketConn,
 	remoteAddr *net.UDPAddr,
 	quicSNIAddress string,
-	negotiateQUICVersion string) (net.Conn, error) {
+	negotiateQUICVersion string,
+	obfuscationKey string) (net.Conn, error) {
 
 	var versions []quic_go.VersionNumber
 
@@ -169,6 +196,15 @@ func Dial(
 		quicConfig.HandshakeTimeout = deadline.Sub(time.Now())
 	}
 
+	if negotiateQUICVersion == protocol.QUIC_VERSION_OBFUSCATED {
+		var err error
+		packetConn, err = NewObfuscatedPacketConnPacketConn(
+			packetConn, false, obfuscationKey)
+		if err != nil {
+			return nil, common.ContextError(err)
+		}
+	}
+
 	session, err := quic_go.DialContext(
 		ctx,
 		packetConn,
@@ -383,3 +419,72 @@ func isErrorIndicatingClosed(err error) bool {
 	}
 	return false
 }
+
+// loggingPacketConn is a workaround for issues in the quic-go server (as of
+// revision ffdfa1).
+//
+// 1. quic-go will shutdown the QUIC server on any error returned from
+//    ReadFrom, even net.Error.Temporary() errors.
+//
+// 2. The server shutdown hangs due to a mutex deadlock:
+//
+//    sync.(*RWMutex).Lock+0x2c                                          /usr/local/go/src/sync/rwmutex.go:93
+//    [...]/lucas-clemente/quic-go.(*packetHandlerMap).CloseServer+0x41  [...]/lucas-clemente/quic-go/packet_handler_map.go:77
+//    [...]/lucas-clemente/quic-go.(*server).closeWithMutex+0x37         [...]/lucas-clemente/quic-go/server.go:314
+//    [...]/lucas-clemente/quic-go.(*server).closeWithError+0xa2         [...]/lucas-clemente/quic-go/server.go:336
+//    [...]/lucas-clemente/quic-go.(*packetHandlerMap).close+0x1da       [...]/lucas-clemente/quic-go/packet_handler_map.go:115
+//    [...]/lucas-clemente/quic-go.(*packetHandlerMap).listen+0x230      [...]/lucas-clemente/quic-go/packet_handler_map.go:130
+//
+//    packetHandlerMap.CloseServer is attempting to lock the same mutex that
+//    is already locked in packetHandlerMap.close, which deadlocks. As
+//    packetHandlerMap and its mutex are used by all client sessions, this
+//    effectively hangs the entire server.
+//
+// loggingPacketConn logs PacketConn ReadFrom errors and returns any usable
+// values or loops and calls ReadFrom again. In practice, due to the nature of
+// UDP sockets, ReadFrom errors are exceptional as they will most likely not
+// occur due to network transmission failures. ObfuscatedPacketConn returns
+// errors that could be due to network transmission failures that corrupt
+// packets; these are marked as net.Error.Temporary() and loggingPacketConn
+// logs these at debug level.
+//
+// loggingPacketConn assumes quic-go revision ffdfa1 behavior and will break
+// other behavior, such as setting deadlines and expecting net.Error.Timeout()
+// errors from ReadFrom.
+type loggingPacketConn struct {
+	net.PacketConn
+	logger common.Logger
+}
+
+func newLoggingPacketConn(
+	logger common.Logger,
+	packetConn net.PacketConn) *loggingPacketConn {
+
+	return &loggingPacketConn{
+		PacketConn: packetConn,
+		logger:     logger,
+	}
+}
+
+func (conn *loggingPacketConn) ReadFrom(p []byte) (int, net.Addr, error) {
+
+	for {
+		n, addr, err := conn.PacketConn.ReadFrom(p)
+
+		if err != nil && conn.logger != nil {
+			message := "ReadFrom failed"
+			if e, ok := err.(net.Error); ok && e.Temporary() {
+				conn.logger.WithContextFields(
+					common.LogFields{"error": err}).Debug(message)
+			} else {
+				conn.logger.WithContextFields(
+					common.LogFields{"error": err}).Warning(message)
+			}
+		}
+		err = nil
+
+		if n > 0 || addr != nil {
+			return n, addr, nil
+		}
+	}
+}

+ 8 - 2
psiphon/common/quic/quic_test.go

@@ -57,7 +57,12 @@ func runQUIC(t *testing.T, negotiateQUICVersion string) {
 	// connection termination packets.
 	serverIdleTimeout = 1 * time.Second
 
-	listener, err := Listen("127.0.0.1:0")
+	obfuscationKey, err := common.MakeSecureRandomStringHex(32)
+	if err != nil {
+		t.Fatalf("MakeSecureRandomStringHex failed: %s", err)
+	}
+
+	listener, err := Listen(nil, "127.0.0.1:0", obfuscationKey)
 	if err != nil {
 		t.Fatalf("Listen failed: %s", err)
 	}
@@ -126,7 +131,8 @@ func runQUIC(t *testing.T, negotiateQUICVersion string) {
 				packetConn,
 				remoteAddr,
 				serverAddress,
-				negotiateQUICVersion)
+				negotiateQUICVersion,
+				obfuscationKey)
 			if err != nil {
 				return common.ContextError(err)
 			}

+ 27 - 35
psiphon/common/tactics/tactics.go

@@ -617,7 +617,10 @@ func (server *Server) GetTacticsPayload(
 	geoIPData common.GeoIPData,
 	apiParams common.APIParameters) (*Payload, error) {
 
-	tactics, err := server.getTactics(geoIPData, apiParams)
+	// includeServerSideOnly is false: server-side only parameters are not
+	// used by the client, so including them wastes space and unnecessarily
+	// exposes the values.
+	tactics, err := server.getTactics(false, geoIPData, apiParams)
 	if err != nil {
 		return nil, common.ContextError(err)
 	}
@@ -668,6 +671,7 @@ func (server *Server) GetTacticsPayload(
 }
 
 func (server *Server) getTactics(
+	includeServerSideOnly bool,
 	geoIPData common.GeoIPData,
 	apiParams common.APIParameters) (*Tactics, error) {
 
@@ -679,7 +683,7 @@ func (server *Server) getTactics(
 		return nil, nil
 	}
 
-	tactics := server.DefaultTactics.clone()
+	tactics := server.DefaultTactics.clone(includeServerSideOnly)
 
 	var aggregatedValues map[string]int
 
@@ -761,7 +765,7 @@ func (server *Server) getTactics(
 			}
 		}
 
-		tactics.merge(&filteredTactics.Tactics)
+		tactics.merge(includeServerSideOnly, &filteredTactics.Tactics)
 
 		// Continue to apply more matches. Last matching tactics has priority for any field.
 	}
@@ -882,7 +886,7 @@ func medianSampleRTTMilliseconds(samples []SpeedTestSample) int {
 	return (samples[mid-1].RTTMilliseconds + samples[mid].RTTMilliseconds) / 2
 }
 
-func (t *Tactics) clone() *Tactics {
+func (t *Tactics) clone(includeServerSideOnly bool) *Tactics {
 
 	u := &Tactics{
 		TTL:         t.TTL,
@@ -895,14 +899,16 @@ func (t *Tactics) clone() *Tactics {
 	if t.Parameters != nil {
 		u.Parameters = make(map[string]interface{})
 		for k, v := range t.Parameters {
-			u.Parameters[k] = v
+			if includeServerSideOnly || !parameters.IsServerSideOnly(k) {
+				u.Parameters[k] = v
+			}
 		}
 	}
 
 	return u
 }
 
-func (t *Tactics) merge(u *Tactics) {
+func (t *Tactics) merge(includeServerSideOnly bool, u *Tactics) {
 
 	if u.TTL != "" {
 		t.TTL = u.TTL
@@ -920,7 +926,9 @@ func (t *Tactics) merge(u *Tactics) {
 			t.Parameters = make(map[string]interface{})
 		}
 		for k, v := range u.Parameters {
-			t.Parameters[k] = v
+			if includeServerSideOnly || !parameters.IsServerSideOnly(k) {
+				t.Parameters[k] = v
+			}
 		}
 	}
 }
@@ -1106,7 +1114,7 @@ func (listener *Listener) Accept() (net.Conn, error) {
 
 		geoIPData := listener.geoIPLookup(common.IPAddressFromAddr(conn.RemoteAddr()))
 
-		tactics, err := listener.server.getTactics(geoIPData, make(common.APIParameters))
+		tactics, err := listener.server.getTactics(true, geoIPData, make(common.APIParameters))
 		if err != nil {
 			listener.server.logger.WithContextFields(
 				common.LogFields{"error": err}).Warning("failed to get tactics for connection")
@@ -1155,33 +1163,17 @@ func (listener *Listener) Accept() (net.Conn, error) {
 		// or not fragment all TCP connections for a one meek session, the server
 		// will make a coin flip per connection.
 
-		tunnelProtocols := p.TunnelProtocols(parameters.FragmentorDownstreamLimitProtocols)
-		if (len(tunnelProtocols) == 0 ||
-			common.Contains(tunnelProtocols, listener.tunnelProtocol)) &&
-			p.WeightedCoinFlip(parameters.FragmentorDownstreamProbability) {
-
-			totalBytes, err := common.MakeSecureRandomRange(
-				p.Int(parameters.FragmentorDownstreamMinTotalBytes),
-				p.Int(parameters.FragmentorDownstreamMaxTotalBytes))
-			if err != nil {
-				listener.server.logger.WithContextFields(
-					common.LogFields{"error": err}).Warning("MakeSecureRandomRange failed")
-				totalBytes = 0
-			}
-
-			if totalBytes > 0 {
-				conn = fragmentor.NewConn(
-					conn,
-					func(message string) {
-						listener.server.logger.WithContextFields(
-							common.LogFields{"message": message}).Debug("Fragmentor")
-					},
-					totalBytes,
-					p.Int(parameters.FragmentorDownstreamMinWriteBytes),
-					p.Int(parameters.FragmentorDownstreamMaxWriteBytes),
-					p.Duration(parameters.FragmentorDownstreamMinDelay),
-					p.Duration(parameters.FragmentorDownstreamMaxDelay))
-			}
+		fragmentorConfig := fragmentor.NewDownstreamConfig(
+			p, listener.tunnelProtocol)
+
+		if fragmentorConfig.IsFragmenting() {
+			conn = fragmentor.NewConn(
+				fragmentorConfig,
+				func(message string) {
+					listener.server.logger.WithContextFields(
+						common.LogFields{"message": message}).Debug("Fragmentor")
+				},
+				conn)
 		}
 
 		return conn, nil

+ 2 - 1
psiphon/controller.go

@@ -1343,7 +1343,8 @@ func (controller *Controller) doFetchTactics(
 
 	meekConfig.RoundTripperOnly = true
 
-	dialConfig, dialStats := initDialConfig(controller.config, meekConfig)
+	dialConfig, dialStats := initDialConfig(
+		controller.config, meekConfig, tacticsProtocol)
 
 	NoticeRequestingTactics(
 		serverEntry.IpAddress,

+ 0 - 107
psiphon/fragmentor.go

@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2018, Psiphon Inc.
- * All rights reserved.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- */
-
-package psiphon
-
-import (
-	"context"
-	"fmt"
-	"net"
-
-	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common"
-	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/fragmentor"
-	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/parameters"
-)
-
-// NewTCPFragmentorDialer creates a TCP dialer that wraps dialed conns in
-// fragmentor.Conn. A single FragmentorProbability coin flip is made and all
-// conns get the same treatment.
-func NewTCPFragmentorDialer(
-	config *DialConfig,
-	tunnelProtocol string,
-	clientParameters *parameters.ClientParameters) Dialer {
-
-	p := clientParameters.Get()
-	coinFlip := p.WeightedCoinFlip(parameters.FragmentorProbability)
-	p = nil
-
-	return func(ctx context.Context, network, addr string) (net.Conn, error) {
-		if network != "tcp" {
-			return nil, common.ContextError(fmt.Errorf("%s unsupported", network))
-		}
-		return DialTCPFragmentor(ctx, addr, config, tunnelProtocol, clientParameters, &coinFlip)
-	}
-}
-
-// DialTCPFragmentor performs a DialTCP and wraps the dialed conn in a
-// fragmentor.Conn, subject to FragmentorProbability and
-// FragmentorLimitProtocols.
-func DialTCPFragmentor(
-	ctx context.Context,
-	addr string,
-	config *DialConfig,
-	tunnelProtocol string,
-	clientParameters *parameters.ClientParameters,
-	oneTimeCoinFlip *bool) (net.Conn, error) {
-
-	conn, err := DialTCP(ctx, addr, config)
-	if err != nil {
-		return nil, common.ContextError(err)
-	}
-
-	p := clientParameters.Get()
-
-	protocols := p.TunnelProtocols(parameters.FragmentorLimitProtocols)
-	if len(protocols) > 0 && !common.Contains(protocols, tunnelProtocol) {
-		return conn, nil
-	}
-
-	var coinFlip bool
-	if oneTimeCoinFlip != nil {
-		coinFlip = *oneTimeCoinFlip
-	} else {
-		coinFlip = p.WeightedCoinFlip(parameters.FragmentorProbability)
-	}
-
-	if coinFlip {
-		return conn, nil
-	}
-
-	totalBytes, err := common.MakeSecureRandomRange(
-		p.Int(parameters.FragmentorMinTotalBytes),
-		p.Int(parameters.FragmentorMaxTotalBytes))
-	if err != nil {
-		totalBytes = 0
-		NoticeAlert("MakeSecureRandomRange failed: %s", common.ContextError(err))
-	}
-
-	if totalBytes == 0 {
-		return conn, nil
-	}
-
-	return fragmentor.NewConn(
-			conn,
-			func(message string) { NoticeInfo(message) },
-			totalBytes,
-			p.Int(parameters.FragmentorMinWriteBytes),
-			p.Int(parameters.FragmentorMaxWriteBytes),
-			p.Duration(parameters.FragmentorMinDelay),
-			p.Duration(parameters.FragmentorMaxDelay)),
-		nil
-}

+ 6 - 16
psiphon/meekConn.go

@@ -235,15 +235,10 @@ func DialMeek(
 
 		scheme = "https"
 
-		tcpDialer := NewTCPFragmentorDialer(
-			dialConfig,
-			meekConfig.ClientTunnelProtocol,
-			meekConfig.ClientParameters)
-
 		tlsConfig := &CustomTLSConfig{
 			ClientParameters:              meekConfig.ClientParameters,
 			DialAddr:                      meekConfig.DialAddress,
-			Dial:                          tcpDialer,
+			Dial:                          NewTCPDialer(dialConfig),
 			SNIServerName:                 meekConfig.SNIServerName,
 			SkipVerify:                    true,
 			TLSProfile:                    meekConfig.TLSProfile,
@@ -345,20 +340,15 @@ func DialMeek(
 			*copyDialConfig = *dialConfig
 			copyDialConfig.UpstreamProxyURL = ""
 
-			dialer = NewTCPFragmentorDialer(
-				copyDialConfig,
-				meekConfig.ClientTunnelProtocol,
-				meekConfig.ClientParameters)
+			dialer = NewTCPDialer(copyDialConfig)
 
 		} else {
 
-			baseDialer := NewTCPFragmentorDialer(
-				dialConfig,
-				meekConfig.ClientTunnelProtocol,
-				meekConfig.ClientParameters)
+			baseDialer := NewTCPDialer(dialConfig)
 
-			// The dialer ignores address that http.Transport will pass in (derived
-			// from the HTTP request URL) and always dials meekConfig.DialAddress.
+			// The dialer ignores any address that http.Transport will pass in
+			// (derived from the HTTP request URL) and always dials
+			// meekConfig.DialAddress.
 			dialer = func(ctx context.Context, network, _ string) (net.Conn, error) {
 				return baseDialer(ctx, network, meekConfig.DialAddress)
 			}

+ 5 - 0
psiphon/net.go

@@ -35,6 +35,7 @@ import (
 
 	"github.com/Psiphon-Labs/dns"
 	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common"
+	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/fragmentor"
 )
 
 const DNS_PORT = 53
@@ -83,6 +84,10 @@ type DialConfig struct {
 	// domain name.
 	// The callback may be invoked by a concurrent goroutine.
 	ResolvedIPCallback func(string)
+
+	// FragmentorConfig specifies whether to layer a fragmentor.Conn on top
+	// of dialed TCP conns, and the fragmentation configuration to use.
+	FragmentorConfig *fragmentor.Config
 }
 
 // NetworkConnectivityChecker defines the interface to the external

+ 26 - 0
psiphon/notice.go

@@ -450,6 +450,32 @@ func noticeWithDialStats(noticeType, ipAddress, region, protocol string, dialSta
 		args = append(args, "TLSProfile", dialStats.TLSProfile)
 	}
 
+	if dialStats.DialPortNumber != "" {
+		args = append(args, "DialPortNumber", dialStats.DialPortNumber)
+	}
+
+	if dialStats.QUICVersion != "" {
+		args = append(args, "QUICVersion", dialStats.QUICVersion)
+	}
+
+	if dialStats.QUICDialSNIAddress != "" {
+		args = append(args, "QUICDialSNIAddress", dialStats.QUICDialSNIAddress)
+	}
+
+	if dialStats.DialConnMetrics != nil {
+		metrics := dialStats.DialConnMetrics.GetMetrics()
+		for name, value := range metrics {
+			args = append(args, name, value)
+		}
+	}
+
+	if dialStats.ObfuscatedSSHConnMetrics != nil {
+		metrics := dialStats.ObfuscatedSSHConnMetrics.GetMetrics()
+		for name, value := range metrics {
+			args = append(args, name, value)
+		}
+	}
+
 	singletonNoticeLogger.outputNotice(
 		noticeType, noticeIsDiagnostic,
 		args...)

+ 78 - 38
psiphon/server/api.go

@@ -134,7 +134,6 @@ func dispatchAPIRequestHandler(
 
 		// TODO: same session-ID-lookup TODO in handshakeAPIRequestHandler
 		// applies here.
-
 		sessionID, err := getStringRequestParam(params, "client_session_id")
 		if err == nil {
 			// Note: follows/duplicates baseRequestParams validation
@@ -309,7 +308,7 @@ var connectedRequestParams = append(
 	[]requestParamSpec{
 		{"session_id", isHexDigits, 0},
 		{"last_connected", isLastConnected, 0},
-		{"establishment_duration", isIntString, requestParamOptional}},
+		{"establishment_duration", isIntString, requestParamOptional | requestParamLogStringAsInt}},
 	baseRequestParams...)
 
 // connectedAPIRequestHandler implements the "connected" API request.
@@ -329,6 +328,19 @@ func connectedAPIRequestHandler(
 		return nil, common.ContextError(err)
 	}
 
+	// Update upstream fragmentor metrics, as the client may have performed
+	// more upstream fragmentation since the previous metrics reported by the
+	// handshake request.
+
+	// TODO: same session-ID-lookup TODO in handshakeAPIRequestHandler
+	// applies here.
+	sessionID, _ := getStringRequestParam(params, "client_session_id")
+	err = support.TunnelServer.UpdateClientAPIParameters(
+		sessionID, copyUpstreamFragmentorParams(params))
+	if err != nil {
+		return nil, common.ContextError(err)
+	}
+
 	log.LogRawFieldsWithTimestamp(
 		getRequestLogFields(
 			"connected",
@@ -515,43 +527,58 @@ type requestParamSpec struct {
 }
 
 const (
-	requestParamOptional  = 1
-	requestParamNotLogged = 2
-	requestParamArray     = 4
-	requestParamJSON      = 8
+	requestParamOptional       = 1
+	requestParamNotLogged      = 2
+	requestParamArray          = 4
+	requestParamJSON           = 8
+	requestParamLogStringAsInt = 16
 )
 
+var upstreamFragmentorParams = []requestParamSpec{
+	{"upstream_bytes_fragmented", isIntString, requestParamOptional | requestParamLogStringAsInt},
+	{"upstream_min_bytes_written", isIntString, requestParamOptional | requestParamLogStringAsInt},
+	{"upstream_max_bytes_written", isIntString, requestParamOptional | requestParamLogStringAsInt},
+	{"upstream_min_delayed", isIntString, requestParamOptional | requestParamLogStringAsInt},
+	{"upstream_max_delayed", isIntString, requestParamOptional | requestParamLogStringAsInt},
+}
+
 // baseRequestParams is the list of required and optional
 // request parameters; derived from COMMON_INPUTS and
 // OPTIONAL_COMMON_INPUTS in psi_web.
 // Each param is expected to be a string, unless requestParamArray
 // is specified, in which case an array of string is expected.
-var baseRequestParams = []requestParamSpec{
-	{"server_secret", isServerSecret, requestParamNotLogged},
-	{"client_session_id", isHexDigits, requestParamNotLogged},
-	{"propagation_channel_id", isHexDigits, 0},
-	{"sponsor_id", isHexDigits, 0},
-	{"client_version", isIntString, 0},
-	{"client_platform", isClientPlatform, 0},
-	{"client_build_rev", isHexDigits, requestParamOptional},
-	{"relay_protocol", isRelayProtocol, 0},
-	{"tunnel_whole_device", isBooleanFlag, requestParamOptional},
-	{"device_region", isAnyString, requestParamOptional},
-	{"ssh_client_version", isAnyString, requestParamOptional},
-	{"upstream_proxy_type", isUpstreamProxyType, requestParamOptional},
-	{"upstream_proxy_custom_header_names", isAnyString, requestParamOptional | requestParamArray},
-	{"meek_dial_address", isDialAddress, requestParamOptional},
-	{"meek_resolved_ip_address", isIPAddress, requestParamOptional},
-	{"meek_sni_server_name", isDomain, requestParamOptional},
-	{"meek_host_header", isHostHeader, requestParamOptional},
-	{"meek_transformed_host_name", isBooleanFlag, requestParamOptional},
-	{"user_agent", isAnyString, requestParamOptional},
-	{"tls_profile", isAnyString, requestParamOptional},
-	{"server_entry_region", isRegionCode, requestParamOptional},
-	{"server_entry_source", isServerEntrySource, requestParamOptional},
-	{"server_entry_timestamp", isISO8601Date, requestParamOptional},
-	{tactics.APPLIED_TACTICS_TAG_PARAMETER_NAME, isAnyString, requestParamOptional},
-}
+var baseRequestParams = append(
+	[]requestParamSpec{
+		{"server_secret", isServerSecret, requestParamNotLogged},
+		{"client_session_id", isHexDigits, requestParamNotLogged},
+		{"propagation_channel_id", isHexDigits, 0},
+		{"sponsor_id", isHexDigits, 0},
+		{"client_version", isIntString, requestParamLogStringAsInt},
+		{"client_platform", isClientPlatform, 0},
+		{"client_build_rev", isHexDigits, requestParamOptional},
+		{"relay_protocol", isRelayProtocol, 0},
+		{"tunnel_whole_device", isBooleanFlag, requestParamOptional},
+		{"device_region", isAnyString, requestParamOptional},
+		{"ssh_client_version", isAnyString, requestParamOptional},
+		{"upstream_proxy_type", isUpstreamProxyType, requestParamOptional},
+		{"upstream_proxy_custom_header_names", isAnyString, requestParamOptional | requestParamArray},
+		{"meek_dial_address", isDialAddress, requestParamOptional},
+		{"meek_resolved_ip_address", isIPAddress, requestParamOptional},
+		{"meek_sni_server_name", isDomain, requestParamOptional},
+		{"meek_host_header", isHostHeader, requestParamOptional},
+		{"meek_transformed_host_name", isBooleanFlag, requestParamOptional},
+		{"user_agent", isAnyString, requestParamOptional},
+		{"tls_profile", isAnyString, requestParamOptional},
+		{"server_entry_region", isRegionCode, requestParamOptional},
+		{"server_entry_source", isServerEntrySource, requestParamOptional},
+		{"server_entry_timestamp", isISO8601Date, requestParamOptional},
+		{tactics.APPLIED_TACTICS_TAG_PARAMETER_NAME, isAnyString, requestParamOptional},
+		{"dial_port_number", isIntString, requestParamOptional | requestParamLogStringAsInt},
+		{"quic_version", isAnyString, requestParamOptional},
+		{"quic_dial_sni_address", isAnyString, requestParamOptional},
+		{"upstream_ossh_padding", isIntString, requestParamOptional | requestParamLogStringAsInt},
+	},
+	upstreamFragmentorParams...)
 
 func validateRequestParams(
 	config *Config,
@@ -597,17 +624,28 @@ func copyBaseRequestParams(params common.APIParameters) common.APIParameters {
 
 	// Note: not a deep copy; assumes baseRequestParams values
 	// are all scalar types (int, string, etc.)
-
 	paramsCopy := make(common.APIParameters)
 	for _, baseParam := range baseRequestParams {
 		value := params[baseParam.name]
 		if value == nil {
 			continue
 		}
-
 		paramsCopy[baseParam.name] = value
 	}
+	return paramsCopy
+}
+
+func copyUpstreamFragmentorParams(params common.APIParameters) common.APIParameters {
 
+	// Note: not a deep copy
+	paramsCopy := make(common.APIParameters)
+	for _, baseParam := range upstreamFragmentorParams {
+		value := params[baseParam.name]
+		if value == nil {
+			continue
+		}
+		paramsCopy[baseParam.name] = value
+	}
 	return paramsCopy
 }
 
@@ -708,9 +746,6 @@ func getRequestLogFields(
 			// - Boolean fields that come into the api as "1"/"0"
 			//   must be logged as actual boolean values
 			switch expectedParam.name {
-			case "client_version", "establishment_duration":
-				intValue, _ := strconv.Atoi(strValue)
-				logFields[expectedParam.name] = intValue
 			case "meek_dial_address":
 				host, _, _ := net.SplitHostPort(strValue)
 				if isIPAddress(nil, host) {
@@ -736,7 +771,12 @@ func getRequestLogFields(
 					logFields[expectedParam.name] = false
 				}
 			default:
-				logFields[expectedParam.name] = strValue
+				if expectedParam.flags&requestParamLogStringAsInt != 0 {
+					intValue, _ := strconv.Atoi(strValue)
+					logFields[expectedParam.name] = intValue
+				} else {
+					logFields[expectedParam.name] = strValue
+				}
 			}
 
 		case []interface{}:

+ 0 - 8
psiphon/server/log.go

@@ -34,14 +34,6 @@ import (
 	"github.com/sirupsen/logrus"
 )
 
-// MetricsSource is an object that provides metrics to be logged
-type MetricsSource interface {
-
-	// GetMetrics returns a LogFields populated with
-	// metrics from the MetricsSource
-	GetMetrics() LogFields
-}
-
 // ContextLogger adds context logging functionality to the
 // underlying logging packages.
 type ContextLogger struct {

+ 12 - 10
psiphon/server/meek.go

@@ -906,9 +906,9 @@ func (session *meekSession) delete(haveLock bool) {
 	}
 }
 
-// GetMetrics implements the MetricsSource interface.
-func (session *meekSession) GetMetrics() LogFields {
-	logFields := make(LogFields)
+// GetMetrics implements the common.MetricsSource interface.
+func (session *meekSession) GetMetrics() common.LogFields {
+	logFields := make(common.LogFields)
 	logFields["meek_client_retries"] = atomic.LoadInt64(&session.metricClientRetries)
 	logFields["meek_peak_response_size"] = atomic.LoadInt64(&session.metricPeakResponseSize)
 	logFields["meek_peak_cached_response_size"] = atomic.LoadInt64(&session.metricPeakCachedResponseSize)
@@ -937,9 +937,10 @@ func makeMeekTLSConfig(
 	}
 
 	config := &tris.Config{
-		Certificates: []tris.Certificate{tlsCertificate},
-		NextProtos:   []string{"http/1.1"},
-		MinVersion:   tris.VersionTLS10,
+		Certificates:            []tris.Certificate{tlsCertificate},
+		NextProtos:              []string{"http/1.1"},
+		MinVersion:              tris.VersionTLS10,
+		UseExtendedMasterSecret: true,
 	}
 
 	if isFronted {
@@ -1375,9 +1376,10 @@ func (conn *meekConn) SetWriteDeadline(t time.Time) error {
 	return common.ContextError(errors.New("not supported"))
 }
 
-// GetMetrics implements the MetricsSource interface. The metrics are maintained
-// in the meek session type; but logTunnel, which calls MetricsSource.GetMetrics,
-// has a pointer only to this conn, so it calls through to the session.
-func (conn *meekConn) GetMetrics() LogFields {
+// GetMetrics implements the common.MetricsSource interface. The metrics are
+// maintained in the meek session type; but logTunnel, which calls
+// MetricsSource.GetMetrics, has a pointer only to this conn, so it calls
+// through to the session.
+func (conn *meekConn) GetMetrics() common.LogFields {
 	return conn.meekSession.GetMetrics()
 }

+ 64 - 7
psiphon/server/server_test.go

@@ -129,6 +129,7 @@ func TestSSH(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -144,6 +145,23 @@ func TestOSSH(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
+		})
+}
+
+func TestFragmentedOSSH(t *testing.T) {
+	runServer(t,
+		&runServerConfig{
+			tunnelProtocol:       "OSSH",
+			enableSSHAPIRequests: true,
+			doHotReload:          false,
+			doDefaultSponsorID:   false,
+			denyTrafficRules:     false,
+			requireAuthorization: true,
+			omitAuthorization:    false,
+			doTunneledWebRequest: true,
+			doTunneledNTPRequest: true,
+			forceFragmenting:     true,
 		})
 }
 
@@ -159,6 +177,7 @@ func TestUnfrontedMeek(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -175,6 +194,7 @@ func TestUnfrontedMeekHTTPS(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -191,6 +211,7 @@ func TestUnfrontedMeekHTTPSTLS13(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -207,6 +228,7 @@ func TestUnfrontedMeekSessionTicket(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -223,6 +245,7 @@ func TestUnfrontedMeekSessionTicketTLS13(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -238,6 +261,7 @@ func TestQUICOSSH(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -256,6 +280,7 @@ func TestMarionetteOSSH(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -271,6 +296,7 @@ func TestWebTransportAPIRequests(t *testing.T) {
 			omitAuthorization:    true,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -286,6 +312,7 @@ func TestHotReload(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -301,6 +328,7 @@ func TestDefaultSessionID(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -316,6 +344,7 @@ func TestDenyTrafficRules(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -331,6 +360,7 @@ func TestOmitAuthorization(t *testing.T) {
 			omitAuthorization:    true,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -346,6 +376,7 @@ func TestNoAuthorization(t *testing.T) {
 			omitAuthorization:    true,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -361,6 +392,7 @@ func TestUnusedAuthorization(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -376,6 +408,7 @@ func TestTCPOnlySLOK(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: true,
 			doTunneledNTPRequest: false,
+			forceFragmenting:     false,
 		})
 }
 
@@ -391,6 +424,7 @@ func TestUDPOnlySLOK(t *testing.T) {
 			omitAuthorization:    false,
 			doTunneledWebRequest: false,
 			doTunneledNTPRequest: true,
+			forceFragmenting:     false,
 		})
 }
 
@@ -405,6 +439,7 @@ type runServerConfig struct {
 	omitAuthorization    bool
 	doTunneledWebRequest bool
 	doTunneledNTPRequest bool
+	forceFragmenting     bool
 }
 
 func runServer(t *testing.T, runConfig *runServerConfig) {
@@ -438,7 +473,8 @@ func runServer(t *testing.T, runConfig *runServerConfig) {
 	// succeed, overriding the nonfunctional values, for the tunnel to
 	// establish.
 
-	doTactics := protocol.TunnelProtocolUsesMeek(runConfig.tunnelProtocol)
+	doClientTactics := protocol.TunnelProtocolUsesMeek(runConfig.tunnelProtocol)
+	doServerTactics := doClientTactics || runConfig.forceFragmenting
 
 	// All servers require a tactics config with valid keys.
 	tacticsRequestPublicKey, tacticsRequestPrivateKey, tacticsRequestObfuscatedKey, err :=
@@ -468,7 +504,7 @@ func runServer(t *testing.T, runConfig *runServerConfig) {
 		generateConfigParams.MarionetteFormat = "http_simple_nonblocking"
 	}
 
-	if doTactics {
+	if doServerTactics {
 		generateConfigParams.TacticsRequestPublicKey = tacticsRequestPublicKey
 		generateConfigParams.TacticsRequestObfuscatedKey = tacticsRequestObfuscatedKey
 	}
@@ -501,7 +537,7 @@ func runServer(t *testing.T, runConfig *runServerConfig) {
 
 	// Only pave the tactics config when tactics are required. This exercises the
 	// case where the tactics config is omitted.
-	if doTactics {
+	if doServerTactics {
 		tacticsConfigFilename = filepath.Join(testDataDirName, "tactics_config.json")
 		paveTacticsConfigFile(
 			t, tacticsConfigFilename,
@@ -516,7 +552,7 @@ func runServer(t *testing.T, runConfig *runServerConfig) {
 	serverConfig["PsinetDatabaseFilename"] = psinetFilename
 	serverConfig["TrafficRulesFilename"] = trafficRulesFilename
 	serverConfig["OSLConfigFilename"] = oslConfigFilename
-	if doTactics {
+	if doServerTactics {
 		serverConfig["TacticsConfigFilename"] = tacticsConfigFilename
 	}
 	serverConfig["LogFilename"] = filepath.Join(testDataDirName, "psiphond.log")
@@ -567,7 +603,9 @@ func runServer(t *testing.T, runConfig *runServerConfig) {
 		}
 	}()
 
-	// TODO: monitor logs for more robust wait-until-loaded
+	// TODO: monitor logs for more robust wait-until-loaded. For example,
+	// especially with the race detector on, QUIC-OSSH tests can fail as the
+	// client sends its initial packet before the server is ready.
 	time.Sleep(1 * time.Second)
 
 	// Test: hot reload (of psinet and traffic rules)
@@ -607,7 +645,7 @@ func runServer(t *testing.T, runConfig *runServerConfig) {
 	localHTTPProxyPort := 8081
 
 	jsonNetworkID := ""
-	if doTactics {
+	if doClientTactics {
 		// Use a distinct prefix for network ID for each test run to
 		// ensure tactics from different runs don't apply; this is
 		// a workaround for the singleton datastore.
@@ -660,7 +698,7 @@ func runServer(t *testing.T, runConfig *runServerConfig) {
 		t.Fatalf("error committing configuration file: %s", err)
 	}
 
-	if doTactics {
+	if doClientTactics {
 		// Configure nonfunctional values that must be overridden by tactics.
 
 		applyParameters := make(map[string]interface{})
@@ -668,6 +706,25 @@ func runServer(t *testing.T, runConfig *runServerConfig) {
 		applyParameters[parameters.TunnelConnectTimeout] = "1s"
 		applyParameters[parameters.TunnelRateLimits] = common.RateLimits{WriteBytesPerSecond: 1}
 
+		err = clientConfig.SetClientParameters("", true, applyParameters)
+		if err != nil {
+			t.Fatalf("SetClientParameters failed: %s", err)
+		}
+
+	} else if runConfig.forceFragmenting {
+		// Directly apply same parameters that would've come from tactics.
+
+		applyParameters := make(map[string]interface{})
+
+		applyParameters[parameters.FragmentorLimitProtocols] = protocol.TunnelProtocols{runConfig.tunnelProtocol}
+		applyParameters[parameters.FragmentorProbability] = 1.0
+		applyParameters[parameters.FragmentorMinTotalBytes] = 1000
+		applyParameters[parameters.FragmentorMaxTotalBytes] = 2000
+		applyParameters[parameters.FragmentorMinWriteBytes] = 1
+		applyParameters[parameters.FragmentorMaxWriteBytes] = 100
+		applyParameters[parameters.FragmentorMinDelay] = 1 * time.Millisecond
+		applyParameters[parameters.FragmentorMaxDelay] = 10 * time.Millisecond
+
 		err = clientConfig.SetClientParameters("", true, applyParameters)
 		if err != nil {
 			t.Fatalf("SetClientParameters failed: %s", err)

+ 85 - 29
psiphon/server/tunnelServer.go

@@ -144,7 +144,10 @@ func (server *TunnelServer) Run() error {
 
 		if protocol.TunnelProtocolUsesQUIC(tunnelProtocol) {
 
-			listener, err = quic.Listen(localAddress)
+			listener, err = quic.Listen(
+				CommonLogger(log),
+				localAddress,
+				support.Config.ObfuscatedSSHKey)
 
 		} else if protocol.TunnelProtocolUsesMarionette(tunnelProtocol) {
 
@@ -282,6 +285,15 @@ func (server *TunnelServer) GetClientHandshaked(
 	return server.sshServer.getClientHandshaked(sessionID)
 }
 
+// UpdateClientAPIParameters updates the recorded handshake API parameters for
+// the client corresponding to sessionID.
+func (server *TunnelServer) UpdateClientAPIParameters(
+	sessionID string,
+	apiParams common.APIParameters) error {
+
+	return server.sshServer.updateClientAPIParameters(sessionID, apiParams)
+}
+
 // ExpectClientDomainBytes indicates whether the client was configured to report
 // domain bytes in its handshake response.
 func (server *TunnelServer) ExpectClientDomainBytes(
@@ -765,6 +777,23 @@ func (sshServer *sshServer) getClientHandshaked(
 	return completed, exhausted, nil
 }
 
+func (sshServer *sshServer) updateClientAPIParameters(
+	sessionID string,
+	apiParams common.APIParameters) error {
+
+	sshServer.clientsMutex.Lock()
+	client := sshServer.clients[sessionID]
+	sshServer.clientsMutex.Unlock()
+
+	if client == nil {
+		return common.ContextError(errors.New("unknown session ID"))
+	}
+
+	client.updateAPIParameters(apiParams)
+
+	return nil
+}
+
 func (sshServer *sshServer) revokeClientAuthorizations(sessionID string) {
 	sshServer.clientsMutex.Lock()
 	client := sshServer.clients[sessionID]
@@ -996,7 +1025,7 @@ func newSshClient(
 }
 
 func (sshClient *sshClient) run(
-	clientConn net.Conn, onSSHHandshakeFinished func()) {
+	baseConn net.Conn, onSSHHandshakeFinished func()) {
 
 	// onSSHHandshakeFinished must be called even if the SSH handshake is aborted.
 	defer func() {
@@ -1005,12 +1034,11 @@ func (sshClient *sshClient) run(
 		}
 	}()
 
-	// Some conns report additional metrics
-	metricsSource, isMetricsSource := clientConn.(MetricsSource)
-
 	// Set initial traffic rules, pre-handshake, based on currently known info.
 	sshClient.setTrafficRules()
 
+	conn := baseConn
+
 	// Wrap the base client connection with an ActivityMonitoredConn which will
 	// terminate the connection if no data is received before the deadline. This
 	// timeout is in effect for the entire duration of the SSH connection. Clients
@@ -1019,22 +1047,22 @@ func (sshClient *sshClient) run(
 	// due to buffering.
 
 	activityConn, err := common.NewActivityMonitoredConn(
-		clientConn,
+		conn,
 		SSH_CONNECTION_READ_DEADLINE,
 		false,
 		nil,
 		nil)
 	if err != nil {
-		clientConn.Close()
+		conn.Close()
 		log.WithContextFields(LogFields{"error": err}).Error("NewActivityMonitoredConn failed")
 		return
 	}
-	clientConn = activityConn
+	conn = activityConn
 
 	// Further wrap the connection in a rate limiting ThrottledConn.
 
-	throttledConn := common.NewThrottledConn(clientConn, sshClient.rateLimits())
-	clientConn = throttledConn
+	throttledConn := common.NewThrottledConn(conn, sshClient.rateLimits())
+	conn = throttledConn
 
 	// Run the initial [obfuscated] SSH handshake in a goroutine so we can both
 	// respect shutdownBroadcast and implement a specific handshake timeout.
@@ -1042,11 +1070,11 @@ func (sshClient *sshClient) run(
 	// too long.
 
 	type sshNewServerConnResult struct {
-		conn     net.Conn
-		sshConn  *ssh.ServerConn
-		channels <-chan ssh.NewChannel
-		requests <-chan *ssh.Request
-		err      error
+		obfuscatedSSHConn *obfuscator.ObfuscatedSSHConn
+		sshConn           *ssh.ServerConn
+		channels          <-chan ssh.NewChannel
+		requests          <-chan *ssh.Request
+		err               error
 	}
 
 	resultChannel := make(chan *sshNewServerConnResult, 2)
@@ -1085,17 +1113,17 @@ func (sshClient *sshClient) run(
 		// Wrap the connection in an SSH deobfuscator when required.
 
 		if protocol.TunnelProtocolUsesObfuscatedSSH(sshClient.tunnelProtocol) {
-			// Note: NewObfuscatedSshConn blocks on network I/O
+			// Note: NewObfuscatedSSHConn blocks on network I/O
 			// TODO: ensure this won't block shutdown
-			conn, result.err = obfuscator.NewObfuscatedSshConn(
+			result.obfuscatedSSHConn, result.err = obfuscator.NewObfuscatedSSHConn(
 				obfuscator.OBFUSCATION_CONN_MODE_SERVER,
 				conn,
 				sshClient.sshServer.support.Config.ObfuscatedSSHKey,
-				nil,
-				nil)
+				nil, nil)
 			if result.err != nil {
 				result.err = common.ContextError(result.err)
 			}
+			conn = result.obfuscatedSSHConn
 		}
 
 		if result.err == nil {
@@ -1105,7 +1133,7 @@ func (sshClient *sshClient) run(
 
 		resultChannel <- result
 
-	}(clientConn)
+	}(conn)
 
 	var result *sshNewServerConnResult
 	select {
@@ -1113,7 +1141,7 @@ func (sshClient *sshClient) run(
 	case <-sshClient.sshServer.shutdownBroadcast:
 		// Close() will interrupt an ongoing handshake
 		// TODO: wait for SSH handshake goroutines to exit before returning?
-		clientConn.Close()
+		conn.Close()
 		return
 	}
 
@@ -1122,7 +1150,7 @@ func (sshClient *sshClient) run(
 	}
 
 	if result.err != nil {
-		clientConn.Close()
+		conn.Close()
 		// This is a Debug log due to noise. The handshake often fails due to I/O
 		// errors as clients frequently interrupt connections in progress when
 		// client-side load balancing completes a connection to a different server.
@@ -1144,7 +1172,7 @@ func (sshClient *sshClient) run(
 	sshClient.Unlock()
 
 	if !sshClient.sshServer.registerEstablishedClient(sshClient) {
-		clientConn.Close()
+		conn.Close()
 		log.WithContext().Warning("register failed")
 		return
 	}
@@ -1156,10 +1184,22 @@ func (sshClient *sshClient) run(
 
 	sshClient.sshServer.unregisterEstablishedClient(sshClient)
 
-	var additionalMetrics LogFields
-	if isMetricsSource {
-		additionalMetrics = metricsSource.GetMetrics()
+	// Some conns report additional metrics. Meek conns report resiliency
+	// metrics and fragmentor.Conns report fragmentor configs.
+	//
+	// Limitation: for meek, GetMetrics from underlying fragmentor.Conn(s)
+	// should be called in order to log fragmentor metrics for meek sessions.
+
+	var additionalMetrics []LogFields
+	if metricsSource, ok := baseConn.(common.MetricsSource); ok {
+		additionalMetrics = append(
+			additionalMetrics, LogFields(metricsSource.GetMetrics()))
+	}
+	if result.obfuscatedSSHConn != nil {
+		additionalMetrics = append(
+			additionalMetrics, LogFields(result.obfuscatedSSHConn.GetMetrics()))
 	}
+
 	sshClient.logTunnel(additionalMetrics)
 
 	// Transfer OSL seed state -- the OSL progress -- from the closing
@@ -1724,7 +1764,7 @@ func (sshClient *sshClient) setUDPChannel(channel ssh.Channel) {
 	sshClient.Unlock()
 }
 
-func (sshClient *sshClient) logTunnel(additionalMetrics LogFields) {
+func (sshClient *sshClient) logTunnel(additionalMetrics []LogFields) {
 
 	// Note: reporting duration based on last confirmed data transfer, which
 	// is reads for sshClient.activityConn.GetActiveDuration(), and not
@@ -1767,8 +1807,8 @@ func (sshClient *sshClient) logTunnel(additionalMetrics LogFields) {
 		sshClient.udpTrafficState.bytesDown
 
 	// Merge in additional metrics from the optional metrics source
-	if additionalMetrics != nil {
-		for name, value := range additionalMetrics {
+	for _, metrics := range additionalMetrics {
+		for name, value := range metrics {
 			// Don't overwrite any basic fields
 			if logFields[name] == nil {
 				logFields[name] = value
@@ -2060,6 +2100,22 @@ func (sshClient *sshClient) getHandshaked() (bool, bool) {
 	return completed, exhausted
 }
 
+func (sshClient *sshClient) updateAPIParameters(
+	apiParams common.APIParameters) {
+
+	sshClient.Lock()
+	defer sshClient.Unlock()
+
+	// Only update after handshake has initialized API params.
+	if !sshClient.handshakeState.completed {
+		return
+	}
+
+	for name, value := range apiParams {
+		sshClient.handshakeState.apiParams[name] = value
+	}
+}
+
 func (sshClient *sshClient) expectDomainBytes() bool {
 	sshClient.Lock()
 	defer sshClient.Unlock()

+ 26 - 0
psiphon/serverApi.go

@@ -771,6 +771,32 @@ func getBaseAPIParameters(
 
 	params[tactics.APPLIED_TACTICS_TAG_PARAMETER_NAME] = config.clientParameters.Get().Tag()
 
+	if dialStats.DialPortNumber != "" {
+		params["dial_port_number"] = dialStats.DialPortNumber
+	}
+
+	if dialStats.QUICVersion != "" {
+		params["quic_version"] = dialStats.QUICVersion
+	}
+
+	if dialStats.QUICDialSNIAddress != "" {
+		params["quic_dial_sni_address"] = dialStats.QUICDialSNIAddress
+	}
+
+	if dialStats.DialConnMetrics != nil {
+		metrics := dialStats.DialConnMetrics.GetMetrics()
+		for name, value := range metrics {
+			params[name] = fmt.Sprintf("%v", value)
+		}
+	}
+
+	if dialStats.ObfuscatedSSHConnMetrics != nil {
+		metrics := dialStats.ObfuscatedSSHConnMetrics.GetMetrics()
+		for name, value := range metrics {
+			params[name] = fmt.Sprintf("%v", value)
+		}
+	}
+
 	return params
 }
 

+ 21 - 10
psiphon/tlsDialer.go

@@ -58,6 +58,7 @@ import (
 	"crypto/x509"
 	"encoding/hex"
 	"errors"
+	"io/ioutil"
 	"net"
 	"time"
 
@@ -263,13 +264,6 @@ func CustomTLSDial(
 	network, addr string,
 	config *CustomTLSConfig) (net.Conn, error) {
 
-	if !config.SkipVerify &&
-		config.VerifyLegacyCertificate == nil &&
-		config.TrustedCACertificatesFilename != "" {
-		return nil, common.ContextError(
-			errors.New("TrustedCACertificatesFilename not supported"))
-	}
-
 	dialAddr := addr
 	if config.DialAddr != "" {
 		dialAddr = config.DialAddr
@@ -330,6 +324,20 @@ func CustomTLSDial(
 		copy(obfuscatedSessionTicketKey[:], key)
 	}
 
+	var tlsRootCAs *x509.CertPool
+
+	if !config.SkipVerify &&
+		config.VerifyLegacyCertificate == nil &&
+		config.TrustedCACertificatesFilename != "" {
+
+		tlsRootCAs = x509.NewCertPool()
+		certData, err := ioutil.ReadFile(config.TrustedCACertificatesFilename)
+		if err != nil {
+			return nil, common.ContextError(err)
+		}
+		tlsRootCAs.AppendCertsFromPEM(certData)
+	}
+
 	// Depending on the selected TLS profile, the TLS provider will be tris
 	// (TLS 1.3) or utls (all other profiles).
 
@@ -343,6 +351,7 @@ func CustomTLSDial(
 		}
 
 		tlsConfig := &utls.Config{
+			RootCAs:            tlsRootCAs,
 			InsecureSkipVerify: tlsConfigInsecureSkipVerify,
 			ServerName:         tlsConfigServerName,
 			ClientSessionCache: clientSessionCache,
@@ -377,9 +386,11 @@ func CustomTLSDial(
 		}
 
 		tlsConfig := &tris.Config{
-			InsecureSkipVerify: tlsConfigInsecureSkipVerify,
-			ServerName:         tlsConfigServerName,
-			ClientSessionCache: clientSessionCache,
+			RootCAs:                 tlsRootCAs,
+			InsecureSkipVerify:      tlsConfigInsecureSkipVerify,
+			ServerName:              tlsConfigServerName,
+			ClientSessionCache:      clientSessionCache,
+			UseExtendedMasterSecret: true,
 		}
 
 		conn = &trisConn{

+ 51 - 30
psiphon/tunnel.go

@@ -36,6 +36,7 @@ import (
 	"github.com/Psiphon-Labs/goarista/monotime"
 	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common"
 	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/crypto/ssh"
+	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/fragmentor"
 	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/marionette"
 	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/obfuscator"
 	"github.com/Psiphon-Labs/psiphon-tunnel-core/psiphon/common/parameters"
@@ -131,6 +132,11 @@ type DialStats struct {
 	UserAgent                      string
 	SelectedTLSProfile             bool
 	TLSProfile                     string
+	DialPortNumber                 string
+	QUICVersion                    string
+	QUICDialSNIAddress             string
+	DialConnMetrics                common.MetricsSource
+	ObfuscatedSSHConnMetrics       common.MetricsSource
 }
 
 // ConnectTunnel first makes a network transport connection to the
@@ -691,10 +697,13 @@ func initMeekConfig(
 
 // initDialConfig is a helper that creates a DialConfig for the tunnel.
 func initDialConfig(
-	config *Config, meekConfig *MeekConfig) (*DialConfig, *DialStats) {
+	config *Config,
+	meekConfig *MeekConfig,
+	tunnelProtocol string) (*DialConfig, *DialStats) {
 
-	var upstreamProxyType string
+	p := config.clientParameters.Get()
 
+	var upstreamProxyType string
 	if config.UseUpstreamProxy() {
 		// Note: UpstreamProxyURL will be validated in the dial
 		proxyURL, err := url.Parse(config.UpstreamProxyURL)
@@ -711,9 +720,7 @@ func initDialConfig(
 		}
 	}
 
-	additionalCustomHeaders :=
-		config.clientParameters.Get().HTTPHeaders(parameters.AdditionalCustomHeaders)
-
+	additionalCustomHeaders := p.HTTPHeaders(parameters.AdditionalCustomHeaders)
 	if additionalCustomHeaders != nil {
 		for k, v := range additionalCustomHeaders {
 			dialCustomHeaders[k] = make([]string, len(v))
@@ -722,12 +729,13 @@ func initDialConfig(
 	}
 
 	// Set User-Agent when using meek or an upstream HTTP proxy
-
 	var selectedUserAgent bool
 	if meekConfig != nil || upstreamProxyType == "http" {
 		selectedUserAgent = UserAgentIfUnset(config.clientParameters, dialCustomHeaders)
 	}
 
+	fragmentorConfig := fragmentor.NewUpstreamConfig(p, tunnelProtocol)
+
 	dialConfig := &DialConfig{
 		UpstreamProxyURL:              config.UpstreamProxyURL,
 		CustomHeaders:                 dialCustomHeaders,
@@ -735,6 +743,7 @@ func initDialConfig(
 		DnsServerGetter:               config.DnsServerGetter,
 		IPv6Synthesizer:               config.IPv6Synthesizer,
 		TrustedCACertificatesFilename: config.TrustedCACertificatesFilename,
+		FragmentorConfig:              fragmentorConfig,
 	}
 
 	dialStats := &DialStats{}
@@ -822,46 +831,50 @@ func dialSsh(
 
 	// Note: when SSHClientVersion is "", a default is supplied by the ssh package:
 	// https://godoc.org/golang.org/x/crypto/ssh#ClientConfig
-	var selectedSSHClientVersion bool
-	SSHClientVersion := ""
-	useObfuscatedSsh := false
 	var directDialAddress string
-	var quicDialSNIAddress string
+	var QUICVersion string
+	var QUICDialSNIAddress string
+	var SSHClientVersion string
 	var meekConfig *MeekConfig
 	var err error
 
 	switch selectedProtocol {
 	case protocol.TUNNEL_PROTOCOL_OBFUSCATED_SSH, protocol.TUNNEL_PROTOCOL_TAPDANCE_OBFUSCATED_SSH:
-		useObfuscatedSsh = true
 		directDialAddress = fmt.Sprintf("%s:%d", serverEntry.IpAddress, serverEntry.SshObfuscatedPort)
 
 	case protocol.TUNNEL_PROTOCOL_QUIC_OBFUSCATED_SSH:
-		useObfuscatedSsh = true
 		directDialAddress = fmt.Sprintf("%s:%d", serverEntry.IpAddress, serverEntry.SshObfuscatedQUICPort)
-		quicDialSNIAddress = fmt.Sprintf("%s:%d", common.GenerateHostName(), serverEntry.SshObfuscatedQUICPort)
+		QUICVersion = selectQUICVersion(config.clientParameters)
+		QUICDialSNIAddress = fmt.Sprintf("%s:%d", common.GenerateHostName(), serverEntry.SshObfuscatedQUICPort)
 
 	case protocol.TUNNEL_PROTOCOL_MARIONETTE_OBFUSCATED_SSH:
-		useObfuscatedSsh = true
 		directDialAddress = serverEntry.IpAddress
 
 	case protocol.TUNNEL_PROTOCOL_SSH:
-		selectedSSHClientVersion = true
 		SSHClientVersion = pickSSHClientVersion()
 		directDialAddress = fmt.Sprintf("%s:%d", serverEntry.IpAddress, serverEntry.SshPort)
 
 	default:
-		useObfuscatedSsh = true
 		meekConfig, err = initMeekConfig(config, serverEntry, selectedProtocol, sessionId)
 		if err != nil {
 			return nil, common.ContextError(err)
 		}
 	}
 
-	dialConfig, dialStats := initDialConfig(config, meekConfig)
+	dialConfig, dialStats := initDialConfig(config, meekConfig, selectedProtocol)
+
+	if meekConfig != nil {
+		_, dialStats.DialPortNumber, _ = net.SplitHostPort(meekConfig.DialAddress)
+	} else {
+		_, dialStats.DialPortNumber, _ = net.SplitHostPort(directDialAddress)
+	}
 
-	// Add dial stats specific to SSH dialing
+	switch selectedProtocol {
+	case protocol.TUNNEL_PROTOCOL_QUIC_OBFUSCATED_SSH:
+		dialStats.QUICVersion = QUICVersion
+		dialStats.QUICDialSNIAddress = QUICDialSNIAddress
 
-	if selectedSSHClientVersion {
+	case protocol.TUNNEL_PROTOCOL_SSH:
 		dialStats.SelectedSSHClientVersion = true
 		dialStats.SSHClientVersion = SSHClientVersion
 	}
@@ -902,8 +915,9 @@ func dialSsh(
 			ctx,
 			packetConn,
 			remoteAddr,
-			quicDialSNIAddress,
-			selectQUICVersion(config.clientParameters))
+			QUICDialSNIAddress,
+			QUICVersion,
+			serverEntry.SshObfuscatedKey)
 		if err != nil {
 			return nil, common.ContextError(err)
 		}
@@ -932,21 +946,26 @@ func dialSsh(
 
 	} else {
 
-		dialConn, err = DialTCPFragmentor(
+		dialConn, err = DialTCP(
 			ctx,
 			directDialAddress,
-			dialConfig,
-			selectedProtocol,
-			config.clientParameters,
-			nil)
+			dialConfig)
 		if err != nil {
 			return nil, common.ContextError(err)
 		}
 	}
 
+	// Some conns report additional metrics. fragmentor.Conns report
+	// fragmentor configs.
+	//
+	// Limitation: for meek, GetMetrics from underlying fragmentor.Conn(s)
+	// should be called in order to log fragmentor metrics for meek sessions.
+	if metricsSource, ok := dialConn.(common.MetricsSource); ok {
+		dialStats.DialConnMetrics = metricsSource
+	}
+
 	// If dialConn is not a Closer, tunnel failure detection may be slower
-	_, ok := dialConn.(common.Closer)
-	if !ok {
+	if _, ok := dialConn.(common.Closer); !ok {
 		NoticeAlert("tunnel.dialSsh: dialConn is not a Closer")
 	}
 
@@ -971,8 +990,8 @@ func dialSsh(
 
 	// Add obfuscated SSH layer
 	var sshConn net.Conn = throttledConn
-	if useObfuscatedSsh {
-		sshConn, err = obfuscator.NewObfuscatedSshConn(
+	if protocol.TunnelProtocolUsesObfuscatedSSH(selectedProtocol) {
+		obfuscatedSSHConn, err := obfuscator.NewObfuscatedSSHConn(
 			obfuscator.OBFUSCATION_CONN_MODE_CLIENT,
 			throttledConn,
 			serverEntry.SshObfuscatedKey,
@@ -981,6 +1000,8 @@ func dialSsh(
 		if err != nil {
 			return nil, common.ContextError(err)
 		}
+		sshConn = obfuscatedSSHConn
+		dialStats.ObfuscatedSSHConnMetrics = obfuscatedSSHConn
 	}
 
 	// Now establish the SSH session over the conn transport

+ 0 - 6
vendor/github.com/Psiphon-Labs/tls-tris/13.go

@@ -152,12 +152,6 @@ CurvePreferenceLoop:
 		return errors.New("tls: HelloRetryRequest not implemented") // TODO(filippo)
 	}
 
-	if committer, ok := c.conn.(Committer); ok {
-		if err := committer.Commit(); err != nil {
-			return err
-		}
-	}
-
 	privateKey, serverKS, err := config.generateKeyShare(ks.group)
 	if err != nil {
 		c.sendAlert(alertInternalError)

+ 1 - 0
vendor/github.com/Psiphon-Labs/tls-tris/alert.go

@@ -38,6 +38,7 @@ const (
 	alertInappropriateFallback  alert = 86
 	alertUserCanceled           alert = 90
 	alertNoRenegotiation        alert = 100
+	alertUnsupportedExtension   alert = 110
 	alertCertificateRequired    alert = 116
 	alertNoApplicationProtocol  alert = 120
 	alertSuccess                alert = 255 // dummy value returned by unmarshal functions

+ 13 - 17
vendor/github.com/Psiphon-Labs/tls-tris/common.go

@@ -23,12 +23,11 @@ import (
 )
 
 const (
-	VersionSSL30        = 0x0300
-	VersionTLS10        = 0x0301
-	VersionTLS11        = 0x0302
-	VersionTLS12        = 0x0303
-	VersionTLS13        = 0x0304
-	VersionTLS13Draft28 = 0x7f00 | 28
+	VersionSSL30 = 0x0300
+	VersionTLS10 = 0x0301
+	VersionTLS11 = 0x0302
+	VersionTLS12 = 0x0303
+	VersionTLS13 = 0x0304
 )
 
 const (
@@ -39,7 +38,7 @@ const (
 	maxWarnAlertCount = 5            // maximum number of consecutive warning alerts
 
 	minVersion = VersionTLS12
-	maxVersion = VersionTLS13Draft28
+	maxVersion = VersionTLS13
 )
 
 // TLS record types.
@@ -85,6 +84,7 @@ const (
 	extensionSignatureAlgorithms     uint16 = 13
 	extensionALPN                    uint16 = 16
 	extensionSCT                     uint16 = 18 // https://tools.ietf.org/html/rfc6962#section-6
+	extensionEMS                     uint16 = 23
 	extensionSessionTicket           uint16 = 35
 	extensionPreSharedKey            uint16 = 41
 	extensionEarlyData               uint16 = 42
@@ -260,6 +260,7 @@ type ClientSessionState struct {
 	masterSecret       []byte                // MasterSecret generated by client on a full handshake
 	serverCertificates []*x509.Certificate   // Certificate chain presented by the server
 	verifiedChains     [][]*x509.Certificate // Certificate chains we built for verification
+	useEMS             bool                  // State of extended master secret
 }
 
 // ClientSessionCache is a cache of ClientSessionState objects that can be used
@@ -642,6 +643,10 @@ type Config struct {
 	// for new tickets and any subsequent keys can be used to decrypt old
 	// tickets.
 	sessionTicketKeys []ticketKey
+
+	// UseExtendedMasterSecret indicates whether or not the connection
+	// should use the extended master secret computation if available
+	UseExtendedMasterSecret bool
 }
 
 // ticketKeyNameLen is the number of bytes of identifier that is prepended to
@@ -712,6 +717,7 @@ func (c *Config) Clone() *Config {
 		AcceptDelegatedCredential:   c.AcceptDelegatedCredential,
 		GetDelegatedCredential:      c.GetDelegatedCredential,
 		sessionTicketKeys:           sessionTicketKeys,
+		UseExtendedMasterSecret:     c.UseExtendedMasterSecret,
 	}
 }
 
@@ -882,12 +888,6 @@ func (c *Config) pickVersion(peerSupportedVersions []uint16) (uint16, bool) {
 // configSuppVersArray is the backing array of Config.getSupportedVersions
 var configSuppVersArray = [...]uint16{VersionTLS13, VersionTLS12, VersionTLS11, VersionTLS10, VersionSSL30}
 
-// tls13DraftSuppVersArray is the backing array of Config.getSupportedVersions
-// with TLS 1.3 draft versions included.
-//
-// TODO: remove once TLS 1.3 is finalised.
-var tls13DraftSuppVersArray = [...]uint16{VersionTLS13Draft28, VersionTLS12, VersionTLS11, VersionTLS10, VersionSSL30}
-
 // getSupportedVersions returns the protocol versions that are supported by the
 // current configuration.
 func (c *Config) getSupportedVersions() []uint16 {
@@ -903,10 +903,6 @@ func (c *Config) getSupportedVersions() []uint16 {
 	if maxVersion < minVersion {
 		return nil
 	}
-	// TODO: remove once TLS 1.3 is finalised.
-	if maxVersion == VersionTLS13 {
-		return tls13DraftSuppVersArray[:len(tls13DraftSuppVersArray)-int(minVersion-VersionSSL30)]
-	}
 	return configSuppVersArray[VersionTLS13-maxVersion : VersionTLS13-minVersion+1]
 }
 

+ 13 - 7
vendor/github.com/Psiphon-Labs/tls-tris/conn.go

@@ -66,6 +66,8 @@ type Conn struct {
 	// renegotiation extension. (This is meaningless as a server because
 	// renegotiation is not supported in that case.)
 	secureRenegotiation bool
+	// indicates whether extended MasterSecret extension is used (see RFC7627)
+	useEMS bool
 
 	// clientFinishedIsFirst is true if the client sent the first Finished
 	// message during the most recent handshake. This is recorded because
@@ -472,23 +474,30 @@ func (hc *halfConn) encrypt(b *block, explicitIVLen int) (bool, alert) {
 		case aead:
 			// explicitIVLen is always 0 for TLS1.3
 			payloadLen := len(b.data) - recordHeaderLen - explicitIVLen
+			payloadOffset := recordHeaderLen + explicitIVLen
 			nonce := b.data[recordHeaderLen : recordHeaderLen+explicitIVLen]
 			if len(nonce) == 0 {
 				nonce = hc.seq[:]
 			}
-			payload = b.data[recordHeaderLen+explicitIVLen:]
-			payload = payload[:payloadLen]
 
 			var additionalData []byte
 			if hc.version < VersionTLS13 {
+				// make room in a buffer for payload + MAC
+				b.resize(len(b.data) + c.Overhead())
+
+				payload = b.data[payloadOffset : payloadOffset+payloadLen]
 				copy(hc.additionalData[:], hc.seq[:])
 				copy(hc.additionalData[8:], b.data[:3])
 				binary.BigEndian.PutUint16(hc.additionalData[11:], uint16(payloadLen))
 				additionalData = hc.additionalData[:]
-				b.resize(len(b.data) + c.Overhead())
 			} else {
+				// make room in a buffer for TLSCiphertext.encrypted_record:
+				// payload + MAC + extra data if needed
+				b.resize(len(b.data) + c.Overhead() + 1)
+
+				payload = b.data[payloadOffset : payloadOffset+payloadLen+1]
 				// 1 byte of content type is appended to payload and encrypted
-				payload = append(payload, b.data[0])
+				payload[len(payload)-1] = b.data[0]
 
 				// opaque_type
 				b.data[0] = byte(recordTypeApplicationData)
@@ -498,9 +507,6 @@ func (hc *halfConn) encrypt(b *block, explicitIVLen int) (bool, alert) {
 				additionalData[0] = b.data[0]
 				binary.BigEndian.PutUint16(additionalData[1:], VersionTLS12)
 				binary.BigEndian.PutUint16(additionalData[3:], uint16(len(payload)+c.Overhead()))
-
-				// make room for TLSCiphertext.encrypted_record
-				b.resize(len(payload) + recordHeaderLen + c.Overhead())
 			}
 			c.Seal(payload[:0], nonce, payload, additionalData)
 		case cbcMode:

+ 23 - 6
vendor/github.com/Psiphon-Labs/tls-tris/handshake_client.go

@@ -67,6 +67,7 @@ func makeClientHello(config *Config) (*clientHelloMsg, error) {
 		secureRenegotiationSupported: true,
 		delegatedCredential:          config.AcceptDelegatedCredential,
 		alpnProtocols:                config.NextProtos,
+		extendedMSSupported:          config.UseExtendedMasterSecret,
 	}
 	possibleCipherSuites := config.cipherSuites()
 	hello.cipherSuites = make([]uint16, 0, len(possibleCipherSuites))
@@ -589,6 +590,13 @@ func (hs *clientHandshakeState) doFullHandshake() error {
 			return err
 		}
 	}
+	c.useEMS = hs.serverHello.extendedMSSupported
+	hs.masterSecret = masterFromPreMasterSecret(c.vers, hs.suite, preMasterSecret, hs.hello.random, hs.serverHello.random, hs.finishedHash, c.useEMS)
+
+	if err := c.config.writeKeyLog("CLIENT_RANDOM", hs.hello.random, hs.masterSecret); err != nil {
+		c.sendAlert(alertInternalError)
+		return errors.New("tls: failed to write to key log: " + err.Error())
+	}
 
 	if chainToSend != nil && len(chainToSend.Certificate) > 0 {
 		certVerify := &certificateVerifyMsg{
@@ -631,12 +639,6 @@ func (hs *clientHandshakeState) doFullHandshake() error {
 		}
 	}
 
-	hs.masterSecret = masterFromPreMasterSecret(c.vers, hs.suite, preMasterSecret, hs.hello.random, hs.serverHello.random)
-	if err := c.config.writeKeyLog("CLIENT_RANDOM", hs.hello.random, hs.masterSecret); err != nil {
-		c.sendAlert(alertInternalError)
-		return errors.New("tls: failed to write to key log: " + err.Error())
-	}
-
 	hs.finishedHash.discardHandshakeBuffer()
 
 	return nil
@@ -697,6 +699,16 @@ func (hs *clientHandshakeState) processServerHello() (bool, error) {
 		}
 	}
 
+	if hs.serverHello.extendedMSSupported {
+		if hs.hello.extendedMSSupported {
+			c.useEMS = true
+		} else {
+			// server wants to calculate master secret in a different way than client
+			c.sendAlert(alertUnsupportedExtension)
+			return false, errors.New("tls: unexpected extension (EMS) received in SH")
+		}
+	}
+
 	clientDidNPN := hs.hello.nextProtoNeg
 	clientDidALPN := len(hs.hello.alpnProtocols) > 0
 	serverHasNPN := hs.serverHello.nextProtoNeg
@@ -727,6 +739,10 @@ func (hs *clientHandshakeState) processServerHello() (bool, error) {
 		return false, nil
 	}
 
+	if hs.session.useEMS != c.useEMS {
+		return false, errors.New("differing EMS state")
+	}
+
 	if hs.session.vers != c.vers {
 		c.sendAlert(alertHandshakeFailure)
 		return false, errors.New("tls: server resumed a session with a different version")
@@ -797,6 +813,7 @@ func (hs *clientHandshakeState) readSessionTicket() error {
 		masterSecret:       hs.masterSecret,
 		serverCertificates: c.peerCertificates,
 		verifiedChains:     c.verifiedChains,
+		useEMS:             c.useEMS,
 	}
 
 	return nil

+ 40 - 3
vendor/github.com/Psiphon-Labs/tls-tris/handshake_messages.go

@@ -60,6 +60,7 @@ type clientHelloMsg struct {
 	pskKeyExchangeModes              []uint8
 	earlyData                        bool
 	delegatedCredential              bool
+	extendedMSSupported              bool // RFC7627
 }
 
 // Function used for signature_algorithms and signature_algorithms_cert
@@ -139,7 +140,8 @@ func (m *clientHelloMsg) equal(i interface{}) bool {
 		eqKeyShares(m.keyShares, m1.keyShares) &&
 		eqUint16s(m.supportedVersions, m1.supportedVersions) &&
 		m.earlyData == m1.earlyData &&
-		m.delegatedCredential == m1.delegatedCredential
+		m.delegatedCredential == m1.delegatedCredential &&
+		m.extendedMSSupported == m1.extendedMSSupported
 }
 
 func (m *clientHelloMsg) marshal() []byte {
@@ -158,7 +160,6 @@ func (m *clientHelloMsg) marshal() []byte {
 	numExtensions := 0
 	extensionsLength := 0
 
-	// Indicates wether to send signature_algorithms_cert extension
 	if m.nextProtoNeg {
 		numExtensions++
 	}
@@ -225,6 +226,9 @@ func (m *clientHelloMsg) marshal() []byte {
 	if m.delegatedCredential {
 		numExtensions++
 	}
+	if m.extendedMSSupported {
+		numExtensions++
+	}
 	if numExtensions > 0 {
 		extensionsLength += 4 * numExtensions
 		length += 2 + extensionsLength
@@ -446,6 +450,10 @@ func (m *clientHelloMsg) marshal() []byte {
 		binary.BigEndian.PutUint16(z, extensionDelegatedCredential)
 		z = z[4:]
 	}
+	if m.extendedMSSupported {
+		binary.BigEndian.PutUint16(z, extensionEMS)
+		z = z[4:]
+	}
 
 	m.raw = x
 
@@ -842,6 +850,14 @@ func (m *clientHelloMsg) randomizedMarshal() []byte {
 				z = z[4:]
 			})
 	}
+	if m.extendedMSSupported && common.FlipCoin() { // May be omitted
+		numExtensions++
+		extensionMarshalers = append(extensionMarshalers,
+			func() {
+				binary.BigEndian.PutUint16(z, extensionEMS)
+				z = z[4:]
+			})
+	}
 
 	// Optional, additional extensions
 
@@ -1001,6 +1017,7 @@ func (m *clientHelloMsg) unmarshal(data []byte) alert {
 	m.pskKeyExchangeModes = nil
 	m.earlyData = false
 	m.delegatedCredential = false
+	m.extendedMSSupported = false
 
 	if len(data) == 0 {
 		// ClientHello is optionally followed by extension data
@@ -1268,6 +1285,12 @@ func (m *clientHelloMsg) unmarshal(data []byte) alert {
 		case extensionDelegatedCredential:
 			// https://tools.ietf.org/html/draft-ietf-tls-subcerts-02
 			m.delegatedCredential = true
+		case extensionEMS:
+			// RFC 7627
+			m.extendedMSSupported = true
+			if length != 0 {
+				return alertDecodeError
+			}
 		}
 		data = data[length:]
 		bindersOffset += length
@@ -1300,6 +1323,9 @@ type serverHelloMsg struct {
 	keyShare    keyShare
 	psk         bool
 	pskIdentity uint16
+
+	// RFC7627
+	extendedMSSupported bool
 }
 
 func (m *serverHelloMsg) equal(i interface{}) bool {
@@ -1333,7 +1359,8 @@ func (m *serverHelloMsg) equal(i interface{}) bool {
 		m.keyShare.group == m1.keyShare.group &&
 		bytes.Equal(m.keyShare.data, m1.keyShare.data) &&
 		m.psk == m1.psk &&
-		m.pskIdentity == m1.pskIdentity
+		m.pskIdentity == m1.pskIdentity &&
+		m.extendedMSSupported == m1.extendedMSSupported
 }
 
 func (m *serverHelloMsg) marshal() []byte {
@@ -1364,6 +1391,9 @@ func (m *serverHelloMsg) marshal() []byte {
 		extensionsLength += 1 + len(m.secureRenegotiation)
 		numExtensions++
 	}
+	if m.extendedMSSupported {
+		numExtensions++
+	}
 	if alpnLen := len(m.alpnProtocol); alpnLen > 0 {
 		if alpnLen >= 256 {
 			panic("invalid ALPN protocol")
@@ -1524,6 +1554,10 @@ func (m *serverHelloMsg) marshal() []byte {
 		z[5] = byte(m.pskIdentity)
 		z = z[6:]
 	}
+	if m.extendedMSSupported {
+		binary.BigEndian.PutUint16(z, extensionEMS)
+		z = z[4:]
+	}
 
 	m.raw = x
 
@@ -1560,6 +1594,7 @@ func (m *serverHelloMsg) unmarshal(data []byte) alert {
 	m.keyShare.data = nil
 	m.psk = false
 	m.pskIdentity = 0
+	m.extendedMSSupported = false
 
 	if len(data) == 0 {
 		// ServerHello is optionally followed by extension data
@@ -1701,6 +1736,8 @@ func (m *serverHelloMsg) unmarshal(data []byte) alert {
 			}
 			m.psk = true
 			m.pskIdentity = uint16(data[0])<<8 | uint16(data[1])
+		case extensionEMS:
+			m.extendedMSSupported = true
 		}
 		data = data[length:]
 	}

+ 14 - 8
vendor/github.com/Psiphon-Labs/tls-tris/handshake_server.go

@@ -16,10 +16,6 @@ import (
 	"sync/atomic"
 )
 
-type Committer interface {
-	Commit() error
-}
-
 // serverHandshakeState contains details of a server handshake in progress.
 // It's discarded once the handshake has completed.
 type serverHandshakeState struct {
@@ -281,10 +277,10 @@ Curves:
 
 	if len(hs.clientHello.alpnProtocols) > 0 {
 		if selectedProto, fallback := mutualProtocol(hs.clientHello.alpnProtocols, c.config.NextProtos); !fallback {
-			if hs.hello != nil {
-				hs.hello.alpnProtocol = selectedProto
-			} else {
+			if hs.hello13Enc != nil {
 				hs.hello13Enc.alpnProtocol = selectedProto
+			} else {
+				hs.hello.alpnProtocol = selectedProto
 			}
 			c.clientProtocol = selectedProto
 		}
@@ -413,6 +409,11 @@ func (hs *serverHandshakeState) checkForResumption() bool {
 		return false
 	}
 
+	// Do not resume connections where client support for EMS has changed
+	if (hs.clientHello.extendedMSSupported && c.config.UseExtendedMasterSecret) != hs.sessionState.usedEMS {
+		return false
+	}
+
 	cipherSuiteOk := false
 	// Check that the client is still offering the ciphersuite in the session.
 	for _, id := range hs.clientHello.cipherSuites {
@@ -450,6 +451,7 @@ func (hs *serverHandshakeState) doResumeHandshake() error {
 	// that we're doing a resumption.
 	hs.hello.sessionId = hs.clientHello.sessionId
 	hs.hello.ticketSupported = hs.sessionState.usedOldKey
+	hs.hello.extendedMSSupported = hs.clientHello.extendedMSSupported && c.config.UseExtendedMasterSecret
 	hs.finishedHash = newFinishedHash(c.vers, hs.suite)
 	hs.finishedHash.discardHandshakeBuffer()
 	hs.finishedHash.Write(hs.clientHello.marshal())
@@ -465,6 +467,7 @@ func (hs *serverHandshakeState) doResumeHandshake() error {
 	}
 
 	hs.masterSecret = hs.sessionState.masterSecret
+	c.useEMS = hs.sessionState.usedEMS
 
 	return nil
 }
@@ -478,6 +481,7 @@ func (hs *serverHandshakeState) doFullHandshake() error {
 
 	hs.hello.ticketSupported = hs.clientHello.ticketSupported && !c.config.SessionTicketsDisabled
 	hs.hello.cipherSuite = hs.suite.id
+	hs.hello.extendedMSSupported = hs.clientHello.extendedMSSupported && c.config.UseExtendedMasterSecret
 
 	hs.finishedHash = newFinishedHash(hs.c.vers, hs.suite)
 	if c.config.ClientAuth == NoClientCert {
@@ -611,7 +615,8 @@ func (hs *serverHandshakeState) doFullHandshake() error {
 		}
 		return err
 	}
-	hs.masterSecret = masterFromPreMasterSecret(c.vers, hs.suite, preMasterSecret, hs.clientHello.random, hs.hello.random)
+	c.useEMS = hs.hello.extendedMSSupported
+	hs.masterSecret = masterFromPreMasterSecret(c.vers, hs.suite, preMasterSecret, hs.clientHello.random, hs.hello.random, hs.finishedHash, c.useEMS)
 	if err := c.config.writeKeyLog("CLIENT_RANDOM", hs.clientHello.random, hs.masterSecret); err != nil {
 		c.sendAlert(alertInternalError)
 		return err
@@ -741,6 +746,7 @@ func (hs *serverHandshakeState) sendSessionTicket() error {
 		cipherSuite:  hs.suite.id,
 		masterSecret: hs.masterSecret,
 		certificates: hs.certsFromClient,
+		usedEMS:      c.useEMS,
 	}
 	m.ticket, err = c.encryptTicket(state.marshal())
 	if err != nil {

+ 16 - 8
vendor/github.com/Psiphon-Labs/tls-tris/prf.go

@@ -117,6 +117,7 @@ var masterSecretLabel = []byte("master secret")
 var keyExpansionLabel = []byte("key expansion")
 var clientFinishedLabel = []byte("client finished")
 var serverFinishedLabel = []byte("server finished")
+var extendedMasterSecretLabel = []byte("extended master secret")
 
 func prfAndHashForVersion(version uint16, suite *cipherSuite) (func(result, secret, label, seed []byte), crypto.Hash) {
 	switch version {
@@ -141,14 +142,21 @@ func prfForVersion(version uint16, suite *cipherSuite) func(result, secret, labe
 
 // masterFromPreMasterSecret generates the master secret from the pre-master
 // secret. See http://tools.ietf.org/html/rfc5246#section-8.1
-func masterFromPreMasterSecret(version uint16, suite *cipherSuite, preMasterSecret, clientRandom, serverRandom []byte) []byte {
-	seed := make([]byte, 0, len(clientRandom)+len(serverRandom))
-	seed = append(seed, clientRandom...)
-	seed = append(seed, serverRandom...)
-
-	masterSecret := make([]byte, masterSecretLength)
-	prfForVersion(version, suite)(masterSecret, preMasterSecret, masterSecretLabel, seed)
-	return masterSecret
+func masterFromPreMasterSecret(version uint16, suite *cipherSuite, preMasterSecret, clientRandom, serverRandom []byte, fin finishedHash, ems bool) []byte {
+	if ems {
+		session_hash := fin.Sum()
+		masterSecret := make([]byte, masterSecretLength)
+		prfForVersion(version, suite)(masterSecret, preMasterSecret, extendedMasterSecretLabel, session_hash)
+		return masterSecret
+	} else {
+		seed := make([]byte, 0, len(clientRandom)+len(serverRandom))
+		seed = append(seed, clientRandom...)
+		seed = append(seed, serverRandom...)
+
+		masterSecret := make([]byte, masterSecretLength)
+		prfForVersion(version, suite)(masterSecret, preMasterSecret, masterSecretLabel, seed)
+		return masterSecret
+	}
 }
 
 // keysFromMasterSecret generates the connection keys from the master

+ 10 - 2
vendor/github.com/Psiphon-Labs/tls-tris/ticket.go

@@ -45,6 +45,7 @@ type SessionTicketSealer interface {
 type sessionState struct {
 	vers         uint16
 	cipherSuite  uint16
+	usedEMS      bool
 	masterSecret []byte
 	certificates [][]byte
 	// usedOldKey is true if the ticket from which this session came from
@@ -59,6 +60,7 @@ func (s *sessionState) equal(i interface{}) bool {
 	}
 
 	if s.vers != s1.vers ||
+		s.usedEMS != s1.usedEMS ||
 		s.cipherSuite != s1.cipherSuite ||
 		!bytes.Equal(s.masterSecret, s1.masterSecret) {
 		return false
@@ -101,7 +103,12 @@ func (s *sessionState) marshal() []byte {
 
 	ret := make([]byte, length)
 	x := ret
-	x[0] = byte(s.vers >> 8)
+	was_used := byte(0)
+	if s.usedEMS {
+		was_used = byte(0x80)
+	}
+
+	x[0] = byte(s.vers>>8) | byte(was_used)
 	x[1] = byte(s.vers)
 	x[2] = byte(s.cipherSuite >> 8)
 	x[3] = byte(s.cipherSuite)
@@ -132,8 +139,9 @@ func (s *sessionState) unmarshal(data []byte) alert {
 		return alertDecodeError
 	}
 
-	s.vers = uint16(data[0])<<8 | uint16(data[1])
+	s.vers = (uint16(data[0])<<8 | uint16(data[1])) & 0x7fff
 	s.cipherSuite = uint16(data[2])<<8 | uint16(data[3])
+	s.usedEMS = (data[0] & 0x80) == 0x80
 	masterSecretLen := int(data[4])<<8 | int(data[5])
 	data = data[6:]
 	if len(data) < masterSecretLen {

+ 122 - 0
vendor/github.com/Yawning/chacha20/LICENSE

@@ -0,0 +1,122 @@
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
+

+ 14 - 0
vendor/github.com/Yawning/chacha20/README.md

@@ -0,0 +1,14 @@
+### chacha20 - ChaCha20
+#### Yawning Angel (yawning at schwanenlied dot me)
+
+Yet another Go ChaCha20 implementation.  Everything else I found was slow,
+didn't support all the variants I need to use, or relied on cgo to go fast.
+
+Features:
+
+ * 20 round, 256 bit key only.  Everything else is pointless and stupid.
+ * IETF 96 bit nonce variant.
+ * XChaCha 24 byte nonce variant.
+ * SSE2 and AVX2 support on amd64 targets.
+ * Incremental encrypt/decrypt support, unlike golang.org/x/crypto/salsa20.
+

+ 273 - 0
vendor/github.com/Yawning/chacha20/chacha20.go

@@ -0,0 +1,273 @@
+// chacha20.go - A ChaCha stream cipher implementation.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+package chacha20
+
+import (
+	"crypto/cipher"
+	"encoding/binary"
+	"errors"
+	"math"
+	"runtime"
+)
+
+const (
+	// KeySize is the ChaCha20 key size in bytes.
+	KeySize = 32
+
+	// NonceSize is the nonce size in bytes of the original ChaCha20 layout (64 bit nonce, 64 bit block counter).
+	NonceSize = 8
+
+	// INonceSize is the IETF ChaCha20 nonce size in bytes (96 bit nonce, 32 bit block counter).
+	INonceSize = 12
+
+	// XNonceSize is the XChaCha20 nonce size in bytes: HNonceSize bytes fed to HChaCha plus NonceSize bytes.
+	XNonceSize = 24
+
+	// HNonceSize is the HChaCha20 nonce size in bytes.
+	HNonceSize = 16
+
+	// BlockSize is the ChaCha20 block size in bytes.
+	BlockSize = 64
+
+	stateSize    = 16 // number of uint32 words in the cipher state
+	chachaRounds = 20 // round count; not referenced in this file
+
+	// The constant "expand 32-byte k" as little endian uint32s.
+	sigma0 = uint32(0x61707865)
+	sigma1 = uint32(0x3320646e)
+	sigma2 = uint32(0x79622d32)
+	sigma3 = uint32(0x6b206574)
+)
+
+var (
+	// ErrInvalidKey is the error returned when the key is invalid.
+	ErrInvalidKey = errors.New("key length must be KeySize bytes")
+
+	// ErrInvalidNonce is the error returned when the nonce is invalid.
+	ErrInvalidNonce = errors.New("nonce length must be NonceSize/INonceSize/XNonceSize bytes")
+
+	// ErrInvalidCounter is the error returned when the counter is invalid.
+	ErrInvalidCounter = errors.New("block counter is invalid (out of range)")
+
+	useUnsafe    = false     // switched on by init for 386/amd64; it is not read anywhere in this file
+	usingVectors = false     // stays false here; platform-specific init code may set it
+	blocksFn     = blocksRef // block function called by XORKeyStream/KeyStream; platform-specific init code may replace it
+)
+
+// A Cipher is an instance of ChaCha20/XChaCha20 using a particular key and
+// nonce.  It is created with NewCipher and can be re-keyed in place with ReKey.
+type Cipher struct {
+	state [stateSize]uint32 // sigma constants in [0..3], key in [4..11], block counter and nonce in [12..15]
+
+	buf  [BlockSize]byte // one block of buffered keystream, consumed by partial-block requests
+	off  int             // read offset into buf; BlockSize means buf is exhausted
+	ietf bool            // true when keyed with an INonceSize nonce (32 bit block counter)
+}
+
+// Reset zeros the state words (which hold the key) and the buffered keystream
+// so that they no longer appear in the process's memory.  off and ietf are left untouched.
+func (c *Cipher) Reset() {
+	for i := range c.state {
+		c.state[i] = 0
+	}
+	for i := range c.buf {
+		c.buf[i] = 0
+	}
+}
+
+// XORKeyStream sets dst to the result of XORing src with the key stream.  Dst
+// and src may be the same slice but otherwise should not overlap.  A dst shorter than src truncates the work to len(dst).
+func (c *Cipher) XORKeyStream(dst, src []byte) {
+	if len(dst) < len(src) { // dst too short: only len(dst) bytes are processed, no panic and no error
+		src = src[:len(dst)]
+	}
+
+	for remaining := len(src); remaining > 0; {
+		// Process multiple blocks at once.
+		if c.off == BlockSize { // buffered keystream is used up, so whole blocks can bypass c.buf
+			nrBlocks := remaining / BlockSize
+			directBytes := nrBlocks * BlockSize
+			if nrBlocks > 0 {
+				blocksFn(&c.state, src, dst, nrBlocks, c.ietf)
+				remaining -= directBytes
+				if remaining == 0 {
+					return
+				}
+				dst = dst[directBytes:]
+				src = src[directBytes:]
+			}
+
+			// If there's a partial block, generate 1 block of keystream into
+			// the internal buffer.
+			blocksFn(&c.state, nil, c.buf[:], 1, c.ietf)
+			c.off = 0
+		}
+
+		// Process partial blocks from the buffered keystream.
+		toXor := BlockSize - c.off // keystream bytes still unread in c.buf
+		if remaining < toXor {
+			toXor = remaining
+		}
+		if toXor > 0 {
+			for i, v := range src[:toXor] {
+				dst[i] = v ^ c.buf[c.off+i]
+			}
+			dst = dst[toXor:]
+			src = src[toXor:]
+
+			remaining -= toXor
+			c.off += toXor
+		}
+	}
+}
+
+// KeyStream sets dst to the raw keystream, first draining any keystream still buffered from an earlier call.
+func (c *Cipher) KeyStream(dst []byte) {
+	for remaining := len(dst); remaining > 0; {
+		// Process multiple blocks at once.
+		if c.off == BlockSize { // buffered keystream is used up, so whole blocks can be written straight into dst
+			nrBlocks := remaining / BlockSize
+			directBytes := nrBlocks * BlockSize
+			if nrBlocks > 0 {
+				blocksFn(&c.state, nil, dst, nrBlocks, c.ietf) // nil input requests keystream only
+				remaining -= directBytes
+				if remaining == 0 {
+					return
+				}
+				dst = dst[directBytes:]
+			}
+
+			// If there's a partial block, generate 1 block of keystream into
+			// the internal buffer.
+			blocksFn(&c.state, nil, c.buf[:], 1, c.ietf)
+			c.off = 0
+		}
+
+		// Process partial blocks from the buffered keystream.
+		toCopy := BlockSize - c.off // keystream bytes still unread in c.buf
+		if remaining < toCopy {
+			toCopy = remaining
+		}
+		if toCopy > 0 {
+			copy(dst[:toCopy], c.buf[c.off:c.off+toCopy])
+			dst = dst[toCopy:]
+			remaining -= toCopy
+			c.off += toCopy
+		}
+	}
+}
+
+// ReKey reinitializes the ChaCha20/XChaCha20 instance with the provided key
+// and nonce.  It returns ErrInvalidKey or ErrInvalidNonce on a bad length, leaving c unmodified in that case.
+func (c *Cipher) ReKey(key, nonce []byte) error {
+	if len(key) != KeySize {
+		return ErrInvalidKey
+	}
+
+	switch len(nonce) {
+	case NonceSize: // accepted as-is (Go cases do not fall through)
+	case INonceSize: // accepted as-is
+	case XNonceSize: // XChaCha: derive a subkey from nonce[0:16] via HChaCha, then key with nonce[16:24]
+		var subkey [KeySize]byte
+		var subnonce [HNonceSize]byte
+		copy(subnonce[:], nonce[0:16])
+		HChaCha(key, &subnonce, &subkey)
+		key = subkey[:]
+		nonce = nonce[16:24]
+		defer func() { // wipe the derived subkey once the state below has been filled in
+			for i := range subkey {
+				subkey[i] = 0
+			}
+		}()
+	default:
+		return ErrInvalidNonce
+	}
+
+	c.Reset()
+	c.state[0] = sigma0
+	c.state[1] = sigma1
+	c.state[2] = sigma2
+	c.state[3] = sigma3
+	c.state[4] = binary.LittleEndian.Uint32(key[0:4])
+	c.state[5] = binary.LittleEndian.Uint32(key[4:8])
+	c.state[6] = binary.LittleEndian.Uint32(key[8:12])
+	c.state[7] = binary.LittleEndian.Uint32(key[12:16])
+	c.state[8] = binary.LittleEndian.Uint32(key[16:20])
+	c.state[9] = binary.LittleEndian.Uint32(key[20:24])
+	c.state[10] = binary.LittleEndian.Uint32(key[24:28])
+	c.state[11] = binary.LittleEndian.Uint32(key[28:32])
+	c.state[12] = 0 // block counter starts at zero
+	if len(nonce) == INonceSize { // IETF layout: 32 bit counter in state[12], nonce in state[13..15]
+		c.state[13] = binary.LittleEndian.Uint32(nonce[0:4])
+		c.state[14] = binary.LittleEndian.Uint32(nonce[4:8])
+		c.state[15] = binary.LittleEndian.Uint32(nonce[8:12])
+		c.ietf = true
+	} else { // 64 bit counter in state[12..13], 8 byte nonce in state[14..15]
+		c.state[13] = 0
+		c.state[14] = binary.LittleEndian.Uint32(nonce[0:4])
+		c.state[15] = binary.LittleEndian.Uint32(nonce[4:8])
+		c.ietf = false
+	}
+	c.off = BlockSize // mark c.buf as exhausted so the first request generates fresh keystream
+	return nil
+
+}
+
+// Seek sets the block counter to blockCounter (counted in blocks) and discards any buffered keystream.
+func (c *Cipher) Seek(blockCounter uint64) error {
+	if c.ietf {
+		if blockCounter > math.MaxUint32 { // the IETF variant only has a 32 bit counter word
+			return ErrInvalidCounter
+		}
+		c.state[12] = uint32(blockCounter)
+	} else {
+		c.state[12] = uint32(blockCounter) // low 32 bits
+		c.state[13] = uint32(blockCounter >> 32) // high 32 bits
+	}
+	c.off = BlockSize // mark c.buf as exhausted so stale keystream is not reused
+	return nil
+}
+
+// NewCipher returns a new ChaCha20/XChaCha20 instance keyed through ReKey, or a nil Cipher and ReKey's error.
+func NewCipher(key, nonce []byte) (*Cipher, error) {
+	c := new(Cipher)
+	if err := c.ReKey(key, nonce); err != nil {
+		return nil, err
+	}
+	return c, nil
+}
+
+// HChaCha is the HChaCha20 hash function used to make XChaCha; the result is written to out by hChaChaRef.
+func HChaCha(key []byte, nonce *[HNonceSize]byte, out *[32]byte) {
+	var x [stateSize]uint32 // Last 4 slots unused, sigma hardcoded.
+	x[0] = binary.LittleEndian.Uint32(key[0:4]) // key is not length-checked here; fewer than 32 bytes makes the slicing panic
+	x[1] = binary.LittleEndian.Uint32(key[4:8])
+	x[2] = binary.LittleEndian.Uint32(key[8:12])
+	x[3] = binary.LittleEndian.Uint32(key[12:16])
+	x[4] = binary.LittleEndian.Uint32(key[16:20])
+	x[5] = binary.LittleEndian.Uint32(key[20:24])
+	x[6] = binary.LittleEndian.Uint32(key[24:28])
+	x[7] = binary.LittleEndian.Uint32(key[28:32])
+	x[8] = binary.LittleEndian.Uint32(nonce[0:4]) // nonce words follow the eight key words
+	x[9] = binary.LittleEndian.Uint32(nonce[4:8])
+	x[10] = binary.LittleEndian.Uint32(nonce[8:12])
+	x[11] = binary.LittleEndian.Uint32(nonce[12:16])
+	hChaChaRef(&x, out)
+}
+
+func init() { // sets useUnsafe according to the target architecture
+	switch runtime.GOARCH {
+	case "386", "amd64":
+		// Use unsafe to skip calling binary.LittleEndian.PutUint32
+		// in the critical path.  This is a big boost on systems that are
+		// little endian and not overly picky about alignment.
+		useUnsafe = true
+	}
+}
+
+var _ cipher.Stream = (*Cipher)(nil) // compile-time assertion that *Cipher implements cipher.Stream

+ 95 - 0
vendor/github.com/Yawning/chacha20/chacha20_amd64.go

@@ -0,0 +1,95 @@
+// chacha20_amd64.go - AMD64 optimized chacha20.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+// +build amd64,!gccgo,!appengine
+
+package chacha20
+
+import (
+	"math"
+)
+
+var usingAVX2 = false // set by init from supportsAVX2()
+
+func blocksAmd64SSE2(x *uint32, inp, outp *byte, nrBlocks uint) // no Go body: implemented outside this file (chacha20_amd64.py generates a function of this name)
+
+func blocksAmd64AVX2(x *uint32, inp, outp *byte, nrBlocks uint) // no Go body: implemented outside this file (chacha20_amd64.py generates a function of this name)
+
+func cpuidAmd64(cpuidParams *uint32) // no Go body; supportsAVX2 passes &regs[0] of a 4-word array with the leaf in regs[0] and reads the results back from it
+
+func xgetbv0Amd64(xcrVec *uint32) // no Go body; supportsAVX2 passes &xcrRegs[0] of a 2-word array and tests the low word
+
+func blocksAmd64(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) { // amd64 block function; init in this file installs it as blocksFn
+	// Probably unneeded, but stating this explicitly simplifies the assembly.
+	if nrBlocks == 0 {
+		return
+	}
+
+	if isIetf { // IETF variant: x[12] is a 32 bit block counter that must not be run past
+		var totalBlocks uint64
+		totalBlocks = uint64(x[12]) + uint64(nrBlocks)
+		if totalBlocks > math.MaxUint32 {
+			panic("chacha20: Exceeded keystream per nonce limit")
+		}
+	}
+
+	if in == nil { // keystream-only request: zero out, then XOR it in place so raw keystream remains
+		for i := range out {
+			out[i] = 0
+		}
+		in = out
+	}
+
+	// Pointless to call the AVX2 code for just a single block, since half of
+	// the output gets discarded...
+	if usingAVX2 && nrBlocks > 1 {
+		blocksAmd64AVX2(&x[0], &in[0], &out[0], uint(nrBlocks)) // assumes in/out each hold nrBlocks*BlockSize bytes; not checked here
+	} else {
+		blocksAmd64SSE2(&x[0], &in[0], &out[0], uint(nrBlocks)) // assumes in/out each hold nrBlocks*BlockSize bytes; not checked here
+	}
+}
+
+func supportsAVX2() bool { // reports whether all three checks below (CPUID leaf 7 present, OS state saving, AVX2 bit) pass
+	// https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
+	const (
+		osXsaveBit = 1 << 27
+		avx2Bit    = 1 << 5
+	)
+
+	// Check to see if CPUID actually supports the leaf that indicates AVX2.
+	// CPUID.(EAX=0H, ECX=0H) >= 7
+	regs := [4]uint32{0x00} // only regs[0] is set: it carries the requested leaf in and a result word out
+	cpuidAmd64(&regs[0])
+	if regs[0] < 7 {
+		return false
+	}
+
+	// Check to see if the OS knows how to save/restore XMM/YMM state.
+	// CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1
+	regs = [4]uint32{0x01}
+	cpuidAmd64(&regs[0])
+	if regs[2]&osXsaveBit == 0 { // regs[2] is the ECX result
+		return false
+	}
+	xcrRegs := [2]uint32{}
+	xgetbv0Amd64(&xcrRegs[0])
+	if xcrRegs[0]&6 != 6 { // bits 1 and 2 of the low word must both be set
+		return false
+	}
+
+	// Check for AVX2 support.
+	// CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1
+	regs = [4]uint32{0x07}
+	cpuidAmd64(&regs[0])
+	return regs[1]&avx2Bit != 0 // regs[1] is the EBX result
+}
+
+func init() { // installs the assembly-backed block function for amd64 builds
+	blocksFn = blocksAmd64 // replaces whatever blocksFn was initialised to
+	usingVectors = true
+	usingAVX2 = supportsAVX2() // blocksAmd64 picks the AVX2 routine only when this is true and nrBlocks > 1
+}

+ 1295 - 0
vendor/github.com/Yawning/chacha20/chacha20_amd64.py

@@ -0,0 +1,1295 @@
+#!/usr/bin/env python3
+#
+# To the extent possible under law, Yawning Angel has waived all copyright
+# and related or neighboring rights to chacha20, using the Creative
+# Commons "CC0" public domain dedication. See LICENSE or
+# <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+#
+# cgo sucks.  Plan 9 assembly sucks.  Real languages have SIMD intrinsics.
+# The least terrible/retarded option is to use a Python code generator, so
+# that's what I did.
+#
+# Code based on Ted Krovetz's vec128 C implementation, with corrections
+# to use a 64 bit counter instead of 32 bit, and to allow unaligned input and
+# output pointers.
+#
+# Dependencies: https://github.com/Maratyszcza/PeachPy
+#
+# python3 -m peachpy.x86_64 -mabi=goasm -S -o chacha20_amd64.s chacha20_amd64.py
+#
+
+from peachpy import *
+from peachpy.x86_64 import *
+
+x = Argument(ptr(uint32_t))  # cipher state words, addressed below as [reg_x] .. [reg_x+48]
+inp = Argument(ptr(const_uint8_t))  # input bytes
+outp = Argument(ptr(uint8_t))  # output bytes
+nrBlocks = Argument(ptr(size_t))  # block count; NOTE(review): typed as a pointer here but used as a plain integer below - presumably only the width matters, confirm
+
+#
+# SSE2 helper functions.  A temporary register is explicitly passed in because
+# the main fast loop uses every single register (and even spills) so manual
+# control is needed.
+#
+# This used to also have a DQROUNDS helper that did 2 rounds of ChaCha like
+# in the C code, but the C code has the luxury of an optimizer reordering
+# everything, while this does not.
+#
+
+def ROTW16_sse2(tmp, d):  # rotates every 32-bit lane of d left by 16 bits; tmp is overwritten
+    MOVDQA(tmp, d)
+    PSLLD(tmp, 16)
+    PSRLD(d, 16)
+    PXOR(d, tmp)
+
+def ROTW12_sse2(tmp, b):  # rotates every 32-bit lane of b left by 12 bits; tmp is overwritten
+    MOVDQA(tmp, b)
+    PSLLD(tmp, 12)
+    PSRLD(b, 20)
+    PXOR(b, tmp)
+
+def ROTW8_sse2(tmp, d):  # rotates every 32-bit lane of d left by 8 bits; tmp is overwritten
+    MOVDQA(tmp, d)
+    PSLLD(tmp, 8)
+    PSRLD(d, 24)
+    PXOR(d, tmp)
+
+def ROTW7_sse2(tmp, b):  # rotates every 32-bit lane of b left by 7 bits; tmp is overwritten
+    MOVDQA(tmp, b)
+    PSLLD(tmp, 7)
+    PSRLD(b, 25)
+    PXOR(b, tmp)
+
+def WriteXor_sse2(tmp, inp, outp, d, v0, v1, v2, v3):  # outp[d:d+64] = inp[d:d+64] XOR (v0, v1, v2, v3); tmp is overwritten
+    MOVDQU(tmp, [inp+d])  # MOVDQU: unaligned 16-byte load
+    PXOR(tmp, v0)
+    MOVDQU([outp+d], tmp)
+    MOVDQU(tmp, [inp+d+16])
+    PXOR(tmp, v1)
+    MOVDQU([outp+d+16], tmp)
+    MOVDQU(tmp, [inp+d+32])
+    PXOR(tmp, v2)
+    MOVDQU([outp+d+32], tmp)
+    MOVDQU(tmp, [inp+d+48])
+    PXOR(tmp, v3)
+    MOVDQU([outp+d+48], tmp)
+
+# SSE2 ChaCha20 (aka vec128).  Does not handle partial blocks, and will
+# process 4/2/1 blocks at a time.
+with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
+    reg_x = GeneralPurposeRegister64()
+    reg_inp = GeneralPurposeRegister64()
+    reg_outp = GeneralPurposeRegister64()
+    reg_blocks = GeneralPurposeRegister64()
+    reg_sp_save = GeneralPurposeRegister64()
+
+    LOAD.ARGUMENT(reg_x, x)
+    LOAD.ARGUMENT(reg_inp, inp)
+    LOAD.ARGUMENT(reg_outp, outp)
+    LOAD.ARGUMENT(reg_blocks, nrBlocks)
+
+    # Align the stack to a 32 byte boundary.
+    MOV(reg_sp_save, registers.rsp)
+    AND(registers.rsp, 0xffffffffffffffe0)
+    SUB(registers.rsp, 0x20)
+
+    # Build the counter increment vector on the stack, and allocate the scratch
+    # space
+    xmm_v0 = XMMRegister()
+    PXOR(xmm_v0, xmm_v0)
+    SUB(registers.rsp, 16+16)
+    MOVDQA([registers.rsp], xmm_v0)
+    reg_tmp = GeneralPurposeRegister32()
+    MOV(reg_tmp, 0x00000001)
+    MOV([registers.rsp], reg_tmp)
+    mem_one = [registers.rsp]     # (Stack) Counter increment vector
+    mem_tmp0 = [registers.rsp+16] # (Stack) Scratch space.
+
+    mem_s0 = [reg_x]           # (Memory) Cipher state [0..3]
+    mem_s1 = [reg_x+16]        # (Memory) Cipher state [4..7]
+    mem_s2 = [reg_x+32]        # (Memory) Cipher state [8..11]
+    mem_s3 = [reg_x+48]        # (Memory) Cipher state [12..15]
+
+    # xmm_v0 allocated above...
+    xmm_v1 = XMMRegister()
+    xmm_v2 = XMMRegister()
+    xmm_v3 = XMMRegister()
+
+    xmm_v4 = XMMRegister()
+    xmm_v5 = XMMRegister()
+    xmm_v6 = XMMRegister()
+    xmm_v7 = XMMRegister()
+
+    xmm_v8 = XMMRegister()
+    xmm_v9 = XMMRegister()
+    xmm_v10 = XMMRegister()
+    xmm_v11 = XMMRegister()
+
+    xmm_v12 = XMMRegister()
+    xmm_v13 = XMMRegister()
+    xmm_v14 = XMMRegister()
+    xmm_v15 = XMMRegister()
+
+    xmm_tmp = xmm_v12  # aliases v12, hence the Save/Restore spills to mem_tmp0 in the 4-block loop
+
+    #
+    # 4 blocks at a time.
+    #
+
+    reg_rounds = GeneralPurposeRegister64()
+
+    vector_loop4 = Loop()
+    SUB(reg_blocks, 4)
+    JB(vector_loop4.end)
+    with vector_loop4:
+        MOVDQU(xmm_v0, mem_s0)
+        MOVDQU(xmm_v1, mem_s1)
+        MOVDQU(xmm_v2, mem_s2)
+        MOVDQU(xmm_v3, mem_s3)
+
+        MOVDQA(xmm_v4, xmm_v0)
+        MOVDQA(xmm_v5, xmm_v1)
+        MOVDQA(xmm_v6, xmm_v2)
+        MOVDQA(xmm_v7, xmm_v3)
+        PADDQ(xmm_v7, mem_one)
+
+        MOVDQA(xmm_v8, xmm_v0)
+        MOVDQA(xmm_v9, xmm_v1)
+        MOVDQA(xmm_v10, xmm_v2)
+        MOVDQA(xmm_v11, xmm_v7)
+        PADDQ(xmm_v11, mem_one)
+
+        MOVDQA(xmm_v12, xmm_v0)
+        MOVDQA(xmm_v13, xmm_v1)
+        MOVDQA(xmm_v14, xmm_v2)
+        MOVDQA(xmm_v15, xmm_v11)
+        PADDQ(xmm_v15, mem_one)
+
+        MOV(reg_rounds, 20)  # 20 rounds; each loop iteration performs two (see SUB(reg_rounds, 2) below)
+        rounds_loop4 = Loop()
+        with rounds_loop4:
+            # a += b; d ^= a; d = ROTW16(d);
+            PADDD(xmm_v0, xmm_v1)
+            PADDD(xmm_v4, xmm_v5)
+            PADDD(xmm_v8, xmm_v9)
+            PADDD(xmm_v12, xmm_v13)
+            PXOR(xmm_v3, xmm_v0)
+            PXOR(xmm_v7, xmm_v4)
+            PXOR(xmm_v11, xmm_v8)
+            PXOR(xmm_v15, xmm_v12)
+
+            MOVDQA(mem_tmp0, xmm_tmp) # Save
+
+            ROTW16_sse2(xmm_tmp, xmm_v3)
+            ROTW16_sse2(xmm_tmp, xmm_v7)
+            ROTW16_sse2(xmm_tmp, xmm_v11)
+            ROTW16_sse2(xmm_tmp, xmm_v15)
+
+            # c += d; b ^= c; b = ROTW12(b);
+            PADDD(xmm_v2, xmm_v3)
+            PADDD(xmm_v6, xmm_v7)
+            PADDD(xmm_v10, xmm_v11)
+            PADDD(xmm_v14, xmm_v15)
+            PXOR(xmm_v1, xmm_v2)
+            PXOR(xmm_v5, xmm_v6)
+            PXOR(xmm_v9, xmm_v10)
+            PXOR(xmm_v13, xmm_v14)
+            ROTW12_sse2(xmm_tmp, xmm_v1)
+            ROTW12_sse2(xmm_tmp, xmm_v5)
+            ROTW12_sse2(xmm_tmp, xmm_v9)
+            ROTW12_sse2(xmm_tmp, xmm_v13)
+
+            # a += b; d ^= a; d = ROTW8(d);
+            MOVDQA(xmm_tmp, mem_tmp0) # Restore
+
+            PADDD(xmm_v0, xmm_v1)
+            PADDD(xmm_v4, xmm_v5)
+            PADDD(xmm_v8, xmm_v9)
+            PADDD(xmm_v12, xmm_v13)
+            PXOR(xmm_v3, xmm_v0)
+            PXOR(xmm_v7, xmm_v4)
+            PXOR(xmm_v11, xmm_v8)
+            PXOR(xmm_v15, xmm_v12)
+
+            MOVDQA(mem_tmp0, xmm_tmp) # Save
+
+            ROTW8_sse2(xmm_tmp, xmm_v3)
+            ROTW8_sse2(xmm_tmp, xmm_v7)
+            ROTW8_sse2(xmm_tmp, xmm_v11)
+            ROTW8_sse2(xmm_tmp, xmm_v15)
+
+            # c += d; b ^= c; b = ROTW7(b)
+            PADDD(xmm_v2, xmm_v3)
+            PADDD(xmm_v6, xmm_v7)
+            PADDD(xmm_v10, xmm_v11)
+            PADDD(xmm_v14, xmm_v15)
+            PXOR(xmm_v1, xmm_v2)
+            PXOR(xmm_v5, xmm_v6)
+            PXOR(xmm_v9, xmm_v10)
+            PXOR(xmm_v13, xmm_v14)
+            ROTW7_sse2(xmm_tmp, xmm_v1)
+            ROTW7_sse2(xmm_tmp, xmm_v5)
+            ROTW7_sse2(xmm_tmp, xmm_v9)
+            ROTW7_sse2(xmm_tmp, xmm_v13)
+
+            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+            PSHUFD(xmm_v1, xmm_v1, 0x39)
+            PSHUFD(xmm_v5, xmm_v5, 0x39)
+            PSHUFD(xmm_v9, xmm_v9, 0x39)
+            PSHUFD(xmm_v13, xmm_v13, 0x39)
+            PSHUFD(xmm_v2, xmm_v2, 0x4e)
+            PSHUFD(xmm_v6, xmm_v6, 0x4e)
+            PSHUFD(xmm_v10, xmm_v10, 0x4e)
+            PSHUFD(xmm_v14, xmm_v14, 0x4e)
+            PSHUFD(xmm_v3, xmm_v3, 0x93)
+            PSHUFD(xmm_v7, xmm_v7, 0x93)
+            PSHUFD(xmm_v11, xmm_v11, 0x93)
+            PSHUFD(xmm_v15, xmm_v15, 0x93)
+
+            MOVDQA(xmm_tmp, mem_tmp0) # Restore
+
+            # a += b; d ^= a; d = ROTW16(d);
+            PADDD(xmm_v0, xmm_v1)
+            PADDD(xmm_v4, xmm_v5)
+            PADDD(xmm_v8, xmm_v9)
+            PADDD(xmm_v12, xmm_v13)
+            PXOR(xmm_v3, xmm_v0)
+            PXOR(xmm_v7, xmm_v4)
+            PXOR(xmm_v11, xmm_v8)
+            PXOR(xmm_v15, xmm_v12)
+
+            MOVDQA(mem_tmp0, xmm_tmp) # Save
+
+            ROTW16_sse2(xmm_tmp, xmm_v3)
+            ROTW16_sse2(xmm_tmp, xmm_v7)
+            ROTW16_sse2(xmm_tmp, xmm_v11)
+            ROTW16_sse2(xmm_tmp, xmm_v15)
+
+            # c += d; b ^= c; b = ROTW12(b);
+            PADDD(xmm_v2, xmm_v3)
+            PADDD(xmm_v6, xmm_v7)
+            PADDD(xmm_v10, xmm_v11)
+            PADDD(xmm_v14, xmm_v15)
+            PXOR(xmm_v1, xmm_v2)
+            PXOR(xmm_v5, xmm_v6)
+            PXOR(xmm_v9, xmm_v10)
+            PXOR(xmm_v13, xmm_v14)
+            ROTW12_sse2(xmm_tmp, xmm_v1)
+            ROTW12_sse2(xmm_tmp, xmm_v5)
+            ROTW12_sse2(xmm_tmp, xmm_v9)
+            ROTW12_sse2(xmm_tmp, xmm_v13)
+
+            # a += b; d ^= a; d = ROTW8(d);
+            MOVDQA(xmm_tmp, mem_tmp0) # Restore
+
+            PADDD(xmm_v0, xmm_v1)
+            PADDD(xmm_v4, xmm_v5)
+            PADDD(xmm_v8, xmm_v9)
+            PADDD(xmm_v12, xmm_v13)
+            PXOR(xmm_v3, xmm_v0)
+            PXOR(xmm_v7, xmm_v4)
+            PXOR(xmm_v11, xmm_v8)
+            PXOR(xmm_v15, xmm_v12)
+
+            MOVDQA(mem_tmp0, xmm_tmp) # Save
+
+            ROTW8_sse2(xmm_tmp, xmm_v3)
+            ROTW8_sse2(xmm_tmp, xmm_v7)
+            ROTW8_sse2(xmm_tmp, xmm_v11)
+            ROTW8_sse2(xmm_tmp, xmm_v15)
+
+            # c += d; b ^= c; b = ROTW7(b)
+            PADDD(xmm_v2, xmm_v3)
+            PADDD(xmm_v6, xmm_v7)
+            PADDD(xmm_v10, xmm_v11)
+            PADDD(xmm_v14, xmm_v15)
+            PXOR(xmm_v1, xmm_v2)
+            PXOR(xmm_v5, xmm_v6)
+            PXOR(xmm_v9, xmm_v10)
+            PXOR(xmm_v13, xmm_v14)
+            ROTW7_sse2(xmm_tmp, xmm_v1)
+            ROTW7_sse2(xmm_tmp, xmm_v5)
+            ROTW7_sse2(xmm_tmp, xmm_v9)
+            ROTW7_sse2(xmm_tmp, xmm_v13)
+
+            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+            PSHUFD(xmm_v1, xmm_v1, 0x93)
+            PSHUFD(xmm_v5, xmm_v5, 0x93)
+            PSHUFD(xmm_v9, xmm_v9, 0x93)
+            PSHUFD(xmm_v13, xmm_v13, 0x93)
+            PSHUFD(xmm_v2, xmm_v2, 0x4e)
+            PSHUFD(xmm_v6, xmm_v6, 0x4e)
+            PSHUFD(xmm_v10, xmm_v10, 0x4e)
+            PSHUFD(xmm_v14, xmm_v14, 0x4e)
+            PSHUFD(xmm_v3, xmm_v3, 0x39)
+            PSHUFD(xmm_v7, xmm_v7, 0x39)
+            PSHUFD(xmm_v11, xmm_v11, 0x39)
+            PSHUFD(xmm_v15, xmm_v15, 0x39)
+
+            MOVDQA(xmm_tmp, mem_tmp0) # Restore
+
+            SUB(reg_rounds, 2)
+            JNZ(rounds_loop4.begin)
+
+        MOVDQA(mem_tmp0, xmm_tmp)
+
+        PADDD(xmm_v0, mem_s0)
+        PADDD(xmm_v1, mem_s1)
+        PADDD(xmm_v2, mem_s2)
+        PADDD(xmm_v3, mem_s3)
+        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
+        MOVDQU(xmm_v3, mem_s3)
+        PADDQ(xmm_v3, mem_one)
+
+        PADDD(xmm_v4, mem_s0)
+        PADDD(xmm_v5, mem_s1)
+        PADDD(xmm_v6, mem_s2)
+        PADDD(xmm_v7, xmm_v3)
+        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
+        PADDQ(xmm_v3, mem_one)
+
+        PADDD(xmm_v8, mem_s0)
+        PADDD(xmm_v9, mem_s1)
+        PADDD(xmm_v10, mem_s2)
+        PADDD(xmm_v11, xmm_v3)
+        WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 128, xmm_v8, xmm_v9, xmm_v10, xmm_v11)
+        PADDQ(xmm_v3, mem_one)
+
+        MOVDQA(xmm_tmp, mem_tmp0)
+
+        PADDD(xmm_v12, mem_s0)
+        PADDD(xmm_v13, mem_s1)
+        PADDD(xmm_v14, mem_s2)
+        PADDD(xmm_v15, xmm_v3)
+        WriteXor_sse2(xmm_v0, reg_inp, reg_outp, 192, xmm_v12, xmm_v13, xmm_v14, xmm_v15)  # v0 was already written out, so it serves as scratch; xmm_tmp is v12 and is live here
+        PADDQ(xmm_v3, mem_one)
+
+        MOVDQU(mem_s3, xmm_v3)
+
+        ADD(reg_inp, 4 * 64)
+        ADD(reg_outp, 4 * 64)
+
+        SUB(reg_blocks, 4)
+        JAE(vector_loop4.begin)
+
+    ADD(reg_blocks, 4)
+    out = Label()
+    JZ(out)
+
+    # Past this point, we no longer need to use every single register to hold
+    # the in progress state.
+
+    xmm_s0 = xmm_v8
+    xmm_s1 = xmm_v9
+    xmm_s2 = xmm_v10
+    xmm_s3 = xmm_v11
+    xmm_one = xmm_v13
+    MOVDQU(xmm_s0, mem_s0)
+    MOVDQU(xmm_s1, mem_s1)
+    MOVDQU(xmm_s2, mem_s2)
+    MOVDQU(xmm_s3, mem_s3)
+    MOVDQA(xmm_one, mem_one)
+
+    #
+    # 2 blocks at a time.
+    #
+
+    process_1_block = Label()
+    SUB(reg_blocks, 2)
+    JB(process_1_block) # < 2 blocks remaining.
+
+    MOVDQA(xmm_v0, xmm_s0)
+    MOVDQA(xmm_v1, xmm_s1)
+    MOVDQA(xmm_v2, xmm_s2)
+    MOVDQA(xmm_v3, xmm_s3)
+
+    MOVDQA(xmm_v4, xmm_v0)
+    MOVDQA(xmm_v5, xmm_v1)
+    MOVDQA(xmm_v6, xmm_v2)
+    MOVDQA(xmm_v7, xmm_v3)
+    PADDQ(xmm_v7, xmm_one)
+
+    MOV(reg_rounds, 20)
+    rounds_loop2 = Loop()
+    with rounds_loop2:
+        # a += b; d ^= a; d = ROTW16(d);
+        PADDD(xmm_v0, xmm_v1)
+        PADDD(xmm_v4, xmm_v5)
+        PXOR(xmm_v3, xmm_v0)
+        PXOR(xmm_v7, xmm_v4)
+        ROTW16_sse2(xmm_tmp, xmm_v3)
+        ROTW16_sse2(xmm_tmp, xmm_v7)
+
+        # c += d; b ^= c; b = ROTW12(b);
+        PADDD(xmm_v2, xmm_v3)
+        PADDD(xmm_v6, xmm_v7)
+        PXOR(xmm_v1, xmm_v2)
+        PXOR(xmm_v5, xmm_v6)
+        ROTW12_sse2(xmm_tmp, xmm_v1)
+        ROTW12_sse2(xmm_tmp, xmm_v5)
+
+        # a += b; d ^= a; d = ROTW8(d);
+        PADDD(xmm_v0, xmm_v1)
+        PADDD(xmm_v4, xmm_v5)
+        PXOR(xmm_v3, xmm_v0)
+        PXOR(xmm_v7, xmm_v4)
+        ROTW8_sse2(xmm_tmp, xmm_v3)
+        ROTW8_sse2(xmm_tmp, xmm_v7)
+
+        # c += d; b ^= c; b = ROTW7(b)
+        PADDD(xmm_v2, xmm_v3)
+        PADDD(xmm_v6, xmm_v7)
+        PXOR(xmm_v1, xmm_v2)
+        PXOR(xmm_v5, xmm_v6)
+        ROTW7_sse2(xmm_tmp, xmm_v1)
+        ROTW7_sse2(xmm_tmp, xmm_v5)
+
+        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+        PSHUFD(xmm_v1, xmm_v1, 0x39)
+        PSHUFD(xmm_v5, xmm_v5, 0x39)
+        PSHUFD(xmm_v2, xmm_v2, 0x4e)
+        PSHUFD(xmm_v6, xmm_v6, 0x4e)
+        PSHUFD(xmm_v3, xmm_v3, 0x93)
+        PSHUFD(xmm_v7, xmm_v7, 0x93)
+
+        # a += b; d ^= a; d = ROTW16(d);
+        PADDD(xmm_v0, xmm_v1)
+        PADDD(xmm_v4, xmm_v5)
+        PXOR(xmm_v3, xmm_v0)
+        PXOR(xmm_v7, xmm_v4)
+        ROTW16_sse2(xmm_tmp, xmm_v3)
+        ROTW16_sse2(xmm_tmp, xmm_v7)
+
+        # c += d; b ^= c; b = ROTW12(b);
+        PADDD(xmm_v2, xmm_v3)
+        PADDD(xmm_v6, xmm_v7)
+        PXOR(xmm_v1, xmm_v2)
+        PXOR(xmm_v5, xmm_v6)
+        ROTW12_sse2(xmm_tmp, xmm_v1)
+        ROTW12_sse2(xmm_tmp, xmm_v5)
+
+        # a += b; d ^= a; d = ROTW8(d);
+        PADDD(xmm_v0, xmm_v1)
+        PADDD(xmm_v4, xmm_v5)
+        PXOR(xmm_v3, xmm_v0)
+        PXOR(xmm_v7, xmm_v4)
+        ROTW8_sse2(xmm_tmp, xmm_v3)
+        ROTW8_sse2(xmm_tmp, xmm_v7)
+
+        # c += d; b ^= c; b = ROTW7(b)
+        PADDD(xmm_v2, xmm_v3)
+        PADDD(xmm_v6, xmm_v7)
+        PXOR(xmm_v1, xmm_v2)
+        PXOR(xmm_v5, xmm_v6)
+        ROTW7_sse2(xmm_tmp, xmm_v1)
+        ROTW7_sse2(xmm_tmp, xmm_v5)
+
+        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+        PSHUFD(xmm_v1, xmm_v1, 0x93)
+        PSHUFD(xmm_v5, xmm_v5, 0x93)
+        PSHUFD(xmm_v2, xmm_v2, 0x4e)
+        PSHUFD(xmm_v6, xmm_v6, 0x4e)
+        PSHUFD(xmm_v3, xmm_v3, 0x39)
+        PSHUFD(xmm_v7, xmm_v7, 0x39)
+
+        SUB(reg_rounds, 2)
+        JNZ(rounds_loop2.begin)
+
+    PADDD(xmm_v0, xmm_s0)
+    PADDD(xmm_v1, xmm_s1)
+    PADDD(xmm_v2, xmm_s2)
+    PADDD(xmm_v3, xmm_s3)
+    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
+    PADDQ(xmm_s3, xmm_one)
+
+    PADDD(xmm_v4, xmm_s0)
+    PADDD(xmm_v5, xmm_s1)
+    PADDD(xmm_v6, xmm_s2)
+    PADDD(xmm_v7, xmm_s3)
+    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
+    PADDQ(xmm_s3, xmm_one)
+
+    ADD(reg_inp, 2 * 64)
+    ADD(reg_outp, 2 * 64)
+    SUB(reg_blocks, 2)
+
+    LABEL(process_1_block)
+    ADD(reg_blocks, 2)
+    out_serial = Label()
+    JZ(out_serial)
+
+    #
+    # 1 block at a time.  Only executed once, because if there was > 1,
+    # the parallel code would have processed it already.
+    #
+
+    MOVDQA(xmm_v0, xmm_s0)
+    MOVDQA(xmm_v1, xmm_s1)
+    MOVDQA(xmm_v2, xmm_s2)
+    MOVDQA(xmm_v3, xmm_s3)
+
+    MOV(reg_rounds, 20)
+    rounds_loop1 = Loop()
+    with rounds_loop1:
+        # a += b; d ^= a; d = ROTW16(d);
+        PADDD(xmm_v0, xmm_v1)
+        PXOR(xmm_v3, xmm_v0)
+        ROTW16_sse2(xmm_tmp, xmm_v3)
+
+        # c += d; b ^= c; b = ROTW12(b);
+        PADDD(xmm_v2, xmm_v3)
+        PXOR(xmm_v1, xmm_v2)
+        ROTW12_sse2(xmm_tmp, xmm_v1)
+
+        # a += b; d ^= a; d = ROTW8(d);
+        PADDD(xmm_v0, xmm_v1)
+        PXOR(xmm_v3, xmm_v0)
+        ROTW8_sse2(xmm_tmp, xmm_v3)
+
+        # c += d; b ^= c; b = ROTW7(b)
+        PADDD(xmm_v2, xmm_v3)
+        PXOR(xmm_v1, xmm_v2)
+        ROTW7_sse2(xmm_tmp, xmm_v1)
+
+        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+        PSHUFD(xmm_v1, xmm_v1, 0x39)
+        PSHUFD(xmm_v2, xmm_v2, 0x4e)
+        PSHUFD(xmm_v3, xmm_v3, 0x93)
+
+        # a += b; d ^= a; d = ROTW16(d);
+        PADDD(xmm_v0, xmm_v1)
+        PXOR(xmm_v3, xmm_v0)
+        ROTW16_sse2(xmm_tmp, xmm_v3)
+
+        # c += d; b ^= c; b = ROTW12(b);
+        PADDD(xmm_v2, xmm_v3)
+        PXOR(xmm_v1, xmm_v2)
+        ROTW12_sse2(xmm_tmp, xmm_v1)
+
+        # a += b; d ^= a; d = ROTW8(d);
+        PADDD(xmm_v0, xmm_v1)
+        PXOR(xmm_v3, xmm_v0)
+        ROTW8_sse2(xmm_tmp, xmm_v3)
+
+        # c += d; b ^= c; b = ROTW7(b)
+        PADDD(xmm_v2, xmm_v3)
+        PXOR(xmm_v1, xmm_v2)
+        ROTW7_sse2(xmm_tmp, xmm_v1)
+
+        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+        PSHUFD(xmm_v1, xmm_v1, 0x93)
+        PSHUFD(xmm_v2, xmm_v2, 0x4e)
+        PSHUFD(xmm_v3, xmm_v3, 0x39)
+
+        SUB(reg_rounds, 2)
+        JNZ(rounds_loop1.begin)
+
+    PADDD(xmm_v0, xmm_s0)
+    PADDD(xmm_v1, xmm_s1)
+    PADDD(xmm_v2, xmm_s2)
+    PADDD(xmm_v3, xmm_s3)
+    WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
+    PADDQ(xmm_s3, xmm_one)
+
+    LABEL(out_serial)
+
+    # Write back the updated counter.  Stopping at 2^70 bytes is the user's
+    # problem, not mine.  (Skipped if there's exactly a multiple of 4 blocks
+    # because the counter is incremented in memory while looping.)
+    MOVDQU(mem_s3, xmm_s3)
+
+    LABEL(out)
+
+    # Paranoia, cleanse the scratch space.
+    PXOR(xmm_v0, xmm_v0)
+    MOVDQA(mem_tmp0, xmm_v0)
+
+    # Remove our stack allocation.
+    MOV(registers.rsp, reg_sp_save)
+
+    RETURN()
+
+#
+# AVX2 helpers.  Like the SSE2 equivalents, the scratch register is explicit,
+# and more helpers are used to increase readability for destructive operations.
+#
+# XXX/Performance: ROTW16_avx2/ROTW8_avx2 both can use VPSHUFB.
+#
+
+def ADD_avx2(dst, src):  # dst += src, emitted as the three-operand VPADDD with dst doubling as first source
+    VPADDD(dst, dst, src)
+
+def XOR_avx2(dst, src):  # dst ^= src, emitted as the three-operand VPXOR with dst doubling as first source
+    VPXOR(dst, dst, src)
+
+def ROTW16_avx2(tmp, d):  # rotates every 32-bit lane of d left by 16 bits; tmp is overwritten
+    VPSLLD(tmp, d, 16)
+    VPSRLD(d, d, 16)
+    XOR_avx2(d, tmp)
+
+def ROTW12_avx2(tmp, b):  # rotates every 32-bit lane of b left by 12 bits; tmp is overwritten
+    VPSLLD(tmp, b, 12)
+    VPSRLD(b, b, 20)
+    XOR_avx2(b, tmp)
+
+def ROTW8_avx2(tmp, d):  # rotates every 32-bit lane of d left by 8 bits; tmp is overwritten
+    VPSLLD(tmp, d, 8)
+    VPSRLD(d, d, 24)
+    XOR_avx2(d, tmp)
+
+def ROTW7_avx2(tmp, b):  # rotates every 32-bit lane of b left by 7 bits; tmp is overwritten
+    VPSLLD(tmp, b, 7)
+    VPSRLD(b, b, 25)
+    XOR_avx2(b, tmp)
+
+def WriteXor_avx2(tmp, inp, outp, d, v0, v1, v2, v3):  # XORs 128 bytes of inp at offset d with v0..v3 and stores them to outp; tmp is overwritten
+    # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
+    VPERM2I128(tmp, v0, v1, 0x20)
+    VPXOR(tmp, tmp, [inp+d])
+    VMOVDQU([outp+d], tmp)
+
+    # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
+    VPERM2I128(tmp, v2, v3, 0x20)
+    VPXOR(tmp, tmp, [inp+d+32])
+    VMOVDQU([outp+d+32], tmp)
+
+    # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
+    VPERM2I128(tmp, v0, v1, 0x31)
+    VPXOR(tmp, tmp, [inp+d+64])
+    VMOVDQU([outp+d+64], tmp)
+
+    # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
+    VPERM2I128(tmp, v2, v3, 0x31)
+    VPXOR(tmp, tmp, [inp+d+96])
+    VMOVDQU([outp+d+96], tmp)
+
+# AVX2 ChaCha20 (aka avx2).  Does not handle partial blocks, will process
+# 8/4/2 blocks at a time.
+with Function("blocksAmd64AVX2", (x, inp, outp, nrBlocks), target=uarch.broadwell):
+    reg_x = GeneralPurposeRegister64()
+    reg_inp = GeneralPurposeRegister64()
+    reg_outp = GeneralPurposeRegister64()
+    reg_blocks = GeneralPurposeRegister64()
+    reg_sp_save = GeneralPurposeRegister64()
+
+    LOAD.ARGUMENT(reg_x, x)
+    LOAD.ARGUMENT(reg_inp, inp)
+    LOAD.ARGUMENT(reg_outp, outp)
+    LOAD.ARGUMENT(reg_blocks, nrBlocks)
+
+    # Align the stack to a 32 byte boundary.
+    MOV(reg_sp_save, registers.rsp)
+    AND(registers.rsp, 0xffffffffffffffe0)
+    SUB(registers.rsp, 0x20)
+
+    x_s0 = [reg_x]           # (Memory) Cipher state [0..3]
+    x_s1 = [reg_x+16]        # (Memory) Cipher state [4..7]
+    x_s2 = [reg_x+32]        # (Memory) Cipher state [8..11]
+    x_s3 = [reg_x+48]        # (Memory) Cipher state [12..15]
+
+    ymm_v0 = YMMRegister()
+    ymm_v1 = YMMRegister()
+    ymm_v2 = YMMRegister()
+    ymm_v3 = YMMRegister()
+
+    ymm_v4 = YMMRegister()
+    ymm_v5 = YMMRegister()
+    ymm_v6 = YMMRegister()
+    ymm_v7 = YMMRegister()
+
+    ymm_v8 = YMMRegister()
+    ymm_v9 = YMMRegister()
+    ymm_v10 = YMMRegister()
+    ymm_v11 = YMMRegister()
+
+    ymm_v12 = YMMRegister()
+    ymm_v13 = YMMRegister()
+    ymm_v14 = YMMRegister()
+    ymm_v15 = YMMRegister()
+
+    ymm_tmp0 = ymm_v12
+
+    # Allocate the necessary stack space for the counter vector and two ymm
+    # registers that we will spill.
+    SUB(registers.rsp, 96)
+    mem_tmp0 = [registers.rsp+64]  # (Stack) Scratch space.
+    mem_s3 = [registers.rsp+32]    # (Stack) Working copy of s3. (8x)
+    mem_inc = [registers.rsp]      # (Stack) Counter increment vector.
+
+    # Increment the counter for one side of the state vector.
+    VPXOR(ymm_tmp0, ymm_tmp0, ymm_tmp0)
+    VMOVDQU(mem_inc, ymm_tmp0)
+    reg_tmp = GeneralPurposeRegister32()
+    MOV(reg_tmp, 0x00000001)
+    MOV([registers.rsp+16], reg_tmp)
+    VBROADCASTI128(ymm_v3, x_s3)
+    VPADDQ(ymm_v3, ymm_v3, [registers.rsp])
+    VMOVDQA(mem_s3, ymm_v3)
+
+    # As we process 2xN blocks at a time, so the counter increment for both
+    # sides of the state vector is 2.
+    MOV(reg_tmp, 0x00000002)
+    MOV([registers.rsp], reg_tmp)
+    MOV([registers.rsp+16], reg_tmp)
+
+    out_write_even = Label()
+    out_write_odd = Label()
+
+    #
+    # 8 blocks at a time.  Ted Krovetz's avx2 code does not do this, but it's
+    # a decent gain despite all the pain...
+    #
+
+    reg_rounds = GeneralPurposeRegister64()
+
+    vector_loop8 = Loop()
+    SUB(reg_blocks, 8)
+    JB(vector_loop8.end)
+    with vector_loop8:
+        VBROADCASTI128(ymm_v0, x_s0)
+        VBROADCASTI128(ymm_v1, x_s1)
+        VBROADCASTI128(ymm_v2, x_s2)
+        VMOVDQA(ymm_v3, mem_s3)
+
+        VMOVDQA(ymm_v4, ymm_v0)
+        VMOVDQA(ymm_v5, ymm_v1)
+        VMOVDQA(ymm_v6, ymm_v2)
+        VPADDQ(ymm_v7, ymm_v3, mem_inc)
+
+        VMOVDQA(ymm_v8, ymm_v0)
+        VMOVDQA(ymm_v9, ymm_v1)
+        VMOVDQA(ymm_v10, ymm_v2)
+        VPADDQ(ymm_v11, ymm_v7, mem_inc)
+
+        VMOVDQA(ymm_v12, ymm_v0)
+        VMOVDQA(ymm_v13, ymm_v1)
+        VMOVDQA(ymm_v14, ymm_v2)
+        VPADDQ(ymm_v15, ymm_v11, mem_inc)
+
+        MOV(reg_rounds, 20)
+        rounds_loop8 = Loop()
+        with rounds_loop8:
+            # a += b; d ^= a; d = ROTW16(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            ADD_avx2(ymm_v4, ymm_v5)
+            ADD_avx2(ymm_v8, ymm_v9)
+            ADD_avx2(ymm_v12, ymm_v13)
+            XOR_avx2(ymm_v3, ymm_v0)
+            XOR_avx2(ymm_v7, ymm_v4)
+            XOR_avx2(ymm_v11, ymm_v8)
+            XOR_avx2(ymm_v15, ymm_v12)
+
+            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
+
+            ROTW16_avx2(ymm_tmp0, ymm_v3)
+            ROTW16_avx2(ymm_tmp0, ymm_v7)
+            ROTW16_avx2(ymm_tmp0, ymm_v11)
+            ROTW16_avx2(ymm_tmp0, ymm_v15)
+
+            # c += d; b ^= c; b = ROTW12(b);
+            ADD_avx2(ymm_v2, ymm_v3)
+            ADD_avx2(ymm_v6, ymm_v7)
+            ADD_avx2(ymm_v10, ymm_v11)
+            ADD_avx2(ymm_v14, ymm_v15)
+            XOR_avx2(ymm_v1, ymm_v2)
+            XOR_avx2(ymm_v5, ymm_v6)
+            XOR_avx2(ymm_v9, ymm_v10)
+            XOR_avx2(ymm_v13, ymm_v14)
+            ROTW12_avx2(ymm_tmp0, ymm_v1)
+            ROTW12_avx2(ymm_tmp0, ymm_v5)
+            ROTW12_avx2(ymm_tmp0, ymm_v9)
+            ROTW12_avx2(ymm_tmp0, ymm_v13)
+
+            # a += b; d ^= a; d = ROTW8(d);
+            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
+
+            ADD_avx2(ymm_v0, ymm_v1)
+            ADD_avx2(ymm_v4, ymm_v5)
+            ADD_avx2(ymm_v8, ymm_v9)
+            ADD_avx2(ymm_v12, ymm_v13)
+            XOR_avx2(ymm_v3, ymm_v0)
+            XOR_avx2(ymm_v7, ymm_v4)
+            XOR_avx2(ymm_v11, ymm_v8)
+            XOR_avx2(ymm_v15, ymm_v12)
+
+            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
+
+            ROTW8_avx2(ymm_tmp0, ymm_v3)
+            ROTW8_avx2(ymm_tmp0, ymm_v7)
+            ROTW8_avx2(ymm_tmp0, ymm_v11)
+            ROTW8_avx2(ymm_tmp0, ymm_v15)
+
+            # c += d; b ^= c; b = ROTW7(b)
+            ADD_avx2(ymm_v2, ymm_v3)
+            ADD_avx2(ymm_v6, ymm_v7)
+            ADD_avx2(ymm_v10, ymm_v11)
+            ADD_avx2(ymm_v14, ymm_v15)
+            XOR_avx2(ymm_v1, ymm_v2)
+            XOR_avx2(ymm_v5, ymm_v6)
+            XOR_avx2(ymm_v9, ymm_v10)
+            XOR_avx2(ymm_v13, ymm_v14)
+            ROTW7_avx2(ymm_tmp0, ymm_v1)
+            ROTW7_avx2(ymm_tmp0, ymm_v5)
+            ROTW7_avx2(ymm_tmp0, ymm_v9)
+            ROTW7_avx2(ymm_tmp0, ymm_v13)
+
+            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+            VPSHUFD(ymm_v1, ymm_v1, 0x39)
+            VPSHUFD(ymm_v5, ymm_v5, 0x39)
+            VPSHUFD(ymm_v9, ymm_v9, 0x39)
+            VPSHUFD(ymm_v13, ymm_v13, 0x39)
+            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
+            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
+            VPSHUFD(ymm_v14, ymm_v14, 0x4e)
+            VPSHUFD(ymm_v3, ymm_v3, 0x93)
+            VPSHUFD(ymm_v7, ymm_v7, 0x93)
+            VPSHUFD(ymm_v11, ymm_v11, 0x93)
+            VPSHUFD(ymm_v15, ymm_v15, 0x93)
+
+            # a += b; d ^= a; d = ROTW16(d);
+            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
+
+            ADD_avx2(ymm_v0, ymm_v1)
+            ADD_avx2(ymm_v4, ymm_v5)
+            ADD_avx2(ymm_v8, ymm_v9)
+            ADD_avx2(ymm_v12, ymm_v13)
+            XOR_avx2(ymm_v3, ymm_v0)
+            XOR_avx2(ymm_v7, ymm_v4)
+            XOR_avx2(ymm_v11, ymm_v8)
+            XOR_avx2(ymm_v15, ymm_v12)
+
+            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
+
+            ROTW16_avx2(ymm_tmp0, ymm_v3)
+            ROTW16_avx2(ymm_tmp0, ymm_v7)
+            ROTW16_avx2(ymm_tmp0, ymm_v11)
+            ROTW16_avx2(ymm_tmp0, ymm_v15)
+
+            # c += d; b ^= c; b = ROTW12(b);
+            ADD_avx2(ymm_v2, ymm_v3)
+            ADD_avx2(ymm_v6, ymm_v7)
+            ADD_avx2(ymm_v10, ymm_v11)
+            ADD_avx2(ymm_v14, ymm_v15)
+            XOR_avx2(ymm_v1, ymm_v2)
+            XOR_avx2(ymm_v5, ymm_v6)
+            XOR_avx2(ymm_v9, ymm_v10)
+            XOR_avx2(ymm_v13, ymm_v14)
+            ROTW12_avx2(ymm_tmp0, ymm_v1)
+            ROTW12_avx2(ymm_tmp0, ymm_v5)
+            ROTW12_avx2(ymm_tmp0, ymm_v9)
+            ROTW12_avx2(ymm_tmp0, ymm_v13)
+
+            # a += b; d ^= a; d = ROTW8(d);
+            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
+
+            ADD_avx2(ymm_v0, ymm_v1)
+            ADD_avx2(ymm_v4, ymm_v5)
+            ADD_avx2(ymm_v8, ymm_v9)
+            ADD_avx2(ymm_v12, ymm_v13)
+            XOR_avx2(ymm_v3, ymm_v0)
+            XOR_avx2(ymm_v7, ymm_v4)
+            XOR_avx2(ymm_v11, ymm_v8)
+            XOR_avx2(ymm_v15, ymm_v12)
+
+            VMOVDQA(mem_tmp0, ymm_tmp0) # Save
+
+            ROTW8_avx2(ymm_tmp0, ymm_v3)
+            ROTW8_avx2(ymm_tmp0, ymm_v7)
+            ROTW8_avx2(ymm_tmp0, ymm_v11)
+            ROTW8_avx2(ymm_tmp0, ymm_v15)
+
+            # c += d; b ^= c; b = ROTW7(b)
+            ADD_avx2(ymm_v2, ymm_v3)
+            ADD_avx2(ymm_v6, ymm_v7)
+            ADD_avx2(ymm_v10, ymm_v11)
+            ADD_avx2(ymm_v14, ymm_v15)
+            XOR_avx2(ymm_v1, ymm_v2)
+            XOR_avx2(ymm_v5, ymm_v6)
+            XOR_avx2(ymm_v9, ymm_v10)
+            XOR_avx2(ymm_v13, ymm_v14)
+            ROTW7_avx2(ymm_tmp0, ymm_v1)
+            ROTW7_avx2(ymm_tmp0, ymm_v5)
+            ROTW7_avx2(ymm_tmp0, ymm_v9)
+            ROTW7_avx2(ymm_tmp0, ymm_v13)
+
+            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+            VPSHUFD(ymm_v1, ymm_v1, 0x93)
+            VPSHUFD(ymm_v5, ymm_v5, 0x93)
+            VPSHUFD(ymm_v9, ymm_v9, 0x93)
+            VPSHUFD(ymm_v13, ymm_v13, 0x93)
+            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+            VPSHUFD(ymm_v6, ymm_v6, 0x4e)
+            VPSHUFD(ymm_v10, ymm_v10, 0x4e)
+            VPSHUFD(ymm_v14, ymm_v14, 0x4e)
+            VPSHUFD(ymm_v3, ymm_v3, 0x39)
+            VPSHUFD(ymm_v7, ymm_v7, 0x39)
+            VPSHUFD(ymm_v11, ymm_v11, 0x39)
+            VPSHUFD(ymm_v15, ymm_v15, 0x39)
+
+            VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
+
+            SUB(reg_rounds, 2)
+            JNZ(rounds_loop8.begin)
+
+        # ymm_v12 is in mem_tmp0 and is current....
+
+        # XXX: I assume VBROADCASTI128 is about as fast as VMOVDQA....
+        VBROADCASTI128(ymm_tmp0, x_s0)
+        ADD_avx2(ymm_v0, ymm_tmp0)
+        ADD_avx2(ymm_v4, ymm_tmp0)
+        ADD_avx2(ymm_v8, ymm_tmp0)
+        ADD_avx2(ymm_tmp0, mem_tmp0)
+        VMOVDQA(mem_tmp0, ymm_tmp0)
+
+        VBROADCASTI128(ymm_tmp0, x_s1)
+        ADD_avx2(ymm_v1, ymm_tmp0)
+        ADD_avx2(ymm_v5, ymm_tmp0)
+        ADD_avx2(ymm_v9, ymm_tmp0)
+        ADD_avx2(ymm_v13, ymm_tmp0)
+
+        VBROADCASTI128(ymm_tmp0, x_s2)
+        ADD_avx2(ymm_v2, ymm_tmp0)
+        ADD_avx2(ymm_v6, ymm_tmp0)
+        ADD_avx2(ymm_v10, ymm_tmp0)
+        ADD_avx2(ymm_v14, ymm_tmp0)
+
+        ADD_avx2(ymm_v3, mem_s3)
+        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
+        VMOVDQA(ymm_v3, mem_s3)
+        ADD_avx2(ymm_v3, mem_inc)
+
+        ADD_avx2(ymm_v7, ymm_v3)
+        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
+        ADD_avx2(ymm_v3, mem_inc)
+
+        ADD_avx2(ymm_v11, ymm_v3)
+        WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 256, ymm_v8, ymm_v9, ymm_v10, ymm_v11)
+        ADD_avx2(ymm_v3, mem_inc)
+
+        VMOVDQA(ymm_v12, mem_tmp0)
+        ADD_avx2(ymm_v15, ymm_v3)
+        WriteXor_avx2(ymm_v0, reg_inp, reg_outp, 384, ymm_v12, ymm_v13, ymm_v14, ymm_v15)
+        ADD_avx2(ymm_v3, mem_inc)
+
+        VMOVDQA(mem_s3, ymm_v3)
+
+        ADD(reg_inp, 8 * 64)
+        ADD(reg_outp, 8 * 64)
+
+        SUB(reg_blocks, 8)
+        JAE(vector_loop8.begin)
+
+    # ymm_v3 contains a current copy of mem_s3 either from when it was built,
+    # or because the loop updates it.  Copy this before we mess with the block
+    # counter in case we need to write it back and return.
+    ymm_s3 = ymm_v11
+    VMOVDQA(ymm_s3, ymm_v3)
+
+    ADD(reg_blocks, 8)
+    JZ(out_write_even)
+
+    # We now actually can do everything in registers.
+    ymm_s0 = ymm_v8
+    VBROADCASTI128(ymm_s0, x_s0)
+    ymm_s1 = ymm_v9
+    VBROADCASTI128(ymm_s1, x_s1)
+    ymm_s2 = ymm_v10
+    VBROADCASTI128(ymm_s2, x_s2)
+    ymm_inc = ymm_v14
+    VMOVDQA(ymm_inc, mem_inc)
+
+    #
+    # 4 blocks at a time.
+    #
+
+    process_2_blocks = Label()
+    SUB(reg_blocks, 4)
+    JB(process_2_blocks) # < 4 blocks remaining.
+
+    VMOVDQA(ymm_v0, ymm_s0)
+    VMOVDQA(ymm_v1, ymm_s1)
+    VMOVDQA(ymm_v2, ymm_s2)
+    VMOVDQA(ymm_v3, ymm_s3)
+
+    VMOVDQA(ymm_v4, ymm_v0)
+    VMOVDQA(ymm_v5, ymm_v1)
+    VMOVDQA(ymm_v6, ymm_v2)
+    VPADDQ(ymm_v7, ymm_v3, ymm_inc)
+
+    MOV(reg_rounds, 20)
+    rounds_loop4 = Loop()
+    with rounds_loop4:
+        # a += b; d ^= a; d = ROTW16(d);
+        ADD_avx2(ymm_v0, ymm_v1)
+        ADD_avx2(ymm_v4, ymm_v5)
+        XOR_avx2(ymm_v3, ymm_v0)
+        XOR_avx2(ymm_v7, ymm_v4)
+        ROTW16_avx2(ymm_tmp0, ymm_v3)
+        ROTW16_avx2(ymm_tmp0, ymm_v7)
+
+        # c += d; b ^= c; b = ROTW12(b);
+        ADD_avx2(ymm_v2, ymm_v3)
+        ADD_avx2(ymm_v6, ymm_v7)
+        XOR_avx2(ymm_v1, ymm_v2)
+        XOR_avx2(ymm_v5, ymm_v6)
+        ROTW12_avx2(ymm_tmp0, ymm_v1)
+        ROTW12_avx2(ymm_tmp0, ymm_v5)
+
+        # a += b; d ^= a; d = ROTW8(d);
+        ADD_avx2(ymm_v0, ymm_v1)
+        ADD_avx2(ymm_v4, ymm_v5)
+        XOR_avx2(ymm_v3, ymm_v0)
+        XOR_avx2(ymm_v7, ymm_v4)
+        ROTW8_avx2(ymm_tmp0, ymm_v3)
+        ROTW8_avx2(ymm_tmp0, ymm_v7)
+
+        # c += d; b ^= c; b = ROTW7(b)
+        ADD_avx2(ymm_v2, ymm_v3)
+        ADD_avx2(ymm_v6, ymm_v7)
+        XOR_avx2(ymm_v1, ymm_v2)
+        XOR_avx2(ymm_v5, ymm_v6)
+        ROTW7_avx2(ymm_tmp0, ymm_v1)
+        ROTW7_avx2(ymm_tmp0, ymm_v5)
+
+        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+        VPSHUFD(ymm_v1, ymm_v1, 0x39)
+        VPSHUFD(ymm_v5, ymm_v5, 0x39)
+        VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+        VPSHUFD(ymm_v6, ymm_v6, 0x4e)
+        VPSHUFD(ymm_v3, ymm_v3, 0x93)
+        VPSHUFD(ymm_v7, ymm_v7, 0x93)
+
+        # a += b; d ^= a; d = ROTW16(d);
+        ADD_avx2(ymm_v0, ymm_v1)
+        ADD_avx2(ymm_v4, ymm_v5)
+        XOR_avx2(ymm_v3, ymm_v0)
+        XOR_avx2(ymm_v7, ymm_v4)
+        ROTW16_avx2(ymm_tmp0, ymm_v3)
+        ROTW16_avx2(ymm_tmp0, ymm_v7)
+
+        # c += d; b ^= c; b = ROTW12(b);
+        ADD_avx2(ymm_v2, ymm_v3)
+        ADD_avx2(ymm_v6, ymm_v7)
+        XOR_avx2(ymm_v1, ymm_v2)
+        XOR_avx2(ymm_v5, ymm_v6)
+        ROTW12_avx2(ymm_tmp0, ymm_v1)
+        ROTW12_avx2(ymm_tmp0, ymm_v5)
+
+        # a += b; d ^= a; d = ROTW8(d);
+        ADD_avx2(ymm_v0, ymm_v1)
+        ADD_avx2(ymm_v4, ymm_v5)
+        XOR_avx2(ymm_v3, ymm_v0)
+        XOR_avx2(ymm_v7, ymm_v4)
+        ROTW8_avx2(ymm_tmp0, ymm_v3)
+        ROTW8_avx2(ymm_tmp0, ymm_v7)
+
+        # c += d; b ^= c; b = ROTW7(b)
+        ADD_avx2(ymm_v2, ymm_v3)
+        ADD_avx2(ymm_v6, ymm_v7)
+        XOR_avx2(ymm_v1, ymm_v2)
+        XOR_avx2(ymm_v5, ymm_v6)
+        ROTW7_avx2(ymm_tmp0, ymm_v1)
+        ROTW7_avx2(ymm_tmp0, ymm_v5)
+
+        # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+        VPSHUFD(ymm_v1, ymm_v1, 0x93)
+        VPSHUFD(ymm_v5, ymm_v5, 0x93)
+        VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+        VPSHUFD(ymm_v6, ymm_v6, 0x4e)
+        VPSHUFD(ymm_v3, ymm_v3, 0x39)
+        VPSHUFD(ymm_v7, ymm_v7, 0x39)
+
+        SUB(reg_rounds, 2)
+        JNZ(rounds_loop4.begin)
+
+    ADD_avx2(ymm_v0, ymm_s0)
+    ADD_avx2(ymm_v1, ymm_s1)
+    ADD_avx2(ymm_v2, ymm_s2)
+    ADD_avx2(ymm_v3, ymm_s3)
+    WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
+    ADD_avx2(ymm_s3, ymm_inc)
+
+    ADD_avx2(ymm_v4, ymm_s0)
+    ADD_avx2(ymm_v5, ymm_s1)
+    ADD_avx2(ymm_v6, ymm_s2)
+    ADD_avx2(ymm_v7, ymm_s3)
+    WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
+    ADD_avx2(ymm_s3, ymm_inc)
+
+    ADD(reg_inp, 4 * 64)
+    ADD(reg_outp, 4 * 64)
+    SUB(reg_blocks, 4)
+
+    LABEL(process_2_blocks)
+    ADD(reg_blocks, 4)
+    JZ(out_write_even) # 0 blocks left.
+
+    #
+    # 2/1 blocks at a time.  The two codepaths are unified because
+    # with AVX2 we do 2 blocks at a time anyway, and this only gets called
+    # if 3/2/1 blocks are remaining, so the extra branches don't hurt that
+    # much.
+    #
+
+    vector_loop2 = Loop()
+    with vector_loop2:
+        VMOVDQA(ymm_v0, ymm_s0)
+        VMOVDQA(ymm_v1, ymm_s1)
+        VMOVDQA(ymm_v2, ymm_s2)
+        VMOVDQA(ymm_v3, ymm_s3)
+
+        MOV(reg_rounds, 20)
+        rounds_loop2 = Loop()
+        with rounds_loop2:
+            # a += b; d ^= a; d = ROTW16(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            XOR_avx2(ymm_v3, ymm_v0)
+            ROTW16_avx2(ymm_tmp0, ymm_v3)
+
+            # c += d; b ^= c; b = ROTW12(b);
+            ADD_avx2(ymm_v2, ymm_v3)
+            XOR_avx2(ymm_v1, ymm_v2)
+            ROTW12_avx2(ymm_tmp0, ymm_v1)
+
+            # a += b; d ^= a; d = ROTW8(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            XOR_avx2(ymm_v3, ymm_v0)
+            ROTW8_avx2(ymm_tmp0, ymm_v3)
+
+            # c += d; b ^= c; b = ROTW7(b)
+            ADD_avx2(ymm_v2, ymm_v3)
+            XOR_avx2(ymm_v1, ymm_v2)
+            ROTW7_avx2(ymm_tmp0, ymm_v1)
+
+            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+            VPSHUFD(ymm_v1, ymm_v1, 0x39)
+            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+            VPSHUFD(ymm_v3, ymm_v3, 0x93)
+
+            # a += b; d ^= a; d = ROTW16(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            XOR_avx2(ymm_v3, ymm_v0)
+            ROTW16_avx2(ymm_tmp0, ymm_v3)
+
+            # c += d; b ^= c; b = ROTW12(b);
+            ADD_avx2(ymm_v2, ymm_v3)
+            XOR_avx2(ymm_v1, ymm_v2)
+            ROTW12_avx2(ymm_tmp0, ymm_v1)
+
+            # a += b; d ^= a; d = ROTW8(d);
+            ADD_avx2(ymm_v0, ymm_v1)
+            XOR_avx2(ymm_v3, ymm_v0)
+            ROTW8_avx2(ymm_tmp0, ymm_v3)
+
+            # c += d; b ^= c; b = ROTW7(b)
+            ADD_avx2(ymm_v2, ymm_v3)
+            XOR_avx2(ymm_v1, ymm_v2)
+            ROTW7_avx2(ymm_tmp0, ymm_v1)
+
+            # b = ROTV1(b); c = ROTV2(c);  d = ROTV3(d);
+            VPSHUFD(ymm_v1, ymm_v1, 0x93)
+            VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+            VPSHUFD(ymm_v3, ymm_v3, 0x39)
+
+            SUB(reg_rounds, 2)
+            JNZ(rounds_loop2.begin)
+
+        ADD_avx2(ymm_v0, ymm_s0)
+        ADD_avx2(ymm_v1, ymm_s1)
+        ADD_avx2(ymm_v2, ymm_s2)
+        ADD_avx2(ymm_v3, ymm_s3)
+
+        # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
+        VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x20)
+        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp])
+        VMOVDQU([reg_outp], ymm_tmp0)
+
+        # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
+        VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x20)
+        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+32])
+        VMOVDQU([reg_outp+32], ymm_tmp0)
+
+        SUB(reg_blocks, 1)
+        JZ(out_write_odd)
+
+        ADD_avx2(ymm_s3, ymm_inc)
+
+        # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
+        VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x31)
+        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+64])
+        VMOVDQU([reg_outp+64], ymm_tmp0)
+
+        # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
+        VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x31)
+        VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+96])
+        VMOVDQU([reg_outp+96], ymm_tmp0)
+
+        SUB(reg_blocks, 1)
+        JZ(out_write_even)
+
+        ADD(reg_inp, 2 * 64)
+        ADD(reg_outp, 2 * 64)
+        JMP(vector_loop2.begin)
+
+    LABEL(out_write_odd)
+    VPERM2I128(ymm_s3, ymm_s3, ymm_s3, 0x01) # Odd number of blocks.
+
+    LABEL(out_write_even)
+    VMOVDQU(x_s3, ymm_s3.as_xmm) # Write back ymm_s3 to x_v3
+
+    # Paranoia, cleanse the scratch space.
+    VPXOR(ymm_v0, ymm_v0, ymm_v0)
+    VMOVDQA(mem_tmp0, ymm_v0)
+    VMOVDQA(mem_s3, ymm_v0)
+
+    # Clear all YMM (and XMM) registers.
+    VZEROALL()
+
+    # Remove our stack allocation.
+    MOV(registers.rsp, reg_sp_save)
+
+    RETURN()
+
+#
+# CPUID: cpuidAmd64 loads EAX and ECX from the uint32 array at cpuidParams, executes CPUID, and stores EAX/EBX/ECX/EDX back into it.
+#
+
+cpuidParams = Argument(ptr(uint32_t))  # Accessed below at byte offsets 0 (EAX), 4 (EBX), 8 (ECX) and 12 (EDX).
+
+with Function("cpuidAmd64", (cpuidParams,)):
+    reg_params = registers.r15  # Fixed register; presumably chosen because it is not one of the four CPUID outputs.
+    LOAD.ARGUMENT(reg_params, cpuidParams)
+
+    MOV(registers.eax, [reg_params])  # Input: value for EAX.
+    MOV(registers.ecx, [reg_params+8])  # Input: value for ECX.
+
+    CPUID()
+
+    MOV([reg_params], registers.eax)  # Results overwrite the caller's array in place.
+    MOV([reg_params+4], registers.ebx)
+    MOV([reg_params+8], registers.ecx)
+    MOV([reg_params+12], registers.edx)
+
+    RETURN()
+
+#
+# XGETBV (ECX = 0): xgetbv0Amd64 executes XGETBV with ECX cleared and stores EAX, then EDX, into the two uint32 slots at xcrVec.
+#
+
+xcrVec = Argument(ptr(uint32_t))  # Output: [0] receives EAX, [1] receives EDX.
+
+with Function("xgetbv0Amd64", (xcrVec,)):
+    reg_vec = GeneralPurposeRegister64()
+
+    LOAD.ARGUMENT(reg_vec, xcrVec)
+
+    XOR(registers.ecx, registers.ecx)  # ECX = 0 (the register index passed to XGETBV).
+
+    XGETBV()
+
+    MOV([reg_vec], registers.eax)
+    MOV([reg_vec+4], registers.edx)
+
+    RETURN()

+ 1180 - 0
vendor/github.com/Yawning/chacha20/chacha20_amd64.s

@@ -0,0 +1,1180 @@
+// +build !noasm
+// Generated by PeachPy 0.2.0 from chacha20_amd64.py
+
+
+// func blocksAmd64SSE2(x *uint32, inp *uint8, outp *uint8, nrBlocks *uint)
+TEXT ·blocksAmd64SSE2(SB),4,$0-32 // SSE2 path: XORs nrBlocks 64-byte keystream blocks with inp into outp and updates the row at 48(x).
+	MOVQ x+0(FP), AX // AX = state pointer (rows at 0, 16, 32, 48).
+	MOVQ inp+8(FP), BX // BX = input pointer.
+	MOVQ outp+16(FP), CX // CX = output pointer.
+	MOVQ nrBlocks+24(FP), DX // DX = block count (used as a value below, despite the pointer type in the prototype comment).
+	MOVQ SP, DI // Save SP; restored at out.
+	ANDQ $18446744073709551584, SP // Align SP down to 32 bytes (mask 0xffffffffffffffe0).
+	SUBQ $32, SP
+	PXOR X0, X0
+	SUBQ $32, SP // Scratch: 0(SP) = counter increment, 16(SP) = spill slot for X12.
+	MOVO X0, 0(SP)
+	MOVL $1, SI
+	MOVL SI, 0(SP) // Increment vector = 1 in the low dword, zero elsewhere.
+	SUBQ $4, DX
+	JCS vector_loop4_end // Fewer than 4 blocks: skip the 4-block loop.
+vector_loop4_begin:
+		MOVOU 0(AX), X0 // Block 0: X0..X3 = state rows a, b, c, d.
+		MOVOU 16(AX), X1
+		MOVOU 32(AX), X2
+		MOVOU 48(AX), X3
+		MOVO X0, X4 // Block 1: X4..X7, counter row + 1.
+		MOVO X1, X5
+		MOVO X2, X6
+		MOVO X3, X7
+		PADDQ 0(SP), X7
+		MOVO X0, X8 // Block 2: X8..X11, counter row + 2.
+		MOVO X1, X9
+		MOVO X2, X10
+		MOVO X7, X11
+		PADDQ 0(SP), X11
+		MOVO X0, X12 // Block 3: X12..X15, counter row + 3.
+		MOVO X1, X13
+		MOVO X2, X14
+		MOVO X11, X15
+		PADDQ 0(SP), X15
+		MOVQ $20, SI // 20 rounds, two per loop iteration.
+rounds_loop4_begin:
+			PADDL X1, X0 // a += b
+			PADDL X5, X4
+			PADDL X9, X8
+			PADDL X13, X12
+			PXOR X0, X3 // d ^= a
+			PXOR X4, X7
+			PXOR X8, X11
+			PXOR X12, X15
+			MOVO X12, 16(SP) // Spill X12 so it can serve as the rotate temporary.
+			MOVO X3, X12 // d = rotate-left-16(d): (d << 16) ^ (d >> 16)
+			PSLLL $16, X12
+			PSRLL $16, X3
+			PXOR X12, X3
+			MOVO X7, X12
+			PSLLL $16, X12
+			PSRLL $16, X7
+			PXOR X12, X7
+			MOVO X11, X12
+			PSLLL $16, X12
+			PSRLL $16, X11
+			PXOR X12, X11
+			MOVO X15, X12
+			PSLLL $16, X12
+			PSRLL $16, X15
+			PXOR X12, X15
+			PADDL X3, X2 // c += d
+			PADDL X7, X6
+			PADDL X11, X10
+			PADDL X15, X14
+			PXOR X2, X1 // b ^= c
+			PXOR X6, X5
+			PXOR X10, X9
+			PXOR X14, X13
+			MOVO X1, X12 // b = rotate-left-12(b)
+			PSLLL $12, X12
+			PSRLL $20, X1
+			PXOR X12, X1
+			MOVO X5, X12
+			PSLLL $12, X12
+			PSRLL $20, X5
+			PXOR X12, X5
+			MOVO X9, X12
+			PSLLL $12, X12
+			PSRLL $20, X9
+			PXOR X12, X9
+			MOVO X13, X12
+			PSLLL $12, X12
+			PSRLL $20, X13
+			PXOR X12, X13
+			MOVO 16(SP), X12 // Restore X12 (row a of block 3).
+			PADDL X1, X0 // a += b
+			PADDL X5, X4
+			PADDL X9, X8
+			PADDL X13, X12
+			PXOR X0, X3 // d ^= a
+			PXOR X4, X7
+			PXOR X8, X11
+			PXOR X12, X15
+			MOVO X12, 16(SP) // Spill X12 again.
+			MOVO X3, X12 // d = rotate-left-8(d)
+			PSLLL $8, X12
+			PSRLL $24, X3
+			PXOR X12, X3
+			MOVO X7, X12
+			PSLLL $8, X12
+			PSRLL $24, X7
+			PXOR X12, X7
+			MOVO X11, X12
+			PSLLL $8, X12
+			PSRLL $24, X11
+			PXOR X12, X11
+			MOVO X15, X12
+			PSLLL $8, X12
+			PSRLL $24, X15
+			PXOR X12, X15
+			PADDL X3, X2 // c += d
+			PADDL X7, X6
+			PADDL X11, X10
+			PADDL X15, X14
+			PXOR X2, X1 // b ^= c
+			PXOR X6, X5
+			PXOR X10, X9
+			PXOR X14, X13
+			MOVO X1, X12 // b = rotate-left-7(b)
+			PSLLL $7, X12
+			PSRLL $25, X1
+			PXOR X12, X1
+			MOVO X5, X12
+			PSLLL $7, X12
+			PSRLL $25, X5
+			PXOR X12, X5
+			MOVO X9, X12
+			PSLLL $7, X12
+			PSRLL $25, X9
+			PXOR X12, X9
+			MOVO X13, X12
+			PSLLL $7, X12
+			PSRLL $25, X13
+			PXOR X12, X13
+			PSHUFL $57, X1, X1 // Shuffle dwords within rows: b by 0x39, c by 0x4e, d by 0x93.
+			PSHUFL $57, X5, X5
+			PSHUFL $57, X9, X9
+			PSHUFL $57, X13, X13
+			PSHUFL $78, X2, X2
+			PSHUFL $78, X6, X6
+			PSHUFL $78, X10, X10
+			PSHUFL $78, X14, X14
+			PSHUFL $147, X3, X3
+			PSHUFL $147, X7, X7
+			PSHUFL $147, X11, X11
+			PSHUFL $147, X15, X15
+			MOVO 16(SP), X12 // Restore X12.
+			PADDL X1, X0 // Second round of the pair: same add/xor/rotate sequence on the shuffled rows.
+			PADDL X5, X4
+			PADDL X9, X8
+			PADDL X13, X12
+			PXOR X0, X3
+			PXOR X4, X7
+			PXOR X8, X11
+			PXOR X12, X15
+			MOVO X12, 16(SP)
+			MOVO X3, X12 // d = rotate-left-16(d)
+			PSLLL $16, X12
+			PSRLL $16, X3
+			PXOR X12, X3
+			MOVO X7, X12
+			PSLLL $16, X12
+			PSRLL $16, X7
+			PXOR X12, X7
+			MOVO X11, X12
+			PSLLL $16, X12
+			PSRLL $16, X11
+			PXOR X12, X11
+			MOVO X15, X12
+			PSLLL $16, X12
+			PSRLL $16, X15
+			PXOR X12, X15
+			PADDL X3, X2
+			PADDL X7, X6
+			PADDL X11, X10
+			PADDL X15, X14
+			PXOR X2, X1
+			PXOR X6, X5
+			PXOR X10, X9
+			PXOR X14, X13
+			MOVO X1, X12 // b = rotate-left-12(b)
+			PSLLL $12, X12
+			PSRLL $20, X1
+			PXOR X12, X1
+			MOVO X5, X12
+			PSLLL $12, X12
+			PSRLL $20, X5
+			PXOR X12, X5
+			MOVO X9, X12
+			PSLLL $12, X12
+			PSRLL $20, X9
+			PXOR X12, X9
+			MOVO X13, X12
+			PSLLL $12, X12
+			PSRLL $20, X13
+			PXOR X12, X13
+			MOVO 16(SP), X12
+			PADDL X1, X0
+			PADDL X5, X4
+			PADDL X9, X8
+			PADDL X13, X12
+			PXOR X0, X3
+			PXOR X4, X7
+			PXOR X8, X11
+			PXOR X12, X15
+			MOVO X12, 16(SP)
+			MOVO X3, X12 // d = rotate-left-8(d)
+			PSLLL $8, X12
+			PSRLL $24, X3
+			PXOR X12, X3
+			MOVO X7, X12
+			PSLLL $8, X12
+			PSRLL $24, X7
+			PXOR X12, X7
+			MOVO X11, X12
+			PSLLL $8, X12
+			PSRLL $24, X11
+			PXOR X12, X11
+			MOVO X15, X12
+			PSLLL $8, X12
+			PSRLL $24, X15
+			PXOR X12, X15
+			PADDL X3, X2
+			PADDL X7, X6
+			PADDL X11, X10
+			PADDL X15, X14
+			PXOR X2, X1
+			PXOR X6, X5
+			PXOR X10, X9
+			PXOR X14, X13
+			MOVO X1, X12 // b = rotate-left-7(b)
+			PSLLL $7, X12
+			PSRLL $25, X1
+			PXOR X12, X1
+			MOVO X5, X12
+			PSLLL $7, X12
+			PSRLL $25, X5
+			PXOR X12, X5
+			MOVO X9, X12
+			PSLLL $7, X12
+			PSRLL $25, X9
+			PXOR X12, X9
+			MOVO X13, X12
+			PSLLL $7, X12
+			PSRLL $25, X13
+			PXOR X12, X13
+			PSHUFL $147, X1, X1 // Inverse shuffle (constants swapped): b by 0x93, c by 0x4e, d by 0x39.
+			PSHUFL $147, X5, X5
+			PSHUFL $147, X9, X9
+			PSHUFL $147, X13, X13
+			PSHUFL $78, X2, X2
+			PSHUFL $78, X6, X6
+			PSHUFL $78, X10, X10
+			PSHUFL $78, X14, X14
+			PSHUFL $57, X3, X3
+			PSHUFL $57, X7, X7
+			PSHUFL $57, X11, X11
+			PSHUFL $57, X15, X15
+			MOVO 16(SP), X12
+			SUBQ $2, SI
+			JNE rounds_loop4_begin
+		MOVO X12, 16(SP) // Park block 3's row a; X12 becomes the load/xor temporary.
+		PADDL 0(AX), X0 // Block 0: add the input state, then out = in ^ keystream.
+		PADDL 16(AX), X1
+		PADDL 32(AX), X2
+		PADDL 48(AX), X3
+		MOVOU 0(BX), X12
+		PXOR X0, X12
+		MOVOU X12, 0(CX)
+		MOVOU 16(BX), X12
+		PXOR X1, X12
+		MOVOU X12, 16(CX)
+		MOVOU 32(BX), X12
+		PXOR X2, X12
+		MOVOU X12, 32(CX)
+		MOVOU 48(BX), X12
+		PXOR X3, X12
+		MOVOU X12, 48(CX)
+		MOVOU 48(AX), X3 // X3 now tracks the input counter row for blocks 1..3.
+		PADDQ 0(SP), X3
+		PADDL 0(AX), X4 // Block 1.
+		PADDL 16(AX), X5
+		PADDL 32(AX), X6
+		PADDL X3, X7
+		MOVOU 64(BX), X12
+		PXOR X4, X12
+		MOVOU X12, 64(CX)
+		MOVOU 80(BX), X12
+		PXOR X5, X12
+		MOVOU X12, 80(CX)
+		MOVOU 96(BX), X12
+		PXOR X6, X12
+		MOVOU X12, 96(CX)
+		MOVOU 112(BX), X12
+		PXOR X7, X12
+		MOVOU X12, 112(CX)
+		PADDQ 0(SP), X3
+		PADDL 0(AX), X8 // Block 2.
+		PADDL 16(AX), X9
+		PADDL 32(AX), X10
+		PADDL X3, X11
+		MOVOU 128(BX), X12
+		PXOR X8, X12
+		MOVOU X12, 128(CX)
+		MOVOU 144(BX), X12
+		PXOR X9, X12
+		MOVOU X12, 144(CX)
+		MOVOU 160(BX), X12
+		PXOR X10, X12
+		MOVOU X12, 160(CX)
+		MOVOU 176(BX), X12
+		PXOR X11, X12
+		MOVOU X12, 176(CX)
+		PADDQ 0(SP), X3
+		MOVO 16(SP), X12 // Block 3: reload its row a; X0 takes over as the temporary.
+		PADDL 0(AX), X12
+		PADDL 16(AX), X13
+		PADDL 32(AX), X14
+		PADDL X3, X15
+		MOVOU 192(BX), X0
+		PXOR X12, X0
+		MOVOU X0, 192(CX)
+		MOVOU 208(BX), X0
+		PXOR X13, X0
+		MOVOU X0, 208(CX)
+		MOVOU 224(BX), X0
+		PXOR X14, X0
+		MOVOU X0, 224(CX)
+		MOVOU 240(BX), X0
+		PXOR X15, X0
+		MOVOU X0, 240(CX)
+		PADDQ 0(SP), X3
+		MOVOU X3, 48(AX) // Store the advanced counter row (+4) back into the state.
+		ADDQ $256, BX
+		ADDQ $256, CX
+		SUBQ $4, DX
+		JCC vector_loop4_begin // Loop while at least 4 more blocks remain.
+vector_loop4_end:
+	ADDQ $4, DX // Undo the bias: DX = blocks remaining (0..3).
+	JEQ out
+	MOVOU 0(AX), X8 // X8..X11 keep the input state rows for the tail paths.
+	MOVOU 16(AX), X9
+	MOVOU 32(AX), X10
+	MOVOU 48(AX), X11
+	MOVO 0(SP), X13 // X13 = counter increment.
+	SUBQ $2, DX
+	JCS process_1_block // Fewer than 2 blocks remaining.
+	MOVO X8, X0 // Two blocks: X0..X3 and X4..X7 (counter row + 1).
+	MOVO X9, X1
+	MOVO X10, X2
+	MOVO X11, X3
+	MOVO X0, X4
+	MOVO X1, X5
+	MOVO X2, X6
+	MOVO X3, X7
+	PADDQ X13, X7
+	MOVQ $20, SI
+rounds_loop2_begin:
+		PADDL X1, X0 // a += b; d ^= a; d = rotate-left-16(d)
+		PADDL X5, X4
+		PXOR X0, X3
+		PXOR X4, X7
+		MOVO X3, X12
+		PSLLL $16, X12
+		PSRLL $16, X3
+		PXOR X12, X3
+		MOVO X7, X12
+		PSLLL $16, X12
+		PSRLL $16, X7
+		PXOR X12, X7
+		PADDL X3, X2 // c += d; b ^= c; b = rotate-left-12(b)
+		PADDL X7, X6
+		PXOR X2, X1
+		PXOR X6, X5
+		MOVO X1, X12
+		PSLLL $12, X12
+		PSRLL $20, X1
+		PXOR X12, X1
+		MOVO X5, X12
+		PSLLL $12, X12
+		PSRLL $20, X5
+		PXOR X12, X5
+		PADDL X1, X0 // a += b; d ^= a; d = rotate-left-8(d)
+		PADDL X5, X4
+		PXOR X0, X3
+		PXOR X4, X7
+		MOVO X3, X12
+		PSLLL $8, X12
+		PSRLL $24, X3
+		PXOR X12, X3
+		MOVO X7, X12
+		PSLLL $8, X12
+		PSRLL $24, X7
+		PXOR X12, X7
+		PADDL X3, X2 // c += d; b ^= c; b = rotate-left-7(b)
+		PADDL X7, X6
+		PXOR X2, X1
+		PXOR X6, X5
+		MOVO X1, X12
+		PSLLL $7, X12
+		PSRLL $25, X1
+		PXOR X12, X1
+		MOVO X5, X12
+		PSLLL $7, X12
+		PSRLL $25, X5
+		PXOR X12, X5
+		PSHUFL $57, X1, X1 // Shuffle rows b, c, d (0x39, 0x4e, 0x93).
+		PSHUFL $57, X5, X5
+		PSHUFL $78, X2, X2
+		PSHUFL $78, X6, X6
+		PSHUFL $147, X3, X3
+		PSHUFL $147, X7, X7
+		PADDL X1, X0 // Second round of the pair.
+		PADDL X5, X4
+		PXOR X0, X3
+		PXOR X4, X7
+		MOVO X3, X12
+		PSLLL $16, X12
+		PSRLL $16, X3
+		PXOR X12, X3
+		MOVO X7, X12
+		PSLLL $16, X12
+		PSRLL $16, X7
+		PXOR X12, X7
+		PADDL X3, X2
+		PADDL X7, X6
+		PXOR X2, X1
+		PXOR X6, X5
+		MOVO X1, X12
+		PSLLL $12, X12
+		PSRLL $20, X1
+		PXOR X12, X1
+		MOVO X5, X12
+		PSLLL $12, X12
+		PSRLL $20, X5
+		PXOR X12, X5
+		PADDL X1, X0
+		PADDL X5, X4
+		PXOR X0, X3
+		PXOR X4, X7
+		MOVO X3, X12
+		PSLLL $8, X12
+		PSRLL $24, X3
+		PXOR X12, X3
+		MOVO X7, X12
+		PSLLL $8, X12
+		PSRLL $24, X7
+		PXOR X12, X7
+		PADDL X3, X2
+		PADDL X7, X6
+		PXOR X2, X1
+		PXOR X6, X5
+		MOVO X1, X12
+		PSLLL $7, X12
+		PSRLL $25, X1
+		PXOR X12, X1
+		MOVO X5, X12
+		PSLLL $7, X12
+		PSRLL $25, X5
+		PXOR X12, X5
+		PSHUFL $147, X1, X1 // Inverse shuffle (0x93, 0x4e, 0x39).
+		PSHUFL $147, X5, X5
+		PSHUFL $78, X2, X2
+		PSHUFL $78, X6, X6
+		PSHUFL $57, X3, X3
+		PSHUFL $57, X7, X7
+		SUBQ $2, SI
+		JNE rounds_loop2_begin
+	PADDL X8, X0 // First block: add the input state, then out = in ^ keystream.
+	PADDL X9, X1
+	PADDL X10, X2
+	PADDL X11, X3
+	MOVOU 0(BX), X12
+	PXOR X0, X12
+	MOVOU X12, 0(CX)
+	MOVOU 16(BX), X12
+	PXOR X1, X12
+	MOVOU X12, 16(CX)
+	MOVOU 32(BX), X12
+	PXOR X2, X12
+	MOVOU X12, 32(CX)
+	MOVOU 48(BX), X12
+	PXOR X3, X12
+	MOVOU X12, 48(CX)
+	PADDQ X13, X11 // Advance the saved counter row for the second block.
+	PADDL X8, X4
+	PADDL X9, X5
+	PADDL X10, X6
+	PADDL X11, X7
+	MOVOU 64(BX), X12
+	PXOR X4, X12
+	MOVOU X12, 64(CX)
+	MOVOU 80(BX), X12
+	PXOR X5, X12
+	MOVOU X12, 80(CX)
+	MOVOU 96(BX), X12
+	PXOR X6, X12
+	MOVOU X12, 96(CX)
+	MOVOU 112(BX), X12
+	PXOR X7, X12
+	MOVOU X12, 112(CX)
+	PADDQ X13, X11 // Counter row now points past both blocks.
+	ADDQ $128, BX
+	ADDQ $128, CX
+	SUBQ $2, DX // Re-bias so the ADDQ at process_1_block yields the true remainder.
+process_1_block:
+	ADDQ $2, DX
+	JEQ out_serial // Nothing left: just write the counter row back.
+	MOVO X8, X0 // Single block in X0..X3.
+	MOVO X9, X1
+	MOVO X10, X2
+	MOVO X11, X3
+	MOVQ $20, SI
+rounds_loop1_begin:
+		PADDL X1, X0 // a += b; d ^= a; d = rotate-left-16(d)
+		PXOR X0, X3
+		MOVO X3, X12
+		PSLLL $16, X12
+		PSRLL $16, X3
+		PXOR X12, X3
+		PADDL X3, X2 // c += d; b ^= c; b = rotate-left-12(b)
+		PXOR X2, X1
+		MOVO X1, X12
+		PSLLL $12, X12
+		PSRLL $20, X1
+		PXOR X12, X1
+		PADDL X1, X0 // a += b; d ^= a; d = rotate-left-8(d)
+		PXOR X0, X3
+		MOVO X3, X12
+		PSLLL $8, X12
+		PSRLL $24, X3
+		PXOR X12, X3
+		PADDL X3, X2 // c += d; b ^= c; b = rotate-left-7(b)
+		PXOR X2, X1
+		MOVO X1, X12
+		PSLLL $7, X12
+		PSRLL $25, X1
+		PXOR X12, X1
+		PSHUFL $57, X1, X1 // Shuffle rows b, c, d (0x39, 0x4e, 0x93).
+		PSHUFL $78, X2, X2
+		PSHUFL $147, X3, X3
+		PADDL X1, X0 // Second round of the pair.
+		PXOR X0, X3
+		MOVO X3, X12
+		PSLLL $16, X12
+		PSRLL $16, X3
+		PXOR X12, X3
+		PADDL X3, X2
+		PXOR X2, X1
+		MOVO X1, X12
+		PSLLL $12, X12
+		PSRLL $20, X1
+		PXOR X12, X1
+		PADDL X1, X0
+		PXOR X0, X3
+		MOVO X3, X12
+		PSLLL $8, X12
+		PSRLL $24, X3
+		PXOR X12, X3
+		PADDL X3, X2
+		PXOR X2, X1
+		MOVO X1, X12
+		PSLLL $7, X12
+		PSRLL $25, X1
+		PXOR X12, X1
+		PSHUFL $147, X1, X1 // Inverse shuffle (0x93, 0x4e, 0x39).
+		PSHUFL $78, X2, X2
+		PSHUFL $57, X3, X3
+		SUBQ $2, SI
+		JNE rounds_loop1_begin
+	PADDL X8, X0 // Add the input state, then out = in ^ keystream.
+	PADDL X9, X1
+	PADDL X10, X2
+	PADDL X11, X3
+	MOVOU 0(BX), X12
+	PXOR X0, X12
+	MOVOU X12, 0(CX)
+	MOVOU 16(BX), X12
+	PXOR X1, X12
+	MOVOU X12, 16(CX)
+	MOVOU 32(BX), X12
+	PXOR X2, X12
+	MOVOU X12, 32(CX)
+	MOVOU 48(BX), X12
+	PXOR X3, X12
+	MOVOU X12, 48(CX)
+	PADDQ X13, X11 // Advance the counter row past this block.
+out_serial:
+	MOVOU X11, 48(AX) // Write the advanced counter row back into the state.
+out:
+	PXOR X0, X0
+	MOVO X0, 16(SP) // Wipe the spill slot before releasing the stack.
+	MOVQ DI, SP // Restore the caller's SP.
+	RET
+
+// func blocksAmd64AVX2(x *uint32, inp *uint8, outp *uint8, nrBlocks *uint)
+TEXT ·blocksAmd64AVX2(SB),4,$0-32
+	MOVQ x+0(FP), AX
+	MOVQ inp+8(FP), BX
+	MOVQ outp+16(FP), CX
+	MOVQ nrBlocks+24(FP), DX
+	MOVQ SP, DI
+	ANDQ $18446744073709551584, SP
+	SUBQ $32, SP
+	SUBQ $96, SP
+	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm0, ymm0, ymm0
+	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x04; BYTE $0x24 // VMOVDQU [rsp], ymm0
+	MOVL $1, SI
+	MOVL SI, 16(SP)
+	BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x48; BYTE $0x30 // VBROADCASTI128 ymm1, [rax + 48]
+	BYTE $0xC5; BYTE $0xF5; BYTE $0xD4; BYTE $0x0C; BYTE $0x24 // VPADDQ ymm1, ymm1, [rsp]
+	BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm1
+	MOVL $2, SI
+	MOVL SI, 0(SP)
+	MOVL SI, 16(SP)
+	SUBQ $8, DX
+	JCS vector_loop8_end
+vector_loop8_begin:
+		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x10 // VBROADCASTI128 ymm2, [rax]
+		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x58; BYTE $0x10 // VBROADCASTI128 ymm3, [rax + 16]
+		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x60; BYTE $0x20 // VBROADCASTI128 ymm4, [rax + 32]
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA ymm1, [rsp + 32]
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xEA // VMOVDQA ymm5, ymm2
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF3 // VMOVDQA ymm6, ymm3
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFC // VMOVDQA ymm7, ymm4
+		BYTE $0xC5; BYTE $0x75; BYTE $0xD4; BYTE $0x04; BYTE $0x24 // VPADDQ ymm8, ymm1, [rsp]
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xCA // VMOVDQA ymm9, ymm2
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xD3 // VMOVDQA ymm10, ymm3
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xDC // VMOVDQA ymm11, ymm4
+		BYTE $0xC5; BYTE $0x3D; BYTE $0xD4; BYTE $0x24; BYTE $0x24 // VPADDQ ymm12, ymm8, [rsp]
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xC2 // VMOVDQA ymm0, ymm2
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xEB // VMOVDQA ymm13, ymm3
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xF4 // VMOVDQA ymm14, ymm4
+		BYTE $0xC5; BYTE $0x1D; BYTE $0xD4; BYTE $0x3C; BYTE $0x24 // VPADDQ ymm15, ymm12, [rsp]
+		MOVQ $20, SI
+rounds_loop8_begin:
+			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+			BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9
+			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
+			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16
+			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x10 // VPSLLD ymm0, ymm12, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x10 // VPSRLD ymm12, ymm12, 16
+			BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm15, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm15, ymm15, 16
+			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12
+			BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11
+			BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
+			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12
+			BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x0C // VPSLLD ymm0, ymm10, 12
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x14 // VPSRLD ymm10, ymm10, 20
+			BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm13, 12
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm13, ymm13, 20
+			BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
+			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+			BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9
+			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
+			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24
+			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x08 // VPSLLD ymm0, ymm12, 8
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x18 // VPSRLD ymm12, ymm12, 24
+			BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm15, 8
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm15, ymm15, 24
+			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12
+			BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11
+			BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
+			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7
+			BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x07 // VPSLLD ymm0, ymm10, 7
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x19 // VPSRLD ymm10, ymm10, 25
+			BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm13, 7
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm13, ymm13, 25
+			BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x39 // VPSHUFD ymm6, ymm6, 57
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x39 // VPSHUFD ymm10, ymm10, 57
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xED; BYTE $0x39 // VPSHUFD ymm13, ymm13, 57
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x4E // VPSHUFD ymm11, ymm11, 78
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm14, ymm14, 78
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x93 // VPSHUFD ymm8, ymm8, 147
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xE4; BYTE $0x93 // VPSHUFD ymm12, ymm12, 147
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xFF; BYTE $0x93 // VPSHUFD ymm15, ymm15, 147
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
+			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+			BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9
+			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
+			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16
+			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x10 // VPSLLD ymm0, ymm12, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x10 // VPSRLD ymm12, ymm12, 16
+			BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm15, 16
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm15, ymm15, 16
+			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12
+			BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11
+			BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
+			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12
+			BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x0C // VPSLLD ymm0, ymm10, 12
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x14 // VPSRLD ymm10, ymm10, 20
+			BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm13, 12
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm13, ymm13, 20
+			BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
+			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+			BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+			BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+			BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9
+			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
+			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24
+			BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x08 // VPSLLD ymm0, ymm12, 8
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x18 // VPSRLD ymm12, ymm12, 24
+			BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm15, 8
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm15, ymm15, 24
+			BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+			BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12
+			BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+			BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11
+			BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
+			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7
+			BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25
+			BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x07 // VPSLLD ymm0, ymm10, 7
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x19 // VPSRLD ymm10, ymm10, 25
+			BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm13, 7
+			BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm13, ymm13, 25
+			BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x93 // VPSHUFD ymm6, ymm6, 147
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x93 // VPSHUFD ymm10, ymm10, 147
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xED; BYTE $0x93 // VPSHUFD ymm13, ymm13, 147
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x4E // VPSHUFD ymm11, ymm11, 78
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm14, ymm14, 78
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x39 // VPSHUFD ymm8, ymm8, 57
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xE4; BYTE $0x39 // VPSHUFD ymm12, ymm12, 57
+			BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xFF; BYTE $0x39 // VPSHUFD ymm15, ymm15, 57
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
+			SUBQ $2, SI
+			JNE rounds_loop8_begin
+		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x00 // VBROADCASTI128 ymm0, [rax]
+		BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD0 // VPADDD ymm2, ymm2, ymm0
+		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xE8 // VPADDD ymm5, ymm5, ymm0
+		BYTE $0xC5; BYTE $0x35; BYTE $0xFE; BYTE $0xC8 // VPADDD ymm9, ymm9, ymm0
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xFE; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VPADDD ymm0, ymm0, [rsp + 64]
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
+		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x40; BYTE $0x10 // VBROADCASTI128 ymm0, [rax + 16]
+		BYTE $0xC5; BYTE $0xE5; BYTE $0xFE; BYTE $0xD8 // VPADDD ymm3, ymm3, ymm0
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF0 // VPADDD ymm6, ymm6, ymm0
+		BYTE $0xC5; BYTE $0x2D; BYTE $0xFE; BYTE $0xD0 // VPADDD ymm10, ymm10, ymm0
+		BYTE $0xC5; BYTE $0x15; BYTE $0xFE; BYTE $0xE8 // VPADDD ymm13, ymm13, ymm0
+		BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x40; BYTE $0x20 // VBROADCASTI128 ymm0, [rax + 32]
+		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE0 // VPADDD ymm4, ymm4, ymm0
+		BYTE $0xC5; BYTE $0xC5; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm0
+		BYTE $0xC5; BYTE $0x25; BYTE $0xFE; BYTE $0xD8 // VPADDD ymm11, ymm11, ymm0
+		BYTE $0xC5; BYTE $0x0D; BYTE $0xFE; BYTE $0xF0 // VPADDD ymm14, ymm14, ymm0
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VPADDD ymm1, ymm1, [rsp + 32]
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA ymm1, [rsp + 32]
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp]
+		BYTE $0xC5; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm1
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x20 // VPERM2I128 ymm0, ymm5, ymm6, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 128]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 128], ymm0
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x20 // VPERM2I128 ymm0, ymm7, ymm8, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 160]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 160], ymm0
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x31 // VPERM2I128 ymm0, ymm5, ymm6, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 192]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 192], ymm0
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x31 // VPERM2I128 ymm0, ymm7, ymm8, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 224]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 224], ymm0
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp]
+		BYTE $0xC5; BYTE $0x1D; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm12, ymm12, ymm1
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x35; BYTE $0x46; BYTE $0xC2; BYTE $0x20 // VPERM2I128 ymm0, ymm9, ymm10, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x00; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 256]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x00; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 256], ymm0
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x25; BYTE $0x46; BYTE $0xC4; BYTE $0x20 // VPERM2I128 ymm0, ymm11, ymm12, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x20; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 288]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x20; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 288], ymm0
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x35; BYTE $0x46; BYTE $0xC2; BYTE $0x31 // VPERM2I128 ymm0, ymm9, ymm10, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x40; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 320]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x40; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 320], ymm0
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x25; BYTE $0x46; BYTE $0xC4; BYTE $0x31 // VPERM2I128 ymm0, ymm11, ymm12, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x60; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 352]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x60; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 352], ymm0
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp]
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
+		BYTE $0xC5; BYTE $0x05; BYTE $0xFE; BYTE $0xF9 // VPADDD ymm15, ymm15, ymm1
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x7D; BYTE $0x46; BYTE $0xD5; BYTE $0x20 // VPERM2I128 ymm2, ymm0, ymm13, 32
+		BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0x80; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 384]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0x80; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 384], ymm2
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x0D; BYTE $0x46; BYTE $0xD7; BYTE $0x20 // VPERM2I128 ymm2, ymm14, ymm15, 32
+		BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xA0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 416]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xA0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 416], ymm2
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x7D; BYTE $0x46; BYTE $0xD5; BYTE $0x31 // VPERM2I128 ymm2, ymm0, ymm13, 49
+		BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xC0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 448]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xC0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 448], ymm2
+		BYTE $0xC4; BYTE $0xC3; BYTE $0x0D; BYTE $0x46; BYTE $0xD7; BYTE $0x31 // VPERM2I128 ymm2, ymm14, ymm15, 49
+		BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xE0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 480]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xE0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 480], ymm2
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp]
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm1
+		ADDQ $512, BX
+		ADDQ $512, CX
+		SUBQ $8, DX
+		JCC vector_loop8_begin
+vector_loop8_end:
+	BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xE1 // VMOVDQA ymm12, ymm1
+	ADDQ $8, DX
+	JEQ out_write_even
+	BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x08 // VBROADCASTI128 ymm9, [rax]
+	BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x50; BYTE $0x10 // VBROADCASTI128 ymm10, [rax + 16]
+	BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x58; BYTE $0x20 // VBROADCASTI128 ymm11, [rax + 32]
+	BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0x34; BYTE $0x24 // VMOVDQA ymm14, [rsp]
+	SUBQ $4, DX
+	JCS process_2_blocks
+	BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xCA // VMOVDQA ymm2, ymm9
+	BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xD3 // VMOVDQA ymm3, ymm10
+	BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xDC // VMOVDQA ymm4, ymm11
+	BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE1 // VMOVDQA ymm1, ymm12
+	BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xEA // VMOVDQA ymm5, ymm2
+	BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF3 // VMOVDQA ymm6, ymm3
+	BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFC // VMOVDQA ymm7, ymm4
+	BYTE $0xC4; BYTE $0x41; BYTE $0x75; BYTE $0xD4; BYTE $0xC6 // VPADDQ ymm8, ymm1, ymm14
+	MOVQ $20, SI
+rounds_loop4_begin:
+		BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
+		BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16
+		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
+		BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
+		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12
+		BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+		BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
+		BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24
+		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
+		BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
+		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7
+		BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x39 // VPSHUFD ymm6, ymm6, 57
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147
+		BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x93 // VPSHUFD ymm8, ymm8, 147
+		BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
+		BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16
+		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
+		BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
+		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12
+		BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+		BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+		BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
+		BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
+		BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24
+		BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+		BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
+		BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
+		BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7
+		BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25
+		BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x93 // VPSHUFD ymm6, ymm6, 147
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78
+		BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57
+		BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x39 // VPSHUFD ymm8, ymm8, 57
+		SUBQ $2, SI
+		JNE rounds_loop4_begin
+	BYTE $0xC4; BYTE $0xC1; BYTE $0x6D; BYTE $0xFE; BYTE $0xD1 // VPADDD ymm2, ymm2, ymm9
+	BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDA // VPADDD ymm3, ymm3, ymm10
+	BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE3 // VPADDD ymm4, ymm4, ymm11
+	BYTE $0xC4; BYTE $0xC1; BYTE $0x75; BYTE $0xFE; BYTE $0xCC // VPADDD ymm1, ymm1, ymm12
+	BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32
+	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx]
+	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0
+	BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32
+	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32]
+	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0
+	BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49
+	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64]
+	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0
+	BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49
+	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96]
+	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0
+	BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14
+	BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xFE; BYTE $0xE9 // VPADDD ymm5, ymm5, ymm9
+	BYTE $0xC4; BYTE $0xC1; BYTE $0x4D; BYTE $0xFE; BYTE $0xF2 // VPADDD ymm6, ymm6, ymm10
+	BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xFB // VPADDD ymm7, ymm7, ymm11
+	BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC4 // VPADDD ymm8, ymm8, ymm12
+	BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x20 // VPERM2I128 ymm0, ymm5, ymm6, 32
+	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 128]
+	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 128], ymm0
+	BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x20 // VPERM2I128 ymm0, ymm7, ymm8, 32
+	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 160]
+	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 160], ymm0
+	BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x31 // VPERM2I128 ymm0, ymm5, ymm6, 49
+	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 192]
+	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 192], ymm0
+	BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x31 // VPERM2I128 ymm0, ymm7, ymm8, 49
+	BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 224]
+	BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 224], ymm0
+	BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14
+	ADDQ $256, BX
+	ADDQ $256, CX
+	SUBQ $4, DX
+process_2_blocks:
+	ADDQ $4, DX
+	JEQ out_write_even
+vector_loop2_begin:
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xCA // VMOVDQA ymm2, ymm9
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xD3 // VMOVDQA ymm3, ymm10
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xDC // VMOVDQA ymm4, ymm11
+		BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE1 // VMOVDQA ymm1, ymm12
+		MOVQ $20, SI
+rounds_loop2_begin:
+			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
+			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
+			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
+			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
+			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147
+			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
+			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
+			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+			BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
+			BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
+			BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+			BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
+			BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
+			BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
+			BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57
+			SUBQ $2, SI
+			JNE rounds_loop2_begin
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x6D; BYTE $0xFE; BYTE $0xD1 // VPADDD ymm2, ymm2, ymm9
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDA // VPADDD ymm3, ymm3, ymm10
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE3 // VPADDD ymm4, ymm4, ymm11
+		BYTE $0xC4; BYTE $0xC1; BYTE $0x75; BYTE $0xFE; BYTE $0xCC // VPADDD ymm1, ymm1, ymm12
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0
+		SUBQ $1, DX
+		JEQ out_write_odd
+		BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0
+		BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49
+		BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96]
+		BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0
+		SUBQ $1, DX
+		JEQ out_write_even
+		ADDQ $128, BX
+		ADDQ $128, CX
+		JMP vector_loop2_begin
+out_write_odd:
+	BYTE $0xC4; BYTE $0x43; BYTE $0x1D; BYTE $0x46; BYTE $0xE4; BYTE $0x01 // VPERM2I128 ymm12, ymm12, ymm12, 1
+out_write_even:
+	BYTE $0xC5; BYTE $0x7A; BYTE $0x7F; BYTE $0x60; BYTE $0x30 // VMOVDQU [rax + 48], xmm12
+	BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0xD2 // VPXOR ymm2, ymm2, ymm2
+	BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x54; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm2
+	BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x54; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm2
+	BYTE $0xC5; BYTE $0xFC; BYTE $0x77 // VZEROALL
+	MOVQ DI, SP
+	RET
+
+// func cpuidAmd64(cpuidParams *uint32)
+//
+// cpuidAmd64 executes the CPUID instruction. cpuidParams is treated as four
+// consecutive 32-bit words: on entry word 0 is loaded into EAX and word 2
+// into ECX; on return words 0..3 receive EAX, EBX, ECX and EDX respectively.
+//
+// The TEXT flag value 4 is NOSPLIT (see textflag.h); $0-8 declares no local
+// frame and 8 bytes of arguments (the single pointer).
+TEXT ·cpuidAmd64(SB),4,$0-8
+	MOVQ cpuidParams+0(FP), R15
+	MOVL 0(R15), AX // CPUID leaf (EAX input)
+	MOVL 8(R15), CX // CPUID sub-leaf (ECX input)
+	CPUID
+	MOVL AX, 0(R15)
+	MOVL BX, 4(R15)
+	MOVL CX, 8(R15)
+	MOVL DX, 12(R15)
+	RET
+
+// func xgetbv0Amd64(xcrVec *uint32)
+//
+// xgetbv0Amd64 executes XGETBV with ECX = 0 and stores the result as two
+// 32-bit words at xcrVec: EAX at offset 0 and EDX at offset 4.
+//
+// The TEXT flag value 4 is NOSPLIT (see textflag.h); $0-8 declares no local
+// frame and 8 bytes of arguments (the single pointer).
+TEXT ·xgetbv0Amd64(SB),4,$0-8
+	MOVQ xcrVec+0(FP), BX
+	XORL CX, CX // ECX = 0 selects extended control register 0 (XCR0)
+	BYTE $0x0F; BYTE $0x01; BYTE $0xD0 // XGETBV
+	MOVL AX, 0(BX)
+	MOVL DX, 4(BX)
+	RET

+ 394 - 0
vendor/github.com/Yawning/chacha20/chacha20_ref.go

@@ -0,0 +1,394 @@
+// chacha20_ref.go - Reference ChaCha20.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+// +build !go1.9
+
+package chacha20
+
+import (
+	"encoding/binary"
+	"math"
+	"unsafe"
+)
+
+// blocksRef is the portable reference implementation of the ChaCha20 block
+// function. For each of nrBlocks blocks it runs chachaRounds rounds over a
+// copy of the state x, adds the input state back in, and writes the result
+// to out. When in is non-nil the keystream is XORed with the corresponding
+// block of in; when in is nil the raw keystream is written.
+//
+// The block counter held in x[12] (low word) and x[13] (high word) is
+// advanced by one after every block, so x is modified.
+//
+// When isIetf is true the function panics if x[12] plus nrBlocks would
+// exceed math.MaxUint32.
+//
+// NOTE(review): the useUnsafe path only bounds-checks the first byte of each
+// block before viewing it as 16 words; callers are presumably expected to
+// supply at least nrBlocks*BlockSize bytes in out (and in, if non-nil) —
+// confirm against the callers.
+func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) {
+	if isIetf {
+		var totalBlocks uint64
+		totalBlocks = uint64(x[12]) + uint64(nrBlocks)
+		if totalBlocks > math.MaxUint32 {
+			panic("chacha20: Exceeded keystream per nonce limit")
+		}
+	}
+
+	// This routine ignores x[0]...x[3] in favor of the const values since
+	// it's ever so slightly faster.
+
+	for n := 0; n < nrBlocks; n++ {
+		x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+
+		// Each iteration is a double round: four column quarter-rounds
+		// followed by four diagonal quarter-rounds, hence the step of 2.
+		for i := chachaRounds; i > 0; i -= 2 {
+			// quarterround(x, 0, 4, 8, 12)
+			x0 += x4
+			x12 ^= x0
+			x12 = (x12 << 16) | (x12 >> 16)
+			x8 += x12
+			x4 ^= x8
+			x4 = (x4 << 12) | (x4 >> 20)
+			x0 += x4
+			x12 ^= x0
+			x12 = (x12 << 8) | (x12 >> 24)
+			x8 += x12
+			x4 ^= x8
+			x4 = (x4 << 7) | (x4 >> 25)
+
+			// quarterround(x, 1, 5, 9, 13)
+			x1 += x5
+			x13 ^= x1
+			x13 = (x13 << 16) | (x13 >> 16)
+			x9 += x13
+			x5 ^= x9
+			x5 = (x5 << 12) | (x5 >> 20)
+			x1 += x5
+			x13 ^= x1
+			x13 = (x13 << 8) | (x13 >> 24)
+			x9 += x13
+			x5 ^= x9
+			x5 = (x5 << 7) | (x5 >> 25)
+
+			// quarterround(x, 2, 6, 10, 14)
+			x2 += x6
+			x14 ^= x2
+			x14 = (x14 << 16) | (x14 >> 16)
+			x10 += x14
+			x6 ^= x10
+			x6 = (x6 << 12) | (x6 >> 20)
+			x2 += x6
+			x14 ^= x2
+			x14 = (x14 << 8) | (x14 >> 24)
+			x10 += x14
+			x6 ^= x10
+			x6 = (x6 << 7) | (x6 >> 25)
+
+			// quarterround(x, 3, 7, 11, 15)
+			x3 += x7
+			x15 ^= x3
+			x15 = (x15 << 16) | (x15 >> 16)
+			x11 += x15
+			x7 ^= x11
+			x7 = (x7 << 12) | (x7 >> 20)
+			x3 += x7
+			x15 ^= x3
+			x15 = (x15 << 8) | (x15 >> 24)
+			x11 += x15
+			x7 ^= x11
+			x7 = (x7 << 7) | (x7 >> 25)
+
+			// quarterround(x, 0, 5, 10, 15)
+			x0 += x5
+			x15 ^= x0
+			x15 = (x15 << 16) | (x15 >> 16)
+			x10 += x15
+			x5 ^= x10
+			x5 = (x5 << 12) | (x5 >> 20)
+			x0 += x5
+			x15 ^= x0
+			x15 = (x15 << 8) | (x15 >> 24)
+			x10 += x15
+			x5 ^= x10
+			x5 = (x5 << 7) | (x5 >> 25)
+
+			// quarterround(x, 1, 6, 11, 12)
+			x1 += x6
+			x12 ^= x1
+			x12 = (x12 << 16) | (x12 >> 16)
+			x11 += x12
+			x6 ^= x11
+			x6 = (x6 << 12) | (x6 >> 20)
+			x1 += x6
+			x12 ^= x1
+			x12 = (x12 << 8) | (x12 >> 24)
+			x11 += x12
+			x6 ^= x11
+			x6 = (x6 << 7) | (x6 >> 25)
+
+			// quarterround(x, 2, 7, 8, 13)
+			x2 += x7
+			x13 ^= x2
+			x13 = (x13 << 16) | (x13 >> 16)
+			x8 += x13
+			x7 ^= x8
+			x7 = (x7 << 12) | (x7 >> 20)
+			x2 += x7
+			x13 ^= x2
+			x13 = (x13 << 8) | (x13 >> 24)
+			x8 += x13
+			x7 ^= x8
+			x7 = (x7 << 7) | (x7 >> 25)
+
+			// quarterround(x, 3, 4, 9, 14)
+			x3 += x4
+			x14 ^= x3
+			x14 = (x14 << 16) | (x14 >> 16)
+			x9 += x14
+			x4 ^= x9
+			x4 = (x4 << 12) | (x4 >> 20)
+			x3 += x4
+			x14 ^= x3
+			x14 = (x14 << 8) | (x14 >> 24)
+			x9 += x14
+			x4 ^= x9
+			x4 = (x4 << 7) | (x4 >> 25)
+		}
+
+		// On amd64 at least, this is a rather big boost.
+		// This path indexes in/out by n*BlockSize and leaves the slices
+		// themselves untouched.
+		if useUnsafe {
+			if in != nil {
+				inArr := (*[16]uint32)(unsafe.Pointer(&in[n*BlockSize]))
+				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
+				outArr[0] = inArr[0] ^ (x0 + sigma0)
+				outArr[1] = inArr[1] ^ (x1 + sigma1)
+				outArr[2] = inArr[2] ^ (x2 + sigma2)
+				outArr[3] = inArr[3] ^ (x3 + sigma3)
+				outArr[4] = inArr[4] ^ (x4 + x[4])
+				outArr[5] = inArr[5] ^ (x5 + x[5])
+				outArr[6] = inArr[6] ^ (x6 + x[6])
+				outArr[7] = inArr[7] ^ (x7 + x[7])
+				outArr[8] = inArr[8] ^ (x8 + x[8])
+				outArr[9] = inArr[9] ^ (x9 + x[9])
+				outArr[10] = inArr[10] ^ (x10 + x[10])
+				outArr[11] = inArr[11] ^ (x11 + x[11])
+				outArr[12] = inArr[12] ^ (x12 + x[12])
+				outArr[13] = inArr[13] ^ (x13 + x[13])
+				outArr[14] = inArr[14] ^ (x14 + x[14])
+				outArr[15] = inArr[15] ^ (x15 + x[15])
+			} else {
+				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
+				outArr[0] = x0 + sigma0
+				outArr[1] = x1 + sigma1
+				outArr[2] = x2 + sigma2
+				outArr[3] = x3 + sigma3
+				outArr[4] = x4 + x[4]
+				outArr[5] = x5 + x[5]
+				outArr[6] = x6 + x[6]
+				outArr[7] = x7 + x[7]
+				outArr[8] = x8 + x[8]
+				outArr[9] = x9 + x[9]
+				outArr[10] = x10 + x[10]
+				outArr[11] = x11 + x[11]
+				outArr[12] = x12 + x[12]
+				outArr[13] = x13 + x[13]
+				outArr[14] = x14 + x[14]
+				outArr[15] = x15 + x[15]
+			}
+		} else {
+			// Slow path, either the architecture cares about alignment, or is not little endian.
+			// Unlike the unsafe path, this one always writes at offset 0 and
+			// advances the in/out slices by one block per iteration.
+			x0 += sigma0
+			x1 += sigma1
+			x2 += sigma2
+			x3 += sigma3
+			x4 += x[4]
+			x5 += x[5]
+			x6 += x[6]
+			x7 += x[7]
+			x8 += x[8]
+			x9 += x[9]
+			x10 += x[10]
+			x11 += x[11]
+			x12 += x[12]
+			x13 += x[13]
+			x14 += x[14]
+			x15 += x[15]
+			if in != nil {
+				binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^x0)
+				binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^x1)
+				binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^x2)
+				binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^x3)
+				binary.LittleEndian.PutUint32(out[16:20], binary.LittleEndian.Uint32(in[16:20])^x4)
+				binary.LittleEndian.PutUint32(out[20:24], binary.LittleEndian.Uint32(in[20:24])^x5)
+				binary.LittleEndian.PutUint32(out[24:28], binary.LittleEndian.Uint32(in[24:28])^x6)
+				binary.LittleEndian.PutUint32(out[28:32], binary.LittleEndian.Uint32(in[28:32])^x7)
+				binary.LittleEndian.PutUint32(out[32:36], binary.LittleEndian.Uint32(in[32:36])^x8)
+				binary.LittleEndian.PutUint32(out[36:40], binary.LittleEndian.Uint32(in[36:40])^x9)
+				binary.LittleEndian.PutUint32(out[40:44], binary.LittleEndian.Uint32(in[40:44])^x10)
+				binary.LittleEndian.PutUint32(out[44:48], binary.LittleEndian.Uint32(in[44:48])^x11)
+				binary.LittleEndian.PutUint32(out[48:52], binary.LittleEndian.Uint32(in[48:52])^x12)
+				binary.LittleEndian.PutUint32(out[52:56], binary.LittleEndian.Uint32(in[52:56])^x13)
+				binary.LittleEndian.PutUint32(out[56:60], binary.LittleEndian.Uint32(in[56:60])^x14)
+				binary.LittleEndian.PutUint32(out[60:64], binary.LittleEndian.Uint32(in[60:64])^x15)
+				in = in[BlockSize:]
+			} else {
+				binary.LittleEndian.PutUint32(out[0:4], x0)
+				binary.LittleEndian.PutUint32(out[4:8], x1)
+				binary.LittleEndian.PutUint32(out[8:12], x2)
+				binary.LittleEndian.PutUint32(out[12:16], x3)
+				binary.LittleEndian.PutUint32(out[16:20], x4)
+				binary.LittleEndian.PutUint32(out[20:24], x5)
+				binary.LittleEndian.PutUint32(out[24:28], x6)
+				binary.LittleEndian.PutUint32(out[28:32], x7)
+				binary.LittleEndian.PutUint32(out[32:36], x8)
+				binary.LittleEndian.PutUint32(out[36:40], x9)
+				binary.LittleEndian.PutUint32(out[40:44], x10)
+				binary.LittleEndian.PutUint32(out[44:48], x11)
+				binary.LittleEndian.PutUint32(out[48:52], x12)
+				binary.LittleEndian.PutUint32(out[52:56], x13)
+				binary.LittleEndian.PutUint32(out[56:60], x14)
+				binary.LittleEndian.PutUint32(out[60:64], x15)
+			}
+			out = out[BlockSize:]
+		}
+
+		// Stopping at 2^70 bytes per nonce is the user's responsibility.
+		// x[12] and x[13] are treated as one 64-bit little-endian counter.
+		ctr := uint64(x[13])<<32 | uint64(x[12])
+		ctr++
+		x[12] = uint32(ctr)
+		x[13] = uint32(ctr >> 32)
+	}
+}
+
+// hChaChaRef is the portable reference implementation of HChaCha. It builds
+// a 16-word working state from the sigma constants (words 0..3) and
+// x[0]...x[11] (words 4..15), runs chachaRounds rounds over it, and writes
+// words 0..3 followed by words 12..15 of the result to out. Unlike the block
+// function there is no feed-forward addition of the input state, and x is
+// not modified.
+//
+// NOTE(review): x[0]...x[7] are presumably the key and x[8]...x[11] the
+// nonce prefix — confirm against the caller.
+func hChaChaRef(x *[stateSize]uint32, out *[32]byte) {
+	x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+	x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11]
+
+	// Each iteration is a double round: four column quarter-rounds followed
+	// by four diagonal quarter-rounds, hence the step of 2.
+	for i := chachaRounds; i > 0; i -= 2 {
+		// quarterround(x, 0, 4, 8, 12)
+		x0 += x4
+		x12 ^= x0
+		x12 = (x12 << 16) | (x12 >> 16)
+		x8 += x12
+		x4 ^= x8
+		x4 = (x4 << 12) | (x4 >> 20)
+		x0 += x4
+		x12 ^= x0
+		x12 = (x12 << 8) | (x12 >> 24)
+		x8 += x12
+		x4 ^= x8
+		x4 = (x4 << 7) | (x4 >> 25)
+
+		// quarterround(x, 1, 5, 9, 13)
+		x1 += x5
+		x13 ^= x1
+		x13 = (x13 << 16) | (x13 >> 16)
+		x9 += x13
+		x5 ^= x9
+		x5 = (x5 << 12) | (x5 >> 20)
+		x1 += x5
+		x13 ^= x1
+		x13 = (x13 << 8) | (x13 >> 24)
+		x9 += x13
+		x5 ^= x9
+		x5 = (x5 << 7) | (x5 >> 25)
+
+		// quarterround(x, 2, 6, 10, 14)
+		x2 += x6
+		x14 ^= x2
+		x14 = (x14 << 16) | (x14 >> 16)
+		x10 += x14
+		x6 ^= x10
+		x6 = (x6 << 12) | (x6 >> 20)
+		x2 += x6
+		x14 ^= x2
+		x14 = (x14 << 8) | (x14 >> 24)
+		x10 += x14
+		x6 ^= x10
+		x6 = (x6 << 7) | (x6 >> 25)
+
+		// quarterround(x, 3, 7, 11, 15)
+		x3 += x7
+		x15 ^= x3
+		x15 = (x15 << 16) | (x15 >> 16)
+		x11 += x15
+		x7 ^= x11
+		x7 = (x7 << 12) | (x7 >> 20)
+		x3 += x7
+		x15 ^= x3
+		x15 = (x15 << 8) | (x15 >> 24)
+		x11 += x15
+		x7 ^= x11
+		x7 = (x7 << 7) | (x7 >> 25)
+
+		// quarterround(x, 0, 5, 10, 15)
+		x0 += x5
+		x15 ^= x0
+		x15 = (x15 << 16) | (x15 >> 16)
+		x10 += x15
+		x5 ^= x10
+		x5 = (x5 << 12) | (x5 >> 20)
+		x0 += x5
+		x15 ^= x0
+		x15 = (x15 << 8) | (x15 >> 24)
+		x10 += x15
+		x5 ^= x10
+		x5 = (x5 << 7) | (x5 >> 25)
+
+		// quarterround(x, 1, 6, 11, 12)
+		x1 += x6
+		x12 ^= x1
+		x12 = (x12 << 16) | (x12 >> 16)
+		x11 += x12
+		x6 ^= x11
+		x6 = (x6 << 12) | (x6 >> 20)
+		x1 += x6
+		x12 ^= x1
+		x12 = (x12 << 8) | (x12 >> 24)
+		x11 += x12
+		x6 ^= x11
+		x6 = (x6 << 7) | (x6 >> 25)
+
+		// quarterround(x, 2, 7, 8, 13)
+		x2 += x7
+		x13 ^= x2
+		x13 = (x13 << 16) | (x13 >> 16)
+		x8 += x13
+		x7 ^= x8
+		x7 = (x7 << 12) | (x7 >> 20)
+		x2 += x7
+		x13 ^= x2
+		x13 = (x13 << 8) | (x13 >> 24)
+		x8 += x13
+		x7 ^= x8
+		x7 = (x7 << 7) | (x7 >> 25)
+
+		// quarterround(x, 3, 4, 9, 14)
+		x3 += x4
+		x14 ^= x3
+		x14 = (x14 << 16) | (x14 >> 16)
+		x9 += x14
+		x4 ^= x9
+		x4 = (x4 << 12) | (x4 >> 20)
+		x3 += x4
+		x14 ^= x3
+		x14 = (x14 << 8) | (x14 >> 24)
+		x9 += x14
+		x4 ^= x9
+		x4 = (x4 << 7) | (x4 >> 25)
+	}
+
+	// HChaCha returns x0...x3 | x12...x15, which corresponds to the
+	// indexes of the ChaCha constant and the indexes of the IV.
+	if useUnsafe {
+		// View out as exactly eight words (32 bytes). The previous
+		// *[16]uint32 view spanned 64 bytes, twice the size of the object
+		// behind out, which breaks the unsafe.Pointer rule that the target
+		// type must be no larger than the source and can trip checkptr.
+		outArr := (*[8]uint32)(unsafe.Pointer(&out[0]))
+		outArr[0] = x0
+		outArr[1] = x1
+		outArr[2] = x2
+		outArr[3] = x3
+		outArr[4] = x12
+		outArr[5] = x13
+		outArr[6] = x14
+		outArr[7] = x15
+	} else {
+		binary.LittleEndian.PutUint32(out[0:4], x0)
+		binary.LittleEndian.PutUint32(out[4:8], x1)
+		binary.LittleEndian.PutUint32(out[8:12], x2)
+		binary.LittleEndian.PutUint32(out[12:16], x3)
+		binary.LittleEndian.PutUint32(out[16:20], x12)
+		binary.LittleEndian.PutUint32(out[20:24], x13)
+		binary.LittleEndian.PutUint32(out[24:28], x14)
+		binary.LittleEndian.PutUint32(out[28:32], x15)
+	}
+}

+ 395 - 0
vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go

@@ -0,0 +1,395 @@
+// chacha20_ref_go19.go - Reference ChaCha20 (Go 1.9 and later).
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// <http://creativecommons.org/publicdomain/zero/1.0/> for full details.
+
+// +build go1.9
+
+package chacha20
+
+import (
+	"encoding/binary"
+	"math"
+	"math/bits"
+	"unsafe"
+)
+
+// blocksRef is the portable reference implementation of the ChaCha20 block
+// function. For each of nrBlocks blocks it runs chachaRounds rounds over a
+// copy of the state x, adds the input state back in, and writes the result
+// to out. When in is non-nil the keystream is XORed with the corresponding
+// block of in; when in is nil the raw keystream is written.
+//
+// The block counter held in x[12] (low word) and x[13] (high word) is
+// advanced by one after every block, so x is modified.
+//
+// When isIetf is true the function panics if x[12] plus nrBlocks would
+// exceed math.MaxUint32.
+//
+// NOTE(review): the useUnsafe path only bounds-checks the first byte of each
+// block before viewing it as 16 words; callers are presumably expected to
+// supply at least nrBlocks*BlockSize bytes in out (and in, if non-nil) —
+// confirm against the callers.
+func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) {
+	if isIetf {
+		var totalBlocks uint64
+		totalBlocks = uint64(x[12]) + uint64(nrBlocks)
+		if totalBlocks > math.MaxUint32 {
+			panic("chacha20: Exceeded keystream per nonce limit")
+		}
+	}
+
+	// This routine ignores x[0]...x[3] in favor of the const values since
+	// it's ever so slightly faster.
+
+	for n := 0; n < nrBlocks; n++ {
+		x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+		x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+
+		// Each iteration is a double round: four column quarter-rounds
+		// followed by four diagonal quarter-rounds, hence the step of 2.
+		for i := chachaRounds; i > 0; i -= 2 {
+			// quarterround(x, 0, 4, 8, 12)
+			x0 += x4
+			x12 ^= x0
+			x12 = bits.RotateLeft32(x12, 16)
+			x8 += x12
+			x4 ^= x8
+			x4 = bits.RotateLeft32(x4, 12)
+			x0 += x4
+			x12 ^= x0
+			x12 = bits.RotateLeft32(x12, 8)
+			x8 += x12
+			x4 ^= x8
+			x4 = bits.RotateLeft32(x4, 7)
+
+			// quarterround(x, 1, 5, 9, 13)
+			x1 += x5
+			x13 ^= x1
+			x13 = bits.RotateLeft32(x13, 16)
+			x9 += x13
+			x5 ^= x9
+			x5 = bits.RotateLeft32(x5, 12)
+			x1 += x5
+			x13 ^= x1
+			x13 = bits.RotateLeft32(x13, 8)
+			x9 += x13
+			x5 ^= x9
+			x5 = bits.RotateLeft32(x5, 7)
+
+			// quarterround(x, 2, 6, 10, 14)
+			x2 += x6
+			x14 ^= x2
+			x14 = bits.RotateLeft32(x14, 16)
+			x10 += x14
+			x6 ^= x10
+			x6 = bits.RotateLeft32(x6, 12)
+			x2 += x6
+			x14 ^= x2
+			x14 = bits.RotateLeft32(x14, 8)
+			x10 += x14
+			x6 ^= x10
+			x6 = bits.RotateLeft32(x6, 7)
+
+			// quarterround(x, 3, 7, 11, 15)
+			x3 += x7
+			x15 ^= x3
+			x15 = bits.RotateLeft32(x15, 16)
+			x11 += x15
+			x7 ^= x11
+			x7 = bits.RotateLeft32(x7, 12)
+			x3 += x7
+			x15 ^= x3
+			x15 = bits.RotateLeft32(x15, 8)
+			x11 += x15
+			x7 ^= x11
+			x7 = bits.RotateLeft32(x7, 7)
+
+			// quarterround(x, 0, 5, 10, 15)
+			x0 += x5
+			x15 ^= x0
+			x15 = bits.RotateLeft32(x15, 16)
+			x10 += x15
+			x5 ^= x10
+			x5 = bits.RotateLeft32(x5, 12)
+			x0 += x5
+			x15 ^= x0
+			x15 = bits.RotateLeft32(x15, 8)
+			x10 += x15
+			x5 ^= x10
+			x5 = bits.RotateLeft32(x5, 7)
+
+			// quarterround(x, 1, 6, 11, 12)
+			x1 += x6
+			x12 ^= x1
+			x12 = bits.RotateLeft32(x12, 16)
+			x11 += x12
+			x6 ^= x11
+			x6 = bits.RotateLeft32(x6, 12)
+			x1 += x6
+			x12 ^= x1
+			x12 = bits.RotateLeft32(x12, 8)
+			x11 += x12
+			x6 ^= x11
+			x6 = bits.RotateLeft32(x6, 7)
+
+			// quarterround(x, 2, 7, 8, 13)
+			x2 += x7
+			x13 ^= x2
+			x13 = bits.RotateLeft32(x13, 16)
+			x8 += x13
+			x7 ^= x8
+			x7 = bits.RotateLeft32(x7, 12)
+			x2 += x7
+			x13 ^= x2
+			x13 = bits.RotateLeft32(x13, 8)
+			x8 += x13
+			x7 ^= x8
+			x7 = bits.RotateLeft32(x7, 7)
+
+			// quarterround(x, 3, 4, 9, 14)
+			x3 += x4
+			x14 ^= x3
+			x14 = bits.RotateLeft32(x14, 16)
+			x9 += x14
+			x4 ^= x9
+			x4 = bits.RotateLeft32(x4, 12)
+			x3 += x4
+			x14 ^= x3
+			x14 = bits.RotateLeft32(x14, 8)
+			x9 += x14
+			x4 ^= x9
+			x4 = bits.RotateLeft32(x4, 7)
+		}
+
+		// On amd64 at least, this is a rather big boost.
+		// This path indexes in/out by n*BlockSize and leaves the slices
+		// themselves untouched.
+		if useUnsafe {
+			if in != nil {
+				inArr := (*[16]uint32)(unsafe.Pointer(&in[n*BlockSize]))
+				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
+				outArr[0] = inArr[0] ^ (x0 + sigma0)
+				outArr[1] = inArr[1] ^ (x1 + sigma1)
+				outArr[2] = inArr[2] ^ (x2 + sigma2)
+				outArr[3] = inArr[3] ^ (x3 + sigma3)
+				outArr[4] = inArr[4] ^ (x4 + x[4])
+				outArr[5] = inArr[5] ^ (x5 + x[5])
+				outArr[6] = inArr[6] ^ (x6 + x[6])
+				outArr[7] = inArr[7] ^ (x7 + x[7])
+				outArr[8] = inArr[8] ^ (x8 + x[8])
+				outArr[9] = inArr[9] ^ (x9 + x[9])
+				outArr[10] = inArr[10] ^ (x10 + x[10])
+				outArr[11] = inArr[11] ^ (x11 + x[11])
+				outArr[12] = inArr[12] ^ (x12 + x[12])
+				outArr[13] = inArr[13] ^ (x13 + x[13])
+				outArr[14] = inArr[14] ^ (x14 + x[14])
+				outArr[15] = inArr[15] ^ (x15 + x[15])
+			} else {
+				outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
+				outArr[0] = x0 + sigma0
+				outArr[1] = x1 + sigma1
+				outArr[2] = x2 + sigma2
+				outArr[3] = x3 + sigma3
+				outArr[4] = x4 + x[4]
+				outArr[5] = x5 + x[5]
+				outArr[6] = x6 + x[6]
+				outArr[7] = x7 + x[7]
+				outArr[8] = x8 + x[8]
+				outArr[9] = x9 + x[9]
+				outArr[10] = x10 + x[10]
+				outArr[11] = x11 + x[11]
+				outArr[12] = x12 + x[12]
+				outArr[13] = x13 + x[13]
+				outArr[14] = x14 + x[14]
+				outArr[15] = x15 + x[15]
+			}
+		} else {
+			// Slow path, either the architecture cares about alignment, or is not little endian.
+			// Unlike the unsafe path, this one always writes at offset 0 and
+			// advances the in/out slices by one block per iteration.
+			x0 += sigma0
+			x1 += sigma1
+			x2 += sigma2
+			x3 += sigma3
+			x4 += x[4]
+			x5 += x[5]
+			x6 += x[6]
+			x7 += x[7]
+			x8 += x[8]
+			x9 += x[9]
+			x10 += x[10]
+			x11 += x[11]
+			x12 += x[12]
+			x13 += x[13]
+			x14 += x[14]
+			x15 += x[15]
+			if in != nil {
+				binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^x0)
+				binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^x1)
+				binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^x2)
+				binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^x3)
+				binary.LittleEndian.PutUint32(out[16:20], binary.LittleEndian.Uint32(in[16:20])^x4)
+				binary.LittleEndian.PutUint32(out[20:24], binary.LittleEndian.Uint32(in[20:24])^x5)
+				binary.LittleEndian.PutUint32(out[24:28], binary.LittleEndian.Uint32(in[24:28])^x6)
+				binary.LittleEndian.PutUint32(out[28:32], binary.LittleEndian.Uint32(in[28:32])^x7)
+				binary.LittleEndian.PutUint32(out[32:36], binary.LittleEndian.Uint32(in[32:36])^x8)
+				binary.LittleEndian.PutUint32(out[36:40], binary.LittleEndian.Uint32(in[36:40])^x9)
+				binary.LittleEndian.PutUint32(out[40:44], binary.LittleEndian.Uint32(in[40:44])^x10)
+				binary.LittleEndian.PutUint32(out[44:48], binary.LittleEndian.Uint32(in[44:48])^x11)
+				binary.LittleEndian.PutUint32(out[48:52], binary.LittleEndian.Uint32(in[48:52])^x12)
+				binary.LittleEndian.PutUint32(out[52:56], binary.LittleEndian.Uint32(in[52:56])^x13)
+				binary.LittleEndian.PutUint32(out[56:60], binary.LittleEndian.Uint32(in[56:60])^x14)
+				binary.LittleEndian.PutUint32(out[60:64], binary.LittleEndian.Uint32(in[60:64])^x15)
+				in = in[BlockSize:]
+			} else {
+				binary.LittleEndian.PutUint32(out[0:4], x0)
+				binary.LittleEndian.PutUint32(out[4:8], x1)
+				binary.LittleEndian.PutUint32(out[8:12], x2)
+				binary.LittleEndian.PutUint32(out[12:16], x3)
+				binary.LittleEndian.PutUint32(out[16:20], x4)
+				binary.LittleEndian.PutUint32(out[20:24], x5)
+				binary.LittleEndian.PutUint32(out[24:28], x6)
+				binary.LittleEndian.PutUint32(out[28:32], x7)
+				binary.LittleEndian.PutUint32(out[32:36], x8)
+				binary.LittleEndian.PutUint32(out[36:40], x9)
+				binary.LittleEndian.PutUint32(out[40:44], x10)
+				binary.LittleEndian.PutUint32(out[44:48], x11)
+				binary.LittleEndian.PutUint32(out[48:52], x12)
+				binary.LittleEndian.PutUint32(out[52:56], x13)
+				binary.LittleEndian.PutUint32(out[56:60], x14)
+				binary.LittleEndian.PutUint32(out[60:64], x15)
+			}
+			out = out[BlockSize:]
+		}
+
+		// Stopping at 2^70 bytes per nonce is the user's responsibility.
+		// x[12] and x[13] are treated as one 64-bit little-endian counter.
+		ctr := uint64(x[13])<<32 | uint64(x[12])
+		ctr++
+		x[12] = uint32(ctr)
+		x[13] = uint32(ctr >> 32)
+	}
+}
+
+// hChaChaRef is the portable reference implementation of HChaCha. It builds
+// a 16-word working state from the sigma constants (words 0..3) and
+// x[0]...x[11] (words 4..15), runs chachaRounds rounds over it, and writes
+// words 0..3 followed by words 12..15 of the result to out. Unlike the block
+// function there is no feed-forward addition of the input state, and x is
+// not modified.
+//
+// NOTE(review): x[0]...x[7] are presumably the key and x[8]...x[11] the
+// nonce prefix — confirm against the caller.
+func hChaChaRef(x *[stateSize]uint32, out *[32]byte) {
+	x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+	x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11]
+
+	// Each iteration is a double round: four column quarter-rounds followed
+	// by four diagonal quarter-rounds, hence the step of 2.
+	for i := chachaRounds; i > 0; i -= 2 {
+		// quarterround(x, 0, 4, 8, 12)
+		x0 += x4
+		x12 ^= x0
+		x12 = bits.RotateLeft32(x12, 16)
+		x8 += x12
+		x4 ^= x8
+		x4 = bits.RotateLeft32(x4, 12)
+		x0 += x4
+		x12 ^= x0
+		x12 = bits.RotateLeft32(x12, 8)
+		x8 += x12
+		x4 ^= x8
+		x4 = bits.RotateLeft32(x4, 7)
+
+		// quarterround(x, 1, 5, 9, 13)
+		x1 += x5
+		x13 ^= x1
+		x13 = bits.RotateLeft32(x13, 16)
+		x9 += x13
+		x5 ^= x9
+		x5 = bits.RotateLeft32(x5, 12)
+		x1 += x5
+		x13 ^= x1
+		x13 = bits.RotateLeft32(x13, 8)
+		x9 += x13
+		x5 ^= x9
+		x5 = bits.RotateLeft32(x5, 7)
+
+		// quarterround(x, 2, 6, 10, 14)
+		x2 += x6
+		x14 ^= x2
+		x14 = bits.RotateLeft32(x14, 16)
+		x10 += x14
+		x6 ^= x10
+		x6 = bits.RotateLeft32(x6, 12)
+		x2 += x6
+		x14 ^= x2
+		x14 = bits.RotateLeft32(x14, 8)
+		x10 += x14
+		x6 ^= x10
+		x6 = bits.RotateLeft32(x6, 7)
+
+		// quarterround(x, 3, 7, 11, 15)
+		x3 += x7
+		x15 ^= x3
+		x15 = bits.RotateLeft32(x15, 16)
+		x11 += x15
+		x7 ^= x11
+		x7 = bits.RotateLeft32(x7, 12)
+		x3 += x7
+		x15 ^= x3
+		x15 = bits.RotateLeft32(x15, 8)
+		x11 += x15
+		x7 ^= x11
+		x7 = bits.RotateLeft32(x7, 7)
+
+		// quarterround(x, 0, 5, 10, 15)
+		x0 += x5
+		x15 ^= x0
+		x15 = bits.RotateLeft32(x15, 16)
+		x10 += x15
+		x5 ^= x10
+		x5 = bits.RotateLeft32(x5, 12)
+		x0 += x5
+		x15 ^= x0
+		x15 = bits.RotateLeft32(x15, 8)
+		x10 += x15
+		x5 ^= x10
+		x5 = bits.RotateLeft32(x5, 7)
+
+		// quarterround(x, 1, 6, 11, 12)
+		x1 += x6
+		x12 ^= x1
+		x12 = bits.RotateLeft32(x12, 16)
+		x11 += x12
+		x6 ^= x11
+		x6 = bits.RotateLeft32(x6, 12)
+		x1 += x6
+		x12 ^= x1
+		x12 = bits.RotateLeft32(x12, 8)
+		x11 += x12
+		x6 ^= x11
+		x6 = bits.RotateLeft32(x6, 7)
+
+		// quarterround(x, 2, 7, 8, 13)
+		x2 += x7
+		x13 ^= x2
+		x13 = bits.RotateLeft32(x13, 16)
+		x8 += x13
+		x7 ^= x8
+		x7 = bits.RotateLeft32(x7, 12)
+		x2 += x7
+		x13 ^= x2
+		x13 = bits.RotateLeft32(x13, 8)
+		x8 += x13
+		x7 ^= x8
+		x7 = bits.RotateLeft32(x7, 7)
+
+		// quarterround(x, 3, 4, 9, 14)
+		x3 += x4
+		x14 ^= x3
+		x14 = bits.RotateLeft32(x14, 16)
+		x9 += x14
+		x4 ^= x9
+		x4 = bits.RotateLeft32(x4, 12)
+		x3 += x4
+		x14 ^= x3
+		x14 = bits.RotateLeft32(x14, 8)
+		x9 += x14
+		x4 ^= x9
+		x4 = bits.RotateLeft32(x4, 7)
+	}
+
+	// HChaCha returns x0...x3 | x12...x15, which corresponds to the
+	// indexes of the ChaCha constant and the indexes of the IV.
+	if useUnsafe {
+		// View out as exactly eight words (32 bytes). The previous
+		// *[16]uint32 view spanned 64 bytes, twice the size of the object
+		// behind out, which breaks the unsafe.Pointer rule that the target
+		// type must be no larger than the source and can trip checkptr.
+		outArr := (*[8]uint32)(unsafe.Pointer(&out[0]))
+		outArr[0] = x0
+		outArr[1] = x1
+		outArr[2] = x2
+		outArr[3] = x3
+		outArr[4] = x12
+		outArr[5] = x13
+		outArr[6] = x14
+		outArr[7] = x15
+	} else {
+		binary.LittleEndian.PutUint32(out[0:4], x0)
+		binary.LittleEndian.PutUint32(out[4:8], x1)
+		binary.LittleEndian.PutUint32(out[8:12], x2)
+		binary.LittleEndian.PutUint32(out[12:16], x3)
+		binary.LittleEndian.PutUint32(out[16:20], x12)
+		binary.LittleEndian.PutUint32(out[20:24], x13)
+		binary.LittleEndian.PutUint32(out[24:28], x14)
+		binary.LittleEndian.PutUint32(out[28:32], x15)
+	}
+}

+ 9 - 3
vendor/vendor.json

@@ -63,10 +63,10 @@
 			"revisionTime": "2018-09-12T16:47:43Z"
 		},
 		{
-			"checksumSHA1": "yibR+itWrPd8QJO4soBScqrbxzg=",
+			"checksumSHA1": "tPhzygqO+Lk+oY/Gp4jMq7YGbhU=",
 			"path": "github.com/Psiphon-Labs/tls-tris",
-			"revision": "5165552b556552cfd96b918a8121934a7d8d0a66",
-			"revisionTime": "2018-09-15T13:40:56Z"
+			"revision": "a7c2751b906303d32dff10bb728a1d0f124e56f4",
+			"revisionTime": "2018-09-29T21:11:36Z"
 		},
 		{
 			"checksumSHA1": "OBN3dfn0yx9L3I2RPo58o27my2k=",
@@ -80,6 +80,12 @@
 			"revision": "1f81de88145c342aad771f4f630012618faaffa7",
 			"revisionTime": "2018-04-25T19:07:11Z"
 		},
+		{
+			"checksumSHA1": "iZ3cpzMPHGeYQWcSLweeYdPqPiw=",
+			"path": "github.com/Yawning/chacha20",
+			"revision": "e3b1f968fc6397b51d963fee8ec8711a47bc0ce8",
+			"revisionTime": "2017-09-04T08:51:04Z"
+		},
 		{
 			"checksumSHA1": "30PBqj9BW03KCVqASvLg3bR+xYc=",
 			"path": "github.com/agl/ed25519/edwards25519",