Browse Source

Update filippo.io/edwards25519

Rod Hynes 1 month ago
parent
commit
01356740fd

+ 1 - 1
go.mod

@@ -32,7 +32,7 @@ replace github.com/pion/ice/v2 => ./replace/ice
 replace github.com/pion/webrtc/v3 => ./replace/webrtc
 
 require (
-	filippo.io/edwards25519 v1.1.0
+	filippo.io/edwards25519 v1.2.0
 	github.com/Jigsaw-Code/outline-sdk v0.0.16
 	github.com/Jigsaw-Code/outline-ss-server v1.8.0
 	github.com/Psiphon-Inc/rotate-safe-writer v0.0.0-20210303140923-464a7a37606e

+ 2 - 0
go.sum

@@ -2,6 +2,8 @@ filippo.io/bigmod v0.0.1 h1:OaEqDr3gEbofpnHbGqZweSL/bLMhy1pb54puiCDeuOA=
 filippo.io/bigmod v0.0.1/go.mod h1:KyzqAbH7bRH6MOuOF1TPfUjvLoi0mRF2bIyD2ouRNQI=
 filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
 filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
+filippo.io/edwards25519 v1.2.0 h1:crnVqOiS4jqYleHd9vaKZ+HKtHfllngJIiOpNpoJsjo=
+filippo.io/edwards25519 v1.2.0/go.mod h1:xzAOLCNug/yB62zG1bQ8uziwrIqIuxhctzJT18Q77mc=
 filippo.io/keygen v0.0.0-20230306160926-5201437acf8e h1:+xwUCyMiCWKWsI0RowhzB4sngpUdMHgU6lLuWJCX5Dg=
 filippo.io/keygen v0.0.0-20230306160926-5201437acf8e/go.mod h1:ZGSiF/b2hd6MRghF/cid0vXw8pXykRTmIu+JSPw/NCQ=
 github.com/AndreasBriese/bbloom v0.0.0-20190825152654-46b345b51c96 h1:cTp8I5+VIoKjsnZuH8vjyaysT/ses3EvZeaV/1UkF2M=

+ 4 - 2
vendor/filippo.io/edwards25519/README.md

@@ -7,8 +7,10 @@ import "filippo.io/edwards25519"
 This library implements the edwards25519 elliptic curve, exposing the necessary APIs to build a wide array of higher-level primitives.
 Read the docs at [pkg.go.dev/filippo.io/edwards25519](https://pkg.go.dev/filippo.io/edwards25519).
 
-The code is originally derived from Adam Langley's internal implementation in the Go standard library, and includes George Tankersley's [performance improvements](https://golang.org/cl/71950). It was then further developed by Henry de Valence for use in ristretto255, and was finally [merged back into the Go standard library](https://golang.org/cl/276272) as of Go 1.17. It now tracks the upstream codebase and extends it with additional functionality.
+The package tracks the upstream standard library package `crypto/internal/fips140/edwards25519` and extends it with additional functionality.
 
-Most users don't need this package, and should instead use `crypto/ed25519` for signatures, `golang.org/x/crypto/curve25519` for Diffie-Hellman, or `github.com/gtank/ristretto255` for prime order group logic. However, for anyone currently using a fork of `crypto/internal/edwards25519`/`crypto/ed25519/internal/edwards25519` or `github.com/agl/edwards25519`, this package should be a safer, faster, and more powerful alternative.
+The code is originally derived from Adam Langley's internal implementation in the Go standard library, and includes George Tankersley's [performance improvements](https://golang.org/cl/71950). It was then further developed by Henry de Valence for use in ristretto255, and was finally [merged back into the Go standard library](https://golang.org/cl/276272) as of Go 1.17.
+
+Most users don't need this package, and should instead use `crypto/ed25519` for signatures, `crypto/ecdh` for Diffie-Hellman, or `github.com/gtank/ristretto255` for prime order group logic. However, for anyone currently using a fork of the internal `edwards25519` package or of `github.com/agl/edwards25519`, this package should be a safer, faster, and more powerful alternative.
 
 Since this package is meant to curb proliferation of edwards25519 implementations in the Go ecosystem, it welcomes requests for new APIs or reviewable performance improvements.

+ 3 - 3
vendor/filippo.io/edwards25519/doc.go

@@ -10,11 +10,11 @@
 // the curve used by the Ed25519 signature scheme.
 //
 // Most users don't need this package, and should instead use crypto/ed25519 for
-// signatures, golang.org/x/crypto/curve25519 for Diffie-Hellman, or
-// github.com/gtank/ristretto255 for prime order group logic.
+// signatures, crypto/ecdh for Diffie-Hellman, or github.com/gtank/ristretto255
+// for prime order group logic.
 //
 // However, developers who do need to interact with low-level edwards25519
 // operations can use this package, which is an extended version of
-// crypto/internal/edwards25519 from the standard library repackaged as
+// crypto/internal/fips140/edwards25519 from the standard library repackaged as
 // an importable module.
 package edwards25519

+ 60 - 8
vendor/filippo.io/edwards25519/extra.go

@@ -9,6 +9,7 @@ package edwards25519
 
 import (
 	"errors"
+	"slices"
 
 	"filippo.io/edwards25519/field"
 )
@@ -100,13 +101,15 @@ func (v *Point) bytesMontgomery(buf *[32]byte) []byte {
 	//
 	//              u = (1 + y) / (1 - y)
 	//
-	// where y = Y / Z.
+	// where y = Y / Z and therefore
+	//
+	//              u = (Z + Y) / (Z - Y)
 
-	var y, recip, u field.Element
+	var n, r, u field.Element
 
-	y.Multiply(&v.y, y.Invert(&v.z))        // y = Y / Z
-	recip.Invert(recip.Subtract(feOne, &y)) // r = 1/(1 - y)
-	u.Multiply(u.Add(feOne, &y), &recip)    // u = (1 + y)*r
+	n.Add(&v.z, &v.y)                // n = Z + Y
+	r.Invert(r.Subtract(&v.z, &v.y)) // r = 1 / (Z - Y)
+	u.Multiply(&n, &r)               // u = n * r
 
 	return copyFieldElement(buf, &u)
 }
@@ -124,7 +127,7 @@ func (v *Point) MultByCofactor(p *Point) *Point {
 	return v.fromP1xP1(&result)
 }
 
-// Given k > 0, set s = s**(2*i).
+// Given k > 0, set s = s**(2**k).
 func (s *Scalar) pow2k(k int) {
 	for i := 0; i < k; i++ {
 		s.Multiply(s, s)
@@ -250,12 +253,14 @@ func (v *Point) MultiScalarMult(scalars []*Scalar, points []*Point) *Point {
 	// between each point in the multiscalar equation.
 
 	// Build lookup tables for each point
-	tables := make([]projLookupTable, len(points))
+	tables := make([]projLookupTable, 0, 2) // avoid allocation for small sizes
+	tables = slices.Grow(tables, len(points))[:len(points)]
 	for i := range tables {
 		tables[i].FromP3(points[i])
 	}
 	// Compute signed radix-16 digits for each scalar
-	digits := make([][64]int8, len(scalars))
+	digits := make([][64]int8, 0, 2) // avoid allocation for small sizes
+	digits = slices.Grow(digits, len(scalars))[:len(scalars)]
 	for i := range digits {
 		digits[i] = scalars[i].signedRadix16()
 	}
@@ -265,6 +270,7 @@ func (v *Point) MultiScalarMult(scalars []*Scalar, points []*Point) *Point {
 	tmp1 := &projP1xP1{}
 	tmp2 := &projP2{}
 	// Lookup-and-add the appropriate multiple of each input point
+	v.Set(NewIdentityPoint())
 	for j := range tables {
 		tables[j].SelectInto(multiple, digits[j][63])
 		tmp1.Add(v, multiple) // tmp1 = v + x_(j,63)*Q in P1xP1 coords
@@ -347,3 +353,49 @@ func (v *Point) VarTimeMultiScalarMult(scalars []*Scalar, points []*Point) *Poin
 	v.fromP2(tmp2)
 	return v
 }
+
+// Select sets v to a if cond == 1 and to b if cond == 0.
+func (v *Point) Select(a, b *Point, cond int) *Point {
+	checkInitialized(a, b)
+	v.x.Select(&a.x, &b.x, cond)
+	v.y.Select(&a.y, &b.y, cond)
+	v.z.Select(&a.z, &b.z, cond)
+	v.t.Select(&a.t, &b.t, cond)
+	return v
+}
+
+// Double sets v = p + p, and returns v.
+func (v *Point) Double(p *Point) *Point {
+	checkInitialized(p)
+
+	pp := new(projP2).FromP3(p)
+	p1 := new(projP1xP1).Double(pp)
+	return v.fromP1xP1(p1)
+}
+
+func (v *Point) addCached(p *Point, qCached *projCached) *Point {
+	result := new(projP1xP1).Add(p, qCached)
+	return v.fromP1xP1(result)
+}
+
+// ScalarMultSlow sets v = x * q, and returns v. It doesn't precompute a large
+// table, so it is considerably slower, but requires less memory.
+//
+// The scalar multiplication is done in constant time.
+func (v *Point) ScalarMultSlow(x *Scalar, q *Point) *Point {
+	checkInitialized(q)
+
+	s := x.Bytes()
+	qCached := new(projCached).FromP3(q)
+	v.Set(NewIdentityPoint())
+	t := new(Point)
+
+	for i := 255; i >= 0; i-- {
+		v.Double(v)
+		t.addCached(v, qCached)
+		cond := (s[i/8] >> (i % 8)) & 1
+		v.Select(t, v, int(cond))
+	}
+
+	return v
+}

+ 17 - 17
vendor/filippo.io/edwards25519/field/fe.go

@@ -90,11 +90,7 @@ func (v *Element) Add(a, b *Element) *Element {
 	v.l2 = a.l2 + b.l2
 	v.l3 = a.l3 + b.l3
 	v.l4 = a.l4 + b.l4
-	// Using the generic implementation here is actually faster than the
-	// assembly. Probably because the body of this function is so simple that
-	// the compiler can figure out better optimizations by inlining the carry
-	// propagation.
-	return v.carryPropagateGeneric()
+	return v.carryPropagate()
 }
 
 // Subtract sets v = a - b, and returns v.
@@ -232,18 +228,22 @@ func (v *Element) bytes(out *[32]byte) []byte {
 	t := *v
 	t.reduce()
 
-	var buf [8]byte
-	for i, l := range [5]uint64{t.l0, t.l1, t.l2, t.l3, t.l4} {
-		bitsOffset := i * 51
-		binary.LittleEndian.PutUint64(buf[:], l<<uint(bitsOffset%8))
-		for i, bb := range buf {
-			off := bitsOffset/8 + i
-			if off >= len(out) {
-				break
-			}
-			out[off] |= bb
-		}
-	}
+	// Pack five 51-bit limbs into four 64-bit words:
+	//
+	//  255    204    153    102     51      0
+	//    ├──l4──┼──l3──┼──l2──┼──l1──┼──l0──┤
+	//   ├───u3───┼───u2───┼───u1───┼───u0───┤
+	// 256      192      128       64        0
+
+	u0 := t.l1<<51 | t.l0
+	u1 := t.l2<<(102-64) | t.l1>>(64-51)
+	u2 := t.l3<<(153-128) | t.l2>>(128-102)
+	u3 := t.l4<<(204-192) | t.l3>>(192-153)
+
+	binary.LittleEndian.PutUint64(out[0*8:], u0)
+	binary.LittleEndian.PutUint64(out[1*8:], u1)
+	binary.LittleEndian.PutUint64(out[2*8:], u2)
+	binary.LittleEndian.PutUint64(out[3*8:], u3)
 
 	return out[:]
 }

+ 1 - 2
vendor/filippo.io/edwards25519/field/fe_amd64.go

@@ -1,7 +1,6 @@
 // Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT.
 
-//go:build amd64 && gc && !purego
-// +build amd64,gc,!purego
+//go:build !purego
 
 package field
 

+ 111 - 92
vendor/filippo.io/edwards25519/field/fe_amd64.s

@@ -1,7 +1,6 @@
 // Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg field. DO NOT EDIT.
 
-//go:build amd64 && gc && !purego
-// +build amd64,gc,!purego
+//go:build !purego
 
 #include "textflag.h"
 
@@ -17,32 +16,36 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
 	MOVQ DX, SI
 
 	// r0 += 19×a1×b4
-	MOVQ   8(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   32(BX)
-	ADDQ   AX, DI
-	ADCQ   DX, SI
+	MOVQ 8(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 32(BX)
+	ADDQ AX, DI
+	ADCQ DX, SI
 
 	// r0 += 19×a2×b3
-	MOVQ   16(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   24(BX)
-	ADDQ   AX, DI
-	ADCQ   DX, SI
+	MOVQ 16(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 24(BX)
+	ADDQ AX, DI
+	ADCQ DX, SI
 
 	// r0 += 19×a3×b2
-	MOVQ   24(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   16(BX)
-	ADDQ   AX, DI
-	ADCQ   DX, SI
+	MOVQ 24(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 16(BX)
+	ADDQ AX, DI
+	ADCQ DX, SI
 
 	// r0 += 19×a4×b1
-	MOVQ   32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   8(BX)
-	ADDQ   AX, DI
-	ADCQ   DX, SI
+	MOVQ 32(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 8(BX)
+	ADDQ AX, DI
+	ADCQ DX, SI
 
 	// r1 = a0×b1
 	MOVQ (CX), AX
@@ -57,25 +60,28 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
 	ADCQ DX, R8
 
 	// r1 += 19×a2×b4
-	MOVQ   16(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   32(BX)
-	ADDQ   AX, R9
-	ADCQ   DX, R8
+	MOVQ 16(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 32(BX)
+	ADDQ AX, R9
+	ADCQ DX, R8
 
 	// r1 += 19×a3×b3
-	MOVQ   24(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   24(BX)
-	ADDQ   AX, R9
-	ADCQ   DX, R8
+	MOVQ 24(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 24(BX)
+	ADDQ AX, R9
+	ADCQ DX, R8
 
 	// r1 += 19×a4×b2
-	MOVQ   32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   16(BX)
-	ADDQ   AX, R9
-	ADCQ   DX, R8
+	MOVQ 32(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 16(BX)
+	ADDQ AX, R9
+	ADCQ DX, R8
 
 	// r2 = a0×b2
 	MOVQ (CX), AX
@@ -96,18 +102,20 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
 	ADCQ DX, R10
 
 	// r2 += 19×a3×b4
-	MOVQ   24(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   32(BX)
-	ADDQ   AX, R11
-	ADCQ   DX, R10
+	MOVQ 24(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 32(BX)
+	ADDQ AX, R11
+	ADCQ DX, R10
 
 	// r2 += 19×a4×b3
-	MOVQ   32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   24(BX)
-	ADDQ   AX, R11
-	ADCQ   DX, R10
+	MOVQ 32(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 24(BX)
+	ADDQ AX, R11
+	ADCQ DX, R10
 
 	// r3 = a0×b3
 	MOVQ (CX), AX
@@ -134,11 +142,12 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
 	ADCQ DX, R12
 
 	// r3 += 19×a4×b4
-	MOVQ   32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   32(BX)
-	ADDQ   AX, R13
-	ADCQ   DX, R12
+	MOVQ 32(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 32(BX)
+	ADDQ AX, R13
+	ADCQ DX, R12
 
 	// r4 = a0×b4
 	MOVQ (CX), AX
@@ -232,18 +241,22 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
 	MOVQ DX, BX
 
 	// r0 += 38×l1×l4
-	MOVQ   8(CX), AX
-	IMUL3Q $0x26, AX, AX
-	MULQ   32(CX)
-	ADDQ   AX, SI
-	ADCQ   DX, BX
+	MOVQ 8(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	SHLQ $0x01, AX
+	MULQ 32(CX)
+	ADDQ AX, SI
+	ADCQ DX, BX
 
 	// r0 += 38×l2×l3
-	MOVQ   16(CX), AX
-	IMUL3Q $0x26, AX, AX
-	MULQ   24(CX)
-	ADDQ   AX, SI
-	ADCQ   DX, BX
+	MOVQ 16(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	SHLQ $0x01, AX
+	MULQ 24(CX)
+	ADDQ AX, SI
+	ADCQ DX, BX
 
 	// r1 = 2×l0×l1
 	MOVQ (CX), AX
@@ -253,18 +266,21 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
 	MOVQ DX, DI
 
 	// r1 += 38×l2×l4
-	MOVQ   16(CX), AX
-	IMUL3Q $0x26, AX, AX
-	MULQ   32(CX)
-	ADDQ   AX, R8
-	ADCQ   DX, DI
+	MOVQ 16(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	SHLQ $0x01, AX
+	MULQ 32(CX)
+	ADDQ AX, R8
+	ADCQ DX, DI
 
 	// r1 += 19×l3×l3
-	MOVQ   24(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   24(CX)
-	ADDQ   AX, R8
-	ADCQ   DX, DI
+	MOVQ 24(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 24(CX)
+	ADDQ AX, R8
+	ADCQ DX, DI
 
 	// r2 = 2×l0×l2
 	MOVQ (CX), AX
@@ -280,11 +296,13 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
 	ADCQ DX, R9
 
 	// r2 += 38×l3×l4
-	MOVQ   24(CX), AX
-	IMUL3Q $0x26, AX, AX
-	MULQ   32(CX)
-	ADDQ   AX, R10
-	ADCQ   DX, R9
+	MOVQ 24(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	SHLQ $0x01, AX
+	MULQ 32(CX)
+	ADDQ AX, R10
+	ADCQ DX, R9
 
 	// r3 = 2×l0×l3
 	MOVQ (CX), AX
@@ -294,18 +312,19 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
 	MOVQ DX, R11
 
 	// r3 += 2×l1×l2
-	MOVQ   8(CX), AX
-	IMUL3Q $0x02, AX, AX
-	MULQ   16(CX)
-	ADDQ   AX, R12
-	ADCQ   DX, R11
+	MOVQ 8(CX), AX
+	SHLQ $0x01, AX
+	MULQ 16(CX)
+	ADDQ AX, R12
+	ADCQ DX, R11
 
 	// r3 += 19×l4×l4
-	MOVQ   32(CX), AX
-	IMUL3Q $0x13, AX, AX
-	MULQ   32(CX)
-	ADDQ   AX, R12
-	ADCQ   DX, R11
+	MOVQ 32(CX), DX
+	LEAQ (DX)(DX*8), AX
+	LEAQ (DX)(AX*2), AX
+	MULQ 32(CX)
+	ADDQ AX, R12
+	ADCQ DX, R11
 
 	// r4 = 2×l0×l4
 	MOVQ (CX), AX
@@ -315,11 +334,11 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
 	MOVQ DX, R13
 
 	// r4 += 2×l1×l3
-	MOVQ   8(CX), AX
-	IMUL3Q $0x02, AX, AX
-	MULQ   24(CX)
-	ADDQ   AX, R14
-	ADCQ   DX, R13
+	MOVQ 8(CX), AX
+	SHLQ $0x01, AX
+	MULQ 24(CX)
+	ADDQ AX, R14
+	ADCQ DX, R13
 
 	// r4 += l2×l2
 	MOVQ 16(CX), AX

+ 1 - 2
vendor/filippo.io/edwards25519/field/fe_amd64_noasm.go

@@ -2,8 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build !amd64 || !gc || purego
-// +build !amd64 !gc purego
+//go:build !amd64 || purego
 
 package field
 

+ 0 - 16
vendor/filippo.io/edwards25519/field/fe_arm64.go

@@ -1,16 +0,0 @@
-// Copyright (c) 2020 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build arm64 && gc && !purego
-// +build arm64,gc,!purego
-
-package field
-
-//go:noescape
-func carryPropagate(v *Element)
-
-func (v *Element) carryPropagate() *Element {
-	carryPropagate(v)
-	return v
-}

+ 0 - 42
vendor/filippo.io/edwards25519/field/fe_arm64.s

@@ -1,42 +0,0 @@
-// Copyright (c) 2020 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build arm64 && gc && !purego
-
-#include "textflag.h"
-
-// carryPropagate works exactly like carryPropagateGeneric and uses the
-// same AND, ADD, and LSR+MADD instructions emitted by the compiler, but
-// avoids loading R0-R4 twice and uses LDP and STP.
-//
-// See https://golang.org/issues/43145 for the main compiler issue.
-//
-// func carryPropagate(v *Element)
-TEXT ·carryPropagate(SB),NOFRAME|NOSPLIT,$0-8
-	MOVD v+0(FP), R20
-
-	LDP 0(R20), (R0, R1)
-	LDP 16(R20), (R2, R3)
-	MOVD 32(R20), R4
-
-	AND $0x7ffffffffffff, R0, R10
-	AND $0x7ffffffffffff, R1, R11
-	AND $0x7ffffffffffff, R2, R12
-	AND $0x7ffffffffffff, R3, R13
-	AND $0x7ffffffffffff, R4, R14
-
-	ADD R0>>51, R11, R11
-	ADD R1>>51, R12, R12
-	ADD R2>>51, R13, R13
-	ADD R3>>51, R14, R14
-	// R4>>51 * 19 + R10 -> R10
-	LSR $51, R4, R21
-	MOVD $19, R22
-	MADD R22, R10, R21, R10
-
-	STP (R10, R11), 0(R20)
-	STP (R12, R13), 16(R20)
-	MOVD R14, 32(R20)
-
-	RET

+ 0 - 12
vendor/filippo.io/edwards25519/field/fe_arm64_noasm.go

@@ -1,12 +0,0 @@
-// Copyright (c) 2021 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build !arm64 || !gc || purego
-// +build !arm64 !gc purego
-
-package field
-
-func (v *Element) carryPropagate() *Element {
-	return v.carryPropagateGeneric()
-}

+ 88 - 82
vendor/filippo.io/edwards25519/field/fe_generic.go

@@ -12,20 +12,42 @@ type uint128 struct {
 	lo, hi uint64
 }
 
-// mul64 returns a * b.
-func mul64(a, b uint64) uint128 {
+// mul returns a * b.
+func mul(a, b uint64) uint128 {
 	hi, lo := bits.Mul64(a, b)
 	return uint128{lo, hi}
 }
 
-// addMul64 returns v + a * b.
-func addMul64(v uint128, a, b uint64) uint128 {
+// addMul returns v + a * b.
+func addMul(v uint128, a, b uint64) uint128 {
 	hi, lo := bits.Mul64(a, b)
 	lo, c := bits.Add64(lo, v.lo, 0)
 	hi, _ = bits.Add64(hi, v.hi, c)
 	return uint128{lo, hi}
 }
 
+// mul19 returns v * 19.
+func mul19(v uint64) uint64 {
+	// Using this approach seems to yield better optimizations than *19.
+	return v + (v+v<<3)<<1
+}
+
+// addMul19 returns v + 19 * a * b, where a and b are at most 52 bits.
+func addMul19(v uint128, a, b uint64) uint128 {
+	hi, lo := bits.Mul64(mul19(a), b)
+	lo, c := bits.Add64(lo, v.lo, 0)
+	hi, _ = bits.Add64(hi, v.hi, c)
+	return uint128{lo, hi}
+}
+
+// addMul38 returns v + 38 * a * b, where a and b are at most 52 bits.
+func addMul38(v uint128, a, b uint64) uint128 {
+	hi, lo := bits.Mul64(mul19(a), b*2)
+	lo, c := bits.Add64(lo, v.lo, 0)
+	hi, _ = bits.Add64(hi, v.hi, c)
+	return uint128{lo, hi}
+}
+
 // shiftRightBy51 returns a >> 51. a is assumed to be at most 115 bits.
 func shiftRightBy51(a uint128) uint64 {
 	return (a.hi << (64 - 51)) | (a.lo >> 51)
@@ -76,45 +98,40 @@ func feMulGeneric(v, a, b *Element) {
 	//
 	// Finally we add up the columns into wide, overlapping limbs.
 
-	a1_19 := a1 * 19
-	a2_19 := a2 * 19
-	a3_19 := a3 * 19
-	a4_19 := a4 * 19
-
 	// r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1)
-	r0 := mul64(a0, b0)
-	r0 = addMul64(r0, a1_19, b4)
-	r0 = addMul64(r0, a2_19, b3)
-	r0 = addMul64(r0, a3_19, b2)
-	r0 = addMul64(r0, a4_19, b1)
+	r0 := mul(a0, b0)
+	r0 = addMul19(r0, a1, b4)
+	r0 = addMul19(r0, a2, b3)
+	r0 = addMul19(r0, a3, b2)
+	r0 = addMul19(r0, a4, b1)
 
 	// r1 = a0×b1 + a1×b0 + 19×(a2×b4 + a3×b3 + a4×b2)
-	r1 := mul64(a0, b1)
-	r1 = addMul64(r1, a1, b0)
-	r1 = addMul64(r1, a2_19, b4)
-	r1 = addMul64(r1, a3_19, b3)
-	r1 = addMul64(r1, a4_19, b2)
+	r1 := mul(a0, b1)
+	r1 = addMul(r1, a1, b0)
+	r1 = addMul19(r1, a2, b4)
+	r1 = addMul19(r1, a3, b3)
+	r1 = addMul19(r1, a4, b2)
 
 	// r2 = a0×b2 + a1×b1 + a2×b0 + 19×(a3×b4 + a4×b3)
-	r2 := mul64(a0, b2)
-	r2 = addMul64(r2, a1, b1)
-	r2 = addMul64(r2, a2, b0)
-	r2 = addMul64(r2, a3_19, b4)
-	r2 = addMul64(r2, a4_19, b3)
+	r2 := mul(a0, b2)
+	r2 = addMul(r2, a1, b1)
+	r2 = addMul(r2, a2, b0)
+	r2 = addMul19(r2, a3, b4)
+	r2 = addMul19(r2, a4, b3)
 
 	// r3 = a0×b3 + a1×b2 + a2×b1 + a3×b0 + 19×a4×b4
-	r3 := mul64(a0, b3)
-	r3 = addMul64(r3, a1, b2)
-	r3 = addMul64(r3, a2, b1)
-	r3 = addMul64(r3, a3, b0)
-	r3 = addMul64(r3, a4_19, b4)
+	r3 := mul(a0, b3)
+	r3 = addMul(r3, a1, b2)
+	r3 = addMul(r3, a2, b1)
+	r3 = addMul(r3, a3, b0)
+	r3 = addMul19(r3, a4, b4)
 
 	// r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0
-	r4 := mul64(a0, b4)
-	r4 = addMul64(r4, a1, b3)
-	r4 = addMul64(r4, a2, b2)
-	r4 = addMul64(r4, a3, b1)
-	r4 = addMul64(r4, a4, b0)
+	r4 := mul(a0, b4)
+	r4 = addMul(r4, a1, b3)
+	r4 = addMul(r4, a2, b2)
+	r4 = addMul(r4, a3, b1)
+	r4 = addMul(r4, a4, b0)
 
 	// After the multiplication, we need to reduce (carry) the five coefficients
 	// to obtain a result with limbs that are at most slightly larger than 2⁵¹,
@@ -149,7 +166,7 @@ func feMulGeneric(v, a, b *Element) {
 	c3 := shiftRightBy51(r3)
 	c4 := shiftRightBy51(r4)
 
-	rr0 := r0.lo&maskLow51Bits + c4*19
+	rr0 := r0.lo&maskLow51Bits + mul19(c4)
 	rr1 := r1.lo&maskLow51Bits + c0
 	rr2 := r2.lo&maskLow51Bits + c1
 	rr3 := r3.lo&maskLow51Bits + c2
@@ -158,8 +175,12 @@ func feMulGeneric(v, a, b *Element) {
 	// Now all coefficients fit into 64-bit registers but are still too large to
 	// be passed around as an Element. We therefore do one last carry chain,
 	// where the carries will be small enough to fit in the wiggle room above 2⁵¹.
-	*v = Element{rr0, rr1, rr2, rr3, rr4}
-	v.carryPropagate()
+
+	v.l0 = rr0&maskLow51Bits + mul19(rr4>>51)
+	v.l1 = rr1&maskLow51Bits + rr0>>51
+	v.l2 = rr2&maskLow51Bits + rr1>>51
+	v.l3 = rr3&maskLow51Bits + rr2>>51
+	v.l4 = rr4&maskLow51Bits + rr3>>51
 }
 
 func feSquareGeneric(v, a *Element) {
@@ -190,44 +211,31 @@ func feSquareGeneric(v, a *Element) {
 	//            l0l4 19×l4l4 19×l3l4 19×l2l4 19×l1l4  =
 	//           --------------------------------------
 	//              r4      r3      r2      r1      r0
-	//
-	// With precomputed 2×, 19×, and 2×19× terms, we can compute each limb with
-	// only three Mul64 and four Add64, instead of five and eight.
-
-	l0_2 := l0 * 2
-	l1_2 := l1 * 2
-
-	l1_38 := l1 * 38
-	l2_38 := l2 * 38
-	l3_38 := l3 * 38
-
-	l3_19 := l3 * 19
-	l4_19 := l4 * 19
 
 	// r0 = l0×l0 + 19×(l1×l4 + l2×l3 + l3×l2 + l4×l1) = l0×l0 + 19×2×(l1×l4 + l2×l3)
-	r0 := mul64(l0, l0)
-	r0 = addMul64(r0, l1_38, l4)
-	r0 = addMul64(r0, l2_38, l3)
+	r0 := mul(l0, l0)
+	r0 = addMul38(r0, l1, l4)
+	r0 = addMul38(r0, l2, l3)
 
 	// r1 = l0×l1 + l1×l0 + 19×(l2×l4 + l3×l3 + l4×l2) = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3
-	r1 := mul64(l0_2, l1)
-	r1 = addMul64(r1, l2_38, l4)
-	r1 = addMul64(r1, l3_19, l3)
+	r1 := mul(l0*2, l1)
+	r1 = addMul38(r1, l2, l4)
+	r1 = addMul19(r1, l3, l3)
 
 	// r2 = l0×l2 + l1×l1 + l2×l0 + 19×(l3×l4 + l4×l3) = 2×l0×l2 + l1×l1 + 19×2×l3×l4
-	r2 := mul64(l0_2, l2)
-	r2 = addMul64(r2, l1, l1)
-	r2 = addMul64(r2, l3_38, l4)
+	r2 := mul(l0*2, l2)
+	r2 = addMul(r2, l1, l1)
+	r2 = addMul38(r2, l3, l4)
 
 	// r3 = l0×l3 + l1×l2 + l2×l1 + l3×l0 + 19×l4×l4 = 2×l0×l3 + 2×l1×l2 + 19×l4×l4
-	r3 := mul64(l0_2, l3)
-	r3 = addMul64(r3, l1_2, l2)
-	r3 = addMul64(r3, l4_19, l4)
+	r3 := mul(l0*2, l3)
+	r3 = addMul(r3, l1*2, l2)
+	r3 = addMul19(r3, l4, l4)
 
 	// r4 = l0×l4 + l1×l3 + l2×l2 + l3×l1 + l4×l0 = 2×l0×l4 + 2×l1×l3 + l2×l2
-	r4 := mul64(l0_2, l4)
-	r4 = addMul64(r4, l1_2, l3)
-	r4 = addMul64(r4, l2, l2)
+	r4 := mul(l0*2, l4)
+	r4 = addMul(r4, l1*2, l3)
+	r4 = addMul(r4, l2, l2)
 
 	c0 := shiftRightBy51(r0)
 	c1 := shiftRightBy51(r1)
@@ -235,32 +243,30 @@ func feSquareGeneric(v, a *Element) {
 	c3 := shiftRightBy51(r3)
 	c4 := shiftRightBy51(r4)
 
-	rr0 := r0.lo&maskLow51Bits + c4*19
+	rr0 := r0.lo&maskLow51Bits + mul19(c4)
 	rr1 := r1.lo&maskLow51Bits + c0
 	rr2 := r2.lo&maskLow51Bits + c1
 	rr3 := r3.lo&maskLow51Bits + c2
 	rr4 := r4.lo&maskLow51Bits + c3
 
-	*v = Element{rr0, rr1, rr2, rr3, rr4}
-	v.carryPropagate()
+	v.l0 = rr0&maskLow51Bits + mul19(rr4>>51)
+	v.l1 = rr1&maskLow51Bits + rr0>>51
+	v.l2 = rr2&maskLow51Bits + rr1>>51
+	v.l3 = rr3&maskLow51Bits + rr2>>51
+	v.l4 = rr4&maskLow51Bits + rr3>>51
 }
 
-// carryPropagateGeneric brings the limbs below 52 bits by applying the reduction
+// carryPropagate brings each limb below 2⁵² by applying the reduction
 // identity (a * 2²⁵⁵ + b = a * 19 + b) to the l4 carry.
-func (v *Element) carryPropagateGeneric() *Element {
-	c0 := v.l0 >> 51
-	c1 := v.l1 >> 51
-	c2 := v.l2 >> 51
-	c3 := v.l3 >> 51
-	c4 := v.l4 >> 51
-
-	// c4 is at most 64 - 51 = 13 bits, so c4*19 is at most 18 bits, and
+func (v *Element) carryPropagate() *Element {
+	// (l4>>51) is at most 64 - 51 = 13 bits, so (l4>>51)*19 is at most 18 bits, and
 	// the final l0 will be at most 52 bits. Similarly for the rest.
-	v.l0 = v.l0&maskLow51Bits + c4*19
-	v.l1 = v.l1&maskLow51Bits + c0
-	v.l2 = v.l2&maskLow51Bits + c1
-	v.l3 = v.l3&maskLow51Bits + c2
-	v.l4 = v.l4&maskLow51Bits + c3
+	l0 := v.l0
+	v.l0 = v.l0&maskLow51Bits + mul19(v.l4>>51)
+	v.l4 = v.l4&maskLow51Bits + v.l3>>51
+	v.l3 = v.l3&maskLow51Bits + v.l2>>51
+	v.l2 = v.l2&maskLow51Bits + v.l1>>51
+	v.l1 = v.l1&maskLow51Bits + l0>>51
 
 	return v
 }

+ 53 - 0
vendor/filippo.io/edwards25519/pull.sh

@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+if [ "$#" -ne 1 ]; then
+	echo "Usage: $0 <tag>"
+	exit 1
+fi
+
+TAG="$1"
+TMPDIR="$(mktemp -d)"
+
+cleanup() {
+	rm -rf "$TMPDIR"
+}
+trap cleanup EXIT
+
+command -v git >/dev/null
+command -v git-filter-repo >/dev/null
+
+if [ -d "$HOME/go/.git" ]; then
+	REFERENCE=(--reference "$HOME/go" --dissociate)
+else
+	REFERENCE=()
+fi
+
+git -c advice.detachedHead=false clone --no-checkout "${REFERENCE[@]}" \
+	-b "$TAG" https://go.googlesource.com/go.git "$TMPDIR"
+
+# Simplify the history graph by removing the dev.boringcrypto branches, whose
+# merges end up empty after grafting anyway. This also fixes a weird quirk
+# (maybe a git-filter-repo bug?) where only one file from an old path,
+# src/crypto/ed25519/internal/edwards25519/const.go, would still exist in the
+# filtered repo.
+git -C "$TMPDIR" replace --graft f771edd7f9 99f1bf54eb
+git -C "$TMPDIR" replace --graft 109c13b64f c2f96e686f
+git -C "$TMPDIR" replace --graft aa4da4f189 912f075047
+
+git -C "$TMPDIR" filter-repo --force \
+	--paths-from-file /dev/stdin \
+	--prune-empty always \
+	--prune-degenerate always \
+	--tag-callback 'tag.skip()' <<'EOF'
+src/crypto/internal/fips140/edwards25519
+src/crypto/internal/edwards25519
+src/crypto/ed25519/internal/edwards25519
+EOF
+
+git fetch "$TMPDIR"
+git update-ref "refs/heads/upstream/$TAG" FETCH_HEAD
+
+echo
+echo "Fetched upstream history up to $TAG. Merge with:"
+echo -e "\tgit merge --no-ff --no-commit --allow-unrelated-histories upstream/$TAG"

+ 18 - 9
vendor/filippo.io/edwards25519/scalar.go

@@ -7,6 +7,7 @@ package edwards25519
 import (
 	"encoding/binary"
 	"errors"
+	"math/bits"
 )
 
 // A Scalar is an integer modulo
@@ -179,15 +180,23 @@ func isReduced(s []byte) bool {
 		return false
 	}
 
-	for i := len(s) - 1; i >= 0; i-- {
-		switch {
-		case s[i] > scalarMinusOneBytes[i]:
-			return false
-		case s[i] < scalarMinusOneBytes[i]:
-			return true
-		}
-	}
-	return true
+	s0 := binary.LittleEndian.Uint64(s[:8])
+	s1 := binary.LittleEndian.Uint64(s[8:16])
+	s2 := binary.LittleEndian.Uint64(s[16:24])
+	s3 := binary.LittleEndian.Uint64(s[24:])
+
+	l0 := binary.LittleEndian.Uint64(scalarMinusOneBytes[:8])
+	l1 := binary.LittleEndian.Uint64(scalarMinusOneBytes[8:16])
+	l2 := binary.LittleEndian.Uint64(scalarMinusOneBytes[16:24])
+	l3 := binary.LittleEndian.Uint64(scalarMinusOneBytes[24:])
+
+	// Do a constant time subtraction chain scalarMinusOneBytes - s. If there is
+	// a borrow at the end, then s > scalarMinusOneBytes.
+	_, b := bits.Sub64(l0, s0, 0)
+	_, b = bits.Sub64(l1, s1, b)
+	_, b = bits.Sub64(l2, s2, b)
+	_, b = bits.Sub64(l3, s3, b)
+	return b == 0
 }
 
 // SetBytesWithClamping applies the buffer pruning described in RFC 8032,

+ 1 - 3
vendor/filippo.io/edwards25519/tables.go

@@ -4,9 +4,7 @@
 
 package edwards25519
 
-import (
-	"crypto/subtle"
-)
+import "crypto/subtle"
 
 // A dynamic lookup table for variable-base, constant-time scalar muls.
 type projLookupTable struct {

+ 2 - 2
vendor/modules.txt

@@ -1,8 +1,8 @@
 # filippo.io/bigmod v0.0.1
 ## explicit; go 1.20
 filippo.io/bigmod
-# filippo.io/edwards25519 v1.1.0
-## explicit; go 1.20
+# filippo.io/edwards25519 v1.2.0
+## explicit; go 1.24.0
 filippo.io/edwards25519
 filippo.io/edwards25519/field
 # filippo.io/keygen v0.0.0-20230306160926-5201437acf8e