WIP: Save agent roles integration work before CHORUS rebrand

- Agent roles and coordination features - Chat API integration testing - New configuration and workspace management 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-01 02:21:11 +10:00
parent 81b473d48f
commit 5978a0b8f5
3713 changed files with 1103925 additions and 59 deletions
--- a/vendor/lukechampine.com/blake3/LICENSE
+++ b/vendor/lukechampine.com/blake3/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2020 Luke Champine
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- a/vendor/lukechampine.com/blake3/README.md
+++ b/vendor/lukechampine.com/blake3/README.md
@@ -0,0 +1,66 @@
+blake3
+------
+
+[![GoDoc](https://godoc.org/lukechampine.com/blake3?status.svg)](https://godoc.org/lukechampine.com/blake3)
+[![Go Report Card](http://goreportcard.com/badge/lukechampine.com/blake3)](https://goreportcard.com/report/lukechampine.com/blake3)
+
+```
+go get lukechampine.com/blake3
+```
+
+`blake3` implements the [BLAKE3 cryptographic hash function](https://github.com/BLAKE3-team/BLAKE3).
+This implementation aims to be performant without sacrificing (too much)
+readability, in the hopes of eventually landing in `x/crypto`.
+
+In addition to the pure-Go implementation, this package also contains AVX-512
+and AVX2 routines (generated by [`avo`](https://github.com/mmcloughlin/avo))
+that greatly increase performance for large inputs and outputs.
+
+Contributions are greatly appreciated.
+[All contributors are eligible to receive an Urbit planet.](https://twitter.com/lukechampine/status/1274797924522885134)
+
+
+## Benchmarks
+
+Tested on a 2020 MacBook Air (i5-7600K @ 3.80GHz). Benchmarks will improve as
+soon as I get access to a beefier AVX-512 machine. :wink:
+
+### AVX-512
+
+```
+BenchmarkSum256/64           120 ns/op       533.00 MB/s
+BenchmarkSum256/1024        2229 ns/op       459.36 MB/s
+BenchmarkSum256/65536      16245 ns/op      4034.11 MB/s
+BenchmarkWrite               245 ns/op      4177.38 MB/s
+BenchmarkXOF                 246 ns/op      4159.30 MB/s
+```
+
+### AVX2
+
+```
+BenchmarkSum256/64           120 ns/op       533.00 MB/s
+BenchmarkSum256/1024        2229 ns/op       459.36 MB/s
+BenchmarkSum256/65536      31137 ns/op      2104.76 MB/s
+BenchmarkWrite               487 ns/op      2103.12 MB/s
+BenchmarkXOF                 329 ns/op      3111.27 MB/s
+```
+
+### Pure Go
+
+```
+BenchmarkSum256/64           120 ns/op       533.00 MB/s
+BenchmarkSum256/1024        2229 ns/op       459.36 MB/s
+BenchmarkSum256/65536     133505 ns/op       490.89 MB/s
+BenchmarkWrite              2022 ns/op       506.36 MB/s
+BenchmarkXOF                1914 ns/op       534.98 MB/s
+```
+
+## Shortcomings
+
+There is no assembly routine for single-block compressions. This is most
+noticeable for ~1KB inputs.
+
+Each assembly routine inlines all 7 rounds, causing thousands of lines of
+duplicated code. Ideally the routines could be merged such that only a single
+routine is generated for AVX-512 and AVX2, without sacrificing too much
+performance.
--- a/vendor/lukechampine.com/blake3/bao.go
+++ b/vendor/lukechampine.com/blake3/bao.go
@@ -0,0 +1,151 @@
+package blake3
+
+import (
+	"bytes"
+	"encoding/binary"
+	"io"
+	"math/bits"
+)
+
+// BaoEncodedSize returns the size of a Bao encoding for the provided quantity
+// of data.
+func BaoEncodedSize(dataLen int, outboard bool) int {
+	size := 8
+	if dataLen > 0 {
+		chunks := (dataLen + chunkSize - 1) / chunkSize
+		cvs := 2*chunks - 2 // no I will not elaborate
+		size += cvs * 32
+	}
+	if !outboard {
+		size += dataLen
+	}
+	return size
+}
+
+// BaoEncode computes the intermediate BLAKE3 tree hashes of data and writes
+// them to dst. If outboard is false, the contents of data are also written to
+// dst, interleaved with the tree hashes. It also returns the tree root, i.e.
+// the 256-bit BLAKE3 hash.
+//
+// The encoding begins with dataLen as an 8-byte little-endian integer. After
+// the first error from data or dst, no further I/O is attempted and that error
+// is returned.
+//
+// Note that dst is not written sequentially, and therefore must be initialized
+// with sufficient capacity to hold the encoding; see BaoEncodedSize.
+func BaoEncode(dst io.WriterAt, data io.Reader, dataLen int64, outboard bool) ([32]byte, error) {
+	var counter uint64
+	var chunkBuf [chunkSize]byte
+	var err error
+	// read and write are sticky-error wrappers: once err is set they become
+	// no-ops, so the recursion below does not need to check every call.
+	read := func(p []byte) []byte {
+		if err == nil {
+			_, err = io.ReadFull(data, p)
+		}
+		return p
+	}
+	write := func(p []byte, off uint64) {
+		if err == nil {
+			_, err = dst.WriteAt(p, int64(off))
+		}
+	}
+
+	// NOTE: unlike the reference implementation, we write directly in
+	// pre-order, rather than writing in post-order and then flipping. This cuts
+	// the I/O required in half, but also makes hashing multiple chunks in SIMD
+	// a lot trickier. I'll save that optimization for a rainy day.
+	//
+	// rec encodes the subtree covering the next bufLen bytes of data, placing
+	// it at offset off in dst. It returns the number of chaining values written
+	// below this node and the chaining value of the subtree itself.
+	var rec func(bufLen uint64, flags uint32, off uint64) (uint64, [8]uint32)
+	rec = func(bufLen uint64, flags uint32, off uint64) (uint64, [8]uint32) {
+		if err != nil {
+			return 0, [8]uint32{}
+		} else if bufLen <= chunkSize {
+			// leaf: hash one chunk, and copy it through when interleaving
+			cv := chainingValue(compressChunk(read(chunkBuf[:bufLen]), &iv, counter, flags))
+			counter++
+			if !outboard {
+				write(chunkBuf[:bufLen], off)
+			}
+			return 0, cv
+		}
+		// mid is the largest power of two strictly less than bufLen
+		mid := uint64(1) << (bits.Len64(bufLen-1) - 1)
+		// the first 64 bytes at off are reserved for this node's two child CVs
+		lchildren, l := rec(mid, 0, off+64)
+		llen := lchildren * 32
+		if !outboard {
+			llen += (mid / chunkSize) * chunkSize
+		}
+		rchildren, r := rec(bufLen-mid, 0, off+64+llen)
+		write(cvToBytes(&l)[:], off)
+		write(cvToBytes(&r)[:], off+32)
+		return 2 + lchildren + rchildren, chainingValue(parentNode(l, r, iv, flags))
+	}
+
+	// length header, then the tree starting right after it
+	binary.LittleEndian.PutUint64(chunkBuf[:8], uint64(dataLen))
+	write(chunkBuf[:8], 0)
+	_, root := rec(uint64(dataLen), flagRoot, 8)
+	return *cvToBytes(&root), err
+}
+
+// BaoDecode reads content and tree data from the provided reader(s), and
+// streams the verified content to dst. It returns false if verification fails.
+// If the content and tree data are interleaved, outboard should be nil.
+//
+// Each chunk is written to dst only after its chaining value has been checked
+// against the tree, so dst never receives unverified bytes. Note that chunks
+// preceding a verification failure will already have been written. A nil dst
+// discards the content. If reading or writing fails, BaoDecode returns false
+// together with the first error encountered.
+func BaoDecode(dst io.Writer, data, outboard io.Reader, root [32]byte) (bool, error) {
+	if outboard == nil {
+		outboard = data
+	}
+	if dst == nil {
+		// tolerate callers that only want verification
+		dst = io.Discard
+	}
+	var counter uint64
+	var buf [chunkSize]byte
+	var err error
+	// read is a sticky-error wrapper: after the first failure it is a no-op.
+	read := func(r io.Reader, p []byte) []byte {
+		if err == nil {
+			_, err = io.ReadFull(r, p)
+		}
+		return p
+	}
+	readParent := func() (l, r [8]uint32) {
+		read(outboard, buf[:64])
+		return bytesToCV(buf[:32]), bytesToCV(buf[32:])
+	}
+
+	// rec verifies the subtree covering the next bufLen bytes of content
+	// against the expected chaining value cv.
+	var rec func(cv [8]uint32, bufLen uint64, flags uint32) bool
+	rec = func(cv [8]uint32, bufLen uint64, flags uint32) bool {
+		if err != nil {
+			return false
+		} else if bufLen <= chunkSize {
+			chunk := read(data, buf[:bufLen])
+			if err != nil {
+				return false
+			}
+			n := compressChunk(chunk, &iv, counter, flags)
+			counter++
+			if cv != chainingValue(n) {
+				return false
+			}
+			// the chunk is authentic; hand it to the caller
+			if _, err = dst.Write(chunk); err != nil {
+				return false
+			}
+			return true
+		}
+		l, r := readParent()
+		n := parentNode(l, r, iv, flags)
+		// mid is the largest power of two strictly less than bufLen
+		mid := uint64(1) << (bits.Len64(bufLen-1) - 1)
+		return chainingValue(n) == cv && rec(l, mid, 0) && rec(r, bufLen-mid, 0)
+	}
+
+	// the encoding starts with the content length as 8 little-endian bytes
+	read(outboard, buf[:8])
+	dataLen := binary.LittleEndian.Uint64(buf[:8])
+	ok := rec(bytesToCV(root[:]), dataLen, flagRoot)
+	return ok, err
+}
+
+// bufferAt is an in-memory io.WriterAt backed by a fixed-size byte slice.
+type bufferAt struct {
+	buf []byte
+}
+
+// WriteAt copies p into the buffer starting at off. It panics if p does not
+// fit entirely within the buffer.
+func (b *bufferAt) WriteAt(p []byte, off int64) (int, error) {
+	written := copy(b.buf[off:], p)
+	if written != len(p) {
+		panic("bad buffer size")
+	}
+	return written, nil
+}
+
+// BaoEncodeBuf is the in-memory form of BaoEncode: it returns the complete Bao
+// encoding of data together with its root (i.e. the BLAKE3 hash).
+func BaoEncodeBuf(data []byte, outboard bool) ([]byte, [32]byte) {
+	out := &bufferAt{buf: make([]byte, BaoEncodedSize(len(data), outboard))}
+	// The error is dropped: the reader yields exactly len(data) bytes and
+	// bufferAt.WriteAt never returns a non-nil error.
+	root, _ := BaoEncode(out, bytes.NewReader(data), int64(len(data)), outboard)
+	return out.buf, root
+}
+
+// BaoVerifyBuf is the in-memory form of BaoDecode: it reports whether the Bao
+// encoding held in data (and, for outboard encodings, outboard) matches root.
+// If the content and tree data are interleaved, outboard should be nil.
+func BaoVerifyBuf(data, outboard []byte, root [32]byte) bool {
+	var tree io.Reader // left nil for interleaved encodings
+	if outboard != nil {
+		tree = bytes.NewReader(outboard)
+	}
+	verified, _ := BaoDecode(io.Discard, bytes.NewReader(data), tree, root)
+	return verified
+}
--- a/vendor/lukechampine.com/blake3/blake3.go
+++ b/vendor/lukechampine.com/blake3/blake3.go
@@ -0,0 +1,296 @@
+// Package blake3 implements the BLAKE3 cryptographic hash function.
+package blake3 // import "lukechampine.com/blake3"
+
+import (
+	"encoding/binary"
+	"errors"
+	"hash"
+	"io"
+	"math"
+	"math/bits"
+)
+
+const (
+	// Bit flags carried in node.flags; each occupies its own bit.
+	flagChunkStart = 1 << iota
+	flagChunkEnd
+	flagParent
+	flagRoot
+	flagKeyedHash
+	flagDeriveKeyContext
+	flagDeriveKeyMaterial
+
+	blockSize = 64   // bytes per compression-function block
+	chunkSize = 1024 // bytes per leaf chunk of the hash tree
+
+	maxSIMD = 16 // AVX-512 vectors can store 16 words
+)
+
+// iv is the initialization vector. It serves as the key for unkeyed hashing,
+// and its first four words seed part of the compression state.
+var iv = [8]uint32{
+	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
+}
+
+// A node represents a chunk or parent in the BLAKE3 Merkle tree. It bundles
+// every input of one compression-function call.
+type node struct {
+	cv       [8]uint32  // chaining value from previous node
+	block    [16]uint32 // message block, as sixteen 32-bit words
+	counter  uint64     // chunk counter for chunks; zero for parents
+	blockLen uint32     // number of meaningful bytes in block
+	flags    uint32     // combination of the flag* constants
+}
+
+// parentNode returns a node that incorporates the chaining values of two child
+// nodes.
+func parentNode(left, right [8]uint32, key [8]uint32, flags uint32) node {
+	n := node{
+		cv:       key,
+		counter:  0,         // counter is reset for parents
+		blockLen: blockSize, // block is full
+		flags:    flags | flagParent,
+	}
+	copy(n.block[:8], left[:])
+	copy(n.block[8:], right[:])
+	return n
+}
+
+// Hasher implements hash.Hash. Input is buffered maxSIMD chunks at a time;
+// each full buffer is compressed into one subtree root and pushed on stack.
+type Hasher struct {
+	key   [8]uint32 // key words (iv when unkeyed)
+	flags uint32    // flags applied to every node produced by this Hasher
+	size  int       // output size, for Sum
+
+	// log(n) set of Merkle subtree roots, at most one per height.
+	stack   [50][8]uint32 // 2^50 * maxSIMD * chunkSize = 2^64
+	counter uint64        // number of buffers hashed; also serves as a bit vector indicating which stack elems are occupied
+
+	buf    [maxSIMD * chunkSize]byte // input not yet compressed
+	buflen int                       // number of valid bytes in buf
+}
+
+// hasSubtreeAtHeight reports whether stack slot i currently holds a subtree
+// root, i.e. whether bit i of the buffer counter is set.
+func (h *Hasher) hasSubtreeAtHeight(i int) bool {
+	return (h.counter>>i)&1 == 1
+}
+
+// pushSubtree adds the chaining value of a freshly compressed buffer to the
+// stack. Like a binary increment with carries, it merges cv with every
+// occupied slot from height 0 upward and stores the result in the first free
+// slot.
+func (h *Hasher) pushSubtree(cv [8]uint32) {
+	height := 0
+	for ; h.hasSubtreeAtHeight(height); height++ {
+		cv = chainingValue(parentNode(h.stack[height], cv, h.key, h.flags))
+	}
+	h.stack[height] = cv
+	h.counter++
+}
+
+// rootNode folds the buffered input and every occupied stack slot, lowest
+// height first, into the root of the Merkle tree and marks it with flagRoot.
+// The Hasher's stack is left unmodified.
+func (h *Hasher) rootNode() node {
+	root := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*maxSIMD, h.flags)
+	lowest, limit := bits.TrailingZeros64(h.counter), bits.Len64(h.counter)
+	for height := lowest; height < limit; height++ {
+		if !h.hasSubtreeAtHeight(height) {
+			continue
+		}
+		root = parentNode(h.stack[height], chainingValue(root), h.key, h.flags)
+	}
+	root.flags |= flagRoot
+	return root
+}
+
+// Write implements hash.Hash. It never returns an error. A full buffer is
+// only compressed once more input arrives, so the final bytes always remain
+// in buf for rootNode.
+func (h *Hasher) Write(p []byte) (int, error) {
+	total := len(p)
+	for rest := p; len(rest) > 0; {
+		if h.buflen == len(h.buf) {
+			full := compressBuffer(&h.buf, h.buflen, &h.key, h.counter*maxSIMD, h.flags)
+			h.pushSubtree(chainingValue(full))
+			h.buflen = 0
+		}
+		copied := copy(h.buf[h.buflen:], rest)
+		h.buflen += copied
+		rest = rest[copied:]
+	}
+	return total, nil
+}
+
+// Sum implements hash.Hash. It appends h.Size() digest bytes to b and returns
+// the result, reusing b's capacity when it is large enough.
+func (h *Hasher) Sum(b []byte) (sum []byte) {
+	total := len(b) + h.Size()
+	if cap(b) < total {
+		sum = make([]byte, total)
+		copy(sum, b)
+	} else {
+		sum = b[:total]
+	}
+	digest := sum[len(b):]
+	if len(digest) > 64 {
+		// large digests: high-latency, high-throughput XOF path
+		h.XOF().Read(digest)
+		return sum
+	}
+	// small digests need just a single compression of the root node
+	var out [64]byte
+	wordsToBytes(compressNode(h.rootNode()), &out)
+	copy(digest, out[:])
+	return sum
+}
+
+// Reset implements hash.Hash. Key, flags and size are retained. Clearing the
+// counter also marks every stack slot as empty, so neither the stack nor the
+// buffer contents need to be zeroed.
+func (h *Hasher) Reset() {
+	h.buflen, h.counter = 0, 0
+}
+
+// BlockSize implements hash.Hash. It always reports 64.
+func (h *Hasher) BlockSize() int {
+	return blockSize
+}
+
+// Size implements hash.Hash. It reports the digest length, in bytes, that the
+// Hasher was constructed with.
+func (h *Hasher) Size() int {
+	return h.size
+}
+
+// XOF returns an OutputReader positioned at offset 0 and seeded with the root
+// node of the current hash state. Later writes to h do not affect it.
+func (h *Hasher) XOF() *OutputReader {
+	or := new(OutputReader)
+	or.n = h.rootNode()
+	return or
+}
+
+// newHasher allocates an empty Hasher with the given key words, node flags
+// and digest size.
+func newHasher(key [8]uint32, flags uint32, size int) *Hasher {
+	h := new(Hasher)
+	h.key, h.flags, h.size = key, flags, size
+	return h
+}
+
+// New returns a Hasher for the specified digest size and key. If key is nil,
+// the hash is unkeyed. Otherwise, len(key) must be 32; the key is read as
+// eight little-endian 32-bit words, and a shorter key causes a panic.
+func New(size int, key []byte) *Hasher {
+	if key != nil {
+		var words [8]uint32
+		for i := 0; i < len(words); i++ {
+			words[i] = binary.LittleEndian.Uint32(key[4*i:])
+		}
+		return newHasher(words, flagKeyedHash, size)
+	}
+	return newHasher(iv, 0, size)
+}
+
+// Sum256 and Sum512 always use the same hasher state, so we can save some time
+// when hashing small inputs by constructing the hasher ahead of time.
+// It is used as a template only: Sum512 copies it by value before writing.
+var defaultHasher = New(64, nil)
+
+// Sum256 returns the unkeyed BLAKE3 hash of b, truncated to 256 bits. It is
+// the first half of Sum512(b).
+func Sum256(b []byte) (out [32]byte) {
+	full := Sum512(b)
+	copy(out[:], full[:32])
+	return out
+}
+
+// Sum512 returns the unkeyed BLAKE3 hash of b, truncated to 512 bits. Inputs
+// of at most one block or one chunk take dedicated shortcuts; anything larger
+// goes through a copy of defaultHasher.
+func Sum512(b []byte) (out [64]byte) {
+	var root node
+	switch {
+	case len(b) <= blockSize:
+		hashBlock(&out, b)
+		return out
+	case len(b) <= chunkSize:
+		root = compressChunk(b, &iv, 0, 0)
+		root.flags |= flagRoot
+	default:
+		h := *defaultHasher
+		h.Write(b)
+		root = h.rootNode()
+	}
+	wordsToBytes(compressNode(root), &out)
+	return out
+}
+
+// DeriveKey derives a subkey from ctx and srcKey, filling all of subKey. ctx
+// should be hardcoded, globally unique, and application-specific. A good
+// format for ctx strings is:
+//
+//	[application] [commit timestamp] [purpose]
+//
+// e.g.:
+//
+//	example.com 2019-12-25 16:18:03 session tokens v1
+//
+// The purpose of these requirements is to ensure that an attacker cannot trick
+// two different applications into using the same context string.
+func DeriveKey(subKey []byte, ctx string, srcKey []byte) {
+	const derivationIVLen = 32
+
+	// Stage 1: hash the context string into a 32-byte derivation IV.
+	ctxHasher := newHasher(iv, flagDeriveKeyContext, 32)
+	ctxHasher.Write([]byte(ctx))
+	derivationIV := ctxHasher.Sum(make([]byte, 0, derivationIVLen))
+
+	var ivWords [8]uint32
+	for i := 0; i < len(ivWords); i++ {
+		ivWords[i] = binary.LittleEndian.Uint32(derivationIV[4*i:])
+	}
+
+	// Stage 2: hash the key material under that IV and expand it into subKey.
+	keyHasher := newHasher(ivWords, flagDeriveKeyMaterial, 0)
+	keyHasher.Write(srcKey)
+	keyHasher.XOF().Read(subKey)
+}
+
+// An OutputReader produces a seekable stream of 2^64 - 1 pseudorandom output
+// bytes.
+type OutputReader struct {
+	n   node                      // root node; its counter selects which output blocks are generated
+	buf [maxSIMD * blockSize]byte // cached output for the maxSIMD-block window containing off
+	off uint64                    // current position in the stream
+}
+
+// Read implements io.Reader. Callers may assume that Read returns len(p), nil
+// unless the read would extend beyond the end of the stream.
+func (or *OutputReader) Read(p []byte) (int, error) {
+	if or.off == math.MaxUint64 {
+		return 0, io.EOF
+	} else if rem := math.MaxUint64 - or.off; uint64(len(p)) > rem {
+		p = p[:rem]
+	}
+	lenp := len(p)
+	for len(p) > 0 {
+		if or.off%(maxSIMD*blockSize) == 0 {
+			or.n.counter = or.off / blockSize
+			compressBlocks(&or.buf, or.n)
+		}
+		n := copy(p, or.buf[or.off%(maxSIMD*blockSize):])
+		p = p[n:]
+		or.off += uint64(n)
+	}
+	return lenp, nil
+}
+
+// Seek implements io.Seeker. The stream is 2^64 - 1 bytes long, so
+// io.SeekEnd offsets are relative to that position. Seek returns an error,
+// leaving the reader unchanged, if the target position would be negative or
+// lie beyond the end of the stream. It panics if whence is invalid.
+func (or *OutputReader) Seek(offset int64, whence int) (int64, error) {
+	off := or.off
+	switch whence {
+	case io.SeekStart:
+		if offset < 0 {
+			return 0, errors.New("seek position cannot be negative")
+		}
+		off = uint64(offset)
+	case io.SeekCurrent:
+		if offset < 0 {
+			if uint64(-offset) > off {
+				return 0, errors.New("seek position cannot be negative")
+			}
+			off -= uint64(-offset)
+		} else {
+			// guard against wrapping past the end of the stream
+			if uint64(offset) > math.MaxUint64-off {
+				return 0, errors.New("seek position cannot exceed end of stream")
+			}
+			off += uint64(offset)
+		}
+	case io.SeekEnd:
+		if offset > 0 {
+			// without this check the subtraction below would wrap around to
+			// the beginning of the stream
+			return 0, errors.New("seek position cannot exceed end of stream")
+		}
+		// end of stream is 2^64 - 1; unsigned wraparound yields end + offset
+		off = uint64(offset) - 1
+	default:
+		panic("invalid whence")
+	}
+	or.off = off
+	// Read indexes buf by off % (maxSIMD*blockSize), so the buffer must hold
+	// the maxSIMD blocks of the window that contains off. The counter
+	// therefore has to be that window's first block, not the block containing
+	// off itself.
+	or.n.counter = off / (maxSIMD * blockSize) * maxSIMD
+	if or.off%(maxSIMD*blockSize) != 0 {
+		compressBlocks(&or.buf, or.n)
+	}
+	// NOTE: or.off >= 2^63 will result in a negative return value.
+	// Nothing we can do about this.
+	return int64(or.off), nil
+}
+
+// ensure that Hasher implements hash.Hash
+var _ hash.Hash = (*Hasher)(nil)
--- a/vendor/lukechampine.com/blake3/blake3_amd64.s
+++ b/vendor/lukechampine.com/blake3/blake3_amd64.s
--- a/vendor/lukechampine.com/blake3/compress_amd64.go
+++ b/vendor/lukechampine.com/blake3/compress_amd64.go
@@ -0,0 +1,144 @@
+package blake3
+
+import "unsafe"
+
+//go:generate go run avo/gen.go -out blake3_amd64.s
+
+// compressChunksAVX512 is implemented in blake3_amd64.s. Judging by its call
+// site, it fills cvs with one chaining value per chunk of buf, numbering the
+// chunks from counter. NOTE(review): confirm against the assembly.
+//
+//go:noescape
+func compressChunksAVX512(cvs *[16][8]uint32, buf *[16 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32)
+
+// compressChunksAVX2 is the 8-chunk AVX2 counterpart of compressChunksAVX512.
+//
+//go:noescape
+func compressChunksAVX2(cvs *[8][8]uint32, buf *[8 * chunkSize]byte, key *[8]uint32, counter uint64, flags uint32)
+
+// compressBlocksAVX512 is implemented in assembly. It is used by
+// compressBlocks to produce 16 output blocks (1024 bytes) for one node.
+//
+//go:noescape
+func compressBlocksAVX512(out *[1024]byte, block *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
+
+// compressBlocksAVX2 is implemented in assembly. compressBlocks calls it
+// twice, the second time with counter+8, to produce 2 x 512 output bytes.
+//
+//go:noescape
+func compressBlocksAVX2(out *[512]byte, msgs *[16]uint32, cv *[8]uint32, counter uint64, blockLen uint32, flags uint32)
+
+// compressParentsAVX2 is implemented in assembly. mergeSubtrees relies on it
+// to replace adjacent pairs of cvs with their parents' chaining values, and
+// passes the same memory for parents and cvs.
+//
+//go:noescape
+func compressParentsAVX2(parents *[8][8]uint32, cvs *[16][8]uint32, key *[8]uint32, flags uint32)
+
+func compressNode(n node) (out [16]uint32) {
+	compressNodeGeneric(&out, n)
+	return
+}
+
+// compressBufferAVX512 hashes the first buflen bytes of buf and merges the
+// resulting chunk chaining values into a single node. The assembly routine is
+// invoked on the whole buffer regardless of buflen; only the chaining values
+// of the buflen/chunkSize complete chunks are used, and a trailing partial
+// chunk is hashed by compressChunk instead.
+func compressBufferAVX512(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
+	var cvs [maxSIMD][8]uint32
+	compressChunksAVX512(&cvs, buf, key, counter, flags)
+	numChunks := uint64(buflen / chunkSize)
+	if buflen%chunkSize != 0 {
+		// use non-asm for remainder
+		partialChunk := buf[buflen-buflen%chunkSize : buflen]
+		cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags))
+		numChunks++
+	}
+	return mergeSubtrees(&cvs, numChunks, key, flags)
+}
+
+// compressBufferAVX2 is the AVX2 counterpart of compressBufferAVX512. The
+// buffer and the chaining-value array are each viewed as two halves of eight
+// chunks; the first half is always processed, the second only when more than
+// eight complete chunks are present. A trailing partial chunk is hashed by
+// compressChunk.
+func compressBufferAVX2(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
+	var cvs [maxSIMD][8]uint32
+	// reinterpret both arrays as two consecutive 8-chunk halves (same memory)
+	cvHalves := (*[2][8][8]uint32)(unsafe.Pointer(&cvs))
+	bufHalves := (*[2][8 * chunkSize]byte)(unsafe.Pointer(buf))
+	compressChunksAVX2(&cvHalves[0], &bufHalves[0], key, counter, flags)
+	numChunks := uint64(buflen / chunkSize)
+	if numChunks > 8 {
+		compressChunksAVX2(&cvHalves[1], &bufHalves[1], key, counter+8, flags)
+	}
+	if buflen%chunkSize != 0 {
+		// use non-asm for remainder
+		partialChunk := buf[buflen-buflen%chunkSize : buflen]
+		cvs[numChunks] = chainingValue(compressChunk(partialChunk, key, counter+numChunks, flags))
+		numChunks++
+	}
+	return mergeSubtrees(&cvs, numChunks, key, flags)
+}
+
+// compressBuffer hashes the first buflen bytes of buf into a single node. It
+// picks the widest available SIMD implementation, but only once the buffer
+// holds at least two chunks; smaller inputs use the generic code.
+func compressBuffer(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
+	if buflen >= chunkSize*2 {
+		if haveAVX512 {
+			return compressBufferAVX512(buf, buflen, key, counter, flags)
+		}
+		if haveAVX2 {
+			return compressBufferAVX2(buf, buflen, key, counter, flags)
+		}
+	}
+	return compressBufferGeneric(buf, buflen, key, counter, flags)
+}
+
+// compressChunk absorbs chunk block by block and returns the node for its
+// final block, which is left uncompressed so the caller can add further flags
+// (e.g. flagRoot) before compressing it. An empty chunk yields a node with
+// blockLen 0.
+func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node {
+	n := node{
+		cv:       *key,
+		counter:  counter,
+		blockLen: blockSize,
+		flags:    flags | flagChunkStart,
+	}
+	// blockBytes aliases n.block, so copying into it fills the message words
+	// directly (native byte order; this file is amd64-only).
+	blockBytes := (*[64]byte)(unsafe.Pointer(&n.block))[:]
+	for len(chunk) > blockSize {
+		copy(blockBytes, chunk)
+		chunk = chunk[blockSize:]
+		n.cv = chainingValue(n)
+		n.flags &^= flagChunkStart // only the first block carries the start flag
+	}
+	// pad last block with zeros
+	n.block = [16]uint32{}
+	copy(blockBytes, chunk)
+	n.blockLen = uint32(len(chunk))
+	n.flags |= flagChunkEnd
+	return n
+}
+
+// hashBlock writes to out the 64-byte unkeyed hash of buf, which is treated
+// as a single block that is at once chunk start, chunk end and root. Only the
+// first 64 bytes of buf are copied into the block; Sum512 calls it solely for
+// inputs of at most blockSize bytes.
+func hashBlock(out *[64]byte, buf []byte) {
+	var block [16]uint32
+	copy((*[64]byte)(unsafe.Pointer(&block))[:], buf)
+	// out is reinterpreted as 16 words so the result is stored in place
+	compressNodeGeneric((*[16]uint32)(unsafe.Pointer(out)), node{
+		cv:       iv,
+		block:    block,
+		blockLen: uint32(len(buf)),
+		flags:    flagChunkStart | flagChunkEnd | flagRoot,
+	})
+}
+
+// compressBlocks fills out with maxSIMD consecutive 64-byte outputs of n,
+// using counters n.counter, n.counter+1, ... (explicit in the generic path;
+// the assembly paths are assumed to match). It uses AVX-512 when available,
+// otherwise AVX2 in two 8-block halves, otherwise pure Go.
+func compressBlocks(out *[maxSIMD * blockSize]byte, n node) {
+	switch {
+	case haveAVX512:
+		compressBlocksAVX512(out, &n.block, &n.cv, n.counter, n.blockLen, n.flags)
+	case haveAVX2:
+		// view out as two 512-byte halves of 8 blocks each
+		outs := (*[2][512]byte)(unsafe.Pointer(out))
+		compressBlocksAVX2(&outs[0], &n.block, &n.cv, n.counter, n.blockLen, n.flags)
+		compressBlocksAVX2(&outs[1], &n.block, &n.cv, n.counter+8, n.blockLen, n.flags)
+	default:
+		// view out as maxSIMD separate 64-byte blocks
+		outs := (*[maxSIMD][64]byte)(unsafe.Pointer(out))
+		compressBlocksGeneric(outs, n)
+	}
+}
+
+// mergeSubtrees reduces the first numCVs chaining values in cvs, level by
+// level, until two remain, and returns their (uncompressed) parent node. cvs
+// is overwritten in the process. Callers pass at least two chaining values.
+// Without AVX2 it defers to mergeSubtreesGeneric.
+func mergeSubtrees(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node {
+	if !haveAVX2 {
+		return mergeSubtreesGeneric(cvs, numCVs, key, flags)
+	}
+	// the parents are written over the front half of cvs
+	parents := (*[8][8]uint32)(unsafe.Pointer(cvs))
+	for numCVs > 2 {
+		odd := numCVs%2 != 0
+		last := cvs[numCVs-1] // saved because an unpaired CV must survive the merge
+		compressParentsAVX2(parents, cvs, key, flags)
+		numCVs /= 2
+		if odd {
+			// carry the unpaired chaining value up to the next level
+			cvs[numCVs] = last
+			numCVs++
+		}
+	}
+	return parentNode(cvs[0], cvs[1], *key, flags)
+}
+
+// wordsToBytes copies the 64 bytes of memory backing words into block. No
+// byte swapping is done, so the result is in the machine's native byte order.
+func wordsToBytes(words [16]uint32, block *[64]byte) {
+	*block = *(*[64]byte)(unsafe.Pointer(&words))
+}
+
+// bytesToCV reinterprets the first 32 bytes of b as eight words and returns
+// them as a copy. It panics if b is empty. No other length check is made, so
+// callers must supply at least 32 bytes.
+func bytesToCV(b []byte) [8]uint32 {
+	return *(*[8]uint32)(unsafe.Pointer(&b[0]))
+}
+
+// cvToBytes returns a byte view of cv. The result aliases cv's memory: no
+// copy is made, so writes through either are visible through the other.
+func cvToBytes(cv *[8]uint32) *[32]byte {
+	return (*[32]byte)(unsafe.Pointer(cv))
+}
--- a/vendor/lukechampine.com/blake3/compress_generic.go
+++ b/vendor/lukechampine.com/blake3/compress_generic.go
@@ -0,0 +1,143 @@
+package blake3
+
+import (
+	"bytes"
+	"math/bits"
+)
+
+// compressNodeGeneric is the pure-Go compression function. It mixes n's
+// chaining value, message block, counter, block length and flags through
+// seven fully unrolled rounds and stores the resulting sixteen words in *out.
+// The first eight output words form the node's chaining value (see
+// chainingValue). *out is written only once, at the very end.
+func compressNodeGeneric(out *[16]uint32, n node) {
+	// g is the quarter-round: it mixes two message words into four state words.
+	g := func(a, b, c, d, mx, my uint32) (uint32, uint32, uint32, uint32) {
+		a += b + mx
+		d = bits.RotateLeft32(d^a, -16)
+		c += d
+		b = bits.RotateLeft32(b^c, -12)
+		a += b + my
+		d = bits.RotateLeft32(d^a, -8)
+		c += d
+		b = bits.RotateLeft32(b^c, -7)
+		return a, b, c, d
+	}
+
+	// NOTE: we unroll all of the rounds, as well as the permutations that occur
+	// between rounds.
+
+	// round 1 (also initializes state)
+	// columns
+	s0, s4, s8, s12 := g(n.cv[0], n.cv[4], iv[0], uint32(n.counter), n.block[0], n.block[1])
+	s1, s5, s9, s13 := g(n.cv[1], n.cv[5], iv[1], uint32(n.counter>>32), n.block[2], n.block[3])
+	s2, s6, s10, s14 := g(n.cv[2], n.cv[6], iv[2], n.blockLen, n.block[4], n.block[5])
+	s3, s7, s11, s15 := g(n.cv[3], n.cv[7], iv[3], n.flags, n.block[6], n.block[7])
+	// diagonals
+	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[8], n.block[9])
+	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[10], n.block[11])
+	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[12], n.block[13])
+	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[14], n.block[15])
+
+	// round 2
+	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[2], n.block[6])
+	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[3], n.block[10])
+	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[7], n.block[0])
+	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[4], n.block[13])
+	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[1], n.block[11])
+	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[12], n.block[5])
+	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[9], n.block[14])
+	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[15], n.block[8])
+
+	// round 3
+	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[3], n.block[4])
+	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[10], n.block[12])
+	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[13], n.block[2])
+	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[7], n.block[14])
+	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[6], n.block[5])
+	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[9], n.block[0])
+	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[11], n.block[15])
+	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[8], n.block[1])
+
+	// round 4
+	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[10], n.block[7])
+	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[12], n.block[9])
+	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[14], n.block[3])
+	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[13], n.block[15])
+	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[4], n.block[0])
+	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[11], n.block[2])
+	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[5], n.block[8])
+	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[1], n.block[6])
+
+	// round 5
+	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[12], n.block[13])
+	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[9], n.block[11])
+	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[15], n.block[10])
+	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[14], n.block[8])
+	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[7], n.block[2])
+	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[5], n.block[3])
+	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[0], n.block[1])
+	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[6], n.block[4])
+
+	// round 6
+	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[9], n.block[14])
+	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[11], n.block[5])
+	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[8], n.block[12])
+	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[15], n.block[1])
+	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[13], n.block[3])
+	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[0], n.block[10])
+	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[2], n.block[6])
+	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[4], n.block[7])
+
+	// round 7
+	s0, s4, s8, s12 = g(s0, s4, s8, s12, n.block[11], n.block[15])
+	s1, s5, s9, s13 = g(s1, s5, s9, s13, n.block[5], n.block[0])
+	s2, s6, s10, s14 = g(s2, s6, s10, s14, n.block[1], n.block[9])
+	s3, s7, s11, s15 = g(s3, s7, s11, s15, n.block[8], n.block[6])
+	s0, s5, s10, s15 = g(s0, s5, s10, s15, n.block[14], n.block[10])
+	s1, s6, s11, s12 = g(s1, s6, s11, s12, n.block[2], n.block[12])
+	s2, s7, s8, s13 = g(s2, s7, s8, s13, n.block[3], n.block[4])
+	s3, s4, s9, s14 = g(s3, s4, s9, s14, n.block[7], n.block[13])
+
+	// finalization
+	*out = [16]uint32{
+		s0 ^ s8, s1 ^ s9, s2 ^ s10, s3 ^ s11,
+		s4 ^ s12, s5 ^ s13, s6 ^ s14, s7 ^ s15,
+		s8 ^ n.cv[0], s9 ^ n.cv[1], s10 ^ n.cv[2], s11 ^ n.cv[3],
+		s12 ^ n.cv[4], s13 ^ n.cv[5], s14 ^ n.cv[6], s15 ^ n.cv[7],
+	}
+}
+
+// chainingValue compresses n and returns the first eight words of the
+// output, which is the value passed on to n's parent or successor.
+func chainingValue(n node) (cv [8]uint32) {
+	out := compressNode(n)
+	copy(cv[:], out[:8])
+	return cv
+}
+
+// compressBufferGeneric is the pure-Go buffer hasher. A buffer of at most one
+// chunk becomes that chunk's node directly; otherwise each chunk is hashed to
+// a chaining value (chunk i uses counter+i) and the values are merged.
+func compressBufferGeneric(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) (n node) {
+	if buflen <= chunkSize {
+		return compressChunk(buf[:buflen], key, counter, flags)
+	}
+	var (
+		cvs    [maxSIMD][8]uint32
+		numCVs uint64
+	)
+	pending := bytes.NewBuffer(buf[:buflen])
+	for pending.Len() > 0 {
+		// Next yields at most chunkSize bytes, so the last chunk may be short
+		chunk := pending.Next(chunkSize)
+		cvs[numCVs] = chainingValue(compressChunk(chunk, key, counter+numCVs, flags))
+		numCVs++
+	}
+	return mergeSubtrees(&cvs, numCVs, key, flags)
+}
+
+// compressBlocksGeneric fills outs with maxSIMD consecutive output blocks of
+// n: block i is the compression of n with its counter advanced by i. The
+// caller's node is not modified, since n is passed by value.
+func compressBlocksGeneric(outs *[maxSIMD][64]byte, n node) {
+	for i := 0; i < len(outs); i++ {
+		wordsToBytes(compressNode(n), &outs[i])
+		n.counter++
+	}
+}
+
+// mergeSubtreesGeneric is the pure-Go form of mergeSubtrees. It repeatedly
+// replaces adjacent pairs of chaining values with their parent's chaining
+// value, carrying an unpaired last value up unchanged, until two remain; it
+// then returns their (uncompressed) parent node. cvs is overwritten.
+func mergeSubtreesGeneric(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node {
+	for numCVs > 2 {
+		pairs := numCVs / 2
+		for i := uint64(0); i < pairs; i++ {
+			cvs[i] = chainingValue(parentNode(cvs[2*i], cvs[2*i+1], *key, flags))
+		}
+		if numCVs%2 == 1 {
+			// odd one out moves up a level as-is
+			cvs[pairs] = cvs[2*pairs]
+			pairs++
+		}
+		numCVs = pairs
+	}
+	return parentNode(cvs[0], cvs[1], *key, flags)
+}
--- a/vendor/lukechampine.com/blake3/compress_noasm.go
+++ b/vendor/lukechampine.com/blake3/compress_noasm.go
@@ -0,0 +1,93 @@
+//go:build !amd64
+// +build !amd64
+
+package blake3
+
+import "encoding/binary"
+
+func compressNode(n node) (out [16]uint32) {
+	compressNodeGeneric(&out, n)
+	return
+}
+
+// compressBuffer hashes the first buflen bytes of buf into a single node.
+// Without assembly support this is always the generic implementation.
+func compressBuffer(buf *[maxSIMD * chunkSize]byte, buflen int, key *[8]uint32, counter uint64, flags uint32) node {
+	merged := compressBufferGeneric(buf, buflen, key, counter, flags)
+	return merged
+}
+
+// compressChunk absorbs chunk block by block and returns the node for its
+// final block, which is left uncompressed so the caller can add further flags
+// before compressing it. Message bytes are decoded as little-endian words.
+func compressChunk(chunk []byte, key *[8]uint32, counter uint64, flags uint32) node {
+	n := node{cv: *key, counter: counter, blockLen: blockSize, flags: flags | flagChunkStart}
+
+	// every block except the last is full and gets chained into n.cv
+	var full [blockSize]byte
+	for ; len(chunk) > blockSize; chunk = chunk[blockSize:] {
+		copy(full[:], chunk)
+		bytesToWords(full, &n.block)
+		n.cv = chainingValue(n)
+		n.flags &^= flagChunkStart // only the first block carries the start flag
+	}
+
+	// the last block is zero-padded
+	var last [blockSize]byte
+	copy(last[:], chunk)
+	bytesToWords(last, &n.block)
+	n.blockLen = uint32(len(chunk))
+	n.flags |= flagChunkEnd
+	return n
+}
+
+// hashBlock writes to out the 64-byte unkeyed hash of buf, which is treated
+// as a single zero-padded block that is at once chunk start, chunk end and
+// root. Only the first 64 bytes of buf are used.
+func hashBlock(out *[64]byte, buf []byte) {
+	var padded [64]byte
+	copy(padded[:], buf)
+
+	var msg, result [16]uint32
+	bytesToWords(padded, &msg)
+	root := node{
+		cv:       iv,
+		block:    msg,
+		blockLen: uint32(len(buf)),
+		flags:    flagChunkStart | flagChunkEnd | flagRoot,
+	}
+	compressNodeGeneric(&result, root)
+	wordsToBytes(result, out)
+}
+
+// compressBlocks fills out with maxSIMD consecutive 64-byte output blocks of
+// n, starting at n.counter.
+func compressBlocks(out *[maxSIMD * blockSize]byte, n node) {
+	var blocks [maxSIMD][64]byte
+	compressBlocksGeneric(&blocks, n)
+	for i := 0; i < maxSIMD; i++ {
+		copy(out[i*64:(i+1)*64], blocks[i][:])
+	}
+}
+
+// mergeSubtrees merges the first numCVs chaining values in cvs into a single
+// parent node. Without assembly support this is always the generic
+// implementation.
+func mergeSubtrees(cvs *[maxSIMD][8]uint32, numCVs uint64, key *[8]uint32, flags uint32) node {
+	parent := mergeSubtreesGeneric(cvs, numCVs, key, flags)
+	return parent
+}
+
+// bytesToWords decodes bytes into sixteen little-endian 32-bit words.
+func bytesToWords(bytes [64]byte, words *[16]uint32) {
+	for i := 0; i < len(words); i++ {
+		off := 4 * i
+		words[i] = binary.LittleEndian.Uint32(bytes[off : off+4])
+	}
+}
+
+// wordsToBytes encodes words into block as little-endian bytes.
+func wordsToBytes(words [16]uint32, block *[64]byte) {
+	for i := 0; i < len(words); i++ {
+		binary.LittleEndian.PutUint32(block[4*i:4*i+4], words[i])
+	}
+}
+
+// bytesToCV decodes the first 32 bytes of b into eight little-endian words.
+// It panics if b is shorter than 32 bytes.
+func bytesToCV(b []byte) [8]uint32 {
+	var words [8]uint32
+	for i := 0; i < len(words); i++ {
+		words[i] = binary.LittleEndian.Uint32(b[4*i:])
+	}
+	return words
+}
+
+func cvToBytes(cv *[8]uint32) *[32]byte {
+	var b [32]byte
+	for i, w := range cv {
+		binary.LittleEndian.PutUint32(b[4*i:], w)
+	}
+	return &b
+}
--- a/vendor/lukechampine.com/blake3/cpu.go
+++ b/vendor/lukechampine.com/blake3/cpu.go
@@ -0,0 +1,10 @@
+// +build !darwin
+
+package blake3
+
+import "github.com/klauspost/cpuid/v2"
+
+// CPU feature flags, detected once at package initialization via cpuid.
+var (
+	haveAVX2   = cpuid.CPU.Supports(cpuid.AVX2)
+	haveAVX512 = cpuid.CPU.Supports(cpuid.AVX512F)
+)
--- a/vendor/lukechampine.com/blake3/cpu_darwin.go
+++ b/vendor/lukechampine.com/blake3/cpu_darwin.go
@@ -0,0 +1,22 @@
+package blake3
+
+import (
+	"syscall"
+
+	"github.com/klauspost/cpuid/v2"
+)
+
+// CPU feature flags; populated by init.
+var (
+	haveAVX2   bool
+	haveAVX512 bool
+)
+
+// init detects AVX2 and AVX-512 support. cpuid is consulted first; if it
+// reports no AVX-512, the hw.optional.avx512f sysctl is used as a second
+// opinion.
+func init() {
+	haveAVX2 = cpuid.CPU.Supports(cpuid.AVX2)
+	haveAVX512 = cpuid.CPU.Supports(cpuid.AVX512F)
+	if haveAVX512 {
+		return
+	}
+	// On some Macs, AVX512 detection is buggy, so fallback to sysctl.
+	// A failed lookup yields an empty value, which reads as "unsupported",
+	// so the error itself is deliberately ignored.
+	val, _ := syscall.Sysctl("hw.optional.avx512f")
+	haveAVX512 = len(val) > 0 && val[0] == 1
+}
+}