 9bdcbe0447
			
		
	
	9bdcbe0447
	
	
	
		
			
			Major integrations and fixes: - Added BACKBEAT SDK integration for P2P operation timing - Implemented beat-aware status tracking for distributed operations - Added Docker secrets support for secure license management - Resolved KACHING license validation via HTTPS/TLS - Updated docker-compose configuration for clean stack deployment - Disabled rollback policies to prevent deployment failures - Added license credential storage (CHORUS-DEV-MULTI-001) Technical improvements: - BACKBEAT P2P operation tracking with phase management - Enhanced configuration system with file-based secrets - Improved error handling for license validation - Clean separation of KACHING and CHORUS deployment stacks 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
		
			
				
	
	
		
			831 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			831 lines
		
	
	
		
			15 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| // Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
 | |
| 
 | |
| //go:build amd64 && !appengine && !noasm && gc
 | |
| 
 | |
| // func decompress4x_main_loop_amd64(ctx *decompress4xContext)
 | |
| TEXT ·decompress4x_main_loop_amd64(SB), $0-8
 | |
| 	// Each pass decodes two symbols from each of the four bit readers br0..br3 and stores every pair in that reader's output stream. Preload values:
 | |
| 	MOVQ    ctx+0(FP), AX // AX = ctx
 | |
| 	MOVBQZX 8(AX), DI // DI = byte at ctx+8, used below as the peekBits shift count
 | |
| 	MOVQ    16(AX), BX // BX = ctx+16: current write position in the first output stream
 | |
| 	MOVQ    48(AX), SI // SI = ctx+48: bound that BX is compared against on every pass
 | |
| 	MOVQ    24(AX), R8 // R8 = ctx+24: byte distance between output streams (dstEvery in the comments below)
 | |
| 	MOVQ    32(AX), R9 // R9 = ctx+32: base of the table of 2-byte entries
 | |
| 	MOVQ    (AX), R10 // R10 = ctx+0: base of the bit reader state, 48 bytes per reader
 | |
| 
 | |
| 	// Main loop: DL collects the exit conditions and is tested at the bottom of every pass
 | |
| main_loop:
 | |
| 	XORL  DX, DX
 | |
| 	CMPQ  BX, SI
 | |
| 	SETGE DL // DL = 1 once BX >= SI (signed compare)
 | |
| 
 | |
| 	// br0.fillFast32(): R11 = value, R12 = bitsRead; refill only when bitsRead > 32
 | |
| 	MOVQ    32(R10), R11
 | |
| 	MOVBQZX 40(R10), R12
 | |
| 	CMPQ    R12, $0x20
 | |
| 	JBE     skip_fill0
 | |
| 	MOVQ    24(R10), AX // AX = br0.off
 | |
| 	SUBQ    $0x20, R12
 | |
| 	SUBQ    $0x04, AX
 | |
| 	MOVQ    (R10), R13 // R13 = base address the reader loads from
 | |
| 
 | |
| 	// b.value |= uint64(low) << (b.bitsRead & 63)
 | |
| 	MOVL (AX)(R13*1), R13 // R13 = 4 bytes at base+off (the 32-bit load zero-extends)
 | |
| 	MOVQ R12, CX
 | |
| 	SHLQ CL, R13
 | |
| 	MOVQ AX, 24(R10) // store the decremented off back
 | |
| 	ORQ  R13, R11
 | |
| 
 | |
| 	// exhausted += (br0.off < 4)
 | |
| 	CMPQ AX, $0x04
 | |
| 	ADCB $+0, DL // adds the carry set by the CMPQ above when off < 4 (unsigned)
 | |
| 
 | |
| skip_fill0:
 | |
| 	// val0 := br0.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v0 := table[val0&mask]
 | |
| 	MOVW (R9)(R13*2), CX // CL = low byte of the entry, CH = high byte
 | |
| 
 | |
| 	// br0.advance(uint8(v0.entry))
 | |
| 	MOVB CH, AL // AL = first output byte (entry >> 8)
 | |
| 	SHLQ CL, R11 // value <<= low byte of the entry
 | |
| 	ADDB CL, R12 // bitsRead += low byte of the entry
 | |
| 
 | |
| 	// val1 := br0.peekTopBits(peekBits)
 | |
| 	MOVQ DI, CX
 | |
| 	MOVQ R11, R13
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v1 := table[val1&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br0.advance(uint8(v1.entry))
 | |
| 	MOVB CH, AH // AH = second output byte
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// these two writes get coalesced
 | |
| 	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
 | |
| 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
 | |
| 	MOVW AX, (BX) // one 16-bit store writes AL, then AH
 | |
| 
 | |
| 	// update the bitreader structure
 | |
| 	MOVQ R11, 32(R10)
 | |
| 	MOVB R12, 40(R10)
 | |
| 
 | |
| 	// br1.fillFast32()
 | |
| 	MOVQ    80(R10), R11
 | |
| 	MOVBQZX 88(R10), R12
 | |
| 	CMPQ    R12, $0x20
 | |
| 	JBE     skip_fill1
 | |
| 	MOVQ    72(R10), AX
 | |
| 	SUBQ    $0x20, R12
 | |
| 	SUBQ    $0x04, AX
 | |
| 	MOVQ    48(R10), R13
 | |
| 
 | |
| 	// b.value |= uint64(low) << (b.bitsRead & 63)
 | |
| 	MOVL (AX)(R13*1), R13
 | |
| 	MOVQ R12, CX
 | |
| 	SHLQ CL, R13
 | |
| 	MOVQ AX, 72(R10)
 | |
| 	ORQ  R13, R11
 | |
| 
 | |
| 	// exhausted += (br1.off < 4)
 | |
| 	CMPQ AX, $0x04
 | |
| 	ADCB $+0, DL
 | |
| 
 | |
| skip_fill1:
 | |
| 	// val0 := br1.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v0 := table[val0&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br1.advance(uint8(v0.entry))
 | |
| 	MOVB CH, AL
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// val1 := br1.peekTopBits(peekBits)
 | |
| 	MOVQ DI, CX
 | |
| 	MOVQ R11, R13
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v1 := table[val1&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br1.advance(uint8(v1.entry))
 | |
| 	MOVB CH, AH
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// these two writes get coalesced
 | |
| 	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
 | |
| 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
 | |
| 	MOVW AX, (BX)(R8*1)
 | |
| 
 | |
| 	// update the bitreader structure
 | |
| 	MOVQ R11, 80(R10)
 | |
| 	MOVB R12, 88(R10)
 | |
| 
 | |
| 	// br2.fillFast32()
 | |
| 	MOVQ    128(R10), R11
 | |
| 	MOVBQZX 136(R10), R12
 | |
| 	CMPQ    R12, $0x20
 | |
| 	JBE     skip_fill2
 | |
| 	MOVQ    120(R10), AX
 | |
| 	SUBQ    $0x20, R12
 | |
| 	SUBQ    $0x04, AX
 | |
| 	MOVQ    96(R10), R13
 | |
| 
 | |
| 	// b.value |= uint64(low) << (b.bitsRead & 63)
 | |
| 	MOVL (AX)(R13*1), R13
 | |
| 	MOVQ R12, CX
 | |
| 	SHLQ CL, R13
 | |
| 	MOVQ AX, 120(R10)
 | |
| 	ORQ  R13, R11
 | |
| 
 | |
| 	// exhausted += (br2.off < 4)
 | |
| 	CMPQ AX, $0x04
 | |
| 	ADCB $+0, DL
 | |
| 
 | |
| skip_fill2:
 | |
| 	// val0 := br2.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v0 := table[val0&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br2.advance(uint8(v0.entry))
 | |
| 	MOVB CH, AL
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// val1 := br2.peekTopBits(peekBits)
 | |
| 	MOVQ DI, CX
 | |
| 	MOVQ R11, R13
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v1 := table[val1&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br2.advance(uint8(v1.entry))
 | |
| 	MOVB CH, AH
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// these two writes get coalesced
 | |
| 	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
 | |
| 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
 | |
| 	MOVW AX, (BX)(R8*2)
 | |
| 
 | |
| 	// update the bitreader structure
 | |
| 	MOVQ R11, 128(R10)
 | |
| 	MOVB R12, 136(R10)
 | |
| 
 | |
| 	// br3.fillFast32()
 | |
| 	MOVQ    176(R10), R11
 | |
| 	MOVBQZX 184(R10), R12
 | |
| 	CMPQ    R12, $0x20
 | |
| 	JBE     skip_fill3
 | |
| 	MOVQ    168(R10), AX
 | |
| 	SUBQ    $0x20, R12
 | |
| 	SUBQ    $0x04, AX
 | |
| 	MOVQ    144(R10), R13
 | |
| 
 | |
| 	// b.value |= uint64(low) << (b.bitsRead & 63)
 | |
| 	MOVL (AX)(R13*1), R13
 | |
| 	MOVQ R12, CX
 | |
| 	SHLQ CL, R13
 | |
| 	MOVQ AX, 168(R10)
 | |
| 	ORQ  R13, R11
 | |
| 
 | |
| 	// exhausted += (br3.off < 4)
 | |
| 	CMPQ AX, $0x04
 | |
| 	ADCB $+0, DL
 | |
| 
 | |
| skip_fill3:
 | |
| 	// val0 := br3.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v0 := table[val0&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br3.advance(uint8(v0.entry))
 | |
| 	MOVB CH, AL
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// val1 := br3.peekTopBits(peekBits)
 | |
| 	MOVQ DI, CX
 | |
| 	MOVQ R11, R13
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v1 := table[val1&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br3.advance(uint8(v1.entry))
 | |
| 	MOVB CH, AH
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// these two writes get coalesced
 | |
| 	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
 | |
| 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
 | |
| 	LEAQ (R8)(R8*2), CX // CX = 3 * R8, the offset of the fourth stream
 | |
| 	MOVW AX, (BX)(CX*1)
 | |
| 
 | |
| 	// update the bitreader structure
 | |
| 	MOVQ  R11, 176(R10)
 | |
| 	MOVB  R12, 184(R10)
 | |
| 	ADDQ  $0x02, BX // two bytes were written to every stream
 | |
| 	TESTB DL, DL
 | |
| 	JZ    main_loop // run another pass while no exit condition was recorded
 | |
| 	MOVQ  ctx+0(FP), AX
 | |
| 	SUBQ  16(AX), BX // BX = bytes written to the first stream
 | |
| 	SHLQ  $0x02, BX // times four streams
 | |
| 	MOVQ  BX, 40(AX) // result is stored at ctx+40
 | |
| 	RET
 | |
| 
 | |
| // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
 | |
| TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
 | |
| 	// Each pass decodes four symbols from each of the four bit readers br0..br3 and stores every group of four in that reader's output stream. Preload values:
 | |
| 	MOVQ    ctx+0(FP), CX // CX = ctx
 | |
| 	MOVBQZX 8(CX), DI // DI = byte at ctx+8, used below as the peekBits shift count
 | |
| 	MOVQ    16(CX), BX // BX = ctx+16: current write position in the first output stream
 | |
| 	MOVQ    48(CX), SI // SI = ctx+48: bound that BX is compared against on every pass
 | |
| 	MOVQ    24(CX), R8 // R8 = ctx+24: byte distance between output streams (dstEvery in the comments below)
 | |
| 	MOVQ    32(CX), R9 // R9 = ctx+32: base of the table of 2-byte entries
 | |
| 	MOVQ    (CX), R10 // R10 = ctx+0: base of the bit reader state, 48 bytes per reader
 | |
| 
 | |
| 	// Main loop: DL collects the exit conditions and is tested at the bottom of every pass
 | |
| main_loop:
 | |
| 	XORL  DX, DX
 | |
| 	CMPQ  BX, SI
 | |
| 	SETGE DL // DL = 1 once BX >= SI (signed compare)
 | |
| 
 | |
| 	// br0.fillFast32(): R11 = value, R12 = bitsRead; refill only when bitsRead > 32
 | |
| 	MOVQ    32(R10), R11
 | |
| 	MOVBQZX 40(R10), R12
 | |
| 	CMPQ    R12, $0x20
 | |
| 	JBE     skip_fill0
 | |
| 	MOVQ    24(R10), R13 // R13 = br0.off
 | |
| 	SUBQ    $0x20, R12
 | |
| 	SUBQ    $0x04, R13
 | |
| 	MOVQ    (R10), R14 // R14 = base address the reader loads from
 | |
| 
 | |
| 	// b.value |= uint64(low) << (b.bitsRead & 63)
 | |
| 	MOVL (R13)(R14*1), R14 // R14 = 4 bytes at base+off (the 32-bit load zero-extends)
 | |
| 	MOVQ R12, CX
 | |
| 	SHLQ CL, R14
 | |
| 	MOVQ R13, 24(R10) // store the decremented off back
 | |
| 	ORQ  R14, R11
 | |
| 
 | |
| 	// exhausted += (br0.off < 4)
 | |
| 	CMPQ R13, $0x04
 | |
| 	ADCB $+0, DL // adds the carry set by the CMPQ above when off < 4 (unsigned)
 | |
| 
 | |
| skip_fill0:
 | |
| 	// val0 := br0.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v0 := table[val0&mask]
 | |
| 	MOVW (R9)(R13*2), CX // CL = low byte of the entry, CH = high byte
 | |
| 
 | |
| 	// br0.advance(uint8(v0.entry))
 | |
| 	MOVB CH, AL // AL = v0 output byte
 | |
| 	SHLQ CL, R11 // value <<= low byte of the entry
 | |
| 	ADDB CL, R12 // bitsRead += low byte of the entry
 | |
| 
 | |
| 	// val1 := br0.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v1 := table[val1&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br0.advance(uint8(v1.entry))
 | |
| 	MOVB   CH, AH // AH = v1 output byte
 | |
| 	SHLQ   CL, R11
 | |
| 	ADDB   CL, R12
 | |
| 	BSWAPL AX // v0 and v1 move to the two highest bytes of EAX, freeing AL/AH for the next pair
 | |
| 
 | |
| 	// val2 := br0.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v2 := table[val2&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br0.advance(uint8(v2.entry))
 | |
| 	MOVB CH, AH // AH = v2 output byte
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// val3 := br0.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v3 := table[val3&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br0.advance(uint8(v3.entry))
 | |
| 	MOVB   CH, AL // AL = v3 output byte
 | |
| 	SHLQ   CL, R11
 | |
| 	ADDB   CL, R12
 | |
| 	BSWAPL AX // second swap leaves v0, v1, v2, v3 in ascending byte order
 | |
| 
 | |
| 	// these four writes get coalesced
 | |
| 	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
 | |
| 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
 | |
| 	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
 | |
| 	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
 | |
| 	MOVL AX, (BX) // one 32-bit store writes all four bytes
 | |
| 
 | |
| 	// update the bitreader structure
 | |
| 	MOVQ R11, 32(R10)
 | |
| 	MOVB R12, 40(R10)
 | |
| 
 | |
| 	// br1.fillFast32()
 | |
| 	MOVQ    80(R10), R11
 | |
| 	MOVBQZX 88(R10), R12
 | |
| 	CMPQ    R12, $0x20
 | |
| 	JBE     skip_fill1
 | |
| 	MOVQ    72(R10), R13
 | |
| 	SUBQ    $0x20, R12
 | |
| 	SUBQ    $0x04, R13
 | |
| 	MOVQ    48(R10), R14
 | |
| 
 | |
| 	// b.value |= uint64(low) << (b.bitsRead & 63)
 | |
| 	MOVL (R13)(R14*1), R14
 | |
| 	MOVQ R12, CX
 | |
| 	SHLQ CL, R14
 | |
| 	MOVQ R13, 72(R10)
 | |
| 	ORQ  R14, R11
 | |
| 
 | |
| 	// exhausted += (br1.off < 4)
 | |
| 	CMPQ R13, $0x04
 | |
| 	ADCB $+0, DL
 | |
| 
 | |
| skip_fill1:
 | |
| 	// val0 := br1.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v0 := table[val0&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br1.advance(uint8(v0.entry))
 | |
| 	MOVB CH, AL
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// val1 := br1.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v1 := table[val1&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br1.advance(uint8(v1.entry))
 | |
| 	MOVB   CH, AH
 | |
| 	SHLQ   CL, R11
 | |
| 	ADDB   CL, R12
 | |
| 	BSWAPL AX
 | |
| 
 | |
| 	// val2 := br1.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v2 := table[val2&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br1.advance(uint8(v2.entry))
 | |
| 	MOVB CH, AH
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// val3 := br1.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v3 := table[val3&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br1.advance(uint8(v3.entry))
 | |
| 	MOVB   CH, AL
 | |
| 	SHLQ   CL, R11
 | |
| 	ADDB   CL, R12
 | |
| 	BSWAPL AX
 | |
| 
 | |
| 	// these four writes get coalesced
 | |
| 	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
 | |
| 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
 | |
| 	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
 | |
| 	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
 | |
| 	MOVL AX, (BX)(R8*1)
 | |
| 
 | |
| 	// update the bitreader structure
 | |
| 	MOVQ R11, 80(R10)
 | |
| 	MOVB R12, 88(R10)
 | |
| 
 | |
| 	// br2.fillFast32()
 | |
| 	MOVQ    128(R10), R11
 | |
| 	MOVBQZX 136(R10), R12
 | |
| 	CMPQ    R12, $0x20
 | |
| 	JBE     skip_fill2
 | |
| 	MOVQ    120(R10), R13
 | |
| 	SUBQ    $0x20, R12
 | |
| 	SUBQ    $0x04, R13
 | |
| 	MOVQ    96(R10), R14
 | |
| 
 | |
| 	// b.value |= uint64(low) << (b.bitsRead & 63)
 | |
| 	MOVL (R13)(R14*1), R14
 | |
| 	MOVQ R12, CX
 | |
| 	SHLQ CL, R14
 | |
| 	MOVQ R13, 120(R10)
 | |
| 	ORQ  R14, R11
 | |
| 
 | |
| 	// exhausted += (br2.off < 4)
 | |
| 	CMPQ R13, $0x04
 | |
| 	ADCB $+0, DL
 | |
| 
 | |
| skip_fill2:
 | |
| 	// val0 := br2.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v0 := table[val0&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br2.advance(uint8(v0.entry))
 | |
| 	MOVB CH, AL
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// val1 := br2.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v1 := table[val1&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br2.advance(uint8(v1.entry))
 | |
| 	MOVB   CH, AH
 | |
| 	SHLQ   CL, R11
 | |
| 	ADDB   CL, R12
 | |
| 	BSWAPL AX
 | |
| 
 | |
| 	// val2 := br2.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v2 := table[val2&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br2.advance(uint8(v2.entry))
 | |
| 	MOVB CH, AH
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// val3 := br2.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v3 := table[val3&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br2.advance(uint8(v3.entry))
 | |
| 	MOVB   CH, AL
 | |
| 	SHLQ   CL, R11
 | |
| 	ADDB   CL, R12
 | |
| 	BSWAPL AX
 | |
| 
 | |
| 	// these four writes get coalesced
 | |
| 	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
 | |
| 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
 | |
| 	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
 | |
| 	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
 | |
| 	MOVL AX, (BX)(R8*2)
 | |
| 
 | |
| 	// update the bitreader structure
 | |
| 	MOVQ R11, 128(R10)
 | |
| 	MOVB R12, 136(R10)
 | |
| 
 | |
| 	// br3.fillFast32()
 | |
| 	MOVQ    176(R10), R11
 | |
| 	MOVBQZX 184(R10), R12
 | |
| 	CMPQ    R12, $0x20
 | |
| 	JBE     skip_fill3
 | |
| 	MOVQ    168(R10), R13
 | |
| 	SUBQ    $0x20, R12
 | |
| 	SUBQ    $0x04, R13
 | |
| 	MOVQ    144(R10), R14
 | |
| 
 | |
| 	// b.value |= uint64(low) << (b.bitsRead & 63)
 | |
| 	MOVL (R13)(R14*1), R14
 | |
| 	MOVQ R12, CX
 | |
| 	SHLQ CL, R14
 | |
| 	MOVQ R13, 168(R10)
 | |
| 	ORQ  R14, R11
 | |
| 
 | |
| 	// exhausted += (br3.off < 4)
 | |
| 	CMPQ R13, $0x04
 | |
| 	ADCB $+0, DL
 | |
| 
 | |
| skip_fill3:
 | |
| 	// val0 := br3.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v0 := table[val0&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br3.advance(uint8(v0.entry))
 | |
| 	MOVB CH, AL
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// val1 := br3.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v1 := table[val1&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br3.advance(uint8(v1.entry))
 | |
| 	MOVB   CH, AH
 | |
| 	SHLQ   CL, R11
 | |
| 	ADDB   CL, R12
 | |
| 	BSWAPL AX
 | |
| 
 | |
| 	// val2 := br3.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v2 := table[val2&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br3.advance(uint8(v2.entry))
 | |
| 	MOVB CH, AH
 | |
| 	SHLQ CL, R11
 | |
| 	ADDB CL, R12
 | |
| 
 | |
| 	// val3 := br3.peekTopBits(peekBits)
 | |
| 	MOVQ R11, R13
 | |
| 	MOVQ DI, CX
 | |
| 	SHRQ CL, R13
 | |
| 
 | |
| 	// v3 := table[val3&mask]
 | |
| 	MOVW (R9)(R13*2), CX
 | |
| 
 | |
| 	// br3.advance(uint8(v3.entry))
 | |
| 	MOVB   CH, AL
 | |
| 	SHLQ   CL, R11
 | |
| 	ADDB   CL, R12
 | |
| 	BSWAPL AX
 | |
| 
 | |
| 	// these four writes get coalesced
 | |
| 	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
 | |
| 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
 | |
| 	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
 | |
| 	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
 | |
| 	LEAQ (R8)(R8*2), CX // CX = 3 * R8, the offset of the fourth stream
 | |
| 	MOVL AX, (BX)(CX*1)
 | |
| 
 | |
| 	// update the bitreader structure
 | |
| 	MOVQ  R11, 176(R10)
 | |
| 	MOVB  R12, 184(R10)
 | |
| 	ADDQ  $0x04, BX // four bytes were written to every stream
 | |
| 	TESTB DL, DL
 | |
| 	JZ    main_loop // run another pass while no exit condition was recorded
 | |
| 	MOVQ  ctx+0(FP), AX
 | |
| 	SUBQ  16(AX), BX // BX = bytes written to the first stream
 | |
| 	SHLQ  $0x02, BX // times four streams
 | |
| 	MOVQ  BX, 40(AX) // result is stored at ctx+40
 | |
| 	RET
 | |
| 
 | |
| // func decompress1x_main_loop_amd64(ctx *decompress1xContext)
 | |
| TEXT ·decompress1x_main_loop_amd64(SB), $0-8
 | |
| 	MOVQ    ctx+0(FP), CX // CX = ctx; the loop below decodes four symbols per pass from a single bit reader
 | |
| 	MOVQ    16(CX), DX // DX = ctx+16: output write pointer
 | |
| 	MOVQ    24(CX), BX // BX = ctx+24: output size in bytes (added to DX below)
 | |
| 	CMPQ    BX, $0x04
 | |
| 	JB      error_max_decoded_size_exceeded // fewer than 4 output bytes (unsigned): report the error
 | |
| 	LEAQ    (DX)(BX*1), BX // BX = end of the output buffer
 | |
| 	MOVQ    (CX), SI // SI = ctx+0: pointer to the bit reader state
 | |
| 	MOVQ    (SI), R8 // R8 = base address the reader loads from
 | |
| 	MOVQ    24(SI), R9 // R9 = reader offset; presumably the same off/value/bitsRead layout as in the 4x loops - confirm against the Go struct
 | |
| 	MOVQ    32(SI), R10 // R10 = reader bit buffer
 | |
| 	MOVBQZX 40(SI), R11 // R11 = reader bit count (one byte, zero-extended)
 | |
| 	MOVQ    32(CX), SI // SI = ctx+32: base of the table of 2-byte entries
 | |
| 	MOVBQZX 8(CX), DI // DI = byte at ctx+8, used as the right-shift count for table lookups
 | |
| 	JMP     loop_condition
 | |
| 
 | |
| main_loop:
 | |
| 	// Check if we have room for 4 bytes in the output buffer; DX+4 >= BX (signed compare) is treated as an error
 | |
| 	LEAQ 4(DX), CX
 | |
| 	CMPQ CX, BX
 | |
| 	JGE  error_max_decoded_size_exceeded
 | |
| 
 | |
| 	// Decode 4 values; first refill the bit buffer with 4 more input bytes when the bit count is >= 32
 | |
| 	CMPQ R11, $0x20
 | |
| 	JL   bitReader_fillFast_1_end
 | |
| 	SUBQ $0x20, R11
 | |
| 	SUBQ $0x04, R9
 | |
| 	MOVL (R8)(R9*1), R12 // R12 = 4 bytes at base+offset (the 32-bit load zero-extends)
 | |
| 	MOVQ R11, CX
 | |
| 	SHLQ CL, R12
 | |
| 	ORQ  R12, R10
 | |
| 
 | |
| bitReader_fillFast_1_end:
 | |
| 	MOVQ    DI, CX
 | |
| 	MOVQ    R10, R12
 | |
| 	SHRQ    CL, R12 // R12 = table index taken from the top bits of the buffer
 | |
| 	MOVW    (SI)(R12*2), CX // CL = low byte of the entry, CH = high byte
 | |
| 	MOVB    CH, AL // AL = first output byte
 | |
| 	MOVBQZX CL, CX
 | |
| 	ADDQ    CX, R11 // bit count += low byte of the entry
 | |
| 	SHLQ    CL, R10 // bit buffer <<= low byte of the entry
 | |
| 	MOVQ    DI, CX
 | |
| 	MOVQ    R10, R12
 | |
| 	SHRQ    CL, R12
 | |
| 	MOVW    (SI)(R12*2), CX
 | |
| 	MOVB    CH, AH // AH = second output byte
 | |
| 	MOVBQZX CL, CX
 | |
| 	ADDQ    CX, R11
 | |
| 	SHLQ    CL, R10
 | |
| 	BSWAPL  AX // the first two bytes move to the two highest bytes of EAX, freeing AL/AH
 | |
| 	CMPQ    R11, $0x20
 | |
| 	JL      bitReader_fillFast_2_end
 | |
| 	SUBQ    $0x20, R11
 | |
| 	SUBQ    $0x04, R9
 | |
| 	MOVL    (R8)(R9*1), R12
 | |
| 	MOVQ    R11, CX
 | |
| 	SHLQ    CL, R12
 | |
| 	ORQ     R12, R10
 | |
| 
 | |
| bitReader_fillFast_2_end:
 | |
| 	MOVQ    DI, CX
 | |
| 	MOVQ    R10, R12
 | |
| 	SHRQ    CL, R12
 | |
| 	MOVW    (SI)(R12*2), CX
 | |
| 	MOVB    CH, AH // AH = third output byte
 | |
| 	MOVBQZX CL, CX
 | |
| 	ADDQ    CX, R11
 | |
| 	SHLQ    CL, R10
 | |
| 	MOVQ    DI, CX
 | |
| 	MOVQ    R10, R12
 | |
| 	SHRQ    CL, R12
 | |
| 	MOVW    (SI)(R12*2), CX
 | |
| 	MOVB    CH, AL // AL = fourth output byte
 | |
| 	MOVBQZX CL, CX
 | |
| 	ADDQ    CX, R11
 | |
| 	SHLQ    CL, R10
 | |
| 	BSWAPL  AX // second swap leaves the four bytes in decode order, first byte lowest
 | |
| 
 | |
| 	// Store the decoded values with one 32-bit write and advance the output pointer
 | |
| 	MOVL AX, (DX)
 | |
| 	ADDQ $0x04, DX
 | |
| 
 | |
| loop_condition:
 | |
| 	CMPQ R9, $0x08
 | |
| 	JGE  main_loop // keep going while the reader offset is >= 8 (signed compare)
 | |
| 
 | |
| 	// Update ctx structure: ctx+40 receives the number of bytes written, then the reader state is written back
 | |
| 	MOVQ ctx+0(FP), AX
 | |
| 	SUBQ 16(AX), DX
 | |
| 	MOVQ DX, 40(AX)
 | |
| 	MOVQ (AX), AX
 | |
| 	MOVQ R9, 24(AX)
 | |
| 	MOVQ R10, 32(AX)
 | |
| 	MOVB R11, 40(AX)
 | |
| 	RET
 | |
| 
 | |
| 	// Report error: ctx+40 is set to -1 and the reader state is not written back
 | |
| error_max_decoded_size_exceeded:
 | |
| 	MOVQ ctx+0(FP), AX
 | |
| 	MOVQ $-1, CX
 | |
| 	MOVQ CX, 40(AX)
 | |
| 	RET
 | |
| 
 | |
| // func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
 | |
| // Requires: BMI2
 | |
| TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
 | |
| 	MOVQ    ctx+0(FP), CX // CX = ctx; the loop below decodes four symbols per pass from a single bit reader
 | |
| 	MOVQ    16(CX), DX // DX = ctx+16: output write pointer
 | |
| 	MOVQ    24(CX), BX // BX = ctx+24: output size in bytes (added to DX below)
 | |
| 	CMPQ    BX, $0x04
 | |
| 	JB      error_max_decoded_size_exceeded // fewer than 4 output bytes (unsigned): report the error
 | |
| 	LEAQ    (DX)(BX*1), BX // BX = end of the output buffer
 | |
| 	MOVQ    (CX), SI // SI = ctx+0: pointer to the bit reader state
 | |
| 	MOVQ    (SI), R8 // R8 = base address the reader loads from
 | |
| 	MOVQ    24(SI), R9 // R9 = reader offset; presumably the same off/value/bitsRead layout as in the 4x loops - confirm against the Go struct
 | |
| 	MOVQ    32(SI), R10 // R10 = reader bit buffer
 | |
| 	MOVBQZX 40(SI), R11 // R11 = reader bit count (one byte, zero-extended)
 | |
| 	MOVQ    32(CX), SI // SI = ctx+32: base of the table of 2-byte entries
 | |
| 	MOVBQZX 8(CX), DI // DI = byte at ctx+8, used as the right-shift count for table lookups
 | |
| 	JMP     loop_condition
 | |
| 
 | |
| main_loop:
 | |
| 	// Check if we have room for 4 bytes in the output buffer; DX+4 >= BX (signed compare) is treated as an error
 | |
| 	LEAQ 4(DX), CX
 | |
| 	CMPQ CX, BX
 | |
| 	JGE  error_max_decoded_size_exceeded
 | |
| 
 | |
| 	// Decode 4 values; first refill the bit buffer with 4 more input bytes when the bit count is >= 32
 | |
| 	CMPQ  R11, $0x20
 | |
| 	JL    bitReader_fillFast_1_end
 | |
| 	SUBQ  $0x20, R11
 | |
| 	SUBQ  $0x04, R9
 | |
| 	MOVL  (R8)(R9*1), CX // CX = 4 bytes at base+offset (the 32-bit load zero-extends)
 | |
| 	SHLXQ R11, CX, CX // shift count comes straight from R11, no copy into CL needed
 | |
| 	ORQ   CX, R10
 | |
| 
 | |
| bitReader_fillFast_1_end:
 | |
| 	SHRXQ   DI, R10, CX // CX = table index taken from the top bits of the buffer
 | |
| 	MOVW    (SI)(CX*2), CX // CL = low byte of the entry, CH = high byte
 | |
| 	MOVB    CH, AL // AL = first output byte
 | |
| 	MOVBQZX CL, CX
 | |
| 	ADDQ    CX, R11 // bit count += low byte of the entry
 | |
| 	SHLXQ   CX, R10, R10 // bit buffer <<= low byte of the entry
 | |
| 	SHRXQ   DI, R10, CX
 | |
| 	MOVW    (SI)(CX*2), CX
 | |
| 	MOVB    CH, AH // AH = second output byte
 | |
| 	MOVBQZX CL, CX
 | |
| 	ADDQ    CX, R11
 | |
| 	SHLXQ   CX, R10, R10
 | |
| 	BSWAPL  AX // the first two bytes move to the two highest bytes of EAX, freeing AL/AH
 | |
| 	CMPQ    R11, $0x20
 | |
| 	JL      bitReader_fillFast_2_end
 | |
| 	SUBQ    $0x20, R11
 | |
| 	SUBQ    $0x04, R9
 | |
| 	MOVL    (R8)(R9*1), CX
 | |
| 	SHLXQ   R11, CX, CX
 | |
| 	ORQ     CX, R10
 | |
| 
 | |
| bitReader_fillFast_2_end:
 | |
| 	SHRXQ   DI, R10, CX
 | |
| 	MOVW    (SI)(CX*2), CX
 | |
| 	MOVB    CH, AH // AH = third output byte
 | |
| 	MOVBQZX CL, CX
 | |
| 	ADDQ    CX, R11
 | |
| 	SHLXQ   CX, R10, R10
 | |
| 	SHRXQ   DI, R10, CX
 | |
| 	MOVW    (SI)(CX*2), CX
 | |
| 	MOVB    CH, AL // AL = fourth output byte
 | |
| 	MOVBQZX CL, CX
 | |
| 	ADDQ    CX, R11
 | |
| 	SHLXQ   CX, R10, R10
 | |
| 	BSWAPL  AX // second swap leaves the four bytes in decode order, first byte lowest
 | |
| 
 | |
| 	// Store the decoded values with one 32-bit write and advance the output pointer
 | |
| 	MOVL AX, (DX)
 | |
| 	ADDQ $0x04, DX
 | |
| 
 | |
| loop_condition:
 | |
| 	CMPQ R9, $0x08
 | |
| 	JGE  main_loop // keep going while the reader offset is >= 8 (signed compare)
 | |
| 
 | |
| 	// Update ctx structure: ctx+40 receives the number of bytes written, then the reader state is written back
 | |
| 	MOVQ ctx+0(FP), AX
 | |
| 	SUBQ 16(AX), DX
 | |
| 	MOVQ DX, 40(AX)
 | |
| 	MOVQ (AX), AX
 | |
| 	MOVQ R9, 24(AX)
 | |
| 	MOVQ R10, 32(AX)
 | |
| 	MOVB R11, 40(AX)
 | |
| 	RET
 | |
| 
 | |
| 	// Report error: ctx+40 is set to -1 and the reader state is not written back
 | |
| error_max_decoded_size_exceeded:
 | |
| 	MOVQ ctx+0(FP), AX
 | |
| 	MOVQ $-1, CX
 | |
| 	MOVQ CX, 40(AX)
 | |
| 	RET
 |