// Mirror of https://github.com/SagerNet/sing-tun.git (synced 2025-04-03 20:07:40 +03:00).
// NOTE: this file is Go (Plan 9) x86-64 assembly (851 lines, 18 KiB); the mirror's
// "ArmAsm" language label is incorrect.
// Code generated by command: go run generate_amd64.go -out checksum_generated_amd64.s -stubs checksum_generated_amd64.go. DO NOT EDIT.
#include "textflag.h"
// xmmLoadMasks holds seven 16-byte tail masks for overlapped reads.
// Entry k (byte offset k*16, k = 0..6) keeps the trailing 2*(k+1) bytes of a
// 16-byte vector and zeroes the rest. The checksum tail handlers index it as
// offset BX*8-16 where BX is the (even, 2..14) number of valid bytes, i.e.
// VPAND/PAND with this mask discards the duplicated leading bytes of an
// overlapped 16-byte load from the end of the buffer.
DATA xmmLoadMasks<>+0(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff"
DATA xmmLoadMasks<>+16(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff"
DATA xmmLoadMasks<>+32(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff"
DATA xmmLoadMasks<>+48(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff"
DATA xmmLoadMasks<>+64(SB)/16, $"\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
DATA xmmLoadMasks<>+80(SB)/16, $"\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
DATA xmmLoadMasks<>+96(SB)/16, $"\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
GLOBL xmmLoadMasks<>(SB), RODATA|NOPTR, $112
// func checksumAVX2(b []byte, initial uint16) uint16
// Requires: AVX, AVX2, BMI2
//
// Sums b as 16-bit words with end-around carry (ones'-complement style),
// seeded with `initial`, and returns the folded 16-bit result. `initial`
// is byte-swapped on entry and the result swapped back on exit (XCHGB),
// so all intermediate arithmetic runs in host little-endian order.
//
// Register roles:
//   AX       - main 64-bit accumulator, folded to 16 bits before return
//   DX       - current read pointer into b
//   BX       - bytes (later 8-byte units) remaining
//   CX/SI/DI - scratch and secondary accumulators
//   Y0-Y3    - four independent SIMD accumulator chains (large buffers)
TEXT ·checksumAVX2(SB), NOSPLIT|NOFRAME, $0-34
	MOVWQZX initial+24(FP), AX
	XCHGB   AH, AL
	MOVQ    b_base+0(FP), DX
	MOVQ    b_len+8(FP), BX

	// handle odd length buffers; they are difficult to handle in general
	TESTQ $0x00000001, BX
	JZ    lengthIsEven

	// fold in the final byte, leaving an even number of bytes to process
	MOVBQZX -1(DX)(BX*1), CX
	DECQ    BX
	ADDQ    CX, AX

lengthIsEven:
	// handle tiny buffers (<=31 bytes) specially
	CMPQ BX, $0x1f
	JGT  bufferIsNotTiny
	XORQ CX, CX
	XORQ SI, SI
	XORQ DI, DI

	// The tiny path decodes the length bit by bit: each SHRQ moves one
	// length bit into CF, selecting whether a 2/4/8/16-byte chunk exists.
	// shift twice to start because length is guaranteed to be even
	// n = n >> 2; CF = originalN & 2
	SHRQ $0x02, BX
	JNC  handleTiny4

	// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
	MOVWQZX (DX), CX
	ADDQ    $0x02, DX

handleTiny4:
	// n = n >> 1; CF = originalN & 4
	SHRQ $0x01, BX
	JNC  handleTiny8

	// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
	MOVLQZX (DX), SI
	ADDQ    $0x04, DX

handleTiny8:
	// n = n >> 1; CF = originalN & 8
	SHRQ $0x01, BX
	JNC  handleTiny16

	// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
	MOVQ (DX), DI
	ADDQ $0x08, DX

handleTiny16:
	// n = n >> 1; CF = originalN & 16
	// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
	SHRQ $0x01, BX
	JNC  handleTinyFinish
	ADDQ (DX), AX
	ADCQ 8(DX), AX

handleTinyFinish:
	// CF should be included from the previous add, so we use ADCQ.
	// If we arrived via the JNC above, then CF=0 due to the branch condition,
	// so ADCQ will still produce the correct result.
	ADCQ CX, AX
	ADCQ SI, AX
	ADCQ DI, AX
	JMP  foldAndReturn

bufferIsNotTiny:
	// skip all SIMD for small buffers (< 256 bytes)
	CMPQ BX, $0x00000100
	JGE  startSIMD

	// Accumulate carries in this register. It is never expected to overflow.
	XORQ SI, SI

	// We will perform an overlapped read for buffers with length not a multiple of 8.
	// Overlapped in this context means some memory will be read twice, but a shift will
	// eliminate the duplicated data. This extra read is performed at the end of the buffer to
	// preserve any alignment that may exist for the start of the buffer.
	MOVQ BX, CX
	SHRQ $0x03, BX          // BX = number of whole 8-byte units
	ANDQ $0x07, CX          // CX = leftover bytes (0..7)
	JZ   handleRemaining8
	LEAQ (DX)(BX*8), DI
	MOVQ -8(DI)(CX*1), DI   // overlapped 8-byte read ending at buffer end

	// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
	SHLQ $0x03, CX
	NEGQ CX
	ADDQ $0x40, CX
	SHRQ CL, DI
	ADDQ DI, AX
	ADCQ $0x00, SI

	// The handleRemainingN ladder decodes the 8-byte-unit count bit by bit,
	// mirroring the tiny path: each SHRQ exposes one bit in CF.
handleRemaining8:
	SHRQ $0x01, BX
	JNC  handleRemaining16
	ADDQ (DX), AX
	ADCQ $0x00, SI
	ADDQ $0x08, DX

handleRemaining16:
	SHRQ $0x01, BX
	JNC  handleRemaining32
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x10, DX

handleRemaining32:
	SHRQ $0x01, BX
	JNC  handleRemaining64
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x20, DX

handleRemaining64:
	SHRQ $0x01, BX
	JNC  handleRemaining128
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x40, DX

handleRemaining128:
	SHRQ $0x01, BX
	JNC  handleRemainingComplete
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ 64(DX), AX
	ADCQ 72(DX), AX
	ADCQ 80(DX), AX
	ADCQ 88(DX), AX
	ADCQ 96(DX), AX
	ADCQ 104(DX), AX
	ADCQ 112(DX), AX
	ADCQ 120(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x80, DX

handleRemainingComplete:
	ADDQ SI, AX
	JMP  foldAndReturn

startSIMD:
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
	MOVQ  BX, CX

	// Update number of bytes remaining after the loop completes
	ANDQ $0xff, BX

	// Number of 256 byte iterations
	SHRQ $0x08, CX
	JZ   smallLoop

bigLoop:
	// Each VPMOVZXWD zero-extends eight 16-bit words to 32-bit lanes, so
	// the VPADDD lanes cannot overflow for any realistic buffer length.
	// Loads rotate across four accumulators (Y0-Y3) to keep the add
	// chains independent.
	VPMOVZXWD (DX), Y4
	VPADDD    Y4, Y0, Y0
	VPMOVZXWD 16(DX), Y4
	VPADDD    Y4, Y1, Y1
	VPMOVZXWD 32(DX), Y4
	VPADDD    Y4, Y2, Y2
	VPMOVZXWD 48(DX), Y4
	VPADDD    Y4, Y3, Y3
	VPMOVZXWD 64(DX), Y4
	VPADDD    Y4, Y0, Y0
	VPMOVZXWD 80(DX), Y4
	VPADDD    Y4, Y1, Y1
	VPMOVZXWD 96(DX), Y4
	VPADDD    Y4, Y2, Y2
	VPMOVZXWD 112(DX), Y4
	VPADDD    Y4, Y3, Y3
	VPMOVZXWD 128(DX), Y4
	VPADDD    Y4, Y0, Y0
	VPMOVZXWD 144(DX), Y4
	VPADDD    Y4, Y1, Y1
	VPMOVZXWD 160(DX), Y4
	VPADDD    Y4, Y2, Y2
	VPMOVZXWD 176(DX), Y4
	VPADDD    Y4, Y3, Y3
	VPMOVZXWD 192(DX), Y4
	VPADDD    Y4, Y0, Y0
	VPMOVZXWD 208(DX), Y4
	VPADDD    Y4, Y1, Y1
	VPMOVZXWD 224(DX), Y4
	VPADDD    Y4, Y2, Y2
	VPMOVZXWD 240(DX), Y4
	VPADDD    Y4, Y3, Y3
	ADDQ      $0x00000100, DX
	DECQ      CX
	JNZ       bigLoop
	CMPQ      BX, $0x10
	JLT       doneSmallLoop

	// now read a single 16 byte unit of data at a time
smallLoop:
	VPMOVZXWD (DX), Y4
	VPADDD    Y4, Y0, Y0
	ADDQ      $0x10, DX
	SUBQ      $0x10, BX
	CMPQ      BX, $0x10
	JGE       smallLoop

doneSmallLoop:
	CMPQ BX, $0x00
	JE   doneSIMD

	// There are between 1 and 15 bytes remaining. Perform an overlapped read.
	// The mask (indexed BX*8-16) zeroes the duplicated leading bytes.
	LEAQ      xmmLoadMasks<>+0(SB), CX
	VMOVDQU   -16(DX)(BX*1), X4
	VPAND     -16(CX)(BX*8), X4, X4
	VPMOVZXWD X4, Y4
	VPADDD    Y4, Y0, Y0

doneSIMD:
	// Multi-chain loop is done, combine the accumulators
	VPADDD Y1, Y0, Y0
	VPADDD Y2, Y0, Y0
	VPADDD Y3, Y0, Y0

	// extract the YMM into a pair of XMM and sum them
	VEXTRACTI128 $0x01, Y0, X1
	VPADDD       X0, X1, X0

	// extract the XMM into GP64
	VPEXTRQ $0x00, X0, CX
	VPEXTRQ $0x01, X0, DX

	// no more AVX code, clear upper registers to avoid SSE slowdowns
	VZEROUPPER
	ADDQ CX, AX
	ADCQ DX, AX

foldAndReturn:
	// add the pending CF and fold 64 -> 32 -> 16 bits with end-around
	// carry. RORX (BMI2) is used because it does not disturb CF.
	RORXQ $0x20, AX, CX
	ADCL  CX, AX
	RORXL $0x10, AX, CX
	ADCW  CX, AX
	ADCW  $0x00, AX

	// restore the byte order expected by the caller (see entry XCHGB)
	XCHGB AH, AL
	MOVW  AX, ret+32(FP)
	RET
// func checksumSSE2(b []byte, initial uint16) uint16
// Requires: SSE2
//
// SSE2 fallback for checksumAVX2: same end-around-carry 16-bit word sum,
// same tiny/small scalar paths; only the >=256-byte SIMD section differs.
// Word-to-dword zero extension is done with PUNPCKL/HWL against X4, which
// is zeroed once at startSIMD and never written afterwards.
//
// Register roles:
//   AX       - main 64-bit accumulator, folded to 16 bits before return
//   DX       - current read pointer into b
//   BX       - bytes (later 8-byte units) remaining
//   CX/SI/DI - scratch and secondary accumulators
//   X0-X3    - SIMD accumulator chains; X4 = zero; X5/X6 = scratch
TEXT ·checksumSSE2(SB), NOSPLIT|NOFRAME, $0-34
	MOVWQZX initial+24(FP), AX
	XCHGB   AH, AL          // work in host order; swapped back before return
	MOVQ    b_base+0(FP), DX
	MOVQ    b_len+8(FP), BX

	// handle odd length buffers; they are difficult to handle in general
	TESTQ $0x00000001, BX
	JZ    lengthIsEven

	// fold in the final byte, leaving an even number of bytes to process
	MOVBQZX -1(DX)(BX*1), CX
	DECQ    BX
	ADDQ    CX, AX

lengthIsEven:
	// handle tiny buffers (<=31 bytes) specially
	CMPQ BX, $0x1f
	JGT  bufferIsNotTiny
	XORQ CX, CX
	XORQ SI, SI
	XORQ DI, DI

	// Decode the length bit by bit; each SHRQ moves one bit into CF.
	// shift twice to start because length is guaranteed to be even
	// n = n >> 2; CF = originalN & 2
	SHRQ $0x02, BX
	JNC  handleTiny4

	// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
	MOVWQZX (DX), CX
	ADDQ    $0x02, DX

handleTiny4:
	// n = n >> 1; CF = originalN & 4
	SHRQ $0x01, BX
	JNC  handleTiny8

	// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
	MOVLQZX (DX), SI
	ADDQ    $0x04, DX

handleTiny8:
	// n = n >> 1; CF = originalN & 8
	SHRQ $0x01, BX
	JNC  handleTiny16

	// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
	MOVQ (DX), DI
	ADDQ $0x08, DX

handleTiny16:
	// n = n >> 1; CF = originalN & 16
	// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
	SHRQ $0x01, BX
	JNC  handleTinyFinish
	ADDQ (DX), AX
	ADCQ 8(DX), AX

handleTinyFinish:
	// CF should be included from the previous add, so we use ADCQ.
	// If we arrived via the JNC above, then CF=0 due to the branch condition,
	// so ADCQ will still produce the correct result.
	ADCQ CX, AX
	ADCQ SI, AX
	ADCQ DI, AX
	JMP  foldAndReturn

bufferIsNotTiny:
	// skip all SIMD for small buffers (< 256 bytes)
	CMPQ BX, $0x00000100
	JGE  startSIMD

	// Accumulate carries in this register. It is never expected to overflow.
	XORQ SI, SI

	// We will perform an overlapped read for buffers with length not a multiple of 8.
	// Overlapped in this context means some memory will be read twice, but a shift will
	// eliminate the duplicated data. This extra read is performed at the end of the buffer to
	// preserve any alignment that may exist for the start of the buffer.
	MOVQ BX, CX
	SHRQ $0x03, BX          // BX = number of whole 8-byte units
	ANDQ $0x07, CX          // CX = leftover bytes (0..7)
	JZ   handleRemaining8
	LEAQ (DX)(BX*8), DI
	MOVQ -8(DI)(CX*1), DI   // overlapped 8-byte read ending at buffer end

	// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
	SHLQ $0x03, CX
	NEGQ CX
	ADDQ $0x40, CX
	SHRQ CL, DI
	ADDQ DI, AX
	ADCQ $0x00, SI

	// Decode the 8-byte-unit count bit by bit, as in the tiny path.
handleRemaining8:
	SHRQ $0x01, BX
	JNC  handleRemaining16
	ADDQ (DX), AX
	ADCQ $0x00, SI
	ADDQ $0x08, DX

handleRemaining16:
	SHRQ $0x01, BX
	JNC  handleRemaining32
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x10, DX

handleRemaining32:
	SHRQ $0x01, BX
	JNC  handleRemaining64
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x20, DX

handleRemaining64:
	SHRQ $0x01, BX
	JNC  handleRemaining128
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x40, DX

handleRemaining128:
	SHRQ $0x01, BX
	JNC  handleRemainingComplete
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ 64(DX), AX
	ADCQ 72(DX), AX
	ADCQ 80(DX), AX
	ADCQ 88(DX), AX
	ADCQ 96(DX), AX
	ADCQ 104(DX), AX
	ADCQ 112(DX), AX
	ADCQ 120(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x80, DX

handleRemainingComplete:
	ADDQ SI, AX
	JMP  foldAndReturn

startSIMD:
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4             // X4 stays zero: used to unpack words -> dwords
	MOVQ BX, CX

	// Update number of bytes remaining after the loop completes
	ANDQ $0xff, BX

	// Number of 256 byte iterations
	SHRQ $0x08, CX
	JZ   smallLoop

bigLoop:
	// For each 16-byte load: PUNPCKHWL/PUNPCKLWL against zeroed X4
	// zero-extend the high/low four words to dwords; the two halves are
	// added into different accumulators, rotating across X0-X3 to keep
	// the add chains independent.
	MOVOU     (DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X0
	PADDD     X6, X2
	MOVOU     16(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X1
	PADDD     X6, X3
	MOVOU     32(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X2
	PADDD     X6, X0
	MOVOU     48(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X3
	PADDD     X6, X1
	MOVOU     64(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X0
	PADDD     X6, X2
	MOVOU     80(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X1
	PADDD     X6, X3
	MOVOU     96(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X2
	PADDD     X6, X0
	MOVOU     112(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X3
	PADDD     X6, X1
	MOVOU     128(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X0
	PADDD     X6, X2
	MOVOU     144(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X1
	PADDD     X6, X3
	MOVOU     160(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X2
	PADDD     X6, X0
	MOVOU     176(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X3
	PADDD     X6, X1
	MOVOU     192(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X0
	PADDD     X6, X2
	MOVOU     208(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X1
	PADDD     X6, X3
	MOVOU     224(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X2
	PADDD     X6, X0
	MOVOU     240(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X3
	PADDD     X6, X1
	ADDQ      $0x00000100, DX
	DECQ      CX
	JNZ       bigLoop
	CMPQ      BX, $0x10
	JLT       doneSmallLoop

	// now read a single 16 byte unit of data at a time
smallLoop:
	MOVOU     (DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X0
	PADDD     X6, X1
	ADDQ      $0x10, DX
	SUBQ      $0x10, BX
	CMPQ      BX, $0x10
	JGE       smallLoop

doneSmallLoop:
	CMPQ BX, $0x00
	JE   doneSIMD

	// There are between 1 and 15 bytes remaining. Perform an overlapped read.
	// The mask (indexed BX*8-16) zeroes the duplicated leading bytes.
	LEAQ      xmmLoadMasks<>+0(SB), CX
	MOVOU     -16(DX)(BX*1), X5
	PAND      -16(CX)(BX*8), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X0
	PADDD     X6, X1

doneSIMD:
	// Multi-chain loop is done, combine the accumulators
	PADDD X1, X0
	PADDD X2, X0
	PADDD X3, X0

	// extract the XMM into GP64: add the low and high 8-byte halves of X0
	// into the scalar accumulator
	MOVQ   X0, CX
	PSRLDQ $0x08, X0
	MOVQ   X0, DX
	ADDQ   CX, AX
	ADCQ   DX, AX

foldAndReturn:
	// add the pending CF and fold 64 -> 32 -> 16 bits with end-around carry
	MOVL    AX, CX
	ADCQ    $0x00, CX       // absorb the carry still held in CF
	SHRQ    $0x20, AX
	ADDQ    CX, AX
	MOVWQZX AX, CX
	SHRQ    $0x10, AX
	ADDQ    CX, AX
	MOVW    AX, CX
	SHRQ    $0x10, AX
	ADDW    CX, AX
	ADCW    $0x00, AX

	// restore the byte order expected by the caller (see entry XCHGB)
	XCHGB AH, AL
	MOVW  AX, ret+32(FP)
	RET
// func checksumAMD64(b []byte, initial uint16) uint16
//
// Pure general-purpose-register fallback (no SIMD) for checksumAVX2 /
// checksumSSE2: same end-around-carry 16-bit word sum. Large buffers are
// processed 256 bytes per iteration using four independent ADD/ADC chains,
// each with its own carry-collection register:
//   chain 1: sum AX,  carries SI     chain 3: sum R9,  carries R10
//   chain 2: sum DI,  carries R8     chain 4: sum R11, carries R12
// Other registers: DX = read pointer, BX = bytes remaining, CX = loop
// counter / scratch.
TEXT ·checksumAMD64(SB), NOSPLIT|NOFRAME, $0-34
	MOVWQZX initial+24(FP), AX
	XCHGB   AH, AL          // work in host order; swapped back before return
	MOVQ    b_base+0(FP), DX
	MOVQ    b_len+8(FP), BX

	// handle odd length buffers; they are difficult to handle in general
	TESTQ $0x00000001, BX
	JZ    lengthIsEven

	// fold in the final byte, leaving an even number of bytes to process
	MOVBQZX -1(DX)(BX*1), CX
	DECQ    BX
	ADDQ    CX, AX

lengthIsEven:
	// handle tiny buffers (<=31 bytes) specially
	CMPQ BX, $0x1f
	JGT  bufferIsNotTiny
	XORQ CX, CX
	XORQ SI, SI
	XORQ DI, DI

	// Decode the length bit by bit; each SHRQ moves one bit into CF.
	// shift twice to start because length is guaranteed to be even
	// n = n >> 2; CF = originalN & 2
	SHRQ $0x02, BX
	JNC  handleTiny4

	// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
	MOVWQZX (DX), CX
	ADDQ    $0x02, DX

handleTiny4:
	// n = n >> 1; CF = originalN & 4
	SHRQ $0x01, BX
	JNC  handleTiny8

	// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
	MOVLQZX (DX), SI
	ADDQ    $0x04, DX

handleTiny8:
	// n = n >> 1; CF = originalN & 8
	SHRQ $0x01, BX
	JNC  handleTiny16

	// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
	MOVQ (DX), DI
	ADDQ $0x08, DX

handleTiny16:
	// n = n >> 1; CF = originalN & 16
	// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
	SHRQ $0x01, BX
	JNC  handleTinyFinish
	ADDQ (DX), AX
	ADCQ 8(DX), AX

handleTinyFinish:
	// CF should be included from the previous add, so we use ADCQ.
	// If we arrived via the JNC above, then CF=0 due to the branch condition,
	// so ADCQ will still produce the correct result.
	ADCQ CX, AX
	ADCQ SI, AX
	ADCQ DI, AX
	JMP  foldAndReturn

bufferIsNotTiny:
	// Number of 256 byte iterations into loop counter
	MOVQ BX, CX

	// Update number of bytes remaining after the loop completes
	ANDQ $0xff, BX
	SHRQ $0x08, CX
	JZ   startCleanup       // fewer than 256 bytes: skip the big loop
	CLC                     // ensure CF=0 before the first chained add
	XORQ SI, SI
	XORQ DI, DI
	XORQ R8, R8
	XORQ R9, R9
	XORQ R10, R10
	XORQ R11, R11
	XORQ R12, R12

bigLoop:
	// 256 bytes per iteration, 32 bytes per chain segment; each chain's
	// final ADCQ $0 banks the segment's carry into its carry register.
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ $0x00, SI
	ADDQ 32(DX), DI
	ADCQ 40(DX), DI
	ADCQ 48(DX), DI
	ADCQ 56(DX), DI
	ADCQ $0x00, R8
	ADDQ 64(DX), R9
	ADCQ 72(DX), R9
	ADCQ 80(DX), R9
	ADCQ 88(DX), R9
	ADCQ $0x00, R10
	ADDQ 96(DX), R11
	ADCQ 104(DX), R11
	ADCQ 112(DX), R11
	ADCQ 120(DX), R11
	ADCQ $0x00, R12
	ADDQ 128(DX), AX
	ADCQ 136(DX), AX
	ADCQ 144(DX), AX
	ADCQ 152(DX), AX
	ADCQ $0x00, SI
	ADDQ 160(DX), DI
	ADCQ 168(DX), DI
	ADCQ 176(DX), DI
	ADCQ 184(DX), DI
	ADCQ $0x00, R8
	ADDQ 192(DX), R9
	ADCQ 200(DX), R9
	ADCQ 208(DX), R9
	ADCQ 216(DX), R9
	ADCQ $0x00, R10
	ADDQ 224(DX), R11
	ADCQ 232(DX), R11
	ADCQ 240(DX), R11
	ADCQ 248(DX), R11
	ADCQ $0x00, R12
	ADDQ $0x00000100, DX
	SUBQ $0x01, CX
	JNZ  bigLoop

	// merge the four chains and their banked carries into AX
	ADDQ SI, AX
	ADCQ DI, AX
	ADCQ R8, AX
	ADCQ R9, AX
	ADCQ R10, AX
	ADCQ R11, AX
	ADCQ R12, AX

	// accumulate CF (twice, in case the first time overflows)
	ADCQ $0x00, AX
	ADCQ $0x00, AX

startCleanup:
	// Accumulate carries in this register. It is never expected to overflow.
	XORQ SI, SI

	// We will perform an overlapped read for buffers with length not a multiple of 8.
	// Overlapped in this context means some memory will be read twice, but a shift will
	// eliminate the duplicated data. This extra read is performed at the end of the buffer to
	// preserve any alignment that may exist for the start of the buffer.
	MOVQ BX, CX
	SHRQ $0x03, BX          // BX = number of whole 8-byte units
	ANDQ $0x07, CX          // CX = leftover bytes (0..7)
	JZ   handleRemaining8
	LEAQ (DX)(BX*8), DI
	MOVQ -8(DI)(CX*1), DI   // overlapped 8-byte read ending at buffer end

	// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
	SHLQ $0x03, CX
	NEGQ CX
	ADDQ $0x40, CX
	SHRQ CL, DI
	ADDQ DI, AX
	ADCQ $0x00, SI

	// Decode the 8-byte-unit count bit by bit, as in the tiny path.
handleRemaining8:
	SHRQ $0x01, BX
	JNC  handleRemaining16
	ADDQ (DX), AX
	ADCQ $0x00, SI
	ADDQ $0x08, DX

handleRemaining16:
	SHRQ $0x01, BX
	JNC  handleRemaining32
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x10, DX

handleRemaining32:
	SHRQ $0x01, BX
	JNC  handleRemaining64
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x20, DX

handleRemaining64:
	SHRQ $0x01, BX
	JNC  handleRemaining128
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x40, DX

handleRemaining128:
	SHRQ $0x01, BX
	JNC  handleRemainingComplete
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ 64(DX), AX
	ADCQ 72(DX), AX
	ADCQ 80(DX), AX
	ADCQ 88(DX), AX
	ADCQ 96(DX), AX
	ADCQ 104(DX), AX
	ADCQ 112(DX), AX
	ADCQ 120(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x80, DX

handleRemainingComplete:
	ADDQ SI, AX

foldAndReturn:
	// add the pending CF and fold 64 -> 32 -> 16 bits with end-around carry
	MOVL    AX, CX
	ADCQ    $0x00, CX       // absorb the carry still held in CF
	SHRQ    $0x20, AX
	ADDQ    CX, AX
	MOVWQZX AX, CX
	SHRQ    $0x10, AX
	ADDQ    CX, AX
	MOVW    AX, CX
	SHRQ    $0x10, AX
	ADDW    CX, AX
	ADCW    $0x00, AX

	// restore the byte order expected by the caller (see entry XCHGB)
	XCHGB AH, AL
	MOVW  AX, ret+32(FP)
	RET