// Mirror of https://github.com/SagerNet/sing-tun.git (synced 2025-04-03 20:07:40 +03:00).
// NOTE: this file is Go (Plan 9) x86-64 assembly (851 lines, 18 KiB); the mirror's
// "ArmAsm" language label is incorrect.
// Code generated by command: go run generate_amd64.go -out checksum_generated_amd64.s -stubs checksum_generated_amd64.go. DO NOT EDIT.
#include "textflag.h"
// xmmLoadMasks holds seven 16-byte tail masks for overlapped reads.
// Entry k (byte offset k*16, k = 0..6) keeps the trailing 2*(k+1) bytes of a
// 16-byte vector and zeroes the rest. The checksum tail handlers index it as
// offset BX*8-16 where BX is the (even, 2..14) number of valid bytes, i.e.
// VPAND/PAND with this mask discards the duplicated leading bytes of an
// overlapped 16-byte load from the end of the buffer.
DATA xmmLoadMasks<>+0(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff"
DATA xmmLoadMasks<>+16(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff"
DATA xmmLoadMasks<>+32(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff"
DATA xmmLoadMasks<>+48(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff"
DATA xmmLoadMasks<>+64(SB)/16, $"\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
DATA xmmLoadMasks<>+80(SB)/16, $"\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
DATA xmmLoadMasks<>+96(SB)/16, $"\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
GLOBL xmmLoadMasks<>(SB), RODATA|NOPTR, $112
// func checksumAVX2(b []byte, initial uint16) uint16
// Requires: AVX, AVX2, BMI2
//
// Sums b as 16-bit words with end-around carry (ones'-complement style),
// seeded with `initial`, and returns the folded 16-bit result. `initial`
// is byte-swapped on entry and the result swapped back on exit (XCHGB),
// so all intermediate arithmetic runs in host little-endian order.
//
// Register roles:
//   AX       - main 64-bit accumulator, folded to 16 bits before return
//   DX       - current read pointer into b
//   BX       - bytes (later 8-byte units) remaining
//   CX/SI/DI - scratch and secondary accumulators
//   Y0-Y3    - four independent SIMD accumulator chains (large buffers)
TEXT ·checksumAVX2(SB), NOSPLIT|NOFRAME, $0-34
	MOVWQZX initial+24(FP), AX
	XCHGB   AH, AL
	MOVQ    b_base+0(FP), DX
	MOVQ    b_len+8(FP), BX

	// handle odd length buffers; they are difficult to handle in general
	TESTQ $0x00000001, BX
	JZ    lengthIsEven

	// fold in the final byte, leaving an even number of bytes to process
	MOVBQZX -1(DX)(BX*1), CX
	DECQ    BX
	ADDQ    CX, AX

lengthIsEven:
	// handle tiny buffers (<=31 bytes) specially
	CMPQ BX, $0x1f
	JGT  bufferIsNotTiny
	XORQ CX, CX
	XORQ SI, SI
	XORQ DI, DI

	// The tiny path decodes the length bit by bit: each SHRQ moves one
	// length bit into CF, selecting whether a 2/4/8/16-byte chunk exists.
	// shift twice to start because length is guaranteed to be even
	// n = n >> 2; CF = originalN & 2
	SHRQ $0x02, BX
	JNC  handleTiny4

	// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
	MOVWQZX (DX), CX
	ADDQ    $0x02, DX

handleTiny4:
	// n = n >> 1; CF = originalN & 4
	SHRQ $0x01, BX
	JNC  handleTiny8

	// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
	MOVLQZX (DX), SI
	ADDQ    $0x04, DX

handleTiny8:
	// n = n >> 1; CF = originalN & 8
	SHRQ $0x01, BX
	JNC  handleTiny16

	// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
	MOVQ (DX), DI
	ADDQ $0x08, DX

handleTiny16:
	// n = n >> 1; CF = originalN & 16
	// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
	SHRQ $0x01, BX
	JNC  handleTinyFinish
	ADDQ (DX), AX
	ADCQ 8(DX), AX

handleTinyFinish:
	// CF should be included from the previous add, so we use ADCQ.
	// If we arrived via the JNC above, then CF=0 due to the branch condition,
	// so ADCQ will still produce the correct result.
	ADCQ CX, AX
	ADCQ SI, AX
	ADCQ DI, AX
	JMP  foldAndReturn

bufferIsNotTiny:
	// skip all SIMD for small buffers (< 256 bytes)
	CMPQ BX, $0x00000100
	JGE  startSIMD

	// Accumulate carries in this register. It is never expected to overflow.
	XORQ SI, SI

	// We will perform an overlapped read for buffers with length not a multiple of 8.
	// Overlapped in this context means some memory will be read twice, but a shift will
	// eliminate the duplicated data. This extra read is performed at the end of the buffer to
	// preserve any alignment that may exist for the start of the buffer.
	MOVQ BX, CX
	SHRQ $0x03, BX          // BX = number of whole 8-byte units
	ANDQ $0x07, CX          // CX = leftover bytes (0..7)
	JZ   handleRemaining8
	LEAQ (DX)(BX*8), DI
	MOVQ -8(DI)(CX*1), DI   // overlapped 8-byte read ending at buffer end

	// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
	SHLQ $0x03, CX
	NEGQ CX
	ADDQ $0x40, CX
	SHRQ CL, DI
	ADDQ DI, AX
	ADCQ $0x00, SI

	// The handleRemainingN ladder decodes the 8-byte-unit count bit by bit,
	// mirroring the tiny path: each SHRQ exposes one bit in CF.
handleRemaining8:
	SHRQ $0x01, BX
	JNC  handleRemaining16
	ADDQ (DX), AX
	ADCQ $0x00, SI
	ADDQ $0x08, DX

handleRemaining16:
	SHRQ $0x01, BX
	JNC  handleRemaining32
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x10, DX

handleRemaining32:
	SHRQ $0x01, BX
	JNC  handleRemaining64
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x20, DX

handleRemaining64:
	SHRQ $0x01, BX
	JNC  handleRemaining128
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x40, DX

handleRemaining128:
	SHRQ $0x01, BX
	JNC  handleRemainingComplete
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ 64(DX), AX
	ADCQ 72(DX), AX
	ADCQ 80(DX), AX
	ADCQ 88(DX), AX
	ADCQ 96(DX), AX
	ADCQ 104(DX), AX
	ADCQ 112(DX), AX
	ADCQ 120(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x80, DX

handleRemainingComplete:
	ADDQ SI, AX
	JMP  foldAndReturn

startSIMD:
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3
	MOVQ  BX, CX

	// Update number of bytes remaining after the loop completes
	ANDQ $0xff, BX

	// Number of 256 byte iterations
	SHRQ $0x08, CX
	JZ   smallLoop

bigLoop:
	// Each VPMOVZXWD zero-extends eight 16-bit words to 32-bit lanes, so
	// the VPADDD lanes cannot overflow for any realistic buffer length.
	// Loads rotate across four accumulators (Y0-Y3) to keep the add
	// chains independent.
	VPMOVZXWD (DX), Y4
	VPADDD    Y4, Y0, Y0
	VPMOVZXWD 16(DX), Y4
	VPADDD    Y4, Y1, Y1
	VPMOVZXWD 32(DX), Y4
	VPADDD    Y4, Y2, Y2
	VPMOVZXWD 48(DX), Y4
	VPADDD    Y4, Y3, Y3
	VPMOVZXWD 64(DX), Y4
	VPADDD    Y4, Y0, Y0
	VPMOVZXWD 80(DX), Y4
	VPADDD    Y4, Y1, Y1
	VPMOVZXWD 96(DX), Y4
	VPADDD    Y4, Y2, Y2
	VPMOVZXWD 112(DX), Y4
	VPADDD    Y4, Y3, Y3
	VPMOVZXWD 128(DX), Y4
	VPADDD    Y4, Y0, Y0
	VPMOVZXWD 144(DX), Y4
	VPADDD    Y4, Y1, Y1
	VPMOVZXWD 160(DX), Y4
	VPADDD    Y4, Y2, Y2
	VPMOVZXWD 176(DX), Y4
	VPADDD    Y4, Y3, Y3
	VPMOVZXWD 192(DX), Y4
	VPADDD    Y4, Y0, Y0
	VPMOVZXWD 208(DX), Y4
	VPADDD    Y4, Y1, Y1
	VPMOVZXWD 224(DX), Y4
	VPADDD    Y4, Y2, Y2
	VPMOVZXWD 240(DX), Y4
	VPADDD    Y4, Y3, Y3
	ADDQ      $0x00000100, DX
	DECQ      CX
	JNZ       bigLoop
	CMPQ      BX, $0x10
	JLT       doneSmallLoop

	// now read a single 16 byte unit of data at a time
smallLoop:
	VPMOVZXWD (DX), Y4
	VPADDD    Y4, Y0, Y0
	ADDQ      $0x10, DX
	SUBQ      $0x10, BX
	CMPQ      BX, $0x10
	JGE       smallLoop

doneSmallLoop:
	CMPQ BX, $0x00
	JE   doneSIMD

	// There are between 1 and 15 bytes remaining. Perform an overlapped read.
	// The mask (indexed BX*8-16) zeroes the duplicated leading bytes.
	LEAQ      xmmLoadMasks<>+0(SB), CX
	VMOVDQU   -16(DX)(BX*1), X4
	VPAND     -16(CX)(BX*8), X4, X4
	VPMOVZXWD X4, Y4
	VPADDD    Y4, Y0, Y0

doneSIMD:
	// Multi-chain loop is done, combine the accumulators
	VPADDD Y1, Y0, Y0
	VPADDD Y2, Y0, Y0
	VPADDD Y3, Y0, Y0

	// extract the YMM into a pair of XMM and sum them
	VEXTRACTI128 $0x01, Y0, X1
	VPADDD       X0, X1, X0

	// extract the XMM into GP64
	VPEXTRQ $0x00, X0, CX
	VPEXTRQ $0x01, X0, DX

	// no more AVX code, clear upper registers to avoid SSE slowdowns
	VZEROUPPER
	ADDQ CX, AX
	ADCQ DX, AX

foldAndReturn:
	// add the pending CF and fold 64 -> 32 -> 16 bits with end-around
	// carry. RORX (BMI2) is used because it does not disturb CF.
	RORXQ $0x20, AX, CX
	ADCL  CX, AX
	RORXL $0x10, AX, CX
	ADCW  CX, AX
	ADCW  $0x00, AX

	// restore the byte order expected by the caller (see entry XCHGB)
	XCHGB AH, AL
	MOVW  AX, ret+32(FP)
	RET
// func checksumSSE2(b []byte, initial uint16) uint16
// Requires: SSE2
//
// SSE2 fallback for checksumAVX2: same end-around-carry 16-bit word sum,
// same tiny/small scalar paths; only the >=256-byte SIMD section differs.
// Word-to-dword zero extension is done with PUNPCKL/HWL against X4, which
// is zeroed once at startSIMD and never written afterwards.
//
// Register roles:
//   AX       - main 64-bit accumulator, folded to 16 bits before return
//   DX       - current read pointer into b
//   BX       - bytes (later 8-byte units) remaining
//   CX/SI/DI - scratch and secondary accumulators
//   X0-X3    - SIMD accumulator chains; X4 = zero; X5/X6 = scratch
TEXT ·checksumSSE2(SB), NOSPLIT|NOFRAME, $0-34
	MOVWQZX initial+24(FP), AX
	XCHGB   AH, AL          // work in host order; swapped back before return
	MOVQ    b_base+0(FP), DX
	MOVQ    b_len+8(FP), BX

	// handle odd length buffers; they are difficult to handle in general
	TESTQ $0x00000001, BX
	JZ    lengthIsEven

	// fold in the final byte, leaving an even number of bytes to process
	MOVBQZX -1(DX)(BX*1), CX
	DECQ    BX
	ADDQ    CX, AX

lengthIsEven:
	// handle tiny buffers (<=31 bytes) specially
	CMPQ BX, $0x1f
	JGT  bufferIsNotTiny
	XORQ CX, CX
	XORQ SI, SI
	XORQ DI, DI

	// Decode the length bit by bit; each SHRQ moves one bit into CF.
	// shift twice to start because length is guaranteed to be even
	// n = n >> 2; CF = originalN & 2
	SHRQ $0x02, BX
	JNC  handleTiny4

	// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
	MOVWQZX (DX), CX
	ADDQ    $0x02, DX

handleTiny4:
	// n = n >> 1; CF = originalN & 4
	SHRQ $0x01, BX
	JNC  handleTiny8

	// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
	MOVLQZX (DX), SI
	ADDQ    $0x04, DX

handleTiny8:
	// n = n >> 1; CF = originalN & 8
	SHRQ $0x01, BX
	JNC  handleTiny16

	// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
	MOVQ (DX), DI
	ADDQ $0x08, DX

handleTiny16:
	// n = n >> 1; CF = originalN & 16
	// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
	SHRQ $0x01, BX
	JNC  handleTinyFinish
	ADDQ (DX), AX
	ADCQ 8(DX), AX

handleTinyFinish:
	// CF should be included from the previous add, so we use ADCQ.
	// If we arrived via the JNC above, then CF=0 due to the branch condition,
	// so ADCQ will still produce the correct result.
	ADCQ CX, AX
	ADCQ SI, AX
	ADCQ DI, AX
	JMP  foldAndReturn

bufferIsNotTiny:
	// skip all SIMD for small buffers (< 256 bytes)
	CMPQ BX, $0x00000100
	JGE  startSIMD

	// Accumulate carries in this register. It is never expected to overflow.
	XORQ SI, SI

	// We will perform an overlapped read for buffers with length not a multiple of 8.
	// Overlapped in this context means some memory will be read twice, but a shift will
	// eliminate the duplicated data. This extra read is performed at the end of the buffer to
	// preserve any alignment that may exist for the start of the buffer.
	MOVQ BX, CX
	SHRQ $0x03, BX          // BX = number of whole 8-byte units
	ANDQ $0x07, CX          // CX = leftover bytes (0..7)
	JZ   handleRemaining8
	LEAQ (DX)(BX*8), DI
	MOVQ -8(DI)(CX*1), DI   // overlapped 8-byte read ending at buffer end

	// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
	SHLQ $0x03, CX
	NEGQ CX
	ADDQ $0x40, CX
	SHRQ CL, DI
	ADDQ DI, AX
	ADCQ $0x00, SI

	// Decode the 8-byte-unit count bit by bit, as in the tiny path.
handleRemaining8:
	SHRQ $0x01, BX
	JNC  handleRemaining16
	ADDQ (DX), AX
	ADCQ $0x00, SI
	ADDQ $0x08, DX

handleRemaining16:
	SHRQ $0x01, BX
	JNC  handleRemaining32
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x10, DX

handleRemaining32:
	SHRQ $0x01, BX
	JNC  handleRemaining64
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x20, DX

handleRemaining64:
	SHRQ $0x01, BX
	JNC  handleRemaining128
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x40, DX

handleRemaining128:
	SHRQ $0x01, BX
	JNC  handleRemainingComplete
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ 64(DX), AX
	ADCQ 72(DX), AX
	ADCQ 80(DX), AX
	ADCQ 88(DX), AX
	ADCQ 96(DX), AX
	ADCQ 104(DX), AX
	ADCQ 112(DX), AX
	ADCQ 120(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x80, DX

handleRemainingComplete:
	ADDQ SI, AX
	JMP  foldAndReturn

startSIMD:
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4             // X4 stays zero: used to unpack words -> dwords
	MOVQ BX, CX

	// Update number of bytes remaining after the loop completes
	ANDQ $0xff, BX

	// Number of 256 byte iterations
	SHRQ $0x08, CX
	JZ   smallLoop

bigLoop:
	// For each 16-byte load: PUNPCKHWL/PUNPCKLWL against zeroed X4
	// zero-extend the high/low four words to dwords; the two halves are
	// added into different accumulators, rotating across X0-X3 to keep
	// the add chains independent.
	MOVOU     (DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X0
	PADDD     X6, X2
	MOVOU     16(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X1
	PADDD     X6, X3
	MOVOU     32(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X2
	PADDD     X6, X0
	MOVOU     48(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X3
	PADDD     X6, X1
	MOVOU     64(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X0
	PADDD     X6, X2
	MOVOU     80(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X1
	PADDD     X6, X3
	MOVOU     96(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X2
	PADDD     X6, X0
	MOVOU     112(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X3
	PADDD     X6, X1
	MOVOU     128(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X0
	PADDD     X6, X2
	MOVOU     144(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X1
	PADDD     X6, X3
	MOVOU     160(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X2
	PADDD     X6, X0
	MOVOU     176(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X3
	PADDD     X6, X1
	MOVOU     192(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X0
	PADDD     X6, X2
	MOVOU     208(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X1
	PADDD     X6, X3
	MOVOU     224(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X2
	PADDD     X6, X0
	MOVOU     240(DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X3
	PADDD     X6, X1
	ADDQ      $0x00000100, DX
	DECQ      CX
	JNZ       bigLoop
	CMPQ      BX, $0x10
	JLT       doneSmallLoop

	// now read a single 16 byte unit of data at a time
smallLoop:
	MOVOU     (DX), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X0
	PADDD     X6, X1
	ADDQ      $0x10, DX
	SUBQ      $0x10, BX
	CMPQ      BX, $0x10
	JGE       smallLoop

doneSmallLoop:
	CMPQ BX, $0x00
	JE   doneSIMD

	// There are between 1 and 15 bytes remaining. Perform an overlapped read.
	// The mask (indexed BX*8-16) zeroes the duplicated leading bytes.
	LEAQ      xmmLoadMasks<>+0(SB), CX
	MOVOU     -16(DX)(BX*1), X5
	PAND      -16(CX)(BX*8), X5
	MOVOA     X5, X6
	PUNPCKHWL X4, X5
	PUNPCKLWL X4, X6
	PADDD     X5, X0
	PADDD     X6, X1

doneSIMD:
	// Multi-chain loop is done, combine the accumulators
	PADDD X1, X0
	PADDD X2, X0
	PADDD X3, X0

	// extract the XMM into GP64: add the low and high 8-byte halves of X0
	// into the scalar accumulator
	MOVQ   X0, CX
	PSRLDQ $0x08, X0
	MOVQ   X0, DX
	ADDQ   CX, AX
	ADCQ   DX, AX

foldAndReturn:
	// add the pending CF and fold 64 -> 32 -> 16 bits with end-around carry
	MOVL    AX, CX
	ADCQ    $0x00, CX       // absorb the carry still held in CF
	SHRQ    $0x20, AX
	ADDQ    CX, AX
	MOVWQZX AX, CX
	SHRQ    $0x10, AX
	ADDQ    CX, AX
	MOVW    AX, CX
	SHRQ    $0x10, AX
	ADDW    CX, AX
	ADCW    $0x00, AX

	// restore the byte order expected by the caller (see entry XCHGB)
	XCHGB AH, AL
	MOVW  AX, ret+32(FP)
	RET
// func checksumAMD64(b []byte, initial uint16) uint16
//
// Pure general-purpose-register fallback (no SIMD) for checksumAVX2 /
// checksumSSE2: same end-around-carry 16-bit word sum. Large buffers are
// processed 256 bytes per iteration using four independent ADD/ADC chains,
// each with its own carry-collection register:
//   chain 1: sum AX,  carries SI     chain 3: sum R9,  carries R10
//   chain 2: sum DI,  carries R8     chain 4: sum R11, carries R12
// Other registers: DX = read pointer, BX = bytes remaining, CX = loop
// counter / scratch.
TEXT ·checksumAMD64(SB), NOSPLIT|NOFRAME, $0-34
	MOVWQZX initial+24(FP), AX
	XCHGB   AH, AL          // work in host order; swapped back before return
	MOVQ    b_base+0(FP), DX
	MOVQ    b_len+8(FP), BX

	// handle odd length buffers; they are difficult to handle in general
	TESTQ $0x00000001, BX
	JZ    lengthIsEven

	// fold in the final byte, leaving an even number of bytes to process
	MOVBQZX -1(DX)(BX*1), CX
	DECQ    BX
	ADDQ    CX, AX

lengthIsEven:
	// handle tiny buffers (<=31 bytes) specially
	CMPQ BX, $0x1f
	JGT  bufferIsNotTiny
	XORQ CX, CX
	XORQ SI, SI
	XORQ DI, DI

	// Decode the length bit by bit; each SHRQ moves one bit into CF.
	// shift twice to start because length is guaranteed to be even
	// n = n >> 2; CF = originalN & 2
	SHRQ $0x02, BX
	JNC  handleTiny4

	// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
	MOVWQZX (DX), CX
	ADDQ    $0x02, DX

handleTiny4:
	// n = n >> 1; CF = originalN & 4
	SHRQ $0x01, BX
	JNC  handleTiny8

	// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
	MOVLQZX (DX), SI
	ADDQ    $0x04, DX

handleTiny8:
	// n = n >> 1; CF = originalN & 8
	SHRQ $0x01, BX
	JNC  handleTiny16

	// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
	MOVQ (DX), DI
	ADDQ $0x08, DX

handleTiny16:
	// n = n >> 1; CF = originalN & 16
	// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
	SHRQ $0x01, BX
	JNC  handleTinyFinish
	ADDQ (DX), AX
	ADCQ 8(DX), AX

handleTinyFinish:
	// CF should be included from the previous add, so we use ADCQ.
	// If we arrived via the JNC above, then CF=0 due to the branch condition,
	// so ADCQ will still produce the correct result.
	ADCQ CX, AX
	ADCQ SI, AX
	ADCQ DI, AX
	JMP  foldAndReturn

bufferIsNotTiny:
	// Number of 256 byte iterations into loop counter
	MOVQ BX, CX

	// Update number of bytes remaining after the loop completes
	ANDQ $0xff, BX
	SHRQ $0x08, CX
	JZ   startCleanup       // fewer than 256 bytes: skip the big loop
	CLC                     // ensure CF=0 before the first chained add
	XORQ SI, SI
	XORQ DI, DI
	XORQ R8, R8
	XORQ R9, R9
	XORQ R10, R10
	XORQ R11, R11
	XORQ R12, R12

bigLoop:
	// 256 bytes per iteration, 32 bytes per chain segment; each chain's
	// final ADCQ $0 banks the segment's carry into its carry register.
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ $0x00, SI
	ADDQ 32(DX), DI
	ADCQ 40(DX), DI
	ADCQ 48(DX), DI
	ADCQ 56(DX), DI
	ADCQ $0x00, R8
	ADDQ 64(DX), R9
	ADCQ 72(DX), R9
	ADCQ 80(DX), R9
	ADCQ 88(DX), R9
	ADCQ $0x00, R10
	ADDQ 96(DX), R11
	ADCQ 104(DX), R11
	ADCQ 112(DX), R11
	ADCQ 120(DX), R11
	ADCQ $0x00, R12
	ADDQ 128(DX), AX
	ADCQ 136(DX), AX
	ADCQ 144(DX), AX
	ADCQ 152(DX), AX
	ADCQ $0x00, SI
	ADDQ 160(DX), DI
	ADCQ 168(DX), DI
	ADCQ 176(DX), DI
	ADCQ 184(DX), DI
	ADCQ $0x00, R8
	ADDQ 192(DX), R9
	ADCQ 200(DX), R9
	ADCQ 208(DX), R9
	ADCQ 216(DX), R9
	ADCQ $0x00, R10
	ADDQ 224(DX), R11
	ADCQ 232(DX), R11
	ADCQ 240(DX), R11
	ADCQ 248(DX), R11
	ADCQ $0x00, R12
	ADDQ $0x00000100, DX
	SUBQ $0x01, CX
	JNZ  bigLoop

	// merge the four chains and their banked carries into AX
	ADDQ SI, AX
	ADCQ DI, AX
	ADCQ R8, AX
	ADCQ R9, AX
	ADCQ R10, AX
	ADCQ R11, AX
	ADCQ R12, AX

	// accumulate CF (twice, in case the first time overflows)
	ADCQ $0x00, AX
	ADCQ $0x00, AX

startCleanup:
	// Accumulate carries in this register. It is never expected to overflow.
	XORQ SI, SI

	// We will perform an overlapped read for buffers with length not a multiple of 8.
	// Overlapped in this context means some memory will be read twice, but a shift will
	// eliminate the duplicated data. This extra read is performed at the end of the buffer to
	// preserve any alignment that may exist for the start of the buffer.
	MOVQ BX, CX
	SHRQ $0x03, BX          // BX = number of whole 8-byte units
	ANDQ $0x07, CX          // CX = leftover bytes (0..7)
	JZ   handleRemaining8
	LEAQ (DX)(BX*8), DI
	MOVQ -8(DI)(CX*1), DI   // overlapped 8-byte read ending at buffer end

	// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
	SHLQ $0x03, CX
	NEGQ CX
	ADDQ $0x40, CX
	SHRQ CL, DI
	ADDQ DI, AX
	ADCQ $0x00, SI

	// Decode the 8-byte-unit count bit by bit, as in the tiny path.
handleRemaining8:
	SHRQ $0x01, BX
	JNC  handleRemaining16
	ADDQ (DX), AX
	ADCQ $0x00, SI
	ADDQ $0x08, DX

handleRemaining16:
	SHRQ $0x01, BX
	JNC  handleRemaining32
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x10, DX

handleRemaining32:
	SHRQ $0x01, BX
	JNC  handleRemaining64
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x20, DX

handleRemaining64:
	SHRQ $0x01, BX
	JNC  handleRemaining128
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x40, DX

handleRemaining128:
	SHRQ $0x01, BX
	JNC  handleRemainingComplete
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ 64(DX), AX
	ADCQ 72(DX), AX
	ADCQ 80(DX), AX
	ADCQ 88(DX), AX
	ADCQ 96(DX), AX
	ADCQ 104(DX), AX
	ADCQ 112(DX), AX
	ADCQ 120(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x80, DX

handleRemainingComplete:
	ADDQ SI, AX

foldAndReturn:
	// add the pending CF and fold 64 -> 32 -> 16 bits with end-around carry
	MOVL    AX, CX
	ADCQ    $0x00, CX       // absorb the carry still held in CF
	SHRQ    $0x20, AX
	ADDQ    CX, AX
	MOVWQZX AX, CX
	SHRQ    $0x10, AX
	ADDQ    CX, AX
	MOVW    AX, CX
	SHRQ    $0x10, AX
	ADDW    CX, AX
	ADCW    $0x00, AX

	// restore the byte order expected by the caller (see entry XCHGB)
	XCHGB AH, AL
	MOVW  AX, ret+32(FP)
	RET