// sing-tun/internal/clashtcpip/tcpip_arm64.s
// 2022-09-08 18:11:26 +08:00
//
// 118 lines
// 2.7 KiB
// ArmAsm

#include "textflag.h"
// func sumAsmNeon(data unsafe.Pointer, length uintptr) uintptr
//
// Adds up the `length` bytes at `data`, read as consecutive big-endian
// 16-bit words, and returns the plain 64-bit total. No carry folding is
// done here. If `length` is odd, the final byte is counted as the high
// byte of one more word whose low byte is zero.
//
// Syntax: Go (Plan 9) assembler, arm64. Stack-based argument passing.
//
// args (8 bytes aligned):
// data unsafe.Pointer - 8 bytes - 0 offset
// length uintptr - 8 bytes - 8 offset
// result uintptr - 8 bytes - 16 offset
//
// Register roles:
//   R0 (PDATA)     cursor into the input
//   R1 (LENGTH)    bytes still to be consumed
//   R2 (RESULT)    64-bit scalar accumulator
//   R3 (SCRATCH_L) scalar scratch: low byte / lane value
//   R4 (SCRATCH_H) scalar scratch: high byte
//   R5 (BUDGET)    32-byte passes left before VSUM must be drained
//   V0 (VSUM)      four 32-bit partial sums
//   V1-V4          freshly loaded data
#define PDATA R0
#define LENGTH R1
#define RESULT R2
#define SCRATCH_L R3
#define SCRATCH_H R4
#define BUDGET R5
#define VSUM V0
#define LOADED_0 V1
#define LOADED_1 V2
#define LOADED_2 V3
#define LOADED_3 V4

// Every 32-byte pass adds four values of at most 0xFFFF to each 32-bit
// lane of VSUM. Draining the lanes into the 64-bit RESULT every 8192
// passes keeps a lane at or below 8192*4*0xFFFF (< 2^31), and leaves room
// for the at most three further adds the 16- and 8-byte tails contribute.
#define FLUSH_EVERY 8192

TEXT ·sumAsmNeon(SB),NOSPLIT,$0-24
	MOVD data+0(FP), PDATA
	MOVD length+8(FP), LENGTH
	MOVD $0, RESULT
	VEOR VSUM.B16, VSUM.B16, VSUM.B16
	MOVD $FLUSH_EVERY, BUDGET

	// 32 bytes per pass: four 8-byte vectors.
BATCH_32:
	CMP $32, LENGTH
	BLO BATCH_16
	VLD1 (PDATA), [LOADED_0.B8, LOADED_1.B8, LOADED_2.B8, LOADED_3.B8]
	// Swap the two bytes of every halfword: big-endian input -> native value.
	VREV16 LOADED_0.B8, LOADED_0.B8
	VREV16 LOADED_1.B8, LOADED_1.B8
	VREV16 LOADED_2.B8, LOADED_2.B8
	VREV16 LOADED_3.B8, LOADED_3.B8
	// Zero-extend the four halfwords to four 32-bit lanes (shift by 0).
	VUSHLL $0, LOADED_0.H4, LOADED_0.S4
	VUSHLL $0, LOADED_1.H4, LOADED_1.S4
	VUSHLL $0, LOADED_2.H4, LOADED_2.S4
	VUSHLL $0, LOADED_3.H4, LOADED_3.S4
	VADD LOADED_0.S4, VSUM.S4, VSUM.S4
	VADD LOADED_1.S4, VSUM.S4, VSUM.S4
	VADD LOADED_2.S4, VSUM.S4, VSUM.S4
	VADD LOADED_3.S4, VSUM.S4, VSUM.S4
	ADD $-32, LENGTH
	ADD $32, PDATA
	SUBS $1, BUDGET, BUDGET
	BNE BATCH_32

	// Budget exhausted: move the lane totals into the 64-bit scalar sum
	// before any lane can wrap, then start a fresh vector accumulation.
	VMOV VSUM.S[0], SCRATCH_L
	ADD SCRATCH_L, RESULT
	VMOV VSUM.S[1], SCRATCH_L
	ADD SCRATCH_L, RESULT
	VMOV VSUM.S[2], SCRATCH_L
	ADD SCRATCH_L, RESULT
	VMOV VSUM.S[3], SCRATCH_L
	ADD SCRATCH_L, RESULT
	VEOR VSUM.B16, VSUM.B16, VSUM.B16
	MOVD $FLUSH_EVERY, BUDGET
	B BATCH_32

	// 16 bytes per pass. LENGTH < 32 here, so this runs at most once.
BATCH_16:
	CMP $16, LENGTH
	BLO BATCH_8
	VLD1 (PDATA), [LOADED_0.B8, LOADED_1.B8]
	VREV16 LOADED_0.B8, LOADED_0.B8
	VREV16 LOADED_1.B8, LOADED_1.B8
	VUSHLL $0, LOADED_0.H4, LOADED_0.S4
	VUSHLL $0, LOADED_1.H4, LOADED_1.S4
	VADD LOADED_0.S4, VSUM.S4, VSUM.S4
	VADD LOADED_1.S4, VSUM.S4, VSUM.S4
	ADD $-16, LENGTH
	ADD $16, PDATA
	B BATCH_16

	// 8 bytes per pass. LENGTH < 16 here, so this runs at most once.
BATCH_8:
	CMP $8, LENGTH
	BLO BATCH_2
	VLD1 (PDATA), [LOADED_0.B8]
	VREV16 LOADED_0.B8, LOADED_0.B8
	VUSHLL $0, LOADED_0.H4, LOADED_0.S4
	VADD LOADED_0.S4, VSUM.S4, VSUM.S4
	ADD $-8, LENGTH
	ADD $8, PDATA
	B BATCH_8

	// One 16-bit word per pass, assembled from two byte loads.
BATCH_2:
	CMP $2, LENGTH
	BLO BATCH_1
	MOVBU (PDATA), SCRATCH_H
	MOVBU 1(PDATA), SCRATCH_L
	LSL $8, SCRATCH_H
	ORR SCRATCH_H, SCRATCH_L, SCRATCH_L
	ADD SCRATCH_L, RESULT, RESULT
	ADD $2, PDATA
	ADD $-2, LENGTH
	B BATCH_2

	// Odd trailing byte: counts as the high byte of a final word.
BATCH_1:
	CMP $1, LENGTH
	BLO COLLECT
	MOVBU (PDATA), SCRATCH_L
	LSL $8, SCRATCH_L
	ADD SCRATCH_L, RESULT, RESULT

	// Add whatever is left in the four vector lanes to the scalar sum.
COLLECT:
	VMOV VSUM.S[0], SCRATCH_L
	ADD SCRATCH_L, RESULT
	VMOV VSUM.S[1], SCRATCH_L
	ADD SCRATCH_L, RESULT
	VMOV VSUM.S[2], SCRATCH_L
	ADD SCRATCH_L, RESULT
	VMOV VSUM.S[3], SCRATCH_L
	ADD SCRATCH_L, RESULT

	MOVD RESULT, result+16(FP)
	RET

#undef FLUSH_EVERY
#undef LOADED_3
#undef LOADED_2
#undef LOADED_1
#undef LOADED_0
#undef VSUM
#undef BUDGET
#undef SCRATCH_H
#undef SCRATCH_L
#undef RESULT
#undef LENGTH
#undef PDATA