// Mirror of https://github.com/SagerNet/sing-tun.git
// (synced 2025-04-02 19:37:40 +03:00).
// 118 lines, 2.7 KiB, ArmAsm.
#include "textflag.h"
// func sumAsmNeon(data unsafe.Pointer, length uintptr) uintptr
//
// Adds up the `length` bytes starting at `data`, interpreting them as
// big-endian 16-bit words, and returns the raw sum. No carry folding to
// 16 bits is performed here; the accumulated value is returned as is.
// A trailing odd byte is counted as the high byte of one more word
// (i.e. as if it were followed by a zero byte).
//
// args (8 bytes aligned):
//   data   unsafe.Pointer - 8 bytes - 0 offset
//   length uintptr        - 8 bytes - 8 offset
//   result uintptr        - 8 bytes - 16 offset
//
// Registers written: R0-R4, V0-V4. No stack frame is used.
//
// The vector path keeps four 32-bit partial sums in VSUM. Every 8 bytes
// consumed add at most 0xFFFF to each lane, so the lanes cannot wrap for
// inputs of up to 512 KiB.
// NOTE(review): larger inputs can silently wrap a lane; confirm that
// callers never pass more than that.

#define PDATA  R0
#define LENGTH R1
#define RESULT R2
#define VSUM   V0

TEXT ·sumAsmNeon(SB),NOSPLIT,$0-24
	MOVD	data+0(FP), PDATA
	MOVD	length+8(FP), LENGTH
	MOVD	$0, RESULT		// scalar accumulator for the < 8 byte tail
	VMOVQ	$0, $0, VSUM		// four 32-bit vector accumulators = 0

	// ---- 32 bytes per iteration -------------------------------------
#define LOADED_0 V1
#define LOADED_1 V2
#define LOADED_2 V3
#define LOADED_3 V4

BATCH_32:
	CMP	$32, LENGTH
	BLO	BATCH_16		// unsigned: fewer than 32 bytes left
	VLD1	(PDATA), [LOADED_0.B8, LOADED_1.B8, LOADED_2.B8, LOADED_3.B8]
	// Swap the two bytes of every halfword: big-endian words -> host order.
	VREV16	LOADED_0.B8, LOADED_0.B8
	VREV16	LOADED_1.B8, LOADED_1.B8
	VREV16	LOADED_2.B8, LOADED_2.B8
	VREV16	LOADED_3.B8, LOADED_3.B8
	// Shift-left-long by 0 == widen 4 x u16 to 4 x u32.
	VUSHLL	$0, LOADED_0.H4, LOADED_0.S4
	VUSHLL	$0, LOADED_1.H4, LOADED_1.S4
	VUSHLL	$0, LOADED_2.H4, LOADED_2.S4
	VUSHLL	$0, LOADED_3.H4, LOADED_3.S4
	VADD	LOADED_0.S4, VSUM.S4, VSUM.S4
	VADD	LOADED_1.S4, VSUM.S4, VSUM.S4
	VADD	LOADED_2.S4, VSUM.S4, VSUM.S4
	VADD	LOADED_3.S4, VSUM.S4, VSUM.S4
	SUB	$32, LENGTH
	ADD	$32, PDATA
	B	BATCH_32

#undef LOADED_0
#undef LOADED_1
#undef LOADED_2
#undef LOADED_3

	// ---- 16 bytes (LENGTH < 32 here, so this runs at most once) ------
#define LOADED_0 V1
#define LOADED_1 V2

BATCH_16:
	CMP	$16, LENGTH
	BLO	BATCH_8
	VLD1	(PDATA), [LOADED_0.B8, LOADED_1.B8]
	VREV16	LOADED_0.B8, LOADED_0.B8
	VREV16	LOADED_1.B8, LOADED_1.B8
	VUSHLL	$0, LOADED_0.H4, LOADED_0.S4
	VUSHLL	$0, LOADED_1.H4, LOADED_1.S4
	VADD	LOADED_0.S4, VSUM.S4, VSUM.S4
	VADD	LOADED_1.S4, VSUM.S4, VSUM.S4
	SUB	$16, LENGTH
	ADD	$16, PDATA
	B	BATCH_16

#undef LOADED_0
#undef LOADED_1

	// ---- 8 bytes (LENGTH < 16 here, so this runs at most once) -------
#define LOADED_0 V1

BATCH_8:
	CMP	$8, LENGTH
	BLO	BATCH_2
	VLD1	(PDATA), [LOADED_0.B8]
	VREV16	LOADED_0.B8, LOADED_0.B8
	VUSHLL	$0, LOADED_0.H4, LOADED_0.S4
	VADD	LOADED_0.S4, VSUM.S4, VSUM.S4
	SUB	$8, LENGTH
	ADD	$8, PDATA
	B	BATCH_8

#undef LOADED_0

	// ---- 2 bytes per iteration, scalar (LENGTH < 8 here) -------------
#define LOADED_L R3
#define LOADED_H R4

BATCH_2:
	CMP	$2, LENGTH
	BLO	BATCH_1
	MOVBU	(PDATA), LOADED_H	// first byte is the high half
	MOVBU	1(PDATA), LOADED_L
	LSL	$8, LOADED_H
	ORR	LOADED_H, LOADED_L, LOADED_L
	ADD	LOADED_L, RESULT, RESULT
	ADD	$2, PDATA
	SUB	$2, LENGTH
	B	BATCH_2

#undef LOADED_H
#undef LOADED_L

	// ---- final odd byte (LENGTH is 0 or 1 here) ----------------------
#define LOADED R3

BATCH_1:
	CMP	$1, LENGTH
	BLO	COLLECT
	MOVBU	(PDATA), LOADED
	LSL	$8, LOADED		// lone byte is the high half of the last word
	ADD	LOADED, RESULT, RESULT

#undef LOADED

	// ---- add the four vector lanes to the scalar sum -----------------
#define EXTRACTED R3

COLLECT:
	VMOV	VSUM.S[0], EXTRACTED
	ADD	EXTRACTED, RESULT
	VMOV	VSUM.S[1], EXTRACTED
	ADD	EXTRACTED, RESULT
	VMOV	VSUM.S[2], EXTRACTED
	ADD	EXTRACTED, RESULT
	VMOV	VSUM.S[3], EXTRACTED
	ADD	EXTRACTED, RESULT

	MOVD	RESULT, result+16(FP)
	RET

// Release every register alias so none of them leaks into (or clashes
// with) anything defined later in this file.
#undef EXTRACTED
#undef VSUM
#undef RESULT
#undef LENGTH
#undef PDATA