#include "textflag.h"

// func sumAsmNeon(data unsafe.Pointer, length uintptr) uintptr
//
// Adds up the `length` bytes at `data`, read as consecutive 16-bit words with
// the first byte of each pair as the high byte. A trailing odd byte is counted
// as the high byte of a word whose low byte is zero. The raw sum is returned;
// no carry folding or complementing is done here.
//
// Syntax: Go (Plan 9) assembler, arm64. Leaf routine, no stack frame.
//
// args (8 bytes aligned):
//   data   unsafe.Pointer - 8 bytes - 0 offset
//   length uintptr        - 8 bytes - 8 offset
//   result uintptr        - 8 bytes - 16 offset
//
// Registers written: R0-R4, V0-V4, condition flags.
//
// NOTE(review): the vector accumulator holds four 32-bit lanes and each lane
// receives one 16-bit word per 8 input bytes, so the lanes cannot wrap for
// inputs of up to 65537*8 bytes (just over 512 KiB). Beyond that a lane may
// wrap and the returned sum would be short by a multiple of 2^32 - confirm
// that callers stay within this bound.

// Register roles for the whole routine.
#define PDATA R0
#define LENGTH R1
#define RESULT R2
#define VSUM V0

TEXT ·sumAsmNeon(SB),NOSPLIT,$0-24
	MOVD	data+0(FP), PDATA
	MOVD	length+8(FP), LENGTH
	MOVD	$0, RESULT		// scalar accumulator (tail words + final total)
	VMOVQ	$0, $0, VSUM		// vector accumulator, 4 x 32-bit lanes

// ---- 32 bytes per iteration -------------------------------------------------
#define LOADED_0 V1
#define LOADED_1 V2
#define LOADED_2 V3
#define LOADED_3 V4

BATCH_32:
	CMP	$32, LENGTH
	BLO	BATCH_16		// fewer than 32 bytes left (unsigned compare)

	VLD1	(PDATA), [LOADED_0.B8, LOADED_1.B8, LOADED_2.B8, LOADED_3.B8]

	// Swap the two bytes of every halfword so that the first byte in memory
	// becomes the high byte of the 16-bit value.
	VREV16	LOADED_0.B8, LOADED_0.B8
	VREV16	LOADED_1.B8, LOADED_1.B8
	VREV16	LOADED_2.B8, LOADED_2.B8
	VREV16	LOADED_3.B8, LOADED_3.B8

	// Shift-left-long by 0: widen 4 x u16 to 4 x u32 without changing values.
	VUSHLL	$0, LOADED_0.H4, LOADED_0.S4
	VUSHLL	$0, LOADED_1.H4, LOADED_1.S4
	VUSHLL	$0, LOADED_2.H4, LOADED_2.S4
	VUSHLL	$0, LOADED_3.H4, LOADED_3.S4

	VADD	LOADED_0.S4, VSUM.S4, VSUM.S4
	VADD	LOADED_1.S4, VSUM.S4, VSUM.S4
	VADD	LOADED_2.S4, VSUM.S4, VSUM.S4
	VADD	LOADED_3.S4, VSUM.S4, VSUM.S4

	ADD	$-32, LENGTH
	ADD	$32, PDATA
	B	BATCH_32

#undef LOADED_0
#undef LOADED_1
#undef LOADED_2
#undef LOADED_3

// ---- 16 bytes per iteration (LENGTH < 32 here) ------------------------------
#define LOADED_0 V1
#define LOADED_1 V2

BATCH_16:
	CMP	$16, LENGTH
	BLO	BATCH_8

	VLD1	(PDATA), [LOADED_0.B8, LOADED_1.B8]

	VREV16	LOADED_0.B8, LOADED_0.B8
	VREV16	LOADED_1.B8, LOADED_1.B8

	VUSHLL	$0, LOADED_0.H4, LOADED_0.S4
	VUSHLL	$0, LOADED_1.H4, LOADED_1.S4

	VADD	LOADED_0.S4, VSUM.S4, VSUM.S4
	VADD	LOADED_1.S4, VSUM.S4, VSUM.S4

	ADD	$-16, LENGTH
	ADD	$16, PDATA
	B	BATCH_16

#undef LOADED_0
#undef LOADED_1

// ---- 8 bytes per iteration (LENGTH < 16 here) -------------------------------
#define LOADED_0 V1

BATCH_8:
	CMP	$8, LENGTH
	BLO	BATCH_2

	VLD1	(PDATA), [LOADED_0.B8]
	VREV16	LOADED_0.B8, LOADED_0.B8
	VUSHLL	$0, LOADED_0.H4, LOADED_0.S4
	VADD	LOADED_0.S4, VSUM.S4, VSUM.S4

	ADD	$-8, LENGTH
	ADD	$8, PDATA
	B	BATCH_8

#undef LOADED_0

// ---- scalar tail: one 16-bit word per iteration (LENGTH < 8 here) -----------
#define LOADED_L R3
#define LOADED_H R4

BATCH_2:
	CMP	$2, LENGTH
	BLO	BATCH_1

	MOVBU	(PDATA), LOADED_H	// first byte  -> high half
	MOVBU	1(PDATA), LOADED_L	// second byte -> low half
	LSL	$8, LOADED_H
	ORR	LOADED_H, LOADED_L, LOADED_L
	ADD	LOADED_L, RESULT, RESULT

	ADD	$2, PDATA
	ADD	$-2, LENGTH
	B	BATCH_2

#undef LOADED_H
#undef LOADED_L

// ---- scalar tail: final odd byte (LENGTH is 0 or 1 here) --------------------
#define LOADED R3

BATCH_1:
	CMP	$1, LENGTH
	BLO	COLLECT

	MOVBU	(PDATA), LOADED
	LSL	$8, LOADED		// lone byte is the high byte; low byte is zero
	ADD	LOADED, RESULT, RESULT

// Release LOADED before R3 is given its next name.
#undef LOADED

// ---- fold the four vector lanes into the scalar total -----------------------
#define EXTRACTED R3

COLLECT:
	VMOV	VSUM.S[0], EXTRACTED
	ADD	EXTRACTED, RESULT
	VMOV	VSUM.S[1], EXTRACTED
	ADD	EXTRACTED, RESULT
	VMOV	VSUM.S[2], EXTRACTED
	ADD	EXTRACTED, RESULT
	VMOV	VSUM.S[3], EXTRACTED
	ADD	EXTRACTED, RESULT

#undef EXTRACTED
#undef VSUM
#undef PDATA
#undef LENGTH

RETURN:
	MOVD	RESULT, result+16(FP)
	RET

// RESULT is still needed by the store above, so it is released last. Together
// with the LOADED/EXTRACTED undefs this leaves no macro from this routine
// defined for whatever follows in the file.
#undef RESULT