utls/bytealg/equal_386.s
Martin Möhrmann ce914eab65 all: replace runtime SSE2 detection with GO386 setting
When GO386=sse2 we can assume sse2 to be present without
a runtime check. If GO386=softfloat is set we can avoid
the usage of SSE2 even if detected.

This might cause a memcpy, memclr and bytealg slowdown of Go
binaries compiled with softfloat on machines that support
SSE2. Such setups are rare and should use GO386=sse2 instead
if performance matters.

On targets that support SSE2 we avoid the runtime overhead of
dynamic cpu feature dispatch.

The removal of runtime sse2 checks also allows to simplify
internal/cpu further by removing handling of the required
feature option as a followup after this CL.

Change-Id: I90a853a8853a405cb665497c6d1a86556947ba17
Reviewed-on: https://go-review.googlesource.com/c/go/+/344350
Trust: Martin Möhrmann <martin@golang.org>
Run-TryBot: Martin Möhrmann <martin@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2021-08-23 21:22:58 +00:00

130 lines
2.1 KiB
ArmAsm

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT,$0-13
MOVL a+0(FP), SI
MOVL b+4(FP), DI
CMPL SI, DI
JEQ eq
MOVL size+8(FP), BX
LEAL ret+12(FP), AX
JMP memeqbody<>(SB)
eq:
MOVB $1, ret+12(FP)
RET
// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
MOVL a+0(FP), SI
MOVL b+4(FP), DI
CMPL SI, DI
JEQ eq
MOVL 4(DX), BX // compiler stores size at offset 4 in the closure
LEAL ret+8(FP), AX
JMP memeqbody<>(SB)
eq:
MOVB $1, ret+8(FP)
RET
// a in SI
// b in DI
// count in BX
// address of result byte in AX
TEXT memeqbody<>(SB),NOSPLIT,$0-0
CMPL BX, $4
JB small
// 64 bytes at a time using xmm registers
hugeloop:
CMPL BX, $64
JB bigloop
#ifdef GO386_softfloat
JMP bigloop
#endif
MOVOU (SI), X0
MOVOU (DI), X1
MOVOU 16(SI), X2
MOVOU 16(DI), X3
MOVOU 32(SI), X4
MOVOU 32(DI), X5
MOVOU 48(SI), X6
MOVOU 48(DI), X7
PCMPEQB X1, X0
PCMPEQB X3, X2
PCMPEQB X5, X4
PCMPEQB X7, X6
PAND X2, X0
PAND X6, X4
PAND X4, X0
PMOVMSKB X0, DX
ADDL $64, SI
ADDL $64, DI
SUBL $64, BX
CMPL DX, $0xffff
JEQ hugeloop
MOVB $0, (AX)
RET
// 4 bytes at a time using 32-bit register
bigloop:
CMPL BX, $4
JBE leftover
MOVL (SI), CX
MOVL (DI), DX
ADDL $4, SI
ADDL $4, DI
SUBL $4, BX
CMPL CX, DX
JEQ bigloop
MOVB $0, (AX)
RET
// remaining 0-4 bytes
leftover:
MOVL -4(SI)(BX*1), CX
MOVL -4(DI)(BX*1), DX
CMPL CX, DX
SETEQ (AX)
RET
small:
CMPL BX, $0
JEQ equal
LEAL 0(BX*8), CX
NEGL CX
MOVL SI, DX
CMPB DX, $0xfc
JA si_high
// load at SI won't cross a page boundary.
MOVL (SI), SI
JMP si_finish
si_high:
// address ends in 111111xx. Load up to bytes we want, move to correct position.
MOVL -4(SI)(BX*1), SI
SHRL CX, SI
si_finish:
// same for DI.
MOVL DI, DX
CMPB DX, $0xfc
JA di_high
MOVL (DI), DI
JMP di_finish
di_high:
MOVL -4(DI)(BX*1), DI
SHRL CX, DI
di_finish:
SUBL SI, DI
SHLL CX, DI
equal:
SETEQ (AX)
RET