utls/bytealg/compare_386.s
Martin Möhrmann ce914eab65 all: replace runtime SSE2 detection with GO386 setting
When GO386=sse2 we can assume sse2 to be present without
a runtime check. If GO386=softfloat is set we can avoid
the usage of SSE2 even if detected.

This might cause a memcpy, memclr and bytealg slowdown of Go
binaries compiled with softfloat on machines that support
SSE2. Such setups are rare and should use GO386=sse2 instead
if performance matters.

On targets that support SSE2 we avoid the runtime overhead of
dynamic cpu feature dispatch.

The removal of runtime sse2 checks also allows internal/cpu to be
simplified further by removing the handling of the required
feature option in a follow-up to this CL.

Change-Id: I90a853a8853a405cb665497c6d1a86556947ba17
Reviewed-on: https://go-review.googlesource.com/c/go/+/344350
Trust: Martin Möhrmann <martin@golang.org>
Run-TryBot: Martin Möhrmann <martin@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2021-08-23 21:22:58 +00:00

144 lines
2.6 KiB
ArmAsm

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
// Compare loads its two (pointer, length) argument pairs from the frame
// into the registers cmpbody expects and tail-jumps to it.
// The frame offsets (a at 0, b at 12, result at 24) look like two slice
// headers followed by a one-word result; confirm against the Go declaration.
// cmpbody stores the 1/0/-1 result through AX and returns to our caller.
TEXT ·Compare(SB),NOSPLIT,$0-28
	// Result slot: cmpbody writes through this address.
	LEAL	ret+24(FP), AX
	// Operand b: DI = base pointer, DX = length.
	MOVL	b_base+12(FP), DI
	MOVL	b_len+16(FP), DX
	// Operand a: SI = base pointer, BX = length.
	MOVL	a_base+0(FP), SI
	MOVL	a_len+4(FP), BX
	JMP	cmpbody<>(SB)
// cmpstring loads its two (pointer, length) argument pairs from the frame
// into the registers cmpbody expects and tail-jumps to it.
// The frame offsets (a at 0, b at 8, result at 16) look like two string
// headers followed by a one-word result; confirm against the Go declaration.
// cmpbody stores the 1/0/-1 result through AX and returns to our caller.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
	// Result slot: cmpbody writes through this address.
	LEAL	ret+16(FP), AX
	// Operand b: DI = base pointer, DX = length.
	MOVL	b_base+8(FP), DI
	MOVL	b_len+12(FP), DX
	// Operand a: SI = base pointer, BX = length.
	MOVL	a_base+0(FP), SI
	MOVL	a_len+4(FP), BX
	JMP	cmpbody<>(SB)
// cmpbody is the shared three-way comparison routine reached by a tail
// jump from Compare and cmpstring. It compares the first min(alen, blen)
// bytes of a and b; if they are all equal, the lengths decide the result.
// The result is written to the word addressed by AX, not returned in a
// register.
//
// input:
// SI = a
// DI = b
// BX = alen
// DX = blen
// AX = address of return word (set to 1/0/-1)
//
// Registers written: BX, CX, DX, BP, SI, DI and the flags; the 16-byte
// loop additionally writes X0 and X1.
TEXT cmpbody<>(SB),NOSPLIT,$0-0
// Compute BP = min(alen, blen) and keep DX = blen-alen for allsame.
MOVL DX, BP
SUBL BX, DX // DX = blen-alen
// If blen <= alen (signed), skip the next instruction so BP stays blen.
JLE 2(PC)
MOVL BX, BP // BP = min(alen, blen)
// Same base pointer: the common prefix is identical, only lengths matter.
CMPL SI, DI
JEQ allsame
// Fewer than 4 common bytes: a 4-byte load needs special care, see small.
CMPL BP, $4
JB small
// With GO386_softfloat defined, bypass the SSE2 loop entirely.
#ifdef GO386_softfloat
JMP mediumloop
#endif
// Compare 16 bytes per iteration while at least 16 common bytes remain.
largeloop:
CMPL BP, $16
JB mediumloop
MOVOU (SI), X0
MOVOU (DI), X1
// X1 = per-byte equality mask; BX = one bit per byte, set where equal.
PCMPEQB X0, X1
PMOVMSKB X1, BX
XORL $0xffff, BX // convert EQ to NE
JNE diff16 // branch if at least one byte is not equal
ADDL $16, SI
ADDL $16, DI
SUBL $16, BP
JMP largeloop
// A difference lies within the 16 bytes at SI/DI; BX has a bit set for
// every differing byte position.
diff16:
BSFL BX, BX // index of first byte that differs
// Zero DX before the compare: SETHI writes only the low byte, and the
// length difference in DX is no longer needed on this path.
XORL DX, DX
MOVB (SI)(BX*1), CX
CMPB CX, (DI)(BX*1)
SETHI DX // 1 if a's byte is above b's byte (unsigned), else 0
LEAL -1(DX*2), DX // convert 1/0 to +1/-1
MOVL DX, (AX)
RET
// Compare 4 bytes per iteration while more than 4 common bytes remain.
mediumloop:
CMPL BP, $4
JBE _0through4
MOVL (SI), BX
MOVL (DI), CX
CMPL BX, CX
JNE diff4
ADDL $4, SI
ADDL $4, DI
SUBL $4, BP
JMP mediumloop
// 0 to 4 common bytes remain. Load the 4 bytes that end at the end of the
// common region; when BP < 4 this re-reads bytes that already compared
// equal (BP can only be below 4 here after an earlier iteration advanced
// SI and DI, since the entry checks guarantee BP >= 4 initially).
_0through4:
MOVL -4(SI)(BP*1), BX
MOVL -4(DI)(BP*1), CX
CMPL BX, CX
JEQ allsame
// BX and CX hold differing 4-byte words of a and b. After the byte swap
// the byte that came first in memory is the most significant, so the
// highest differing bit belongs to the first differing byte.
diff4:
BSWAPL BX // reverse order of bytes
BSWAPL CX
XORL BX, CX // find bit differences
BSRL CX, CX // index of highest bit difference
SHRL CX, BX // move a's bit to bottom
ANDL $1, BX // mask bit
// a's bit set at the highest difference means a > b.
LEAL -1(BX*2), BX // 1/0 => +1/-1
MOVL BX, (AX)
RET
// 0-3 bytes in common
small:
// CX = -8*BP. NEGL sets the zero flag when BP == 0, i.e. no common bytes.
// Used as a shift count below, only the low 5 bits of CX matter on x86,
// so the effective count is 32-8*BP.
LEAL (BP*8), CX
NEGL CX
JEQ allsame
// load si
// Only the low byte of the pointer is tested: if it is at most 0xfc, the
// 4-byte load at (SI) stays inside the same 256-byte-aligned block.
// Presumably this guards against a load running off the end of a page.
CMPB SI, $0xfc
JA si_high
MOVL (SI), SI
JMP si_finish
// Otherwise load the 4 bytes that end where a's common bytes end, then
// shift the BP wanted bytes down to the bottom of the register.
si_high:
MOVL -4(SI)(BP*1), SI
SHRL CX, SI
// Shift the BP wanted bytes to the top, discarding everything else.
si_finish:
SHLL CX, SI
// same for di
CMPB DI, $0xfc
JA di_high
MOVL (DI), DI
JMP di_finish
di_high:
MOVL -4(DI)(BP*1), DI
SHRL CX, DI
di_finish:
SHLL CX, DI
// SI and DI now hold only the common bytes; compare them as in diff4.
BSWAPL SI // reverse order of bytes
BSWAPL DI
XORL SI, DI // find bit differences
JEQ allsame
BSRL DI, CX // index of highest bit difference
SHRL CX, SI // move a's bit to bottom
ANDL $1, SI // mask bit
LEAL -1(SI*2), BX // 1/0 => +1/-1
MOVL BX, (AX)
RET
// all the bytes in common are the same, so we just need
// to compare the lengths.
// DX still holds blen-alen from the top of the function.
allsame:
// Clear both result registers before TESTL: XORL changes the flags, and
// the SETcc instructions write only the low byte.
XORL BX, BX
XORL CX, CX
TESTL DX, DX
SETLT BX // 1 if alen > blen
SETEQ CX // 1 if alen == blen
// BX = CX + 2*BX - 1: alen > blen gives 1, equal gives 0, alen < blen gives -1.
LEAL -1(CX)(BX*2), BX // 1,0,-1 result
MOVL BX, (AX)
RET