mirror of
https://github.com/refraction-networking/utls.git
synced 2025-04-06 13:37:36 +03:00
When GO386=sse2 we can assume sse2 to be present without a runtime check. If GO386=softfloat is set we can avoid the usage of SSE2 even if detected. This might cause a memcpy, memclr and bytealg slowdown of Go binaries compiled with softfloat on machines that support SSE2. Such setups are rare and should use GO386=sse2 instead if performance matters. On targets that support SSE2 we avoid the runtime overhead of dynamic cpu feature dispatch. The removal of runtime sse2 checks also allows to simplify internal/cpu further by removing handling of the required feature option as a followup after this CL. Change-Id: I90a853a8853a405cb665497c6d1a86556947ba17 Reviewed-on: https://go-review.googlesource.com/c/go/+/344350 Trust: Martin Möhrmann <martin@golang.org> Run-TryBot: Martin Möhrmann <martin@golang.org> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
130 lines
2.1 KiB
ArmAsm
130 lines
2.1 KiB
ArmAsm
// Copyright 2018 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
#include "go_asm.h"
|
|
#include "textflag.h"
|
|
|
|
// memequal(a, b unsafe.Pointer, size uintptr) bool
//
// Reports whether the size bytes at a and at b are identical.
// Frame layout: a+0, b+4, size+8, ret+12 (13 bytes of arguments/results).
// When both pointers are the same address the answer is true without
// touching memory; otherwise the work is handed to memeqbody<> with
// SI=a, DI=b, BX=size and AX=&ret.
TEXT runtime·memequal(SB),NOSPLIT,$0-13
	MOVL	b+4(FP), DI
	MOVL	a+0(FP), SI
	CMPL	DI, SI
	JNE	compare
	// Identical pointers: trivially equal.
	MOVB	$1, ret+12(FP)
	RET
compare:
	LEAL	ret+12(FP), AX		// memeqbody<> writes the result through AX
	MOVL	size+8(FP), BX
	JMP	memeqbody<>(SB)		// tail call; memeqbody<> returns to our caller
|
|
|
|
// memequal_varlen(a, b unsafe.Pointer) bool
//
// Same comparison as memequal, except that the byte count is not an
// argument: it is read from the closure referenced by DX.
// Frame layout: a+0, b+4, ret+8 (9 bytes of arguments/results).
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
	MOVL	b+4(FP), DI
	MOVL	a+0(FP), SI
	CMPL	DI, SI
	JNE	compare
	// Identical pointers: trivially equal.
	MOVB	$1, ret+8(FP)
	RET
compare:
	LEAL	ret+8(FP), AX		// memeqbody<> writes the result through AX
	MOVL	4(DX), BX		// compiler stores size at offset 4 in the closure
	JMP	memeqbody<>(SB)		// tail call; memeqbody<> returns to our caller
|
|
|
|
// memeqbody<> is the shared comparison routine reached by JMP from
// memequal and memequal_varlen. It compares count bytes at a and b and
// stores 1 (equal) or 0 (different) into the byte addressed by AX.
// It takes its inputs in registers rather than on the stack:
// a in SI
|
|
// b in DI
|
|
// count in BX
|
|
// address of result byte in AX
|
|
// SI, DI, BX, CX and DX are overwritten; X0-X7 are overwritten by the
// 64-byte loop.
TEXT memeqbody<>(SB),NOSPLIT,$0-0
|
|
// Counts below 4 cannot use the 4-byte loads of the main paths.
CMPL BX, $4
|
|
JB small
|
|
|
|
// 64 bytes at a time using xmm registers
|
|
hugeloop:
|
|
// Fewer than 64 bytes left: continue with the 4-byte loop.
CMPL BX, $64
|
|
JB bigloop
|
|
// With GO386_softfloat defined, the SSE2 block below is never executed:
// control always continues in the 4-byte loop.
#ifdef GO386_softfloat
|
|
JMP bigloop
|
|
#endif
|
|
// Load four 16-byte chunks from each operand (unaligned loads).
MOVOU (SI), X0
|
|
MOVOU (DI), X1
|
|
MOVOU 16(SI), X2
|
|
MOVOU 16(DI), X3
|
|
MOVOU 32(SI), X4
|
|
MOVOU 32(DI), X5
|
|
MOVOU 48(SI), X6
|
|
MOVOU 48(DI), X7
|
|
// Byte-wise equality: each result byte is 0xff where the inputs match,
// 0x00 where they differ.
PCMPEQB X1, X0
|
|
PCMPEQB X3, X2
|
|
PCMPEQB X5, X4
|
|
PCMPEQB X7, X6
|
|
// AND the four masks together so X0 is all ones only if all 64 bytes match.
PAND X2, X0
|
|
PAND X6, X4
|
|
PAND X4, X0
|
|
// Collapse the 16 mask bytes into 16 bits in DX.
PMOVMSKB X0, DX
|
|
// Advance past the chunk before testing the mask; the CMPL below is the
// last flag-setting instruction before the branch.
ADDL $64, SI
|
|
ADDL $64, DI
|
|
SUBL $64, BX
|
|
// 0xffff means every byte position compared equal.
CMPL DX, $0xffff
|
|
JEQ hugeloop
|
|
// Mismatch found: result is false.
MOVB $0, (AX)
|
|
RET
|
|
|
|
// 4 bytes at a time using 32-bit register
|
|
bigloop:
|
|
// Stop while up to 4 bytes are still pending; they are handled by one
// final load in leftover.
CMPL BX, $4
|
|
JBE leftover
|
|
MOVL (SI), CX
|
|
MOVL (DI), DX
|
|
ADDL $4, SI
|
|
ADDL $4, DI
|
|
SUBL $4, BX
|
|
CMPL CX, DX
|
|
JEQ bigloop
|
|
// Mismatch found: result is false.
MOVB $0, (AX)
|
|
RET
|
|
|
|
// remaining 0-4 bytes
// Compare the 4 bytes that end exactly at the end of each buffer. When
// fewer than 4 bytes remain, this re-reads bytes that already compared
// equal, which does not change the outcome. This path is only reached with
// an original count of at least 4, so the load starts inside the buffer.
|
|
leftover:
|
|
MOVL -4(SI)(BX*1), CX
|
|
MOVL -4(DI)(BX*1), DX
|
|
CMPL CX, DX
|
|
// Result byte = 1 if the final words match, else 0.
SETEQ (AX)
|
|
RET
|
|
|
|
// Counts 0-3. A 4-byte load is still used for each operand, so the bytes
// outside the requested range have to be discarded before testing.
small:
|
|
// Zero bytes: the CMPL leaves the equal flag set, so SETEQ at equal
// stores 1.
CMPL BX, $0
|
|
JEQ equal
|
|
|
|
// CX = -(8*count). x86 32-bit shifts use only the low 5 bits of the
// count, so shifting by CX shifts by 32 - 8*count bits.
LEAL 0(BX*8), CX
|
|
NEGL CX
|
|
|
|
// Look at the low byte of the address: values above 0xfc mean a 4-byte
// load from SI would run past the end of its 256-byte-aligned block.
MOVL SI, DX
|
|
CMPB DX, $0xfc
|
|
JA si_high
|
|
|
|
// load at SI won't cross a page boundary.
// The wanted bytes land in the low bytes of SI; the bytes above them are
// unrelated data that the final SHLL discards.
|
|
MOVL (SI), SI
|
|
JMP si_finish
|
|
si_high:
|
|
// address ends in 111111xx. Load up to bytes we want, move to correct position.
// The load ends at a+count, so the wanted bytes are the high bytes of the
// word; the SHRL moves them down to the low bytes.
|
|
MOVL -4(SI)(BX*1), SI
|
|
SHRL CX, SI
|
|
si_finish:
|
|
|
|
// same for DI.
|
|
MOVL DI, DX
|
|
CMPB DX, $0xfc
|
|
JA di_high
|
|
MOVL (DI), DI
|
|
JMP di_finish
|
|
di_high:
|
|
MOVL -4(DI)(BX*1), DI
|
|
SHRL CX, DI
|
|
di_finish:
|
|
|
|
// The low 8*count bits of the difference are zero exactly when the wanted
// bytes match. Shifting left by 32 - 8*count drops all higher bits and
// sets the zero flag from what remains.
SUBL SI, DI
|
|
SHLL CX, DI
|
|
equal:
|
|
// Flags come from either the SHLL above or the CMPL BX, $0 at small.
SETEQ (AX)
|
|
RET
|