mirror of
https://github.com/refraction-networking/utls.git
synced 2025-04-06 05:27:36 +03:00
Cleans things up quite a bit. There's still a few more, like runtime.cmpstring, which might also be worth fixing. Change-Id: Ide18dd621efc129cc686db223f47fa0b044b5580 Reviewed-on: https://go-review.googlesource.com/c/148578 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Austin Clements <austin@google.com>
126 lines
3.3 KiB
ArmAsm
126 lines
3.3 KiB
ArmAsm
// Copyright 2018 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
#include "textflag.h"
|
|
|
|
TEXT ·IndexByte(SB),NOSPLIT,$0-40
|
|
MOVD b_base+0(FP), R0
|
|
MOVD b_len+8(FP), R2
|
|
MOVBU c+24(FP), R1
|
|
MOVD $ret+32(FP), R8
|
|
B indexbytebody<>(SB)
|
|
|
|
TEXT ·IndexByteString(SB),NOSPLIT,$0-32
|
|
MOVD s_base+0(FP), R0
|
|
MOVD s_len+8(FP), R2
|
|
MOVBU c+16(FP), R1
|
|
MOVD $ret+24(FP), R8
|
|
B indexbytebody<>(SB)
|
|
|
|
// input:
|
|
// R0: data
|
|
// R1: byte to search
|
|
// R2: data len
|
|
// R8: address to put result
|
|
TEXT indexbytebody<>(SB),NOSPLIT,$0
|
|
// Core algorithm:
|
|
// For each 32-byte chunk we calculate a 64-bit syndrome value,
|
|
// with two bits per byte. For each tuple, bit 0 is set if the
|
|
// relevant byte matched the requested character and bit 1 is
|
|
// not used (faster than using a 32bit syndrome). Since the bits
|
|
// in the syndrome reflect exactly the order in which things occur
|
|
// in the original string, counting trailing zeros allows to
|
|
// identify exactly which byte has matched.
|
|
|
|
CBZ R2, fail
|
|
MOVD R0, R11
|
|
// Magic constant 0x40100401 allows us to identify
|
|
// which lane matches the requested byte.
|
|
// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
|
|
// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
|
|
MOVD $0x40100401, R5
|
|
VMOV R1, V0.B16
|
|
// Work with aligned 32-byte chunks
|
|
BIC $0x1f, R0, R3
|
|
VMOV R5, V5.S4
|
|
ANDS $0x1f, R0, R9
|
|
AND $0x1f, R2, R10
|
|
BEQ loop
|
|
|
|
// Input string is not 32-byte aligned. We calculate the
|
|
// syndrome value for the aligned 32 bytes block containing
|
|
// the first bytes and mask off the irrelevant part.
|
|
VLD1.P (R3), [V1.B16, V2.B16]
|
|
SUB $0x20, R9, R4
|
|
ADDS R4, R2, R2
|
|
VCMEQ V0.B16, V1.B16, V3.B16
|
|
VCMEQ V0.B16, V2.B16, V4.B16
|
|
VAND V5.B16, V3.B16, V3.B16
|
|
VAND V5.B16, V4.B16, V4.B16
|
|
VADDP V4.B16, V3.B16, V6.B16 // 256->128
|
|
VADDP V6.B16, V6.B16, V6.B16 // 128->64
|
|
VMOV V6.D[0], R6
|
|
// Clear the irrelevant lower bits
|
|
LSL $1, R9, R4
|
|
LSR R4, R6, R6
|
|
LSL R4, R6, R6
|
|
// The first block can also be the last
|
|
BLS masklast
|
|
// Have we found something already?
|
|
CBNZ R6, tail
|
|
|
|
loop:
|
|
VLD1.P (R3), [V1.B16, V2.B16]
|
|
SUBS $0x20, R2, R2
|
|
VCMEQ V0.B16, V1.B16, V3.B16
|
|
VCMEQ V0.B16, V2.B16, V4.B16
|
|
// If we're out of data we finish regardless of the result
|
|
BLS end
|
|
// Use a fast check for the termination condition
|
|
VORR V4.B16, V3.B16, V6.B16
|
|
VADDP V6.D2, V6.D2, V6.D2
|
|
VMOV V6.D[0], R6
|
|
// We're not out of data, loop if we haven't found the character
|
|
CBZ R6, loop
|
|
|
|
end:
|
|
// Termination condition found, let's calculate the syndrome value
|
|
VAND V5.B16, V3.B16, V3.B16
|
|
VAND V5.B16, V4.B16, V4.B16
|
|
VADDP V4.B16, V3.B16, V6.B16
|
|
VADDP V6.B16, V6.B16, V6.B16
|
|
VMOV V6.D[0], R6
|
|
// Only do the clear for the last possible block with less than 32 bytes
|
|
// Condition flags come from SUBS in the loop
|
|
BHS tail
|
|
|
|
masklast:
|
|
// Clear the irrelevant upper bits
|
|
ADD R9, R10, R4
|
|
AND $0x1f, R4, R4
|
|
SUB $0x20, R4, R4
|
|
NEG R4<<1, R4
|
|
LSL R4, R6, R6
|
|
LSR R4, R6, R6
|
|
|
|
tail:
|
|
// Check that we have found a character
|
|
CBZ R6, fail
|
|
// Count the trailing zeros using bit reversing
|
|
RBIT R6, R6
|
|
// Compensate the last post-increment
|
|
SUB $0x20, R3, R3
|
|
// And count the leading zeros
|
|
CLZ R6, R6
|
|
// R6 is twice the offset into the fragment
|
|
ADD R6>>1, R3, R0
|
|
// Compute the offset result
|
|
SUB R11, R0, R0
|
|
MOVD R0, (R8)
|
|
RET
|
|
|
|
fail:
|
|
MOVD $-1, R0
|
|
MOVD R0, (R8)
|
|
RET
|