Mirror of https://github.com/SagerNet/sing-tun.git (synced 2025-04-02 03:17:38 +03:00)

Add tailscale checksum

parent 4ebeb2fa86
commit 06b4d4ecd1

11 changed files with 2254 additions and 28 deletions
internal/checksum_test/sum_bench_test.go (new file, 33 lines)

@@ -0,0 +1,33 @@
package checksum_test

import (
    "crypto/rand"
    "testing"

    "github.com/sagernet/sing-tun/internal/gtcpip/checksum"
    "github.com/sagernet/sing-tun/internal/tschecksum"
)

func BenchmarkTsChecksum(b *testing.B) {
    packet := make([][]byte, 1000)
    for i := 0; i < 1000; i++ {
        packet[i] = make([]byte, 1500)
        rand.Read(packet[i])
    }
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        tschecksum.Checksum(packet[i%1000], 0)
    }
}

func BenchmarkGChecksum(b *testing.B) {
    packet := make([][]byte, 1000)
    for i := 0; i < 1000; i++ {
        packet[i] = make([]byte, 1500)
        rand.Read(packet[i])
    }
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        checksum.Checksum(packet[i%1000], 0)
    }
}
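These benchmarks compare the new tschecksum implementation against the existing gVisor-derived one over 1000 random 1500-byte packets (a typical Ethernet MTU). They should be runnable with something like: go test -bench . ./internal/checksum_test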
@@ -30,34 +30,6 @@ func Put(b []byte, xsum uint16) {
    binary.BigEndian.PutUint16(b, xsum)
}

// Checksum calculates the checksum (as defined in RFC 1071) of the bytes in the
// given byte array. This function uses an optimized version of the checksum
// algorithm.
//
// The initial checksum must have been computed on an even number of bytes.
func Checksum(buf []byte, initial uint16) uint16 {
    s, _ := calculateChecksum(buf, false, initial)
    return s
}

// Checksumer calculates checksum defined in RFC 1071.
type Checksumer struct {
    sum uint16
    odd bool
}

// Add adds b to checksum.
func (c *Checksumer) Add(b []byte) {
    if len(b) > 0 {
        c.sum, c.odd = calculateChecksum(b, c.odd, c.sum)
    }
}

// Checksum returns the latest checksum value.
func (c *Checksumer) Checksum() uint16 {
    return c.sum
}

// Combine combines the two uint16 to form their checksum. This is done
// by adding them and the carry.
//
internal/gtcpip/checksum/checksum_default.go (new file, 13 lines)

@@ -0,0 +1,13 @@
//go:build !amd64

package checksum

// Checksum calculates the checksum (as defined in RFC 1071) of the bytes in the
// given byte array. This function uses an optimized version of the checksum
// algorithm.
//
// The initial checksum must have been computed on an even number of bytes.
func Checksum(buf []byte, initial uint16) uint16 {
    s, _ := calculateChecksum(buf, false, initial)
    return s
}
internal/gtcpip/checksum/checksum_ts.go (new file, 9 lines)

@@ -0,0 +1,9 @@
//go:build amd64

package checksum

import "github.com/sagernet/sing-tun/internal/tschecksum"

func Checksum(buf []byte, initial uint16) uint16 {
    return tschecksum.Checksum(buf, initial)
}
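Taken together, checksum_default.go and checksum_ts.go select an implementation at compile time via build tags: amd64 builds route checksum.Checksum through tschecksum, while every other architecture keeps the gVisor-derived calculateChecksum path. Callers see one API either way. A minimal usage sketch (illustrative only; the byte values are invented, and because the package is internal this only compiles from inside the sing-tun module):

    package main

    import (
        "fmt"

        "github.com/sagernet/sing-tun/internal/gtcpip/checksum"
    )

    func main() {
        // Any byte slice works; which implementation runs is decided by GOARCH.
        data := []byte{0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00}
        fmt.Printf("%#04x\n", checksum.Checksum(data, 0))
    }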
@@ -1,3 +1,5 @@
//go:build !amd64

// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
internal/tschecksum/checksum.go (new file, 712 lines)

@@ -0,0 +1,712 @@
package tschecksum

import (
    "encoding/binary"
    "math/bits"
    "strconv"

    "golang.org/x/sys/cpu"
)

// checksumGeneric64 is a reference implementation of checksum using 64 bit
// arithmetic for use in testing or when an architecture-specific implementation
// is not available.
func checksumGeneric64(b []byte, initial uint16) uint16 {
    var ac uint64
    var carry uint64

    if cpu.IsBigEndian {
        ac = uint64(initial)
    } else {
        ac = uint64(bits.ReverseBytes16(initial))
    }

    for len(b) >= 128 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[:8]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[8:16]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[16:24]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[24:32]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[32:40]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[40:48]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[48:56]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[56:64]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[64:72]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[72:80]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[80:88]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[88:96]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[96:104]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[104:112]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[112:120]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[120:128]), carry)
        } else {
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[:8]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[8:16]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[16:24]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[24:32]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[32:40]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[40:48]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[48:56]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[56:64]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[64:72]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[72:80]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[80:88]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[88:96]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[96:104]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[104:112]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[112:120]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[120:128]), carry)
        }
        b = b[128:]
    }
    if len(b) >= 64 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[:8]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[8:16]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[16:24]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[24:32]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[32:40]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[40:48]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[48:56]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[56:64]), carry)
        } else {
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[:8]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[8:16]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[16:24]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[24:32]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[32:40]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[40:48]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[48:56]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[56:64]), carry)
        }
        b = b[64:]
    }
    if len(b) >= 32 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[:8]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[8:16]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[16:24]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[24:32]), carry)
        } else {
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[:8]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[8:16]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[16:24]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[24:32]), carry)
        }
        b = b[32:]
    }
    if len(b) >= 16 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[:8]), carry)
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b[8:16]), carry)
        } else {
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[:8]), carry)
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b[8:16]), carry)
        }
        b = b[16:]
    }
    if len(b) >= 8 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add64(ac, binary.BigEndian.Uint64(b), carry)
        } else {
            ac, carry = bits.Add64(ac, binary.LittleEndian.Uint64(b), carry)
        }
        b = b[8:]
    }
    if len(b) >= 4 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add64(ac, uint64(binary.BigEndian.Uint32(b)), carry)
        } else {
            ac, carry = bits.Add64(ac, uint64(binary.LittleEndian.Uint32(b)), carry)
        }
        b = b[4:]
    }
    if len(b) >= 2 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add64(ac, uint64(binary.BigEndian.Uint16(b)), carry)
        } else {
            ac, carry = bits.Add64(ac, uint64(binary.LittleEndian.Uint16(b)), carry)
        }
        b = b[2:]
    }
    if len(b) >= 1 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add64(ac, uint64(b[0])<<8, carry)
        } else {
            ac, carry = bits.Add64(ac, uint64(b[0]), carry)
        }
    }

    folded := ipChecksumFold64(ac, carry)
    if !cpu.IsBigEndian {
        folded = bits.ReverseBytes16(folded)
    }
    return folded
}

// checksumGeneric32 is a reference implementation of checksum using 32 bit
// arithmetic for use in testing or when an architecture-specific implementation
// is not available.
func checksumGeneric32(b []byte, initial uint16) uint16 {
    var ac uint32
    var carry uint32

    if cpu.IsBigEndian {
        ac = uint32(initial)
    } else {
        ac = uint32(bits.ReverseBytes16(initial))
    }

    for len(b) >= 64 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[:8]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[4:8]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[8:12]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[12:16]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[16:20]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[20:24]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[24:28]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[28:32]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[32:36]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[36:40]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[40:44]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[44:48]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[48:52]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[52:56]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[56:60]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[60:64]), carry)
        } else {
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[:8]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[4:8]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[8:12]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[12:16]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[16:20]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[20:24]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[24:28]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[28:32]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[32:36]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[36:40]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[40:44]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[44:48]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[48:52]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[52:56]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[56:60]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[60:64]), carry)
        }
        b = b[64:]
    }
    if len(b) >= 32 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[:4]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[4:8]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[8:12]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[12:16]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[16:20]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[20:24]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[24:28]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[28:32]), carry)
        } else {
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[:4]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[4:8]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[8:12]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[12:16]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[16:20]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[20:24]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[24:28]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[28:32]), carry)
        }
        b = b[32:]
    }
    if len(b) >= 16 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[:4]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[4:8]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[8:12]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[12:16]), carry)
        } else {
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[:4]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[4:8]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[8:12]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[12:16]), carry)
        }
        b = b[16:]
    }
    if len(b) >= 8 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[:4]), carry)
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b[4:8]), carry)
        } else {
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[:4]), carry)
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b[4:8]), carry)
        }
        b = b[8:]
    }
    if len(b) >= 4 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add32(ac, binary.BigEndian.Uint32(b), carry)
        } else {
            ac, carry = bits.Add32(ac, binary.LittleEndian.Uint32(b), carry)
        }
        b = b[4:]
    }
    if len(b) >= 2 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add32(ac, uint32(binary.BigEndian.Uint16(b)), carry)
        } else {
            ac, carry = bits.Add32(ac, uint32(binary.LittleEndian.Uint16(b)), carry)
        }
        b = b[2:]
    }
    if len(b) >= 1 {
        if cpu.IsBigEndian {
            ac, carry = bits.Add32(ac, uint32(b[0])<<8, carry)
        } else {
            ac, carry = bits.Add32(ac, uint32(b[0]), carry)
        }
    }

    folded := ipChecksumFold32(ac, carry)
    if !cpu.IsBigEndian {
        folded = bits.ReverseBytes16(folded)
    }
    return folded
}

// checksumGeneric32Alternate is an alternate reference implementation of
// checksum using 32 bit arithmetic for use in testing or when an
// architecture-specific implementation is not available.
func checksumGeneric32Alternate(b []byte, initial uint16) uint16 {
    var ac uint32

    if cpu.IsBigEndian {
        ac = uint32(initial)
    } else {
        ac = uint32(bits.ReverseBytes16(initial))
    }

    for len(b) >= 64 {
        if cpu.IsBigEndian {
            ac += uint32(binary.BigEndian.Uint16(b[:2]))
            ac += uint32(binary.BigEndian.Uint16(b[2:4]))
            ac += uint32(binary.BigEndian.Uint16(b[4:6]))
            ac += uint32(binary.BigEndian.Uint16(b[6:8]))
            ac += uint32(binary.BigEndian.Uint16(b[8:10]))
            ac += uint32(binary.BigEndian.Uint16(b[10:12]))
            ac += uint32(binary.BigEndian.Uint16(b[12:14]))
            ac += uint32(binary.BigEndian.Uint16(b[14:16]))
            ac += uint32(binary.BigEndian.Uint16(b[16:18]))
            ac += uint32(binary.BigEndian.Uint16(b[18:20]))
            ac += uint32(binary.BigEndian.Uint16(b[20:22]))
            ac += uint32(binary.BigEndian.Uint16(b[22:24]))
            ac += uint32(binary.BigEndian.Uint16(b[24:26]))
            ac += uint32(binary.BigEndian.Uint16(b[26:28]))
            ac += uint32(binary.BigEndian.Uint16(b[28:30]))
            ac += uint32(binary.BigEndian.Uint16(b[30:32]))
            ac += uint32(binary.BigEndian.Uint16(b[32:34]))
            ac += uint32(binary.BigEndian.Uint16(b[34:36]))
            ac += uint32(binary.BigEndian.Uint16(b[36:38]))
            ac += uint32(binary.BigEndian.Uint16(b[38:40]))
            ac += uint32(binary.BigEndian.Uint16(b[40:42]))
            ac += uint32(binary.BigEndian.Uint16(b[42:44]))
            ac += uint32(binary.BigEndian.Uint16(b[44:46]))
            ac += uint32(binary.BigEndian.Uint16(b[46:48]))
            ac += uint32(binary.BigEndian.Uint16(b[48:50]))
            ac += uint32(binary.BigEndian.Uint16(b[50:52]))
            ac += uint32(binary.BigEndian.Uint16(b[52:54]))
            ac += uint32(binary.BigEndian.Uint16(b[54:56]))
            ac += uint32(binary.BigEndian.Uint16(b[56:58]))
            ac += uint32(binary.BigEndian.Uint16(b[58:60]))
            ac += uint32(binary.BigEndian.Uint16(b[60:62]))
            ac += uint32(binary.BigEndian.Uint16(b[62:64]))
        } else {
            ac += uint32(binary.LittleEndian.Uint16(b[:2]))
            ac += uint32(binary.LittleEndian.Uint16(b[2:4]))
            ac += uint32(binary.LittleEndian.Uint16(b[4:6]))
            ac += uint32(binary.LittleEndian.Uint16(b[6:8]))
            ac += uint32(binary.LittleEndian.Uint16(b[8:10]))
            ac += uint32(binary.LittleEndian.Uint16(b[10:12]))
            ac += uint32(binary.LittleEndian.Uint16(b[12:14]))
            ac += uint32(binary.LittleEndian.Uint16(b[14:16]))
            ac += uint32(binary.LittleEndian.Uint16(b[16:18]))
            ac += uint32(binary.LittleEndian.Uint16(b[18:20]))
            ac += uint32(binary.LittleEndian.Uint16(b[20:22]))
            ac += uint32(binary.LittleEndian.Uint16(b[22:24]))
            ac += uint32(binary.LittleEndian.Uint16(b[24:26]))
            ac += uint32(binary.LittleEndian.Uint16(b[26:28]))
            ac += uint32(binary.LittleEndian.Uint16(b[28:30]))
            ac += uint32(binary.LittleEndian.Uint16(b[30:32]))
            ac += uint32(binary.LittleEndian.Uint16(b[32:34]))
            ac += uint32(binary.LittleEndian.Uint16(b[34:36]))
            ac += uint32(binary.LittleEndian.Uint16(b[36:38]))
            ac += uint32(binary.LittleEndian.Uint16(b[38:40]))
            ac += uint32(binary.LittleEndian.Uint16(b[40:42]))
            ac += uint32(binary.LittleEndian.Uint16(b[42:44]))
            ac += uint32(binary.LittleEndian.Uint16(b[44:46]))
            ac += uint32(binary.LittleEndian.Uint16(b[46:48]))
            ac += uint32(binary.LittleEndian.Uint16(b[48:50]))
            ac += uint32(binary.LittleEndian.Uint16(b[50:52]))
            ac += uint32(binary.LittleEndian.Uint16(b[52:54]))
            ac += uint32(binary.LittleEndian.Uint16(b[54:56]))
            ac += uint32(binary.LittleEndian.Uint16(b[56:58]))
            ac += uint32(binary.LittleEndian.Uint16(b[58:60]))
            ac += uint32(binary.LittleEndian.Uint16(b[60:62]))
            ac += uint32(binary.LittleEndian.Uint16(b[62:64]))
        }
        b = b[64:]
    }
    if len(b) >= 32 {
        if cpu.IsBigEndian {
            ac += uint32(binary.BigEndian.Uint16(b[:2]))
            ac += uint32(binary.BigEndian.Uint16(b[2:4]))
            ac += uint32(binary.BigEndian.Uint16(b[4:6]))
            ac += uint32(binary.BigEndian.Uint16(b[6:8]))
            ac += uint32(binary.BigEndian.Uint16(b[8:10]))
            ac += uint32(binary.BigEndian.Uint16(b[10:12]))
            ac += uint32(binary.BigEndian.Uint16(b[12:14]))
            ac += uint32(binary.BigEndian.Uint16(b[14:16]))
            ac += uint32(binary.BigEndian.Uint16(b[16:18]))
            ac += uint32(binary.BigEndian.Uint16(b[18:20]))
            ac += uint32(binary.BigEndian.Uint16(b[20:22]))
            ac += uint32(binary.BigEndian.Uint16(b[22:24]))
            ac += uint32(binary.BigEndian.Uint16(b[24:26]))
            ac += uint32(binary.BigEndian.Uint16(b[26:28]))
            ac += uint32(binary.BigEndian.Uint16(b[28:30]))
            ac += uint32(binary.BigEndian.Uint16(b[30:32]))
        } else {
            ac += uint32(binary.LittleEndian.Uint16(b[:2]))
            ac += uint32(binary.LittleEndian.Uint16(b[2:4]))
            ac += uint32(binary.LittleEndian.Uint16(b[4:6]))
            ac += uint32(binary.LittleEndian.Uint16(b[6:8]))
            ac += uint32(binary.LittleEndian.Uint16(b[8:10]))
            ac += uint32(binary.LittleEndian.Uint16(b[10:12]))
            ac += uint32(binary.LittleEndian.Uint16(b[12:14]))
            ac += uint32(binary.LittleEndian.Uint16(b[14:16]))
            ac += uint32(binary.LittleEndian.Uint16(b[16:18]))
            ac += uint32(binary.LittleEndian.Uint16(b[18:20]))
            ac += uint32(binary.LittleEndian.Uint16(b[20:22]))
            ac += uint32(binary.LittleEndian.Uint16(b[22:24]))
            ac += uint32(binary.LittleEndian.Uint16(b[24:26]))
            ac += uint32(binary.LittleEndian.Uint16(b[26:28]))
            ac += uint32(binary.LittleEndian.Uint16(b[28:30]))
            ac += uint32(binary.LittleEndian.Uint16(b[30:32]))
        }
        b = b[32:]
    }
    if len(b) >= 16 {
        if cpu.IsBigEndian {
            ac += uint32(binary.BigEndian.Uint16(b[:2]))
            ac += uint32(binary.BigEndian.Uint16(b[2:4]))
            ac += uint32(binary.BigEndian.Uint16(b[4:6]))
            ac += uint32(binary.BigEndian.Uint16(b[6:8]))
            ac += uint32(binary.BigEndian.Uint16(b[8:10]))
            ac += uint32(binary.BigEndian.Uint16(b[10:12]))
            ac += uint32(binary.BigEndian.Uint16(b[12:14]))
            ac += uint32(binary.BigEndian.Uint16(b[14:16]))
        } else {
            ac += uint32(binary.LittleEndian.Uint16(b[:2]))
            ac += uint32(binary.LittleEndian.Uint16(b[2:4]))
            ac += uint32(binary.LittleEndian.Uint16(b[4:6]))
            ac += uint32(binary.LittleEndian.Uint16(b[6:8]))
            ac += uint32(binary.LittleEndian.Uint16(b[8:10]))
            ac += uint32(binary.LittleEndian.Uint16(b[10:12]))
            ac += uint32(binary.LittleEndian.Uint16(b[12:14]))
            ac += uint32(binary.LittleEndian.Uint16(b[14:16]))
        }
        b = b[16:]
    }
    if len(b) >= 8 {
        if cpu.IsBigEndian {
            ac += uint32(binary.BigEndian.Uint16(b[:2]))
            ac += uint32(binary.BigEndian.Uint16(b[2:4]))
            ac += uint32(binary.BigEndian.Uint16(b[4:6]))
            ac += uint32(binary.BigEndian.Uint16(b[6:8]))
        } else {
            ac += uint32(binary.LittleEndian.Uint16(b[:2]))
            ac += uint32(binary.LittleEndian.Uint16(b[2:4]))
            ac += uint32(binary.LittleEndian.Uint16(b[4:6]))
            ac += uint32(binary.LittleEndian.Uint16(b[6:8]))
        }
        b = b[8:]
    }
    if len(b) >= 4 {
        if cpu.IsBigEndian {
            ac += uint32(binary.BigEndian.Uint16(b[:2]))
            ac += uint32(binary.BigEndian.Uint16(b[2:4]))
        } else {
            ac += uint32(binary.LittleEndian.Uint16(b[:2]))
            ac += uint32(binary.LittleEndian.Uint16(b[2:4]))
        }
        b = b[4:]
    }
    if len(b) >= 2 {
        if cpu.IsBigEndian {
            ac += uint32(binary.BigEndian.Uint16(b))
        } else {
            ac += uint32(binary.LittleEndian.Uint16(b))
        }
        b = b[2:]
    }
    if len(b) >= 1 {
        if cpu.IsBigEndian {
            ac += uint32(b[0]) << 8
        } else {
            ac += uint32(b[0])
        }
    }

    folded := ipChecksumFold32(ac, 0)
    if !cpu.IsBigEndian {
        folded = bits.ReverseBytes16(folded)
    }
    return folded
}

// checksumGeneric64Alternate is an alternate reference implementation of
// checksum using 64 bit arithmetic for use in testing or when an
// architecture-specific implementation is not available.
func checksumGeneric64Alternate(b []byte, initial uint16) uint16 {
    var ac uint64

    if cpu.IsBigEndian {
        ac = uint64(initial)
    } else {
        ac = uint64(bits.ReverseBytes16(initial))
    }

    for len(b) >= 64 {
        if cpu.IsBigEndian {
            ac += uint64(binary.BigEndian.Uint32(b[:4]))
            ac += uint64(binary.BigEndian.Uint32(b[4:8]))
            ac += uint64(binary.BigEndian.Uint32(b[8:12]))
            ac += uint64(binary.BigEndian.Uint32(b[12:16]))
            ac += uint64(binary.BigEndian.Uint32(b[16:20]))
            ac += uint64(binary.BigEndian.Uint32(b[20:24]))
            ac += uint64(binary.BigEndian.Uint32(b[24:28]))
            ac += uint64(binary.BigEndian.Uint32(b[28:32]))
            ac += uint64(binary.BigEndian.Uint32(b[32:36]))
            ac += uint64(binary.BigEndian.Uint32(b[36:40]))
            ac += uint64(binary.BigEndian.Uint32(b[40:44]))
            ac += uint64(binary.BigEndian.Uint32(b[44:48]))
            ac += uint64(binary.BigEndian.Uint32(b[48:52]))
            ac += uint64(binary.BigEndian.Uint32(b[52:56]))
            ac += uint64(binary.BigEndian.Uint32(b[56:60]))
            ac += uint64(binary.BigEndian.Uint32(b[60:64]))
        } else {
            ac += uint64(binary.LittleEndian.Uint32(b[:4]))
            ac += uint64(binary.LittleEndian.Uint32(b[4:8]))
            ac += uint64(binary.LittleEndian.Uint32(b[8:12]))
            ac += uint64(binary.LittleEndian.Uint32(b[12:16]))
            ac += uint64(binary.LittleEndian.Uint32(b[16:20]))
            ac += uint64(binary.LittleEndian.Uint32(b[20:24]))
            ac += uint64(binary.LittleEndian.Uint32(b[24:28]))
            ac += uint64(binary.LittleEndian.Uint32(b[28:32]))
            ac += uint64(binary.LittleEndian.Uint32(b[32:36]))
            ac += uint64(binary.LittleEndian.Uint32(b[36:40]))
            ac += uint64(binary.LittleEndian.Uint32(b[40:44]))
            ac += uint64(binary.LittleEndian.Uint32(b[44:48]))
            ac += uint64(binary.LittleEndian.Uint32(b[48:52]))
            ac += uint64(binary.LittleEndian.Uint32(b[52:56]))
            ac += uint64(binary.LittleEndian.Uint32(b[56:60]))
            ac += uint64(binary.LittleEndian.Uint32(b[60:64]))
        }
        b = b[64:]
    }
    if len(b) >= 32 {
        if cpu.IsBigEndian {
            ac += uint64(binary.BigEndian.Uint32(b[:4]))
            ac += uint64(binary.BigEndian.Uint32(b[4:8]))
            ac += uint64(binary.BigEndian.Uint32(b[8:12]))
            ac += uint64(binary.BigEndian.Uint32(b[12:16]))
            ac += uint64(binary.BigEndian.Uint32(b[16:20]))
            ac += uint64(binary.BigEndian.Uint32(b[20:24]))
            ac += uint64(binary.BigEndian.Uint32(b[24:28]))
            ac += uint64(binary.BigEndian.Uint32(b[28:32]))
        } else {
            ac += uint64(binary.LittleEndian.Uint32(b[:4]))
            ac += uint64(binary.LittleEndian.Uint32(b[4:8]))
            ac += uint64(binary.LittleEndian.Uint32(b[8:12]))
            ac += uint64(binary.LittleEndian.Uint32(b[12:16]))
            ac += uint64(binary.LittleEndian.Uint32(b[16:20]))
            ac += uint64(binary.LittleEndian.Uint32(b[20:24]))
            ac += uint64(binary.LittleEndian.Uint32(b[24:28]))
            ac += uint64(binary.LittleEndian.Uint32(b[28:32]))
        }
        b = b[32:]
    }
    if len(b) >= 16 {
        if cpu.IsBigEndian {
            ac += uint64(binary.BigEndian.Uint32(b[:4]))
            ac += uint64(binary.BigEndian.Uint32(b[4:8]))
            ac += uint64(binary.BigEndian.Uint32(b[8:12]))
            ac += uint64(binary.BigEndian.Uint32(b[12:16]))
        } else {
            ac += uint64(binary.LittleEndian.Uint32(b[:4]))
            ac += uint64(binary.LittleEndian.Uint32(b[4:8]))
            ac += uint64(binary.LittleEndian.Uint32(b[8:12]))
            ac += uint64(binary.LittleEndian.Uint32(b[12:16]))
        }
        b = b[16:]
    }
    if len(b) >= 8 {
        if cpu.IsBigEndian {
            ac += uint64(binary.BigEndian.Uint32(b[:4]))
            ac += uint64(binary.BigEndian.Uint32(b[4:8]))
        } else {
            ac += uint64(binary.LittleEndian.Uint32(b[:4]))
            ac += uint64(binary.LittleEndian.Uint32(b[4:8]))
        }
        b = b[8:]
    }
    if len(b) >= 4 {
        if cpu.IsBigEndian {
            ac += uint64(binary.BigEndian.Uint32(b))
        } else {
            ac += uint64(binary.LittleEndian.Uint32(b))
        }
        b = b[4:]
    }
    if len(b) >= 2 {
        if cpu.IsBigEndian {
            ac += uint64(binary.BigEndian.Uint16(b))
        } else {
            ac += uint64(binary.LittleEndian.Uint16(b))
        }
        b = b[2:]
    }
    if len(b) >= 1 {
        if cpu.IsBigEndian {
            ac += uint64(b[0]) << 8
        } else {
            ac += uint64(b[0])
        }
    }

    folded := ipChecksumFold64(ac, 0)
    if !cpu.IsBigEndian {
        folded = bits.ReverseBytes16(folded)
    }
    return folded
}

func ipChecksumFold64(unfolded uint64, initialCarry uint64) uint16 {
    sum, carry := bits.Add32(uint32(unfolded>>32), uint32(unfolded&0xffff_ffff), uint32(initialCarry))
    // if carry != 0, sum <= 0xffff_fffe, otherwise sum <= 0xffff_ffff
    // therefore (sum >> 16) + (sum & 0xffff) + carry <= 0x1_fffe; so there is
    // no need to save the carry flag
    sum = (sum >> 16) + (sum & 0xffff) + carry
    // sum <= 0x1_fffe therefore this is the last fold needed:
    // if (sum >> 16) > 0 then
    // (sum >> 16) == 1 && (sum & 0xffff) <= 0xfffe and therefore
    // the addition will not overflow
    // otherwise (sum >> 16) == 0 and sum will be unchanged
    sum = (sum >> 16) + (sum & 0xffff)
    return uint16(sum)
}

func ipChecksumFold32(unfolded uint32, initialCarry uint32) uint16 {
    sum := (unfolded >> 16) + (unfolded & 0xffff) + initialCarry
    // sum <= 0x1_ffff:
    // 0xffff + 0xffff = 0x1_fffe
    // initialCarry is 0 or 1, for a combined maximum of 0x1_ffff
    sum = (sum >> 16) + (sum & 0xffff)
    // sum <= 0x1_0000 therefore this is the last fold needed:
    // if (sum >> 16) > 0 then
    // (sum >> 16) == 1 && (sum & 0xffff) == 0 and therefore
    // the addition will not overflow
    // otherwise (sum >> 16) == 0 and sum will be unchanged
    sum = (sum >> 16) + (sum & 0xffff)
    return uint16(sum)
}

func addrPartialChecksum64(addr []byte, initial, carryIn uint64) (sum, carry uint64) {
    sum, carry = initial, carryIn
    switch len(addr) {
    case 4: // IPv4
        if cpu.IsBigEndian {
            sum, carry = bits.Add64(sum, uint64(binary.BigEndian.Uint32(addr)), carry)
        } else {
            sum, carry = bits.Add64(sum, uint64(binary.LittleEndian.Uint32(addr)), carry)
        }
    case 16: // IPv6
        if cpu.IsBigEndian {
            sum, carry = bits.Add64(sum, binary.BigEndian.Uint64(addr), carry)
            sum, carry = bits.Add64(sum, binary.BigEndian.Uint64(addr[8:]), carry)
        } else {
            sum, carry = bits.Add64(sum, binary.LittleEndian.Uint64(addr), carry)
            sum, carry = bits.Add64(sum, binary.LittleEndian.Uint64(addr[8:]), carry)
        }
    default:
        panic("bad addr length")
    }
    return sum, carry
}

func addrPartialChecksum32(addr []byte, initial, carryIn uint32) (sum, carry uint32) {
    sum, carry = initial, carryIn
    switch len(addr) {
    case 4: // IPv4
        if cpu.IsBigEndian {
            sum, carry = bits.Add32(sum, binary.BigEndian.Uint32(addr), carry)
        } else {
            sum, carry = bits.Add32(sum, binary.LittleEndian.Uint32(addr), carry)
        }
    case 16: // IPv6
        if cpu.IsBigEndian {
            sum, carry = bits.Add32(sum, binary.BigEndian.Uint32(addr), carry)
            sum, carry = bits.Add32(sum, binary.BigEndian.Uint32(addr[4:8]), carry)
            sum, carry = bits.Add32(sum, binary.BigEndian.Uint32(addr[8:12]), carry)
            sum, carry = bits.Add32(sum, binary.BigEndian.Uint32(addr[12:16]), carry)
        } else {
            sum, carry = bits.Add32(sum, binary.LittleEndian.Uint32(addr), carry)
            sum, carry = bits.Add32(sum, binary.LittleEndian.Uint32(addr[4:8]), carry)
            sum, carry = bits.Add32(sum, binary.LittleEndian.Uint32(addr[8:12]), carry)
            sum, carry = bits.Add32(sum, binary.LittleEndian.Uint32(addr[12:16]), carry)
        }
    default:
        panic("bad addr length")
    }
    return sum, carry
}

func pseudoHeaderChecksum64(protocol uint8, srcAddr, dstAddr []byte, totalLen uint16) uint16 {
    var sum uint64
    if cpu.IsBigEndian {
        sum = uint64(totalLen) + uint64(protocol)
    } else {
        sum = uint64(bits.ReverseBytes16(totalLen)) + uint64(protocol)<<8
    }
    sum, carry := addrPartialChecksum64(srcAddr, sum, 0)
    sum, carry = addrPartialChecksum64(dstAddr, sum, carry)

    foldedSum := ipChecksumFold64(sum, carry)
    if !cpu.IsBigEndian {
        foldedSum = bits.ReverseBytes16(foldedSum)
    }
    return foldedSum
}

func pseudoHeaderChecksum32(protocol uint8, srcAddr, dstAddr []byte, totalLen uint16) uint16 {
    var sum uint32
    if cpu.IsBigEndian {
        sum = uint32(totalLen) + uint32(protocol)
    } else {
        sum = uint32(bits.ReverseBytes16(totalLen)) + uint32(protocol)<<8
    }
    sum, carry := addrPartialChecksum32(srcAddr, sum, 0)
    sum, carry = addrPartialChecksum32(dstAddr, sum, carry)

    foldedSum := ipChecksumFold32(sum, carry)
    if !cpu.IsBigEndian {
        foldedSum = bits.ReverseBytes16(foldedSum)
    }
    return foldedSum
}

// PseudoHeaderChecksum computes an IP pseudo-header checksum. srcAddr and
// dstAddr must be 4 or 16 bytes in length.
func PseudoHeaderChecksum(protocol uint8, srcAddr, dstAddr []byte, totalLen uint16) uint16 {
    if strconv.IntSize < 64 {
        return pseudoHeaderChecksum32(protocol, srcAddr, dstAddr, totalLen)
    }
    return pseudoHeaderChecksum64(protocol, srcAddr, dstAddr, totalLen)
}
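PseudoHeaderChecksum composes with Checksum in the usual RFC 1071 way: the folded pseudo-header sum becomes the initial value for a checksum over the transport header plus payload, and the ones' complement of the result goes into the checksum field. A minimal sketch for UDP over IPv4 (addresses, ports, and payload are invented for illustration, and this omits the UDP rule that a computed checksum of zero is transmitted as 0xffff):

    package main

    import (
        "encoding/binary"
        "fmt"

        "github.com/sagernet/sing-tun/internal/tschecksum"
    )

    func main() {
        src := []byte{192, 0, 2, 1} // example source IPv4 address
        dst := []byte{192, 0, 2, 2} // example destination IPv4 address
        payload := []byte("ping")

        udp := make([]byte, 8+len(payload))
        binary.BigEndian.PutUint16(udp[0:2], 12345)            // source port (made up)
        binary.BigEndian.PutUint16(udp[2:4], 53)               // destination port (made up)
        binary.BigEndian.PutUint16(udp[4:6], uint16(len(udp))) // UDP length
        copy(udp[8:], payload)                                 // checksum field udp[6:8] is still zero

        initial := tschecksum.PseudoHeaderChecksum(17, src, dst, uint16(len(udp))) // 17 = UDP
        csum := ^tschecksum.Checksum(udp, initial)
        binary.BigEndian.PutUint16(udp[6:8], csum)
        fmt.Printf("%#04x\n", csum)
    }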
internal/tschecksum/checksum_amd64.go (new file, 23 lines)

@@ -0,0 +1,23 @@
package tschecksum

import "golang.org/x/sys/cpu"

var checksum = checksumAMD64

// Checksum computes an IP checksum starting with the provided initial value.
// The length of data should be at least 128 bytes for best performance. Smaller
// buffers will still compute a correct result.
func Checksum(data []byte, initial uint16) uint16 {
    return checksum(data, initial)
}

func init() {
    if cpu.X86.HasAVX && cpu.X86.HasAVX2 && cpu.X86.HasBMI2 {
        checksum = checksumAVX2
        return
    }
    if cpu.X86.HasSSE2 {
        checksum = checksumSSE2
        return
    }
}
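Because the implementation is bound to a package-level function variable once in init (preferring AVX2+BMI2, then SSE2, then the baseline), an in-package test can still exercise each variant explicitly and check that they agree with the generic reference. A hypothetical sketch, guarding each call on the CPU features it needs:

    package tschecksum

    import (
        "testing"

        "golang.org/x/sys/cpu"
    )

    func TestVariantsAgree(t *testing.T) {
        buf := make([]byte, 1501) // odd length to exercise the scalar tail handling
        for i := range buf {
            buf[i] = byte(i)
        }
        want := checksumGeneric64(buf, 0)
        if got := checksumAMD64(buf, 0); got != want {
            t.Errorf("amd64: got %#04x, want %#04x", got, want)
        }
        if cpu.X86.HasSSE2 {
            if got := checksumSSE2(buf, 0); got != want {
                t.Errorf("sse2: got %#04x, want %#04x", got, want)
            }
        }
        if cpu.X86.HasAVX && cpu.X86.HasAVX2 && cpu.X86.HasBMI2 {
            if got := checksumAVX2(buf, 0); got != want {
                t.Errorf("avx2: got %#04x, want %#04x", got, want)
            }
        }
    }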
internal/tschecksum/checksum_generated_amd64.go (new file, 18 lines)

@@ -0,0 +1,18 @@
// Code generated by command: go run generate_amd64.go -out checksum_generated_amd64.s -stubs checksum_generated_amd64.go. DO NOT EDIT.

package tschecksum

// checksumAVX2 computes an IP checksum using amd64 v3 instructions (AVX2, BMI2)
//
//go:noescape
func checksumAVX2(b []byte, initial uint16) uint16

// checksumSSE2 computes an IP checksum using amd64 baseline instructions (SSE2)
//
//go:noescape
func checksumSSE2(b []byte, initial uint16) uint16

// checksumAMD64 computes an IP checksum using amd64 baseline instructions
//
//go:noescape
func checksumAMD64(b []byte, initial uint16) uint16
851
internal/tschecksum/checksum_generated_amd64.s
Normal file
851
internal/tschecksum/checksum_generated_amd64.s
Normal file
|
@ -0,0 +1,851 @@
|
|||
// Code generated by command: go run generate_amd64.go -out checksum_generated_amd64.s -stubs checksum_generated_amd64.go. DO NOT EDIT.
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
DATA xmmLoadMasks<>+0(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff"
|
||||
DATA xmmLoadMasks<>+16(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff"
|
||||
DATA xmmLoadMasks<>+32(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff"
|
||||
DATA xmmLoadMasks<>+48(SB)/16, $"\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
DATA xmmLoadMasks<>+64(SB)/16, $"\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
DATA xmmLoadMasks<>+80(SB)/16, $"\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
DATA xmmLoadMasks<>+96(SB)/16, $"\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"
|
||||
GLOBL xmmLoadMasks<>(SB), RODATA|NOPTR, $112
|
||||
|
||||
// func checksumAVX2(b []byte, initial uint16) uint16
|
||||
// Requires: AVX, AVX2, BMI2
|
||||
TEXT ·checksumAVX2(SB), NOSPLIT|NOFRAME, $0-34
|
||||
MOVWQZX initial+24(FP), AX
|
||||
XCHGB AH, AL
|
||||
MOVQ b_base+0(FP), DX
|
||||
MOVQ b_len+8(FP), BX
|
||||
|
||||
// handle odd length buffers; they are difficult to handle in general
|
||||
TESTQ $0x00000001, BX
|
||||
JZ lengthIsEven
|
||||
MOVBQZX -1(DX)(BX*1), CX
|
||||
DECQ BX
|
||||
ADDQ CX, AX
|
||||
|
||||
lengthIsEven:
|
||||
// handle tiny buffers (<=31 bytes) specially
|
||||
CMPQ BX, $0x1f
|
||||
JGT bufferIsNotTiny
|
||||
XORQ CX, CX
|
||||
XORQ SI, SI
|
||||
XORQ DI, DI
|
||||
|
||||
// shift twice to start because length is guaranteed to be even
|
||||
// n = n >> 2; CF = originalN & 2
|
||||
SHRQ $0x02, BX
|
||||
JNC handleTiny4
|
||||
|
||||
// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
|
||||
MOVWQZX (DX), CX
|
||||
ADDQ $0x02, DX
|
||||
|
||||
handleTiny4:
|
||||
// n = n >> 1; CF = originalN & 4
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTiny8
|
||||
|
||||
// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
|
||||
MOVLQZX (DX), SI
|
||||
ADDQ $0x04, DX
|
||||
|
||||
handleTiny8:
|
||||
// n = n >> 1; CF = originalN & 8
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTiny16
|
||||
|
||||
// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
|
||||
MOVQ (DX), DI
|
||||
ADDQ $0x08, DX
|
||||
|
||||
handleTiny16:
|
||||
// n = n >> 1; CF = originalN & 16
|
||||
// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTinyFinish
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
|
||||
handleTinyFinish:
|
||||
// CF should be included from the previous add, so we use ADCQ.
|
||||
// If we arrived via the JNC above, then CF=0 due to the branch condition,
|
||||
// so ADCQ will still produce the correct result.
|
||||
ADCQ CX, AX
|
||||
ADCQ SI, AX
|
||||
ADCQ DI, AX
|
||||
JMP foldAndReturn
|
||||
|
||||
bufferIsNotTiny:
|
||||
// skip all SIMD for small buffers
|
||||
CMPQ BX, $0x00000100
|
||||
JGE startSIMD
|
||||
|
||||
// Accumulate carries in this register. It is never expected to overflow.
|
||||
XORQ SI, SI
|
||||
|
||||
// We will perform an overlapped read for buffers with length not a multiple of 8.
|
||||
// Overlapped in this context means some memory will be read twice, but a shift will
|
||||
// eliminate the duplicated data. This extra read is performed at the end of the buffer to
|
||||
// preserve any alignment that may exist for the start of the buffer.
|
||||
MOVQ BX, CX
|
||||
SHRQ $0x03, BX
|
||||
ANDQ $0x07, CX
|
||||
JZ handleRemaining8
|
||||
LEAQ (DX)(BX*8), DI
|
||||
MOVQ -8(DI)(CX*1), DI
|
||||
|
||||
// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
|
||||
SHLQ $0x03, CX
|
||||
NEGQ CX
|
||||
ADDQ $0x40, CX
|
||||
SHRQ CL, DI
|
||||
ADDQ DI, AX
|
||||
ADCQ $0x00, SI
|
||||
|
||||
handleRemaining8:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining16
|
||||
ADDQ (DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x08, DX
|
||||
|
||||
handleRemaining16:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining32
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x10, DX
|
||||
|
||||
handleRemaining32:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining64
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x20, DX
|
||||
|
||||
handleRemaining64:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining128
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ 32(DX), AX
|
||||
ADCQ 40(DX), AX
|
||||
ADCQ 48(DX), AX
|
||||
ADCQ 56(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x40, DX
|
||||
|
||||
handleRemaining128:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemainingComplete
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ 32(DX), AX
|
||||
ADCQ 40(DX), AX
|
||||
ADCQ 48(DX), AX
|
||||
ADCQ 56(DX), AX
|
||||
ADCQ 64(DX), AX
|
||||
ADCQ 72(DX), AX
|
||||
ADCQ 80(DX), AX
|
||||
ADCQ 88(DX), AX
|
||||
ADCQ 96(DX), AX
|
||||
ADCQ 104(DX), AX
|
||||
ADCQ 112(DX), AX
|
||||
ADCQ 120(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x80, DX
|
||||
|
||||
handleRemainingComplete:
|
||||
ADDQ SI, AX
|
||||
JMP foldAndReturn
|
||||
|
||||
startSIMD:
|
||||
VPXOR Y0, Y0, Y0
|
||||
VPXOR Y1, Y1, Y1
|
||||
VPXOR Y2, Y2, Y2
|
||||
VPXOR Y3, Y3, Y3
|
||||
MOVQ BX, CX
|
||||
|
||||
// Update number of bytes remaining after the loop completes
|
||||
ANDQ $0xff, BX
|
||||
|
||||
// Number of 256 byte iterations
|
||||
SHRQ $0x08, CX
|
||||
JZ smallLoop
|
||||
|
||||
bigLoop:
|
||||
VPMOVZXWD (DX), Y4
|
||||
VPADDD Y4, Y0, Y0
|
||||
VPMOVZXWD 16(DX), Y4
|
||||
VPADDD Y4, Y1, Y1
|
||||
VPMOVZXWD 32(DX), Y4
|
||||
VPADDD Y4, Y2, Y2
|
||||
VPMOVZXWD 48(DX), Y4
|
||||
VPADDD Y4, Y3, Y3
|
||||
VPMOVZXWD 64(DX), Y4
|
||||
VPADDD Y4, Y0, Y0
|
||||
VPMOVZXWD 80(DX), Y4
|
||||
VPADDD Y4, Y1, Y1
|
||||
VPMOVZXWD 96(DX), Y4
|
||||
VPADDD Y4, Y2, Y2
|
||||
VPMOVZXWD 112(DX), Y4
|
||||
VPADDD Y4, Y3, Y3
|
||||
VPMOVZXWD 128(DX), Y4
|
||||
VPADDD Y4, Y0, Y0
|
||||
VPMOVZXWD 144(DX), Y4
|
||||
VPADDD Y4, Y1, Y1
|
||||
VPMOVZXWD 160(DX), Y4
|
||||
VPADDD Y4, Y2, Y2
|
||||
VPMOVZXWD 176(DX), Y4
|
||||
VPADDD Y4, Y3, Y3
|
||||
VPMOVZXWD 192(DX), Y4
|
||||
VPADDD Y4, Y0, Y0
|
||||
VPMOVZXWD 208(DX), Y4
|
||||
VPADDD Y4, Y1, Y1
|
||||
VPMOVZXWD 224(DX), Y4
|
||||
VPADDD Y4, Y2, Y2
|
||||
VPMOVZXWD 240(DX), Y4
|
||||
VPADDD Y4, Y3, Y3
|
||||
ADDQ $0x00000100, DX
|
||||
DECQ CX
|
||||
JNZ bigLoop
|
||||
CMPQ BX, $0x10
|
||||
JLT doneSmallLoop
|
||||
|
||||
// now read a single 16 byte unit of data at a time
|
||||
smallLoop:
|
||||
VPMOVZXWD (DX), Y4
|
||||
VPADDD Y4, Y0, Y0
|
||||
ADDQ $0x10, DX
|
||||
SUBQ $0x10, BX
|
||||
CMPQ BX, $0x10
|
||||
JGE smallLoop
|
||||
|
||||
doneSmallLoop:
|
||||
CMPQ BX, $0x00
|
||||
JE doneSIMD
|
||||
|
||||
// There are between 1 and 15 bytes remaining. Perform an overlapped read.
|
||||
LEAQ xmmLoadMasks<>+0(SB), CX
|
||||
VMOVDQU -16(DX)(BX*1), X4
|
||||
VPAND -16(CX)(BX*8), X4, X4
|
||||
VPMOVZXWD X4, Y4
|
||||
VPADDD Y4, Y0, Y0
|
||||
|
||||
doneSIMD:
|
||||
// Multi-chain loop is done, combine the accumulators
|
||||
VPADDD Y1, Y0, Y0
|
||||
VPADDD Y2, Y0, Y0
|
||||
VPADDD Y3, Y0, Y0
|
||||
|
||||
// extract the YMM into a pair of XMM and sum them
|
||||
VEXTRACTI128 $0x01, Y0, X1
|
||||
VPADDD X0, X1, X0
|
||||
|
||||
// extract the XMM into GP64
|
||||
VPEXTRQ $0x00, X0, CX
|
||||
VPEXTRQ $0x01, X0, DX
|
||||
|
||||
// no more AVX code, clear upper registers to avoid SSE slowdowns
|
||||
VZEROUPPER
|
||||
ADDQ CX, AX
|
||||
ADCQ DX, AX
|
||||
|
||||
foldAndReturn:
|
||||
// add CF and fold
|
||||
RORXQ $0x20, AX, CX
|
||||
ADCL CX, AX
|
||||
RORXL $0x10, AX, CX
|
||||
ADCW CX, AX
|
||||
ADCW $0x00, AX
|
||||
XCHGB AH, AL
|
||||
MOVW AX, ret+32(FP)
|
||||
RET
|
||||
|
||||
// func checksumSSE2(b []byte, initial uint16) uint16
|
||||
// Requires: SSE2
|
||||
TEXT ·checksumSSE2(SB), NOSPLIT|NOFRAME, $0-34
|
||||
MOVWQZX initial+24(FP), AX
|
||||
XCHGB AH, AL
|
||||
MOVQ b_base+0(FP), DX
|
||||
MOVQ b_len+8(FP), BX
|
||||
|
||||
// handle odd length buffers; they are difficult to handle in general
|
||||
TESTQ $0x00000001, BX
|
||||
JZ lengthIsEven
|
||||
MOVBQZX -1(DX)(BX*1), CX
|
||||
DECQ BX
|
||||
ADDQ CX, AX
|
||||
|
||||
lengthIsEven:
|
||||
// handle tiny buffers (<=31 bytes) specially
|
||||
CMPQ BX, $0x1f
|
||||
JGT bufferIsNotTiny
|
||||
XORQ CX, CX
|
||||
XORQ SI, SI
|
||||
XORQ DI, DI
|
||||
|
||||
// shift twice to start because length is guaranteed to be even
|
||||
// n = n >> 2; CF = originalN & 2
|
||||
SHRQ $0x02, BX
|
||||
JNC handleTiny4
|
||||
|
||||
// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
|
||||
MOVWQZX (DX), CX
|
||||
ADDQ $0x02, DX
|
||||
|
||||
handleTiny4:
|
||||
// n = n >> 1; CF = originalN & 4
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTiny8
|
||||
|
||||
// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
|
||||
MOVLQZX (DX), SI
|
||||
ADDQ $0x04, DX
|
||||
|
||||
handleTiny8:
|
||||
// n = n >> 1; CF = originalN & 8
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTiny16
|
||||
|
||||
// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
|
||||
MOVQ (DX), DI
|
||||
ADDQ $0x08, DX
|
||||
|
||||
handleTiny16:
|
||||
// n = n >> 1; CF = originalN & 16
|
||||
// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
|
||||
SHRQ $0x01, BX
|
||||
JNC handleTinyFinish
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
|
||||
handleTinyFinish:
|
||||
// CF should be included from the previous add, so we use ADCQ.
|
||||
// If we arrived via the JNC above, then CF=0 due to the branch condition,
|
||||
// so ADCQ will still produce the correct result.
|
||||
ADCQ CX, AX
|
||||
ADCQ SI, AX
|
||||
ADCQ DI, AX
|
||||
JMP foldAndReturn
|
||||
|
||||
bufferIsNotTiny:
|
||||
// skip all SIMD for small buffers
|
||||
CMPQ BX, $0x00000100
|
||||
JGE startSIMD
|
||||
|
||||
// Accumulate carries in this register. It is never expected to overflow.
|
||||
XORQ SI, SI
|
||||
|
||||
// We will perform an overlapped read for buffers with length not a multiple of 8.
|
||||
// Overlapped in this context means some memory will be read twice, but a shift will
|
||||
// eliminate the duplicated data. This extra read is performed at the end of the buffer to
|
||||
// preserve any alignment that may exist for the start of the buffer.
|
||||
MOVQ BX, CX
|
||||
SHRQ $0x03, BX
|
||||
ANDQ $0x07, CX
|
||||
JZ handleRemaining8
|
||||
LEAQ (DX)(BX*8), DI
|
||||
MOVQ -8(DI)(CX*1), DI
|
||||
|
||||
// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
|
||||
SHLQ $0x03, CX
|
||||
NEGQ CX
|
||||
ADDQ $0x40, CX
|
||||
SHRQ CL, DI
|
||||
ADDQ DI, AX
|
||||
ADCQ $0x00, SI
|
||||
|
||||
handleRemaining8:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining16
|
||||
ADDQ (DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x08, DX
|
||||
|
||||
handleRemaining16:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining32
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x10, DX
|
||||
|
||||
handleRemaining32:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining64
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x20, DX
|
||||
|
||||
handleRemaining64:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemaining128
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ 32(DX), AX
|
||||
ADCQ 40(DX), AX
|
||||
ADCQ 48(DX), AX
|
||||
ADCQ 56(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x40, DX
|
||||
|
||||
handleRemaining128:
|
||||
SHRQ $0x01, BX
|
||||
JNC handleRemainingComplete
|
||||
ADDQ (DX), AX
|
||||
ADCQ 8(DX), AX
|
||||
ADCQ 16(DX), AX
|
||||
ADCQ 24(DX), AX
|
||||
ADCQ 32(DX), AX
|
||||
ADCQ 40(DX), AX
|
||||
ADCQ 48(DX), AX
|
||||
ADCQ 56(DX), AX
|
||||
ADCQ 64(DX), AX
|
||||
ADCQ 72(DX), AX
|
||||
ADCQ 80(DX), AX
|
||||
ADCQ 88(DX), AX
|
||||
ADCQ 96(DX), AX
|
||||
ADCQ 104(DX), AX
|
||||
ADCQ 112(DX), AX
|
||||
ADCQ 120(DX), AX
|
||||
ADCQ $0x00, SI
|
||||
ADDQ $0x80, DX
|
||||
|
||||
handleRemainingComplete:
|
||||
ADDQ SI, AX
|
||||
JMP foldAndReturn
|
||||
|
||||
startSIMD:
|
||||
PXOR X0, X0
|
||||
PXOR X1, X1
|
||||
PXOR X2, X2
|
||||
PXOR X3, X3
|
||||
PXOR X4, X4
|
||||
MOVQ BX, CX
|
||||
|
||||
// Update number of bytes remaining after the loop completes
|
||||
ANDQ $0xff, BX
|
||||
|
||||
// Number of 256 byte iterations
|
||||
SHRQ $0x08, CX
|
||||
JZ smallLoop
|
||||
|
||||
bigLoop:
|
||||
MOVOU (DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X0
|
||||
PADDD X6, X2
|
||||
MOVOU 16(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X1
|
||||
PADDD X6, X3
|
||||
MOVOU 32(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X2
|
||||
PADDD X6, X0
|
||||
MOVOU 48(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X3
|
||||
PADDD X6, X1
|
||||
MOVOU 64(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X0
|
||||
PADDD X6, X2
|
||||
MOVOU 80(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X1
|
||||
PADDD X6, X3
|
||||
MOVOU 96(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X2
|
||||
PADDD X6, X0
|
||||
MOVOU 112(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X3
|
||||
PADDD X6, X1
|
||||
MOVOU 128(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X0
|
||||
PADDD X6, X2
|
||||
MOVOU 144(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X1
|
||||
PADDD X6, X3
|
||||
MOVOU 160(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X2
|
||||
PADDD X6, X0
|
||||
MOVOU 176(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X3
|
||||
PADDD X6, X1
|
||||
MOVOU 192(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X0
|
||||
PADDD X6, X2
|
||||
MOVOU 208(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X1
|
||||
PADDD X6, X3
|
||||
MOVOU 224(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X2
|
||||
PADDD X6, X0
|
||||
MOVOU 240(DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X3
|
||||
PADDD X6, X1
|
||||
ADDQ $0x00000100, DX
|
||||
DECQ CX
|
||||
JNZ bigLoop
|
||||
CMPQ BX, $0x10
|
||||
JLT doneSmallLoop
|
||||
|
||||
// now read a single 16 byte unit of data at a time
|
||||
smallLoop:
|
||||
MOVOU (DX), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X0
|
||||
PADDD X6, X1
|
||||
ADDQ $0x10, DX
|
||||
SUBQ $0x10, BX
|
||||
CMPQ BX, $0x10
|
||||
JGE smallLoop
|
||||
|
||||
doneSmallLoop:
|
||||
CMPQ BX, $0x00
|
||||
JE doneSIMD
|
||||
|
||||
// There are between 1 and 15 bytes remaining. Perform an overlapped read.
|
||||
LEAQ xmmLoadMasks<>+0(SB), CX
|
||||
MOVOU -16(DX)(BX*1), X5
|
||||
PAND -16(CX)(BX*8), X5
|
||||
MOVOA X5, X6
|
||||
PUNPCKHWL X4, X5
|
||||
PUNPCKLWL X4, X6
|
||||
PADDD X5, X0
|
||||
PADDD X6, X1
|
||||
|
||||
doneSIMD:
|
||||
// Multi-chain loop is done, combine the accumulators
|
||||
PADDD X1, X0
|
||||
PADDD X2, X0
|
||||
PADDD X3, X0
|
||||
|
||||
// extract the XMM into GP64
|
||||
MOVQ X0, CX
|
||||
PSRLDQ $0x08, X0
|
||||
MOVQ X0, DX
|
||||
ADDQ CX, AX
|
||||
ADCQ DX, AX
|
||||
|
||||
foldAndReturn:
|
||||
// add CF and fold
|
||||
MOVL AX, CX
|
||||
ADCQ $0x00, CX
|
||||
SHRQ $0x20, AX
|
||||
ADDQ CX, AX
|
||||
MOVWQZX AX, CX
|
||||
SHRQ $0x10, AX
|
||||
ADDQ CX, AX
|
||||
MOVW AX, CX
|
||||
SHRQ $0x10, AX
|
||||
ADDW CX, AX
|
||||
ADCW $0x00, AX
|
||||
	XCHGB AH, AL
	MOVW  AX, ret+32(FP)
	RET

// func checksumAMD64(b []byte, initial uint16) uint16
TEXT ·checksumAMD64(SB), NOSPLIT|NOFRAME, $0-34
	MOVWQZX initial+24(FP), AX
	XCHGB   AH, AL
	MOVQ    b_base+0(FP), DX
	MOVQ    b_len+8(FP), BX

	// handle odd length buffers; they are difficult to handle in general
	TESTQ   $0x00000001, BX
	JZ      lengthIsEven
	MOVBQZX -1(DX)(BX*1), CX
	DECQ    BX
	ADDQ    CX, AX

lengthIsEven:
	// handle tiny buffers (<=31 bytes) specially
	CMPQ BX, $0x1f
	JGT  bufferIsNotTiny
	XORQ CX, CX
	XORQ SI, SI
	XORQ DI, DI

	// shift twice to start because length is guaranteed to be even
	// n = n >> 2; CF = originalN & 2
	SHRQ $0x02, BX
	JNC  handleTiny4

	// tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]
	MOVWQZX (DX), CX
	ADDQ    $0x02, DX

handleTiny4:
	// n = n >> 1; CF = originalN & 4
	SHRQ $0x01, BX
	JNC  handleTiny8

	// tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]
	MOVLQZX (DX), SI
	ADDQ    $0x04, DX

handleTiny8:
	// n = n >> 1; CF = originalN & 8
	SHRQ $0x01, BX
	JNC  handleTiny16

	// tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]
	MOVQ (DX), DI
	ADDQ $0x08, DX

handleTiny16:
	// n = n >> 1; CF = originalN & 16
	// n == 0 now, otherwise we would have branched after comparing with tinyBufferSize
	SHRQ $0x01, BX
	JNC  handleTinyFinish
	ADDQ (DX), AX
	ADCQ 8(DX), AX

handleTinyFinish:
	// CF should be included from the previous add, so we use ADCQ.
	// If we arrived via the JNC above, then CF=0 due to the branch condition,
	// so ADCQ will still produce the correct result.
	ADCQ CX, AX
	ADCQ SI, AX
	ADCQ DI, AX
	JMP  foldAndReturn

bufferIsNotTiny:
	// Number of 256 byte iterations into loop counter
	MOVQ BX, CX

	// Update number of bytes remaining after the loop completes
	ANDQ $0xff, BX
	SHRQ $0x08, CX
	JZ   startCleanup
	CLC
	XORQ SI, SI
	XORQ DI, DI
	XORQ R8, R8
	XORQ R9, R9
	XORQ R10, R10
	XORQ R11, R11
	XORQ R12, R12

bigLoop:
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ $0x00, SI
	ADDQ 32(DX), DI
	ADCQ 40(DX), DI
	ADCQ 48(DX), DI
	ADCQ 56(DX), DI
	ADCQ $0x00, R8
	ADDQ 64(DX), R9
	ADCQ 72(DX), R9
	ADCQ 80(DX), R9
	ADCQ 88(DX), R9
	ADCQ $0x00, R10
	ADDQ 96(DX), R11
	ADCQ 104(DX), R11
	ADCQ 112(DX), R11
	ADCQ 120(DX), R11
	ADCQ $0x00, R12
	ADDQ 128(DX), AX
	ADCQ 136(DX), AX
	ADCQ 144(DX), AX
	ADCQ 152(DX), AX
	ADCQ $0x00, SI
	ADDQ 160(DX), DI
	ADCQ 168(DX), DI
	ADCQ 176(DX), DI
	ADCQ 184(DX), DI
	ADCQ $0x00, R8
	ADDQ 192(DX), R9
	ADCQ 200(DX), R9
	ADCQ 208(DX), R9
	ADCQ 216(DX), R9
	ADCQ $0x00, R10
	ADDQ 224(DX), R11
	ADCQ 232(DX), R11
	ADCQ 240(DX), R11
	ADCQ 248(DX), R11
	ADCQ $0x00, R12
	ADDQ $0x00000100, DX
	SUBQ $0x01, CX
	JNZ  bigLoop
	ADDQ SI, AX
	ADCQ DI, AX
	ADCQ R8, AX
	ADCQ R9, AX
	ADCQ R10, AX
	ADCQ R11, AX
	ADCQ R12, AX

	// accumulate CF (twice, in case the first time overflows)
	ADCQ $0x00, AX
	ADCQ $0x00, AX

startCleanup:
	// Accumulate carries in this register. It is never expected to overflow.
	XORQ SI, SI

	// We will perform an overlapped read for buffers with length not a multiple of 8.
	// Overlapped in this context means some memory will be read twice, but a shift will
	// eliminate the duplicated data. This extra read is performed at the end of the buffer to
	// preserve any alignment that may exist for the start of the buffer.
	MOVQ BX, CX
	SHRQ $0x03, BX
	ANDQ $0x07, CX
	JZ   handleRemaining8
	LEAQ (DX)(BX*8), DI
	MOVQ -8(DI)(CX*1), DI

	// Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)
	SHLQ $0x03, CX
	NEGQ CX
	ADDQ $0x40, CX
	SHRQ CL, DI
	ADDQ DI, AX
	ADCQ $0x00, SI

handleRemaining8:
	SHRQ $0x01, BX
	JNC  handleRemaining16
	ADDQ (DX), AX
	ADCQ $0x00, SI
	ADDQ $0x08, DX

handleRemaining16:
	SHRQ $0x01, BX
	JNC  handleRemaining32
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x10, DX

handleRemaining32:
	SHRQ $0x01, BX
	JNC  handleRemaining64
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x20, DX

handleRemaining64:
	SHRQ $0x01, BX
	JNC  handleRemaining128
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x40, DX

handleRemaining128:
	SHRQ $0x01, BX
	JNC  handleRemainingComplete
	ADDQ (DX), AX
	ADCQ 8(DX), AX
	ADCQ 16(DX), AX
	ADCQ 24(DX), AX
	ADCQ 32(DX), AX
	ADCQ 40(DX), AX
	ADCQ 48(DX), AX
	ADCQ 56(DX), AX
	ADCQ 64(DX), AX
	ADCQ 72(DX), AX
	ADCQ 80(DX), AX
	ADCQ 88(DX), AX
	ADCQ 96(DX), AX
	ADCQ 104(DX), AX
	ADCQ 112(DX), AX
	ADCQ 120(DX), AX
	ADCQ $0x00, SI
	ADDQ $0x80, DX

handleRemainingComplete:
	ADDQ SI, AX

foldAndReturn:
	// add CF and fold
	MOVL    AX, CX
	ADCQ    $0x00, CX
	SHRQ    $0x20, AX
	ADDQ    CX, AX
	MOVWQZX AX, CX
	SHRQ    $0x10, AX
	ADDQ    CX, AX
	MOVW    AX, CX
	SHRQ    $0x10, AX
	ADDW    CX, AX
	ADCW    $0x00, AX
	XCHGB   AH, AL
	MOVW    AX, ret+32(FP)
	RET
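The foldAndReturn epilogue above reduces the 64-bit ones-complement accumulator to 16 bits, folding each carry back in, then byte-swaps the result into network order. The same arithmetic expressed in Go, as a minimal illustrative sketch (fold64 is a hypothetical helper, not part of this commit):

// fold64 collapses a 64-bit ones-complement sum to 16 bits with end-around
// carries, mirroring the foldAndReturn sequence above; the caller byte-swaps
// the result (XCHGB AH, AL) into network order.
func fold64(sum uint64) uint16 {
	for sum>>16 != 0 {
		sum = (sum >> 16) + (sum & 0xffff)
	}
	return uint16(sum)
}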
15
internal/tschecksum/checksum_generic.go
Normal file
@ -0,0 +1,15 @@
// This file contains IP checksum algorithms that are not specific to any
// architecture and don't use hardware acceleration.

//go:build !amd64

package tschecksum

import "strconv"

func Checksum(data []byte, initial uint16) uint16 {
	if strconv.IntSize < 64 {
		return checksumGeneric32(data, initial)
	}
	return checksumGeneric64(data, initial)
}
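Checksum dispatches to checksumGeneric32 or checksumGeneric64, which live elsewhere in this commit. To make the portable path concrete, here is a minimal sketch of the technique a 64-bit generic implementation uses: accumulate little-endian words with end-around carries and byte-swap at the edges (checksumSketch64 is illustrative only, not the commit's actual code; assumes encoding/binary and math/bits are imported):

func checksumSketch64(data []byte, initial uint16) uint16 {
	// The ones-complement sum is byte-order independent up to a final swap,
	// so compute in little-endian order and swap on entry and exit.
	sum := uint64(bits.ReverseBytes16(initial))
	add := func(x uint64) {
		var c uint64
		sum, c = bits.Add64(sum, x, 0)
		sum += c // end-around carry; cannot overflow when c == 1
	}
	for len(data) >= 8 {
		add(binary.LittleEndian.Uint64(data))
		data = data[8:]
	}
	if len(data) >= 4 {
		add(uint64(binary.LittleEndian.Uint32(data)))
		data = data[4:]
	}
	if len(data) >= 2 {
		add(uint64(binary.LittleEndian.Uint16(data)))
		data = data[2:]
	}
	if len(data) == 1 {
		add(uint64(data[0])) // trailing odd byte, zero-extended
	}
	for sum>>16 != 0 {
		sum = (sum >> 16) + (sum & 0xffff)
	}
	return bits.ReverseBytes16(uint16(sum))
}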
578
internal/tschecksum/generate_amd64.go
Normal file
@ -0,0 +1,578 @@
//go:build ignore

//go:generate go run generate_amd64.go -out checksum_generated_amd64.s -stubs checksum_generated_amd64.go

package main

import (
	"fmt"
	"math"
	"math/bits"

	. "github.com/mmcloughlin/avo/build"
	"github.com/mmcloughlin/avo/operand"
	"github.com/mmcloughlin/avo/reg"
)

const checksumSignature = "func(b []byte, initial uint16) uint16"

func loadParams() (accum, buf, n reg.GPVirtual) {
	accum, buf, n = GP64(), GP64(), GP64()
	Load(Param("initial"), accum)
	XCHGB(accum.As8H(), accum.As8L())
	Load(Param("b").Base(), buf)
	Load(Param("b").Len(), n)
	return
}

type simdStrategy int

const (
	sse2 = iota
	avx2
)

const tinyBufferSize = 31 // A buffer is tiny if it has at most 31 bytes.

func generateSIMDChecksum(name, doc string, minSIMDSize, chains int, strategy simdStrategy) {
	TEXT(name, NOSPLIT|NOFRAME, checksumSignature)
	Pragma("noescape")
	Doc(doc)

	accum64, buf, n := loadParams()

	handleOddLength(n, buf, accum64)
	// no chance of overflow because accum64 was initialized by a uint16 and
	// handleOddLength adds at most a uint8
	handleTinyBuffers(n, buf, accum64, operand.LabelRef("foldAndReturn"), operand.LabelRef("bufferIsNotTiny"))
	Label("bufferIsNotTiny")

	const simdReadSize = 16

	if minSIMDSize > tinyBufferSize {
		Comment("skip all SIMD for small buffers")
		if minSIMDSize <= math.MaxUint8 {
			CMPQ(n, operand.U8(minSIMDSize))
		} else {
			CMPQ(n, operand.U32(minSIMDSize))
		}
		JGE(operand.LabelRef("startSIMD"))

		handleRemaining(n, buf, accum64, minSIMDSize-1)
		JMP(operand.LabelRef("foldAndReturn"))
	}

	Label("startSIMD")

	// chains is the number of accumulators to use. This improves speed via
	// reduced data dependency. We combine the accumulators once when the big
	// loop is complete.
	simdAccumulate := make([]reg.VecVirtual, chains)
	for i := range simdAccumulate {
		switch strategy {
		case sse2:
			simdAccumulate[i] = XMM()
			PXOR(simdAccumulate[i], simdAccumulate[i])
		case avx2:
			simdAccumulate[i] = YMM()
			VPXOR(simdAccumulate[i], simdAccumulate[i], simdAccumulate[i])
		}
	}
	var zero reg.VecVirtual
	if strategy == sse2 {
		zero = XMM()
		PXOR(zero, zero)
	}

	// Number of loads per big loop
	const unroll = 16
	// Number of bytes read per big loop
	loopSize := uint64(simdReadSize * unroll)
	if bits.Len64(loopSize) != bits.Len64(loopSize-1)+1 {
		panic("loopSize is not a power of 2")
	}
	loopCount := GP64()

	MOVQ(n, loopCount)
	Comment("Update number of bytes remaining after the loop completes")
	ANDQ(operand.Imm(loopSize-1), n)
	Comment(fmt.Sprintf("Number of %d byte iterations", loopSize))
	SHRQ(operand.Imm(uint64(bits.Len64(loopSize-1))), loopCount)
	JZ(operand.LabelRef("smallLoop"))
	Label("bigLoop")
	for i := 0; i < unroll; i++ {
		chain := i % chains
		switch strategy {
		case sse2:
			sse2AccumulateStep(i*simdReadSize, buf, zero, simdAccumulate[chain], simdAccumulate[(chain+chains/2)%chains])
		case avx2:
			avx2AccumulateStep(i*simdReadSize, buf, simdAccumulate[chain])
		}
	}
	ADDQ(operand.U32(loopSize), buf)
	DECQ(loopCount)
	JNZ(operand.LabelRef("bigLoop"))

	Label("bigCleanup")

	CMPQ(n, operand.Imm(uint64(simdReadSize)))
	JLT(operand.LabelRef("doneSmallLoop"))

	Commentf("now read a single %d byte unit of data at a time", simdReadSize)
	Label("smallLoop")

	switch strategy {
	case sse2:
		sse2AccumulateStep(0, buf, zero, simdAccumulate[0], simdAccumulate[1])
	case avx2:
		avx2AccumulateStep(0, buf, simdAccumulate[0])
	}
	ADDQ(operand.Imm(uint64(simdReadSize)), buf)
	SUBQ(operand.Imm(uint64(simdReadSize)), n)
	CMPQ(n, operand.Imm(uint64(simdReadSize)))
	JGE(operand.LabelRef("smallLoop"))

	Label("doneSmallLoop")
	CMPQ(n, operand.Imm(0))
	JE(operand.LabelRef("doneSIMD"))

	Commentf("There are between 1 and %d bytes remaining. Perform an overlapped read.", simdReadSize-1)

	maskDataPtr := GP64()
	LEAQ(operand.NewDataAddr(operand.NewStaticSymbol("xmmLoadMasks"), 0), maskDataPtr)
	dataAddr := operand.Mem{Index: n, Scale: 1, Base: buf, Disp: -simdReadSize}
	// scale 8 is only correct here because n is guaranteed to be even and we
	// do not generate masks for odd lengths
	maskAddr := operand.Mem{Base: maskDataPtr, Index: n, Scale: 8, Disp: -16}
	remainder := XMM()

	switch strategy {
	case sse2:
		MOVOU(dataAddr, remainder)
		PAND(maskAddr, remainder)
		low := XMM()
		MOVOA(remainder, low)
		PUNPCKHWL(zero, remainder)
		PUNPCKLWL(zero, low)
		PADDD(remainder, simdAccumulate[0])
		PADDD(low, simdAccumulate[1])
	case avx2:
		// Note: this is very similar to the sse2 path but MOVOU has a massive
		// performance hit if used here, presumably due to switching between SSE
		// and AVX2 modes.
		VMOVDQU(dataAddr, remainder)
		VPAND(maskAddr, remainder, remainder)

		temp := YMM()
		VPMOVZXWD(remainder, temp)
		VPADDD(temp, simdAccumulate[0], simdAccumulate[0])
	}

	Label("doneSIMD")

	Comment("Multi-chain loop is done, combine the accumulators")
	for i := range simdAccumulate {
		if i == 0 {
			continue
		}
		switch strategy {
		case sse2:
			PADDD(simdAccumulate[i], simdAccumulate[0])
		case avx2:
			VPADDD(simdAccumulate[i], simdAccumulate[0], simdAccumulate[0])
		}
	}

	if strategy == avx2 {
		Comment("extract the YMM into a pair of XMM and sum them")
		tmp := YMM()
		VEXTRACTI128(operand.Imm(1), simdAccumulate[0], tmp.AsX())

		xAccumulate := XMM()
		VPADDD(simdAccumulate[0].AsX(), tmp.AsX(), xAccumulate)
		simdAccumulate = []reg.VecVirtual{xAccumulate}
	}

	Comment("extract the XMM into GP64")
	low, high := GP64(), GP64()
	switch strategy {
	case sse2:
		MOVQ(simdAccumulate[0], low)
		PSRLDQ(operand.Imm(8), simdAccumulate[0])
		MOVQ(simdAccumulate[0], high)
	case avx2:
		VPEXTRQ(operand.Imm(0), simdAccumulate[0], low)
		VPEXTRQ(operand.Imm(1), simdAccumulate[0], high)

		Comment("no more AVX code, clear upper registers to avoid SSE slowdowns")
		VZEROUPPER()
	}
	ADDQ(low, accum64)
	ADCQ(high, accum64)
	Label("foldAndReturn")
	foldWithCF(accum64, strategy == avx2)
	XCHGB(accum64.As8H(), accum64.As8L())
	Store(accum64.As16(), ReturnIndex(0))
	RET()
}
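The per-step helpers used above (sse2AccumulateStep and avx2AccumulateStep, defined below) widen eight 16-bit words into 32-bit lanes and add lane-wise, deferring all carry handling to the final fold. A scalar Go model of one 16-byte step (a sketch; assumes encoding/binary is imported):

// laneAccumulate models one SIMD accumulate step: PUNPCK*+PADDD on SSE2,
// VPMOVZXWD+VPADDD on AVX2. Each uint32 lane tolerates over 65,000 maximal
// additions before overflow, far more than a packet-sized buffer produces.
func laneAccumulate(lanes *[8]uint32, chunk []byte) {
	_ = chunk[15] // one 16-byte read
	for j := 0; j < 8; j++ {
		lanes[j] += uint32(binary.LittleEndian.Uint16(chunk[2*j:]))
	}
}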
// handleOddLength generates instructions to incorporate the last byte into
// accum64 if the length is odd. CF may be set if accum64 overflows; be sure to
// handle that if overflow is possible.
func handleOddLength(n, buf, accum64 reg.GPVirtual) {
	Comment("handle odd length buffers; they are difficult to handle in general")
	TESTQ(operand.U32(1), n)
	JZ(operand.LabelRef("lengthIsEven"))

	tmp := GP64()
	MOVBQZX(operand.Mem{Base: buf, Index: n, Scale: 1, Disp: -1}, tmp)
	DECQ(n)
	ADDQ(tmp, accum64)

	Label("lengthIsEven")
}

func sse2AccumulateStep(offset int, buf reg.GPVirtual, zero, accumulate1, accumulate2 reg.VecVirtual) {
	high, low := XMM(), XMM()
	MOVOU(operand.Mem{Disp: offset, Base: buf}, high)
	MOVOA(high, low)
	PUNPCKHWL(zero, high)
	PUNPCKLWL(zero, low)
	PADDD(high, accumulate1)
	PADDD(low, accumulate2)
}

func avx2AccumulateStep(offset int, buf reg.GPVirtual, accumulate reg.VecVirtual) {
	tmp := YMM()
	VPMOVZXWD(operand.Mem{Disp: offset, Base: buf}, tmp)
	VPADDD(tmp, accumulate, accumulate)
}

func generateAMD64Checksum(name, doc string) {
	TEXT(name, NOSPLIT|NOFRAME, checksumSignature)
	Pragma("noescape")
	Doc(doc)

	accum64, buf, n := loadParams()

	handleOddLength(n, buf, accum64)
	// no chance of overflow because accum64 was initialized by a uint16 and
	// handleOddLength adds at most a uint8
	handleTinyBuffers(n, buf, accum64, operand.LabelRef("foldAndReturn"), operand.LabelRef("bufferIsNotTiny"))
	Label("bufferIsNotTiny")

	const (
		// numChains is the number of accumulators and carry counters to use.
		// This improves speed via reduced data dependency. We combine the
		// accumulators and carry counters once when the loop is complete.
		numChains = 4
		unroll    = 32         // The number of 64-bit reads to perform per iteration of the loop.
		loopSize  = 8 * unroll // The number of bytes read per iteration of the loop.
	)
	if bits.Len(loopSize) != bits.Len(loopSize-1)+1 {
		panic("loopSize is not a power of 2")
	}
	loopCount := GP64()

	Comment(fmt.Sprintf("Number of %d byte iterations into loop counter", loopSize))
	MOVQ(n, loopCount)
	Comment("Update number of bytes remaining after the loop completes")
	ANDQ(operand.Imm(loopSize-1), n)
	SHRQ(operand.Imm(uint64(bits.Len(loopSize-1))), loopCount)
	JZ(operand.LabelRef("startCleanup"))
	CLC()

	chains := make([]struct {
		accum   reg.GPVirtual
		carries reg.GPVirtual
	}, numChains)
	for i := range chains {
		if i == 0 {
			chains[i].accum = accum64
		} else {
			chains[i].accum = GP64()
			XORQ(chains[i].accum, chains[i].accum)
		}
		chains[i].carries = GP64()
		XORQ(chains[i].carries, chains[i].carries)
	}

	Label("bigLoop")

	var curChain int
	for i := 0; i < unroll; i++ {
		// It is significantly faster to use an ADCX/ADOX pair instead of plain
		// ADC, which results in two dependency chains; however, those require
		// ADX support, which was added after AVX2. If AVX2 is available, that's
		// even better than ADCX/ADOX.
		//
		// However, multiple dependency chains using multiple accumulators and
		// occasionally storing CF into temporary counters seems to work almost
		// as well.
		addr := operand.Mem{Disp: i * 8, Base: buf}

		if i%4 == 0 {
			if i > 0 {
				ADCQ(operand.Imm(0), chains[curChain].carries)
				curChain = (curChain + 1) % len(chains)
			}
			ADDQ(addr, chains[curChain].accum)
		} else {
			ADCQ(addr, chains[curChain].accum)
		}
	}
	ADCQ(operand.Imm(0), chains[curChain].carries)
	ADDQ(operand.U32(loopSize), buf)
	SUBQ(operand.Imm(1), loopCount)
	JNZ(operand.LabelRef("bigLoop"))
	for i := range chains {
		if i == 0 {
			ADDQ(chains[i].carries, accum64)
			continue
		}
		ADCQ(chains[i].accum, accum64)
		ADCQ(chains[i].carries, accum64)
	}

	accumulateCF(accum64)

	Label("startCleanup")
	handleRemaining(n, buf, accum64, loopSize-1)
	Label("foldAndReturn")
	foldWithCF(accum64, false)

	XCHGB(accum64.As8H(), accum64.As8L())
	Store(accum64.As16(), ReturnIndex(0))
	RET()
}
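The four-chain structure generateAMD64Checksum emits is exactly the pattern visible in the generated bigLoop earlier in this diff: each chain owns one accumulator plus a counter register that absorbs CF after every four reads. A rough scalar model of one 256-byte iteration (illustrative only; assumes encoding/binary and math/bits):

// bigLoopModel processes one 256-byte block as eight 4-word groups,
// round-robined over four (accumulator, carry-counter) chains.
func bigLoopModel(sum, carries *[4]uint64, block []byte) {
	for g := 0; g < 8; g++ {
		c := g % 4
		var cf uint64
		for w := 0; w < 4; w++ { // ADDQ then three ADCQs
			sum[c], cf = bits.Add64(sum[c], binary.LittleEndian.Uint64(block[g*32+w*8:]), cf)
		}
		carries[c] += cf // ADCQ $0x00, counter
	}
}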
// handleTinyBuffers computes checksums if the buffer length (the n parameter)
// is less than 32. After computing the checksum, a jump to returnLabel will
// be executed. Otherwise, if the buffer length is at least 32, nothing will be
// modified; a jump to continueLabel will be executed instead.
//
// When jumping to returnLabel, CF may be set and must be accommodated, e.g.
// using foldWithCF or accumulateCF.
//
// Anecdotally, this appears to be faster than attempting to coordinate an
// overlapped read (which would also require special handling for buffers
// smaller than 8).
func handleTinyBuffers(n, buf, accum reg.GPVirtual, returnLabel, continueLabel operand.LabelRef) {
	Comment("handle tiny buffers (<=31 bytes) specially")
	CMPQ(n, operand.Imm(tinyBufferSize))
	JGT(continueLabel)

	tmp2, tmp4, tmp8 := GP64(), GP64(), GP64()
	XORQ(tmp2, tmp2)
	XORQ(tmp4, tmp4)
	XORQ(tmp8, tmp8)

	Comment("shift twice to start because length is guaranteed to be even",
		"n = n >> 2; CF = originalN & 2")
	SHRQ(operand.Imm(2), n)
	JNC(operand.LabelRef("handleTiny4"))
	Comment("tmp2 = binary.LittleEndian.Uint16(buf[:2]); buf = buf[2:]")
	MOVWQZX(operand.Mem{Base: buf}, tmp2)
	ADDQ(operand.Imm(2), buf)

	Label("handleTiny4")
	Comment("n = n >> 1; CF = originalN & 4")
	SHRQ(operand.Imm(1), n)
	JNC(operand.LabelRef("handleTiny8"))
	Comment("tmp4 = binary.LittleEndian.Uint32(buf[:4]); buf = buf[4:]")
	MOVLQZX(operand.Mem{Base: buf}, tmp4)
	ADDQ(operand.Imm(4), buf)

	Label("handleTiny8")
	Comment("n = n >> 1; CF = originalN & 8")
	SHRQ(operand.Imm(1), n)
	JNC(operand.LabelRef("handleTiny16"))
	Comment("tmp8 = binary.LittleEndian.Uint64(buf[:8]); buf = buf[8:]")
	MOVQ(operand.Mem{Base: buf}, tmp8)
	ADDQ(operand.Imm(8), buf)

	Label("handleTiny16")
	Comment("n = n >> 1; CF = originalN & 16",
		"n == 0 now, otherwise we would have branched after comparing with tinyBufferSize")
	SHRQ(operand.Imm(1), n)
	JNC(operand.LabelRef("handleTinyFinish"))
	ADDQ(operand.Mem{Base: buf}, accum)
	ADCQ(operand.Mem{Base: buf, Disp: 8}, accum)

	Label("handleTinyFinish")
	Comment("CF should be included from the previous add, so we use ADCQ.",
		"If we arrived via the JNC above, then CF=0 due to the branch condition,",
		"so ADCQ will still produce the correct result.")
	ADCQ(tmp2, accum)
	ADCQ(tmp4, accum)
	ADCQ(tmp8, accum)

	JMP(returnLabel)
}
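The shift-and-branch ladder above reads each bit of the (already even) length straight out of CF, so exactly the right combination of 2-, 4-, 8- and 16-byte loads executes with no comparison chain. The selection logic, modeled in Go (a sketch; the real code keeps the partial values in registers and merges them with ADCQ at the end):

// tinyModel shows which loads handleTinyBuffers performs for an even
// length n <= 31; each bit of n enables one load size.
func tinyModel(buf []byte) (parts [4]uint64) {
	n := len(buf)
	if n&2 != 0 {
		parts[0] = uint64(binary.LittleEndian.Uint16(buf))
		buf = buf[2:]
	}
	if n&4 != 0 {
		parts[1] = uint64(binary.LittleEndian.Uint32(buf))
		buf = buf[4:]
	}
	if n&8 != 0 {
		parts[2] = binary.LittleEndian.Uint64(buf)
		buf = buf[8:]
	}
	if n&16 != 0 {
		// the real code adds these two words with ADDQ/ADCQ so the carry
		// is preserved; this model ignores that detail
		parts[3] = binary.LittleEndian.Uint64(buf) + binary.LittleEndian.Uint64(buf[8:])
	}
	return
}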
// handleRemaining generates a series of conditional unrolled additions,
// starting with 8 bytes long and doubling each time until the length reaches
// max. This is the reverse order of what may be intuitive, but makes the branch
// conditions convenient to compute: perform one right shift each time and test
// against CF.
//
// When done, CF may be set and must be accommodated, e.g. using foldWithCF or
// accumulateCF.
//
// If n is not a multiple of 8, an extra 64 bit read at the end of the buffer
// will be performed, overlapping with data that will be read later. The
// duplicate data will be shifted off.
//
// The original buffer length must have been at least 8 bytes long, even if
// n < 8, otherwise this will access memory before the start of the buffer,
// which may be unsafe.
func handleRemaining(n, buf, accum64 reg.GPVirtual, max int) {
	Comment("Accumulate carries in this register. It is never expected to overflow.")
	carries := GP64()
	XORQ(carries, carries)

	Comment("We will perform an overlapped read for buffers with length not a multiple of 8.",
		"Overlapped in this context means some memory will be read twice, but a shift will",
		"eliminate the duplicated data. This extra read is performed at the end of the buffer to",
		"preserve any alignment that may exist for the start of the buffer.")
	leftover := reg.RCX
	MOVQ(n, leftover)
	SHRQ(operand.Imm(3), n)          // n is now the number of 64 bit reads remaining
	ANDQ(operand.Imm(0x7), leftover) // leftover is now the number of bytes to read from the end
	JZ(operand.LabelRef("handleRemaining8"))
	endBuf := GP64()
	// endBuf is the position near the end of the buffer that is just past the
	// last multiple of 8: (buf + len(buf)) & ^0x7
	LEAQ(operand.Mem{Base: buf, Index: n, Scale: 8}, endBuf)

	overlapRead := GP64()
	// equivalent to overlapRead = binary.LittleEndian.Uint64(buf[len(buf)-8:len(buf)])
	MOVQ(operand.Mem{Base: endBuf, Index: leftover, Scale: 1, Disp: -8}, overlapRead)

	Comment("Shift out the duplicated data: overlapRead = overlapRead >> (64 - leftoverBytes*8)")
	SHLQ(operand.Imm(3), leftover)  // leftover = leftover * 8
	NEGQ(leftover)                  // leftover = -leftover; this completes the (-leftoverBytes*8) part of the expression
	ADDQ(operand.Imm(64), leftover) // now we have (64 - leftoverBytes*8)
	SHRQ(reg.CL, overlapRead)       // shift right by (64 - leftoverBytes*8); CL is the low 8 bits of leftover (set to RCX above) and variable shift only accepts CL

	ADDQ(overlapRead, accum64)
	ADCQ(operand.Imm(0), carries)

	for curBytes := 8; curBytes <= max; curBytes *= 2 {
		Label(fmt.Sprintf("handleRemaining%d", curBytes))
		SHRQ(operand.Imm(1), n)
		if curBytes*2 <= max {
			JNC(operand.LabelRef(fmt.Sprintf("handleRemaining%d", curBytes*2)))
		} else {
			JNC(operand.LabelRef("handleRemainingComplete"))
		}

		numLoads := curBytes / 8
		for i := 0; i < numLoads; i++ {
			addr := operand.Mem{Base: buf, Disp: i * 8}
			// It is possible to add the multiple dependency chains trick here
			// that generateAMD64Checksum uses, but anecdotally it does not
			// appear to outweigh the cost.
			if i == 0 {
				ADDQ(addr, accum64)
				continue
			}
			ADCQ(addr, accum64)
		}
		ADCQ(operand.Imm(0), carries)

		if curBytes > math.MaxUint8 {
			ADDQ(operand.U32(uint64(curBytes)), buf)
		} else {
			ADDQ(operand.U8(uint64(curBytes)), buf)
		}
		if curBytes*2 >= max {
			continue
		}
		JMP(operand.LabelRef(fmt.Sprintf("handleRemaining%d", curBytes*2)))
	}
	Label("handleRemainingComplete")
	ADDQ(carries, accum64)
}
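The overlapped read is the subtle part of handleRemaining, so a concrete rendering helps: the last eight bytes of the buffer are loaded once more, and a right shift discards exactly the bytes that the aligned 8-byte loads cover. In Go (a sketch; like the generated code, it requires the original buffer to be at least 8 bytes long, and assumes encoding/binary is imported):

// overlapTail returns the final len(buf)%8 bytes as a zero-extended word,
// using one overlapped 8-byte read instead of several small reads.
func overlapTail(buf []byte) uint64 {
	rem := len(buf) & 7
	if rem == 0 {
		return 0
	}
	word := binary.LittleEndian.Uint64(buf[len(buf)-8:]) // overlapped 8-byte read
	return word >> (64 - 8*uint(rem))                    // keep only the final rem bytes
}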
func accumulateCF(accum64 reg.GPVirtual) {
	Comment("accumulate CF (twice, in case the first time overflows)")
	// accum64 += CF
	ADCQ(operand.Imm(0), accum64)
	// accum64 += CF again if the previous add overflowed. The previous add was
	// 0 or 1. If it overflowed, then accum64 == 0, so adding another 1 can
	// never overflow.
	ADCQ(operand.Imm(0), accum64)
}

// foldWithCF generates instructions to fold accum (a GP64) into a 16-bit value
// according to ones-complement arithmetic. BMI2 instructions will be used if
// allowBMI2 is true (requires fewer instructions).
func foldWithCF(accum reg.GPVirtual, allowBMI2 bool) {
	Comment("add CF and fold")

	// CF|accum max value starts as 0x1_ffff_ffff_ffff_ffff

	tmp := GP64()
	if allowBMI2 {
		// effectively, tmp = accum >> 32 (technically, this is a rotate)
		RORXQ(operand.Imm(32), accum, tmp)
		// accum as uint32 = uint32(accum) + uint32(tmp) + CF; max value 0xffff_ffff, CF possibly set
		ADCL(tmp.As32(), accum.As32())
		// effectively, tmp as uint32 = uint32(accum) >> 16 (also a rotate)
		RORXL(operand.Imm(16), accum.As32(), tmp.As32())
		// accum as uint16 = uint16(accum) + uint16(tmp) + CF; max value 0xffff with CF unset or 0xfffe with CF set
		ADCW(tmp.As16(), accum.As16())
	} else {
		// tmp = uint32(accum); max value 0xffff_ffff
		// MOVL clears the upper 32 bits of a GP64 so this is equivalent to the
		// non-existent MOVLQZX.
		MOVL(accum.As32(), tmp.As32())
		// tmp += CF; max value 0x1_0000_0000, CF unset
		ADCQ(operand.Imm(0), tmp)
		// accum = accum >> 32; max value 0xffff_ffff
		SHRQ(operand.Imm(32), accum)
		// accum = accum + tmp; max value 0x1_ffff_ffff, CF unset
		ADDQ(tmp, accum)
		// tmp = uint16(accum); max value 0xffff
		MOVWQZX(accum.As16(), tmp)
		// accum = accum >> 16; max value 0x1_ffff
		SHRQ(operand.Imm(16), accum)
		// accum = accum + tmp; max value 0x2_fffe, CF unset
		ADDQ(tmp, accum)
		// tmp as uint16 = uint16(accum); max value 0xffff
		MOVW(accum.As16(), tmp.As16())
		// accum = accum >> 16; max value 0x2
		SHRQ(operand.Imm(16), accum)
		// accum as uint16 = uint16(accum) + uint16(tmp); max value 0xffff with CF unset or 0x2 with CF set
		ADDW(tmp.As16(), accum.As16())
	}
	// accum as uint16 += CF; will not overflow: either CF was 0 or accum <= 0xfffe
	ADCW(operand.Imm(0), accum.As16())
}

func generateLoadMasks() {
	var offset int
	// xmmLoadMasks is a table of masks that can be used with PAND to zero all
	// but the last N bytes in an XMM, for even N from 2 through 14.
	GLOBL("xmmLoadMasks", RODATA|NOPTR)

	for n := 2; n < 16; n += 2 {
		var pattern [16]byte
		for i := 0; i < len(pattern); i++ {
			if i < len(pattern)-n {
				pattern[i] = 0
				continue
			}
			pattern[i] = 0xff
		}
		DATA(offset, operand.String(pattern[:]))
		offset += len(pattern)
	}
}
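For example, the n == 2 iteration of the loop above emits fourteen zero bytes followed by two 0xff bytes, so a PAND against that entry keeps only the final two bytes of the overlapped 16-byte read:

// the table entry selected when two bytes remain (illustration)
var mask2 = [16]byte{
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0xff, 0xff,
}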
func main() {
	generateLoadMasks()
	generateSIMDChecksum("checksumAVX2", "checksumAVX2 computes an IP checksum using amd64 v3 instructions (AVX2, BMI2)", 256, 4, avx2)
	generateSIMDChecksum("checksumSSE2", "checksumSSE2 computes an IP checksum using amd64 baseline instructions (SSE2)", 256, 4, sse2)
	generateAMD64Checksum("checksumAMD64", "checksumAMD64 computes an IP checksum using amd64 baseline instructions")
	Generate()
}
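With the generator and its stubs in place, every architecture shares one entry point; the benchmarks in this commit call it the same way on 1500-byte packets:

	// computes the 16-bit Internet checksum of a packet buffer
	sum := tschecksum.Checksum(packet, 0)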