clone from danieldin95

This commit is contained in:
sicheng
2022-07-29 23:38:54 +08:00
commit ac4f79bbf4
1931 changed files with 568263 additions and 0 deletions

12
vendor/github.com/templexxx/cpu/.gitignore generated vendored Normal file

@@ -0,0 +1,12 @@
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib
# Test binary, build with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out

32
vendor/github.com/templexxx/cpu/LICENSE generated vendored Normal file

@@ -0,0 +1,32 @@
BSD 3-Clause License
Copyright (c) 2018 Temple3x (temple3x@gmail.com)
Copyright 2017 The Go Authors
Copyright (c) 2015 Klaus Post
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

23
vendor/github.com/templexxx/cpu/README.md generated vendored Normal file

@@ -0,0 +1,23 @@
# cpu
internal/cpu (from the Go standard library) with these additional detections:
>- AVX512
>
>- Cache Size
>
>- Invariant TSC
>
It also provides:
>- False sharing range; see `X86FalseSharingRange` for the x86 platform.
>
>- TSC frequency
>
>- Name
>
>- Family & Model
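# Example
A minimal usage sketch (the import path matches this vendored package, `github.com/templexxx/cpu`; the fields read below are the ones declared in `cpu.go`):
```go
package main

import (
	"fmt"

	"github.com/templexxx/cpu"
)

func main() {
	// Every field is filled in by this package's init when it is imported.
	fmt.Println("name:", cpu.X86.Name)
	fmt.Println("signature:", cpu.X86.Signature) // DisplayFamily_DisplayModel, e.g. "06_9EH"
	fmt.Println("has AVX-512F:", cpu.X86.HasAVX512F)
	fmt.Println("L1 data cache bytes:", cpu.X86.Cache.L1D) // -1 if undetected
	if cpu.X86.HasInvariantTSC && cpu.X86.TSCFrequency != 0 {
		fmt.Println("TSC frequency (Hz):", cpu.X86.TSCFrequency)
	}
}
```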
# Acknowledgement
[klauspost/cpuid](https://github.com/klauspost/cpuid)

234
vendor/github.com/templexxx/cpu/cpu.go generated vendored Normal file

@@ -0,0 +1,234 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package cpu implements processor feature detection
// used by the Go standard library.
package cpu
// debugOptions is set to true by the runtime if go was compiled with GOEXPERIMENT=debugcpu
// and GOOS is Linux or Darwin. This variable is linknamed in runtime/proc.go.
var debugOptions bool
var X86 x86
// "Loads data or instructions from memory to the second-level cache.
// To use the streamer, organize the data or instructions in blocks of 128 bytes,
// aligned on 128 bytes."
// From <Intel® 64 and IA-32 architectures optimization reference manual>,
// in section 3.7.3 "Hardware Prefetching for Second-Level Cache"
//
// In practice, I have found that using 128 bytes gives better performance than 64 bytes (one cache line).
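// For example, keeping two independently updated hot variables at least
// X86FalseSharingRange bytes apart keeps them out of the same prefetcher block;
// the padded structs below use this constant exactly that way.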
const X86FalseSharingRange = 128
// The booleans in x86 contain the correspondingly named cpuid feature bit.
// HasAVX and HasAVX2 are only set if the OS does support XMM and YMM registers
// in addition to the cpuid feature bit being set.
// The struct is padded to avoid false sharing.
type x86 struct {
_ [X86FalseSharingRange]byte
HasAES bool
HasADX bool
HasAVX bool
HasAVX2 bool
HasAVX512F bool
HasAVX512DQ bool
HasAVX512BW bool
HasAVX512VL bool
HasBMI1 bool
HasBMI2 bool
HasERMS bool
HasFMA bool
HasOSXSAVE bool
HasPCLMULQDQ bool
HasPOPCNT bool
HasSSE2 bool
HasSSE3 bool
HasSSSE3 bool
HasSSE41 bool
HasSSE42 bool
// The invariant TSC will run at a constant rate in all ACPI P-, C-, and T-states.
// This is the architectural behavior moving forward. On processors with
// invariant TSC support, the OS may use the TSC for wall clock timer services (instead of ACPI or HPET timers).
HasInvariantTSC bool
Cache Cache
// TSCFrequency is only meaningful when HasInvariantTSC == true.
// Unit: Hz.
//
// Warning:
// 1. If it is 0, the frequency could not be determined; do not use it.
// 2. Do not rely on it if you need a perfectly precise timestamp.
TSCFrequency uint64
Name string
Signature string // DisplayFamily_DisplayModel.
Family uint32 // CPU family number.
Model uint32 // CPU model number.
_ [X86FalseSharingRange]byte
}
// CPU Cache Size.
// -1 if undetected.
type Cache struct {
L1I int
L1D int
L2 int
L3 int
}
var PPC64 ppc64
// For ppc64x, it is safe to check only for ISA level starting on ISA v3.00,
// since there are no optional categories. There are some exceptions that also
// require kernel support to work (darn, scv), so there are feature bits for
// those as well. The minimum processor requirement is POWER8 (ISA 2.07), so we
// maintain some of the old feature checks for optional categories for
// safety.
// The struct is padded to avoid false sharing.
type ppc64 struct {
_ [CacheLineSize]byte
HasVMX bool // Vector unit (Altivec)
HasDFP bool // Decimal Floating Point unit
HasVSX bool // Vector-scalar unit
HasHTM bool // Hardware Transactional Memory
HasISEL bool // Integer select
HasVCRYPTO bool // Vector cryptography
HasHTMNOSC bool // HTM: kernel-aborted transaction in syscalls
HasDARN bool // Hardware random number generator (requires kernel enablement)
HasSCV bool // Syscall vectored (requires kernel enablement)
IsPOWER8 bool // ISA v2.07 (POWER8)
IsPOWER9 bool // ISA v3.00 (POWER9)
_ [CacheLineSize]byte
}
var ARM64 arm64
// The booleans in arm64 contain the correspondingly named cpu feature bit.
// The struct is padded to avoid false sharing.
type arm64 struct {
_ [CacheLineSize]byte
HasFP bool
HasASIMD bool
HasEVTSTRM bool
HasAES bool
HasPMULL bool
HasSHA1 bool
HasSHA2 bool
HasCRC32 bool
HasATOMICS bool
HasFPHP bool
HasASIMDHP bool
HasCPUID bool
HasASIMDRDM bool
HasJSCVT bool
HasFCMA bool
HasLRCPC bool
HasDCPOP bool
HasSHA3 bool
HasSM3 bool
HasSM4 bool
HasASIMDDP bool
HasSHA512 bool
HasSVE bool
HasASIMDFHM bool
_ [CacheLineSize]byte
}
var S390X s390x
type s390x struct {
_ [CacheLineSize]byte
HasZArch bool // z architecture mode is active [mandatory]
HasSTFLE bool // store facility list extended [mandatory]
HasLDisp bool // long (20-bit) displacements [mandatory]
HasEImm bool // 32-bit immediates [mandatory]
HasDFP bool // decimal floating point
HasETF3Enhanced bool // ETF-3 enhanced
HasMSA bool // message security assist (CPACF)
HasAES bool // KM-AES{128,192,256} functions
HasAESCBC bool // KMC-AES{128,192,256} functions
HasAESCTR bool // KMCTR-AES{128,192,256} functions
HasAESGCM bool // KMA-GCM-AES{128,192,256} functions
HasGHASH bool // KIMD-GHASH function
HasSHA1 bool // K{I,L}MD-SHA-1 functions
HasSHA256 bool // K{I,L}MD-SHA-256 functions
HasSHA512 bool // K{I,L}MD-SHA-512 functions
HasVX bool // vector facility. Note: the runtime sets this when it processes auxv records.
_ [CacheLineSize]byte
}
// init examines the processor and sets the relevant variables above.
// It runs automatically when this package is imported, before the feature
// variables are read by importing packages; the debug option string passed
// to processOptions is left empty here.
func init() {
doinit()
processOptions("")
}
// options contains the cpu debug options that can be used in GODEBUGCPU.
// Options are arch dependent and are added by the arch specific doinit functions.
// Features that are mandatory for the specific GOARCH should not be added to options
// (e.g. SSE2 on amd64).
var options []option
// Option names should be lower case. e.g. avx instead of AVX.
type option struct {
Name string
Feature *bool
}
// processOptions disables CPU feature values based on the parsed env string.
// The env string is expected to be of the form feature1=0,feature2=0...
// where each feature name is one of the entries in the architecture-specific
// list stored in the cpu package's options variable. If env contains all=0
// then all capabilities referenced through the options variable are disabled.
// Unknown feature names and values other than 0 are silently ignored.
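// For example (illustration only; valid names come from the per-GOARCH doinit),
// processOptions("avx2=0,avx512f=0") would clear X86.HasAVX2 and X86.HasAVX512F.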
func processOptions(env string) {
field:
for env != "" {
field := ""
i := indexByte(env, ',')
if i < 0 {
field, env = env, ""
} else {
field, env = env[:i], env[i+1:]
}
i = indexByte(field, '=')
if i < 0 {
continue
}
key, value := field[:i], field[i+1:]
// Only allow turning off CPU features by specifying '0'.
if value == "0" {
if key == "all" {
for _, v := range options {
*v.Feature = false
}
return
} else {
for _, v := range options {
if v.Name == key {
*v.Feature = false
continue field
}
}
}
}
}
}
// indexByte returns the index of the first instance of c in s,
// or -1 if c is not present in s.
func indexByte(s string, c byte) int {
for i := 0; i < len(s); i++ {
if s[i] == c {
return i
}
}
return -1
}

7
vendor/github.com/templexxx/cpu/cpu_386.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const GOARCH = "386"

7
vendor/github.com/templexxx/cpu/cpu_amd64.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const GOARCH = "amd64"

7
vendor/github.com/templexxx/cpu/cpu_amd64p32.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const GOARCH = "amd64p32"

7
vendor/github.com/templexxx/cpu/cpu_arm.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 32

102
vendor/github.com/templexxx/cpu/cpu_arm64.go generated vendored Normal file

@@ -0,0 +1,102 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 64
// arm64 doesn't have a 'cpuid' equivalent, so we rely on HWCAP/HWCAP2.
// These are linknamed in runtime/os_linux_arm64.go and are initialized by
// archauxv().
var hwcap uint
var hwcap2 uint
// HWCAP/HWCAP2 bits. These are exposed by Linux.
const (
hwcap_FP = (1 << 0)
hwcap_ASIMD = (1 << 1)
hwcap_EVTSTRM = (1 << 2)
hwcap_AES = (1 << 3)
hwcap_PMULL = (1 << 4)
hwcap_SHA1 = (1 << 5)
hwcap_SHA2 = (1 << 6)
hwcap_CRC32 = (1 << 7)
hwcap_ATOMICS = (1 << 8)
hwcap_FPHP = (1 << 9)
hwcap_ASIMDHP = (1 << 10)
hwcap_CPUID = (1 << 11)
hwcap_ASIMDRDM = (1 << 12)
hwcap_JSCVT = (1 << 13)
hwcap_FCMA = (1 << 14)
hwcap_LRCPC = (1 << 15)
hwcap_DCPOP = (1 << 16)
hwcap_SHA3 = (1 << 17)
hwcap_SM3 = (1 << 18)
hwcap_SM4 = (1 << 19)
hwcap_ASIMDDP = (1 << 20)
hwcap_SHA512 = (1 << 21)
hwcap_SVE = (1 << 22)
hwcap_ASIMDFHM = (1 << 23)
)
func doinit() {
options = []option{
{"evtstrm", &ARM64.HasEVTSTRM},
{"aes", &ARM64.HasAES},
{"pmull", &ARM64.HasPMULL},
{"sha1", &ARM64.HasSHA1},
{"sha2", &ARM64.HasSHA2},
{"crc32", &ARM64.HasCRC32},
{"atomics", &ARM64.HasATOMICS},
{"fphp", &ARM64.HasFPHP},
{"asimdhp", &ARM64.HasASIMDHP},
{"cpuid", &ARM64.HasCPUID},
{"asimdrdm", &ARM64.HasASIMDRDM},
{"jscvt", &ARM64.HasJSCVT},
{"fcma", &ARM64.HasFCMA},
{"lrcpc", &ARM64.HasLRCPC},
{"dcpop", &ARM64.HasDCPOP},
{"sha3", &ARM64.HasSHA3},
{"sm3", &ARM64.HasSM3},
{"sm4", &ARM64.HasSM4},
{"asimddp", &ARM64.HasASIMDDP},
{"sha512", &ARM64.HasSHA512},
{"sve", &ARM64.HasSVE},
{"asimdfhm", &ARM64.HasASIMDFHM},
// These capabilities should always be enabled on arm64:
// {"fp", &ARM64.HasFP},
// {"asimd", &ARM64.HasASIMD},
}
// HWCAP feature bits
ARM64.HasFP = isSet(hwcap, hwcap_FP)
ARM64.HasASIMD = isSet(hwcap, hwcap_ASIMD)
ARM64.HasEVTSTRM = isSet(hwcap, hwcap_EVTSTRM)
ARM64.HasAES = isSet(hwcap, hwcap_AES)
ARM64.HasPMULL = isSet(hwcap, hwcap_PMULL)
ARM64.HasSHA1 = isSet(hwcap, hwcap_SHA1)
ARM64.HasSHA2 = isSet(hwcap, hwcap_SHA2)
ARM64.HasCRC32 = isSet(hwcap, hwcap_CRC32)
ARM64.HasATOMICS = isSet(hwcap, hwcap_ATOMICS)
ARM64.HasFPHP = isSet(hwcap, hwcap_FPHP)
ARM64.HasASIMDHP = isSet(hwcap, hwcap_ASIMDHP)
ARM64.HasCPUID = isSet(hwcap, hwcap_CPUID)
ARM64.HasASIMDRDM = isSet(hwcap, hwcap_ASIMDRDM)
ARM64.HasJSCVT = isSet(hwcap, hwcap_JSCVT)
ARM64.HasFCMA = isSet(hwcap, hwcap_FCMA)
ARM64.HasLRCPC = isSet(hwcap, hwcap_LRCPC)
ARM64.HasDCPOP = isSet(hwcap, hwcap_DCPOP)
ARM64.HasSHA3 = isSet(hwcap, hwcap_SHA3)
ARM64.HasSM3 = isSet(hwcap, hwcap_SM3)
ARM64.HasSM4 = isSet(hwcap, hwcap_SM4)
ARM64.HasASIMDDP = isSet(hwcap, hwcap_ASIMDDP)
ARM64.HasSHA512 = isSet(hwcap, hwcap_SHA512)
ARM64.HasSVE = isSet(hwcap, hwcap_SVE)
ARM64.HasASIMDFHM = isSet(hwcap, hwcap_ASIMDFHM)
}
func isSet(hwc uint, value uint) bool {
return hwc&value != 0
}

7
vendor/github.com/templexxx/cpu/cpu_mips.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpu/cpu_mips64.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpu/cpu_mips64le.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpu/cpu_mipsle.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 32

16
vendor/github.com/templexxx/cpu/cpu_no_init.go generated vendored Normal file

@@ -0,0 +1,16 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !386
// +build !amd64
// +build !amd64p32
// +build !arm64
// +build !ppc64
// +build !ppc64le
// +build !s390x
package cpu
func doinit() {
}

68
vendor/github.com/templexxx/cpu/cpu_ppc64x.go generated vendored Normal file

@@ -0,0 +1,68 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ppc64 ppc64le
package cpu
const CacheLineSize = 128
// ppc64x doesn't have a 'cpuid' equivalent, so we rely on HWCAP/HWCAP2.
// These are linknamed in runtime/os_linux_ppc64x.go and are initialized by
// archauxv().
var hwcap uint
var hwcap2 uint
// HWCAP/HWCAP2 bits. These are exposed by the kernel.
const (
// ISA Level
_PPC_FEATURE2_ARCH_2_07 = 0x80000000
_PPC_FEATURE2_ARCH_3_00 = 0x00800000
// CPU features
_PPC_FEATURE_HAS_ALTIVEC = 0x10000000
_PPC_FEATURE_HAS_DFP = 0x00000400
_PPC_FEATURE_HAS_VSX = 0x00000080
_PPC_FEATURE2_HAS_HTM = 0x40000000
_PPC_FEATURE2_HAS_ISEL = 0x08000000
_PPC_FEATURE2_HAS_VEC_CRYPTO = 0x02000000
_PPC_FEATURE2_HTM_NOSC = 0x01000000
_PPC_FEATURE2_DARN = 0x00200000
_PPC_FEATURE2_SCV = 0x00100000
)
func doinit() {
options = []option{
{"htm", &PPC64.HasHTM},
{"htmnosc", &PPC64.HasHTMNOSC},
{"darn", &PPC64.HasDARN},
{"scv", &PPC64.HasSCV},
// These capabilities should always be enabled on ppc64 and ppc64le:
// {"vmx", &PPC64.HasVMX},
// {"dfp", &PPC64.HasDFP},
// {"vsx", &PPC64.HasVSX},
// {"isel", &PPC64.HasISEL},
// {"vcrypto", &PPC64.HasVCRYPTO},
}
// HWCAP feature bits
PPC64.HasVMX = isSet(hwcap, _PPC_FEATURE_HAS_ALTIVEC)
PPC64.HasDFP = isSet(hwcap, _PPC_FEATURE_HAS_DFP)
PPC64.HasVSX = isSet(hwcap, _PPC_FEATURE_HAS_VSX)
// HWCAP2 feature bits
PPC64.IsPOWER8 = isSet(hwcap2, _PPC_FEATURE2_ARCH_2_07)
PPC64.HasHTM = isSet(hwcap2, _PPC_FEATURE2_HAS_HTM)
PPC64.HasISEL = isSet(hwcap2, _PPC_FEATURE2_HAS_ISEL)
PPC64.HasVCRYPTO = isSet(hwcap2, _PPC_FEATURE2_HAS_VEC_CRYPTO)
PPC64.HasHTMNOSC = isSet(hwcap2, _PPC_FEATURE2_HTM_NOSC)
PPC64.IsPOWER9 = isSet(hwcap2, _PPC_FEATURE2_ARCH_3_00)
PPC64.HasDARN = isSet(hwcap2, _PPC_FEATURE2_DARN)
PPC64.HasSCV = isSet(hwcap2, _PPC_FEATURE2_SCV)
}
func isSet(hwc uint, value uint) bool {
return hwc&value != 0
}

153
vendor/github.com/templexxx/cpu/cpu_s390x.go generated vendored Normal file

@@ -0,0 +1,153 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 256
// bitIsSet reports whether the bit at index is set. The bit index
// is in big endian order, so bit index 0 is the leftmost bit.
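// For example, bitIsSet(bits, 1) tests bit 62 of bits[0] in conventional
// least-significant-bit-0 numbering (on s390x this is the zarch facility bit).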
func bitIsSet(bits []uint64, index uint) bool {
return bits[index/64]&((1<<63)>>(index%64)) != 0
}
// function is the function code for the named function.
type function uint8
const (
// KM{,A,C,CTR} function codes
aes128 function = 18 // AES-128
aes192 = 19 // AES-192
aes256 = 20 // AES-256
// K{I,L}MD function codes
sha1 = 1 // SHA-1
sha256 = 2 // SHA-256
sha512 = 3 // SHA-512
// KIMD function codes
ghash = 65 // GHASH
)
// queryResult contains the result of a Query function
// call. Bits are numbered in big endian order so the
// leftmost bit (the MSB) is at index 0.
type queryResult struct {
bits [2]uint64
}
// Has reports whether the given functions are present.
func (q *queryResult) Has(fns ...function) bool {
if len(fns) == 0 {
panic("no function codes provided")
}
for _, f := range fns {
if !bitIsSet(q.bits[:], uint(f)) {
return false
}
}
return true
}
// facility is a bit index for the named facility.
type facility uint8
const (
// mandatory facilities
zarch facility = 1 // z architecture mode is active
stflef = 7 // store-facility-list-extended
ldisp = 18 // long-displacement
eimm = 21 // extended-immediate
// miscellaneous facilities
dfp = 42 // decimal-floating-point
etf3eh = 30 // extended-translation 3 enhancement
// cryptography facilities
msa = 17 // message-security-assist
msa3 = 76 // message-security-assist extension 3
msa4 = 77 // message-security-assist extension 4
msa5 = 57 // message-security-assist extension 5
msa8 = 146 // message-security-assist extension 8
// Note: vx and highgprs are excluded because they require
// kernel support and so must be fetched from HWCAP.
)
// facilityList contains the result of an STFLE call.
// Bits are numbered in big endian order so the
// leftmost bit (the MSB) is at index 0.
type facilityList struct {
bits [4]uint64
}
// Has reports whether the given facilities are present.
func (s *facilityList) Has(fs ...facility) bool {
if len(fs) == 0 {
panic("no facility bits provided")
}
for _, f := range fs {
if !bitIsSet(s.bits[:], uint(f)) {
return false
}
}
return true
}
// The following feature detection functions are defined in cpu_s390x.s.
// They are likely to be expensive to call so the results should be cached.
func stfle() facilityList
func kmQuery() queryResult
func kmcQuery() queryResult
func kmctrQuery() queryResult
func kmaQuery() queryResult
func kimdQuery() queryResult
func klmdQuery() queryResult
func doinit() {
options = []option{
{"zarch", &S390X.HasZArch},
{"stfle", &S390X.HasSTFLE},
{"ldisp", &S390X.HasLDisp},
{"msa", &S390X.HasMSA},
{"eimm", &S390X.HasEImm},
{"dfp", &S390X.HasDFP},
{"etf3eh", &S390X.HasETF3Enhanced},
{"vx", &S390X.HasVX},
}
aes := []function{aes128, aes192, aes256}
facilities := stfle()
S390X.HasZArch = facilities.Has(zarch)
S390X.HasSTFLE = facilities.Has(stflef)
S390X.HasLDisp = facilities.Has(ldisp)
S390X.HasEImm = facilities.Has(eimm)
S390X.HasDFP = facilities.Has(dfp)
S390X.HasETF3Enhanced = facilities.Has(etf3eh)
S390X.HasMSA = facilities.Has(msa)
if S390X.HasMSA {
// cipher message
km, kmc := kmQuery(), kmcQuery()
S390X.HasAES = km.Has(aes...)
S390X.HasAESCBC = kmc.Has(aes...)
if facilities.Has(msa4) {
kmctr := kmctrQuery()
S390X.HasAESCTR = kmctr.Has(aes...)
}
if facilities.Has(msa8) {
kma := kmaQuery()
S390X.HasAESGCM = kma.Has(aes...)
}
// compute message digest
kimd := kimdQuery() // intermediate (no padding)
klmd := klmdQuery() // last (padding)
S390X.HasSHA1 = kimd.Has(sha1) && klmd.Has(sha1)
S390X.HasSHA256 = kimd.Has(sha256) && klmd.Has(sha256)
S390X.HasSHA512 = kimd.Has(sha512) && klmd.Has(sha512)
S390X.HasGHASH = kimd.Has(ghash) // KLMD-GHASH does not exist
}
}

55
vendor/github.com/templexxx/cpu/cpu_s390x.s generated vendored Normal file

@@ -0,0 +1,55 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// func stfle() facilityList
TEXT ·stfle(SB), NOSPLIT|NOFRAME, $0-32
MOVD $ret+0(FP), R1
MOVD $3, R0 // last doubleword index to store
XC $32, (R1), (R1) // clear 4 doublewords (32 bytes)
WORD $0xb2b01000 // store facility list extended (STFLE)
RET
// func kmQuery() queryResult
TEXT ·kmQuery(SB), NOSPLIT|NOFRAME, $0-16
MOVD $0, R0 // set function code to 0 (KM-Query)
MOVD $ret+0(FP), R1 // address of 16-byte return value
WORD $0xB92E0024 // cipher message (KM)
RET
// func kmcQuery() queryResult
TEXT ·kmcQuery(SB), NOSPLIT|NOFRAME, $0-16
MOVD $0, R0 // set function code to 0 (KMC-Query)
MOVD $ret+0(FP), R1 // address of 16-byte return value
WORD $0xB92F0024 // cipher message with chaining (KMC)
RET
// func kmctrQuery() queryResult
TEXT ·kmctrQuery(SB), NOSPLIT|NOFRAME, $0-16
MOVD $0, R0 // set function code to 0 (KMCTR-Query)
MOVD $ret+0(FP), R1 // address of 16-byte return value
WORD $0xB92D4024 // cipher message with counter (KMCTR)
RET
// func kmaQuery() queryResult
TEXT ·kmaQuery(SB), NOSPLIT|NOFRAME, $0-16
MOVD $0, R0 // set function code to 0 (KMA-Query)
MOVD $ret+0(FP), R1 // address of 16-byte return value
WORD $0xb9296024 // cipher message with authentication (KMA)
RET
// func kimdQuery() queryResult
TEXT ·kimdQuery(SB), NOSPLIT|NOFRAME, $0-16
MOVD $0, R0 // set function code to 0 (KIMD-Query)
MOVD $ret+0(FP), R1 // address of 16-byte return value
WORD $0xB93E0024 // compute intermediate message digest (KIMD)
RET
// func klmdQuery() queryResult
TEXT ·klmdQuery(SB), NOSPLIT|NOFRAME, $0-16
MOVD $0, R0 // set function code to 0 (KLMD-Query)
MOVD $ret+0(FP), R1 // address of 16-byte return value
WORD $0xB93F0024 // compute last message digest (KLMD)
RET

7
vendor/github.com/templexxx/cpu/cpu_wasm.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 64

425
vendor/github.com/templexxx/cpu/cpu_x86.go generated vendored Normal file

@@ -0,0 +1,425 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build 386 amd64 amd64p32
package cpu
import (
"fmt"
"strings"
)
const CacheLineSize = 64
// cpuid is implemented in cpu_x86.s.
func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
// xgetbv with ecx = 0 is implemented in cpu_x86.s.
func xgetbv() (eax, edx uint32)
const (
// edx bits
cpuid_SSE2 = 1 << 26
// ecx bits
cpuid_SSE3 = 1 << 0
cpuid_PCLMULQDQ = 1 << 1
cpuid_SSSE3 = 1 << 9
cpuid_FMA = 1 << 12
cpuid_SSE41 = 1 << 19
cpuid_SSE42 = 1 << 20
cpuid_POPCNT = 1 << 23
cpuid_AES = 1 << 25
cpuid_OSXSAVE = 1 << 27
cpuid_AVX = 1 << 28
// ebx bits
cpuid_BMI1 = 1 << 3
cpuid_AVX2 = 1 << 5
cpuid_BMI2 = 1 << 8
cpuid_ERMS = 1 << 9
cpuid_ADX = 1 << 19
cpuid_AVX512F = 1 << 16
cpuid_AVX512DQ = 1 << 17
cpuid_AVX512BW = 1 << 30
cpuid_AVX512VL = 1 << 31
// edx bits
cpuid_Invariant_TSC = 1 << 8
)
func doinit() {
options = []option{
{"adx", &X86.HasADX},
{"aes", &X86.HasAES},
{"avx", &X86.HasAVX},
{"avx2", &X86.HasAVX2},
{"bmi1", &X86.HasBMI1},
{"bmi2", &X86.HasBMI2},
{"erms", &X86.HasERMS},
{"fma", &X86.HasFMA},
{"pclmulqdq", &X86.HasPCLMULQDQ},
{"popcnt", &X86.HasPOPCNT},
{"sse3", &X86.HasSSE3},
{"sse41", &X86.HasSSE41},
{"sse42", &X86.HasSSE42},
{"ssse3", &X86.HasSSSE3},
{"avx512f", &X86.HasAVX512F},
{"avx512dq", &X86.HasAVX512DQ},
{"avx512bw", &X86.HasAVX512BW},
{"avx512vl", &X86.HasAVX512VL},
{"invariant_tsc", &X86.HasInvariantTSC},
// sse2 set as last element so it can easily be removed again. See code below.
{"sse2", &X86.HasSSE2},
}
// Remove sse2 from options on amd64(p32) because SSE2 is a mandatory feature for these GOARCHs.
if GOARCH == "amd64" || GOARCH == "amd64p32" {
options = options[:len(options)-1]
}
maxID, _, _, _ := cpuid(0, 0)
if maxID < 1 {
return
}
_, _, ecx1, edx1 := cpuid(1, 0)
X86.HasSSE2 = isSet(edx1, cpuid_SSE2)
X86.HasSSE3 = isSet(ecx1, cpuid_SSE3)
X86.HasPCLMULQDQ = isSet(ecx1, cpuid_PCLMULQDQ)
X86.HasSSSE3 = isSet(ecx1, cpuid_SSSE3)
X86.HasFMA = isSet(ecx1, cpuid_FMA)
X86.HasSSE41 = isSet(ecx1, cpuid_SSE41)
X86.HasSSE42 = isSet(ecx1, cpuid_SSE42)
X86.HasPOPCNT = isSet(ecx1, cpuid_POPCNT)
X86.HasAES = isSet(ecx1, cpuid_AES)
X86.HasOSXSAVE = isSet(ecx1, cpuid_OSXSAVE)
osSupportsAVX := false
osSupportsAVX512 := false
// For XGETBV, OSXSAVE bit is required and sufficient.
if X86.HasOSXSAVE {
eax, _ := xgetbv()
// Check if XMM and YMM registers have OS support.
osSupportsAVX = isSet(eax, 1<<1) && isSet(eax, 1<<2)
// Check if opmask, ZMM and XMM/YMM registers have OS support
// (XCR0 bits 5-7 and bits 1-2 must all be set).
osSupportsAVX512 = (eax>>5)&7 == 7 && (eax>>1)&3 == 3
}
X86.HasAVX = isSet(ecx1, cpuid_AVX) && osSupportsAVX
if maxID < 7 {
return
}
_, ebx7, _, _ := cpuid(7, 0)
X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
X86.HasAVX512F = isSet(ebx7, cpuid_AVX512F) && osSupportsAVX512
X86.HasAVX512DQ = isSet(ebx7, cpuid_AVX512DQ) && osSupportsAVX512
X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW) && osSupportsAVX512
X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL) && osSupportsAVX512
X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
X86.HasERMS = isSet(ebx7, cpuid_ERMS)
X86.HasADX = isSet(ebx7, cpuid_ADX)
X86.Cache = getCacheSize()
X86.HasInvariantTSC = hasInvariantTSC()
X86.Family, X86.Model = getFamilyModel()
X86.Signature = makeSignature(X86.Family, X86.Model)
X86.Name = getName()
X86.TSCFrequency = getNativeTSCFrequency(X86.Name, X86.Signature)
}
func isSet(hwc uint32, value uint32) bool {
return hwc&value != 0
}
func hasInvariantTSC() bool {
if maxExtendedFunction() < 0x80000007 {
return false
}
_, _, _, edx := cpuid(0x80000007, 0)
return isSet(edx, cpuid_Invariant_TSC)
}
func getName() string {
if maxExtendedFunction() >= 0x80000004 {
v := make([]uint32, 0, 48)
for i := uint32(0); i < 3; i++ {
a, b, c, d := cpuid(0x80000002+i, 0)
v = append(v, a, b, c, d)
}
return strings.Trim(string(valAsString(v...)), " ")
}
return "unknown"
}
// getNativeTSCFrequency gets the TSC frequency from CPUID.
// It only supports Intel CPUs (Skylake or later microarchitectures); the key information
// comes from the Intel manual and from kernel code
// (especially this commit: https://github.com/torvalds/linux/commit/604dc9170f2435d27da5039a3efd757dceadc684).
func getNativeTSCFrequency(name, sign string) uint64 {
if vendorID() != Intel {
return 0
}
if maxFunctionID() < 0x15 {
return 0
}
// ApolloLake, GeminiLake, CannonLake (and presumably all new chipsets
// from this point) report the crystal frequency directly via CPUID.0x15.
// That's definitive data that we can rely upon.
eax, ebx, ecx, _ := cpuid(0x15, 0)
// If ebx is 0, the TSC/“core crystal clock” ratio is not enumerated.
// We won't provide TSC frequency detection in this situation.
if eax == 0 || ebx == 0 {
return 0
}
// Skylake, Kabylake and all variants of those two chipsets report a
// crystal frequency of zero.
if ecx == 0 { // Crystal clock frequency is not enumerated.
ecx = getCrystalClockFrequency(sign)
}
// TSC frequency = “core crystal clock frequency” * EBX/EAX.
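// For example (hypothetical values), a 24 MHz crystal with EBX = 250 and EAX = 2
// gives 24,000,000 * (250 / 2) = 3.0 GHz.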
return uint64(ecx) * (uint64(ebx) / uint64(eax))
}
// Copied from: CPUID Signature values of DisplayFamily and DisplayModel,
// in Intel® 64 and IA-32 Architectures Software Developers Manual
// Volume 4: Model-Specific Registers
// & https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/intel-family.h
const (
IntelFam6SkylakeL = "06_4EH"
IntelFam6Skylake = "06_5EH"
IntelFam6SkylakeX = "06_55H"
IntelFam6KabylakeL = "06_8EH"
IntelFam6Kabylake = "06_9EH"
)
// getCrystalClockFrequency gets the nominal core crystal clock frequency
// for Intel processors on which the CPUID.15H.EBX[31:0]/CPUID.15H.EAX[31:0] ratio is enumerated
// but CPUID.15H.ECX is not, using a hardcoded table of known frequencies.
//
// The crystal clock frequencies in Intel's hardcoded tables are not perfectly accurate in some cases;
// e.g. Skylake server CPUs may be affected (all SKX subject the crystal to an EMI reduction circuit that
// reduces its actual frequency by approximately 0.25%):
// see https://lore.kernel.org/lkml/ff6dcea166e8ff8f2f6a03c17beab2cb436aa779.1513920414.git.len.brown@intel.com/
// for more details.
// Based on that report, I apply a coefficient (0.9975) for IntelFam6SkylakeX.
//
// Unlike the kernel approach (mentioned in https://github.com/torvalds/linux/commit/604dc9170f2435d27da5039a3efd757dceadc684),
// I prefer the Intel hardcoded tables,
// because after some testing (comparing against the wall clock, see https://github.com/templexxx/tsc/tsc_test.go for more details),
// I found the hardcoded tables to be more accurate.
func getCrystalClockFrequency(sign string) uint32 {
if maxFunctionID() < 0x16 {
return 0
}
switch sign {
case IntelFam6SkylakeL:
return 24 * 1000 * 1000
case IntelFam6Skylake:
return 24 * 1000 * 1000
case IntelFam6SkylakeX:
return 25 * 1000 * 1000 * 0.9975
case IntelFam6KabylakeL:
return 24 * 1000 * 1000
case IntelFam6Kabylake:
return 24 * 1000 * 1000
}
return 0
}
func getFamilyModel() (uint32, uint32) {
if maxFunctionID() < 0x1 {
return 0, 0
}
eax, _, _, _ := cpuid(1, 0)
family := (eax >> 8) & 0xf
displayFamily := family
if family == 0xf {
displayFamily = ((eax >> 20) & 0xff) + family
}
model := (eax >> 4) & 0xf
displayModel := model
if family == 0x6 || family == 0xf {
displayModel = ((eax >> 12) & 0xf0) + model
}
return displayFamily, displayModel
}
// signature format: XX_XXH
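// For example, family 0x6 and model 0x9E produce "06_9EH" (IntelFam6Kabylake above).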
func makeSignature(family, model uint32) string {
signature := strings.ToUpper(fmt.Sprintf("0%x_0%xH", family, model))
ss := strings.Split(signature, "_")
for i, s := range ss {
// We may have inserted one `0` too many; drop it.
if len(s) > 2 {
s = s[1:]
ss[i] = s
}
}
return strings.Join(ss, "_")
}
// getCacheSize is from
// https://github.com/klauspost/cpuid/blob/5a626f7029c910cc8329dae5405ee4f65034bce5/cpuid.go#L723
func getCacheSize() Cache {
c := Cache{
L1I: -1,
L1D: -1,
L2: -1,
L3: -1,
}
vendor := vendorID()
switch vendor {
case Intel:
if maxFunctionID() < 4 {
return c
}
for i := uint32(0); ; i++ {
eax, ebx, ecx, _ := cpuid(4, i)
cacheType := eax & 15
if cacheType == 0 {
break
}
cacheLevel := (eax >> 5) & 7
coherency := int(ebx&0xfff) + 1
partitions := int((ebx>>12)&0x3ff) + 1
associativity := int((ebx>>22)&0x3ff) + 1
sets := int(ecx) + 1
size := associativity * partitions * coherency * sets
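// For example (hypothetical values), an 8-way cache with 1 partition,
// a 64-byte line size (coherency) and 64 sets is 8*1*64*64 = 32768 bytes (32 KB).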
switch cacheLevel {
case 1:
if cacheType == 1 {
// 1 = Data Cache
c.L1D = size
} else if cacheType == 2 {
// 2 = Instruction Cache
c.L1I = size
} else {
// Unified cache (cacheType 3): record the size in whichever L1 field is still unknown.
if c.L1D < 0 {
c.L1D = size
}
if c.L1I < 0 {
c.L1I = size
}
}
case 2:
c.L2 = size
case 3:
c.L3 = size
}
}
case AMD, Hygon:
// Untested.
if maxExtendedFunction() < 0x80000005 {
return c
}
_, _, ecx, edx := cpuid(0x80000005, 0)
c.L1D = int(((ecx >> 24) & 0xFF) * 1024)
c.L1I = int(((edx >> 24) & 0xFF) * 1024)
if maxExtendedFunction() < 0x80000006 {
return c
}
_, _, ecx, _ = cpuid(0x80000006, 0)
c.L2 = int(((ecx >> 16) & 0xFFFF) * 1024)
}
return c
}
func maxFunctionID() uint32 {
a, _, _, _ := cpuid(0, 0)
return a
}
func maxExtendedFunction() uint32 {
eax, _, _, _ := cpuid(0x80000000, 0)
return eax
}
const (
Other = iota
Intel
AMD
VIA
Transmeta
NSC
KVM // Kernel-based Virtual Machine
MSVM // Microsoft Hyper-V or Windows Virtual PC
VMware
XenHVM
Bhyve
Hygon
)
// Excerpt from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID
var vendorMapping = map[string]int{
"AMDisbetter!": AMD,
"AuthenticAMD": AMD,
"CentaurHauls": VIA,
"GenuineIntel": Intel,
"TransmetaCPU": Transmeta,
"GenuineTMx86": Transmeta,
"Geode by NSC": NSC,
"VIA VIA VIA ": VIA,
"KVMKVMKVMKVM": KVM,
"Microsoft Hv": MSVM,
"VMwareVMware": VMware,
"XenVMMXenVMM": XenHVM,
"bhyve bhyve ": Bhyve,
"HygonGenuine": Hygon,
}
func vendorID() int {
_, b, c, d := cpuid(0, 0)
v := valAsString(b, d, c)
vend, ok := vendorMapping[string(v)]
if !ok {
return Other
}
return vend
}
func valAsString(values ...uint32) []byte {
r := make([]byte, 4*len(values))
for i, v := range values {
dst := r[i*4:]
dst[0] = byte(v & 0xff)
dst[1] = byte((v >> 8) & 0xff)
dst[2] = byte((v >> 16) & 0xff)
dst[3] = byte((v >> 24) & 0xff)
switch {
case dst[0] == 0:
return r[:i*4]
case dst[1] == 0:
return r[:i*4+1]
case dst[2] == 0:
return r[:i*4+2]
case dst[3] == 0:
return r[:i*4+3]
}
}
return r
}

32
vendor/github.com/templexxx/cpu/cpu_x86.s generated vendored Normal file

@@ -0,0 +1,32 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build 386 amd64 amd64p32
#include "textflag.h"
// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
TEXT ·cpuid(SB), NOSPLIT, $0-24
MOVL eaxArg+0(FP), AX
MOVL ecxArg+4(FP), CX
CPUID
MOVL AX, eax+8(FP)
MOVL BX, ebx+12(FP)
MOVL CX, ecx+16(FP)
MOVL DX, edx+20(FP)
RET
// func xgetbv() (eax, edx uint32)
TEXT ·xgetbv(SB),NOSPLIT,$0-8
#ifdef GOOS_nacl
// nacl does not support XGETBV.
MOVL $0, eax+0(FP)
MOVL $0, edx+4(FP)
#else
MOVL $0, CX
XGETBV
MOVL AX, eax+0(FP)
MOVL DX, edx+4(FP)
#endif
RET

1
vendor/github.com/templexxx/xorsimd/.gitattributes generated vendored Normal file

@@ -0,0 +1 @@
*.s linguist-language=go:x

13
vendor/github.com/templexxx/xorsimd/.gitignore generated vendored Normal file

@@ -0,0 +1,13 @@
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib
# Test binary, build with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
.idea

21
vendor/github.com/templexxx/xorsimd/LICENSE generated vendored Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 Temple3x (temple3x@gmail.com)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

46
vendor/github.com/templexxx/xorsimd/README.md generated vendored Normal file

@@ -0,0 +1,46 @@
# XOR SIMD
[![GoDoc][1]][2] [![MIT licensed][3]][4] [![Build Status][5]][6] [![Go Report Card][7]][8] [![Sourcegraph][9]][10]
[1]: https://godoc.org/github.com/templexxx/xorsimd?status.svg
[2]: https://godoc.org/github.com/templexxx/xorsimd
[3]: https://img.shields.io/badge/license-MIT-blue.svg
[4]: LICENSE
[5]: https://github.com/templexxx/xorsimd/workflows/unit-test/badge.svg
[6]: https://github.com/templexxx/xorsimd
[7]: https://goreportcard.com/badge/github.com/templexxx/xorsimd
[8]: https://goreportcard.com/report/github.com/templexxx/xorsimd
[9]: https://sourcegraph.com/github.com/templexxx/xorsimd/-/badge.svg
[10]: https://sourcegraph.com/github.com/templexxx/xorsimd?badge
## Introduction:
>- XOR code engine in pure Go.
>
>- [High Performance](https://github.com/templexxx/xorsimd#performance):
More than 270 GB/s per physical core.
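## Example
A minimal usage sketch (the import path matches this vendored package, `github.com/templexxx/xorsimd`; `Bytes` and `Encode` are defined in `xor.go`):
```go
package main

import (
	"fmt"

	"github.com/templexxx/xorsimd"
)

func main() {
	a := []byte{0x0f, 0xf0, 0xaa}
	b := []byte{0xff, 0xff, 0x55}
	dst := make([]byte, len(a))

	// Bytes XORs a and b into dst and returns the number of bytes written,
	// which is the minimum of len(dst), len(a) and len(b).
	n := xorsimd.Bytes(dst, a, b)
	fmt.Println(n, dst) // 3 [240 15 255]
}
```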
## Performance
Performance depends mainly on:
>- CPU instruction extension.
>
>- Number of source row vectors.
**Platform:**
*AWS c5d.xlarge (Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz)*
**All tests run on a single core.**
`I/O = (src_num + 1) * vector_size / cost`
| Src Num | Vector size | AVX512 I/O (MB/S) | AVX2 I/O (MB/S) |SSE2 I/O (MB/S) |
|-------|-------------|-------------|---------------|---------------|
|5|4KB| 270403.73 | 142825.25 | 74443.91 |
|5|1MB| 26948.34 | 26887.37 | 26950.65 |
|5|8MB| 17881.32 | 17212.56 | 16402.97 |
|10|4KB| 190445.30 | 102953.59 | 53244.04 |
|10|1MB| 26424.44 | 26618.65 | 26094.39 |
|10|8MB| 15471.31 | 14866.72 | 13565.80 |

5
vendor/github.com/templexxx/xorsimd/go.mod generated vendored Normal file

@@ -0,0 +1,5 @@
module github.com/templexxx/xorsimd
require github.com/templexxx/cpu v0.0.1
go 1.13

2
vendor/github.com/templexxx/xorsimd/go.sum generated vendored Normal file

@@ -0,0 +1,2 @@
github.com/templexxx/cpu v0.0.1 h1:hY4WdLOgKdc8y13EYklu9OUTXik80BkxHoWvTO6MQQY=
github.com/templexxx/cpu v0.0.1/go.mod h1:w7Tb+7qgcAlIyX4NhLuDKt78AHA5SzPmq0Wj6HiEnnk=

89
vendor/github.com/templexxx/xorsimd/xor.go generated vendored Normal file

@@ -0,0 +1,89 @@
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
package xorsimd
import "github.com/templexxx/cpu"
// EnableAVX512 controls whether the AVX-512 code path may be used; AVX-512 may slow down the CPU clock (or may not).
// TODO: needs more research:
// https://lemire.me/blog/2018/04/19/by-how-much-does-avx-512-slow-down-your-cpu-a-first-experiment/
var EnableAVX512 = true
// cpuFeature indicates which instruction set will be used.
var cpuFeature = getCPUFeature()
const (
avx512 = iota
avx2
sse2
generic
)
// TODO: Add ARM feature...
func getCPUFeature() int {
if hasAVX512() && EnableAVX512 {
return avx512
} else if cpu.X86.HasAVX2 {
return avx2
} else {
return sse2 // amd64 is guaranteed to have SSE2
}
}
func hasAVX512() (ok bool) {
return cpu.X86.HasAVX512VL &&
cpu.X86.HasAVX512BW &&
cpu.X86.HasAVX512F &&
cpu.X86.HasAVX512DQ
}
// Encode XORs the source slices into the
// destination slice. The sources and destination may overlap.
// Encode returns the number of bytes encoded, which will be the minimum of
// len(src[i]) and len(dst).
func Encode(dst []byte, src [][]byte) (n int) {
n = checkLen(dst, src)
if n == 0 {
return
}
dst = dst[:n]
for i := range src {
src[i] = src[i][:n]
}
if len(src) == 1 {
copy(dst, src[0])
return
}
encode(dst, src)
return
}
func checkLen(dst []byte, src [][]byte) int {
n := len(dst)
for i := range src {
if len(src[i]) < n {
n = len(src[i])
}
}
if n <= 0 {
return 0
}
return n
}
// Bytes XORs the bytes in a and b into a
// destination slice. The source and destination may overlap.
//
// Bytes returns the number of bytes encoded, which will be the minimum of
// len(dst), len(a), len(b).
func Bytes(dst, a, b []byte) int {
return Encode(dst, [][]byte{a, b})
}

95
vendor/github.com/templexxx/xorsimd/xor_amd64.go generated vendored Normal file

@@ -0,0 +1,95 @@
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
package xorsimd
func encode(dst []byte, src [][]byte) {
switch cpuFeature {
case avx512:
encodeAVX512(dst, src)
case avx2:
encodeAVX2(dst, src)
default:
encodeSSE2(dst, src)
}
return
}
// Bytes8 XORs 8 bytes.
// The slice arguments dst, a and b are assumed to have length at least 8;
// if not, Bytes8 will panic.
func Bytes8(dst, a, b []byte) {
bytes8(&dst[0], &a[0], &b[0])
}
// Bytes16 XORs 16 packed bytes.
// The slice arguments dst, a and b are assumed to have length at least 16;
// if not, Bytes16 will panic.
func Bytes16(dst, a, b []byte) {
bytes16(&dst[0], &a[0], &b[0])
}
// Bytes8Align XORs 8 bytes.
// The slice arguments dst, a and b are assumed to have length at least 8;
// if not, Bytes8Align will panic.
func Bytes8Align(dst, a, b []byte) {
bytes8(&dst[0], &a[0], &b[0])
}
// Bytes16Align XORs 16 packed bytes.
// The slice arguments dst, a and b are assumed to have length at least 16;
// if not, Bytes16Align will panic.
func Bytes16Align(dst, a, b []byte) {
bytes16(&dst[0], &a[0], &b[0])
}
// BytesA XORs the len(a) bytes in a and b into a
// destination slice.
// The destination must have enough space.
//
// It is intended for encoding small byte slices (fewer than a few dozen bytes)
// that may not be aligned to 8 or 16 bytes.
// If the length is large, it is better to use 'func Bytes(dst, a, b []byte)'
// instead to get better performance.
func BytesA(dst, a, b []byte) {
bytesN(&dst[0], &a[0], &b[0], len(a))
}
// BytesB XORs the len(b) bytes in a and b into a
// destination slice.
// The destination must have enough space.
//
// It is intended for encoding small byte slices (fewer than a few dozen bytes)
// that may not be aligned to 8 or 16 bytes.
// If the length is large, it is better to use 'func Bytes(dst, a, b []byte)'
// instead to get better performance.
func BytesB(dst, a, b []byte) {
bytesN(&dst[0], &a[0], &b[0], len(b))
}
//go:noescape
func encodeAVX512(dst []byte, src [][]byte)
//go:noescape
func encodeAVX2(dst []byte, src [][]byte)
//go:noescape
func encodeSSE2(dst []byte, src [][]byte)
//go:noescape
func bytesN(dst, a, b *byte, n int)
//go:noescape
func bytes8(dst, a, b *byte)
//go:noescape
func bytes16(dst, a, b *byte)

205
vendor/github.com/templexxx/xorsimd/xor_generic.go generated vendored Normal file

@@ -0,0 +1,205 @@
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
//
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64
package xorsimd
import (
"runtime"
"unsafe"
)
const wordSize = int(unsafe.Sizeof(uintptr(0)))
const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
func encode(dst []byte, src [][]byte) {
if supportsUnaligned {
fastEncode(dst, src, len(dst))
} else {
// TODO(hanwen): if (dst, a, b) have common alignment
// we could still try fastEncode. It is not clear
// how often this happens, and it's only worth it if
// the block encryption itself is hardware
// accelerated.
safeEncode(dst, src, len(dst))
}
}
// fastEncode XORs in bulk. It only works on architectures that
// support unaligned reads/writes.
func fastEncode(dst []byte, src [][]byte, n int) {
w := n / wordSize
if w > 0 {
wordBytes := w * wordSize
wordAlignSrc := make([][]byte, len(src))
for i := range src {
wordAlignSrc[i] = src[i][:wordBytes]
}
fastEnc(dst[:wordBytes], wordAlignSrc)
}
for i := n - n%wordSize; i < n; i++ {
s := src[0][i]
for j := 1; j < len(src); j++ {
s ^= src[j][i]
}
dst[i] = s
}
}
func fastEnc(dst []byte, src [][]byte) {
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
sw := make([][]uintptr, len(src))
for i := range src {
sw[i] = *(*[]uintptr)(unsafe.Pointer(&src[i]))
}
n := len(dst) / wordSize
for i := 0; i < n; i++ {
s := sw[0][i]
for j := 1; j < len(sw); j++ {
s ^= sw[j][i]
}
dw[i] = s
}
}
func safeEncode(dst []byte, src [][]byte, n int) {
for i := 0; i < n; i++ {
s := src[0][i]
for j := 1; j < len(src); j++ {
s ^= src[j][i]
}
dst[i] = s
}
}
// Bytes8 XORs 8 bytes, one word at a time where possible.
// The slice arguments dst, a and b are assumed to have length at least 8;
// if not, Bytes8 will panic.
func Bytes8(dst, a, b []byte) {
bytesWords(dst[:8], a[:8], b[:8])
}
// Bytes16 XORs 16 packed bytes, one word at a time where possible.
// The slice arguments dst, a and b are assumed to have length at least 16;
// if not, Bytes16 will panic.
func Bytes16(dst, a, b []byte) {
bytesWords(dst[:16], a[:16], b[:16])
}
// bytesWords XORs multiples of 4 or 8 bytes (depending on architecture).
// The slice arguments a and b are assumed to be of equal length.
func bytesWords(dst, a, b []byte) {
if supportsUnaligned {
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
aw := *(*[]uintptr)(unsafe.Pointer(&a))
bw := *(*[]uintptr)(unsafe.Pointer(&b))
n := len(b) / wordSize
for i := 0; i < n; i++ {
dw[i] = aw[i] ^ bw[i]
}
} else {
n := len(b)
for i := 0; i < n; i++ {
dst[i] = a[i] ^ b[i]
}
}
}
// Bytes8Align XORs 8 bytes.
// The slice arguments dst, a and b are assumed to have length at least 8;
// if not, Bytes8Align will panic.
//
// All the byte slices must be aligned to wordsize.
func Bytes8Align(dst, a, b []byte) {
bytesWordsAlign(dst[:8], a[:8], b[:8])
}
// Bytes16Align XORs 16 packed bytes.
// The slice arguments dst, a and b are assumed to have length at least 16;
// if not, Bytes16Align will panic.
//
// All the byte slices must be aligned to wordsize.
func Bytes16Align(dst, a, b []byte) {
bytesWordsAlign(dst[:16], a[:16], b[:16])
}
// bytesWordsAlign XORs multiples of 4 or 8 bytes (depending on architecture).
// The slice arguments a and b are assumed to be of equal length.
//
// All the byte slices must be aligned to wordsize.
func bytesWordsAlign(dst, a, b []byte) {
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
aw := *(*[]uintptr)(unsafe.Pointer(&a))
bw := *(*[]uintptr)(unsafe.Pointer(&b))
n := len(b) / wordSize
for i := 0; i < n; i++ {
dw[i] = aw[i] ^ bw[i]
}
}
// BytesA XORs the len(a) bytes in a and b into a
// destination slice.
// The destination must have enough space.
//
// It is intended for encoding small byte slices (fewer than a few dozen bytes)
// that may not be aligned to 8 or 16 bytes.
// If the length is large, it is better to use 'func Bytes(dst, a, b []byte)'
// instead to get better performance.
func BytesA(dst, a, b []byte) {
n := len(a)
bytesN(dst[:n], a[:n], b[:n], n)
}
// BytesB XORs the len(b) bytes in a and b into a
// destination slice.
// The destination must have enough space.
//
// It is intended for encoding small byte slices (fewer than a few dozen bytes)
// that may not be aligned to 8 or 16 bytes.
// If the length is large, it is better to use 'func Bytes(dst, a, b []byte)'
// instead to get better performance.
func BytesB(dst, a, b []byte) {
n := len(b)
bytesN(dst[:n], a[:n], b[:n], n)
}
func bytesN(dst, a, b []byte, n int) {
switch {
case supportsUnaligned:
w := n / wordSize
if w > 0 {
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
aw := *(*[]uintptr)(unsafe.Pointer(&a))
bw := *(*[]uintptr)(unsafe.Pointer(&b))
for i := 0; i < w; i++ {
dw[i] = aw[i] ^ bw[i]
}
}
for i := (n - n%wordSize); i < n; i++ {
dst[i] = a[i] ^ b[i]
}
default:
for i := 0; i < n; i++ {
dst[i] = a[i] ^ b[i]
}
}
}

124
vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s generated vendored Normal file

@@ -0,0 +1,124 @@
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
#include "textflag.h"
#define dst BX // parity's address
#define d2src SI // two-dimension src_slice's address
#define csrc CX // cnt of src
#define len DX // len of vect
#define pos R8 // job position in vect
#define csrc_tmp R9
#define d2src_off R10
#define src_tmp R11
#define not_aligned_len R12
#define src_val0 R13
#define src_val1 R14
// func encodeAVX2(dst []byte, src [][]byte)
TEXT ·encodeAVX2(SB), NOSPLIT, $0
MOVQ d+0(FP), dst
MOVQ s+24(FP), d2src
MOVQ c+32(FP), csrc
MOVQ l+8(FP), len
TESTQ $127, len
JNZ not_aligned
aligned:
MOVQ $0, pos
loop128b:
MOVQ csrc, csrc_tmp // store src_cnt -> csrc_tmp
SUBQ $2, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp
VMOVDQU (src_tmp)(pos*1), Y0
VMOVDQU 32(src_tmp)(pos*1), Y1
VMOVDQU 64(src_tmp)(pos*1), Y2
VMOVDQU 96(src_tmp)(pos*1), Y3
next_vect:
ADDQ $24, d2src_off // len(slice) = 24
MOVQ (d2src)(d2src_off*1), src_tmp // next data_vect
VMOVDQU (src_tmp)(pos*1), Y4
VMOVDQU 32(src_tmp)(pos*1), Y5
VMOVDQU 64(src_tmp)(pos*1), Y6
VMOVDQU 96(src_tmp)(pos*1), Y7
VPXOR Y4, Y0, Y0
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
SUBQ $1, csrc_tmp
JGE next_vect
VMOVDQU Y0, (dst)(pos*1)
VMOVDQU Y1, 32(dst)(pos*1)
VMOVDQU Y2, 64(dst)(pos*1)
VMOVDQU Y3, 96(dst)(pos*1)
ADDQ $128, pos
CMPQ len, pos
JNE loop128b
VZEROUPPER
RET
loop_1b:
MOVQ csrc, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
SUBQ $2, csrc_tmp
MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src
next_vect_1b:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVB -1(src_tmp)(len*1), src_val1
XORB src_val1, src_val0
SUBQ $1, csrc_tmp
JGE next_vect_1b
MOVB src_val0, -1(dst)(len*1)
SUBQ $1, len
TESTQ $7, len
JNZ loop_1b
CMPQ len, $0
JE ret
TESTQ $127, len
JZ aligned
not_aligned:
TESTQ $7, len
JNE loop_1b
MOVQ len, not_aligned_len
ANDQ $127, not_aligned_len
loop_8b:
MOVQ csrc, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
SUBQ $2, csrc_tmp
MOVQ -8(src_tmp)(len*1), src_val0
next_vect_8b:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVQ -8(src_tmp)(len*1), src_val1
XORQ src_val1, src_val0
SUBQ $1, csrc_tmp
JGE next_vect_8b
MOVQ src_val0, -8(dst)(len*1)
SUBQ $8, len
SUBQ $8, not_aligned_len
JG loop_8b
CMPQ len, $128
JGE aligned
RET
ret:
RET

124
vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s generated vendored Normal file

@@ -0,0 +1,124 @@
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
#include "textflag.h"
#define dst BX // parity's address
#define d2src SI // two-dimension src_slice's address
#define csrc CX // cnt of src
#define len DX // len of vect
#define pos R8 // job position in vect
#define csrc_tmp R9
#define d2src_off R10
#define src_tmp R11
#define not_aligned_len R12
#define src_val0 R13
#define src_val1 R14
// func encodeAVX512(dst []byte, src [][]byte)
TEXT ·encodeAVX512(SB), NOSPLIT, $0
MOVQ d+0(FP), dst
MOVQ src+24(FP), d2src
MOVQ c+32(FP), csrc
MOVQ l+8(FP), len
TESTQ $255, len
JNZ not_aligned
aligned:
MOVQ $0, pos
loop256b:
MOVQ csrc, csrc_tmp // store src_cnt -> csrc_tmp
SUBQ $2, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp
VMOVDQU8 (src_tmp)(pos*1), Z0
VMOVDQU8 64(src_tmp)(pos*1), Z1
VMOVDQU8 128(src_tmp)(pos*1), Z2
VMOVDQU8 192(src_tmp)(pos*1), Z3
next_vect:
ADDQ $24, d2src_off // len(slice) = 24
MOVQ (d2src)(d2src_off*1), src_tmp // next data_vect
VMOVDQU8 (src_tmp)(pos*1), Z4
VMOVDQU8 64(src_tmp)(pos*1), Z5
VMOVDQU8 128(src_tmp)(pos*1), Z6
VMOVDQU8 192(src_tmp)(pos*1), Z7
VPXORQ Z4, Z0, Z0
VPXORQ Z5, Z1, Z1
VPXORQ Z6, Z2, Z2
VPXORQ Z7, Z3, Z3
SUBQ $1, csrc_tmp
JGE next_vect
VMOVDQU8 Z0, (dst)(pos*1)
VMOVDQU8 Z1, 64(dst)(pos*1)
VMOVDQU8 Z2, 128(dst)(pos*1)
VMOVDQU8 Z3, 192(dst)(pos*1)
ADDQ $256, pos
CMPQ len, pos
JNE loop256b
VZEROUPPER
RET
loop_1b:
MOVQ csrc, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
SUBQ $2, csrc_tmp
MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src
next_vect_1b:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVB -1(src_tmp)(len*1), src_val1
XORB src_val1, src_val0
SUBQ $1, csrc_tmp
JGE next_vect_1b
MOVB src_val0, -1(dst)(len*1)
SUBQ $1, len
TESTQ $7, len
JNZ loop_1b
CMPQ len, $0
JE ret
TESTQ $255, len
JZ aligned
not_aligned:
TESTQ $7, len
JNE loop_1b
MOVQ len, not_aligned_len
ANDQ $255, not_aligned_len
loop_8b:
MOVQ csrc, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
SUBQ $2, csrc_tmp
MOVQ -8(src_tmp)(len*1), src_val0
next_vect_8b:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVQ -8(src_tmp)(len*1), src_val1
XORQ src_val1, src_val0
SUBQ $1, csrc_tmp
JGE next_vect_8b
MOVQ src_val0, -8(dst)(len*1)
SUBQ $8, len
SUBQ $8, not_aligned_len
JG loop_8b
CMPQ len, $256
JGE aligned
RET
ret:
RET

72
vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s generated vendored Normal file

@@ -0,0 +1,72 @@
#include "textflag.h"
// func bytesN(dst, a, b *byte, n int)
TEXT ·bytesN(SB), NOSPLIT, $0
MOVQ d+0(FP), BX
MOVQ a+8(FP), SI
MOVQ b+16(FP), CX
MOVQ n+24(FP), DX
TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
JNZ not_aligned
aligned:
MOVQ $0, AX // position in slices
loop16b:
MOVOU (SI)(AX*1), X0 // XOR 16byte forwards.
MOVOU (CX)(AX*1), X1
PXOR X1, X0
MOVOU X0, (BX)(AX*1)
ADDQ $16, AX
CMPQ DX, AX
JNE loop16b
RET
loop_1b:
SUBQ $1, DX // XOR 1byte backwards.
MOVB (SI)(DX*1), DI
MOVB (CX)(DX*1), AX
XORB AX, DI
MOVB DI, (BX)(DX*1)
TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
JNZ loop_1b
CMPQ DX, $0 // if len is 0, ret.
JE ret
TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
JZ aligned
not_aligned:
TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
JNE loop_1b
SUBQ $8, DX // XOR 8bytes backwards.
MOVQ (SI)(DX*1), DI
MOVQ (CX)(DX*1), AX
XORQ AX, DI
MOVQ DI, (BX)(DX*1)
CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
JGE aligned
ret:
RET
// func bytes8(dst, a, b *byte)
TEXT ·bytes8(SB), NOSPLIT, $0
MOVQ d+0(FP), BX
MOVQ a+8(FP), SI
MOVQ b+16(FP), CX
MOVQ (SI), DI
MOVQ (CX), AX
XORQ AX, DI
MOVQ DI, (BX)
RET
// func bytes16(dst, a, b *byte)
TEXT ·bytes16(SB), NOSPLIT, $0
MOVQ d+0(FP), BX
MOVQ a+8(FP), SI
MOVQ b+16(FP), CX
MOVOU (SI), X0
MOVOU (CX), X1
PXOR X1, X0
MOVOU X0, (BX)
RET

123
vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s generated vendored Normal file

@@ -0,0 +1,123 @@
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
#include "textflag.h"
#define dst BX // parity's address
#define d2src SI // two-dimension src_slice's address
#define csrc CX // cnt of src
#define len DX // len of vect
#define pos R8 // job position in vect
#define csrc_tmp R9
#define d2src_off R10
#define src_tmp R11
#define not_aligned_len R12
#define src_val0 R13
#define src_val1 R14
// func encodeSSE2(dst []byte, src [][]byte)
TEXT ·encodeSSE2(SB), NOSPLIT, $0
MOVQ d+0(FP), dst
MOVQ src+24(FP), d2src
MOVQ c+32(FP), csrc
MOVQ l+8(FP), len
TESTQ $63, len
JNZ not_aligned
aligned:
MOVQ $0, pos
loop64b:
MOVQ csrc, csrc_tmp
SUBQ $2, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVOU (src_tmp)(pos*1), X0
MOVOU 16(src_tmp)(pos*1), X1
MOVOU 32(src_tmp)(pos*1), X2
MOVOU 48(src_tmp)(pos*1), X3
next_vect:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVOU (src_tmp)(pos*1), X4
MOVOU 16(src_tmp)(pos*1), X5
MOVOU 32(src_tmp)(pos*1), X6
MOVOU 48(src_tmp)(pos*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
SUBQ $1, csrc_tmp
JGE next_vect
MOVOU X0, (dst)(pos*1)
MOVOU X1, 16(dst)(pos*1)
MOVOU X2, 32(dst)(pos*1)
MOVOU X3, 48(dst)(pos*1)
ADDQ $64, pos
CMPQ len, pos
JNE loop64b
RET
loop_1b:
MOVQ csrc, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
SUBQ $2, csrc_tmp
MOVB -1(src_tmp)(len*1), src_val0
next_vect_1b:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVB -1(src_tmp)(len*1), src_val1
XORB src_val1, src_val0
SUBQ $1, csrc_tmp
JGE next_vect_1b
MOVB src_val0, -1(dst)(len*1)
SUBQ $1, len
TESTQ $7, len
JNZ loop_1b
CMPQ len, $0
JE ret
TESTQ $63, len
JZ aligned
not_aligned:
TESTQ $7, len
JNE loop_1b
MOVQ len, not_aligned_len
ANDQ $63, not_aligned_len
loop_8b:
MOVQ csrc, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
SUBQ $2, csrc_tmp
MOVQ -8(src_tmp)(len*1), src_val0
next_vect_8b:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVQ -8(src_tmp)(len*1), src_val1
XORQ src_val1, src_val0
SUBQ $1, csrc_tmp
JGE next_vect_8b
MOVQ src_val0, -8(dst)(len*1)
SUBQ $8, len
SUBQ $8, not_aligned_len
JG loop_8b
CMPQ len, $64
JGE aligned
RET
ret:
RET