clone from danieldin95

This commit is contained in:
sicheng
2022-07-29 23:38:54 +08:00
commit ac4f79bbf4
1931 changed files with 568263 additions and 0 deletions

12
vendor/github.com/templexxx/cpu/.gitignore generated vendored Normal file

@@ -0,0 +1,12 @@
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib
# Test binary, build with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out

32
vendor/github.com/templexxx/cpu/LICENSE generated vendored Normal file

@@ -0,0 +1,32 @@
BSD 3-Clause License
Copyright (c) 2018 Temple3x (temple3x@gmail.com)
Copyright 2017 The Go Authors
Copyright (c) 2015 Klaus Post
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

23
vendor/github.com/templexxx/cpu/README.md generated vendored Normal file

@@ -0,0 +1,23 @@
# cpu
internal/cpu (from the Go standard library) with these additional detections:
>- AVX512
>
>- Cache Size
>
>- Invariant TSC
>
It also provides:
>- False sharing range; see `X86FalseSharingRange` for the x86 platform.
>
>- TSC frequency
>
>- Name
>
>- Family & Model
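# Example
A minimal usage sketch (the import path matches this vendored package, `github.com/templexxx/cpu`; the fields read below are the ones declared in `cpu.go`):
```go
package main

import (
	"fmt"

	"github.com/templexxx/cpu"
)

func main() {
	// Every field is filled in by this package's init when it is imported.
	fmt.Println("name:", cpu.X86.Name)
	fmt.Println("signature:", cpu.X86.Signature) // DisplayFamily_DisplayModel, e.g. "06_9EH"
	fmt.Println("has AVX-512F:", cpu.X86.HasAVX512F)
	fmt.Println("L1 data cache bytes:", cpu.X86.Cache.L1D) // -1 if undetected
	if cpu.X86.HasInvariantTSC && cpu.X86.TSCFrequency != 0 {
		fmt.Println("TSC frequency (Hz):", cpu.X86.TSCFrequency)
	}
}
```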
# Acknowledgement
[klauspost/cpuid](https://github.com/klauspost/cpuid)

234
vendor/github.com/templexxx/cpu/cpu.go generated vendored Normal file

@@ -0,0 +1,234 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package cpu implements processor feature detection
// used by the Go standard library.
package cpu
// debugOptions is set to true by the runtime if go was compiled with GOEXPERIMENT=debugcpu
// and GOOS is Linux or Darwin. This variable is linknamed in runtime/proc.go.
var debugOptions bool
var X86 x86
// "Loads data or instructions from memory to the second-level cache.
// To use the streamer, organize the data or instructions in blocks of 128 bytes,
// aligned on 128 bytes."
// From <Intel® 64 and IA-32 architectures optimization reference manual>,
// in section 3.7.3 "Hardware Prefetching for Second-Level Cache"
//
// In practice, I have found that using 128 bytes gives better performance than 64 bytes (one cache line).
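// For example, keeping two independently updated hot variables at least
// X86FalseSharingRange bytes apart keeps them out of the same prefetcher block;
// the padded structs below use this constant exactly that way.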
const X86FalseSharingRange = 128
// The booleans in x86 contain the correspondingly named cpuid feature bit.
// HasAVX and HasAVX2 are only set if the OS does support XMM and YMM registers
// in addition to the cpuid feature bit being set.
// The struct is padded to avoid false sharing.
type x86 struct {
_ [X86FalseSharingRange]byte
HasAES bool
HasADX bool
HasAVX bool
HasAVX2 bool
HasAVX512F bool
HasAVX512DQ bool
HasAVX512BW bool
HasAVX512VL bool
HasBMI1 bool
HasBMI2 bool
HasERMS bool
HasFMA bool
HasOSXSAVE bool
HasPCLMULQDQ bool
HasPOPCNT bool
HasSSE2 bool
HasSSE3 bool
HasSSSE3 bool
HasSSE41 bool
HasSSE42 bool
// The invariant TSC will run at a constant rate in all ACPI P-, C-, and T-states.
// This is the architectural behavior moving forward. On processors with
// invariant TSC support, the OS may use the TSC for wall clock timer services (instead of ACPI or HPET timers).
HasInvariantTSC bool
Cache Cache
// TSCFrequency is only meaningful when HasInvariantTSC == true.
// Unit: Hz.
//
// Warning:
// 1. If it is 0, the frequency could not be determined; do not use it.
// 2. Do not rely on it if you need a perfectly precise timestamp.
TSCFrequency uint64
Name string
Signature string // DisplayFamily_DisplayModel.
Family uint32 // CPU family number.
Model uint32 // CPU model number.
_ [X86FalseSharingRange]byte
}
// CPU Cache Size.
// -1 if undetected.
type Cache struct {
L1I int
L1D int
L2 int
L3 int
}
var PPC64 ppc64
// For ppc64x, it is safe to check only for ISA level starting on ISA v3.00,
// since there are no optional categories. There are some exceptions that also
// require kernel support to work (darn, scv), so there are feature bits for
// those as well. The minimum processor requirement is POWER8 (ISA 2.07), so we
// maintain some of the old feature checks for optional categories for
// safety.
// The struct is padded to avoid false sharing.
type ppc64 struct {
_ [CacheLineSize]byte
HasVMX bool // Vector unit (Altivec)
HasDFP bool // Decimal Floating Point unit
HasVSX bool // Vector-scalar unit
HasHTM bool // Hardware Transactional Memory
HasISEL bool // Integer select
HasVCRYPTO bool // Vector cryptography
HasHTMNOSC bool // HTM: kernel-aborted transaction in syscalls
HasDARN bool // Hardware random number generator (requires kernel enablement)
HasSCV bool // Syscall vectored (requires kernel enablement)
IsPOWER8 bool // ISA v2.07 (POWER8)
IsPOWER9 bool // ISA v3.00 (POWER9)
_ [CacheLineSize]byte
}
var ARM64 arm64
// The booleans in arm64 contain the correspondingly named cpu feature bit.
// The struct is padded to avoid false sharing.
type arm64 struct {
_ [CacheLineSize]byte
HasFP bool
HasASIMD bool
HasEVTSTRM bool
HasAES bool
HasPMULL bool
HasSHA1 bool
HasSHA2 bool
HasCRC32 bool
HasATOMICS bool
HasFPHP bool
HasASIMDHP bool
HasCPUID bool
HasASIMDRDM bool
HasJSCVT bool
HasFCMA bool
HasLRCPC bool
HasDCPOP bool
HasSHA3 bool
HasSM3 bool
HasSM4 bool
HasASIMDDP bool
HasSHA512 bool
HasSVE bool
HasASIMDFHM bool
_ [CacheLineSize]byte
}
var S390X s390x
type s390x struct {
_ [CacheLineSize]byte
HasZArch bool // z architecture mode is active [mandatory]
HasSTFLE bool // store facility list extended [mandatory]
HasLDisp bool // long (20-bit) displacements [mandatory]
HasEImm bool // 32-bit immediates [mandatory]
HasDFP bool // decimal floating point
HasETF3Enhanced bool // ETF-3 enhanced
HasMSA bool // message security assist (CPACF)
HasAES bool // KM-AES{128,192,256} functions
HasAESCBC bool // KMC-AES{128,192,256} functions
HasAESCTR bool // KMCTR-AES{128,192,256} functions
HasAESGCM bool // KMA-GCM-AES{128,192,256} functions
HasGHASH bool // KIMD-GHASH function
HasSHA1 bool // K{I,L}MD-SHA-1 functions
HasSHA256 bool // K{I,L}MD-SHA-256 functions
HasSHA512 bool // K{I,L}MD-SHA-512 functions
HasVX bool // vector facility. Note: the runtime sets this when it processes auxv records.
_ [CacheLineSize]byte
}
// init examines the processor and sets the relevant variables above.
// It runs automatically when this package is imported, before the feature
// variables are read by importing packages; the debug option string passed
// to processOptions is left empty here.
func init() {
doinit()
processOptions("")
}
// options contains the cpu debug options that can be used in GODEBUGCPU.
// Options are arch dependent and are added by the arch specific doinit functions.
// Features that are mandatory for the specific GOARCH should not be added to options
// (e.g. SSE2 on amd64).
var options []option
// Option names should be lower case. e.g. avx instead of AVX.
type option struct {
Name string
Feature *bool
}
// processOptions disables CPU feature values based on the parsed env string.
// The env string is expected to be of the form feature1=0,feature2=0...
// where each feature name is one of the entries in the architecture-specific
// list stored in the cpu package's options variable. If env contains all=0
// then all capabilities referenced through the options variable are disabled.
// Unknown feature names and values other than 0 are silently ignored.
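// For example (illustration only; valid names come from the per-GOARCH doinit),
// processOptions("avx2=0,avx512f=0") would clear X86.HasAVX2 and X86.HasAVX512F.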
func processOptions(env string) {
field:
for env != "" {
field := ""
i := indexByte(env, ',')
if i < 0 {
field, env = env, ""
} else {
field, env = env[:i], env[i+1:]
}
i = indexByte(field, '=')
if i < 0 {
continue
}
key, value := field[:i], field[i+1:]
// Only allow turning off CPU features by specifying '0'.
if value == "0" {
if key == "all" {
for _, v := range options {
*v.Feature = false
}
return
} else {
for _, v := range options {
if v.Name == key {
*v.Feature = false
continue field
}
}
}
}
}
}
// indexByte returns the index of the first instance of c in s,
// or -1 if c is not present in s.
func indexByte(s string, c byte) int {
for i := 0; i < len(s); i++ {
if s[i] == c {
return i
}
}
return -1
}

7
vendor/github.com/templexxx/cpu/cpu_386.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const GOARCH = "386"

7
vendor/github.com/templexxx/cpu/cpu_amd64.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const GOARCH = "amd64"

7
vendor/github.com/templexxx/cpu/cpu_amd64p32.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const GOARCH = "amd64p32"

7
vendor/github.com/templexxx/cpu/cpu_arm.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 32

102
vendor/github.com/templexxx/cpu/cpu_arm64.go generated vendored Normal file

@@ -0,0 +1,102 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 64
// arm64 doesn't have a 'cpuid' equivalent, so we rely on HWCAP/HWCAP2.
// These are linknamed in runtime/os_linux_arm64.go and are initialized by
// archauxv().
var hwcap uint
var hwcap2 uint
// HWCAP/HWCAP2 bits. These are exposed by Linux.
const (
hwcap_FP = (1 << 0)
hwcap_ASIMD = (1 << 1)
hwcap_EVTSTRM = (1 << 2)
hwcap_AES = (1 << 3)
hwcap_PMULL = (1 << 4)
hwcap_SHA1 = (1 << 5)
hwcap_SHA2 = (1 << 6)
hwcap_CRC32 = (1 << 7)
hwcap_ATOMICS = (1 << 8)
hwcap_FPHP = (1 << 9)
hwcap_ASIMDHP = (1 << 10)
hwcap_CPUID = (1 << 11)
hwcap_ASIMDRDM = (1 << 12)
hwcap_JSCVT = (1 << 13)
hwcap_FCMA = (1 << 14)
hwcap_LRCPC = (1 << 15)
hwcap_DCPOP = (1 << 16)
hwcap_SHA3 = (1 << 17)
hwcap_SM3 = (1 << 18)
hwcap_SM4 = (1 << 19)
hwcap_ASIMDDP = (1 << 20)
hwcap_SHA512 = (1 << 21)
hwcap_SVE = (1 << 22)
hwcap_ASIMDFHM = (1 << 23)
)
func doinit() {
options = []option{
{"evtstrm", &ARM64.HasEVTSTRM},
{"aes", &ARM64.HasAES},
{"pmull", &ARM64.HasPMULL},
{"sha1", &ARM64.HasSHA1},
{"sha2", &ARM64.HasSHA2},
{"crc32", &ARM64.HasCRC32},
{"atomics", &ARM64.HasATOMICS},
{"fphp", &ARM64.HasFPHP},
{"asimdhp", &ARM64.HasASIMDHP},
{"cpuid", &ARM64.HasCPUID},
{"asimdrdm", &ARM64.HasASIMDRDM},
{"jscvt", &ARM64.HasJSCVT},
{"fcma", &ARM64.HasFCMA},
{"lrcpc", &ARM64.HasLRCPC},
{"dcpop", &ARM64.HasDCPOP},
{"sha3", &ARM64.HasSHA3},
{"sm3", &ARM64.HasSM3},
{"sm4", &ARM64.HasSM4},
{"asimddp", &ARM64.HasASIMDDP},
{"sha512", &ARM64.HasSHA512},
{"sve", &ARM64.HasSVE},
{"asimdfhm", &ARM64.HasASIMDFHM},
// These capabilities should always be enabled on arm64:
// {"fp", &ARM64.HasFP},
// {"asimd", &ARM64.HasASIMD},
}
// HWCAP feature bits
ARM64.HasFP = isSet(hwcap, hwcap_FP)
ARM64.HasASIMD = isSet(hwcap, hwcap_ASIMD)
ARM64.HasEVTSTRM = isSet(hwcap, hwcap_EVTSTRM)
ARM64.HasAES = isSet(hwcap, hwcap_AES)
ARM64.HasPMULL = isSet(hwcap, hwcap_PMULL)
ARM64.HasSHA1 = isSet(hwcap, hwcap_SHA1)
ARM64.HasSHA2 = isSet(hwcap, hwcap_SHA2)
ARM64.HasCRC32 = isSet(hwcap, hwcap_CRC32)
ARM64.HasATOMICS = isSet(hwcap, hwcap_ATOMICS)
ARM64.HasFPHP = isSet(hwcap, hwcap_FPHP)
ARM64.HasASIMDHP = isSet(hwcap, hwcap_ASIMDHP)
ARM64.HasCPUID = isSet(hwcap, hwcap_CPUID)
ARM64.HasASIMDRDM = isSet(hwcap, hwcap_ASIMDRDM)
ARM64.HasJSCVT = isSet(hwcap, hwcap_JSCVT)
ARM64.HasFCMA = isSet(hwcap, hwcap_FCMA)
ARM64.HasLRCPC = isSet(hwcap, hwcap_LRCPC)
ARM64.HasDCPOP = isSet(hwcap, hwcap_DCPOP)
ARM64.HasSHA3 = isSet(hwcap, hwcap_SHA3)
ARM64.HasSM3 = isSet(hwcap, hwcap_SM3)
ARM64.HasSM4 = isSet(hwcap, hwcap_SM4)
ARM64.HasASIMDDP = isSet(hwcap, hwcap_ASIMDDP)
ARM64.HasSHA512 = isSet(hwcap, hwcap_SHA512)
ARM64.HasSVE = isSet(hwcap, hwcap_SVE)
ARM64.HasASIMDFHM = isSet(hwcap, hwcap_ASIMDFHM)
}
func isSet(hwc uint, value uint) bool {
return hwc&value != 0
}

7
vendor/github.com/templexxx/cpu/cpu_mips.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpu/cpu_mips64.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpu/cpu_mips64le.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 32

7
vendor/github.com/templexxx/cpu/cpu_mipsle.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 32

16
vendor/github.com/templexxx/cpu/cpu_no_init.go generated vendored Normal file

@@ -0,0 +1,16 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !386
// +build !amd64
// +build !amd64p32
// +build !arm64
// +build !ppc64
// +build !ppc64le
// +build !s390x
package cpu
func doinit() {
}

68
vendor/github.com/templexxx/cpu/cpu_ppc64x.go generated vendored Normal file

@@ -0,0 +1,68 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ppc64 ppc64le
package cpu
const CacheLineSize = 128
// ppc64x doesn't have a 'cpuid' equivalent, so we rely on HWCAP/HWCAP2.
// These are linknamed in runtime/os_linux_ppc64x.go and are initialized by
// archauxv().
var hwcap uint
var hwcap2 uint
// HWCAP/HWCAP2 bits. These are exposed by the kernel.
const (
// ISA Level
_PPC_FEATURE2_ARCH_2_07 = 0x80000000
_PPC_FEATURE2_ARCH_3_00 = 0x00800000
// CPU features
_PPC_FEATURE_HAS_ALTIVEC = 0x10000000
_PPC_FEATURE_HAS_DFP = 0x00000400
_PPC_FEATURE_HAS_VSX = 0x00000080
_PPC_FEATURE2_HAS_HTM = 0x40000000
_PPC_FEATURE2_HAS_ISEL = 0x08000000
_PPC_FEATURE2_HAS_VEC_CRYPTO = 0x02000000
_PPC_FEATURE2_HTM_NOSC = 0x01000000
_PPC_FEATURE2_DARN = 0x00200000
_PPC_FEATURE2_SCV = 0x00100000
)
func doinit() {
options = []option{
{"htm", &PPC64.HasHTM},
{"htmnosc", &PPC64.HasHTMNOSC},
{"darn", &PPC64.HasDARN},
{"scv", &PPC64.HasSCV},
// These capabilities should always be enabled on ppc64 and ppc64le:
// {"vmx", &PPC64.HasVMX},
// {"dfp", &PPC64.HasDFP},
// {"vsx", &PPC64.HasVSX},
// {"isel", &PPC64.HasISEL},
// {"vcrypto", &PPC64.HasVCRYPTO},
}
// HWCAP feature bits
PPC64.HasVMX = isSet(hwcap, _PPC_FEATURE_HAS_ALTIVEC)
PPC64.HasDFP = isSet(hwcap, _PPC_FEATURE_HAS_DFP)
PPC64.HasVSX = isSet(hwcap, _PPC_FEATURE_HAS_VSX)
// HWCAP2 feature bits
PPC64.IsPOWER8 = isSet(hwcap2, _PPC_FEATURE2_ARCH_2_07)
PPC64.HasHTM = isSet(hwcap2, _PPC_FEATURE2_HAS_HTM)
PPC64.HasISEL = isSet(hwcap2, _PPC_FEATURE2_HAS_ISEL)
PPC64.HasVCRYPTO = isSet(hwcap2, _PPC_FEATURE2_HAS_VEC_CRYPTO)
PPC64.HasHTMNOSC = isSet(hwcap2, _PPC_FEATURE2_HTM_NOSC)
PPC64.IsPOWER9 = isSet(hwcap2, _PPC_FEATURE2_ARCH_3_00)
PPC64.HasDARN = isSet(hwcap2, _PPC_FEATURE2_DARN)
PPC64.HasSCV = isSet(hwcap2, _PPC_FEATURE2_SCV)
}
func isSet(hwc uint, value uint) bool {
return hwc&value != 0
}

153
vendor/github.com/templexxx/cpu/cpu_s390x.go generated vendored Normal file

@@ -0,0 +1,153 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 256
// bitIsSet reports whether the bit at index is set. The bit index
// is in big endian order, so bit index 0 is the leftmost bit.
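// For example, bitIsSet(bits, 1) tests bit 62 of bits[0] in conventional
// least-significant-bit-0 numbering (on s390x this is the zarch facility bit).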
func bitIsSet(bits []uint64, index uint) bool {
return bits[index/64]&((1<<63)>>(index%64)) != 0
}
// function is the function code for the named function.
type function uint8
const (
// KM{,A,C,CTR} function codes
aes128 function = 18 // AES-128
aes192 = 19 // AES-192
aes256 = 20 // AES-256
// K{I,L}MD function codes
sha1 = 1 // SHA-1
sha256 = 2 // SHA-256
sha512 = 3 // SHA-512
// KIMD function codes
ghash = 65 // GHASH
)
// queryResult contains the result of a Query function
// call. Bits are numbered in big endian order so the
// leftmost bit (the MSB) is at index 0.
type queryResult struct {
bits [2]uint64
}
// Has reports whether the given functions are present.
func (q *queryResult) Has(fns ...function) bool {
if len(fns) == 0 {
panic("no function codes provided")
}
for _, f := range fns {
if !bitIsSet(q.bits[:], uint(f)) {
return false
}
}
return true
}
// facility is a bit index for the named facility.
type facility uint8
const (
// mandatory facilities
zarch facility = 1 // z architecture mode is active
stflef = 7 // store-facility-list-extended
ldisp = 18 // long-displacement
eimm = 21 // extended-immediate
// miscellaneous facilities
dfp = 42 // decimal-floating-point
etf3eh = 30 // extended-translation 3 enhancement
// cryptography facilities
msa = 17 // message-security-assist
msa3 = 76 // message-security-assist extension 3
msa4 = 77 // message-security-assist extension 4
msa5 = 57 // message-security-assist extension 5
msa8 = 146 // message-security-assist extension 8
// Note: vx and highgprs are excluded because they require
// kernel support and so must be fetched from HWCAP.
)
// facilityList contains the result of an STFLE call.
// Bits are numbered in big endian order so the
// leftmost bit (the MSB) is at index 0.
type facilityList struct {
bits [4]uint64
}
// Has reports whether the given facilities are present.
func (s *facilityList) Has(fs ...facility) bool {
if len(fs) == 0 {
panic("no facility bits provided")
}
for _, f := range fs {
if !bitIsSet(s.bits[:], uint(f)) {
return false
}
}
return true
}
// The following feature detection functions are defined in cpu_s390x.s.
// They are likely to be expensive to call so the results should be cached.
func stfle() facilityList
func kmQuery() queryResult
func kmcQuery() queryResult
func kmctrQuery() queryResult
func kmaQuery() queryResult
func kimdQuery() queryResult
func klmdQuery() queryResult
func doinit() {
options = []option{
{"zarch", &S390X.HasZArch},
{"stfle", &S390X.HasSTFLE},
{"ldisp", &S390X.HasLDisp},
{"msa", &S390X.HasMSA},
{"eimm", &S390X.HasEImm},
{"dfp", &S390X.HasDFP},
{"etf3eh", &S390X.HasETF3Enhanced},
{"vx", &S390X.HasVX},
}
aes := []function{aes128, aes192, aes256}
facilities := stfle()
S390X.HasZArch = facilities.Has(zarch)
S390X.HasSTFLE = facilities.Has(stflef)
S390X.HasLDisp = facilities.Has(ldisp)
S390X.HasEImm = facilities.Has(eimm)
S390X.HasDFP = facilities.Has(dfp)
S390X.HasETF3Enhanced = facilities.Has(etf3eh)
S390X.HasMSA = facilities.Has(msa)
if S390X.HasMSA {
// cipher message
km, kmc := kmQuery(), kmcQuery()
S390X.HasAES = km.Has(aes...)
S390X.HasAESCBC = kmc.Has(aes...)
if facilities.Has(msa4) {
kmctr := kmctrQuery()
S390X.HasAESCTR = kmctr.Has(aes...)
}
if facilities.Has(msa8) {
kma := kmaQuery()
S390X.HasAESGCM = kma.Has(aes...)
}
// compute message digest
kimd := kimdQuery() // intermediate (no padding)
klmd := klmdQuery() // last (padding)
S390X.HasSHA1 = kimd.Has(sha1) && klmd.Has(sha1)
S390X.HasSHA256 = kimd.Has(sha256) && klmd.Has(sha256)
S390X.HasSHA512 = kimd.Has(sha512) && klmd.Has(sha512)
S390X.HasGHASH = kimd.Has(ghash) // KLMD-GHASH does not exist
}
}

55
vendor/github.com/templexxx/cpu/cpu_s390x.s generated vendored Normal file

@@ -0,0 +1,55 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// func stfle() facilityList
TEXT ·stfle(SB), NOSPLIT|NOFRAME, $0-32
MOVD $ret+0(FP), R1
MOVD $3, R0 // last doubleword index to store
XC $32, (R1), (R1) // clear 4 doublewords (32 bytes)
WORD $0xb2b01000 // store facility list extended (STFLE)
RET
// func kmQuery() queryResult
TEXT ·kmQuery(SB), NOSPLIT|NOFRAME, $0-16
MOVD $0, R0 // set function code to 0 (KM-Query)
MOVD $ret+0(FP), R1 // address of 16-byte return value
WORD $0xB92E0024 // cipher message (KM)
RET
// func kmcQuery() queryResult
TEXT ·kmcQuery(SB), NOSPLIT|NOFRAME, $0-16
MOVD $0, R0 // set function code to 0 (KMC-Query)
MOVD $ret+0(FP), R1 // address of 16-byte return value
WORD $0xB92F0024 // cipher message with chaining (KMC)
RET
// func kmctrQuery() queryResult
TEXT ·kmctrQuery(SB), NOSPLIT|NOFRAME, $0-16
MOVD $0, R0 // set function code to 0 (KMCTR-Query)
MOVD $ret+0(FP), R1 // address of 16-byte return value
WORD $0xB92D4024 // cipher message with counter (KMCTR)
RET
// func kmaQuery() queryResult
TEXT ·kmaQuery(SB), NOSPLIT|NOFRAME, $0-16
MOVD $0, R0 // set function code to 0 (KMA-Query)
MOVD $ret+0(FP), R1 // address of 16-byte return value
WORD $0xb9296024 // cipher message with authentication (KMA)
RET
// func kimdQuery() queryResult
TEXT ·kimdQuery(SB), NOSPLIT|NOFRAME, $0-16
MOVD $0, R0 // set function code to 0 (KIMD-Query)
MOVD $ret+0(FP), R1 // address of 16-byte return value
WORD $0xB93E0024 // compute intermediate message digest (KIMD)
RET
// func klmdQuery() queryResult
TEXT ·klmdQuery(SB), NOSPLIT|NOFRAME, $0-16
MOVD $0, R0 // set function code to 0 (KLMD-Query)
MOVD $ret+0(FP), R1 // address of 16-byte return value
WORD $0xB93F0024 // compute last message digest (KLMD)
RET

7
vendor/github.com/templexxx/cpu/cpu_wasm.go generated vendored Normal file

@@ -0,0 +1,7 @@
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cpu
const CacheLineSize = 64

425
vendor/github.com/templexxx/cpu/cpu_x86.go generated vendored Normal file

@@ -0,0 +1,425 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build 386 amd64 amd64p32
package cpu
import (
"fmt"
"strings"
)
const CacheLineSize = 64
// cpuid is implemented in cpu_x86.s.
func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
// xgetbv with ecx = 0 is implemented in cpu_x86.s.
func xgetbv() (eax, edx uint32)
const (
// edx bits
cpuid_SSE2 = 1 << 26
// ecx bits
cpuid_SSE3 = 1 << 0
cpuid_PCLMULQDQ = 1 << 1
cpuid_SSSE3 = 1 << 9
cpuid_FMA = 1 << 12
cpuid_SSE41 = 1 << 19
cpuid_SSE42 = 1 << 20
cpuid_POPCNT = 1 << 23
cpuid_AES = 1 << 25
cpuid_OSXSAVE = 1 << 27
cpuid_AVX = 1 << 28
// ebx bits
cpuid_BMI1 = 1 << 3
cpuid_AVX2 = 1 << 5
cpuid_BMI2 = 1 << 8
cpuid_ERMS = 1 << 9
cpuid_ADX = 1 << 19
cpuid_AVX512F = 1 << 16
cpuid_AVX512DQ = 1 << 17
cpuid_AVX512BW = 1 << 30
cpuid_AVX512VL = 1 << 31
// edx bits
cpuid_Invariant_TSC = 1 << 8
)
func doinit() {
options = []option{
{"adx", &X86.HasADX},
{"aes", &X86.HasAES},
{"avx", &X86.HasAVX},
{"avx2", &X86.HasAVX2},
{"bmi1", &X86.HasBMI1},
{"bmi2", &X86.HasBMI2},
{"erms", &X86.HasERMS},
{"fma", &X86.HasFMA},
{"pclmulqdq", &X86.HasPCLMULQDQ},
{"popcnt", &X86.HasPOPCNT},
{"sse3", &X86.HasSSE3},
{"sse41", &X86.HasSSE41},
{"sse42", &X86.HasSSE42},
{"ssse3", &X86.HasSSSE3},
{"avx512f", &X86.HasAVX512F},
{"avx512dq", &X86.HasAVX512DQ},
{"avx512bw", &X86.HasAVX512BW},
{"avx512vl", &X86.HasAVX512VL},
{"invariant_tsc", &X86.HasInvariantTSC},
// sse2 set as last element so it can easily be removed again. See code below.
{"sse2", &X86.HasSSE2},
}
// Remove sse2 from options on amd64(p32) because SSE2 is a mandatory feature for these GOARCHs.
if GOARCH == "amd64" || GOARCH == "amd64p32" {
options = options[:len(options)-1]
}
maxID, _, _, _ := cpuid(0, 0)
if maxID < 1 {
return
}
_, _, ecx1, edx1 := cpuid(1, 0)
X86.HasSSE2 = isSet(edx1, cpuid_SSE2)
X86.HasSSE3 = isSet(ecx1, cpuid_SSE3)
X86.HasPCLMULQDQ = isSet(ecx1, cpuid_PCLMULQDQ)
X86.HasSSSE3 = isSet(ecx1, cpuid_SSSE3)
X86.HasFMA = isSet(ecx1, cpuid_FMA)
X86.HasSSE41 = isSet(ecx1, cpuid_SSE41)
X86.HasSSE42 = isSet(ecx1, cpuid_SSE42)
X86.HasPOPCNT = isSet(ecx1, cpuid_POPCNT)
X86.HasAES = isSet(ecx1, cpuid_AES)
X86.HasOSXSAVE = isSet(ecx1, cpuid_OSXSAVE)
osSupportsAVX := false
osSupportsAVX512 := false
// For XGETBV, OSXSAVE bit is required and sufficient.
if X86.HasOSXSAVE {
eax, _ := xgetbv()
// Check if XMM and YMM registers have OS support.
osSupportsAVX = isSet(eax, 1<<1) && isSet(eax, 1<<2)
// Check if opmask, ZMM and XMM/YMM registers have OS support
// (XCR0 bits 5-7 and bits 1-2 must all be set).
osSupportsAVX512 = (eax>>5)&7 == 7 && (eax>>1)&3 == 3
}
X86.HasAVX = isSet(ecx1, cpuid_AVX) && osSupportsAVX
if maxID < 7 {
return
}
_, ebx7, _, _ := cpuid(7, 0)
X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
X86.HasAVX512F = isSet(ebx7, cpuid_AVX512F) && osSupportsAVX512
X86.HasAVX512DQ = isSet(ebx7, cpuid_AVX512DQ) && osSupportsAVX512
X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW) && osSupportsAVX512
X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL) && osSupportsAVX512
X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
X86.HasERMS = isSet(ebx7, cpuid_ERMS)
X86.HasADX = isSet(ebx7, cpuid_ADX)
X86.Cache = getCacheSize()
X86.HasInvariantTSC = hasInvariantTSC()
X86.Family, X86.Model = getFamilyModel()
X86.Signature = makeSignature(X86.Family, X86.Model)
X86.Name = getName()
X86.TSCFrequency = getNativeTSCFrequency(X86.Name, X86.Signature)
}
func isSet(hwc uint32, value uint32) bool {
return hwc&value != 0
}
func hasInvariantTSC() bool {
if maxExtendedFunction() < 0x80000007 {
return false
}
_, _, _, edx := cpuid(0x80000007, 0)
return isSet(edx, cpuid_Invariant_TSC)
}
func getName() string {
if maxExtendedFunction() >= 0x80000004 {
v := make([]uint32, 0, 48)
for i := uint32(0); i < 3; i++ {
a, b, c, d := cpuid(0x80000002+i, 0)
v = append(v, a, b, c, d)
}
return strings.Trim(string(valAsString(v...)), " ")
}
return "unknown"
}
// getNativeTSCFrequency gets the TSC frequency from CPUID.
// It only supports Intel CPUs (Skylake or later microarchitectures); the key information
// comes from the Intel manual and from kernel code
// (especially this commit: https://github.com/torvalds/linux/commit/604dc9170f2435d27da5039a3efd757dceadc684).
func getNativeTSCFrequency(name, sign string) uint64 {
if vendorID() != Intel {
return 0
}
if maxFunctionID() < 0x15 {
return 0
}
// ApolloLake, GeminiLake, CannonLake (and presumably all new chipsets
// from this point) report the crystal frequency directly via CPUID.0x15.
// That's definitive data that we can rely upon.
eax, ebx, ecx, _ := cpuid(0x15, 0)
// If ebx is 0, the TSC/“core crystal clock” ratio is not enumerated.
// We won't provide TSC frequency detection in this situation.
if eax == 0 || ebx == 0 {
return 0
}
// Skylake, Kabylake and all variants of those two chipsets report a
// crystal frequency of zero.
if ecx == 0 { // Crystal clock frequency is not enumerated.
ecx = getCrystalClockFrequency(sign)
}
// TSC frequency = “core crystal clock frequency” * EBX/EAX.
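// For example (hypothetical values), a 24 MHz crystal with EBX = 250 and EAX = 2
// gives 24,000,000 * (250 / 2) = 3.0 GHz.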
return uint64(ecx) * (uint64(ebx) / uint64(eax))
}
// Copied from: CPUID Signature values of DisplayFamily and DisplayModel,
// in Intel® 64 and IA-32 Architectures Software Developers Manual
// Volume 4: Model-Specific Registers
// & https://github.com/torvalds/linux/blob/master/arch/x86/include/asm/intel-family.h
const (
IntelFam6SkylakeL = "06_4EH"
IntelFam6Skylake = "06_5EH"
IntelFam6SkylakeX = "06_55H"
IntelFam6KabylakeL = "06_8EH"
IntelFam6Kabylake = "06_9EH"
)
// getCrystalClockFrequency gets the nominal core crystal clock frequency
// for Intel processors on which the CPUID.15H.EBX[31:0]/CPUID.15H.EAX[31:0] ratio is enumerated
// but CPUID.15H.ECX is not, using a hardcoded table of known frequencies.
//
// The crystal clock frequencies in Intel's hardcoded tables are not perfectly accurate in some cases;
// e.g. Skylake server CPUs may be affected (all SKX subject the crystal to an EMI reduction circuit that
// reduces its actual frequency by approximately 0.25%):
// see https://lore.kernel.org/lkml/ff6dcea166e8ff8f2f6a03c17beab2cb436aa779.1513920414.git.len.brown@intel.com/
// for more details.
// Based on that report, I apply a coefficient (0.9975) for IntelFam6SkylakeX.
//
// Unlike the kernel approach (mentioned in https://github.com/torvalds/linux/commit/604dc9170f2435d27da5039a3efd757dceadc684),
// I prefer the Intel hardcoded tables,
// because after some testing (comparing against the wall clock, see https://github.com/templexxx/tsc/tsc_test.go for more details),
// I found the hardcoded tables to be more accurate.
func getCrystalClockFrequency(sign string) uint32 {
if maxFunctionID() < 0x16 {
return 0
}
switch sign {
case IntelFam6SkylakeL:
return 24 * 1000 * 1000
case IntelFam6Skylake:
return 24 * 1000 * 1000
case IntelFam6SkylakeX:
return 25 * 1000 * 1000 * 0.9975
case IntelFam6KabylakeL:
return 24 * 1000 * 1000
case IntelFam6Kabylake:
return 24 * 1000 * 1000
}
return 0
}
func getFamilyModel() (uint32, uint32) {
if maxFunctionID() < 0x1 {
return 0, 0
}
eax, _, _, _ := cpuid(1, 0)
family := (eax >> 8) & 0xf
displayFamily := family
if family == 0xf {
displayFamily = ((eax >> 20) & 0xff) + family
}
model := (eax >> 4) & 0xf
displayModel := model
if family == 0x6 || family == 0xf {
displayModel = ((eax >> 12) & 0xf0) + model
}
return displayFamily, displayModel
}
// signature format: XX_XXH
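// For example, family 0x6 and model 0x9E produce "06_9EH" (IntelFam6Kabylake above).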
func makeSignature(family, model uint32) string {
signature := strings.ToUpper(fmt.Sprintf("0%x_0%xH", family, model))
ss := strings.Split(signature, "_")
for i, s := range ss {
// We may have inserted one `0` too many; drop it.
if len(s) > 2 {
s = s[1:]
ss[i] = s
}
}
return strings.Join(ss, "_")
}
// getCacheSize is from
// https://github.com/klauspost/cpuid/blob/5a626f7029c910cc8329dae5405ee4f65034bce5/cpuid.go#L723
func getCacheSize() Cache {
c := Cache{
L1I: -1,
L1D: -1,
L2: -1,
L3: -1,
}
vendor := vendorID()
switch vendor {
case Intel:
if maxFunctionID() < 4 {
return c
}
for i := uint32(0); ; i++ {
eax, ebx, ecx, _ := cpuid(4, i)
cacheType := eax & 15
if cacheType == 0 {
break
}
cacheLevel := (eax >> 5) & 7
coherency := int(ebx&0xfff) + 1
partitions := int((ebx>>12)&0x3ff) + 1
associativity := int((ebx>>22)&0x3ff) + 1
sets := int(ecx) + 1
size := associativity * partitions * coherency * sets
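// For example (hypothetical values), an 8-way cache with 1 partition,
// a 64-byte line size (coherency) and 64 sets is 8*1*64*64 = 32768 bytes (32 KB).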
switch cacheLevel {
case 1:
if cacheType == 1 {
// 1 = Data Cache
c.L1D = size
} else if cacheType == 2 {
// 2 = Instruction Cache
c.L1I = size
} else {
// Unified cache (cacheType 3): record the size in whichever L1 field is still unknown.
if c.L1D < 0 {
c.L1D = size
}
if c.L1I < 0 {
c.L1I = size
}
}
case 2:
c.L2 = size
case 3:
c.L3 = size
}
}
case AMD, Hygon:
// Untested.
if maxExtendedFunction() < 0x80000005 {
return c
}
_, _, ecx, edx := cpuid(0x80000005, 0)
c.L1D = int(((ecx >> 24) & 0xFF) * 1024)
c.L1I = int(((edx >> 24) & 0xFF) * 1024)
if maxExtendedFunction() < 0x80000006 {
return c
}
_, _, ecx, _ = cpuid(0x80000006, 0)
c.L2 = int(((ecx >> 16) & 0xFFFF) * 1024)
}
return c
}
func maxFunctionID() uint32 {
a, _, _, _ := cpuid(0, 0)
return a
}
func maxExtendedFunction() uint32 {
eax, _, _, _ := cpuid(0x80000000, 0)
return eax
}
const (
Other = iota
Intel
AMD
VIA
Transmeta
NSC
KVM // Kernel-based Virtual Machine
MSVM // Microsoft Hyper-V or Windows Virtual PC
VMware
XenHVM
Bhyve
Hygon
)
// Excerpt from http://en.wikipedia.org/wiki/CPUID#EAX.3D0:_Get_vendor_ID
var vendorMapping = map[string]int{
"AMDisbetter!": AMD,
"AuthenticAMD": AMD,
"CentaurHauls": VIA,
"GenuineIntel": Intel,
"TransmetaCPU": Transmeta,
"GenuineTMx86": Transmeta,
"Geode by NSC": NSC,
"VIA VIA VIA ": VIA,
"KVMKVMKVMKVM": KVM,
"Microsoft Hv": MSVM,
"VMwareVMware": VMware,
"XenVMMXenVMM": XenHVM,
"bhyve bhyve ": Bhyve,
"HygonGenuine": Hygon,
}
func vendorID() int {
_, b, c, d := cpuid(0, 0)
v := valAsString(b, d, c)
vend, ok := vendorMapping[string(v)]
if !ok {
return Other
}
return vend
}
func valAsString(values ...uint32) []byte {
r := make([]byte, 4*len(values))
for i, v := range values {
dst := r[i*4:]
dst[0] = byte(v & 0xff)
dst[1] = byte((v >> 8) & 0xff)
dst[2] = byte((v >> 16) & 0xff)
dst[3] = byte((v >> 24) & 0xff)
switch {
case dst[0] == 0:
return r[:i*4]
case dst[1] == 0:
return r[:i*4+1]
case dst[2] == 0:
return r[:i*4+2]
case dst[3] == 0:
return r[:i*4+3]
}
}
return r
}

32
vendor/github.com/templexxx/cpu/cpu_x86.s generated vendored Normal file

@@ -0,0 +1,32 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build 386 amd64 amd64p32
#include "textflag.h"
// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
TEXT ·cpuid(SB), NOSPLIT, $0-24
MOVL eaxArg+0(FP), AX
MOVL ecxArg+4(FP), CX
CPUID
MOVL AX, eax+8(FP)
MOVL BX, ebx+12(FP)
MOVL CX, ecx+16(FP)
MOVL DX, edx+20(FP)
RET
// func xgetbv() (eax, edx uint32)
TEXT ·xgetbv(SB),NOSPLIT,$0-8
#ifdef GOOS_nacl
// nacl does not support XGETBV.
MOVL $0, eax+0(FP)
MOVL $0, edx+4(FP)
#else
MOVL $0, CX
XGETBV
MOVL AX, eax+0(FP)
MOVL DX, edx+4(FP)
#endif
RET

1
vendor/github.com/templexxx/xorsimd/.gitattributes generated vendored Normal file

@@ -0,0 +1 @@
*.s linguist-language=go:x

13
vendor/github.com/templexxx/xorsimd/.gitignore generated vendored Normal file

@@ -0,0 +1,13 @@
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib
# Test binary, build with `go test -c`
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
*.out
.idea

21
vendor/github.com/templexxx/xorsimd/LICENSE generated vendored Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 Temple3x (temple3x@gmail.com)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

46
vendor/github.com/templexxx/xorsimd/README.md generated vendored Normal file

@@ -0,0 +1,46 @@
# XOR SIMD
[![GoDoc][1]][2] [![MIT licensed][3]][4] [![Build Status][5]][6] [![Go Report Card][7]][8] [![Sourcegraph][9]][10]
[1]: https://godoc.org/github.com/templexxx/xorsimd?status.svg
[2]: https://godoc.org/github.com/templexxx/xorsimd
[3]: https://img.shields.io/badge/license-MIT-blue.svg
[4]: LICENSE
[5]: https://github.com/templexxx/xorsimd/workflows/unit-test/badge.svg
[6]: https://github.com/templexxx/xorsimd
[7]: https://goreportcard.com/badge/github.com/templexxx/xorsimd
[8]: https://goreportcard.com/report/github.com/templexxx/xorsimd
[9]: https://sourcegraph.com/github.com/templexxx/xorsimd/-/badge.svg
[10]: https://sourcegraph.com/github.com/templexxx/xorsimd?badge
## Introduction:
>- XOR code engine in pure Go.
>
>- [High Performance](https://github.com/templexxx/xorsimd#performance):
More than 270 GB/s per physical core.
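## Example
A minimal usage sketch (the import path matches this vendored package, `github.com/templexxx/xorsimd`; `Bytes` and `Encode` are defined in `xor.go`):
```go
package main

import (
	"fmt"

	"github.com/templexxx/xorsimd"
)

func main() {
	a := []byte{0x0f, 0xf0, 0xaa}
	b := []byte{0xff, 0xff, 0x55}
	dst := make([]byte, len(a))

	// Bytes XORs a and b into dst and returns the number of bytes written,
	// which is the minimum of len(dst), len(a) and len(b).
	n := xorsimd.Bytes(dst, a, b)
	fmt.Println(n, dst) // 3 [240 15 255]
}
```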
## Performance
Performance depends mainly on:
>- CPU instruction extension.
>
>- Number of source row vectors.
**Platform:**
*AWS c5d.xlarge (Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz)*
**All tests run on a single core.**
`I/O = (src_num + 1) * vector_size / cost`
| Src Num | Vector size | AVX512 I/O (MB/S) | AVX2 I/O (MB/S) |SSE2 I/O (MB/S) |
|-------|-------------|-------------|---------------|---------------|
|5|4KB| 270403.73 | 142825.25 | 74443.91 |
|5|1MB| 26948.34 | 26887.37 | 26950.65 |
|5|8MB| 17881.32 | 17212.56 | 16402.97 |
|10|4KB| 190445.30 | 102953.59 | 53244.04 |
|10|1MB| 26424.44 | 26618.65 | 26094.39 |
|10|8MB| 15471.31 | 14866.72 | 13565.80 |

5
vendor/github.com/templexxx/xorsimd/go.mod generated vendored Normal file

@@ -0,0 +1,5 @@
module github.com/templexxx/xorsimd
require github.com/templexxx/cpu v0.0.1
go 1.13

2
vendor/github.com/templexxx/xorsimd/go.sum generated vendored Normal file

@@ -0,0 +1,2 @@
github.com/templexxx/cpu v0.0.1 h1:hY4WdLOgKdc8y13EYklu9OUTXik80BkxHoWvTO6MQQY=
github.com/templexxx/cpu v0.0.1/go.mod h1:w7Tb+7qgcAlIyX4NhLuDKt78AHA5SzPmq0Wj6HiEnnk=

89
vendor/github.com/templexxx/xorsimd/xor.go generated vendored Normal file

@@ -0,0 +1,89 @@
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
package xorsimd
import "github.com/templexxx/cpu"
// EnableAVX512 controls whether the AVX-512 code path may be used; AVX-512 may slow down the CPU clock (or may not).
// TODO: needs more research:
// https://lemire.me/blog/2018/04/19/by-how-much-does-avx-512-slow-down-your-cpu-a-first-experiment/
var EnableAVX512 = true
// cpuFeature indicates which instruction set will be used.
var cpuFeature = getCPUFeature()
const (
avx512 = iota
avx2
sse2
generic
)
// TODO: Add ARM feature...
func getCPUFeature() int {
if hasAVX512() && EnableAVX512 {
return avx512
} else if cpu.X86.HasAVX2 {
return avx2
} else {
return sse2 // amd64 is guaranteed to have SSE2
}
}
func hasAVX512() (ok bool) {
return cpu.X86.HasAVX512VL &&
cpu.X86.HasAVX512BW &&
cpu.X86.HasAVX512F &&
cpu.X86.HasAVX512DQ
}
// Encode XORs the source slices into the
// destination slice. The sources and destination may overlap.
// Encode returns the number of bytes encoded, which will be the minimum of
// len(src[i]) and len(dst).
func Encode(dst []byte, src [][]byte) (n int) {
n = checkLen(dst, src)
if n == 0 {
return
}
dst = dst[:n]
for i := range src {
src[i] = src[i][:n]
}
if len(src) == 1 {
copy(dst, src[0])
return
}
encode(dst, src)
return
}
func checkLen(dst []byte, src [][]byte) int {
n := len(dst)
for i := range src {
if len(src[i]) < n {
n = len(src[i])
}
}
if n <= 0 {
return 0
}
return n
}
// Bytes XORs the bytes in a and b into a
// destination slice. The source and destination may overlap.
//
// Bytes returns the number of bytes encoded, which will be the minimum of
// len(dst), len(a), len(b).
func Bytes(dst, a, b []byte) int {
return Encode(dst, [][]byte{a, b})
}

95
vendor/github.com/templexxx/xorsimd/xor_amd64.go generated vendored Normal file

@@ -0,0 +1,95 @@
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
package xorsimd
func encode(dst []byte, src [][]byte) {
switch cpuFeature {
case avx512:
encodeAVX512(dst, src)
case avx2:
encodeAVX2(dst, src)
default:
encodeSSE2(dst, src)
}
return
}
// Bytes8 XORs 8 bytes.
// The slice arguments dst, a and b are assumed to have length at least 8;
// if not, Bytes8 will panic.
func Bytes8(dst, a, b []byte) {
bytes8(&dst[0], &a[0], &b[0])
}
// Bytes16 XORs 16 packed bytes.
// The slice arguments dst, a and b are assumed to have length at least 16;
// if not, Bytes16 will panic.
func Bytes16(dst, a, b []byte) {
bytes16(&dst[0], &a[0], &b[0])
}
// Bytes8Align XORs 8 bytes.
// The slice arguments dst, a and b are assumed to have length at least 8;
// if not, Bytes8Align will panic.
func Bytes8Align(dst, a, b []byte) {
bytes8(&dst[0], &a[0], &b[0])
}
// Bytes16Align XORs 16 packed bytes.
// The slice arguments dst, a and b are assumed to have length at least 16;
// if not, Bytes16Align will panic.
func Bytes16Align(dst, a, b []byte) {
bytes16(&dst[0], &a[0], &b[0])
}
// BytesA XORs the len(a) bytes in a and b into a
// destination slice.
// The destination must have enough space.
//
// It is intended for encoding small byte slices (fewer than a few dozen bytes)
// that may not be aligned to 8 or 16 bytes.
// If the length is large, it is better to use 'func Bytes(dst, a, b []byte)'
// instead to get better performance.
func BytesA(dst, a, b []byte) {
bytesN(&dst[0], &a[0], &b[0], len(a))
}
// BytesB XORs the len(b) bytes in a and b into a
// destination slice.
// The destination must have enough space.
//
// It is intended for encoding small byte slices (fewer than a few dozen bytes)
// that may not be aligned to 8 or 16 bytes.
// If the length is large, it is better to use 'func Bytes(dst, a, b []byte)'
// instead to get better performance.
func BytesB(dst, a, b []byte) {
bytesN(&dst[0], &a[0], &b[0], len(b))
}
//go:noescape
func encodeAVX512(dst []byte, src [][]byte)
//go:noescape
func encodeAVX2(dst []byte, src [][]byte)
//go:noescape
func encodeSSE2(dst []byte, src [][]byte)
//go:noescape
func bytesN(dst, a, b *byte, n int)
//go:noescape
func bytes8(dst, a, b *byte)
//go:noescape
func bytes16(dst, a, b *byte)

205
vendor/github.com/templexxx/xorsimd/xor_generic.go generated vendored Normal file

@@ -0,0 +1,205 @@
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
//
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64
package xorsimd
import (
"runtime"
"unsafe"
)
const wordSize = int(unsafe.Sizeof(uintptr(0)))
const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
func encode(dst []byte, src [][]byte) {
if supportsUnaligned {
fastEncode(dst, src, len(dst))
} else {
// TODO(hanwen): if (dst, a, b) have common alignment
// we could still try fastEncode. It is not clear
// how often this happens, and it's only worth it if
// the block encryption itself is hardware
// accelerated.
safeEncode(dst, src, len(dst))
}
}
// fastEncode XORs in bulk. It only works on architectures that
// support unaligned reads/writes.
func fastEncode(dst []byte, src [][]byte, n int) {
w := n / wordSize
if w > 0 {
wordBytes := w * wordSize
wordAlignSrc := make([][]byte, len(src))
for i := range src {
wordAlignSrc[i] = src[i][:wordBytes]
}
fastEnc(dst[:wordBytes], wordAlignSrc)
}
for i := n - n%wordSize; i < n; i++ {
s := src[0][i]
for j := 1; j < len(src); j++ {
s ^= src[j][i]
}
dst[i] = s
}
}
func fastEnc(dst []byte, src [][]byte) {
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
sw := make([][]uintptr, len(src))
for i := range src {
sw[i] = *(*[]uintptr)(unsafe.Pointer(&src[i]))
}
n := len(dst) / wordSize
for i := 0; i < n; i++ {
s := sw[0][i]
for j := 1; j < len(sw); j++ {
s ^= sw[j][i]
}
dw[i] = s
}
}
func safeEncode(dst []byte, src [][]byte, n int) {
for i := 0; i < n; i++ {
s := src[0][i]
for j := 1; j < len(src); j++ {
s ^= src[j][i]
}
dst[i] = s
}
}
// Bytes8 XORs 8 bytes, one word at a time where possible.
// The slice arguments dst, a and b are assumed to have length at least 8;
// if not, Bytes8 will panic.
func Bytes8(dst, a, b []byte) {
bytesWords(dst[:8], a[:8], b[:8])
}
// Bytes16 XORs 16 packed bytes, one word at a time where possible.
// The slice arguments dst, a and b are assumed to have length at least 16;
// if not, Bytes16 will panic.
func Bytes16(dst, a, b []byte) {
bytesWords(dst[:16], a[:16], b[:16])
}
// bytesWords XORs multiples of 4 or 8 bytes (depending on architecture).
// The slice arguments a and b are assumed to be of equal length.
func bytesWords(dst, a, b []byte) {
if supportsUnaligned {
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
aw := *(*[]uintptr)(unsafe.Pointer(&a))
bw := *(*[]uintptr)(unsafe.Pointer(&b))
n := len(b) / wordSize
for i := 0; i < n; i++ {
dw[i] = aw[i] ^ bw[i]
}
} else {
n := len(b)
for i := 0; i < n; i++ {
dst[i] = a[i] ^ b[i]
}
}
}
// Bytes8Align XORs 8 bytes.
// The slice arguments dst, a and b are assumed to have length at least 8;
// if not, Bytes8Align will panic.
//
// All the byte slices must be aligned to wordsize.
func Bytes8Align(dst, a, b []byte) {
bytesWordsAlign(dst[:8], a[:8], b[:8])
}
// Bytes16Align XORs 16 packed bytes.
// The slice arguments dst, a and b are assumed to have length at least 16;
// if not, Bytes16Align will panic.
//
// All the byte slices must be aligned to wordsize.
func Bytes16Align(dst, a, b []byte) {
bytesWordsAlign(dst[:16], a[:16], b[:16])
}
// bytesWordsAlign XORs multiples of 4 or 8 bytes (depending on architecture).
// The slice arguments a and b are assumed to be of equal length.
//
// All the byte slices must be aligned to wordsize.
func bytesWordsAlign(dst, a, b []byte) {
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
aw := *(*[]uintptr)(unsafe.Pointer(&a))
bw := *(*[]uintptr)(unsafe.Pointer(&b))
n := len(b) / wordSize
for i := 0; i < n; i++ {
dw[i] = aw[i] ^ bw[i]
}
}
// BytesA XORs the len(a) bytes in a and b into a
// destination slice.
// The destination must have enough space.
//
// It is intended for encoding small byte slices (fewer than a few dozen bytes)
// that may not be aligned to 8 or 16 bytes.
// If the length is large, it is better to use 'func Bytes(dst, a, b []byte)'
// instead to get better performance.
func BytesA(dst, a, b []byte) {
n := len(a)
bytesN(dst[:n], a[:n], b[:n], n)
}
// BytesB XORs the len(b) bytes in a and b into a
// destination slice.
// The destination must have enough space.
//
// It is intended for encoding small byte slices (fewer than a few dozen bytes)
// that may not be aligned to 8 or 16 bytes.
// If the length is large, it is better to use 'func Bytes(dst, a, b []byte)'
// instead to get better performance.
func BytesB(dst, a, b []byte) {
n := len(b)
bytesN(dst[:n], a[:n], b[:n], n)
}
func bytesN(dst, a, b []byte, n int) {
switch {
case supportsUnaligned:
w := n / wordSize
if w > 0 {
dw := *(*[]uintptr)(unsafe.Pointer(&dst))
aw := *(*[]uintptr)(unsafe.Pointer(&a))
bw := *(*[]uintptr)(unsafe.Pointer(&b))
for i := 0; i < w; i++ {
dw[i] = aw[i] ^ bw[i]
}
}
for i := (n - n%wordSize); i < n; i++ {
dst[i] = a[i] ^ b[i]
}
default:
for i := 0; i < n; i++ {
dst[i] = a[i] ^ b[i]
}
}
}

124
vendor/github.com/templexxx/xorsimd/xoravx2_amd64.s generated vendored Normal file

@@ -0,0 +1,124 @@
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
#include "textflag.h"
#define dst BX // parity's address
#define d2src SI // two-dimension src_slice's address
#define csrc CX // cnt of src
#define len DX // len of vect
#define pos R8 // job position in vect
#define csrc_tmp R9
#define d2src_off R10
#define src_tmp R11
#define not_aligned_len R12
#define src_val0 R13
#define src_val1 R14
// func encodeAVX2(dst []byte, src [][]byte)
TEXT ·encodeAVX2(SB), NOSPLIT, $0
MOVQ d+0(FP), dst
MOVQ s+24(FP), d2src
MOVQ c+32(FP), csrc
MOVQ l+8(FP), len
TESTQ $127, len
JNZ not_aligned
aligned:
MOVQ $0, pos
loop128b:
MOVQ csrc, csrc_tmp // store src_cnt -> csrc_tmp
SUBQ $2, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp
VMOVDQU (src_tmp)(pos*1), Y0
VMOVDQU 32(src_tmp)(pos*1), Y1
VMOVDQU 64(src_tmp)(pos*1), Y2
VMOVDQU 96(src_tmp)(pos*1), Y3
next_vect:
ADDQ $24, d2src_off // len(slice) = 24
MOVQ (d2src)(d2src_off*1), src_tmp // next data_vect
VMOVDQU (src_tmp)(pos*1), Y4
VMOVDQU 32(src_tmp)(pos*1), Y5
VMOVDQU 64(src_tmp)(pos*1), Y6
VMOVDQU 96(src_tmp)(pos*1), Y7
VPXOR Y4, Y0, Y0
VPXOR Y5, Y1, Y1
VPXOR Y6, Y2, Y2
VPXOR Y7, Y3, Y3
SUBQ $1, csrc_tmp
JGE next_vect
VMOVDQU Y0, (dst)(pos*1)
VMOVDQU Y1, 32(dst)(pos*1)
VMOVDQU Y2, 64(dst)(pos*1)
VMOVDQU Y3, 96(dst)(pos*1)
ADDQ $128, pos
CMPQ len, pos
JNE loop128b
VZEROUPPER
RET
loop_1b:
MOVQ csrc, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
SUBQ $2, csrc_tmp
MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src
next_vect_1b:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVB -1(src_tmp)(len*1), src_val1
XORB src_val1, src_val0
SUBQ $1, csrc_tmp
JGE next_vect_1b
MOVB src_val0, -1(dst)(len*1)
SUBQ $1, len
TESTQ $7, len
JNZ loop_1b
CMPQ len, $0
JE ret
TESTQ $127, len
JZ aligned
not_aligned:
TESTQ $7, len
JNE loop_1b
MOVQ len, not_aligned_len
ANDQ $127, not_aligned_len
loop_8b:
MOVQ csrc, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
SUBQ $2, csrc_tmp
MOVQ -8(src_tmp)(len*1), src_val0
next_vect_8b:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVQ -8(src_tmp)(len*1), src_val1
XORQ src_val1, src_val0
SUBQ $1, csrc_tmp
JGE next_vect_8b
MOVQ src_val0, -8(dst)(len*1)
SUBQ $8, len
SUBQ $8, not_aligned_len
JG loop_8b
CMPQ len, $128
JGE aligned
RET
ret:
RET

124
vendor/github.com/templexxx/xorsimd/xoravx512_amd64.s generated vendored Normal file

@@ -0,0 +1,124 @@
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
#include "textflag.h"
#define dst BX // parity's address
#define d2src SI // two-dimension src_slice's address
#define csrc CX // cnt of src
#define len DX // len of vect
#define pos R8 // job position in vect
#define csrc_tmp R9
#define d2src_off R10
#define src_tmp R11
#define not_aligned_len R12
#define src_val0 R13
#define src_val1 R14
// func encodeAVX512(dst []byte, src [][]byte)
TEXT ·encodeAVX512(SB), NOSPLIT, $0
MOVQ d+0(FP), dst
MOVQ src+24(FP), d2src
MOVQ c+32(FP), csrc
MOVQ l+8(FP), len
TESTQ $255, len
JNZ not_aligned
aligned:
MOVQ $0, pos
loop256b:
MOVQ csrc, csrc_tmp // store src_cnt -> csrc_tmp
SUBQ $2, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp // get first src_vect's addr -> src_tmp
VMOVDQU8 (src_tmp)(pos*1), Z0
VMOVDQU8 64(src_tmp)(pos*1), Z1
VMOVDQU8 128(src_tmp)(pos*1), Z2
VMOVDQU8 192(src_tmp)(pos*1), Z3
next_vect:
ADDQ $24, d2src_off // len(slice) = 24
MOVQ (d2src)(d2src_off*1), src_tmp // next data_vect
VMOVDQU8 (src_tmp)(pos*1), Z4
VMOVDQU8 64(src_tmp)(pos*1), Z5
VMOVDQU8 128(src_tmp)(pos*1), Z6
VMOVDQU8 192(src_tmp)(pos*1), Z7
VPXORQ Z4, Z0, Z0
VPXORQ Z5, Z1, Z1
VPXORQ Z6, Z2, Z2
VPXORQ Z7, Z3, Z3
SUBQ $1, csrc_tmp
JGE next_vect
VMOVDQU8 Z0, (dst)(pos*1)
VMOVDQU8 Z1, 64(dst)(pos*1)
VMOVDQU8 Z2, 128(dst)(pos*1)
VMOVDQU8 Z3, 192(dst)(pos*1)
ADDQ $256, pos
CMPQ len, pos
JNE loop256b
VZEROUPPER
RET
loop_1b:
MOVQ csrc, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
SUBQ $2, csrc_tmp
MOVB -1(src_tmp)(len*1), src_val0 // encode from the end of src
next_vect_1b:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVB -1(src_tmp)(len*1), src_val1
XORB src_val1, src_val0
SUBQ $1, csrc_tmp
JGE next_vect_1b
MOVB src_val0, -1(dst)(len*1)
SUBQ $1, len
TESTQ $7, len
JNZ loop_1b
CMPQ len, $0
JE ret
TESTQ $255, len
JZ aligned
not_aligned:
TESTQ $7, len
JNE loop_1b
MOVQ len, not_aligned_len
ANDQ $255, not_aligned_len
loop_8b:
MOVQ csrc, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
SUBQ $2, csrc_tmp
MOVQ -8(src_tmp)(len*1), src_val0
next_vect_8b:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVQ -8(src_tmp)(len*1), src_val1
XORQ src_val1, src_val0
SUBQ $1, csrc_tmp
JGE next_vect_8b
MOVQ src_val0, -8(dst)(len*1)
SUBQ $8, len
SUBQ $8, not_aligned_len
JG loop_8b
CMPQ len, $256
JGE aligned
RET
ret:
RET

72
vendor/github.com/templexxx/xorsimd/xorbytes_amd64.s generated vendored Normal file

@@ -0,0 +1,72 @@
#include "textflag.h"
// func bytesN(dst, a, b *byte, n int)
TEXT ·bytesN(SB), NOSPLIT, $0
MOVQ d+0(FP), BX
MOVQ a+8(FP), SI
MOVQ b+16(FP), CX
MOVQ n+24(FP), DX
TESTQ $15, DX // AND 15 & len, if not zero jump to not_aligned.
JNZ not_aligned
aligned:
MOVQ $0, AX // position in slices
loop16b:
MOVOU (SI)(AX*1), X0 // XOR 16byte forwards.
MOVOU (CX)(AX*1), X1
PXOR X1, X0
MOVOU X0, (BX)(AX*1)
ADDQ $16, AX
CMPQ DX, AX
JNE loop16b
RET
loop_1b:
SUBQ $1, DX // XOR 1byte backwards.
MOVB (SI)(DX*1), DI
MOVB (CX)(DX*1), AX
XORB AX, DI
MOVB DI, (BX)(DX*1)
TESTQ $7, DX // AND 7 & len, if not zero jump to loop_1b.
JNZ loop_1b
CMPQ DX, $0 // if len is 0, ret.
JE ret
TESTQ $15, DX // AND 15 & len, if zero jump to aligned.
JZ aligned
not_aligned:
TESTQ $7, DX // AND $7 & len, if not zero jump to loop_1b.
JNE loop_1b
SUBQ $8, DX // XOR 8bytes backwards.
MOVQ (SI)(DX*1), DI
MOVQ (CX)(DX*1), AX
XORQ AX, DI
MOVQ DI, (BX)(DX*1)
CMPQ DX, $16 // if len is greater or equal 16 here, it must be aligned.
JGE aligned
ret:
RET
// func bytes8(dst, a, b *byte)
TEXT ·bytes8(SB), NOSPLIT, $0
MOVQ d+0(FP), BX
MOVQ a+8(FP), SI
MOVQ b+16(FP), CX
MOVQ (SI), DI
MOVQ (CX), AX
XORQ AX, DI
MOVQ DI, (BX)
RET
// func bytes16(dst, a, b *byte)
TEXT ·bytes16(SB), NOSPLIT, $0
MOVQ d+0(FP), BX
MOVQ a+8(FP), SI
MOVQ b+16(FP), CX
MOVOU (SI), X0
MOVOU (CX), X1
PXOR X1, X0
MOVOU X0, (BX)
RET

123
vendor/github.com/templexxx/xorsimd/xorsse2_amd64.s generated vendored Normal file

@@ -0,0 +1,123 @@
// Copyright (c) 2019. Temple3x (temple3x@gmail.com)
//
// Use of this source code is governed by the MIT License
// that can be found in the LICENSE file.
#include "textflag.h"
#define dst BX // parity's address
#define d2src SI // two-dimension src_slice's address
#define csrc CX // cnt of src
#define len DX // len of vect
#define pos R8 // job position in vect
#define csrc_tmp R9
#define d2src_off R10
#define src_tmp R11
#define not_aligned_len R12
#define src_val0 R13
#define src_val1 R14
// func encodeSSE2(dst []byte, src [][]byte)
TEXT ·encodeSSE2(SB), NOSPLIT, $0
MOVQ d+0(FP), dst
MOVQ src+24(FP), d2src
MOVQ c+32(FP), csrc
MOVQ l+8(FP), len
TESTQ $63, len
JNZ not_aligned
aligned:
MOVQ $0, pos
loop64b:
MOVQ csrc, csrc_tmp
SUBQ $2, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVOU (src_tmp)(pos*1), X0
MOVOU 16(src_tmp)(pos*1), X1
MOVOU 32(src_tmp)(pos*1), X2
MOVOU 48(src_tmp)(pos*1), X3
next_vect:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVOU (src_tmp)(pos*1), X4
MOVOU 16(src_tmp)(pos*1), X5
MOVOU 32(src_tmp)(pos*1), X6
MOVOU 48(src_tmp)(pos*1), X7
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
SUBQ $1, csrc_tmp
JGE next_vect
MOVOU X0, (dst)(pos*1)
MOVOU X1, 16(dst)(pos*1)
MOVOU X2, 32(dst)(pos*1)
MOVOU X3, 48(dst)(pos*1)
ADDQ $64, pos
CMPQ len, pos
JNE loop64b
RET
loop_1b:
MOVQ csrc, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
SUBQ $2, csrc_tmp
MOVB -1(src_tmp)(len*1), src_val0
next_vect_1b:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVB -1(src_tmp)(len*1), src_val1
XORB src_val1, src_val0
SUBQ $1, csrc_tmp
JGE next_vect_1b
MOVB src_val0, -1(dst)(len*1)
SUBQ $1, len
TESTQ $7, len
JNZ loop_1b
CMPQ len, $0
JE ret
TESTQ $63, len
JZ aligned
not_aligned:
TESTQ $7, len
JNE loop_1b
MOVQ len, not_aligned_len
ANDQ $63, not_aligned_len
loop_8b:
MOVQ csrc, csrc_tmp
MOVQ $0, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
SUBQ $2, csrc_tmp
MOVQ -8(src_tmp)(len*1), src_val0
next_vect_8b:
ADDQ $24, d2src_off
MOVQ (d2src)(d2src_off*1), src_tmp
MOVQ -8(src_tmp)(len*1), src_val1
XORQ src_val1, src_val0
SUBQ $1, csrc_tmp
JGE next_vect_8b
MOVQ src_val0, -8(dst)(len*1)
SUBQ $8, len
SUBQ $8, not_aligned_len
JG loop_8b
CMPQ len, $64
JGE aligned
RET
ret:
RET