diff --git a/asm/caxpy.go b/asm/caxpy.go index 070de58d..529b1c42 100644 --- a/asm/caxpy.go +++ b/asm/caxpy.go @@ -6,6 +6,12 @@ package asm +func CaxpyUnitary(alpha complex64, x, y []complex64) { + for i, v := range x { + y[i] += alpha * v + } +} + func CaxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) { for i, v := range x { dst[i] = alpha*v + y[i] diff --git a/asm/daxpy.go b/asm/daxpy.go index a77b45db..97fc7bde 100644 --- a/asm/daxpy.go +++ b/asm/daxpy.go @@ -6,6 +6,12 @@ package asm +func DaxpyUnitary(alpha float64, x, y []float64) { + for i, v := range x { + y[i] += alpha * v + } +} + func DaxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) { for i, v := range x { dst[i] = alpha*v + y[i] diff --git a/asm/daxpy_amd64.go b/asm/daxpy_amd64.go index 876a1657..9364e384 100644 --- a/asm/daxpy_amd64.go +++ b/asm/daxpy_amd64.go @@ -6,6 +6,8 @@ package asm +func DaxpyUnitary(alpha float64, x, y []float64) + func DaxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) func DaxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) diff --git a/asm/daxpy_amd64.s b/asm/daxpy_amd64.s index 9c4d1a7e..d43b7f09 100644 --- a/asm/daxpy_amd64.s +++ b/asm/daxpy_amd64.s @@ -41,47 +41,6 @@ // Don't insert stack check preamble. #define NOSPLIT 4 -// func DaxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) -// This function assumes len(y) >= len(x) and len(dst) >= len(x). -// TODO(vladimir-ch): Generate DaxpyUnitary and DaxpyUnitaryTo. -TEXT ·DaxpyUnitaryTo(SB), NOSPLIT, $0 - MOVQ dst+0(FP), R10 - MOVHPD alpha+24(FP), X7 - MOVLPD alpha+24(FP), X7 - MOVQ x+32(FP), R8 - MOVQ x_len+40(FP), DI // n = len(x) - MOVQ y+56(FP), R9 - - MOVQ $0, SI // i = 0 - SUBQ $2, DI // n -= 2 - JL V1 // if n < 0 goto V1 - -U1: // n >= 0 - // dst[i] = alpha * x[i] + y[i] unrolled 2x. 
- MOVUPD 0(R8)(SI*8), X0 - MOVUPD 0(R9)(SI*8), X1 - MULPD X7, X0 - ADDPD X0, X1 - MOVUPD X1, 0(R10)(SI*8) - - ADDQ $2, SI // i += 2 - SUBQ $2, DI // n -= 2 - JGE U1 // if n >= 0 goto U1 - -V1: - ADDQ $2, DI // n += 2 - JLE E1 // if n <= 0 goto E1 - - // dst[i] = alpha * x[i] + y[i] for last iteration if n is odd. - MOVSD 0(R8)(SI*8), X0 - MOVSD 0(R9)(SI*8), X1 - MULSD X7, X0 - ADDSD X0, X1 - MOVSD X1, 0(R10)(SI*8) - -E1: - RET - // func DaxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) TEXT ·DaxpyInc(SB), NOSPLIT, $0 MOVHPD alpha+0(FP), X7 diff --git a/asm/daxpyunitary_amd64.s b/asm/daxpyunitary_amd64.s new file mode 100644 index 00000000..f05a98cb --- /dev/null +++ b/asm/daxpyunitary_amd64.s @@ -0,0 +1,83 @@ +// Generated by running +// go generate github.com/gonum/internal/asm +// DO NOT EDIT. + +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +//+build !noasm,!appengine + +// TODO(fhs): use textflag.h after we drop Go 1.3 support +// #include "textflag.h" +// Don't insert stack check preamble. +#define NOSPLIT 4 + +// func DaxpyUnitary(alpha float64, x, y []float64) +// This function assumes len(y) >= len(x). +TEXT ·DaxpyUnitary(SB), NOSPLIT, $0 + MOVHPD alpha+0(FP), X7 + MOVLPD alpha+0(FP), X7 + MOVQ x+8(FP), R8 + MOVQ x_len+16(FP), DI // n = len(x) + MOVQ y+32(FP), R9 + + MOVQ $0, SI // i = 0 + SUBQ $2, DI // n -= 2 + JL tail // if n < 0 goto tail + +loop: + MOVUPD 0(R8)(SI*8), X0 + MOVUPD 0(R9)(SI*8), X1 + MULPD X7, X0 + ADDPD X0, X1 + MOVUPD X1, 0(R9)(SI*8) + + ADDQ $2, SI // i += 2 + SUBQ $2, DI // n -= 2 + JGE loop // if n >= 0 goto loop + +tail: + ADDQ $2, DI // n += 2 + JLE end // if n <= 0 goto end + + MOVSD 0(R8)(SI*8), X0 + MOVSD 0(R9)(SI*8), X1 + MULSD X7, X0 + ADDSD X0, X1 + MOVSD X1, 0(R9)(SI*8) + +end: + RET diff --git a/asm/daxpyunitaryto_amd64.s b/asm/daxpyunitaryto_amd64.s new file mode 100644 index 00000000..438f5853 --- /dev/null +++ b/asm/daxpyunitaryto_amd64.s @@ -0,0 +1,84 @@ +// Generated by running +// go generate github.com/gonum/internal/asm +// DO NOT EDIT. 
+ +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +//+build !noasm,!appengine + +// TODO(fhs): use textflag.h after we drop Go 1.3 support +// #include "textflag.h" +// Don't insert stack check preamble. +#define NOSPLIT 4 + +// func DaxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) +// This function assumes len(y) >= len(x) and len(dst) >= len(x). +TEXT ·DaxpyUnitaryTo(SB), NOSPLIT, $0 + MOVQ dst+0(FP), R10 + MOVHPD alpha+24(FP), X7 + MOVLPD alpha+24(FP), X7 + MOVQ x+32(FP), R8 + MOVQ x_len+40(FP), DI // n = len(x) + MOVQ y+56(FP), R9 + + MOVQ $0, SI // i = 0 + SUBQ $2, DI // n -= 2 + JL tail // if n < 0 goto tail + +loop: + MOVUPD 0(R8)(SI*8), X0 + MOVUPD 0(R9)(SI*8), X1 + MULPD X7, X0 + ADDPD X0, X1 + MOVUPD X1, 0(R10)(SI*8) + + ADDQ $2, SI // i += 2 + SUBQ $2, DI // n -= 2 + JGE loop // if n >= 0 goto loop + +tail: + ADDQ $2, DI // n += 2 + JLE end // if n <= 0 goto end + + MOVSD 0(R8)(SI*8), X0 + MOVSD 0(R9)(SI*8), X1 + MULSD X7, X0 + ADDSD X0, X1 + MOVSD X1, 0(R10)(SI*8) + +end: + RET diff --git a/asm/genasm.go b/asm/genasm.go new file mode 100644 index 00000000..2afffdc5 --- /dev/null +++ b/asm/genasm.go @@ -0,0 +1,158 @@ +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//+build ignore + +// genasm creates the amd64 assembler code for the DaxpyUnitary and DaxpyUnitaryTo functions. +package main + +import ( + "fmt" + "log" + "os" + "strings" + "text/template" +) + +var asm = template.Must(template.New("asm").Parse(`{{define "header"}}// Generated by running +// go generate github.com/gonum/internal/asm +// DO NOT EDIT. + +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +//+build !noasm,!appengine + +// TODO(fhs): use textflag.h after we drop Go 1.3 support +// #include "textflag.h" +// Don't insert stack check preamble. +#define NOSPLIT 4 +{{end}} + +{{define "unitary_preamble"}} +// func DaxpyUnitary(alpha float64, x, y []float64) +// This function assumes len(y) >= len(x). 
+TEXT ·DaxpyUnitary(SB), NOSPLIT, $0 + MOVHPD alpha+0(FP), X7 + MOVLPD alpha+0(FP), X7 + MOVQ x+8(FP), R8 + MOVQ x_len+16(FP), DI // n = len(x) + MOVQ y+32(FP), R9 +{{end}} + +{{define "unitaryto_preamble"}} +// func DaxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) +// This function assumes len(y) >= len(x) and len(dst) >= len(x). +TEXT ·DaxpyUnitaryTo(SB), NOSPLIT, $0 + MOVQ dst+0(FP), R10 + MOVHPD alpha+24(FP), X7 + MOVLPD alpha+24(FP), X7 + MOVQ x+32(FP), R8 + MOVQ x_len+40(FP), DI // n = len(x) + MOVQ y+56(FP), R9 +{{end}} + +{{define "unitary_body"}} + MOVQ $0, SI // i = 0 + SUBQ $2, DI // n -= 2 + JL tail // if n < 0 goto tail + +loop: + MOVUPD 0(R8)(SI*8), X0 + MOVUPD 0(R9)(SI*8), X1 + MULPD X7, X0 + ADDPD X0, X1 + MOVUPD X1, 0({{if .To}}R10{{else}}R9{{end}})(SI*8) + + ADDQ $2, SI // i += 2 + SUBQ $2, DI // n -= 2 + JGE loop // if n >= 0 goto loop + +tail: + ADDQ $2, DI // n += 2 + JLE end // if n <= 0 goto end + + MOVSD 0(R8)(SI*8), X0 + MOVSD 0(R9)(SI*8), X1 + MULSD X7, X0 + ADDSD X0, X1 + MOVSD X1, 0({{if .To}}R10{{else}}R9{{end}})(SI*8) + +end: + RET +{{end}}`)) + +type Function struct { + Name string + To bool + + template string +} + +var funcs = []Function{ + { + Name: "DaxpyUnitary", + To: false, + template: `{{template "header" .}}{{template "unitary_preamble" .}}{{template "unitary_body" .}}`, + }, + { + Name: "DaxpyUnitaryTo", + To: true, + template: `{{template "header" .}}{{template "unitaryto_preamble" .}}{{template "unitary_body" .}}`, + }, +} + +func main() { + for _, fn := range funcs { + t, err := template.Must(asm.Clone()).Parse(fn.template) + if err != nil { + log.Fatalf("failed to parse template: %v", err) + } + + file := strings.ToLower(fn.Name) + "_amd64.s" + fmt.Println("Generating", file) + f, err := os.Create(file) + if err != nil { + log.Fatalf("failed to create %q: %v", file, err) + } + err = t.Execute(f, fn) + if err != nil { + log.Fatalf("failed to execute template: %v", err) + } + f.Close() + } +} diff --git 
a/asm/generate.go b/asm/generate.go index e2521405..bd41927a 100644 --- a/asm/generate.go +++ b/asm/generate.go @@ -4,5 +4,6 @@ //go:generate ./single_precision //go:generate ./complex +//go:generate go run genasm.go package asm diff --git a/asm/saxpy.go b/asm/saxpy.go index 2779dcfb..cc0b8529 100644 --- a/asm/saxpy.go +++ b/asm/saxpy.go @@ -6,6 +6,12 @@ package asm +func SaxpyUnitary(alpha float32, x, y []float32) { + for i, v := range x { + y[i] += alpha * v + } +} + func SaxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) { for i, v := range x { dst[i] = alpha*v + y[i] diff --git a/asm/zaxpy.go b/asm/zaxpy.go index dead5c87..3d825f47 100644 --- a/asm/zaxpy.go +++ b/asm/zaxpy.go @@ -6,6 +6,12 @@ package asm +func ZaxpyUnitary(alpha complex128, x, y []complex128) { + for i, v := range x { + y[i] += alpha * v + } +} + func ZaxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) { for i, v := range x { dst[i] = alpha*v + y[i]