cgo,native: implement dgeqp3 and test

2025-10-19 05:24:52 +08:00 · 2017-02-04 17:16:54 +10:30
parent 52eecc0c38
commit 9bc10c65bd
5 changed files with 380 additions and 0 deletions
--- a/cgo/lapack.go
+++ b/cgo/lapack.go
@@ -116,6 +116,70 @@ type Implementation struct{}

 var _ lapack.Float64 = Implementation{}

+// Dgeqp3 computes a QR factorization with column pivoting of the
+// m×n matrix A: A*P = Q*R using Level 3 BLAS.
+//
+// The matrix Q is represented as a product of elementary reflectors
+//  Q = H_0 H_1 . . . H_{k-1}, where k = min(m,n).
+// Each H_i has the form
+//  H_i = I - tau * v * v^T
+// where tau is a real scalar and v is a real vector with v[0:i] = 0 and v[i] = 1;
+// v[i+1:m] is stored on exit in A[i+1:m, i], and tau in tau[i].
+//
+// jpvt specifies a column pivot to be applied to A. If
+// jpvt[j] is at least zero, the jth column of A is permuted
+// to the front of A*P (a leading column), if jpvt[j] is -1
+// the jth column of A is a free column. If jpvt[j] < -1, Dgeqp3
+// will panic. On return, jpvt holds the permutation that was
+// applied; the jth column of A*P was the jpvt[j] column of A.
+// jpvt must have length n or Dgeqp3 will panic.
+//
+// tau holds the scalar factors of the elementary reflectors.
+// It must have length min(m, n), otherwise Dgeqp3 will panic.
+//
+// work must have length at least max(1,lwork), and lwork must be at least
+// 3*n+1, otherwise Dgeqp3 will panic. For optimal performance lwork must
+// be at least 2*n+(n+1)*nb, where nb is the optimal blocksize. On return,
+// work[0] will contain the optimal value of lwork.
+//
+// If lwork == -1, instead of performing Dgeqp3, only the optimal value of lwork
+// will be stored in work[0].
+//
+// Dgeqp3 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgeqp3(m, n int, a []float64, lda int, jpvt []int, tau, work []float64, lwork int) {
+	checkMatrix(m, n, a, lda)
+	if len(jpvt) != n {
+		panic(badIpiv)
+	}
+	if len(tau) != min(m, n) {
+		panic(badTau)
+	}
+	if len(work) < max(1, lwork) {
+		panic(badWork)
+	}
+
+	// Workspace query: pass nil for jpvt and tau so that jpvt is not updated.
+	if lwork == -1 {
+		lapacke.Dgeqp3(m, n, a, lda, nil, nil, work, -1)
+		return
+	}
+
+	jpvt32 := make([]int32, len(jpvt))
+	for i, v := range jpvt {
+		v++ // Shift by one for lapacke: free columns (-1) become 0, column indices become 1..n.
+		if v != int(int32(v)) || v < 0 || n < v {
+			panic("lapack: jpvt element out of range")
+		}
+		jpvt32[i] = int32(v)
+	}
+
+	lapacke.Dgeqp3(m, n, a, lda, jpvt32, tau, work, lwork)
+
+	for i, v := range jpvt32 {
+		jpvt[i] = int(v - 1) // Shift back to the zero-based convention documented above.
+	}
+}
+
 // Dlacn2 estimates the 1-norm of an n×n matrix A using sequential updates with
 // matrix-vector products provided externally.
 //
--- a/cgo/lapack_test.go
+++ b/cgo/lapack_test.go
@@ -44,6 +44,10 @@ func (bl blockedTranslate) Dorgl2(m, n, k int, a []float64, lda int, tau, work [
 	impl.Dorglq(m, n, k, a, lda, tau, work, len(work))
 }

+func TestDgeqp3(t *testing.T) {
+	testlapack.Dgeqp3Test(t, impl) // Run the shared testlapack Dgeqp3 checks against this package's impl.
+}
+
 func TestDlacn2(t *testing.T) {
 	testlapack.Dlacn2Test(t, impl)
 }
--- a/native/dgeqp3.go
+++ b/native/dgeqp3.go
@@ -0,0 +1,173 @@
+// Copyright ©2017 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package native
+
+import (
+	"github.com/gonum/blas"
+	"github.com/gonum/blas/blas64"
+)
+
+// Dgeqp3 computes a QR factorization with column pivoting of the
+// m×n matrix A: A*P = Q*R using Level 3 BLAS.
+//
+// The matrix Q is represented as a product of elementary reflectors
+//  Q = H_0 H_1 . . . H_{k-1}, where k = min(m,n).
+// Each H_i has the form
+//  H_i = I - tau * v * v^T
+// where tau is a real scalar and v is a real vector with v[0:i] = 0 and v[i] = 1;
+// v[i+1:m] is stored on exit in A[i+1:m, i], and tau in tau[i].
+//
+// jpvt specifies a column pivot to be applied to A. If
+// jpvt[j] is at least zero, the jth column of A is permuted
+// to the front of A*P (a leading column), if jpvt[j] is -1
+// the jth column of A is a free column. If jpvt[j] < -1, Dgeqp3
+// will panic. On return, jpvt holds the permutation that was
+// applied; the jth column of A*P was the jpvt[j] column of A.
+// jpvt must have length n or Dgeqp3 will panic.
+//
+// tau holds the scalar factors of the elementary reflectors.
+// It must have length min(m, n), otherwise Dgeqp3 will panic.
+//
+// work must have length at least max(1,lwork), and lwork must be at least
+// 3*n+1, otherwise Dgeqp3 will panic. For optimal performance lwork must
+// be at least 2*n+(n+1)*nb, where nb is the optimal blocksize. On return,
+// work[0] will contain the optimal value of lwork.
+//
+// If lwork == -1, instead of performing Dgeqp3, only the optimal value of lwork
+// will be stored in work[0].
+//
+// Dgeqp3 is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dgeqp3(m, n int, a []float64, lda int, jpvt []int, tau, work []float64, lwork int) {
+	const (
+		inb    = 1 // Ilaenv selector used below to obtain the block size nb.
+		inbmin = 2 // Ilaenv selector used below to obtain the minimum block size nbmin.
+		ixover = 3 // Ilaenv selector used below to obtain the blocked/unblocked crossover nx.
+	)
+	checkMatrix(m, n, a, lda)
+
+	if len(jpvt) != n {
+		panic(badIpiv)
+	}
+	for _, v := range jpvt {
+		if v < -1 || n <= v {
+			panic("lapack: jpvt element out of range")
+		}
+	}
+	minmn := min(m, n)
+	if len(tau) != minmn {
+		panic(badTau)
+	}
+	if len(work) < max(1, lwork) {
+		panic(badWork)
+	}
+
+	var iws, lwkopt, nb int
+	if minmn == 0 {
+		iws = 1
+		lwkopt = 1
+	} else {
+		iws = 3*n + 1
+		nb = impl.Ilaenv(inb, "DGEQRF", " ", m, n, -1, -1)
+		lwkopt = 2*n + (n+1)*nb
+	}
+	work[0] = float64(lwkopt) // Report the optimal workspace size; this is all a query returns.
+
+	if lwork == -1 {
+		return
+	}
+
+	bi := blas64.Implementation()
+
+	// Move the columns marked as leading (jpvt[j] != -1) to the front of A,
+	// counting them in nfxd and recording each move in jpvt.
+	var nfxd int
+	for j := 0; j < n; j++ {
+		if jpvt[j] == -1 {
+			jpvt[j] = j
+			continue
+		}
+		if j != nfxd {
+			bi.Dswap(m, a[j:], lda, a[nfxd:], lda)
+			jpvt[j], jpvt[nfxd] = jpvt[nfxd], j // Position nfxd now holds original column j.
+		} else {
+			jpvt[j] = j
+		}
+		nfxd++
+	}
+
+	// Factorize nfxd columns.
+	//
+	// Compute the QR factorization of nfxd columns and update remaining columns.
+	if nfxd > 0 {
+		na := min(m, nfxd)
+		impl.Dgeqrf(m, na, a, lda, tau, work, lwork)
+		iws = max(iws, int(work[0]))
+		if na < n {
+			impl.Dormqr(blas.Left, blas.Trans, m, n-na, na, a, lda, tau[:na], a[na:], lda,
+				work, lwork)
+			iws = max(iws, int(work[0]))
+		}
+	}
+
+	if nfxd >= minmn {
+		work[0] = float64(iws) // No free columns remain to be factorized.
+		return
+	}
+
+	// Factorize free columns.
+	sm := m - nfxd
+	sn := n - nfxd
+	sminmn := minmn - nfxd
+
+	// Determine the block size.
+	nb = impl.Ilaenv(inb, "DGEQRF", " ", sm, sn, -1, -1)
+	nbmin := 2
+	nx := 0
+
+	if 1 < nb && nb < sminmn {
+		// Determine when to cross over from blocked to unblocked code.
+		nx = max(0, impl.Ilaenv(ixover, "DGEQRF", " ", sm, sn, -1, -1))
+
+		if nx < sminmn {
+			// Determine if workspace is large enough for blocked code.
+			minws := 2*sn + (sn+1)*nb
+			iws = max(iws, minws)
+			if lwork < minws {
+				// Not enough workspace to use optimal nb. Reduce
+				// nb and determine the minimum value of nb.
+				nb = (lwork - 2*sn) / (sn + 1) // Solves 2*sn+(sn+1)*nb <= lwork for nb.
+				nbmin = max(2, impl.Ilaenv(inbmin, "DGEQRF", " ", sm, sn, -1, -1))
+			}
+		}
+	}
+
+	// Initialize partial column norms.
+	// The first n elements of work store the exact column norms.
+	for j := nfxd; j < n; j++ {
+		work[j] = bi.Dnrm2(sm, a[nfxd*lda+j:], lda)
+		work[n+j] = work[j]
+	}
+	j := nfxd
+	if nbmin <= nb && nb < sminmn && nx < sminmn {
+		// Use blocked code initially.
+
+		// Compute factorization.
+		var fjb int
+		for topbmn := minmn - nx; j < topbmn; j += fjb {
+			jb := min(nb, topbmn-j)
+
+			// Factorize jb columns among columns j:n.
+			fjb = impl.Dlaqps(m, n-j, j, jb, a[j:], lda, jpvt[j:], tau[j:],
+				work[j:n], work[j+n:2*n], work[2*n:2*n+jb], work[2*n+jb:], jb)
+		}
+	}
+
+	// Use unblocked code to factor the last or only block.
+	if j < minmn {
+		impl.Dlaqp2(m, n-j, j, a[j:], lda, jpvt[j:], tau[j:],
+			work[j:n], work[j+n:2*n], work[2*n:])
+	}
+
+	work[0] = float64(iws)
+}
--- a/native/lapack_test.go
+++ b/native/lapack_test.go
@@ -68,6 +68,10 @@ func TestDgels(t *testing.T) {
 	testlapack.DgelsTest(t, impl)
 }

+func TestDgeqp3(t *testing.T) {
+	testlapack.Dgeqp3Test(t, impl) // Run the shared testlapack Dgeqp3 checks against this package's impl.
+}
+
 func TestDgeqr2(t *testing.T) {
 	testlapack.Dgeqr2Test(t, impl)
 }
--- a/testlapack/dgeqp3.go
+++ b/testlapack/dgeqp3.go
@@ -0,0 +1,135 @@
+// Copyright ©2015 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package testlapack
+
+import (
+	"math"
+	"math/rand"
+	"testing"
+
+	"github.com/gonum/blas"
+	"github.com/gonum/blas/blas64"
+)
+
+type Dgeqp3er interface {
+	Dlapmter // Supplies the Dlapmt method that Dgeqp3Test calls to permute the columns of A.
+	Dgeqp3(m, n int, a []float64, lda int, jpvt []int, tau, work []float64, lwork int)
+}
+
+func Dgeqp3Test(t *testing.T, impl Dgeqp3er) {
+	rnd := rand.New(rand.NewSource(1))
+	for c, test := range []struct {
+		m, n, lda int
+	}{
+		{1, 1, 0},
+		{2, 2, 0},
+		{3, 2, 0},
+		{2, 3, 0},
+		{1, 12, 0},
+		{2, 6, 0},
+		{3, 4, 0},
+		{4, 3, 0},
+		{6, 2, 0},
+		{12, 1, 0},
+		{1, 1, 20},
+		{2, 2, 20},
+		{3, 2, 20},
+		{2, 3, 20},
+		{1, 12, 20},
+		{2, 6, 20},
+		{3, 4, 20},
+		{4, 3, 20},
+		{6, 2, 20},
+		{12, 1, 20},
+		{129, 256, 0},
+		{256, 129, 0},
+		{129, 256, 266},
+		{256, 129, 266},
+	} {
+		n := test.n
+		m := test.m
+		lda := test.lda
+		if lda == 0 { // A zero lda in the table selects the minimal stride, n.
+			lda = test.n
+		}
+		const (
+			all = iota
+			some
+			none
+		)
+		for _, free := range []int{all, some, none} {
+			a := make([]float64, m*lda)
+			for i := range a {
+				a[i] = rnd.Float64()
+			}
+			aCopy := make([]float64, len(a))
+			copy(aCopy, a) // Keep the unfactorized A for the final A*P comparison.
+			jpvt := make([]int, n)
+			for j := range jpvt {
+				switch free {
+				case all:
+					jpvt[j] = -1 // Every column is free.
+				case some:
+					jpvt[j] = rnd.Intn(2) - 1 // Randomly free (-1) or leading (0).
+				case none:
+					jpvt[j] = 0 // Every column is marked as leading.
+				default:
+					panic("bad freedom")
+				}
+			}
+			k := min(m, n)
+			tau := make([]float64, k)
+			for i := range tau {
+				tau[i] = rnd.Float64()
+			}
+			work := make([]float64, 1)
+			impl.Dgeqp3(m, n, a, lda, jpvt, tau, work, -1) // Workspace query; the size comes back in work[0].
+			lwork := int(work[0])
+			work = make([]float64, lwork)
+			for i := range work { // Presumably so the result cannot rely on zeroed workspace.
+				work[i] = rnd.Float64()
+			}
+			impl.Dgeqp3(m, n, a, lda, jpvt, tau, work, lwork)
+
+			// Test that the QR factorization has completed successfully. Construct
+			// Q from the reflector vectors stored in a and the scalar factors in tau.
+			q := constructQ("QR", m, n, a, lda, tau)
+
+			// Check that q is orthonormal; q.Data is indexed below as m vectors of length m.
+			for i := 0; i < m; i++ {
+				nrm := blas64.Nrm2(m, blas64.Vector{Inc: 1, Data: q.Data[i*m:]})
+				if math.Abs(nrm-1) > 1e-13 {
+					t.Errorf("Case %v, q not normal", c)
+				}
+				for j := 0; j < i; j++ {
+					dot := blas64.Dot(m, blas64.Vector{Inc: 1, Data: q.Data[i*m:]}, blas64.Vector{Inc: 1, Data: q.Data[j*m:]})
+					if math.Abs(dot) > 1e-14 {
+						t.Errorf("Case %v, q not orthogonal", c)
+					}
+				}
+			}
+			// Check that A * P = Q * R, taking R from the upper triangle of the factorized a.
+			r := blas64.General{
+				Rows:   m,
+				Cols:   n,
+				Stride: n,
+				Data:   make([]float64, m*n),
+			}
+			for i := 0; i < m; i++ {
+				for j := i; j < n; j++ {
+					r.Data[i*n+j] = a[i*lda+j]
+				}
+			}
+			got := nanGeneral(m, n, lda)
+			blas64.Gemm(blas.NoTrans, blas.NoTrans, 1, q, r, 0, got)
+
+			want := blas64.General{Rows: m, Cols: n, Stride: lda, Data: aCopy}
+			impl.Dlapmt(true, want.Rows, want.Cols, want.Data, want.Stride, jpvt) // Permute the saved A by jpvt to form A*P.
+			if !equalApproxGeneral(got, want, 1e-13) {
+				t.Errorf("Case %v,  Q*R != A*P\nQ*R=%v\nA*P=%v", c, got, want)
+			}
+		}
+	}
+}