From ce09d0a8a44ecaf366916131779b80ee26b59b56 Mon Sep 17 00:00:00 2001
From: kortschak <dan.kortschak@adelaide.edu.au>
Date: Sun, 29 Jan 2017 16:39:13 +1030
Subject: [PATCH] native: implement dtgsja and test

---
 cgo/lapack.go         | 194 +++++++++++++++++++++++
 cgo/lapack_test.go    |   4 +
 lapack.go             |  11 ++
 native/dtgsja.go      | 357 ++++++++++++++++++++++++++++++++++++++++++
 native/general.go     |   3 +
 native/lapack_test.go |   4 +
 testlapack/dtgsja.go  | 165 +++++++++++++++++++
 testlapack/general.go | 107 +++++++++++++
 8 files changed, 845 insertions(+)
 create mode 100644 native/dtgsja.go
 create mode 100644 testlapack/dtgsja.go

diff --git a/cgo/lapack.go b/cgo/lapack.go
index d455716a..f5a03611 100644
--- a/cgo/lapack.go
+++ b/cgo/lapack.go
@@ -16,7 +16,9 @@ import (
 // Copied from lapack/native. Keep in sync.
 const (
 	absIncNotOne    = "lapack: increment not one or negative one"
+	badAlpha        = "lapack: bad alpha length"
 	badAuxv         = "lapack: auxv has insufficient length"
+	badBeta         = "lapack: bad beta length"
 	badD            = "lapack: d has insufficient length"
 	badDecompUpdate = "lapack: bad decomp update"
 	badDiag         = "lapack: bad diag"
@@ -26,6 +28,7 @@ const (
 	badEVComp       = "lapack: bad EVComp"
 	badEVJob        = "lapack: bad EVJob"
 	badEVSide       = "lapack: bad EVSide"
+	badGSVDJob      = "lapack: bad GSVDJob"
 	badHowMany      = "lapack: bad HowMany"
 	badIlo          = "lapack: ilo out of range"
 	badIhi          = "lapack: ihi out of range"
@@ -2482,3 +2485,194 @@ func (impl Implementation) Dgeev(jobvl lapack.LeftEVJob, jobvr lapack.RightEVJob
 	}
 	return first
 }
+
+// Dtgsja computes the generalized singular value decomposition (GSVD)
+// of two real upper triangular or trapezoidal matrices A and B.
+//
+// A and B have the following forms, which may be obtained by the
+// preprocessing subroutine Dggsvp from a general m×n matrix A and p×n
+// matrix B:
+//
+//            n-k-l  k    l
+//  A =    k [  0   A12  A13 ] if m-k-l >= 0;
+//         l [  0    0   A23 ]
+//     m-k-l [  0    0    0  ]
+//
+//            n-k-l  k    l
+//  A =    k [  0   A12  A13 ] if m-k-l < 0;
+//       m-k [  0    0   A23 ]
+//
+//            n-k-l  k    l
+//  B =    l [  0    0   B13 ]
+//       p-l [  0    0    0  ]
+//
+// where the k×k matrix A12 and l×l matrix B13 are non-singular
+// upper triangular. A23 is l×l upper triangular if m-k-l >= 0,
+// otherwise A23 is (m-k)×l upper trapezoidal.
+//
+// On exit,
+//
+//  U^T*A*Q = D1*[ 0 R ], V^T*B*Q = D2*[ 0 R ],
+//
+// where U, V and Q are orthogonal matrices.
+// R is a non-singular upper triangular matrix, and D1 and D2 are
+// diagonal matrices, which are of the following structures:
+//
+// If m-k-l >= 0,
+//
+//                    k  l
+//       D1 =     k [ I  0 ]
+//                l [ 0  C ]
+//            m-k-l [ 0  0 ]
+//
+//                  k  l
+//       D2 = l   [ 0  S ]
+//            p-l [ 0  0 ]
+//
+//               n-k-l  k    l
+//  [ 0 R ] = k [  0   R11  R12 ] k
+//            l [  0    0   R22 ] l
+//
+// where
+//
+//  C = diag( alpha_k, ... , alpha_{k+l} ),
+//  S = diag( beta_k,  ... , beta_{k+l} ),
+//  C^2 + S^2 = I.
+//
+// R is stored in
+//  A[0:k+l, n-k-l:n]
+// on exit.
+//
+// If m-k-l < 0,
+//
+//                 k m-k k+l-m
+//      D1 =   k [ I  0    0  ]
+//           m-k [ 0  C    0  ]
+//
+//                   k m-k k+l-m
+//      D2 =   m-k [ 0  S    0  ]
+//           k+l-m [ 0  0    I  ]
+//             p-l [ 0  0    0  ]
+//
+//                 n-k-l  k   m-k  k+l-m
+//  [ 0 R ] =    k [ 0    R11  R12  R13 ]
+//             m-k [ 0     0   R22  R23 ]
+//           k+l-m [ 0     0    0   R33 ]
+//
+// where
+//  C = diag( alpha_k, ... , alpha_m ),
+//  S = diag( beta_k,  ... , beta_m ),
+//  C^2 + S^2 = I.
+//
+//  R = [ R11 R12 R13 ] is stored in A[1:m, n-k-l+1:n]
+//      [  0  R22 R23 ]
+// and R33 is stored in
+//  B[m-k:l, n+m-k-l:n] on exit.
+//
+// The computation of the orthogonal transformation matrices U, V or Q
+// is optional. These matrices may either be formed explicitly, or they
+// may be post-multiplied into input matrices U1, V1, or Q1.
+//
+// Dtgsja essentially uses a variant of Kogbetliantz algorithm to reduce
+// min(l,m-k)×l triangular or trapezoidal matrix A23 and l×l
+// matrix B13 to the form:
+//
+//  U1^T*A13*Q1 = C1*R1; V1^T*B13*Q1 = S1*R1,
+//
+// where U1, V1 and Q1 are orthogonal matrices. C1 and S1 are diagonal
+// matrices satisfying
+//
+//  C1^2 + S1^2 = I,
+//
+// and R1 is an l×l non-singular upper triangular matrix.
+//
+// jobU, jobV and jobQ are options for computing the orthogonal matrices. The behavior
+// is as follows
+//  jobU == lapack.GSVDU        Compute orthogonal matrix U
+//  jobU == lapack.GSVDUnit     Use unit-initialized matrix
+//  jobU == lapack.GSVDNone     Do not compute orthogonal matrix.
+// The behavior is the same for jobV and jobQ with the exception that instead of
+// lapack.GSVDU these accept lapack.GSVDV and lapack.GSVDQ respectively.
+// The matrices U, V and Q must be m×m, p×p and n×n respectively unless the
+// relevant job parameter is lapack.GSVDNone.
+//
+// k and l specify the sub-blocks in the input matrices A and B:
+//  A23 = A[k:min(k+l,m), n-l:n) and B13 = B[0:l, n-l:n]
+// of A and B, whose GSVD is going to be computed by Dtgsja.
+//
+// tola and tolb are the convergence criteria for the Jacobi-Kogbetliantz
+// iteration procedure. Generally, they are the same as used in the preprocessing
+// step, for example,
+//  tola = max(m, n)*norm(A)*eps,
+//  tolb = max(p, n)*norm(B)*eps,
+// where eps is the machine epsilon.
+//
+// work must have length at least 2*n, otherwise Dtgsja will panic.
+//
+// alpha and beta must have length n or Dtgsja will panic. On exit, alpha and
+// beta contain the generalized singular value pairs of A and B
+//   alpha[0:k] = 1,
+//   beta[0:k]  = 0,
+// if m-k-l >= 0,
+//   alpha[k:k+l] = diag(C),
+//   beta[k:k+l]  = diag(S),
+// if m-k-l < 0,
+//   alpha[k:m]= C, alpha[m:k+l]= 0
+//   beta[k:m] = S, beta[m:k+l] = 1.
+// if k+l < n,
+//   alpha[k+l:n] = 0 and
+//   beta[k+l:n]  = 0.
+//
+// On exit, A[n-k:n, 0:min(k+l,m)] contains the triangular matrix R or part of R
+// and if necessary, B[m-k:l, n+m-k-l:n] contains a part of R.
+//
+// Dtgsja returns whether the routine converged and the number of iteration cycles
+// that were run.
+//
+// Dtgsja is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dtgsja(jobU, jobV, jobQ lapack.GSVDJob, m, p, n, k, l int, a []float64, lda int, b []float64, ldb int, tola, tolb float64, alpha, beta, u []float64, ldu int, v []float64, ldv int, q []float64, ldq int, work []float64) (cycles int, ok bool) {
+	checkMatrix(m, n, a, lda)
+	checkMatrix(p, n, b, ldb)
+
+	if len(alpha) != n {
+		panic(badAlpha)
+	}
+	if len(beta) != n {
+		panic(badBeta)
+	}
+
+	initu := jobU == lapack.GSVDUnit
+	wantu := initu || jobU == lapack.GSVDU
+	if !initu && !wantu && jobU != lapack.GSVDNone {
+		panic(badGSVDJob + "U")
+	}
+	if jobU != lapack.GSVDNone {
+		checkMatrix(m, m, u, ldu)
+	}
+
+	initv := jobV == lapack.GSVDUnit
+	wantv := initv || jobV == lapack.GSVDV
+	if !initv && !wantv && jobV != lapack.GSVDNone {
+		panic(badGSVDJob + "V")
+	}
+	if jobV != lapack.GSVDNone {
+		checkMatrix(p, p, v, ldv)
+	}
+
+	initq := jobQ == lapack.GSVDUnit
+	wantq := initq || jobQ == lapack.GSVDQ
+	if !initq && !wantq && jobQ != lapack.GSVDNone {
+		panic(badGSVDJob + "Q")
+	}
+	if jobQ != lapack.GSVDNone {
+		checkMatrix(n, n, q, ldq)
+	}
+
+	if len(work) < 2*n {
+		panic(badWork)
+	}
+
+	ncycle := []int32{0}
+	ok = lapacke.Dtgsja(lapack.Job(jobU), lapack.Job(jobV), lapack.Job(jobQ), m, p, n, k, l, a, lda, b, ldb, tola, tolb, alpha, beta, u, ldu, v, ldv, q, ldq, work, ncycle)
+	return int(ncycle[0]), ok
+}
diff --git a/cgo/lapack_test.go b/cgo/lapack_test.go
index 67c16de7..eb96550d 100644
--- a/cgo/lapack_test.go
+++ b/cgo/lapack_test.go
@@ -244,6 +244,10 @@ func TestDsytrd(t *testing.T) {
 	testlapack.DsytrdTest(t, impl)
 }
 
+func TestDtgsja(t *testing.T) {
+	testlapack.DtgsjaTest(t, impl)
+}
+
 func TestDtrexc(t *testing.T) {
 	testlapack.DtrexcTest(t, impl)
 }
diff --git a/lapack.go b/lapack.go
index 90da267e..c84e6e25 100644
--- a/lapack.go
+++ b/lapack.go
@@ -108,6 +108,17 @@ const (
 	SVDNone      SVDJob = 'N' // Do not compute singular vectors
 )
 
+// GSVDJob specifies the singular vector computation type for Generalized SVD.
+type GSVDJob byte
+
+const (
+	GSVDU    GSVDJob = 'U' // Compute orthogonal matrix U
+	GSVDV    GSVDJob = 'V' // Compute orthogonal matrix V
+	GSVDQ    GSVDJob = 'Q' // Compute orthogonal matrix Q
+	GSVDUnit GSVDJob = 'I' // Use unit-initialized matrix
+	GSVDNone GSVDJob = 'N' // Do not compute orthogonal matrix
+)
+
 // EVComp specifies how eigenvectors are computed.
 type EVComp byte
 
diff --git a/native/dtgsja.go b/native/dtgsja.go
new file mode 100644
index 00000000..a47e2526
--- /dev/null
+++ b/native/dtgsja.go
@@ -0,0 +1,357 @@
+// Copyright ©2017 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package native
+
+import (
+	"math"
+
+	"github.com/gonum/blas"
+	"github.com/gonum/blas/blas64"
+	"github.com/gonum/lapack"
+)
+
+// Dtgsja computes the generalized singular value decomposition (GSVD)
+// of two real upper triangular or trapezoidal matrices A and B.
+//
+// A and B have the following forms, which may be obtained by the
+// preprocessing subroutine Dggsvp from a general m×n matrix A and p×n
+// matrix B:
+//
+//            n-k-l  k    l
+//  A =    k [  0   A12  A13 ] if m-k-l >= 0;
+//         l [  0    0   A23 ]
+//     m-k-l [  0    0    0  ]
+//
+//            n-k-l  k    l
+//  A =    k [  0   A12  A13 ] if m-k-l < 0;
+//       m-k [  0    0   A23 ]
+//
+//            n-k-l  k    l
+//  B =    l [  0    0   B13 ]
+//       p-l [  0    0    0  ]
+//
+// where the k×k matrix A12 and l×l matrix B13 are non-singular
+// upper triangular. A23 is l×l upper triangular if m-k-l >= 0,
+// otherwise A23 is (m-k)×l upper trapezoidal.
+//
+// On exit,
+//
+//  U^T*A*Q = D1*[ 0 R ], V^T*B*Q = D2*[ 0 R ],
+//
+// where U, V and Q are orthogonal matrices.
+// R is a non-singular upper triangular matrix, and D1 and D2 are
+// diagonal matrices, which are of the following structures:
+//
+// If m-k-l >= 0,
+//
+//                    k  l
+//       D1 =     k [ I  0 ]
+//                l [ 0  C ]
+//            m-k-l [ 0  0 ]
+//
+//                  k  l
+//       D2 = l   [ 0  S ]
+//            p-l [ 0  0 ]
+//
+//               n-k-l  k    l
+//  [ 0 R ] = k [  0   R11  R12 ] k
+//            l [  0    0   R22 ] l
+//
+// where
+//
+//  C = diag( alpha_k, ... , alpha_{k+l} ),
+//  S = diag( beta_k,  ... , beta_{k+l} ),
+//  C^2 + S^2 = I.
+//
+// R is stored in
+//  A[0:k+l, n-k-l:n]
+// on exit.
+//
+// If m-k-l < 0,
+//
+//                 k m-k k+l-m
+//      D1 =   k [ I  0    0  ]
+//           m-k [ 0  C    0  ]
+//
+//                   k m-k k+l-m
+//      D2 =   m-k [ 0  S    0  ]
+//           k+l-m [ 0  0    I  ]
+//             p-l [ 0  0    0  ]
+//
+//                 n-k-l  k   m-k  k+l-m
+//  [ 0 R ] =    k [ 0    R11  R12  R13 ]
+//             m-k [ 0     0   R22  R23 ]
+//           k+l-m [ 0     0    0   R33 ]
+//
+// where
+//  C = diag( alpha_k, ... , alpha_m ),
+//  S = diag( beta_k,  ... , beta_m ),
+//  C^2 + S^2 = I.
+//
+//  R = [ R11 R12 R13 ] is stored in A[0:m, n-k-l:n]
+//      [  0  R22 R23 ]
+// and R33 is stored in
+//  B[m-k:l, n+m-k-l:n] on exit.
+//
+// The computation of the orthogonal transformation matrices U, V or Q
+// is optional. These matrices may either be formed explicitly, or they
+// may be post-multiplied into input matrices U1, V1, or Q1.
+//
+// Dtgsja essentially uses a variant of Kogbetliantz algorithm to reduce
+// min(l,m-k)×l triangular or trapezoidal matrix A23 and l×l
+// matrix B13 to the form:
+//
+//  U1^T*A13*Q1 = C1*R1; V1^T*B13*Q1 = S1*R1,
+//
+// where U1, V1 and Q1 are orthogonal matrices. C1 and S1 are diagonal
+// matrices satisfying
+//
+//  C1^2 + S1^2 = I,
+//
+// and R1 is an l×l non-singular upper triangular matrix.
+//
+// jobU, jobV and jobQ are options for computing the orthogonal matrices. The behavior
+// is as follows
+//  jobU == lapack.GSVDU        Compute orthogonal matrix U
+//  jobU == lapack.GSVDUnit     Use unit-initialized matrix
+//  jobU == lapack.GSVDNone     Do not compute orthogonal matrix.
+// The behavior is the same for jobV and jobQ with the exception that instead of
+// lapack.GSVDU these accept lapack.GSVDV and lapack.GSVDQ respectively.
+// The matrices U, V and Q must be m×m, p×p and n×n respectively unless the
+// relevant job parameter is lapack.GSVDNone.
+//
+// k and l specify the sub-blocks in the input matrices A and B:
+//  A23 = A[k:min(k+l,m), n-l:n) and B13 = B[0:l, n-l:n]
+// of A and B, whose GSVD is going to be computed by Dtgsja.
+//
+// tola and tolb are the convergence criteria for the Jacobi-Kogbetliantz
+// iteration procedure. Generally, they are the same as used in the preprocessing
+// step, for example,
+//  tola = max(m, n)*norm(A)*eps,
+//  tolb = max(p, n)*norm(B)*eps,
+// where eps is the machine epsilon.
+//
+// work must have length at least 2*n, otherwise Dtgsja will panic.
+//
+// alpha and beta must have length n or Dtgsja will panic. On exit, alpha and
+// beta contain the generalized singular value pairs of A and B
+//   alpha[0:k] = 1,
+//   beta[0:k]  = 0,
+// if m-k-l >= 0,
+//   alpha[k:k+l] = diag(C),
+//   beta[k:k+l]  = diag(S),
+// if m-k-l < 0,
+//   alpha[k:m]= C, alpha[m:k+l]= 0
+//   beta[k:m] = S, beta[m:k+l] = 1.
+// if k+l < n,
+//   alpha[k+l:n] = 0 and
+//   beta[k+l:n]  = 0.
+//
+// On exit, A[n-k:n, 0:min(k+l,m)] contains the triangular matrix R or part of R
+// and if necessary, B[m-k:l, n+m-k-l:n] contains a part of R.
+//
+// Dtgsja returns whether the routine converged and the number of iteration cycles
+// that were run.
+//
+// Dtgsja is an internal routine. It is exported for testing purposes.
+func (impl Implementation) Dtgsja(jobU, jobV, jobQ lapack.GSVDJob, m, p, n, k, l int, a []float64, lda int, b []float64, ldb int, tola, tolb float64, alpha, beta, u []float64, ldu int, v []float64, ldv int, q []float64, ldq int, work []float64) (cycles int, ok bool) {
+	const maxit = 40
+
+	checkMatrix(m, n, a, lda)
+	checkMatrix(p, n, b, ldb)
+
+	if len(alpha) != n {
+		panic(badAlpha)
+	}
+	if len(beta) != n {
+		panic(badBeta)
+	}
+
+	initu := jobU == lapack.GSVDUnit
+	wantu := initu || jobU == lapack.GSVDU
+	if !initu && !wantu && jobU != lapack.GSVDNone {
+		panic(badGSVDJob + "U")
+	}
+	if jobU != lapack.GSVDNone {
+		checkMatrix(m, m, u, ldu)
+	}
+
+	initv := jobV == lapack.GSVDUnit
+	wantv := initv || jobV == lapack.GSVDV
+	if !initv && !wantv && jobV != lapack.GSVDNone {
+		panic(badGSVDJob + "V")
+	}
+	if jobV != lapack.GSVDNone {
+		checkMatrix(p, p, v, ldv)
+	}
+
+	initq := jobQ == lapack.GSVDUnit
+	wantq := initq || jobQ == lapack.GSVDQ
+	if !initq && !wantq && jobQ != lapack.GSVDNone {
+		panic(badGSVDJob + "Q")
+	}
+	if jobQ != lapack.GSVDNone {
+		checkMatrix(n, n, q, ldq)
+	}
+
+	if len(work) < 2*n {
+		panic(badWork)
+	}
+
+	// Initialize U, V and Q, if necessary
+	if initu {
+		impl.Dlaset(blas.All, m, m, 0, 1, u, ldu)
+	}
+	if initv {
+		impl.Dlaset(blas.All, p, p, 0, 1, v, ldv)
+	}
+	if initq {
+		impl.Dlaset(blas.All, n, n, 0, 1, q, ldq)
+	}
+
+	bi := blas64.Implementation()
+	minTol := math.Min(tola, tolb)
+
+	// Loop until convergence.
+	upper := false
+	for cycles = 1; cycles <= maxit; cycles++ {
+		upper = !upper
+
+		for i := 0; i < l-1; i++ {
+			for j := i + 1; j < l; j++ {
+				var a1, a2, a3 float64
+				if k+i < m {
+					a1 = a[(k+i)*lda+n-l+i]
+				}
+				if k+j < m {
+					a3 = a[(k+j)*lda+n-l+j]
+				}
+
+				b1 := b[i*ldb+n-l+i]
+				b3 := b[j*ldb+n-l+j]
+
+				var b2 float64
+				if upper {
+					if k+i < m {
+						a2 = a[(k+i)*lda+n-l+j]
+					}
+					b2 = b[i*ldb+n-l+j]
+				} else {
+					if k+j < m {
+						a2 = a[(k+j)*lda+n-l+i]
+					}
+					b2 = b[j*ldb+n-l+i]
+				}
+
+				csu, snu, csv, snv, csq, snq := impl.Dlags2(upper, a1, a2, a3, b1, b2, b3)
+
+				// Update (k+i)-th and (k+j)-th rows of matrix A: U^T*A.
+				if k+j < m {
+					bi.Drot(l, a[(k+j)*lda+n-l:], 1, a[(k+i)*lda+n-l:], 1, csu, snu)
+				}
+
+				// Update i-th and j-th rows of matrix B: V^T*B.
+				bi.Drot(l, b[j*ldb+n-l:], 1, b[i*ldb+n-l:], 1, csv, snv)
+
+				// Update (n-l+i)-th and (n-l+j)-th columns of matrices
+				// A and B: A*Q and B*Q.
+				bi.Drot(min(k+l, m), a[n-l+j:], lda, a[n-l+i:], lda, csq, snq)
+				bi.Drot(l, b[n-l+j:], ldb, b[n-l+i:], ldb, csq, snq)
+
+				if upper {
+					if k+i < m {
+						a[(k+i)*lda+n-l+j] = 0
+					}
+					b[i*ldb+n-l+j] = 0
+				} else {
+					if k+j < m {
+						a[(k+j)*lda+n-l+i] = 0
+					}
+					b[j*ldb+n-l+i] = 0
+				}
+
+				// Update orthogonal matrices U, V, Q, if desired.
+				if wantu && k+j < m {
+					bi.Drot(m, u[k+j:], ldu, u[k+i:], ldu, csu, snu)
+				}
+				if wantv {
+					bi.Drot(p, v[j:], ldv, v[i:], ldv, csv, snv)
+				}
+				if wantq {
+					bi.Drot(n, q[n-l+j:], ldq, q[n-l+i:], ldq, csq, snq)
+				}
+			}
+		}
+
+		if !upper {
+			// The matrices A13 and B13 were lower triangular at the start
+			// of the cycle, and are now upper triangular.
+			//
+			// Convergence test: test the parallelism of the corresponding
+			// rows of A and B.
+			var error float64
+			for i := 0; i < min(l, m-k); i++ {
+				bi.Dcopy(l-i, a[(k+i)*lda+n-l+i:], 1, work, 1)
+				bi.Dcopy(l-i, b[i*ldb+n-l+i:], 1, work[l:], 1)
+				ssmin := impl.Dlapll(l-i, work, 1, work[l:], 1)
+				error = math.Max(error, ssmin)
+			}
+			if math.Abs(error) <= minTol {
+				// The algorithm has converged.
+				// Compute the generalized singular value pairs (alpha, beta)
+				// and set the triangular matrix R to array A.
+				for i := 0; i < k; i++ {
+					alpha[i] = 1
+					beta[i] = 0
+				}
+
+				for i := 0; i < min(l, m-k); i++ {
+					a1 := a[(k+i)*lda+n-l+i]
+					b1 := b[i*ldb+n-l+i]
+
+					if a1 != 0 {
+						gamma := b1 / a1
+
+						// Change sign if necessary.
+						if gamma < 0 {
+							bi.Dscal(l-i, -1, b[i*ldb+n-l+i:], 1)
+							if wantv {
+								bi.Dscal(p, -1, v[i:], ldv)
+							}
+						}
+						beta[k+i], alpha[k+i], _ = impl.Dlartg(math.Abs(gamma), 1)
+
+						if alpha[k+i] >= beta[k+i] {
+							bi.Dscal(l-i, 1/alpha[k+i], a[(k+i)*lda+n-l+i:], 1)
+						} else {
+							bi.Dscal(l-i, 1/beta[k+i], b[i*ldb+n-l+i:], 1)
+							bi.Dcopy(l-i, b[i*ldb+n-l+i:], 1, a[(k+i)*lda+n-l+i:], 1)
+						}
+					} else {
+						alpha[k+i] = 0
+						beta[k+i] = 1
+						bi.Dcopy(l-i, b[i*ldb+n-l+i:], 1, a[(k+i)*lda+n-l+i:], 1)
+					}
+				}
+
+				for i := m; i < k+l; i++ {
+					alpha[i] = 0
+					beta[i] = 1
+				}
+				if k+l < n {
+					for i := k + l; i < n; i++ {
+						alpha[i] = 0
+						beta[i] = 0
+					}
+				}
+
+				return cycles, true
+			}
+		}
+	}
+
+	// The algorithm has not converged after maxit cycles.
+	return cycles, false
+}
diff --git a/native/general.go b/native/general.go
index 4a134331..43ec0439 100644
--- a/native/general.go
+++ b/native/general.go
@@ -20,7 +20,9 @@ var _ lapack.Float64 = Implementation{}
 // This list is duplicated in lapack/cgo. Keep in sync.
 const (
 	absIncNotOne    = "lapack: increment not one or negative one"
+	badAlpha        = "lapack: bad alpha length"
 	badAuxv         = "lapack: auxv has insufficient length"
+	badBeta         = "lapack: bad beta length"
 	badD            = "lapack: d has insufficient length"
 	badDecompUpdate = "lapack: bad decomp update"
 	badDiag         = "lapack: bad diag"
@@ -30,6 +32,7 @@ const (
 	badEVComp       = "lapack: bad EVComp"
 	badEVJob        = "lapack: bad EVJob"
 	badEVSide       = "lapack: bad EVSide"
+	badGSVDJob      = "lapack: bad GSVDJob"
 	badHowMany      = "lapack: bad HowMany"
 	badIlo          = "lapack: ilo out of range"
 	badIhi          = "lapack: ihi out of range"
diff --git a/native/lapack_test.go b/native/lapack_test.go
index 9db5a45f..7f107204 100644
--- a/native/lapack_test.go
+++ b/native/lapack_test.go
@@ -372,6 +372,10 @@ func TestDsytrd(t *testing.T) {
 	testlapack.DsytrdTest(t, impl)
 }
 
+func TestDtgsja(t *testing.T) {
+	testlapack.DtgsjaTest(t, impl)
+}
+
 func TestDtrcon(t *testing.T) {
 	testlapack.DtrconTest(t, impl)
 }
diff --git a/testlapack/dtgsja.go b/testlapack/dtgsja.go
new file mode 100644
index 00000000..b9466220
--- /dev/null
+++ b/testlapack/dtgsja.go
@@ -0,0 +1,165 @@
+// Copyright ©2017 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package testlapack
+
+import (
+	"math/rand"
+	"testing"
+
+	"github.com/gonum/blas"
+	"github.com/gonum/blas/blas64"
+	"github.com/gonum/floats"
+	"github.com/gonum/lapack"
+)
+
+type Dtgsjaer interface {
+	Dlanger
+	Dtgsja(jobU, jobV, jobQ lapack.GSVDJob, m, p, n, k, l int, a []float64, lda int, b []float64, ldb int, tola, tolb float64, alpha, beta, u []float64, ldu int, v []float64, ldv int, q []float64, ldq int, work []float64) (cycles int, ok bool)
+}
+
+func DtgsjaTest(t *testing.T, impl Dtgsjaer) {
+	rnd := rand.New(rand.NewSource(1))
+	for cas, test := range []struct {
+		m, p, n, k, l, lda, ldb, ldu, ldv, ldq int
+
+		ok bool
+	}{
+		{m: 5, p: 5, n: 5, k: 2, l: 2, lda: 0, ldb: 0, ldu: 0, ldv: 0, ldq: 0, ok: true},
+		{m: 5, p: 5, n: 5, k: 4, l: 1, lda: 0, ldb: 0, ldu: 0, ldv: 0, ldq: 0, ok: true},
+		{m: 5, p: 5, n: 10, k: 2, l: 2, lda: 0, ldb: 0, ldu: 0, ldv: 0, ldq: 0, ok: true},
+		{m: 5, p: 5, n: 10, k: 4, l: 1, lda: 0, ldb: 0, ldu: 0, ldv: 0, ldq: 0, ok: true},
+		{m: 5, p: 5, n: 10, k: 4, l: 2, lda: 0, ldb: 0, ldu: 0, ldv: 0, ldq: 0, ok: true},
+		{m: 10, p: 5, n: 5, k: 2, l: 2, lda: 0, ldb: 0, ldu: 0, ldv: 0, ldq: 0, ok: true},
+		{m: 10, p: 5, n: 5, k: 4, l: 1, lda: 0, ldb: 0, ldu: 0, ldv: 0, ldq: 0, ok: true},
+		{m: 10, p: 10, n: 10, k: 5, l: 3, lda: 0, ldb: 0, ldu: 0, ldv: 0, ldq: 0, ok: true},
+		{m: 10, p: 10, n: 10, k: 6, l: 4, lda: 0, ldb: 0, ldu: 0, ldv: 0, ldq: 0, ok: true},
+		{m: 5, p: 5, n: 5, k: 2, l: 2, lda: 10, ldb: 10, ldu: 10, ldv: 10, ldq: 10, ok: true},
+		{m: 5, p: 5, n: 5, k: 4, l: 1, lda: 10, ldb: 10, ldu: 10, ldv: 10, ldq: 10, ok: true},
+		{m: 5, p: 5, n: 10, k: 2, l: 2, lda: 20, ldb: 20, ldu: 10, ldv: 10, ldq: 20, ok: true},
+		{m: 5, p: 5, n: 10, k: 4, l: 1, lda: 20, ldb: 20, ldu: 10, ldv: 10, ldq: 20, ok: true},
+		{m: 5, p: 5, n: 10, k: 4, l: 2, lda: 20, ldb: 20, ldu: 10, ldv: 10, ldq: 20, ok: true},
+		{m: 10, p: 5, n: 5, k: 2, l: 2, lda: 10, ldb: 10, ldu: 20, ldv: 10, ldq: 10, ok: true},
+		{m: 10, p: 5, n: 5, k: 4, l: 1, lda: 10, ldb: 10, ldu: 20, ldv: 10, ldq: 10, ok: true},
+		{m: 10, p: 10, n: 10, k: 5, l: 3, lda: 20, ldb: 20, ldu: 20, ldv: 20, ldq: 20, ok: true},
+		{m: 10, p: 10, n: 10, k: 6, l: 4, lda: 20, ldb: 20, ldu: 20, ldv: 20, ldq: 20, ok: true},
+	} {
+		m := test.m
+		p := test.p
+		n := test.n
+		k := test.k
+		l := test.l
+		lda := test.lda
+		if lda == 0 {
+			lda = n
+		}
+		ldb := test.ldb
+		if ldb == 0 {
+			ldb = n
+		}
+		ldu := test.ldu
+		if ldu == 0 {
+			ldu = m
+		}
+		ldv := test.ldv
+		if ldv == 0 {
+			ldv = p
+		}
+		ldq := test.ldq
+		if ldq == 0 {
+			ldq = n
+		}
+
+		a := blockedUpperTriGeneral(m, n, k, l, lda, true, rnd)
+		aCopy := cloneGeneral(a)
+		b := blockedUpperTriGeneral(p, n, k, l, ldb, false, rnd)
+		bCopy := cloneGeneral(b)
+
+		tola := float64(max(m, n)) * impl.Dlange(lapack.NormFrob, m, n, a.Data, a.Stride, nil) * dlamchE
+		tolb := float64(max(p, n)) * impl.Dlange(lapack.NormFrob, p, n, b.Data, b.Stride, nil) * dlamchE
+
+		alpha := make([]float64, n)
+		beta := make([]float64, n)
+
+		work := make([]float64, 2*n)
+
+		u := nanGeneral(m, m, ldu)
+		v := nanGeneral(p, p, ldv)
+		q := nanGeneral(n, n, ldq)
+
+		_, ok := impl.Dtgsja(lapack.GSVDUnit, lapack.GSVDUnit, lapack.GSVDUnit,
+			m, p, n, k, l,
+			a.Data, a.Stride,
+			b.Data, b.Stride,
+			tola, tolb,
+			alpha, beta,
+			u.Data, u.Stride,
+			v.Data, v.Stride,
+			q.Data, q.Stride,
+			work)
+
+		if !ok {
+			if test.ok {
+				t.Errorf("test %d unexpectedly did not converge", cas)
+			}
+			continue
+		}
+
+		// Check orthogonality of U, V and Q.
+		if !isOrthonormal(u) {
+			t.Errorf("test %d: U is not orthogonal\n%+v", cas, u)
+		}
+		if !isOrthonormal(v) {
+			t.Errorf("test %d: V is not orthogonal\n%+v", cas, v)
+		}
+		if !isOrthonormal(q) {
+			t.Errorf("test %d: Q is not orthogonal\n%+v", cas, q)
+		}
+
+		// Check C^2 + S^2 = I.
+		var elements []float64
+		if m-k-l >= 0 {
+			elements = alpha[k : k+l]
+		} else {
+			elements = alpha[k:m]
+		}
+		for i := range elements {
+			i += k
+			d := alpha[i]*alpha[i] + beta[i]*beta[i]
+			if !floats.EqualWithinAbsOrRel(d, 1, 1e-14, 1e-14) {
+				t.Errorf("test %d: alpha_%d^2 + beta_%d^2 != 1: got: %v", cas, i, i, d)
+			}
+		}
+
+		zeroR, d1, d2 := constructGSVDresults(n, p, m, k, l, a, b, alpha, beta)
+
+		// Check U^T*A*Q = D1*[ 0 R ].
+		uTmp := nanGeneral(m, n, n)
+		blas64.Gemm(blas.Trans, blas.NoTrans, 1, u, aCopy, 0, uTmp)
+		uAns := nanGeneral(m, n, n)
+		blas64.Gemm(blas.NoTrans, blas.NoTrans, 1, uTmp, q, 0, uAns)
+
+		d10r := nanGeneral(m, n, n)
+		blas64.Gemm(blas.NoTrans, blas.NoTrans, 1, d1, zeroR, 0, d10r)
+
+		if !equalApproxGeneral(uAns, d10r, 1e-14) {
+			t.Errorf("test %d: U^T*A*Q != D1*[ 0 R ]\nU^T*A*Q:\n%+v\nD1*[ 0 R ]:\n%+v",
+				cas, uAns, d10r)
+		}
+
+		// Check V^T*B*Q = D2*[ 0 R ].
+		vTmp := nanGeneral(p, n, n)
+		blas64.Gemm(blas.Trans, blas.NoTrans, 1, v, bCopy, 0, vTmp)
+		vAns := nanGeneral(p, n, n)
+		blas64.Gemm(blas.NoTrans, blas.NoTrans, 1, vTmp, q, 0, vAns)
+
+		d20r := nanGeneral(p, n, n)
+		blas64.Gemm(blas.NoTrans, blas.NoTrans, 1, d2, zeroR, 0, d20r)
+
+		if !equalApproxGeneral(vAns, d20r, 1e-14) {
+			t.Errorf("test %d: V^T*B*Q != D2*[ 0 R ]\nV^T*B*Q:\n%+v\nD2*[ 0 R ]:\n%+v",
+				cas, vAns, d20r)
+		}
+	}
+}
diff --git a/testlapack/general.go b/testlapack/general.go
index ec8a38cb..0b092b4c 100644
--- a/testlapack/general.go
+++ b/testlapack/general.go
@@ -17,6 +17,9 @@ import (
 	"github.com/gonum/lapack"
 )
 
+// dlamchE is the machine epsilon. For IEEE this is 2^-53.
+const dlamchE = 1.0 / (1 << 53)
+
 func max(a, b int) int {
 	if a > b {
 		return a
@@ -138,6 +141,41 @@ func randomSchurCanonical(n, stride int, rnd *rand.Rand) blas64.General {
 	return t
 }
 
+// blockedUpperTriGeneral returns a normal random, general matrix in the form
+//
+//            c-k-l  k    l
+//  A =    k [  0   A12  A13 ] if r-k-l >= 0;
+//         l [  0    0   A23 ]
+//     r-k-l [  0    0    0  ]
+//
+//          c-k-l  k    l
+//  A =  k [  0   A12  A13 ] if r-k-l < 0;
+//     r-k [  0    0   A23 ]
+//
+// where the k×k matrix A12 and l×l matrix is non-singular
+// upper triangular. A23 is l×l upper triangular if r-k-l >= 0,
+// otherwise A23 is (r-k)×l upper trapezoidal.
+func blockedUpperTriGeneral(r, c, k, l, stride int, kblock bool, rnd *rand.Rand) blas64.General {
+	t := l
+	if kblock {
+		t += k
+	}
+	ans := zeros(r, c, stride)
+	for i := 0; i < min(r, t); i++ {
+		var v float64
+		for v == 0 {
+			v = rnd.NormFloat64()
+		}
+		ans.Data[i*ans.Stride+i+(c-t)] = v
+	}
+	for i := 0; i < min(r, t); i++ {
+		for j := i + (c - t) + 1; j < c; j++ {
+			ans.Data[i*ans.Stride+j] = rnd.NormFloat64()
+		}
+	}
+	return ans
+}
+
 // nanTriangular allocates a new r×c triangular matrix filled with NaN values.
 func nanTriangular(uplo blas.Uplo, n, stride int) blas64.Triangular {
 	if n < 0 {
@@ -1291,3 +1329,72 @@ func applyReflector(qh blas64.General, q blas64.General, v []float64) {
 		}
 	}
 }
+
+// constructGSVDresults returns the matrices [ 0 R ], D1 and D2 described
+// in the documentation of Dtgsja and Dggsvd3, and the result matrix in
+// the documentation for Dggsvp3.
+func constructGSVDresults(n, p, m, k, l int, a, b blas64.General, alpha, beta []float64) (zeroR, d1, d2 blas64.General) {
+	zeroR = zeros(k+l, n, n)
+	d1 = zeros(m, k+l, k+l)
+	d2 = zeros(p, k+l, k+l)
+	if m-k-l >= 0 {
+		// [ 0 R ]
+		dst := zeroR
+		dst.Cols = k + l
+		dst.Data = zeroR.Data[n-k-l:]
+		src := a
+		src.Cols = k + l
+		src.Data = a.Data[n-k-l:]
+		copyGeneral(dst, src)
+
+		// D1
+		for i := 0; i < k; i++ {
+			d1.Data[i*d1.Stride+i] = 1
+		}
+		for i := k; i < k+l; i++ {
+			d1.Data[i*d1.Stride+i] = alpha[i]
+		}
+
+		// D2
+		for i := 0; i < l; i++ {
+			d2.Data[i*d2.Stride+i+k] = beta[k+i]
+		}
+	} else {
+		// [ 0 R ]
+		dst := zeroR
+		dst.Rows = m
+		dst.Cols = k + l
+		dst.Data = zeroR.Data[n-k-l:]
+		src := a
+		src.Rows = m
+		src.Cols = k + l
+		src.Data = a.Data[n-k-l:]
+		copyGeneral(dst, src)
+		dst.Rows = k + l - m
+		dst.Cols = k + l - m
+		dst.Data = zeroR.Data[m*zeroR.Stride+n-(k+l-m):]
+		src = b
+		src.Rows = k + l - m
+		src.Cols = k + l - m
+		src.Data = b.Data[(m-k)*b.Stride+n+m-k-l:]
+		copyGeneral(dst, src)
+
+		// D1
+		for i := 0; i < k; i++ {
+			d1.Data[i*d1.Stride+i] = 1
+		}
+		for i := k; i < m; i++ {
+			d1.Data[i*d1.Stride+i] = alpha[i]
+		}
+
+		// D2
+		for i := 0; i < m-k; i++ {
+			d2.Data[i*d2.Stride+i+k] = beta[k+i]
+		}
+		for i := m - k; i < l; i++ {
+			d2.Data[i*d2.Stride+i+k] = 1
+		}
+	}
+
+	return zeroR, d1, d2
+}