remove orig

2025-10-20 13:55:20 +08:00 · 2016-01-06 21:38:42 -07:00
parent 7016112fd0
commit 909081d32b
13 changed files with 1059 additions and 1473 deletions
--- a/cgo/lapack.go
+++ b/cgo/lapack.go
@@ -498,6 +498,8 @@ func (impl Implementation) Dgels(trans blas.Transpose, m, n, nrhs int, a []float
 	return clapack.Dgels(trans, m, n, nrhs, a, lda, b, ldb)
 }

+const noSVDO = "dgesvd: not coded for overwrite"
+
 // Dgesvd computes the singular value decomposition of the input matrix A.
 //
 // The singular value decomposition is
@@ -509,12 +511,12 @@ func (impl Implementation) Dgels(trans blas.Transpose, m, n, nrhs int, a []float
 //
 // jobU and jobVT are options for computing the singular vectors. The behavior
 // is as follows
-//  jobU == lapack.SVDAll		All M columns of U are returned in u
+//  jobU == lapack.SVDAll       All m columns of U are returned in u
 //  jobU == lapack.SVDInPlace   The first min(m,n) columns are returned in u
 //  jobU == lapack.SVDOverwrite The first min(m,n) columns of U are written into a
 //  jobU == lapack.SVDNone      The columns of U are not computed.
 // The behavior is the same for jobVT and the rows of V^T. At most one of jobU
-// and jobVT can equal lapack.SVDOverwrite.
+// and jobVT can equal lapack.SVDOverwrite, and Dgesvd will panic otherwise.
 //
 // On entry, a contains the data for the m×n matrix A. During the call to Dgesvd
 // the data is overwritten. On exit, A contains the appropriate singular vectors
@@ -529,12 +531,12 @@ func (impl Implementation) Dgels(trans blas.Transpose, m, n, nrhs int, a []float
 // not used.
 //
 // vt contains the left singular vectors on exit, stored rowwise. If
-// jobV == lapack.SVDAll, vt is of size n×m. If jobV == lapack.SVDInPlace vt is
-// of size min(m,n)×n. If jobU == lapack.SVDOverwrite or lapack.SVDNone, vt is
+// jobV == lapack.SVDAll, vt is of size n×m. If jobVT == lapack.SVDInPlace vt is
+// of size min(m,n)×n. If jobVT == lapack.SVDOverwrite or lapack.SVDNone, vt is
 // not used.
 //
 // The C interface does not support providing temporary storage. To provide compatibility
-// with native, lwork == -1 will not run Dgeqrf but will instead write the minimum
+// with native, lwork == -1 will not run Dgesvd but will instead write the minimum
 // work necessary to work[0]. If len(work) < lwork, Dgeqrf will panic.
 //
 // Dgesvd returns whether the decomposition successfully completed.
@@ -551,13 +553,13 @@ func (impl Implementation) Dgesvd(jobU, jobVT lapack.SVDJob, m, n int, a []float
 		checkMatrix(min(m, n), n, vt, ldvt)
 	}
 	if jobU == lapack.SVDOverwrite && jobVT == lapack.SVDOverwrite {
-		panic("lapack: both jobU and jobVT are lapack.SVDOverwrite")
+		panic(noSVDO)
 	}
 	if len(s) < min(m, n) {
 		panic(badS)
 	}
-	if jobU != lapack.SVDAll || jobVT != lapack.SVDAll {
-		panic("lapack: SVD only coded for SVDAll job inputs")
+	if jobU == lapack.SVDOverwrite || jobVT == lapack.SVDOverwrite {
+		panic("lapack: SVD not coded to overwrite original matrix")
 	}
 	minWork := max(5*min(m, n), 3*min(m, n)+max(m, n))
 	if lwork != -1 {
@@ -572,7 +574,7 @@ func (impl Implementation) Dgesvd(jobU, jobVT lapack.SVDJob, m, n int, a []float
 		work[0] = float64(minWork)
 		return true
 	}
-	return clapack.Dgesvd(byte(jobU), byte(jobVT), m, n, a, lda, s, u, ldu, vt, ldvt, work)
+	return clapack.Dgesvd(lapack.Job(jobU), lapack.Job(jobVT), m, n, a, lda, s, u, ldu, vt, ldvt, work[1:])
 }

 // Dgetf2 computes the LU decomposition of the m×n matrix A.
--- a/cgo/lapack_test.go
+++ b/cgo/lapack_test.go
@@ -88,6 +88,10 @@ func TestDgeqrf(t *testing.T) {
 	testlapack.DgeqrfTest(t, impl)
 }

+func TestDgesvd(t *testing.T) {
+	testlapack.DgesvdTest(t, impl)
+}
+
 func TestDgetf2(t *testing.T) {
 	testlapack.Dgetf2Test(t, impl)
 }
--- a/lapack.go
+++ b/lapack.go
@@ -104,7 +104,7 @@ type SVDJob byte

 const (
 	SVDAll       SVDJob = 'A' // Compute all singular vectors
-	SVDInPlace          = 'S' // Compute the first singular vectors and store in provided storage.
-	SVDOverwrite        = 'O' // Compute the singular vectors and store in input matrix
+	SVDInPlace          = 'S' // Compute the first singular vectors and store them in provided storage.
+	SVDOverwrite        = 'O' // Compute the singular vectors and store them in input matrix
 	SVDNone             = 'N' // Do not compute singular vectors
 )
--- a/native/dbdsqr.go.orig
+++ b/native/dbdsqr.go.orig
@@ -1,497 +0,0 @@
-// Copyright ©2015 The gonum Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package native
-
-import (
-	"math"
-
-	"github.com/gonum/blas"
-	"github.com/gonum/blas/blas64"
-	"github.com/gonum/lapack"
-)
-
-// Dbdsqr performs a singular value decomposition of a real n×n bidiagonal matrix.
-//
-// The SVD of the bidiagonal matrix B is
-//  B = Q * S * P^T
-// where S is a diagonal matrix of singular values, Q is an orthogonal matrix of
-// left singular vectors, and P is an orthogonal matrix of right singular vectors.
-//
-// Q and P are only computed if requested. If left singular vectors are requested,
-// this routine returns U * Q instead of Q, and if right singular vectors are
-// requested P^T * VT is returned instead of P^T.
-//
-// Frequently Dbdsqr is used in conjuction with Dgebrd which reduces a general
-// matrix A into bidiagonal form. In this case, the SVD of A is
-//  A = (U * Q) * S * (P^T * VT)
-//
-// This routine may also compute Q^T * C.
-//
-// d and e contain the elements of the bidiagonal matrix b. d must have length at
-// least n, and e must have length at least n-1. Dbdsqr will panic if there is
-// insufficient length. On exit, D contains the singular values of B in decreasing
-// order.
-//
-// VT is a matrix of size n×ncvt whose elements are stored in vt. The elements
-// of vt are modified to contain P^T * VT on exit. VT is not used if ncvt == 0.
-//
-// U is a matrix of size nru×n whose elements are stored in u. The elements
-// of u are modified to contain U * Q on exit. U is not used if nru == 0.
-//
-// C is a matrix of size n×ncc whose elements are stored in c. The elements
-// of c are modified to contain Q^T * C on exit. C is not used if ncc == 0.
-//
-// work contains temporary storage and must have length at least 4*n. Dbdsqr
-// will panic if there is insufficient working memory.
-//
-// Dbdsqr returns whether the decomposition was successful.
-func (impl Implementation) Dbdsqr(uplo blas.Uplo, n, ncvt, nru, ncc int, d, e, vt []float64, ldvt int, u []float64, ldu int, c []float64, ldc int, work []float64) (ok bool) {
-	if uplo != blas.Upper && uplo != blas.Lower {
-		panic(badUplo)
-	}
-	if ncvt != 0 {
-		checkMatrix(n, ncvt, vt, ldvt)
-	}
-	if nru != 0 {
-		checkMatrix(nru, n, u, ldu)
-	}
-	if ncc != 0 {
-		checkMatrix(n, ncc, c, ldc)
-	}
-	if len(d) < n {
-		panic(badD)
-	}
-	if len(e) < n-1 {
-		panic(badE)
-	}
-	if len(work) < 4*n {
-		panic(badWork)
-	}
-	var info int
-	bi := blas64.Implementation()
-	const (
-		maxIter = 6
-	)
-	if n == 0 {
-		return true
-	}
-	if n != 1 {
-		// If the singular vectors do not need to be computed, use qd algorithm.
-		if !(ncvt > 0 || nru > 0 || ncc > 0) {
-			info = impl.Dlasq1(n, d, e, work)
-			// If info is 2 dqds didn't finish, and so try to.
-			if info != 2 {
-				return info == 0
-			}
-			info = 0
-		}
-		nm1 := n - 1
-		nm12 := nm1 + nm1
-		nm13 := nm12 + nm1
-		idir := 0
-
-		eps := dlamchE
-		unfl := dlamchS
-		lower := uplo == blas.Lower
-		var cs, sn, r float64
-		if lower {
-			for i := 0; i < n-1; i++ {
-				cs, sn, r = impl.Dlartg(d[i], e[i])
-				d[i] = r
-				e[i] = sn * d[i+1]
-				d[i+1] *= cs
-				work[i] = cs
-				work[nm1+i] = sn
-			}
-			if nru > 0 {
-				impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, nru, n, work, work[n-1:], u, ldu)
-			}
-			if ncc > 0 {
-				impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, n, ncc, work, work[n-1:], c, ldc)
-			}
-		}
-		// Compute singular values to a relative accuracy of tol. If tol is negative
-		// the values will be computed to an absolute accuracy of math.Abs(tol) * norm(b)
-		tolmul := math.Max(10, math.Min(100, eps*(-1.0/8)))
-		tol := tolmul * eps
-		var smax float64
-		for i := 0; i < n; i++ {
-			smax = math.Max(smax, math.Abs(d[i]))
-		}
-		for i := 0; i < n-1; i++ {
-			smax = math.Max(smax, math.Abs(e[i]))
-		}
-
-		var sminl float64
-		var thresh float64
-		if tol >= 0 {
-			sminoa := math.Abs(d[0])
-			if sminoa != 0 {
-				mu := sminoa
-				for i := 1; i < n; i++ {
-					mu = math.Abs(d[i]) * (mu / (mu + math.Abs(e[i-1])))
-					sminoa = math.Min(sminoa, mu)
-					if sminoa == 0 {
-						break
-					}
-				}
-			}
-			sminoa = sminoa / math.Sqrt(float64(n))
-			thresh = math.Max(tol*sminoa, float64(maxIter*n*n)*unfl)
-		} else {
-			thresh = math.Max(math.Abs(tol)*smax, float64(maxIter*n*n)*unfl)
-		}
-		// Prepare for the main iteration loop for the singular values.
-		maxIt := maxIter * n * n
-		iter := 0
-		oldl2 := -1
-		oldm := -1
-		// m points to the last element of unconverged part of matrix.
-		m := n
-
-	Outer:
-		for m > 1 {
-			if iter > maxIt {
-				info = 0
-				for i := 0; i < n-1; i++ {
-					if e[i] != 0 {
-						info++
-					}
-				}
-				return info == 0
-			}
-			// Find diagonal block of matrix to work on.
-			if tol < 0 && math.Abs(d[m-1]) <= thresh {
-				d[m-1] = 0
-			}
-			smax = math.Abs(d[m-1])
-			smin := smax
-			var l2 int
-			var broke bool
-			for l3 := 0; l3 < m-1; l3++ {
-				l2 = m - l3 - 2
-				abss := math.Abs(d[l2])
-				abse := math.Abs(e[l2])
-				if tol < 0 && abss <= thresh {
-					d[l2] = 0
-				}
-				if abse <= thresh {
-					broke = true
-					e[l2] = 0
-					if l2 == m-2 {
-						// Convergence of bottom singular value, return to top.
-						m--
-						continue Outer
-					}
-					l2++
-					break
-				}
-				smin = math.Min(smin, abss)
-				smax = math.Max(math.Max(smax, abss), abse)
-			}
-<<<<<<< HEAD
-			if broke {
-				e[l2] = 0
-				if l2 == m-2 {
-					// Convergence of bottom singular value, return to top.
-					m--
-					continue
-				}
-				l2++
-			} else {
-=======
-			if !broke {
->>>>>>> 618d4f1... simplify loop condition
-				l2 = 0
-			}
-			// e[ll] through e[m-2] are nonzero, e[ll-1] is zero
-			if l2 == m-2 {
-				// Handle 2×2 block separately.
-				var sinr, cosr, sinl, cosl float64
-				d[m-1], d[m-2], sinr, cosr, sinl, cosl = impl.Dlasv2(d[m-2], e[m-2], d[m-1])
-				e[m-2] = 0
-				if ncvt > 0 {
-					bi.Drot(ncvt, vt[(m-2)*ldvt:], 1, vt[(m-1)*ldvt:], 1, cosr, sinr)
-				}
-				if nru > 0 {
-					bi.Drot(nru, u[m-2:], ldu, u[m-1:], ldu, cosl, sinl)
-				}
-				if ncc > 0 {
-					bi.Drot(ncc, c[(m-2)*ldc:], 1, c[(m-1)*ldc:], 1, cosl, sinl)
-				}
-				m -= 2
-				continue
-			}
-			// If working on a new submatrix, choose shift direction from larger end
-			// diagonal element toward smaller.
-			if l2 > oldm-1 || m-1 < oldl2 {
-				if math.Abs(d[l2]) >= math.Abs(d[m-1]) {
-					idir = 1
-				} else {
-					idir = 2
-				}
-			}
-			// Apply convergence tests.
-			// TODO(btracey): There is a lot of similar looking code here. See
-			// if there is a better way to de-duplicate.
-			if idir == 1 {
-				// Run convergence test in forward direction.
-				// First apply standard test to bottom of matrix.
-				if math.Abs(e[m-2]) <= math.Abs(tol)*math.Abs(d[m-1]) || (tol < 0 && math.Abs(e[m-2]) <= thresh) {
-					e[m-2] = 0
-					continue
-				}
-				if tol >= 0 {
-					// If relative accuracy desired, apply convergence criterion forward.
-					mu := math.Abs(d[l2])
-					sminl = mu
-					for l3 := l2; l3 < m-1; l3++ {
-						if math.Abs(e[l3]) <= tol*mu {
-							e[l3] = 0
-							continue Outer
-						}
-						mu = math.Abs(d[l3+1]) * (mu / (mu + math.Abs(e[l3])))
-						sminl = math.Min(sminl, mu)
-					}
-				}
-			} else {
-				// Run convergence test in backward direction.
-				// First apply standard test to top of matrix.
-				if math.Abs(e[l2]) <= math.Abs(tol)*math.Abs(d[l2]) || (tol < 0 && math.Abs(e[l2]) <= thresh) {
-					e[l2] = 0
-					continue
-				}
-				if tol >= 0 {
-					// If relative accuracy desired, apply convergence criterion backward.
-					mu := math.Abs(d[m-1])
-					sminl = mu
-					for l3 := m - 2; l3 >= l2; l3-- {
-						if math.Abs(e[l3]) <= tol*mu {
-							e[l3] = 0
-							continue Outer
-						}
-						mu = math.Abs(d[l3]) * (mu / (mu + math.Abs(e[l3])))
-						sminl = math.Min(sminl, mu)
-					}
-				}
-			}
-			oldl2 = l2
-			oldm = m
-			// Compute shift. First, test if shifting would ruin relative accuracy,
-			// and if so set the shift to zero.
-			var shift float64
-			if tol >= 0 && float64(n)*tol*(sminl/smax) <= math.Max(eps, (1.0/100)*tol) {
-				shift = 0
-			} else {
-				var sl2 float64
-				if idir == 1 {
-					sl2 = math.Abs(d[l2])
-					shift, _ = impl.Dlas2(d[m-2], e[m-2], d[m-1])
-				} else {
-					sl2 = math.Abs(d[m-1])
-					shift, _ = impl.Dlas2(d[l2], e[l2], d[l2+1])
-				}
-				// Test if shift is negligible
-				if sl2 > 0 {
-					if (shift/sl2)*(shift/sl2) < eps {
-						shift = 0
-					}
-				}
-			}
-			iter += m - l2 + 1
-			// If no shift, do simplified QR iteration.
-			if shift == 0 {
-				if idir == 1 {
-					cs := 1.0
-					oldcs := 1.0
-					var sn, r, oldsn float64
-					for i := l2; i < m-1; i++ {
-						cs, sn, r = impl.Dlartg(d[i]*cs, e[i])
-						if i > l2 {
-							e[i-1] = oldsn * r
-						}
-						oldcs, oldsn, d[i] = impl.Dlartg(oldcs*r, d[i+1]*sn)
-						work[i-l2] = cs
-						work[i-l2+nm1] = sn
-						work[i-l2+nm12] = oldcs
-						work[i-l2+nm13] = oldsn
-					}
-					h := d[m-1] * cs
-					d[m-1] = h * oldcs
-					e[m-2] = h * oldsn
-					if ncvt > 0 {
-						impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncvt, work, work[n-1:], vt[l2*ldvt:], ldvt)
-					}
-					if nru > 0 {
-						impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, nru, m-l2, work[nm12:], work[nm13:], u[l2:], ldu)
-					}
-					if ncc > 0 {
-						impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncc, work[nm12:], work[nm13:], c[l2*ldc:], ldc)
-					}
-					if math.Abs(e[m-2]) < thresh {
-						e[m-2] = 0
-					}
-				} else {
-					cs := 1.0
-					oldcs := 1.0
-					var sn, r, oldsn float64
-					for i := m - 1; i >= l2+1; i-- {
-						cs, sn, r = impl.Dlartg(d[i]*cs, e[i-1])
-						if i < m-1 {
-							e[i] = oldsn * r
-						}
-						oldcs, oldsn, d[i] = impl.Dlartg(oldcs*r, d[i-1]*sn)
-						work[i-l2-1] = cs
-						work[i-l2+nm1-1] = -sn
-						work[i-l2+nm12-1] = oldcs
-						work[i-l2+nm13-1] = -oldsn
-					}
-					h := d[l2] * cs
-					d[l2] = h * oldcs
-					e[l2] = h * oldsn
-					if ncvt > 0 {
-						impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncvt, work[nm12:], work[nm13:], vt[l2*ldvt:], ldvt)
-					}
-					if nru > 0 {
-						impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward, nru, m-l2, work, work[n-1:], u[l2:], ldu)
-					}
-					if ncc > 0 {
-						impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncc, work, work[n-1:], c[l2*ldc:], ldc)
-					}
-					if math.Abs(e[l2]) <= thresh {
-						e[l2] = 0
-					}
-				}
-			} else {
-				// Use nonzero shift.
-				if idir == 1 {
-					// Chase bulge from top to bottom. Save cosines and sines for
-					// later singular vector updates.
-					f := (math.Abs(d[l2]) - shift) * (math.Copysign(1, d[l2]) + shift/d[l2])
-					g := e[l2]
-					var cosl, sinl float64
-					for i := l2; i < m-1; i++ {
-						cosr, sinr, r := impl.Dlartg(f, g)
-						if i > l2 {
-							e[i-1] = r
-						}
-						f = cosr*d[i] + sinr*e[i]
-						e[i] = cosr*e[i] - sinr*d[i]
-						g = sinr * d[i+1]
-						d[i+1] *= cosr
-						cosl, sinl, r = impl.Dlartg(f, g)
-						d[i] = r
-						f = cosl*e[i] + sinl*d[i+1]
-						d[i+1] = cosl*d[i+1] - sinl*e[i]
-						if i < m-2 {
-							g = sinl * e[i+1]
-							e[i+1] = cosl * e[i+1]
-						}
-						work[i-l2] = cosr
-						work[i-l2+nm1] = sinr
-						work[i-l2+nm12] = cosl
-						work[i-l2+nm13] = sinl
-					}
-					e[m-2] = f
-					if ncvt > 0 {
-						impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncvt, work, work[n-1:], vt[l2*ldvt:], ldvt)
-					}
-					if nru > 0 {
-						impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, nru, m-l2, work[nm12:], work[nm13:], u[l2:], ldu)
-					}
-					if ncc > 0 {
-						impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncc, work[nm12:], work[nm13:], c[l2*ldc:], ldc)
-					}
-					if math.Abs(e[m-2]) <= thresh {
-						e[m-2] = 0
-					}
-				} else {
-					// Chase bulge from top to bottom. Save cosines and sines for
-					// later singular vector updates.
-					f := (math.Abs(d[m-1]) - shift) * (math.Copysign(1, d[m-1]) + shift/d[m-1])
-					g := e[m-2]
-					for i := m - 1; i > l2; i-- {
-						cosr, sinr, r := impl.Dlartg(f, g)
-						if i < m-1 {
-							e[i] = r
-						}
-						f = cosr*d[i] + sinr*e[i-1]
-						e[i-1] = cosr*e[i-1] - sinr*d[i]
-						g = sinr * d[i-1]
-						d[i-1] *= cosr
-						cosl, sinl, r := impl.Dlartg(f, g)
-						d[i] = r
-						f = cosl*e[i-1] + sinl*d[i-1]
-						d[i-1] = cosl*d[i-1] - sinl*e[i-1]
-						if i > l2+1 {
-							g = sinl * e[i-2]
-							e[i-2] *= cosl
-						}
-						work[i-l2-1] = cosr
-						work[i-l2+nm1-1] = -sinr
-						work[i-l2+nm12-1] = cosl
-						work[i-l2+nm13-1] = -sinl
-					}
-					e[l2] = f
-					if math.Abs(e[l2]) <= thresh {
-						e[l2] = 0
-					}
-					if ncvt > 0 {
-						impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncvt, work[nm12:], work[nm13:], vt[l2*ldvt:], ldvt)
-					}
-					if nru > 0 {
-						impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward, nru, m-l2, work, work[n-1:], u[l2:], ldu)
-					}
-					if ncc > 0 {
-						impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncc, work, work[n-1:], c[l2*ldc:], ldc)
-					}
-				}
-			}
-		}
-	}
-
-	// All singular values converged, make them positive.
-	for i := 0; i < n; i++ {
-		if d[i] < 0 {
-			d[i] *= -1
-			if ncvt > 0 {
-				bi.Dscal(ncvt, -1, vt[i*ldvt:], 1)
-			}
-		}
-	}
-
-	// Sort the singular values in decreasing order.
-	for i := 0; i < n-1; i++ {
-		isub := 0
-		smin := d[0]
-		for j := 1; j < n-i; j++ {
-			if d[j] <= smin {
-				isub = j
-				smin = d[j]
-			}
-		}
-		if isub != n-i {
-			// Swap singular values and vectors.
-			d[isub] = d[n-i-1]
-			d[n-i-1] = smin
-			if ncvt > 0 {
-				bi.Dswap(ncvt, vt[isub*ldvt:], 1, vt[(n-i-1)*ldvt:], 1)
-			}
-			if nru > 0 {
-				bi.Dswap(nru, u[isub:], ldu, u[n-i-1:], ldu)
-			}
-			if ncc > 0 {
-				bi.Dswap(ncc, c[isub*ldc:], 1, c[(n-i-1)*ldc:], 1)
-			}
-		}
-	}
-	info = 0
-	for i := 0; i < n-1; i++ {
-		if e[i] != 0 {
-			info++
-		}
-	}
-	return info == 0
-}
--- a/native/dgebrd.go
+++ b/native/dgebrd.go
@@ -86,7 +86,7 @@ func (impl Implementation) Dgebrd(m, n int, a []float64, lda int, d, e, tauQ, ta
 				if lwork >= (m+n)*nbmin {
 					nb = lwork / (m + n)
 				} else {
-					nb = 1
+					nb = minmn
 					nx = minmn
 				}
 			}
--- a/native/dgesvd.go
+++ b/native/dgesvd.go
--- a/native/dgesvd.go.orig
+++ b/native/dgesvd.go.orig
@@ -1,721 +0,0 @@
-// Copyright ©2015 The gonum Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package native
-
-import (
-	"math"
-
-	"github.com/gonum/blas"
-	"github.com/gonum/blas/blas64"
-	"github.com/gonum/lapack"
-)
-
-// Dgesvd computes the singular value decomposition of the input matrix A.
-//
-// Only coded for jobU == lapack.SVDAll and jobVT == lapack.SVDAll.
-//
-// The singular value decomposition is
-//  A = U * Sigma * V^T
-// where Sigma is an m×n diagonal matrix containing the singular values of A,
-// U is an m×m orthogonal matrix and V is an n×n orthogonal matrix. The first
-// min(m,n) columns of U and V are the left and right singular vectors of A
-// respectively.
-//
-// jobU and jobVT are options for computing the singular vectors. The behavior
-// is as follows
-//  jobU == lapack.SVDAll		All M columns of U are returned in u
-//  jobU == lapack.SVDInPlace	The first min(m,n) columns are returned in u
-//  jobU == lapack.SVDOverwrite	The first min(m,n) columns of U are written into a
-//	jobU == lapack.SVDNone		The columns of U are not computed.
-// The behavior is the same for jobVT and the rows of V^T. At most one of jobU
-// and jobVT can equal lapack.SVDOverwrite.
-//
-// On entry, a contains the data for the m×n matrix A. During the call to Dgesvd
-// the data is overwritten. On exit, A contains the appropriate singular vectors
-// if either job is lapack.SVDOverwrite.
-//
-// s is a slice of length at least min(m,n) and on exit contains the singular
-// values in decreasing order.
-//
-// u contains the left singular vectors on exit, stored columnwise. If
-// jobU == lapack.SVDAll, u is of size m×m. If jobU == lapack.SVDInPlace u is
-// of size m×min(m,n). If jobU == lapack.SVDOverwrite or lapack.SVDNone, u is
-// not used.
-//
-// vt contains the left singular vectors on exit, stored rowwise. If
-// jobV == lapack.SVDAll, vt is of size n×m. If jobV == lapack.SVDInPlace vt is
-// of size min(m,n)×n. If jobU == lapack.SVDOverwrite or lapack.SVDNone, vt is
-// not used.
-//
-// work is a slice for storing temporary memory, and lwork is the usable size of
-// the slice. lwork must be at least max(5*min(m,n), 3*min(m,n)+max(m,n)).
-// If lwork == -1, instead of performing Dgesvd, the optimal work length will be
-// stored into work[0]. Dgesvd will panic if the working memory has insufficient
-// storage.
-//
-// Dgesvd returns whether the decomposition successfully completed.
-func (impl Implementation) Dgesvd(jobU, jobVT lapack.SVDJob, m, n int, a []float64, lda int, s, u []float64, ldu int, vt []float64, ldvt int, work []float64, lwork int) (ok bool) {
-	checkMatrix(m, n, a, lda)
-	if jobU == lapack.SVDAll {
-		checkMatrix(m, m, u, ldu)
-	} else if jobU == lapack.SVDInPlace {
-		checkMatrix(m, min(m, n), u, ldu)
-	}
-	if jobVT == lapack.SVDAll {
-		checkMatrix(n, n, vt, ldvt)
-	} else if jobVT == lapack.SVDInPlace {
-		checkMatrix(min(m, n), n, vt, ldvt)
-	}
-	if jobU == lapack.SVDOverwrite && jobVT == lapack.SVDOverwrite {
-		panic("lapack: both jobU and jobVT are lapack.SVDOverwrite")
-	}
-	if len(s) < min(m, n) {
-		panic(badS)
-	}
-	if jobU != lapack.SVDAll || jobVT != lapack.SVDAll {
-		panic("lapack: SVD only coded for SVDAll job inputs")
-	}
-	minWork := max(5*min(m, n), 3*min(m, n)+max(m, n))
-	if lwork != -1 {
-		if len(work) < lwork {
-			panic(badWork)
-		}
-		if lwork < minWork {
-			panic(badWork)
-		}
-	}
-	if m == 0 || n == 0 {
-		return true
-	}
-
-	minmn := min(m, n)
-
-	wantua := jobU == lapack.SVDAll
-	wantus := jobU == lapack.SVDInPlace
-	wantuas := wantua || wantus
-	wantuo := jobU == lapack.SVDOverwrite
-	wantun := jobU == lapack.None
-
-	wantva := jobVT == lapack.SVDAll
-	wantvs := jobVT == lapack.SVDInPlace
-	wantvas := wantva || wantvs
-	wantvo := jobVT == lapack.SVDOverwrite
-	wantvn := jobVT == lapack.None
-
-	bi := blas64.Implementation()
-	var mnthr int
-
-	// The netlib implementation checks has this at only length 1. Our implementation
-	// checks all input sizes before examining the l == -1 case.
-	dum := make([]float64, m*n)
-
-	// Compute optimal space for subroutines.
-	maxwrk := 1
-	opts := string(jobU) + string(jobVT)
-	var wrkbl, bdspac int
-	if m >= n {
-		mnthr = impl.Ilaenv(6, "DGESVD", opts, m, n, 0, 0)
-		bdspac = 5 * n
-		impl.Dgeqrf(m, n, a, lda, dum, dum, -1)
-		lwork_dgeqrf := int(dum[0])
-		impl.Dorgqr(m, n, n, a, lda, dum, dum, -1)
-		lwork_dorgqr_n := int(dum[0])
-		impl.Dorgqr(m, m, n, a, lda, dum, dum, -1)
-		lwork_dorgqr_m := int(dum[0])
-		impl.Dgebrd(n, n, a, lda, s, dum, dum, dum, dum, -1)
-		lwork_dgebrd := int(dum[0])
-		impl.Dorgbr(lapack.ApplyP, n, n, n, a, lda, dum, dum, -1)
-		lwork_dorgbr_p := int(dum[0])
-		impl.Dorgbr(lapack.ApplyQ, n, n, n, a, lda, dum, dum, -1)
-		lwork_dorgbr_q := int(dum[0])
-
-		if m >= mnthr {
-			// m >> n
-			if wantun {
-				// Path 1
-				maxwrk = n + lwork_dgeqrf
-				maxwrk = max(maxwrk, 3*n+lwork_dgebrd)
-				if wantvo || wantvas {
-					maxWork = max(maxwrk, 3*n+lwork_dorgbr_p)
-				}
-				maxwrk = max(maxwrk, bdspac)
-			} else if wantuo && wantvn {
-				// Path 2
-				wrkbl = n + lwork_dgeqrf
-				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
-				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = max(n*n+wrkbl, n*n+m*n+n)
-			} else if wantuo && wantvs {
-				// Path 3
-				// or lapack.All
-				wrkbl = n + lwork_dgeqrf
-				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
-				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = max(n*n+wrkbl, n*n+m*n+n)
-			} else if wantus && wantvn {
-				// Path 4
-				wrkbl = n + lwork_dgeqrf
-				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
-				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = n*n + wrkbl
-			} else if wantus && wantvo {
-				// Path 5
-				wrkbl = n + lwork_dgeqrf
-				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
-				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = 2*n*n + wrkbl
-			} else if wantus && wantvas {
-				// Path 6
-				wrkbl = n + lwork_dgeqrf
-				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
-				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = n*n + wrkbl
-			} else if wantua && wantvn {
-				// Path 7
-				wrkbl = n + lwork_dgeqrf
-				wrkbl = max(wrkbl, n+lwork_dorgqr_m)
-				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = n*n + wrkbl
-			} else if wantua && wantvo {
-				// Path 8
-				wrkbl = n + lwork_dgeqrf
-				wrkbl = max(wrkbl, n+lwork_dorgqr_m)
-				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = 2*n*n + wrkbl
-			} else if wantua && wantvas {
-				// Path 9
-				wrkbl = n + lwork_dgeqrf
-				wrkbl = max(wrkbl, n+lwork_dorgqr_m)
-				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = n*n + wrkbl
-			}
-		} else {
-			// Path 10: m > n
-			impl.Dgebrd(m, n, a, lda, s, dum, dum, dum, dum, -1)
-			lwork_dgebrd := int(dum[0])
-			maxwrk = 3*n + lwork_dgebrd
-			if wantus || wantuo {
-				impl.Dorgbr(lapack.ApplyQ, m, n, n, a, lda, dum, dum, -1)
-				lwork_dorgbr_q = int(dum[0])
-				maxwrk = max(maxwrk, 3*n+lwork_dorgbr_q)
-			}
-			if wantua {
-				impl.Dorgbr(lapack.ApplyQ, m, m, n, a, lda, dum, dum, -1)
-				lwork_dorgbr_q := int(dum[0])
-				maxwrk = max(maxwrk, 3*n+lwork_dorgbr_p)
-			}
-			if !wantvn {
-				maxwrk = max(maxwrk, 3*n+lwork_dorgbr_p)
-			}
-			maxwrk = max(maxwrk, bdspac)
-		}
-	} else {
-		mnthr = impl.Ilaenv(6, "DGESVD", opts, m, n, 0, 0)
-		bdspac = 5 * m
-		impl.Dgelqf(m, n, a, lda, dum, dum, -1)
-		lwork_dgelqf := int(dum[0])
-		impl.Dorglq(n, n, m, dum, n, dum, dum, -1)
-		lwork_dorglq_n := int(dum[0])
-		impl.Dorglq(m, n, m, a, lda, dum, dum, -1)
-		lwork_dorglq_m := int(dum[0])
-		impl.Dgebrd(m, m, a, lda, s, dum, dum, dum, dum, -1)
-		lwork_dgebrd := int(dum[0])
-		impl.Dorgbr(lapack.ApplyP, m, m, m, a, n, dum, dum, -1)
-		lwork_dorgbr_p := int(dum[0])
-		impl.Dorgbr(lapack.ApplyQ, m, m, m, a, n, dum, dum, -1)
-		lwork_dorgbr_q := int(dum[0])
-		if n >= mnthr {
-			// n >> m
-			if wantvn {
-				// Path 1t
-				maxwrk = m + lwork_dgelqf
-				maxwrk = max(maxwrk, 3*m+lwork_dgebrd)
-				if wntuo.OR.wntuas {
-					maxwrk = max(maxwrk, 3*m+lwork_dorgbr_q)
-				}
-				maxwrk = max(maxwrk, bdspac)
-			} else if wantvo && wantun {
-				// Path 2t
-				wrkbl = m + lwork_dgelqf
-				wrkbl = max(wrkbl, m+lwork_dorglq_m)
-				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = max(m*m+wrkbl, m*m+m*n+m)
-			} else if wantvo && wantuas {
-				// Path 3t
-				wrkbl = m + lwork_dgelqf
-				wrkbl = max(wrkbl, m+lwork_dorglq_m)
-				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = max(m*m+wrkbl, m*m+m*n+m)
-			} else if wantvs && wantun {
-				// Path 4t
-				wrkbl = m + lwork_dgelqf
-				wrkbl = max(wrkbl, m+lwork_dorglq_m)
-				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = m*m + wrkbl
-			} else if wantvs && wantuo {
-				// Path 5t
-				wrkbl = m + lwork_dgelqf
-				wrkbl = max(wrkbl, m+lwork_dorglq_m)
-				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = 2*m*m + wrkbl
-			} else if wantvs && wantuas {
-				// Path 6t
-				wrkbl = m + lwork_dgelqf
-				wrkbl = max(wrkbl, m+lwork_dorglq_m)
-				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = m*m + wrkbl
-			} else if wantva && wantun {
-				// Path 7t
-				wrkbl = m + lwork_dgelqf
-				wrkbl = max(wrkbl, m+lwork_dorglq_n)
-				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = m*m + wrkbl
-			} else if wantva && wantuo {
-				// Path 8t
-				wrkbl = m + lwork_dgelqf
-				wrkbl = max(wrkbl, m+lwork_dorglq_n)
-				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = 2*m*m + wrkbl
-			} else if wantva && wantuas {
-				// Path 9t
-				wrkbl = m + lwork_dgelqf
-				wrkbl = max(wrkbl, m+lwork_dorglq_n)
-				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
-				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
-				wrkbl = max(wrkbl, bdspac)
-				maxwrk = m*m + wrkbl
-			}
-		} else {
-			// Path 10t, n > m
-			impl.Dgebrd(m, n, a, lda, s, dum, dum, dum, dum, -1)
-			lwork_dgebrd = int(dum[0])
-			maxwrk := 3*m + lwork_dgebrd
-			if wantvs || wantvo {
-				impl.Dorgbr(lapack.ApplyP, m, n, m, a, n, dum, dum, -1)
-				lwork_dorgbr_p = int(dum[0])
-				maxwrk = max(maxwrk, 3*m+lwork_dorgbr_p)
-			}
-			if wantva {
-				impl.Dorgbr(lapack.ApplyP, n, n, m, a, n, dum, dum, -1)
-				lwork_dorgbr_p = int(dum[0])
-				maxwrk = max(maxwrk, 3*m+lwork_dorgbr_p)
-			}
-			if !wantun {
-				maxwrk = max(maxwrk, 3*m+lwork_dorgbr_q)
-			}
-			maxwrk = max(maxwrk, bdspac)
-		}
-	}
-	maxwrk = max(maxwrk, minWork)
-	work[0] = maxwrk
-	if lwork == -1 {
-		return true
-	}
-
-	// Perform decomposition.
-	eps := dlamchE
-	smlnum := math.Sqrt(dlamchS) / eps
-	bignum := 1 / smlnum
-
-	// Scale A if max element outside range [smlnum, bignum]
-	anrm := impl.Dlange(lapack.MaxAbs, m, n, a, lda, dum)
-	iscl := 0
-	if anrm > 0 && anrm < smlnum {
-		iscl = 1
-		impl.Dlascl(lapack.General, 0, 0, anrm, smlnum, m, n, a, lda)
-	} else if anrm > bignum {
-		iscl = 1
-		impl.Dlascl(lapack.General, 0, 0, anrm, bignum, m, n, a, lda)
-	}
-
-	var ie int
-	if m >= n {
-		// If A has sufficiently more rows than columns, use the QR decomposition.
-		if m >= mnthr {
-			if wantun {
-				// Path 1
-<<<<<<< HEAD
-				itau = 1
-				iwo
-			}
-		}
-	}
-=======
-				itau := 0
-				iwork := itau + n
-
-				// Compute A = Q * R
-				impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
-				// Zero out below R
-				impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda)
-				ie = 0
-				itauq := ie + n
-				itaup := itauq + n
-				iwork = itaup + n
-				// Bidiagonalize R in A
-				impl.Dgebrd(n, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
-				ncvt := 0
-				if wantvo || wantvas {
-					// Generate P^T.
-					impl.Dorgbr(lapack.ApplyP, n, n, n, a, lda, work[itaup:], work[iwork:], lwork-iwork)
-					ncvt = n
-				}
-				iwork = ie + n
-
-				// Perform bidiagonal QR iteration computing right singular vectors
-				// of A in A if desired.
-				ok = impl.Dbdsqr(blas.Upper, n, ncvt, 0, 0, s, work[ie:], a, lda, dum, 1, dum, 1, work[iwork:])
-
-				// If right singular vectors desired in VT, copy them there.
-				if wantvas {
-					impl.Dlacpy(blas.All, n, n, a, lda, vt, ldvt)
-				}
-			} else if wantuo && wantvn {
-				// Path 2.
-				/*
-					if lwork >= n*n+max(4*n, bdspac) {
-						// Sufficient workspace for a fast algorithm
-						ir := 1
-						var ldworku, ldworkr int
-						if lwork >= max(workbl, lda*n+n)+lda*n {
-							// work(iu) and work(ir) are n×lda
-							ldworku = lda
-							ldworkr = lda
-						} else if lwork >= max(workbl, lda*n+n)+n*n {
-							// work(iu) is ldwrku×n
-						}
-						itau := ir + ldworkr*n
-						iwork := itau + n
-
-						// Compute A = Q*R
-						impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
-
-						// Copy R to work(ir) and zero out below it.
-						impl.Dlacpy(blas.Upper, n, n, a, lda, b, ldb)
-					}
-				*/
-				panic("not implemented")
-			} else if wantua {
-				if wantvas {
-					// Path 9
-					if lwork >= n*n+max(max(n+m, 4*n), bdspac) {
-						// Sufficient workspace for a fast algorithm
-						iu := 0
-						var ldworku int
-						if lwork >= wrkbl+lda*n {
-							ldworku = lda
-						} else {
-							ldworku = n
-						}
-						itau := iu + ldworku*n
-						iwork := itau + n
-
-						// Compute A = Q * R, copying result to U
-						impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
-						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)
-
-						// Generate Q in U
-						impl.Dorgqr(m, m, n, u, ldu, work[itau:], work[iwork:], lwork-iwork)
-						// Copy R from A to VT, zeroing out below it.
-						impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt)
-						if n > 1 {
-							impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, vt[ldvt:], ldvt)
-						}
-						ie = itau
-						itauq := ie + n
-						itaup := itauq + n
-						iwork = itaup + n
-
-						// Bidiagonalize R in VT
-						impl.Dgebrd(n, n, vt, ldvt, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
-
-						// Multiply Q in U by left bidiagonalizing vectors in VT
-						impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n, vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork)
-
-						// Generate right bidiagonalizing vectors in VT
-						impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork)
-						iwork = ie + n
-
-						// Perform bidiagonal QR iteration, computing left singular
-						// vectors of A in U and computing right singular vectors
-						// of A in VT.
-						ok = impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:])
-					}
-				}
-			}
-		} else {
-			// Path 10.
-			ie = 0
-			itauq := ie + n
-			itaup := itauq + n
-			iwork := itaup + n
-
-			// Bidiagonalize A
-			impl.Dgebrd(m, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
-			if wantuas {
-				// Left singular vectors are desired in U. Copy result to U and
-				// generate left biadiagonalizing vectors in U.
-				impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)
-				var ncu int
-				if wantus {
-					ncu = n
-				}
-				if wantua {
-					ncu = m
-				}
-				impl.Dorgbr(lapack.ApplyQ, m, ncu, n, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
-			}
-			if wantvas {
-				// Right singular vectors are desired in VT. Copy result to VT and
-				// generate left biadiagonalizing vectors in VT.
-				impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt)
-				impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork)
-			}
-			if wantuo {
-				panic("not implemented")
-			}
-			if wantvo {
-				panic("not implemented")
-			}
-			iwork = ie + n
-			var nru, ncvt int
-			if wantuas || wantuo {
-				nru = m
-			}
-			if wantun {
-				nru = 0
-			}
-			if wantvas || wantvo {
-				ncvt = n
-			}
-			if wantvn {
-				ncvt = 0
-			}
-			if !wantuo && !wantvo {
-				// Perform bidiagonal QR iteration, if desired, computing left
-				// singular vectors in U and right singular vectors in VT.
-				ok = impl.Dbdsqr(blas.Upper, n, ncvt, nru, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:])
-			} else {
-				panic("not implemented")
-			}
-		}
-	} else {
-		// A has more columns than rows. If A has sufficiently more columns than
-		// rows, first reduce using the LQ decomposition.
-		if n >= mnthr {
-			if wantva {
-				if wantuas {
-					// Path 9t
-					if lwork >= m*m+max(max(m+n, 4*m), bdspac) {
-						// Sufficient workspace for a fast algorithm.
-						iu := 0
-						var ldworku int
-						if lwork >= wrkbl+lda*m {
-							ldworku = lda
-						} else {
-							ldworku = m
-						}
-						itau := iu + ldworku*m
-						iwork := itau + m
-
-						// Generate A = L * Q copying result to VT
-						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
-						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
-
-						// Generate Q in VT
-						impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
-
-						// Copy L to work[iu], zeroing out above it.
-						impl.Dlacpy(blas.Lower, m, m, a, lda, work[iu:], ldworku)
-						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[iu+ldworku:], ldworku)
-						ie = itau
-						itauq := ie + m
-						itaup := itauq + m
-						iwork = itaup + m
-
-						// Bidiagonalize L in work[iu], copying result to U
-						impl.Dgebrd(m, m, work[iu:], ldworku, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
-						impl.Dlacpy(blas.Lower, m, m, work[iu:], ldworku, u, ldu)
-
-						// Generate right bidiagonalizing vectors in work[iu]
-						impl.Dorgbr(lapack.ApplyP, m, m, m, work[iu:], ldworku, work[itaup:], work[iwork:], lwork-iwork)
-
-						// Generate left bidiagonalizing vectors in U.
-						impl.Dorgbr(lapack.ApplyQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
-						iwork = ie + m
-
-						// Perform bidiagonal QR iteration, computing left singular
-						// vectors of L in U and computing right singular vectors
-						// of L in work[iu]
-						ok = impl.Dbdsqr(blas.Upper, m, m, m, 0, s, work[ie:], work[iu:], ldworku, u, ldu, dum, 1, work[iwork:])
-
-						// Multiply right singular vectors of L in work[iu:]
-						// Q in VT, storing result in A.
-						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1, work[iu:], ldworku, vt, ldvt, 0, a, lda)
-						// Copy right singular vectors of A from A to VT
-						impl.Dlacpy(blas.All, m, n, a, lda, vt, ldvt)
-					} else {
-						// Insufficient workspace for a fast algorithm.
-						itau := 0
-						iwork := itau + m
-
-						// Compute A = L * Q, copying result to VT
-						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
-						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
-
-						// Generate Q in VT
-						impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
-
-						// Copy L to U, zeroing out above it.
-						impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
-						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, u[1:], ldu)
-
-						ie = itau
-						itauq := ie + m
-						itaup := itauq + m
-						iwork = itaup + m
-
-						// Bidiagonalize L in U
-						impl.Dgebrd(m, m, u, ldu, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
-
-						// Multiply right bidiagonalizing vectors in U by Q in VT.
-						impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m, u, ldu, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork)
-
-						// Generate left bidiagonalizing vectors in U
-						impl.Dorgbr(lapack.ApplyQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
-						iwork = ie + m
-
-						// Perform bidiagonal QR iteration, computing left singular
-						// vectors of A in U and computing right singular vectors
-						// of A in VT.
-						ok = impl.Dbdsqr(blas.Upper, m, n, m, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:])
-					}
-				} else {
-					panic("not implemented")
-				}
-			}
-		} else {
-			// Path 10t
-			ie = 0
-			itauq := ie + m
-			itaup := itauq + m
-			iwork := itaup + m
-
-			// Bidiagonalize A
-			impl.Dgebrd(m, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
-			if wantuas {
-				// If left singular vectors desired in U, copy result to U and
-				// generate left bidiagonalizing vectors in U.
-				impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
-				impl.Dorgbr(lapack.ApplyQ, m, m, n, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
-			}
-			if wantvas {
-				// If right singular vectors desired in VT, copy result to VT
-				// and generate right bidiagonalizing vectors in VT.
-				impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
-				var nrvt int
-				if wantva {
-					nrvt = n
-				} else {
-					nrvt = m
-				}
-				impl.Dorgbr(lapack.ApplyP, nrvt, n, m, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork)
-			}
-			if wantuo {
-				panic("not implemented")
-			}
-			if wantvo {
-				panic("not implemented")
-			}
-			iwork = ie + m
-			var nru, ncvt int
-			if wantuas || wantuo {
-				nru = m
-			}
-			if wantvas || wantvo {
-				ncvt = n
-			}
-			if !wantvo && !wantvo {
-				// Perform bidiagonal QR iteration, if desired, computing left
-				// singular vectors in U and computing right singular vectors in
-				// VT.
-				ok = impl.Dbdsqr(blas.Lower, m, ncvt, nru, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:])
-			} else {
-				panic("not implemented")
-			}
-		}
-	}
-	minmn := min(m, n)
-	if !ok {
-		if ie > 1 {
-			for i := 0; i < minmn-1; i++ {
-				work[i+1] = work[i+ie]
-			}
-		}
-		if ie < 1 {
-			for i := minmn - 2; i >= 0; i-- {
-				work[i+1] = work[i+ie]
-			}
-		}
-	}
-	// Undo scaling if necessary
-	if iscl == 1 {
-		if anrm > bignum {
-			impl.Dlascl(lapack.General, 0, 0, bignum, anrm, minmn, 1, s, minmn)
-		}
-		if ok && anrm > bignum {
-			impl.Dlascl(lapack.General, 0, 0, bignum, anrm, minmn-1, 1, work[minmn:], minmn)
-		}
-		if anrm < smlnum {
-			impl.Dlascl(lapack.General, 0, 0, smlnum, anrm, minmn, 1, s, minmn)
-		}
-		if ok && anrm < smlnum {
-			impl.Dlascl(lapack.General, 0, 0, smlnum, anrm, minmn-1, 1, work[minmn:], minmn)
-		}
-	}
-	work[0] = float64(maxwrk)
-	return ok
->>>>>>> fd73640... Add a partial implementation of Dgesvd and test
-}
--- a/native/dlacpy.go
+++ b/native/dlacpy.go
@@ -24,11 +24,10 @@ func (impl Implementation) Dlacpy(uplo blas.Uplo, m, n int, a []float64, lda int

 	case blas.Lower:
 		for i := 0; i < m; i++ {
-			for j := 0; j < min(i, n); j++ {
+			for j := 0; j < min(i+1, n); j++ {
 				b[i*ldb+j] = a[i*lda+j]
 			}
 		}
-
 	case blas.All:
 		for i := 0; i < m; i++ {
 			for j := 0; j < n; j++ {
--- a/native/dlaset.go
+++ b/native/dlaset.go
@@ -20,7 +20,7 @@ func (impl Implementation) Dlaset(uplo blas.Uplo, m, n int, alpha, beta float64,
 		}
 	} else if uplo == blas.Lower {
 		for i := 0; i < m; i++ {
-			for j := 0; j < i; j++ {
+			for j := 0; j < min(i+1, n); j++ {
 				a[i*lda+j] = alpha
 			}
 		}
--- a/native/ilaenv.go
+++ b/native/ilaenv.go
@@ -355,7 +355,7 @@ func (Implementation) Ilaenv(ispec int, s string, opts string, n1, n2, n3, n4 in
 		return 2
 	case 6:
 		// Used by xGELSS and xGESVD
-		return min(n1, n2) * 1e6
+		return int(float64(min(n1, n2)) * 1.6)
 	case 7:
 		// Not used
 		return 1
--- a/testlapack/dbdsqr.go
+++ b/testlapack/dbdsqr.go
@@ -35,6 +35,8 @@ func DbdsqrTest(t *testing.T, impl Dbdsqrer) {
 			{10, 10, 10, 10, 30, 40, 50},
 			{10, 12, 11, 13, 30, 40, 50},
 			{20, 12, 13, 11, 30, 40, 50},
+
+			{130, 130, 130, 500, 900, 900, 500},
 		} {
 			for cas := 0; cas < 100; cas++ {
 				n := test.n
--- a/testlapack/dgebrd.go
+++ b/testlapack/dgebrd.go
@@ -46,8 +46,9 @@ func DgebrdTest(t *testing.T, impl Dgebrder) {
 		for i := range a {
 			a[i] = rand.NormFloat64()
 		}
+
 		d := make([]float64, minmn)
-		e := make([]float64, minmn)
+		e := make([]float64, minmn-1)
 		tauP := make([]float64, minmn)
 		tauQ := make([]float64, minmn)
 		work := make([]float64, max(m, n))
@@ -78,6 +79,21 @@ func DgebrdTest(t *testing.T, impl Dgebrder) {
 		impl.Dgebrd(m, n, a, lda, d, e, tauQ, tauP, work, lwork)
 		work = make([]float64, int(work[0]))
 		lwork = len(work)
+		for i := range work {
+			work[i] = math.NaN()
+		}
+		for i := range d {
+			d[i] = math.NaN()
+		}
+		for i := range e {
+			e[i] = math.NaN()
+		}
+		for i := range tauQ {
+			tauQ[i] = math.NaN()
+		}
+		for i := range tauP {
+			tauP[i] = math.NaN()
+		}
 		impl.Dgebrd(m, n, a, lda, d, e, tauQ, tauP, work, lwork)

 		// Test answers
--- a/testlapack/dgesvd.go
+++ b/testlapack/dgesvd.go
@@ -25,19 +25,37 @@ func DgesvdTest(t *testing.T, impl Dgesvder) {
 	// TODO(btracey): Add tests for m > mnthr and n > mnthr when other SVD
 	// conditions are implemented. Right now mnthr is 5,000,000 which is too
 	// large to create a square matrix of that size.
-	for _, jobU := range []lapack.SVDJob{lapack.SVDAll} {
-		for _, jobVT := range []lapack.SVDJob{lapack.SVDAll} {
 	for _, test := range []struct {
 		m, n, lda, ldu, ldvt int
 	}{
 		{5, 5, 0, 0, 0},
-				{5, 7, 0, 0, 0},
-				{7, 5, 0, 0, 0},
+		{5, 6, 0, 0, 0},
+		{6, 5, 0, 0, 0},
+		{5, 9, 0, 0, 0},
+		{9, 5, 0, 0, 0},

 		{5, 5, 10, 11, 12},
-				{5, 7, 10, 11, 12},
-				{7, 5, 10, 11, 12},
+		{5, 6, 10, 11, 12},
+		{6, 5, 10, 11, 12},
+		{5, 5, 10, 11, 12},
+		{5, 9, 10, 11, 12},
+		{9, 5, 10, 11, 12},
+
+		{300, 300, 0, 0, 0},
+		{300, 400, 0, 0, 0},
+		{400, 300, 0, 0, 0},
+		{300, 600, 0, 0, 0},
+		{600, 300, 0, 0, 0},
+
+		{300, 300, 400, 450, 460},
+		{300, 400, 500, 550, 560},
+		{400, 300, 550, 550, 560},
+		{300, 600, 700, 750, 760},
+		{600, 300, 700, 750, 760},
 	} {
+		jobU := lapack.SVDAll
+		jobVT := lapack.SVDAll
+
 		m := test.m
 		n := test.n
 		lda := test.lda
@@ -68,6 +86,10 @@ func DgesvdTest(t *testing.T, impl Dgesvder) {
 			vt[i] = rand.NormFloat64()
 		}

+		uAllOrig := make([]float64, len(u))
+		copy(uAllOrig, u)
+		vtAllOrig := make([]float64, len(vt))
+		copy(vtAllOrig, vt)
 		aCopy := make([]float64, len(a))
 		copy(aCopy, a)

@@ -76,10 +98,114 @@ func DgesvdTest(t *testing.T, impl Dgesvder) {
 		work := make([]float64, 1)
 		impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, u, ldu, vt, ldvt, work, -1)

+		if !floats.Equal(a, aCopy) {
+			t.Errorf("a changed during call to get work length")
+		}
+
 		work = make([]float64, int(work[0]))
 		impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, u, ldu, vt, ldvt, work, len(work))

-				// Test the decomposition
+		errStr := fmt.Sprintf("m = %v, n = %v, lda = %v, ldu = %v, ldv = %v", m, n, lda, ldu, ldvt)
+		svdCheck(t, false, errStr, m, n, s, a, u, ldu, vt, ldvt, aCopy, lda)
+		svdCheckPartial(t, impl, lapack.SVDAll, errStr, uAllOrig, vtAllOrig, aCopy, m, n, a, lda, s, u, ldu, vt, ldvt, work, false)
+
+		// Test InPlace
+		jobU = lapack.SVDInPlace
+		jobVT = lapack.SVDInPlace
+		copy(a, aCopy)
+		copy(u, uAllOrig)
+		copy(vt, vtAllOrig)
+
+		impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, u, ldu, vt, ldvt, work, len(work))
+		svdCheck(t, true, errStr, m, n, s, a, u, ldu, vt, ldvt, aCopy, lda)
+		svdCheckPartial(t, impl, lapack.SVDInPlace, errStr, uAllOrig, vtAllOrig, aCopy, m, n, a, lda, s, u, ldu, vt, ldvt, work, false)
+	}
+}
+
+// svdCheckPartial checks that the singular values and vectors are computed when
+// not all of them are computed.
+func svdCheckPartial(t *testing.T, impl Dgesvder, job lapack.SVDJob, errStr string, uAllOrig, vtAllOrig, aCopy []float64, m, n int, a []float64, lda int, s, u []float64, ldu int, vt []float64, ldvt int, work []float64, shortWork bool) {
+	jobU := job
+	jobVT := job
+	// Compare the singular values when computed with {SVDNone, SVDNone.}
+	sCopy := make([]float64, len(s))
+	copy(sCopy, s)
+	copy(a, aCopy)
+	for i := range s {
+		s[i] = rand.Float64()
+	}
+	tmp1 := make([]float64, 1)
+	tmp2 := make([]float64, 1)
+	jobU = lapack.SVDNone
+	jobVT = lapack.SVDNone
+
+	impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, tmp1, ldu, tmp2, ldvt, work, -1)
+	work = make([]float64, int(work[0]))
+	lwork := len(work)
+	if shortWork {
+		lwork--
+	}
+	ok := impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, tmp1, ldu, tmp2, ldvt, work, lwork)
+	if !ok {
+		t.Errorf("Dgesvd did not complete successfully")
+	}
+	if !floats.EqualApprox(s, sCopy, 1e-10) {
+		t.Errorf("Singular value mismatch when singular vectors not computed: %s", errStr)
+	}
+	// Check that the singular vectors are correctly computed when the other
+	// is none.
+	uAll := make([]float64, len(u))
+	copy(uAll, u)
+	vtAll := make([]float64, len(vt))
+	copy(vtAll, vt)
+
+	// Copy the original vectors so the data outside the matrix bounds is the same.
+	copy(u, uAllOrig)
+	copy(vt, vtAllOrig)
+
+	jobU = job
+	jobVT = lapack.SVDNone
+	copy(a, aCopy)
+	for i := range s {
+		s[i] = rand.Float64()
+	}
+	impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, u, ldu, tmp2, ldvt, work, -1)
+	work = make([]float64, int(work[0]))
+	lwork = len(work)
+	if shortWork {
+		lwork--
+	}
+	impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, u, ldu, tmp2, ldvt, work, len(work))
+	if !floats.EqualApprox(uAll, u, 1e-10) {
+		t.Errorf("U mismatch when VT is not computed: %s", errStr)
+	}
+	if !floats.EqualApprox(s, sCopy, 1e-10) {
+		t.Errorf("Singular value mismatch when U computed VT not")
+	}
+	jobU = lapack.SVDNone
+	jobVT = job
+	copy(a, aCopy)
+	for i := range s {
+		s[i] = rand.Float64()
+	}
+	impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, tmp1, ldu, vt, ldvt, work, -1)
+	work = make([]float64, int(work[0]))
+	lwork = len(work)
+	if shortWork {
+		lwork--
+	}
+	impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, tmp1, ldu, vt, ldvt, work, len(work))
+	if !floats.EqualApprox(vtAll, vt, 1e-10) {
+		t.Errorf("VT mismatch when U is not computed: %s", errStr)
+	}
+	if !floats.EqualApprox(s, sCopy, 1e-10) {
+		t.Errorf("Singular value mismatch when VT computed U not")
+	}
+}
+
+// svdCheck checks that the singular value decomposition correctly multiplies back
+// to the original matrix.
+func svdCheck(t *testing.T, thin bool, errStr string, m, n int, s, a, u []float64, ldu int, vt []float64, ldvt int, aCopy []float64, lda int) {
 	sigma := blas64.General{
 		Rows:   m,
 		Cols:   n,
@@ -102,6 +228,12 @@ func DgesvdTest(t *testing.T, impl Dgesvder) {
 		Stride: ldvt,
 		Data:   vt,
 	}
+	if thin {
+		sigma.Rows = min(m, n)
+		sigma.Cols = min(m, n)
+		uMat.Cols = min(m, n)
+		vTMat.Rows = min(m, n)
+	}

 	tmp := blas64.General{
 		Rows:   m,
@@ -120,12 +252,12 @@ func DgesvdTest(t *testing.T, impl Dgesvder) {
 	blas64.Gemm(blas.NoTrans, blas.NoTrans, 1, uMat, sigma, 0, tmp)
 	blas64.Gemm(blas.NoTrans, blas.NoTrans, 1, tmp, vTMat, 0, ans)

-				errStr := fmt.Sprintf("jobU = %v, jobVT = %v, m = %v, n = %v, lda = %v, ldu = %v, ldv = %v", jobU, jobVT, m, n, lda, ldu, ldvt)
 	if !floats.EqualApprox(ans.Data, aCopy, 1e-8) {
-					t.Errorf("Decomposition mismatch %s", errStr)
+		t.Errorf("Decomposition mismatch. Trim = %v, %s", thin, errStr)
 	}

-				// Check that U and V are orthogonal
+	if !thin {
+		// Check that U and V are orthogonal.
 		for i := 0; i < uMat.Rows; i++ {
 			for j := i + 1; j < uMat.Rows; j++ {
 				dot := blas64.Dot(uMat.Cols,
@@ -149,6 +281,5 @@ func DgesvdTest(t *testing.T, impl Dgesvder) {
 			}
 		}
 	}
-		}
-	}
+
 }