remove orig

2025-10-20 21:59:25 +08:00 · 2016-01-06 21:38:42 -07:00
parent 7016112fd0
commit 909081d32b
13 changed files with 1059 additions and 1473 deletions
--- a/cgo/lapack.go
+++ b/cgo/lapack.go
@@ -498,6 +498,8 @@ func (impl Implementation) Dgels(trans blas.Transpose, m, n, nrhs int, a []float
 	return clapack.Dgels(trans, m, n, nrhs, a, lda, b, ldb)
 }
 const noSVDO = "dgesvd: not coded for overwrite"
 // Dgesvd computes the singular value decomposition of the input matrix A.
 //
 // The singular value decomposition is
@@ -509,12 +511,12 @@ func (impl Implementation) Dgels(trans blas.Transpose, m, n, nrhs int, a []float
 //
 // jobU and jobVT are options for computing the singular vectors. The behavior
 // is as follows
-//  jobU == lapack.SVDAll		All M columns of U are returned in u
+//  jobU == lapack.SVDAll       All m columns of U are returned in u
 //  jobU == lapack.SVDInPlace   The first min(m,n) columns are returned in u
 //  jobU == lapack.SVDOverwrite The first min(m,n) columns of U are written into a
 //  jobU == lapack.SVDNone      The columns of U are not computed.
 // The behavior is the same for jobVT and the rows of V^T. At most one of jobU
-// and jobVT can equal lapack.SVDOverwrite.
+// and jobVT can equal lapack.SVDOverwrite, and Dgesvd will panic otherwise.
 //
 // On entry, a contains the data for the m×n matrix A. During the call to Dgesvd
 // the data is overwritten. On exit, A contains the appropriate singular vectors
@@ -529,12 +531,12 @@ func (impl Implementation) Dgels(trans blas.Transpose, m, n, nrhs int, a []float
 // not used.
 //
 // vt contains the right singular vectors on exit, stored rowwise. If
-// jobV == lapack.SVDAll, vt is of size n×m. If jobV == lapack.SVDInPlace vt is
+// jobVT == lapack.SVDAll, vt is of size n×n. If jobVT == lapack.SVDInPlace vt is
-// of size min(m,n)×n. If jobU == lapack.SVDOverwrite or lapack.SVDNone, vt is
+// of size min(m,n)×n. If jobVT == lapack.SVDOverwrite or lapack.SVDNone, vt is
 // not used.
 //
 // The C interface does not support providing temporary storage. To provide compatibility
-// with native, lwork == -1 will not run Dgeqrf but will instead write the minimum
+// with native, lwork == -1 will not run Dgesvd but will instead write the minimum
 // work necessary to work[0]. If len(work) < lwork, Dgesvd will panic.
 //
 // Dgesvd returns whether the decomposition successfully completed.
@@ -551,13 +553,13 @@ func (impl Implementation) Dgesvd(jobU, jobVT lapack.SVDJob, m, n int, a []float
 		checkMatrix(min(m, n), n, vt, ldvt)
 	}
 	if jobU == lapack.SVDOverwrite && jobVT == lapack.SVDOverwrite {
-		panic("lapack: both jobU and jobVT are lapack.SVDOverwrite")
+		panic(noSVDO)
 	}
 	if len(s) < min(m, n) {
 		panic(badS)
 	}
-	if jobU != lapack.SVDAll || jobVT != lapack.SVDAll {
+	if jobU == lapack.SVDOverwrite || jobVT == lapack.SVDOverwrite {
-		panic("lapack: SVD only coded for SVDAll job inputs")
+		panic("lapack: SVD not coded to overwrite original matrix")
 	}
 	minWork := max(5*min(m, n), 3*min(m, n)+max(m, n))
 	if lwork != -1 {
@@ -572,7 +574,7 @@ func (impl Implementation) Dgesvd(jobU, jobVT lapack.SVDJob, m, n int, a []float
 		work[0] = float64(minWork)
 		return true
 	}
-	return clapack.Dgesvd(byte(jobU), byte(jobVT), m, n, a, lda, s, u, ldu, vt, ldvt, work)
+	return clapack.Dgesvd(lapack.Job(jobU), lapack.Job(jobVT), m, n, a, lda, s, u, ldu, vt, ldvt, work[1:])
 }
 // Dgetf2 computes the LU decomposition of the m×n matrix A.
--- a/cgo/lapack_test.go
+++ b/cgo/lapack_test.go
@@ -88,6 +88,10 @@ func TestDgeqrf(t *testing.T) {
 	testlapack.DgeqrfTest(t, impl)
 }
 // TestDgesvd runs the shared testlapack Dgesvd checks against impl.
 func TestDgesvd(t *testing.T) {
 	testlapack.DgesvdTest(t, impl)
 }
 // TestDgetf2 runs the shared testlapack Dgetf2 checks against impl.
 func TestDgetf2(t *testing.T) {
 	testlapack.Dgetf2Test(t, impl)
 }
--- a/lapack.go
+++ b/lapack.go
@@ -104,7 +104,7 @@ type SVDJob byte
 const (
 	SVDAll       SVDJob = 'A' // Compute all singular vectors
-	SVDInPlace          = 'S' // Compute the first singular vectors and store in provided storage.
+	SVDInPlace          = 'S' // Compute the first singular vectors and store them in provided storage.
-	SVDOverwrite        = 'O' // Compute the singular vectors and store in input matrix
+	SVDOverwrite        = 'O' // Compute the singular vectors and store them in input matrix
 	SVDNone             = 'N' // Do not compute singular vectors
 )
--- a/native/dbdsqr.go.orig
+++ b/native/dbdsqr.go.orig
@@ -1,497 +0,0 @@
 // Copyright ©2015 The gonum Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package native
 import (
 	"math"
 	"github.com/gonum/blas"
 	"github.com/gonum/blas/blas64"
 	"github.com/gonum/lapack"
 )
 // Dbdsqr performs a singular value decomposition of a real n×n bidiagonal matrix.
 //
 // The SVD of the bidiagonal matrix B is
 //  B = Q * S * P^T
 // where S is a diagonal matrix of singular values, Q is an orthogonal matrix of
 // left singular vectors, and P is an orthogonal matrix of right singular vectors.
 //
 // Q and P are only computed if requested. If left singular vectors are requested,
 // this routine returns U * Q instead of Q, and if right singular vectors are
 // requested P^T * VT is returned instead of P^T.
 //
 // Frequently Dbdsqr is used in conjuction with Dgebrd which reduces a general
 // matrix A into bidiagonal form. In this case, the SVD of A is
 //  A = (U * Q) * S * (P^T * VT)
 //
 // This routine may also compute Q^T * C.
 //
 // d and e contain the elements of the bidiagonal matrix b. d must have length at
 // least n, and e must have length at least n-1. Dbdsqr will panic if there is
 // insufficient length. On exit, D contains the singular values of B in decreasing
 // order.
 //
 // VT is a matrix of size n×ncvt whose elements are stored in vt. The elements
 // of vt are modified to contain P^T * VT on exit. VT is not used if ncvt == 0.
 //
 // U is a matrix of size nru×n whose elements are stored in u. The elements
 // of u are modified to contain U * Q on exit. U is not used if nru == 0.
 //
 // C is a matrix of size n×ncc whose elements are stored in c. The elements
 // of c are modified to contain Q^T * C on exit. C is not used if ncc == 0.
 //
 // work contains temporary storage and must have length at least 4*n. Dbdsqr
 // will panic if there is insufficient working memory.
 //
 // Dbdsqr returns whether the decomposition was successful.
 func (impl Implementation) Dbdsqr(uplo blas.Uplo, n, ncvt, nru, ncc int, d, e, vt []float64, ldvt int, u []float64, ldu int, c []float64, ldc int, work []float64) (ok bool) {
 	if uplo != blas.Upper && uplo != blas.Lower {
 		panic(badUplo)
 	}
 	if ncvt != 0 {
 		checkMatrix(n, ncvt, vt, ldvt)
 	}
 	if nru != 0 {
 		checkMatrix(nru, n, u, ldu)
 	}
 	if ncc != 0 {
 		checkMatrix(n, ncc, c, ldc)
 	}
 	if len(d) < n {
 		panic(badD)
 	}
 	if len(e) < n-1 {
 		panic(badE)
 	}
 	if len(work) < 4*n {
 		panic(badWork)
 	}
 	var info int
 	bi := blas64.Implementation()
 	const (
 		maxIter = 6
 	)
 	if n == 0 {
 		return true
 	}
 	if n != 1 {
 		// If the singular vectors do not need to be computed, use qd algorithm.
 		if !(ncvt > 0 || nru > 0 || ncc > 0) {
 			info = impl.Dlasq1(n, d, e, work)
 			// If info is 2 dqds didn't finish, and so try to.
 			if info != 2 {
 				return info == 0
 			}
 			info = 0
 		}
 		nm1 := n - 1
 		nm12 := nm1 + nm1
 		nm13 := nm12 + nm1
 		idir := 0
 		eps := dlamchE
 		unfl := dlamchS
 		lower := uplo == blas.Lower
 		var cs, sn, r float64
 		if lower {
 			for i := 0; i < n-1; i++ {
 				cs, sn, r = impl.Dlartg(d[i], e[i])
 				d[i] = r
 				e[i] = sn * d[i+1]
 				d[i+1] *= cs
 				work[i] = cs
 				work[nm1+i] = sn
 			}
 			if nru > 0 {
 				impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, nru, n, work, work[n-1:], u, ldu)
 			}
 			if ncc > 0 {
 				impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, n, ncc, work, work[n-1:], c, ldc)
 			}
 		}
 		// Compute singular values to a relative accuracy of tol. If tol is negative
 		// the values will be computed to an absolute accuracy of math.Abs(tol) * norm(b)
 		tolmul := math.Max(10, math.Min(100, eps*(-1.0/8)))
 		tol := tolmul * eps
 		var smax float64
 		for i := 0; i < n; i++ {
 			smax = math.Max(smax, math.Abs(d[i]))
 		}
 		for i := 0; i < n-1; i++ {
 			smax = math.Max(smax, math.Abs(e[i]))
 		}
 		var sminl float64
 		var thresh float64
 		if tol >= 0 {
 			sminoa := math.Abs(d[0])
 			if sminoa != 0 {
 				mu := sminoa
 				for i := 1; i < n; i++ {
 					mu = math.Abs(d[i]) * (mu / (mu + math.Abs(e[i-1])))
 					sminoa = math.Min(sminoa, mu)
 					if sminoa == 0 {
 						break
 					}
 				}
 			}
 			sminoa = sminoa / math.Sqrt(float64(n))
 			thresh = math.Max(tol*sminoa, float64(maxIter*n*n)*unfl)
 		} else {
 			thresh = math.Max(math.Abs(tol)*smax, float64(maxIter*n*n)*unfl)
 		}
 		// Prepare for the main iteration loop for the singular values.
 		maxIt := maxIter * n * n
 		iter := 0
 		oldl2 := -1
 		oldm := -1
 		// m points to the last element of unconverged part of matrix.
 		m := n
 	Outer:
 		for m > 1 {
 			if iter > maxIt {
 				info = 0
 				for i := 0; i < n-1; i++ {
 					if e[i] != 0 {
 						info++
 					}
 				}
 				return info == 0
 			}
 			// Find diagonal block of matrix to work on.
 			if tol < 0 && math.Abs(d[m-1]) <= thresh {
 				d[m-1] = 0
 			}
 			smax = math.Abs(d[m-1])
 			smin := smax
 			var l2 int
 			var broke bool
 			for l3 := 0; l3 < m-1; l3++ {
 				l2 = m - l3 - 2
 				abss := math.Abs(d[l2])
 				abse := math.Abs(e[l2])
 				if tol < 0 && abss <= thresh {
 					d[l2] = 0
 				}
 				if abse <= thresh {
 					broke = true
 					e[l2] = 0
 					if l2 == m-2 {
 						// Convergence of bottom singular value, return to top.
 						m--
 						continue Outer
 					}
 					l2++
 					break
 				}
 				smin = math.Min(smin, abss)
 				smax = math.Max(math.Max(smax, abss), abse)
 			}
 <<<<<<< HEAD
 			if broke {
 				e[l2] = 0
 				if l2 == m-2 {
 					// Convergence of bottom singular value, return to top.
 					m--
 					continue
 				}
 				l2++
 			} else {
 =======
 			if !broke {
 >>>>>>> 618d4f1... simplify loop condition
 				l2 = 0
 			}
 			// e[ll] through e[m-2] are nonzero, e[ll-1] is zero
 			if l2 == m-2 {
 				// Handle 2×2 block separately.
 				var sinr, cosr, sinl, cosl float64
 				d[m-1], d[m-2], sinr, cosr, sinl, cosl = impl.Dlasv2(d[m-2], e[m-2], d[m-1])
 				e[m-2] = 0
 				if ncvt > 0 {
 					bi.Drot(ncvt, vt[(m-2)*ldvt:], 1, vt[(m-1)*ldvt:], 1, cosr, sinr)
 				}
 				if nru > 0 {
 					bi.Drot(nru, u[m-2:], ldu, u[m-1:], ldu, cosl, sinl)
 				}
 				if ncc > 0 {
 					bi.Drot(ncc, c[(m-2)*ldc:], 1, c[(m-1)*ldc:], 1, cosl, sinl)
 				}
 				m -= 2
 				continue
 			}
 			// If working on a new submatrix, choose shift direction from larger end
 			// diagonal element toward smaller.
 			if l2 > oldm-1 || m-1 < oldl2 {
 				if math.Abs(d[l2]) >= math.Abs(d[m-1]) {
 					idir = 1
 				} else {
 					idir = 2
 				}
 			}
 			// Apply convergence tests.
 			// TODO(btracey): There is a lot of similar looking code here. See
 			// if there is a better way to de-duplicate.
 			if idir == 1 {
 				// Run convergence test in forward direction.
 				// First apply standard test to bottom of matrix.
 				if math.Abs(e[m-2]) <= math.Abs(tol)*math.Abs(d[m-1]) || (tol < 0 && math.Abs(e[m-2]) <= thresh) {
 					e[m-2] = 0
 					continue
 				}
 				if tol >= 0 {
 					// If relative accuracy desired, apply convergence criterion forward.
 					mu := math.Abs(d[l2])
 					sminl = mu
 					for l3 := l2; l3 < m-1; l3++ {
 						if math.Abs(e[l3]) <= tol*mu {
 							e[l3] = 0
 							continue Outer
 						}
 						mu = math.Abs(d[l3+1]) * (mu / (mu + math.Abs(e[l3])))
 						sminl = math.Min(sminl, mu)
 					}
 				}
 			} else {
 				// Run convergence test in backward direction.
 				// First apply standard test to top of matrix.
 				if math.Abs(e[l2]) <= math.Abs(tol)*math.Abs(d[l2]) || (tol < 0 && math.Abs(e[l2]) <= thresh) {
 					e[l2] = 0
 					continue
 				}
 				if tol >= 0 {
 					// If relative accuracy desired, apply convergence criterion backward.
 					mu := math.Abs(d[m-1])
 					sminl = mu
 					for l3 := m - 2; l3 >= l2; l3-- {
 						if math.Abs(e[l3]) <= tol*mu {
 							e[l3] = 0
 							continue Outer
 						}
 						mu = math.Abs(d[l3]) * (mu / (mu + math.Abs(e[l3])))
 						sminl = math.Min(sminl, mu)
 					}
 				}
 			}
 			oldl2 = l2
 			oldm = m
 			// Compute shift. First, test if shifting would ruin relative accuracy,
 			// and if so set the shift to zero.
 			var shift float64
 			if tol >= 0 && float64(n)*tol*(sminl/smax) <= math.Max(eps, (1.0/100)*tol) {
 				shift = 0
 			} else {
 				var sl2 float64
 				if idir == 1 {
 					sl2 = math.Abs(d[l2])
 					shift, _ = impl.Dlas2(d[m-2], e[m-2], d[m-1])
 				} else {
 					sl2 = math.Abs(d[m-1])
 					shift, _ = impl.Dlas2(d[l2], e[l2], d[l2+1])
 				}
 				// Test if shift is negligible
 				if sl2 > 0 {
 					if (shift/sl2)*(shift/sl2) < eps {
 						shift = 0
 					}
 				}
 			}
 			iter += m - l2 + 1
 			// If no shift, do simplified QR iteration.
 			if shift == 0 {
 				if idir == 1 {
 					cs := 1.0
 					oldcs := 1.0
 					var sn, r, oldsn float64
 					for i := l2; i < m-1; i++ {
 						cs, sn, r = impl.Dlartg(d[i]*cs, e[i])
 						if i > l2 {
 							e[i-1] = oldsn * r
 						}
 						oldcs, oldsn, d[i] = impl.Dlartg(oldcs*r, d[i+1]*sn)
 						work[i-l2] = cs
 						work[i-l2+nm1] = sn
 						work[i-l2+nm12] = oldcs
 						work[i-l2+nm13] = oldsn
 					}
 					h := d[m-1] * cs
 					d[m-1] = h * oldcs
 					e[m-2] = h * oldsn
 					if ncvt > 0 {
 						impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncvt, work, work[n-1:], vt[l2*ldvt:], ldvt)
 					}
 					if nru > 0 {
 						impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, nru, m-l2, work[nm12:], work[nm13:], u[l2:], ldu)
 					}
 					if ncc > 0 {
 						impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncc, work[nm12:], work[nm13:], c[l2*ldc:], ldc)
 					}
 					if math.Abs(e[m-2]) < thresh {
 						e[m-2] = 0
 					}
 				} else {
 					cs := 1.0
 					oldcs := 1.0
 					var sn, r, oldsn float64
 					for i := m - 1; i >= l2+1; i-- {
 						cs, sn, r = impl.Dlartg(d[i]*cs, e[i-1])
 						if i < m-1 {
 							e[i] = oldsn * r
 						}
 						oldcs, oldsn, d[i] = impl.Dlartg(oldcs*r, d[i-1]*sn)
 						work[i-l2-1] = cs
 						work[i-l2+nm1-1] = -sn
 						work[i-l2+nm12-1] = oldcs
 						work[i-l2+nm13-1] = -oldsn
 					}
 					h := d[l2] * cs
 					d[l2] = h * oldcs
 					e[l2] = h * oldsn
 					if ncvt > 0 {
 						impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncvt, work[nm12:], work[nm13:], vt[l2*ldvt:], ldvt)
 					}
 					if nru > 0 {
 						impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward, nru, m-l2, work, work[n-1:], u[l2:], ldu)
 					}
 					if ncc > 0 {
 						impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncc, work, work[n-1:], c[l2*ldc:], ldc)
 					}
 					if math.Abs(e[l2]) <= thresh {
 						e[l2] = 0
 					}
 				}
 			} else {
 				// Use nonzero shift.
 				if idir == 1 {
 					// Chase bulge from top to bottom. Save cosines and sines for
 					// later singular vector updates.
 					f := (math.Abs(d[l2]) - shift) * (math.Copysign(1, d[l2]) + shift/d[l2])
 					g := e[l2]
 					var cosl, sinl float64
 					for i := l2; i < m-1; i++ {
 						cosr, sinr, r := impl.Dlartg(f, g)
 						if i > l2 {
 							e[i-1] = r
 						}
 						f = cosr*d[i] + sinr*e[i]
 						e[i] = cosr*e[i] - sinr*d[i]
 						g = sinr * d[i+1]
 						d[i+1] *= cosr
 						cosl, sinl, r = impl.Dlartg(f, g)
 						d[i] = r
 						f = cosl*e[i] + sinl*d[i+1]
 						d[i+1] = cosl*d[i+1] - sinl*e[i]
 						if i < m-2 {
 							g = sinl * e[i+1]
 							e[i+1] = cosl * e[i+1]
 						}
 						work[i-l2] = cosr
 						work[i-l2+nm1] = sinr
 						work[i-l2+nm12] = cosl
 						work[i-l2+nm13] = sinl
 					}
 					e[m-2] = f
 					if ncvt > 0 {
 						impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncvt, work, work[n-1:], vt[l2*ldvt:], ldvt)
 					}
 					if nru > 0 {
 						impl.Dlasr(blas.Right, lapack.Variable, lapack.Forward, nru, m-l2, work[nm12:], work[nm13:], u[l2:], ldu)
 					}
 					if ncc > 0 {
 						impl.Dlasr(blas.Left, lapack.Variable, lapack.Forward, m-l2, ncc, work[nm12:], work[nm13:], c[l2*ldc:], ldc)
 					}
 					if math.Abs(e[m-2]) <= thresh {
 						e[m-2] = 0
 					}
 				} else {
 					// Chase bulge from top to bottom. Save cosines and sines for
 					// later singular vector updates.
 					f := (math.Abs(d[m-1]) - shift) * (math.Copysign(1, d[m-1]) + shift/d[m-1])
 					g := e[m-2]
 					for i := m - 1; i > l2; i-- {
 						cosr, sinr, r := impl.Dlartg(f, g)
 						if i < m-1 {
 							e[i] = r
 						}
 						f = cosr*d[i] + sinr*e[i-1]
 						e[i-1] = cosr*e[i-1] - sinr*d[i]
 						g = sinr * d[i-1]
 						d[i-1] *= cosr
 						cosl, sinl, r := impl.Dlartg(f, g)
 						d[i] = r
 						f = cosl*e[i-1] + sinl*d[i-1]
 						d[i-1] = cosl*d[i-1] - sinl*e[i-1]
 						if i > l2+1 {
 							g = sinl * e[i-2]
 							e[i-2] *= cosl
 						}
 						work[i-l2-1] = cosr
 						work[i-l2+nm1-1] = -sinr
 						work[i-l2+nm12-1] = cosl
 						work[i-l2+nm13-1] = -sinl
 					}
 					e[l2] = f
 					if math.Abs(e[l2]) <= thresh {
 						e[l2] = 0
 					}
 					if ncvt > 0 {
 						impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncvt, work[nm12:], work[nm13:], vt[l2*ldvt:], ldvt)
 					}
 					if nru > 0 {
 						impl.Dlasr(blas.Right, lapack.Variable, lapack.Backward, nru, m-l2, work, work[n-1:], u[l2:], ldu)
 					}
 					if ncc > 0 {
 						impl.Dlasr(blas.Left, lapack.Variable, lapack.Backward, m-l2, ncc, work, work[n-1:], c[l2*ldc:], ldc)
 					}
 				}
 			}
 		}
 	}
 	// All singular values converged, make them positive.
 	for i := 0; i < n; i++ {
 		if d[i] < 0 {
 			d[i] *= -1
 			if ncvt > 0 {
 				bi.Dscal(ncvt, -1, vt[i*ldvt:], 1)
 			}
 		}
 	}
 	// Sort the singular values in decreasing order.
 	for i := 0; i < n-1; i++ {
 		isub := 0
 		smin := d[0]
 		for j := 1; j < n-i; j++ {
 			if d[j] <= smin {
 				isub = j
 				smin = d[j]
 			}
 		}
 		if isub != n-i {
 			// Swap singular values and vectors.
 			d[isub] = d[n-i-1]
 			d[n-i-1] = smin
 			if ncvt > 0 {
 				bi.Dswap(ncvt, vt[isub*ldvt:], 1, vt[(n-i-1)*ldvt:], 1)
 			}
 			if nru > 0 {
 				bi.Dswap(nru, u[isub:], ldu, u[n-i-1:], ldu)
 			}
 			if ncc > 0 {
 				bi.Dswap(ncc, c[isub*ldc:], 1, c[(n-i-1)*ldc:], 1)
 			}
 		}
 	}
 	info = 0
 	for i := 0; i < n-1; i++ {
 		if e[i] != 0 {
 			info++
 		}
 	}
 	return info == 0
 }
--- a/native/dgebrd.go
+++ b/native/dgebrd.go
@@ -86,7 +86,7 @@ func (impl Implementation) Dgebrd(m, n int, a []float64, lda int, d, e, tauQ, ta
 				if lwork >= (m+n)*nbmin {
 					nb = lwork / (m + n)
 				} else {
-					nb = 1
+					nb = minmn
 					nx = minmn
 				}
 			}
--- a/native/dgesvd.go
+++ b/native/dgesvd.go
--- a/native/dgesvd.go.orig
+++ b/native/dgesvd.go.orig
@@ -1,721 +0,0 @@
 // Copyright ©2015 The gonum Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 package native
 import (
 	"math"
 	"github.com/gonum/blas"
 	"github.com/gonum/blas/blas64"
 	"github.com/gonum/lapack"
 )
 // Dgesvd computes the singular value decomposition of the input matrix A.
 //
 // Only coded for jobU == lapack.SVDAll and jobVT == lapack.SVDAll.
 //
 // The singular value decomposition is
 //  A = U * Sigma * V^T
 // where Sigma is an m×n diagonal matrix containing the singular values of A,
 // U is an m×m orthogonal matrix and V is an n×n orthogonal matrix. The first
 // min(m,n) columns of U and V are the left and right singular vectors of A
 // respectively.
 //
 // jobU and jobVT are options for computing the singular vectors. The behavior
 // is as follows
 //  jobU == lapack.SVDAll		All M columns of U are returned in u
 //  jobU == lapack.SVDInPlace	The first min(m,n) columns are returned in u
 //  jobU == lapack.SVDOverwrite	The first min(m,n) columns of U are written into a
 //	jobU == lapack.SVDNone		The columns of U are not computed.
 // The behavior is the same for jobVT and the rows of V^T. At most one of jobU
 // and jobVT can equal lapack.SVDOverwrite.
 //
 // On entry, a contains the data for the m×n matrix A. During the call to Dgesvd
 // the data is overwritten. On exit, A contains the appropriate singular vectors
 // if either job is lapack.SVDOverwrite.
 //
 // s is a slice of length at least min(m,n) and on exit contains the singular
 // values in decreasing order.
 //
 // u contains the left singular vectors on exit, stored columnwise. If
 // jobU == lapack.SVDAll, u is of size m×m. If jobU == lapack.SVDInPlace u is
 // of size m×min(m,n). If jobU == lapack.SVDOverwrite or lapack.SVDNone, u is
 // not used.
 //
 // vt contains the right singular vectors on exit, stored rowwise. If
 // jobVT == lapack.SVDAll, vt is of size n×n. If jobVT == lapack.SVDInPlace vt is
 // of size min(m,n)×n. If jobVT == lapack.SVDOverwrite or lapack.SVDNone, vt is
 // not used.
 //
 // work is a slice for storing temporary memory, and lwork is the usable size of
 // the slice. lwork must be at least max(5*min(m,n), 3*min(m,n)+max(m,n)).
 // If lwork == -1, instead of performing Dgesvd, the optimal work length will be
 // stored into work[0]. Dgesvd will panic if the working memory has insufficient
 // storage.
 //
 // Dgesvd returns whether the decomposition successfully completed.
 func (impl Implementation) Dgesvd(jobU, jobVT lapack.SVDJob, m, n int, a []float64, lda int, s, u []float64, ldu int, vt []float64, ldvt int, work []float64, lwork int) (ok bool) {
 	checkMatrix(m, n, a, lda)
 	if jobU == lapack.SVDAll {
 		checkMatrix(m, m, u, ldu)
 	} else if jobU == lapack.SVDInPlace {
 		checkMatrix(m, min(m, n), u, ldu)
 	}
 	if jobVT == lapack.SVDAll {
 		checkMatrix(n, n, vt, ldvt)
 	} else if jobVT == lapack.SVDInPlace {
 		checkMatrix(min(m, n), n, vt, ldvt)
 	}
 	if jobU == lapack.SVDOverwrite && jobVT == lapack.SVDOverwrite {
 		panic("lapack: both jobU and jobVT are lapack.SVDOverwrite")
 	}
 	if len(s) < min(m, n) {
 		panic(badS)
 	}
 	if jobU != lapack.SVDAll || jobVT != lapack.SVDAll {
 		panic("lapack: SVD only coded for SVDAll job inputs")
 	}
 	minWork := max(5*min(m, n), 3*min(m, n)+max(m, n))
 	if lwork != -1 {
 		if len(work) < lwork {
 			panic(badWork)
 		}
 		if lwork < minWork {
 			panic(badWork)
 		}
 	}
 	if m == 0 || n == 0 {
 		return true
 	}
 	minmn := min(m, n)
 	wantua := jobU == lapack.SVDAll
 	wantus := jobU == lapack.SVDInPlace
 	wantuas := wantua || wantus
 	wantuo := jobU == lapack.SVDOverwrite
 	wantun := jobU == lapack.None
 	wantva := jobVT == lapack.SVDAll
 	wantvs := jobVT == lapack.SVDInPlace
 	wantvas := wantva || wantvs
 	wantvo := jobVT == lapack.SVDOverwrite
 	wantvn := jobVT == lapack.None
 	bi := blas64.Implementation()
 	var mnthr int
 	// The netlib implementation checks has this at only length 1. Our implementation
 	// checks all input sizes before examining the l == -1 case.
 	dum := make([]float64, m*n)
 	// Compute optimal space for subroutines.
 	maxwrk := 1
 	opts := string(jobU) + string(jobVT)
 	var wrkbl, bdspac int
 	if m >= n {
 		mnthr = impl.Ilaenv(6, "DGESVD", opts, m, n, 0, 0)
 		bdspac = 5 * n
 		impl.Dgeqrf(m, n, a, lda, dum, dum, -1)
 		lwork_dgeqrf := int(dum[0])
 		impl.Dorgqr(m, n, n, a, lda, dum, dum, -1)
 		lwork_dorgqr_n := int(dum[0])
 		impl.Dorgqr(m, m, n, a, lda, dum, dum, -1)
 		lwork_dorgqr_m := int(dum[0])
 		impl.Dgebrd(n, n, a, lda, s, dum, dum, dum, dum, -1)
 		lwork_dgebrd := int(dum[0])
 		impl.Dorgbr(lapack.ApplyP, n, n, n, a, lda, dum, dum, -1)
 		lwork_dorgbr_p := int(dum[0])
 		impl.Dorgbr(lapack.ApplyQ, n, n, n, a, lda, dum, dum, -1)
 		lwork_dorgbr_q := int(dum[0])
 		if m >= mnthr {
 			// m >> n
 			if wantun {
 				// Path 1
 				maxwrk = n + lwork_dgeqrf
 				maxwrk = max(maxwrk, 3*n+lwork_dgebrd)
 				if wantvo || wantvas {
 					maxWork = max(maxwrk, 3*n+lwork_dorgbr_p)
 				}
 				maxwrk = max(maxwrk, bdspac)
 			} else if wantuo && wantvn {
 				// Path 2
 				wrkbl = n + lwork_dgeqrf
 				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
 				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = max(n*n+wrkbl, n*n+m*n+n)
 			} else if wantuo && wantvs {
 				// Path 3
 				// or lapack.All
 				wrkbl = n + lwork_dgeqrf
 				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
 				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = max(n*n+wrkbl, n*n+m*n+n)
 			} else if wantus && wantvn {
 				// Path 4
 				wrkbl = n + lwork_dgeqrf
 				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
 				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = n*n + wrkbl
 			} else if wantus && wantvo {
 				// Path 5
 				wrkbl = n + lwork_dgeqrf
 				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
 				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = 2*n*n + wrkbl
 			} else if wantus && wantvas {
 				// Path 6
 				wrkbl = n + lwork_dgeqrf
 				wrkbl = max(wrkbl, n+lwork_dorgqr_n)
 				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = n*n + wrkbl
 			} else if wantua && wantvn {
 				// Path 7
 				wrkbl = n + lwork_dgeqrf
 				wrkbl = max(wrkbl, n+lwork_dorgqr_m)
 				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = n*n + wrkbl
 			} else if wantua && wantvo {
 				// Path 8
 				wrkbl = n + lwork_dgeqrf
 				wrkbl = max(wrkbl, n+lwork_dorgqr_m)
 				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = 2*n*n + wrkbl
 			} else if wantua && wantvas {
 				// Path 9
 				wrkbl = n + lwork_dgeqrf
 				wrkbl = max(wrkbl, n+lwork_dorgqr_m)
 				wrkbl = max(wrkbl, 3*n+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, 3*n+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = n*n + wrkbl
 			}
 		} else {
 			// Path 10: m > n
 			impl.Dgebrd(m, n, a, lda, s, dum, dum, dum, dum, -1)
 			lwork_dgebrd := int(dum[0])
 			maxwrk = 3*n + lwork_dgebrd
 			if wantus || wantuo {
 				impl.Dorgbr(lapack.ApplyQ, m, n, n, a, lda, dum, dum, -1)
 				lwork_dorgbr_q = int(dum[0])
 				maxwrk = max(maxwrk, 3*n+lwork_dorgbr_q)
 			}
 			if wantua {
 				impl.Dorgbr(lapack.ApplyQ, m, m, n, a, lda, dum, dum, -1)
 				lwork_dorgbr_q := int(dum[0])
 				maxwrk = max(maxwrk, 3*n+lwork_dorgbr_p)
 			}
 			if !wantvn {
 				maxwrk = max(maxwrk, 3*n+lwork_dorgbr_p)
 			}
 			maxwrk = max(maxwrk, bdspac)
 		}
 	} else {
 		mnthr = impl.Ilaenv(6, "DGESVD", opts, m, n, 0, 0)
 		bdspac = 5 * m
 		impl.Dgelqf(m, n, a, lda, dum, dum, -1)
 		lwork_dgelqf := int(dum[0])
 		impl.Dorglq(n, n, m, dum, n, dum, dum, -1)
 		lwork_dorglq_n := int(dum[0])
 		impl.Dorglq(m, n, m, a, lda, dum, dum, -1)
 		lwork_dorglq_m := int(dum[0])
 		impl.Dgebrd(m, m, a, lda, s, dum, dum, dum, dum, -1)
 		lwork_dgebrd := int(dum[0])
 		impl.Dorgbr(lapack.ApplyP, m, m, m, a, n, dum, dum, -1)
 		lwork_dorgbr_p := int(dum[0])
 		impl.Dorgbr(lapack.ApplyQ, m, m, m, a, n, dum, dum, -1)
 		lwork_dorgbr_q := int(dum[0])
 		if n >= mnthr {
 			// n >> m
 			if wantvn {
 				// Path 1t
 				maxwrk = m + lwork_dgelqf
 				maxwrk = max(maxwrk, 3*m+lwork_dgebrd)
 				if wntuo.OR.wntuas {
 					maxwrk = max(maxwrk, 3*m+lwork_dorgbr_q)
 				}
 				maxwrk = max(maxwrk, bdspac)
 			} else if wantvo && wantun {
 				// Path 2t
 				wrkbl = m + lwork_dgelqf
 				wrkbl = max(wrkbl, m+lwork_dorglq_m)
 				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = max(m*m+wrkbl, m*m+m*n+m)
 			} else if wantvo && wantuas {
 				// Path 3t
 				wrkbl = m + lwork_dgelqf
 				wrkbl = max(wrkbl, m+lwork_dorglq_m)
 				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = max(m*m+wrkbl, m*m+m*n+m)
 			} else if wantvs && wantun {
 				// Path 4t
 				wrkbl = m + lwork_dgelqf
 				wrkbl = max(wrkbl, m+lwork_dorglq_m)
 				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = m*m + wrkbl
 			} else if wantvs && wantuo {
 				// Path 5t
 				wrkbl = m + lwork_dgelqf
 				wrkbl = max(wrkbl, m+lwork_dorglq_m)
 				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = 2*m*m + wrkbl
 			} else if wantvs && wantuas {
 				// Path 6t
 				wrkbl = m + lwork_dgelqf
 				wrkbl = max(wrkbl, m+lwork_dorglq_m)
 				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = m*m + wrkbl
 			} else if wantva && wantun {
 				// Path 7t
 				wrkbl = m + lwork_dgelqf
 				wrkbl = max(wrkbl, m+lwork_dorglq_n)
 				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = m*m + wrkbl
 			} else if wantva && wantuo {
 				// Path 8t
 				wrkbl = m + lwork_dgelqf
 				wrkbl = max(wrkbl, m+lwork_dorglq_n)
 				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = 2*m*m + wrkbl
 			} else if wantva && wantuas {
 				// Path 9t
 				wrkbl = m + lwork_dgelqf
 				wrkbl = max(wrkbl, m+lwork_dorglq_n)
 				wrkbl = max(wrkbl, 3*m+lwork_dgebrd)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_p)
 				wrkbl = max(wrkbl, 3*m+lwork_dorgbr_q)
 				wrkbl = max(wrkbl, bdspac)
 				maxwrk = m*m + wrkbl
 			}
 		} else {
 			// Path 10t, n > m
 			impl.Dgebrd(m, n, a, lda, s, dum, dum, dum, dum, -1)
 			lwork_dgebrd = int(dum[0])
 			maxwrk := 3*m + lwork_dgebrd
 			if wantvs || wantvo {
 				impl.Dorgbr(lapack.ApplyP, m, n, m, a, n, dum, dum, -1)
 				lwork_dorgbr_p = int(dum[0])
 				maxwrk = max(maxwrk, 3*m+lwork_dorgbr_p)
 			}
 			if wantva {
 				impl.Dorgbr(lapack.ApplyP, n, n, m, a, n, dum, dum, -1)
 				lwork_dorgbr_p = int(dum[0])
 				maxwrk = max(maxwrk, 3*m+lwork_dorgbr_p)
 			}
 			if !wantun {
 				maxwrk = max(maxwrk, 3*m+lwork_dorgbr_q)
 			}
 			maxwrk = max(maxwrk, bdspac)
 		}
 	}
 	maxwrk = max(maxwrk, minWork)
 	work[0] = maxwrk
 	if lwork == -1 {
 		return true
 	}
 	// Perform decomposition.
 	eps := dlamchE
 	smlnum := math.Sqrt(dlamchS) / eps
 	bignum := 1 / smlnum
 	// Scale A if max element outside range [smlnum, bignum]
 	anrm := impl.Dlange(lapack.MaxAbs, m, n, a, lda, dum)
 	iscl := 0
 	if anrm > 0 && anrm < smlnum {
 		iscl = 1
 		impl.Dlascl(lapack.General, 0, 0, anrm, smlnum, m, n, a, lda)
 	} else if anrm > bignum {
 		iscl = 1
 		impl.Dlascl(lapack.General, 0, 0, anrm, bignum, m, n, a, lda)
 	}
 	var ie int
 	if m >= n {
 		// If A has sufficiently more rows than columns, use the QR decomposition.
 		if m >= mnthr {
 			if wantun {
 				// Path 1
 <<<<<<< HEAD
 				itau = 1
 				iwo
 			}
 		}
 	}
 =======
 				itau := 0
 				iwork := itau + n
 				// Compute A = Q * R
 				impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
 				// Zero out below R
 				impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, a[lda:], lda)
 				ie = 0
 				itauq := ie + n
 				itaup := itauq + n
 				iwork = itaup + n
 				// Bidiagonalize R in A
 				impl.Dgebrd(n, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
 				ncvt := 0
 				if wantvo || wantvas {
 					// Generate P^T.
 					impl.Dorgbr(lapack.ApplyP, n, n, n, a, lda, work[itaup:], work[iwork:], lwork-iwork)
 					ncvt = n
 				}
 				iwork = ie + n
 				// Perform bidiagonal QR iteration computing right singular vectors
 				// of A in A if desired.
 				ok = impl.Dbdsqr(blas.Upper, n, ncvt, 0, 0, s, work[ie:], a, lda, dum, 1, dum, 1, work[iwork:])
 				// If right singular vectors desired in VT, copy them there.
 				if wantvas {
 					impl.Dlacpy(blas.All, n, n, a, lda, vt, ldvt)
 				}
 			} else if wantuo && wantvn {
 				// Path 2.
 				/*
 					if lwork >= n*n+max(4*n, bdspac) {
 						// Sufficient workspace for a fast algorithm
 						ir := 1
 						var ldworku, ldworkr int
 						if lwork >= max(workbl, lda*n+n)+lda*n {
 							// work(iu) and work(ir) are n×lda
 							ldworku = lda
 							ldworkr = lda
 						} else if lwork >= max(workbl, lda*n+n)+n*n {
 							// work(iu) is ldwrku×n
 						}
 						itau := ir + ldworkr*n
 						iwork := itau + n
 						// Compute A = Q*R
 						impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
 						// Copy R to work(ir) and zero out below it.
 						impl.Dlacpy(blas.Upper, n, n, a, lda, b, ldb)
 					}
 				*/
 				panic("not implemented")
 			} else if wantua {
 				if wantvas {
 					// Path 9
 					if lwork >= n*n+max(max(n+m, 4*n), bdspac) {
 						// Sufficient workspace for a fast algorithm
 						iu := 0
 						var ldworku int
 						if lwork >= wrkbl+lda*n {
 							ldworku = lda
 						} else {
 							ldworku = n
 						}
 						itau := iu + ldworku*n
 						iwork := itau + n
 						// Compute A = Q * R, copying result to U
 						impl.Dgeqrf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
 						impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)
 						// Generate Q in U
 						impl.Dorgqr(m, m, n, u, ldu, work[itau:], work[iwork:], lwork-iwork)
 						// Copy R from A to VT, zeroing out below it.
 						impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt)
 						if n > 1 {
 							impl.Dlaset(blas.Lower, n-1, n-1, 0, 0, vt[ldvt:], ldvt)
 						}
 						ie = itau
 						itauq := ie + n
 						itaup := itauq + n
 						iwork = itaup + n
 						// Bidiagonalize R in VT
 						impl.Dgebrd(n, n, vt, ldvt, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
 						// Multiply Q in U by left bidiagonalizing vectors in VT
 						impl.Dormbr(lapack.ApplyQ, blas.Right, blas.NoTrans, m, n, n, vt, ldvt, work[itauq:], u, ldu, work[iwork:], lwork-iwork)
 						// Generate right bidiagonalizing vectors in VT
 						impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork)
 						iwork = ie + n
 						// Perform bidiagonal QR iteration, computing left singular
 						// vectors of A in U and computing right singular vectors
 						// of A in VT.
 						ok = impl.Dbdsqr(blas.Upper, n, n, m, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:])
 					}
 				}
 			}
 		} else {
 			// Path 10.
 			ie = 0
 			itauq := ie + n
 			itaup := itauq + n
 			iwork := itaup + n
 			// Bidiagonalize A
 			impl.Dgebrd(m, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
 			if wantuas {
 				// Left singular vectors are desired in U. Copy result to U and
 				// generate left bidiagonalizing vectors in U.
 				impl.Dlacpy(blas.Lower, m, n, a, lda, u, ldu)
 				var ncu int
 				if wantus {
 					ncu = n
 				}
 				if wantua {
 					ncu = m
 				}
 				impl.Dorgbr(lapack.ApplyQ, m, ncu, n, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
 			}
 			if wantvas {
 				// Right singular vectors are desired in VT. Copy result to VT and
 				// generate right bidiagonalizing vectors in VT.
 				impl.Dlacpy(blas.Upper, n, n, a, lda, vt, ldvt)
 				impl.Dorgbr(lapack.ApplyP, n, n, n, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork)
 			}
 			if wantuo {
 				panic("not implemented")
 			}
 			if wantvo {
 				panic("not implemented")
 			}
 			iwork = ie + n
 			var nru, ncvt int
 			if wantuas || wantuo {
 				nru = m
 			}
 			if wantun {
 				nru = 0
 			}
 			if wantvas || wantvo {
 				ncvt = n
 			}
 			if wantvn {
 				ncvt = 0
 			}
 			if !wantuo && !wantvo {
 				// Perform bidiagonal QR iteration, if desired, computing left
 				// singular vectors in U and right singular vectors in VT.
 				ok = impl.Dbdsqr(blas.Upper, n, ncvt, nru, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:])
 			} else {
 				panic("not implemented")
 			}
 		}
 	} else {
 		// A has more columns than rows. If A has sufficiently more columns than
 		// rows, first reduce using the LQ decomposition.
 		if n >= mnthr {
 			if wantva {
 				if wantuas {
 					// Path 9t
 					if lwork >= m*m+max(max(m+n, 4*m), bdspac) {
 						// Sufficient workspace for a fast algorithm.
 						iu := 0
 						var ldworku int
 						if lwork >= wrkbl+lda*m {
 							ldworku = lda
 						} else {
 							ldworku = m
 						}
 						itau := iu + ldworku*m
 						iwork := itau + m
 						// Generate A = L * Q copying result to VT
 						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
 						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
 						// Generate Q in VT
 						impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
 						// Copy L to work[iu], zeroing out above it.
 						impl.Dlacpy(blas.Lower, m, m, a, lda, work[iu:], ldworku)
 						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, work[iu+ldworku:], ldworku)
 						ie = itau
 						itauq := ie + m
 						itaup := itauq + m
 						iwork = itaup + m
 						// Bidiagonalize L in work[iu], copying result to U
 						impl.Dgebrd(m, m, work[iu:], ldworku, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
 						impl.Dlacpy(blas.Lower, m, m, work[iu:], ldworku, u, ldu)
 						// Generate right bidiagonalizing vectors in work[iu]
 						impl.Dorgbr(lapack.ApplyP, m, m, m, work[iu:], ldworku, work[itaup:], work[iwork:], lwork-iwork)
 						// Generate left bidiagonalizing vectors in U.
 						impl.Dorgbr(lapack.ApplyQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
 						iwork = ie + m
 						// Perform bidiagonal QR iteration, computing left singular
 						// vectors of L in U and computing right singular vectors
 						// of L in work[iu]
 						ok = impl.Dbdsqr(blas.Upper, m, m, m, 0, s, work[ie:], work[iu:], ldworku, u, ldu, dum, 1, work[iwork:])
 						// Multiply right singular vectors of L in work[iu:]
 						// Q in VT, storing result in A.
 						bi.Dgemm(blas.NoTrans, blas.NoTrans, m, n, m, 1, work[iu:], ldworku, vt, ldvt, 0, a, lda)
 						// Copy right singular vectors of A from A to VT
 						impl.Dlacpy(blas.All, m, n, a, lda, vt, ldvt)
 					} else {
 						// Insufficient workspace for a fast algorithm.
 						itau := 0
 						iwork := itau + m
 						// Compute A = L * Q, copying result to VT
 						impl.Dgelqf(m, n, a, lda, work[itau:], work[iwork:], lwork-iwork)
 						impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
 						// Generate Q in VT
 						impl.Dorglq(n, n, m, vt, ldvt, work[itau:], work[iwork:], lwork-iwork)
 						// Copy L to U, zeroing out above it.
 						impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
 						impl.Dlaset(blas.Upper, m-1, m-1, 0, 0, u[1:], ldu)
 						ie = itau
 						itauq := ie + m
 						itaup := itauq + m
 						iwork = itaup + m
 						// Bidiagonalize L in U
 						impl.Dgebrd(m, m, u, ldu, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
 						// Multiply right bidiagonalizing vectors in U by Q in VT.
 						impl.Dormbr(lapack.ApplyP, blas.Left, blas.Trans, m, n, m, u, ldu, work[itaup:], vt, ldvt, work[iwork:], lwork-iwork)
 						// Generate left bidiagonalizing vectors in U
 						impl.Dorgbr(lapack.ApplyQ, m, m, m, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
 						iwork = ie + m
 						// Perform bidiagonal QR iteration, computing left singular
 						// vectors of A in U and computing right singular vectors
 						// of A in VT.
 						ok = impl.Dbdsqr(blas.Upper, m, n, m, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:])
 					}
 				} else {
 					panic("not implemented")
 				}
 			}
 		} else {
 			// Path 10t
 			ie = 0
 			itauq := ie + m
 			itaup := itauq + m
 			iwork := itaup + m
 			// Bidiagonalize A
 			impl.Dgebrd(m, n, a, lda, s, work[ie:], work[itauq:], work[itaup:], work[iwork:], lwork-iwork)
 			if wantuas {
 				// If left singular vectors desired in U, copy result to U and
 				// generate left bidiagonalizing vectors in U.
 				impl.Dlacpy(blas.Lower, m, m, a, lda, u, ldu)
 				impl.Dorgbr(lapack.ApplyQ, m, m, n, u, ldu, work[itauq:], work[iwork:], lwork-iwork)
 			}
 			if wantvas {
 				// If right singular vectors desired in VT, copy result to VT
 				// and generate right bidiagonalizing vectors in VT.
 				impl.Dlacpy(blas.Upper, m, n, a, lda, vt, ldvt)
 				var nrvt int
 				if wantva {
 					nrvt = n
 				} else {
 					nrvt = m
 				}
 				impl.Dorgbr(lapack.ApplyP, nrvt, n, m, vt, ldvt, work[itaup:], work[iwork:], lwork-iwork)
 			}
 			if wantuo {
 				panic("not implemented")
 			}
 			if wantvo {
 				panic("not implemented")
 			}
 			iwork = ie + m
 			var nru, ncvt int
 			if wantuas || wantuo {
 				nru = m
 			}
 			if wantvas || wantvo {
 				ncvt = n
 			}
 			if !wantvo && !wantvo {
 				// Perform bidiagonal QR iteration, if desired, computing left
 				// singular vectors in U and computing right singular vectors in
 				// VT.
 				ok = impl.Dbdsqr(blas.Lower, m, ncvt, nru, 0, s, work[ie:], vt, ldvt, u, ldu, dum, 1, work[iwork:])
 			} else {
 				panic("not implemented")
 			}
 		}
 	}
 	minmn := min(m, n)
 	if !ok {
 		if ie > 1 {
 			for i := 0; i < minmn-1; i++ {
 				work[i+1] = work[i+ie]
 			}
 		}
 		if ie < 1 {
 			for i := minmn - 2; i >= 0; i-- {
 				work[i+1] = work[i+ie]
 			}
 		}
 	}
 	// Undo scaling if necessary
 	if iscl == 1 {
 		if anrm > bignum {
 			impl.Dlascl(lapack.General, 0, 0, bignum, anrm, minmn, 1, s, minmn)
 		}
 		if ok && anrm > bignum {
 			impl.Dlascl(lapack.General, 0, 0, bignum, anrm, minmn-1, 1, work[minmn:], minmn)
 		}
 		if anrm < smlnum {
 			impl.Dlascl(lapack.General, 0, 0, smlnum, anrm, minmn, 1, s, minmn)
 		}
 		if ok && anrm < smlnum {
 			impl.Dlascl(lapack.General, 0, 0, smlnum, anrm, minmn-1, 1, work[minmn:], minmn)
 		}
 	}
 	work[0] = float64(maxwrk)
 	return ok
 >>>>>>> fd73640... Add a partial implementation of Dgesvd and test
 }
--- a/native/dlacpy.go
+++ b/native/dlacpy.go
@@ -24,11 +24,10 @@ func (impl Implementation) Dlacpy(uplo blas.Uplo, m, n int, a []float64, lda int
 	case blas.Lower:
 		for i := 0; i < m; i++ {
-			for j := 0; j < min(i, n); j++ {
+			for j := 0; j < min(i+1, n); j++ {
 				b[i*ldb+j] = a[i*lda+j]
 			}
 		}
 	case blas.All:
 		for i := 0; i < m; i++ {
 			for j := 0; j < n; j++ {
--- a/native/dlaset.go
+++ b/native/dlaset.go
@@ -20,7 +20,7 @@ func (impl Implementation) Dlaset(uplo blas.Uplo, m, n int, alpha, beta float64,
 		}
 	} else if uplo == blas.Lower {
 		for i := 0; i < m; i++ {
-			for j := 0; j < i; j++ {
+			for j := 0; j < min(i+1, n); j++ {
 				a[i*lda+j] = alpha
 			}
 		}
--- a/native/ilaenv.go
+++ b/native/ilaenv.go
@@ -355,7 +355,7 @@ func (Implementation) Ilaenv(ispec int, s string, opts string, n1, n2, n3, n4 in
 		return 2
 	case 6:
 		// Used by xGELSS and xGESVD
-		return min(n1, n2) * 1e6
+		return int(float64(min(n1, n2)) * 1.6)
 	case 7:
 		// Not used
 		return 1
--- a/testlapack/dbdsqr.go
+++ b/testlapack/dbdsqr.go
@@ -35,6 +35,8 @@ func DbdsqrTest(t *testing.T, impl Dbdsqrer) {
 			{10, 10, 10, 10, 30, 40, 50},
 			{10, 12, 11, 13, 30, 40, 50},
 			{20, 12, 13, 11, 30, 40, 50},
 			{130, 130, 130, 500, 900, 900, 500},
 		} {
 			for cas := 0; cas < 100; cas++ {
 				n := test.n
--- a/testlapack/dgebrd.go
+++ b/testlapack/dgebrd.go
@@ -46,8 +46,9 @@ func DgebrdTest(t *testing.T, impl Dgebrder) {
 		for i := range a {
 			a[i] = rand.NormFloat64()
 		}
 		d := make([]float64, minmn)
-		e := make([]float64, minmn)
+		e := make([]float64, minmn-1)
 		tauP := make([]float64, minmn)
 		tauQ := make([]float64, minmn)
 		work := make([]float64, max(m, n))
@@ -78,6 +79,21 @@ func DgebrdTest(t *testing.T, impl Dgebrder) {
 		impl.Dgebrd(m, n, a, lda, d, e, tauQ, tauP, work, lwork)
 		work = make([]float64, int(work[0]))
 		lwork = len(work)
 		for i := range work {
 			work[i] = math.NaN()
 		}
 		for i := range d {
 			d[i] = math.NaN()
 		}
 		for i := range e {
 			e[i] = math.NaN()
 		}
 		for i := range tauQ {
 			tauQ[i] = math.NaN()
 		}
 		for i := range tauP {
 			tauP[i] = math.NaN()
 		}
 		impl.Dgebrd(m, n, a, lda, d, e, tauQ, tauP, work, lwork)
 		// Test answers
--- a/testlapack/dgesvd.go
+++ b/testlapack/dgesvd.go
@@ -25,19 +25,37 @@ func DgesvdTest(t *testing.T, impl Dgesvder) {
 	// TODO(btracey): Add tests for m > mnthr and n > mnthr when other SVD
 	// conditions are implemented. Right now mnthr is 5,000,000 which is too
 	// large to create a square matrix of that size.
 	for _, jobU := range []lapack.SVDJob{lapack.SVDAll} {
 		for _, jobVT := range []lapack.SVDJob{lapack.SVDAll} {
 	for _, test := range []struct {
 		m, n, lda, ldu, ldvt int
 	}{
 		{5, 5, 0, 0, 0},
-				{5, 7, 0, 0, 0},
+		{5, 6, 0, 0, 0},
-				{7, 5, 0, 0, 0},
+		{6, 5, 0, 0, 0},
 		{5, 9, 0, 0, 0},
 		{9, 5, 0, 0, 0},
 		{5, 5, 10, 11, 12},
-				{5, 7, 10, 11, 12},
+		{5, 6, 10, 11, 12},
-				{7, 5, 10, 11, 12},
+		{6, 5, 10, 11, 12},
 		{5, 5, 10, 11, 12},
 		{5, 9, 10, 11, 12},
 		{9, 5, 10, 11, 12},
 		{300, 300, 0, 0, 0},
 		{300, 400, 0, 0, 0},
 		{400, 300, 0, 0, 0},
 		{300, 600, 0, 0, 0},
 		{600, 300, 0, 0, 0},
 		{300, 300, 400, 450, 460},
 		{300, 400, 500, 550, 560},
 		{400, 300, 550, 550, 560},
 		{300, 600, 700, 750, 760},
 		{600, 300, 700, 750, 760},
 	} {
 		jobU := lapack.SVDAll
 		jobVT := lapack.SVDAll
 		m := test.m
 		n := test.n
 		lda := test.lda
@@ -68,6 +86,10 @@ func DgesvdTest(t *testing.T, impl Dgesvder) {
 			vt[i] = rand.NormFloat64()
 		}
 		uAllOrig := make([]float64, len(u))
 		copy(uAllOrig, u)
 		vtAllOrig := make([]float64, len(vt))
 		copy(vtAllOrig, vt)
 		aCopy := make([]float64, len(a))
 		copy(aCopy, a)
@@ -76,10 +98,114 @@ func DgesvdTest(t *testing.T, impl Dgesvder) {
 		work := make([]float64, 1)
 		impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, u, ldu, vt, ldvt, work, -1)
 		if !floats.Equal(a, aCopy) {
 			t.Errorf("a changed during call to get work length")
 		}
 		work = make([]float64, int(work[0]))
 		impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, u, ldu, vt, ldvt, work, len(work))
-				// Test the decomposition
+		errStr := fmt.Sprintf("m = %v, n = %v, lda = %v, ldu = %v, ldv = %v", m, n, lda, ldu, ldvt)
 		svdCheck(t, false, errStr, m, n, s, a, u, ldu, vt, ldvt, aCopy, lda)
 		svdCheckPartial(t, impl, lapack.SVDAll, errStr, uAllOrig, vtAllOrig, aCopy, m, n, a, lda, s, u, ldu, vt, ldvt, work, false)
 		// Test InPlace
 		jobU = lapack.SVDInPlace
 		jobVT = lapack.SVDInPlace
 		copy(a, aCopy)
 		copy(u, uAllOrig)
 		copy(vt, vtAllOrig)
 		impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, u, ldu, vt, ldvt, work, len(work))
 		svdCheck(t, true, errStr, m, n, s, a, u, ldu, vt, ldvt, aCopy, lda)
 		svdCheckPartial(t, impl, lapack.SVDInPlace, errStr, uAllOrig, vtAllOrig, aCopy, m, n, a, lda, s, u, ldu, vt, ldvt, work, false)
 	}
 }
 // svdCheckPartial checks that Dgesvd produces the same singular values and
 // singular vectors when only a subset of the outputs is requested.
 //
 // On entry, s, u and vt are expected to hold the results of a previous Dgesvd
 // call made with jobU == jobVT == job, and aCopy the original input matrix.
 // uAllOrig and vtAllOrig hold the contents of u and vt before that call, so
 // that elements outside the matrix bounds compare equal afterwards.
 //
 // Three configurations are recomputed and compared against the entry values:
 //  jobU == SVDNone, jobVT == SVDNone   singular values only
 //  jobU == job,     jobVT == SVDNone   singular values and U
 //  jobU == SVDNone, jobVT == job       singular values and V^T
 //
 // If shortWork is true, the workspace length passed to Dgesvd is one less than
 // the length reported by the workspace query. a, s, u, vt are overwritten.
 func svdCheckPartial(t *testing.T, impl Dgesvder, job lapack.SVDJob, errStr string, uAllOrig, vtAllOrig, aCopy []float64, m, n int, a []float64, lda int, s, u []float64, ldu int, vt []float64, ldvt int, work []float64, shortWork bool) {
 	// Compare the singular values when computed with {SVDNone, SVDNone}.
 	sCopy := make([]float64, len(s))
 	copy(sCopy, s)
 	copy(a, aCopy)
 	// Scramble s so that stale values cannot make the comparison pass.
 	for i := range s {
 		s[i] = rand.Float64()
 	}
 	// Placeholder slices passed for the singular vectors that are not computed.
 	tmp1 := make([]float64, 1)
 	tmp2 := make([]float64, 1)
 	jobU := lapack.SVDNone
 	jobVT := lapack.SVDNone
 	// Workspace query: lwork == -1 stores the workspace length in work[0].
 	impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, tmp1, ldu, tmp2, ldvt, work, -1)
 	work = make([]float64, int(work[0]))
 	lwork := len(work)
 	if shortWork {
 		lwork--
 	}
 	ok := impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, tmp1, ldu, tmp2, ldvt, work, lwork)
 	if !ok {
 		t.Errorf("Dgesvd did not complete successfully: %s", errStr)
 	}
 	if !floats.EqualApprox(s, sCopy, 1e-10) {
 		t.Errorf("Singular value mismatch when singular vectors not computed: %s", errStr)
 	}
 	// Check that the singular vectors are correctly computed when the other
 	// is none.
 	uAll := make([]float64, len(u))
 	copy(uAll, u)
 	vtAll := make([]float64, len(vt))
 	copy(vtAll, vt)
 	// Copy the original vectors so the data outside the matrix bounds is the same.
 	copy(u, uAllOrig)
 	copy(vt, vtAllOrig)
 	// U requested, VT not.
 	jobU = job
 	jobVT = lapack.SVDNone
 	copy(a, aCopy)
 	for i := range s {
 		s[i] = rand.Float64()
 	}
 	impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, u, ldu, tmp2, ldvt, work, -1)
 	work = make([]float64, int(work[0]))
 	lwork = len(work)
 	if shortWork {
 		lwork--
 	}
 	// Pass lwork (not len(work)) so that shortWork is honored here as well.
 	impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, u, ldu, tmp2, ldvt, work, lwork)
 	if !floats.EqualApprox(uAll, u, 1e-10) {
 		t.Errorf("U mismatch when VT is not computed: %s", errStr)
 	}
 	if !floats.EqualApprox(s, sCopy, 1e-10) {
 		t.Errorf("Singular value mismatch when U computed VT not: %s", errStr)
 	}
 	// VT requested, U not.
 	jobU = lapack.SVDNone
 	jobVT = job
 	copy(a, aCopy)
 	for i := range s {
 		s[i] = rand.Float64()
 	}
 	impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, tmp1, ldu, vt, ldvt, work, -1)
 	work = make([]float64, int(work[0]))
 	lwork = len(work)
 	if shortWork {
 		lwork--
 	}
 	// Pass lwork (not len(work)) so that shortWork is honored here as well.
 	impl.Dgesvd(jobU, jobVT, m, n, a, lda, s, tmp1, ldu, vt, ldvt, work, lwork)
 	if !floats.EqualApprox(vtAll, vt, 1e-10) {
 		t.Errorf("VT mismatch when U is not computed: %s", errStr)
 	}
 	if !floats.EqualApprox(s, sCopy, 1e-10) {
 		t.Errorf("Singular value mismatch when VT computed U not: %s", errStr)
 	}
 }
 // svdCheck checks that the singular value decomposition correctly multiplies back
 // to the original matrix.
 func svdCheck(t *testing.T, thin bool, errStr string, m, n int, s, a, u []float64, ldu int, vt []float64, ldvt int, aCopy []float64, lda int) {
 	sigma := blas64.General{
 		Rows:   m,
 		Cols:   n,
@@ -102,6 +228,12 @@ func DgesvdTest(t *testing.T, impl Dgesvder) {
 		Stride: ldvt,
 		Data:   vt,
 	}
 	if thin {
 		sigma.Rows = min(m, n)
 		sigma.Cols = min(m, n)
 		uMat.Cols = min(m, n)
 		vTMat.Rows = min(m, n)
 	}
 	tmp := blas64.General{
 		Rows:   m,
@@ -120,12 +252,12 @@ func DgesvdTest(t *testing.T, impl Dgesvder) {
 	blas64.Gemm(blas.NoTrans, blas.NoTrans, 1, uMat, sigma, 0, tmp)
 	blas64.Gemm(blas.NoTrans, blas.NoTrans, 1, tmp, vTMat, 0, ans)
 				errStr := fmt.Sprintf("jobU = %v, jobVT = %v, m = %v, n = %v, lda = %v, ldu = %v, ldv = %v", jobU, jobVT, m, n, lda, ldu, ldvt)
 	if !floats.EqualApprox(ans.Data, aCopy, 1e-8) {
-					t.Errorf("Decomposition mismatch %s", errStr)
+		t.Errorf("Decomposition mismatch. Trim = %v, %s", thin, errStr)
 	}
-				// Check that U and V are orthogonal
+	if !thin {
 		// Check that U and V are orthogonal.
 		for i := 0; i < uMat.Rows; i++ {
 			for j := i + 1; j < uMat.Rows; j++ {
 				dot := blas64.Dot(uMat.Cols,
@@ -149,6 +281,5 @@ func DgesvdTest(t *testing.T, impl Dgesvder) {
 			}
 		}
 	}
-		}
+
 	}
 }