diff --git a/covariancematrix.go b/covariancematrix.go index 4f277ec8..c3ee2671 100644 --- a/covariancematrix.go +++ b/covariancematrix.go @@ -7,6 +7,7 @@ package stat import ( "github.com/gonum/floats" "github.com/gonum/matrix/mat64" + "math" ) // CovarianceMatrix calculates a covariance matrix (also known as a @@ -17,16 +18,17 @@ import ( // The weights wts should have the length equal to the number of rows in // input data matrix x. cov should either be a square matrix with the same // number of columns as the input data matrix x, or nil in which case a new -// Dense matrix will be constructed. +// Dense matrix will be constructed. Weights cannot be negative. func CovarianceMatrix(cov *mat64.Dense, x mat64.Matrix, wts []float64) *mat64.Dense { // This is the matrix version of the two-pass algorithm. It doesn't use the // additional floating point error correction that the Covariance function uses // to reduce the impact of rounding during centering. + // TODO(jonlawlor): indicate that the resulting matrix is symmetric, and change + // the returned type from a *mat.Dense to a *mat.Symmetric. + r, c := x.Dims() - // TODO(jonlawlor): indicate that the resulting matrix is symmetric, which - // should improve performance. if cov == nil { cov = mat64.NewDense(c, c, nil) } else if covr, covc := cov.Dims(); covr != covc || covc != c { @@ -38,35 +40,43 @@ func CovarianceMatrix(cov *mat64.Dense, x mat64.Matrix, wts []float64) *mat64.De // Subtract the mean of each of the columns. for i := 0; i < c; i++ { v := xt.RawRowView(i) + // This will panic with ErrShape if len(wts) != len(v), so + // we don't have to check the size later. mean := Mean(v, wts) floats.AddConst(-mean, v) } - var xc mat64.Dense - xc.TCopy(&xt) - var n float64 - if wts != nil { - if wr := len(wts); wr != r { - panic(mat64.ErrShape) - } + if wts == nil { - // Weight the rows. - for i := 0; i < c; i++ { - v := xt.RawRowView(i) - floats.Mul(v, wts) - } - - // Calculate the normalization factor. - n = floats.Sum(wts) - } else { n = float64(r) + + cov.MulTrans(&xt, false, &xt, true) + + // Scale by the sample size. + cov.Scale(1/(n-1), cov) + return cov } - cov.Mul(&xt, &xc) + // Multiply by the sqrt of the weights, so that multiplication is symmetric. + sqrtwts := make([]float64, r) + for i, w := range wts { + if w < 0 { + panic("stat: negative covariance matrix weights") + } + sqrtwts[i] = math.Sqrt(w) + } + // Weight the rows. + for i := 0; i < c; i++ { + v := xt.RawRowView(i) + floats.Mul(v, sqrtwts) + } + + // Calculate the normalization factor. + n = floats.Sum(wts) + cov.MulTrans(&xt, false, &xt, true) // Scale by the sample size. cov.Scale(1/(n-1), cov) - return cov } diff --git a/covariancematrix_test.go b/covariancematrix_test.go index c6745e30..75b037b5 100644 --- a/covariancematrix_test.go +++ b/covariancematrix_test.go @@ -65,10 +65,10 @@ func TestCovarianceMatrix(t *testing.T) { t.Errorf("%d: expected cov %v, found %v", i, test.ans, c) } if !floats.Equal(d, r.Data) { - t.Errorf("%d: data was modified during execution") + t.Errorf("%d: data was modified during execution", i) } if !floats.Equal(w, test.weights) { - t.Errorf("%d: weights was modified during execution") + t.Errorf("%d: weights was modified during execution", i) } // compare with call to Covariance @@ -91,7 +91,9 @@ func TestCovarianceMatrix(t *testing.T) { if !Panics(func() { CovarianceMatrix(mat64.NewDense(1, 1, nil), mat64.NewDense(5, 2, nil), nil) }) { t.Errorf("CovarianceMatrix did not panic with preallocation size mismatch") } - + if !Panics(func() { CovarianceMatrix(nil, mat64.NewDense(2, 2, []float64{1, 2, 3, 4}), []float64{1, -1}) }) { + t.Errorf("CovarianceMatrix did not panic with negative weights") + } } // benchmarks @@ -110,6 +112,17 @@ func benchmarkCovarianceMatrix(b *testing.B, m mat64.Matrix) { CovarianceMatrix(nil, m, nil) } } +func benchmarkCovarianceMatrixWeighted(b *testing.B, m mat64.Matrix) { + r, _ := m.Dims() + wts := make([]float64, r) + for i := range wts { + wts[i] = 0.5 + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + CovarianceMatrix(nil, m, wts) + } +} func benchmarkCovarianceMatrixInPlace(b *testing.B, m mat64.Matrix) { _, c := m.Dims() res := mat64.NewDense(c, c, nil) @@ -153,6 +166,40 @@ func BenchmarkCovarianceMatrixHugexSmall(b *testing.B) { benchmarkCovarianceMatrix(b, x) } +func BenchmarkCovarianceMatrixSmallxSmallWeighted(b *testing.B) { + // 10 * 10 elements + x := randMat(small, small) + benchmarkCovarianceMatrixWeighted(b, x) +} +func BenchmarkCovarianceMatrixSmallxMediumWeighted(b *testing.B) { + // 10 * 1000 elements + x := randMat(small, medium) + benchmarkCovarianceMatrixWeighted(b, x) +} + +func BenchmarkCovarianceMatrixMediumxSmallWeighted(b *testing.B) { + // 1000 * 10 elements + x := randMat(medium, small) + benchmarkCovarianceMatrixWeighted(b, x) +} +func BenchmarkCovarianceMatrixMediumxMediumWeighted(b *testing.B) { + // 1000 * 1000 elements + x := randMat(medium, medium) + benchmarkCovarianceMatrixWeighted(b, x) +} + +func BenchmarkCovarianceMatrixLargexSmallWeighted(b *testing.B) { + // 1e5 * 10 elements + x := randMat(large, small) + benchmarkCovarianceMatrixWeighted(b, x) +} + +func BenchmarkCovarianceMatrixHugexSmallWeighted(b *testing.B) { + // 1e7 * 10 elements + x := randMat(huge, small) + benchmarkCovarianceMatrixWeighted(b, x) +} + func BenchmarkCovarianceMatrixSmallxSmallInPlace(b *testing.B) { // 10 * 10 elements x := randMat(small, small)