diff --git a/pca.go b/pca.go new file mode 100644 index 00000000..fa5b52b7 --- /dev/null +++ b/pca.go @@ -0,0 +1,81 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package stat + +import ( + "math" + + "github.com/gonum/floats" + "github.com/gonum/matrix" + "github.com/gonum/matrix/mat64" +) + +// PrincipalComponents returns the principal component direction vectors and +// the column variances of the principle component scores, vecs * a, computed +// using the singular value decomposition of the input. The input a is an n×d +// matrix where each row is an observation and each column represents a variable. +// +// PrincipalComponents centers the variables but does not scale the variance. +// +// The slice weights is used to weight the observations. If weights is nil, +// each weight is considered to have a value of one, otherwise the length of +// weights must match the number of observations or PrincipalComponents will +// panic. +// +// On successful completion, the principal component direction vectors are +// returned in vecs as a d×min(n, d) matrix, and the variances are returned in +// vars as a min(n, d)-long slice in descending sort order. +// +// If no singular value decomposition is possible, vecs and vars are returned +// nil and ok is returned false. +func PrincipalComponents(a mat64.Matrix, weights []float64) (vecs *mat64.Dense, vars []float64, ok bool) { + n, d := a.Dims() + if weights != nil && len(weights) != n { + panic("stat: len(weights) != observations") + } + + centered := mat64.NewDense(n, d, nil) + col := make([]float64, n) + for j := 0; j < d; j++ { + mat64.Col(col, j, a) + floats.AddConst(-Mean(col, weights), col) + centered.SetCol(j, col) + } + for i, w := range weights { + floats.Scale(math.Sqrt(w), centered.RawRowView(i)) + } + + kind := matrix.SVDFull + if n > d { + kind = matrix.SVDThin + } + var svd mat64.SVD + ok = svd.Factorize(centered, kind) + if !ok { + return nil, nil, false + } + + var v mat64.Dense + v.VFromSVD(&svd) + vecs = v.View(0, 0, d, min(n, d)).(*mat64.Dense) + vars = svd.Values(nil) + var f float64 + if weights == nil { + f = 1 / float64(n-1) + } else { + f = 1 / (floats.Sum(weights) - 1) + } + for i, v := range vars { + vars[i] = f * v * v + } + return vecs, vars, true +} + +func min(a, b int) int { + if a < b { + return a + } + return b +} diff --git a/pca_example_test.go b/pca_example_test.go new file mode 100644 index 00000000..36080a33 --- /dev/null +++ b/pca_example_test.go @@ -0,0 +1,59 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package stat_test + +import ( + "fmt" + + "github.com/gonum/matrix/mat64" + "github.com/gonum/stat" +) + +func ExamplePrincipalComponents() { + // iris is a truncated sample of the Fisher's Iris dataset. + n := 10 + d := 4 + iris := mat64.NewDense(n, d, []float64{ + 5.1, 3.5, 1.4, 0.2, + 4.9, 3.0, 1.4, 0.2, + 4.7, 3.2, 1.3, 0.2, + 4.6, 3.1, 1.5, 0.2, + 5.0, 3.6, 1.4, 0.2, + 5.4, 3.9, 1.7, 0.4, + 4.6, 3.4, 1.4, 0.3, + 5.0, 3.4, 1.5, 0.2, + 4.4, 2.9, 1.4, 0.2, + 4.9, 3.1, 1.5, 0.1, + }) + + // Calculate the principal component direction vectors + // and variances. + vecs, vars, ok := stat.PrincipalComponents(iris, nil) + if !ok { + return + } + fmt.Printf("variances = %.4f\n\n", vars) + + // Project the data onto the first 2 principal components. + k := 2 + var proj mat64.Dense + proj.Mul(iris, vecs.View(0, 0, d, k)) + + fmt.Printf("proj = %.4f", mat64.Formatted(&proj, mat64.Prefix(" "))) + + // Output: + // variances = [0.1666 0.0207 0.0079 0.0019] + // + // proj = ⎡-6.1686 1.4659⎤ + // ⎢-5.6767 1.6459⎥ + // ⎢-5.6699 1.3642⎥ + // ⎢-5.5643 1.3816⎥ + // ⎢-6.1734 1.3309⎥ + // ⎢-6.7278 1.4021⎥ + // ⎢-5.7743 1.1498⎥ + // ⎢-6.0466 1.4714⎥ + // ⎢-5.2709 1.3570⎥ + // ⎣-5.7533 1.6207⎦ +} diff --git a/pca_test.go b/pca_test.go new file mode 100644 index 00000000..77553767 --- /dev/null +++ b/pca_test.go @@ -0,0 +1,159 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package stat + +import ( + "testing" + + "github.com/gonum/floats" + "github.com/gonum/matrix/mat64" +) + +func TestPrincipalComponents(t *testing.T) { + for i, test := range []struct { + data mat64.Matrix + weights []float64 + wantVecs *mat64.Dense + wantVars []float64 + epsilon float64 + }{ + // Test results verified using R. + { + data: mat64.NewDense(3, 3, []float64{ + 1, 2, 3, + 4, 5, 6, + 7, 8, 9, + }), + wantVecs: mat64.NewDense(3, 3, []float64{ + 0.5773502691896258, 0.8164965809277261, 0, + 0.577350269189626, -0.4082482904638632, -0.7071067811865476, + 0.5773502691896258, -0.4082482904638631, 0.7071067811865475, + }), + wantVars: []float64{27, 0, 0}, + epsilon: 1e-12, + }, + { // Truncated iris data. + data: mat64.NewDense(10, 4, []float64{ + 5.1, 3.5, 1.4, 0.2, + 4.9, 3.0, 1.4, 0.2, + 4.7, 3.2, 1.3, 0.2, + 4.6, 3.1, 1.5, 0.2, + 5.0, 3.6, 1.4, 0.2, + 5.4, 3.9, 1.7, 0.4, + 4.6, 3.4, 1.4, 0.3, + 5.0, 3.4, 1.5, 0.2, + 4.4, 2.9, 1.4, 0.2, + 4.9, 3.1, 1.5, 0.1, + }), + wantVecs: mat64.NewDense(4, 4, []float64{ + -0.6681110197952722, 0.7064764857539533, -0.14026590216895132, -0.18666578956412125, + -0.7166344774801547, -0.6427036135482664, -0.135650285905254, 0.23444848208629923, + -0.164411275166307, 0.11898477441068218, 0.9136367900709548, 0.35224901970831746, + -0.11415613655453069, -0.2714141920887426, 0.35664028439226514, -0.8866286823515034, + }), + wantVars: []float64{0.1665786313282786, 0.02065509475412993, 0.007944620317765855, 0.0019327647109368329}, + epsilon: 1e-12, + }, + { // Truncated iris data transposed to check for operation on fat input. + data: mat64.NewDense(10, 4, []float64{ + 5.1, 3.5, 1.4, 0.2, + 4.9, 3.0, 1.4, 0.2, + 4.7, 3.2, 1.3, 0.2, + 4.6, 3.1, 1.5, 0.2, + 5.0, 3.6, 1.4, 0.2, + 5.4, 3.9, 1.7, 0.4, + 4.6, 3.4, 1.4, 0.3, + 5.0, 3.4, 1.5, 0.2, + 4.4, 2.9, 1.4, 0.2, + 4.9, 3.1, 1.5, 0.1, + }).T(), + wantVecs: mat64.NewDense(10, 4, []float64{ + -0.3366602459946619, -0.1373634006401213, 0.3465102523547623, -0.10290179303893479, + -0.31381852053861975, 0.5197145790632827, 0.5567296129086686, -0.15923062170153618, + -0.30857197637565165, -0.07670930360819002, 0.36159923003337235, 0.3342301027853355, + -0.29527124351656137, 0.16885455995353074, -0.5056204762881208, 0.32580913261444344, + -0.3327611073694004, -0.39365834489416474, 0.04900050959307464, 0.46812879383236555, + -0.34445484362044815, -0.2985206914561878, -0.1009714701361799, -0.16803618186050803, + -0.2986246350957691, -0.4222037823717799, -0.11838613462182519, -0.580283530375069, + -0.325911246223126, 0.024366468758217238, -0.12082035131864265, 0.16756027181337868, + -0.2814284432361538, 0.240812316260054, -0.24061437569068145, -0.365034616264623, + -0.31906138507685167, 0.4423912824105986, -0.2906412122303604, 0.027551046870337714, + }), + wantVars: []float64{41.8851906634233, 0.07762619213464989, 0.010516477775373585, 0}, + epsilon: 1e-12, + }, + { // Truncated iris data unitary weights. + data: mat64.NewDense(10, 4, []float64{ + 5.1, 3.5, 1.4, 0.2, + 4.9, 3.0, 1.4, 0.2, + 4.7, 3.2, 1.3, 0.2, + 4.6, 3.1, 1.5, 0.2, + 5.0, 3.6, 1.4, 0.2, + 5.4, 3.9, 1.7, 0.4, + 4.6, 3.4, 1.4, 0.3, + 5.0, 3.4, 1.5, 0.2, + 4.4, 2.9, 1.4, 0.2, + 4.9, 3.1, 1.5, 0.1, + }), + weights: []float64{1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + wantVecs: mat64.NewDense(4, 4, []float64{ + -0.6681110197952722, 0.7064764857539533, -0.14026590216895132, -0.18666578956412125, + -0.7166344774801547, -0.6427036135482664, -0.135650285905254, 0.23444848208629923, + -0.164411275166307, 0.11898477441068218, 0.9136367900709548, 0.35224901970831746, + -0.11415613655453069, -0.2714141920887426, 0.35664028439226514, -0.8866286823515034, + }), + wantVars: []float64{0.1665786313282786, 0.02065509475412993, 0.007944620317765855, 0.0019327647109368329}, + epsilon: 1e-12, + }, + { // Truncated iris data non-unitary weights. + data: mat64.NewDense(10, 4, []float64{ + 5.1, 3.5, 1.4, 0.2, + 4.9, 3.0, 1.4, 0.2, + 4.7, 3.2, 1.3, 0.2, + 4.6, 3.1, 1.5, 0.2, + 5.0, 3.6, 1.4, 0.2, + 5.4, 3.9, 1.7, 0.4, + 4.6, 3.4, 1.4, 0.3, + 5.0, 3.4, 1.5, 0.2, + 4.4, 2.9, 1.4, 0.2, + 4.9, 3.1, 1.5, 0.1, + }), + weights: []float64{2, 3, 1, 1, 1, 1, 1, 1, 1, 2}, + wantVecs: mat64.NewDense(4, 4, []float64{ + -0.618936145422414, 0.763069301531647, 0.124857741232537, 0.138035623677211, + -0.763958271606519, -0.603881770702898, 0.118267155321333, -0.194184052457746, + -0.143552119754944, 0.090014599564871, -0.942209377020044, -0.289018426115945, + -0.112599271966947, -0.212012782487076, -0.287515067921680, 0.927203898682805, + }), + wantVars: []float64{0.129621985550623, 0.022417487771598, 0.006454461065715, 0.002495076601075}, + epsilon: 1e-12, + }, + } { + vecs, vars, ok := PrincipalComponents(test.data, test.weights) + if !ok { + t.Errorf("unexpected SVD failure for test %d", i) + continue + } + if !mat64.EqualApprox(vecs, test.wantVecs, test.epsilon) { + t.Errorf("%d: unexpected PCA result got:\n%v\nwant:\n%v", + i, mat64.Formatted(vecs), mat64.Formatted(test.wantVecs)) + } + if !approxEqual(vars, test.wantVars, test.epsilon) { + t.Errorf("%d: unexpected variance result got:%v, want:%v", i, vars, test.wantVars) + } + } +} + +func approxEqual(a, b []float64, epsilon float64) bool { + if len(a) != len(b) { + return false + } + for i, v := range a { + if !floats.EqualWithinAbsOrRel(v, b[i], epsilon, epsilon) { + return false + } + } + return true +}