// Copyright ©2016 The gonum Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package stat import ( "errors" "math" "gonum.org/v1/gonum/floats" "gonum.org/v1/gonum/mat" ) // PC is a type for computing and extracting the principal components of a // matrix. The results of the principal components analysis are only valid // if the call to PrincipalComponents was successful. type PC struct { n, d int weights []float64 svd *mat.SVD ok bool } // PrincipalComponents performs a weighted principal components analysis on the // matrix of the input data which is represented as an n×d matrix a where each // row is an observation and each column is a variable. // // PrincipalComponents centers the variables but does not scale the variance. // // The weights slice is used to weight the observations. If weights is nil, each // weight is considered to have a value of one, otherwise the length of weights // must match the number of observations or PrincipalComponents will panic. // // PrincipalComponents returns whether the analysis was successful. func (c *PC) PrincipalComponents(a mat.Matrix, weights []float64) (ok bool) { c.n, c.d = a.Dims() if weights != nil && len(weights) != c.n { panic("stat: len(weights) != observations") } c.svd, c.ok = svdFactorizeCentered(c.svd, a, weights) if c.ok { c.weights = append(c.weights[:0], weights...) } return c.ok } // VectorsTo returns the component direction vectors of a principal components // analysis. The vectors are returned in the columns of a d×min(n, d) matrix. // If dst is not nil it must either be zero-sized or be a d×min(n, d) matrix. // dst will be used as the destination for the direction vector data. If dst // is nil, a new mat.Dense is allocated for the destination. 
func (c *PC) VectorsTo(dst *mat.Dense) *mat.Dense { if !c.ok { panic("stat: use of unsuccessful principal components analysis") } if dst != nil { if d, n := dst.Dims(); !dst.IsZero() && (d != c.d || n != min(c.n, c.d)) { panic(mat.ErrShape) } } return c.svd.VTo(dst) } // VarsTo returns the column variances of the principal component scores, // b * vecs, where b is a matrix with centered columns. Variances are returned // in descending order. // If dst is not nil it is used to store the variances and returned. // Vars will panic if the receiver has not successfully performed a principal // components analysis or dst is not nil and the length of dst is not min(n, d). func (c *PC) VarsTo(dst []float64) []float64 { if !c.ok { panic("stat: use of unsuccessful principal components analysis") } if dst != nil && len(dst) != min(c.n, c.d) { panic("stat: length of slice does not match analysis") } dst = c.svd.Values(dst) var f float64 if c.weights == nil { f = 1 / float64(c.n-1) } else { f = 1 / (floats.Sum(c.weights) - 1) } for i, v := range dst { dst[i] = f * v * v } return dst } func min(a, b int) int { if a < b { return a } return b } // CC is a type for computing the canonical correlations of a pair of matrices. // The results of the canonical correlation analysis are only valid // if the call to CanonicalCorrelations was successful. type CC struct { // n is the number of observations used to // construct the canonical correlations. n int // xd and yd are used for size checks. xd, yd int x, y, c *mat.SVD ok bool } // CanonicalCorrelations returns a CC which can provide the results of canonical // correlation analysis of the input data x and y, columns of which should be // interpretable as two sets of measurements on the same observations (rows). // These observations are optionally weighted by weights. 
//
// Canonical correlation analysis finds associations between two sets of
// variables on the same observations by finding linear combinations of the two
// sphered datasets that maximize the correlation between them.
//
// Some notation: let Xc and Yc denote the centered input data matrices x
// and y (column means subtracted from each column), let Sx and Sy denote the
// sample covariance matrices within x and y respectively, and let Sxy denote
// the covariance matrix between x and y. The sphered data can then be expressed
// as Xc * Sx^{-1/2} and Yc * Sy^{-1/2} respectively, and the correlation matrix
// between the sphered data is called the canonical correlation matrix,
// Sx^{-1/2} * Sxy * Sy^{-1/2}. In cases where S^{-1/2} is ambiguous for some
// covariance matrix S, S^{-1/2} is taken to be E * D^{-1/2} * E^T where S can
// be eigendecomposed as S = E * D * E^T.
//
// The canonical correlations are the correlations between the corresponding
// pairs of canonical variables and can be obtained with c.CorrsTo(dst).
// Canonical variables can be obtained by projecting the sphered data into the
// left and right eigenvectors of the canonical correlation matrix, and these
// eigenvectors can be obtained with c.LeftTo(dst, true) and
// c.RightTo(dst, true) respectively. The canonical variables can also be
// obtained directly from the centered raw data by using the back-transformed
// eigenvectors which can be obtained with c.LeftTo(dst, false) and
// c.RightTo(dst, false) respectively.
//
// The first pair of left and right eigenvectors of the canonical correlation
// matrix can be interpreted as directions into which the respective sphered
// data can be projected such that the correlation between the two projections
// is maximized. The second pair and onwards solve the same optimization but
// under the constraint that they are uncorrelated (orthogonal in sphered space)
// to previous projections.
// // CanonicalCorrelations will panic if the inputs x and y do not have the same // number of rows. // // The slice weights is used to weight the observations. If weights is nil, each // weight is considered to have a value of one, otherwise the length of weights // must match the number of observations (rows of both x and y) or // CanonicalCorrelations will panic. // // More details can be found at // https://en.wikipedia.org/wiki/Canonical_correlation // or in Chapter 3 of // Koch, Inge. Analysis of multivariate and high-dimensional data. // Vol. 32. Cambridge University Press, 2013. ISBN: 9780521887939 func (c *CC) CanonicalCorrelations(x, y mat.Matrix, weights []float64) error { var yn int c.n, c.xd = x.Dims() yn, c.yd = y.Dims() if c.n != yn { panic("stat: unequal number of observations") } if weights != nil && len(weights) != c.n { panic("stat: len(weights) != observations") } // Center and factorize x and y. c.x, c.ok = svdFactorizeCentered(c.x, x, weights) if !c.ok { return errors.New("stat: failed to factorize x") } c.y, c.ok = svdFactorizeCentered(c.y, y, weights) if !c.ok { return errors.New("stat: failed to factorize y") } xu := c.x.UTo(nil) xv := c.x.VTo(nil) yu := c.y.UTo(nil) yv := c.y.VTo(nil) // Calculate and factorise the canonical correlation matrix. var ccor mat.Dense ccor.Product(xv, xu.T(), yu, yv.T()) if c.c == nil { c.c = &mat.SVD{} } c.ok = c.c.Factorize(&ccor, mat.SVDThin) if !c.ok { return errors.New("stat: failed to factorize ccor") } return nil } // CorrsTo returns the canonical correlations, using dst if it is not nil. // If dst is not nil and len(dst) does not match the number of columns in // the y input matrix, Corrs will panic. 
func (c *CC) CorrsTo(dst []float64) []float64 { if !c.ok { panic("stat: canonical correlations missing or invalid") } if dst != nil && len(dst) != c.yd { panic("stat: length of destination does not match input dimension") } return c.c.Values(dst) } // LeftTo returns the left eigenvectors of the canonical correlation matrix if // spheredSpace is true. If spheredSpace is false it returns these eigenvectors // back-transformed to the original data space. // If dst is not nil it must either be zero-sized or be an xd×yd matrix where xd // and yd are the number of variables in the input x and y matrices. dst will // be used as the destination for the vector data. If dst is nil, a new // mat.Dense is allocated for the destination. func (c *CC) LeftTo(dst *mat.Dense, spheredSpace bool) *mat.Dense { if !c.ok || c.n < 2 { panic("stat: canonical correlations missing or invalid") } if dst != nil { if d, n := dst.Dims(); !dst.IsZero() && (n != c.yd || d != c.xd) { panic(mat.ErrShape) } } dst = c.c.UTo(dst) if spheredSpace { return dst } xs := c.x.Values(nil) xv := c.x.VTo(nil) scaleColsReciSqrt(xv, xs) dst.Product(xv, xv.T(), dst) dst.Scale(math.Sqrt(float64(c.n-1)), dst) return dst } // RightTo returns the right eigenvectors of the canonical correlation matrix if // spheredSpace is true. If spheredSpace is false it returns these eigenvectors // back-transformed to the original data space. // If dst is not nil it must either be zero-sized or be an yd×yd matrix where yd // is the number of variables in the input y matrix. dst will // be used as the destination for the vector data. If dst is nil, a new // mat.Dense is allocated for the destination. 
func (c *CC) RightTo(dst *mat.Dense, spheredSpace bool) *mat.Dense { if !c.ok || c.n < 2 { panic("stat: canonical correlations missing or invalid") } if dst != nil { if d, n := dst.Dims(); (n != 0 || d != 0) && (n != c.yd || d != c.yd) { panic(mat.ErrShape) } } dst = c.c.VTo(dst) if spheredSpace { return dst } ys := c.y.Values(nil) yv := c.y.VTo(nil) scaleColsReciSqrt(yv, ys) dst.Product(yv, yv.T(), dst) dst.Scale(math.Sqrt(float64(c.n-1)), dst) return dst } func svdFactorizeCentered(work *mat.SVD, m mat.Matrix, weights []float64) (svd *mat.SVD, ok bool) { n, d := m.Dims() centered := mat.NewDense(n, d, nil) col := make([]float64, n) for j := 0; j < d; j++ { mat.Col(col, j, m) floats.AddConst(-Mean(col, weights), col) centered.SetCol(j, col) } for i, w := range weights { floats.Scale(math.Sqrt(w), centered.RawRowView(i)) } if work == nil { work = &mat.SVD{} } ok = work.Factorize(centered, mat.SVDThin) return work, ok } // scaleColsReciSqrt scales the columns of cols // by the reciprocal square-root of vals. func scaleColsReciSqrt(cols *mat.Dense, vals []float64) { if cols == nil { panic("stat: input nil") } n, d := cols.Dims() if len(vals) != d { panic("stat: input length mismatch") } col := make([]float64, n) for j := 0; j < d; j++ { mat.Col(col, j, cols) floats.Scale(math.Sqrt(1/vals[j]), col) cols.SetCol(j, col) } }