gonum/stat.go

// Copyright ©2014 The gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package stat

import (
	"math"
	"sort"

	"github.com/gonum/floats"
)

// CumulantKind specifies the behavior for calculating the empirical CDF or Quantile.
// It is the type of the c argument accepted by the CDF and Quantile functions.
type CumulantKind int

const (
	// Constant values should match the R nomenclature. See
	// https://en.wikipedia.org/wiki/Quantile#Estimating_the_quantiles_of_a_population

	// Empirical treats the distribution as the actual empirical distribution.
	// It is the only kind the CDF and Quantile switches in this file handle;
	// any other value makes them panic with "stat: bad cumulant kind".
	Empirical CumulantKind = 1
)

// bhattacharyyaCoeff returns the Bhattacharyya coefficient of the probability
// distributions p and q:
//  \sum_i \sqrt{p_i q_i}
//
// No length check is made here; callers are expected to pass slices of equal
// length.
func bhattacharyyaCoeff(p, q []float64) float64 {
	coeff := 0.0
	for i := 0; i < len(p); i++ {
		coeff += math.Sqrt(p[i] * q[i])
	}
	return coeff
}

// Bhattacharyya returns the Bhattacharyya distance between the probability
// distributions p and q, defined as
//  -\ln ( \sum_i \sqrt{p_i q_i} )
//
// It panics if len(p) != len(q). Both p and q are assumed to sum to 1; this
// is not verified.
func Bhattacharyya(p, q []float64) float64 {
	if len(q) != len(p) {
		panic("stat: slice length mismatch")
	}
	return -math.Log(bhattacharyyaCoeff(p, q))
}

// CDF returns the empirical cumulative distribution function value at q, that is
// the fraction of the samples less than or equal to q. The
// exact behavior is determined by the CumulantKind. CDF is theoretically
// the inverse of the Quantile function, though it may not be the actual inverse
// for all values q and CumulantKinds.
//
// The x data must be sorted in increasing order. If weights is nil then all
// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
//
// CDF returns NaN if x contains a NaN or if q is NaN. It panics if x is empty,
// if x is not sorted, if len(x) != len(weights) for non-nil weights, or if c
// is not a recognized CumulantKind.
//
// CumulantKind behaviors:
//  - Empirical: Returns the lowest fraction for which q is greater than or equal
//  to that fraction of samples
func CDF(q float64, c CumulantKind, x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	// An empty sample has no distribution; fail with a clear message instead
	// of an index-out-of-range panic at x[0] below.
	if len(x) == 0 {
		panic("stat: zero length slice")
	}
	if floats.HasNaN(x) {
		return math.NaN()
	}
	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}
	// A NaN q compares false against every sample, so none of the branches
	// below would ever return; report NaN explicitly.
	if math.IsNaN(q) {
		return math.NaN()
	}

	if q < x[0] {
		return 0
	}
	if q >= x[len(x)-1] {
		return 1
	}

	var sumWeights float64
	if weights == nil {
		sumWeights = float64(len(x))
	} else {
		sumWeights = floats.Sum(weights)
	}

	// Calculate the index
	switch c {
	case Empirical:
		// Accumulate weight until the first sample strictly greater than q;
		// the weight seen so far is the mass at or below q.
		var w float64
		for i, v := range x {
			if v > q {
				return w / sumWeights
			}
			if weights == nil {
				w++
			} else {
				w += weights[i]
			}
		}
		// Unreachable: q < x[len(x)-1] was established above.
		panic("impossible")
	default:
		panic("stat: bad cumulant kind")
	}
}

// ChiSquare returns the chi-square distance between the observed frequencies
// obs and the expected frequencies exp:
//  \sum_i (obs_i-exp_i)^2 / exp_i
//
// Terms where both obs_i and exp_i are zero are skipped. ChiSquare panics if
// len(obs) != len(exp).
func ChiSquare(obs, exp []float64) float64 {
	if len(obs) != len(exp) {
		panic("stat: slice length mismatch")
	}
	var dist float64
	for i := range obs {
		o, e := obs[i], exp[i]
		if o != 0 || e != 0 {
			diff := o - e
			dist += diff * diff / e
		}
	}
	return dist
}

// Correlation returns the weighted correlation between the samples of x and y.
// The means are computed internally with Mean.
//  sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (stdX * stdY)
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Correlation(x, y, weights []float64) float64 {
	// Two-pass corrected implementation, adapted from the algorithm used in
	// MeanVariance: the first pass finds the means, the second accumulates
	// the sums of products together with a correction term.

	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	meanX, meanY := Mean(x, weights), Mean(y, weights)

	// ssX, ssY and ssXY are the (weighted) sums of products of deviations;
	// compX and compY are the (weighted) sums of deviations used by the
	// correction from Chan, et. al. referenced in MeanVariance (analogous to
	// the second term in (1.7) of that paper).
	var ssX, ssY, ssXY, compX, compY float64

	if weights != nil {
		var total float64
		for i := range x {
			w := weights[i]
			dx := x[i] - meanX
			dy := y[i] - meanY
			wdx := w * dx
			wdy := w * dy
			ssX += wdx * dx
			ssY += wdy * dy
			ssXY += wdx * dy
			compX += wdx
			compY += wdy
			total += w
		}
		// The weighted correction uses the sum of weights in place of the
		// sample count.
		ssX -= compX * compX / total
		ssY -= compY * compY / total
		return (ssXY - compX*compY/total) / math.Sqrt(ssX*ssY)
	}

	for i := range x {
		dx := x[i] - meanX
		dy := y[i] - meanY
		ssX += dx * dx
		ssY += dy * dy
		ssXY += dx * dy
		compX += dx
		compY += dy
	}
	n := float64(len(x))
	ssX -= compX * compX / n
	ssY -= compY * compY / n
	return (ssXY - compX*compY/n) / math.Sqrt(ssX*ssY)
}

// Covariance returns the weighted covariance between the samples of x and y.
//  sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (sum_j {w_j} - 1)
// The lengths of x and y must be equal. If weights is nil then all of the
// weights are 1. If weights is not nil, then len(x) must equal len(weights).
func Covariance(x, y, weights []float64) float64 {
	// Two-pass corrected implementation, adapted from the algorithm used in
	// MeanVariance: the first pass finds the means, the second accumulates
	// the sum of products together with a correction term.

	if len(x) != len(y) {
		panic("stat: slice length mismatch")
	}
	meanX, meanY := Mean(x, weights), Mean(y, weights)

	// sumXY is the (weighted) sum of products of deviations; compX and compY
	// are the (weighted) sums of deviations used by the correction from
	// Chan, et. al. referenced in MeanVariance (analogous to the second term
	// in (1.7) of that paper).
	var sumXY, compX, compY float64

	if weights != nil {
		var total float64
		for i := range x {
			w := weights[i]
			wdx := w * (x[i] - meanX)
			dy := y[i] - meanY
			sumXY += wdx * dy
			compX += wdx
			compY += w * dy
			total += w
		}
		// The weighted correction uses the sum of weights in place of the
		// sample count.
		return (sumXY - compX*compY/total) / (total - 1)
	}

	for i := range x {
		dx := x[i] - meanX
		dy := y[i] - meanY
		sumXY += dx * dy
		compX += dx
		compY += dy
	}
	n := len(x)
	return (sumXY - compX*compY/float64(n)) / float64(n-1)
}

// CrossEntropy returns the cross-entropy between the two distributions
// specified in p and q, using the natural logarithm:
//  - sum_i (p_i * log_e(q_i))
// Terms with p_i == 0 contribute nothing. CrossEntropy panics if
// len(p) != len(q).
func CrossEntropy(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var total float64
	for i, pi := range p {
		if pi == 0 {
			continue
		}
		total -= pi * math.Log(q[i])
	}
	return total
}

// Entropy returns the Shannon entropy of the distribution p. The natural
// logarithm is used.
//  - sum_i (p_i * log_e(p_i))
// Zero entries are skipped so that 0 * log(0) is treated as 0.
func Entropy(p []float64) float64 {
	var h float64
	for i := range p {
		if pi := p[i]; pi != 0 {
			h -= pi * math.Log(pi)
		}
	}
	return h
}

// ExKurtosis returns the population excess kurtosis of the sample.
// The kurtosis is defined by the 4th moment of the mean divided by the squared
// variance. The excess kurtosis subtracts 3.0 so that the excess kurtosis of
// the normal distribution is zero.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func ExKurtosis(x, weights []float64) float64 {
	mu, sigma := MeanStdDev(x, weights)

	// fourth accumulates the (weighted) sum of the standardized deviations
	// raised to the fourth power.
	var fourth float64

	if weights != nil {
		var total float64
		for i := range x {
			z := (x[i] - mu) / sigma
			fourth += weights[i] * z * z * z * z
			total += weights[i]
		}
		// For weighted data the sum of weights stands in for the sample count.
		scale, shift := kurtosisCorrection(total)
		return fourth*scale - shift
	}

	for _, v := range x {
		z := (v - mu) / sigma
		fourth += z * z * z * z
	}
	scale, shift := kurtosisCorrection(float64(len(x)))
	return fourth*scale - shift
}

// kurtosisCorrection returns the multiplier and offset that ExKurtosis applies
// to the summed fourth powers of the standardized deviations. n is the number
// of samples (or the sum of weights for weighted data).
// See https://en.wikipedia.org/wiki/Kurtosis
func kurtosisCorrection(n float64) (mul, offset float64) {
	nm1, nm2, nm3 := n-1, n-2, n-3
	mul = ((n + 1) / nm1) * (n / nm2) * (1 / nm3)
	offset = 3 * (nm1 / nm2) * (nm1 / nm3)
	return mul, offset
}

// GeometricMean returns the weighted geometric mean of the dataset, computed
// in log space as
//  exp( \sum_i {w_i * log(x_i)} / \sum_i {w_i} )
// This only applies with positive x and positive weights. If weights is nil
// then all of the weights are 1. If weights is not nil, then len(x) must equal
// len(weights).
func GeometricMean(x, weights []float64) float64 {
	var logSum float64
	if weights == nil {
		for i := range x {
			logSum += math.Log(x[i])
		}
		return math.Exp(logSum / float64(len(x)))
	}
	if len(weights) != len(x) {
		panic("stat: slice length mismatch")
	}
	var total float64
	for i := range x {
		w := weights[i]
		logSum += w * math.Log(x[i])
		total += w
	}
	return math.Exp(logSum / total)
}

// HarmonicMean returns the weighted harmonic mean of the dataset
//  \sum_i {w_i} / ( sum_i {w_i / x_i} )
// This only applies with positive x and positive weights.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func HarmonicMean(x, weights []float64) float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	// TODO: Fix this to make it more efficient and avoid allocation

	// This can be numerically unstable (for example if x is very small)
	// W = \sum_i {w_i}
	// hm = exp(log(W) - log(\sum_i w_i / x_i))

	// terms[i] holds log(w_i / x_i); total is W.
	terms := make([]float64, len(x))
	var total float64
	if weights == nil {
		for i, v := range x {
			terms[i] = -math.Log(v)
			total++
		}
	} else {
		for i, v := range x {
			w := weights[i]
			terms[i] = math.Log(w) - math.Log(v)
			total += w
		}
	}

	// LogSumExp over the terms yields log(\sum_i { w_i / x_i}).
	return math.Exp(math.Log(total) - floats.LogSumExp(terms))
}

// Hellinger returns the Hellinger distance between the probability
// distributions p and q, defined as
//  \sqrt{ 1 - \sum_i \sqrt{p_i q_i} }
//
// It panics if len(p) != len(q). Both p and q are assumed to sum to 1; this
// is not verified.
func Hellinger(p, q []float64) float64 {
	if len(q) != len(p) {
		panic("stat: slice length mismatch")
	}
	overlap := bhattacharyyaCoeff(p, q)
	return math.Sqrt(1 - overlap)
}

// Histogram sums up the weighted number of data points in each bin.
// The weight of data point x[i] will be placed into count[j] if
// dividers[j] <= x < dividers[j+1]. The "span" function in the floats package can assist
// with bin creation.
//
// If count is nil a new slice is allocated; otherwise count is used as
// storage. In both cases the returned bins hold only the tallies for this
// call: any previous contents of count are overwritten.
//
// The following conditions on the inputs apply, and Histogram panics if any
// of them is violated:
//  - There must be at least two dividers.
//  - The count variable must either be nil or have length of one less than dividers.
//  - The values in dividers must be sorted (use the sort package).
//  - The x values must be sorted.
//  - Every x must satisfy dividers[0] <= x < dividers[len(dividers)-1].
//  - If weights is nil then all of the weights are 1.
//  - If weights is not nil, then len(x) must equal len(weights).
func Histogram(count, dividers, x, weights []float64) []float64 {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	// Check the divider count before sizing count, so that an empty dividers
	// slice reports this message instead of a negative-length make panic.
	if len(dividers) < 2 {
		panic("histogram: fewer than two dividers")
	}
	if count == nil {
		count = make([]float64, len(dividers)-1)
	}
	if len(count) != len(dividers)-1 {
		panic("histogram: bin count mismatch")
	}
	if !sort.Float64sAreSorted(dividers) {
		panic("histogram: dividers are not sorted")
	}
	if !sort.Float64sAreSorted(x) {
		panic("histogram: x data are not sorted")
	}
	if len(x) > 0 {
		if x[0] < dividers[0] {
			panic("histogram: minimum x value is less than lowest divider")
		}
		if x[len(x)-1] >= dividers[len(dividers)-1] {
			panic("histogram: maximum x value is greater than or equal to highest divider")
		}
	}

	// Reset caller-provided storage only after validation has passed, so a
	// panicking call leaves count untouched and a successful call never adds
	// onto stale values.
	for i := range count {
		count[i] = 0
	}

	// x is sorted, so the current bin index only ever moves forward.
	idx := 0
	comp := dividers[idx+1]
	for i, v := range x {
		if !(v < comp) {
			// Find the next bin whose upper divider exceeds v. The search is
			// bounded by len(count) so dividers[j+1] is always in range; a
			// value that matches no bin (a NaN) stays in the current bin.
			for j := idx + 1; j < len(count); j++ {
				if v < dividers[j+1] {
					idx = j
					comp = dividers[j+1]
					break
				}
			}
		}
		if weights == nil {
			count[idx]++
		} else {
			count[idx] += weights[i]
		}
	}
	return count
}

// JensenShannon returns the Jensen-Shannon divergence between the
// distributions p and q, defined as
//  m = 0.5 * (p + q)
//  JS(p, q) = 0.5 ( KL(p, m) + KL(q, m) )
// Unlike Kullback-Leibler, the Jensen-Shannon distance is symmetric. The value
// is between 0 and ln(2). JensenShannon panics if len(p) != len(q).
func JensenShannon(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var div float64
	for i := range p {
		pi, qi := p[i], q[i]
		// logMid is log(m_i); it is only consumed when pi or qi is nonzero.
		logMid := math.Log(0.5 * (pi + qi))
		if pi != 0 {
			// Contribution of KL(p, m).
			div += 0.5 * pi * (math.Log(pi) - logMid)
		}
		if qi != 0 {
			// Contribution of KL(q, m).
			div += 0.5 * qi * (math.Log(qi) - logMid)
		}
	}
	return div
}

// KolmogorovSmirnov computes the largest distance between two empirical CDFs.
// Each dataset x and y consists of sample locations and counts, xWeights and
// yWeights, respectively. A nil weights slice means every sample in that
// dataset has weight 1.
//
// x and y may have different lengths, though len(x) must equal len(xWeights), and
// len(y) must equal len(yWeights).  Both x and y must be sorted.
//
// KolmogorovSmirnov panics on a weights length mismatch or if x or y is not
// sorted. If both datasets are non-empty and either contains a NaN, NaN is
// returned.
//
// Special cases are:
//  = 0 if len(x) == len(y) == 0
//  = 1 if len(x) == 0, len(y) != 0 or len(x) != 0 and len(y) == 0
func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 {
	if xWeights != nil && len(x) != len(xWeights) {
		panic("stat: slice length mismatch")
	}
	if yWeights != nil && len(y) != len(yWeights) {
		panic("stat: slice length mismatch")
	}
	// Handle the empty-dataset special cases before touching x[0] or y[0].
	if len(x) == 0 || len(y) == 0 {
		if len(x) == 0 && len(y) == 0 {
			return 0
		}
		return 1
	}

	if floats.HasNaN(x) {
		return math.NaN()
	}
	if floats.HasNaN(y) {
		return math.NaN()
	}

	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}
	if !sort.Float64sAreSorted(y) {
		panic("y data are not sorted")
	}

	xWeightsNil := xWeights == nil
	yWeightsNil := yWeights == nil

	var (
		maxDist    float64 // largest |xCdf - yCdf| seen so far
		xSum, ySum float64 // total weight of each dataset, used to normalize the CDFs
		xCdf, yCdf float64 // empirical CDF of each dataset up to the current position
		xIdx, yIdx int     // index of the next unexamined sample in each dataset
	)

	if xWeightsNil {
		xSum = float64(len(x))
	} else {
		xSum = floats.Sum(xWeights)
	}

	if yWeightsNil {
		ySum = float64(len(y))
	} else {
		ySum = floats.Sum(yWeights)
	}

	// xVal and yVal track the value at the current position of each dataset.
	xVal := x[0]
	yVal := y[0]

	// Algorithm description:
	// The goal is to find the maximum difference in the empirical CDFs for the
	// two datasets. The CDFs are piecewise-constant, and thus the distance
	// between the CDFs will only change at the values themselves.
	//
	// To find the maximum distance, step through the data in ascending order
	// of value between the two datasets. At each step, compute the empirical CDF
	// and compare the local distance with the maximum distance.
	// Due to some corner cases, equal data entries must be tallied simultaneously.
	for {
		switch {
		case xVal < yVal:
			// x holds the smaller value: advance x past all samples equal to it.
			xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
		case yVal < xVal:
			// y holds the smaller value: advance y past all samples equal to it.
			yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
		case xVal == yVal:
			// The tracked values tie; decide using the samples actually at
			// the current indices.
			newX := x[xIdx]
			newY := y[yIdx]
			if newX < newY {
				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
			} else if newY < newX {
				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
			} else {
				// Update them both, they'll be equal next time and the right
				// thing will happen
				xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil)
				yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil)
			}
		default:
			// NaNs were rejected above, so one of the three comparisons holds.
			panic("unreachable")
		}

		dist := math.Abs(xCdf - yCdf)
		if dist > maxDist {
			maxDist = dist
		}

		// Both xCdf and yCdf will equal 1 at the end, so if we have reached the
		// end of either sample list, the distance is as large as it can be.
		if xIdx == len(x) || yIdx == len(y) {
			return maxDist
		}
	}
}

// updateKS consumes the sample at idx together with every following sample of
// equal value, folding their combined normalized weight into the running CDF.
// When isNil is true each sample counts 1/sum; otherwise sample i counts
// weights[i]/sum.
//
// On return, newCdf is the CDF after the consumed run and newIdx is the index
// of the next sample to examine. val is values[newIdx], or the last element of
// values when the run reached the end of the data (newIdx == len(values)).
func updateKS(idx int, cdf, sum float64, values, weights []float64, isNil bool) (val, newCdf float64, newIdx int) {
	// step returns the CDF increment contributed by sample i.
	step := func(i int) float64 {
		if isNil {
			return 1 / sum
		}
		return weights[i] / sum
	}

	newCdf = cdf + step(idx)
	for newIdx = idx + 1; newIdx < len(values); newIdx++ {
		if values[newIdx] != values[newIdx-1] {
			// First sample with a different value: the run is over.
			return values[newIdx], newCdf, newIdx
		}
		newCdf += step(newIdx)
	}
	return values[newIdx-1], newCdf, newIdx
}

// KullbackLeibler returns the Kullback-Leibler distance between the
// distributions p and q. The natural logarithm is used.
//  sum_i(p_i * log(p_i / q_i))
// Note that the Kullback-Leibler distance is not symmetric;
// KullbackLeibler(p,q) != KullbackLeibler(q,p)
//
// KullbackLeibler panics if len(p) != len(q).
func KullbackLeibler(p, q []float64) float64 {
	if len(p) != len(q) {
		panic("stat: slice length mismatch")
	}
	var div float64
	for i := range p {
		pi := p[i]
		if pi == 0 {
			// 0 * log(0) is taken to be 0, so zero entries of p are skipped.
			continue
		}
		div += pi * (math.Log(pi) - math.Log(q[i]))
	}
	return div
}

// Mean returns the weighted mean of the data set.
//  sum_i {w_i * x_i} / sum_i {w_i}
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights); Mean panics otherwise.
func Mean(x, weights []float64) float64 {
	if weights != nil {
		if len(weights) != len(x) {
			panic("stat: slice length mismatch")
		}
		var num, den float64
		for i := range weights {
			w := weights[i]
			num += w * x[i]
			den += w
		}
		return num / den
	}
	return floats.Sum(x) / float64(len(x))
}

// Mode returns the most common value in the dataset specified by x and the
// given weights, together with its total weight. Strict float64 equality is
// used when comparing values, so users should take caution. If several values
// are the mode, any of them may be returned.
//
// If weights is nil every sample has weight 1; otherwise len(x) must equal
// len(weights) or Mode panics. An empty x yields (0, 0).
func Mode(x []float64, weights []float64) (val float64, count float64) {
	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	if len(x) == 0 {
		return 0, 0
	}

	// tally maps each distinct value to its accumulated weight.
	tally := make(map[float64]float64)
	for i, v := range x {
		if weights == nil {
			tally[v]++
		} else {
			tally[v] += weights[i]
		}
	}

	// Keep the entry with the strictly largest accumulated weight.
	for v, c := range tally {
		if c > count {
			val, count = v, c
		}
	}
	return val, count
}

// Moment returns the weighted n^th central moment of the samples,
//  E[(x - μ)^N]
// where μ is the weighted mean computed with Mean.
// No degrees of freedom correction is done.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func Moment(moment float64, x, weights []float64) float64 {
	mu := Mean(x, weights)
	var acc float64
	if weights != nil {
		var total float64
		for i := range x {
			acc += weights[i] * math.Pow(x[i]-mu, moment)
			total += weights[i]
		}
		return acc / total
	}
	for i := range x {
		acc += math.Pow(x[i]-mu, moment)
	}
	return acc / float64(len(x))
}

// MomentAbout returns the weighted n^th moment of the samples about the given
// mean \mu,
//  E[(x - μ)^N]
// No degrees of freedom correction is done.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights); MomentAbout panics otherwise.
func MomentAbout(moment float64, x []float64, mean float64, weights []float64) float64 {
	var acc float64
	if weights == nil {
		for i := range x {
			acc += math.Pow(x[i]-mean, moment)
		}
		return acc / float64(len(x))
	}
	if len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	var total float64
	for i := range x {
		w := weights[i]
		acc += w * math.Pow(x[i]-mean, moment)
		total += w
	}
	return acc / total
}

// Quantile returns the sample of x such that x is greater than or
// equal to the fraction p of samples. The exact behavior is determined by the
// CumulantKind, and p should be a number between 0 and 1. Quantile is theoretically
// the inverse of the CDF function, though it may not be the actual inverse
// for all values p and CumulantKinds.
//
// The x data must be sorted in increasing order. If weights is nil then all
// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
//
// Quantile returns NaN if x contains a NaN. It panics if p is outside [0, 1]
// (including NaN), if x is empty, if x is not sorted, if len(x) != len(weights)
// for non-nil weights, or if c is not a recognized CumulantKind.
//
// CumulantKind behaviors:
//  - Empirical: Returns the lowest value q for which q is greater than or equal
//  to the fraction p of samples
func Quantile(p float64, c CumulantKind, x, weights []float64) float64 {
	// Written as a negated conjunction so that a NaN p is rejected too.
	if !(p >= 0 && p <= 1) {
		panic("stat: percentile out of bounds")
	}

	if weights != nil && len(x) != len(weights) {
		panic("stat: slice length mismatch")
	}
	// An empty sample has no quantiles; fail with a clear message instead of
	// falling through the search loop to the "impossible" panic.
	if len(x) == 0 {
		panic("stat: zero length slice")
	}
	if floats.HasNaN(x) {
		return math.NaN() // This is needed because the algorithm breaks otherwise
	}
	if !sort.Float64sAreSorted(x) {
		panic("x data are not sorted")
	}

	var sumWeights float64
	if weights == nil {
		sumWeights = float64(len(x))
	} else {
		sumWeights = floats.Sum(weights)
	}
	switch c {
	case Empirical:
		// Walk the samples in order and return the first one whose cumulative
		// weight reaches the target mass p * sumWeights.
		var cumsum float64
		fidx := p * sumWeights
		for i := range x {
			if weights == nil {
				cumsum++
			} else {
				cumsum += weights[i]
			}
			if cumsum >= fidx {
				return x[i]
			}
		}
		panic("impossible")
	default:
		panic("stat: bad cumulant kind")
	}
}

// Skew returns the skewness of the sample data.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func Skew(x, weights []float64) float64 {
	mu, sigma := MeanStdDev(x, weights)

	// cubes accumulates the (weighted) sum of cubed standardized deviations.
	var cubes float64

	if weights != nil {
		var total float64
		for i := range x {
			z := (x[i] - mu) / sigma
			cubes += weights[i] * z * z * z
			total += weights[i]
		}
		// For weighted data the sum of weights stands in for the sample count.
		return cubes * skewCorrection(total)
	}

	for _, v := range x {
		z := (v - mu) / sigma
		cubes += z * z * z
	}
	return cubes * skewCorrection(float64(len(x)))
}

// skewCorrection returns the factor by which Skew scales the summed cubes of
// the standardized deviations, for a sample count (or weight sum) of n.
// From: http://www.amstat.org/publications/jse/v19n2/doane.pdf page 7
func skewCorrection(n float64) float64 {
	ratio := n / (n - 1)
	inverse := 1 / (n - 2)
	return ratio * inverse
}

// SortWeighted sorts x in place into increasing order, applying the same
// permutation to weights so that each weight stays paired with its sample.
// Weights may be nil, in which case only x is sorted. If weights is non-nil
// it must have the same length as x, otherwise SortWeighted panics.
func SortWeighted(x, weights []float64) {
	switch {
	case weights == nil:
		sort.Float64s(x)
	case len(x) != len(weights):
		panic("stat: slice length mismatch")
	default:
		sort.Sort(weightSorter{x: x, w: weights})
	}
}

// weightSorter implements sort.Interface over a pair of parallel slices,
// ordering by the values in x and keeping each weight in w attached to its
// sample.
type weightSorter struct {
	x []float64 // sample values; the sort key
	w []float64 // weights, permuted in lockstep with x
}

// Less reports whether x[i] must sort before x[j]. NaN values are ordered
// before all other values, matching sort.Float64Slice, so that the result is a
// consistent ordering even when NaNs are present and agrees with
// sort.Float64s and sort.Float64sAreSorted.
func (w weightSorter) Less(i, j int) bool {
	a, b := w.x[i], w.x[j]
	return a < b || (math.IsNaN(a) && !math.IsNaN(b))
}

// Swap exchanges elements i and j of both the values and the weights.
func (w weightSorter) Swap(i, j int) {
	w.x[i], w.x[j] = w.x[j], w.x[i]
	w.w[i], w.w[j] = w.w[j], w.w[i]
}

// Len returns the number of samples.
func (w weightSorter) Len() int {
	return len(w.x)
}

// StdDev returns the sample standard deviation of x with the given weights.
// It is the second result of MeanStdDev; the mean is discarded.
func StdDev(x []float64, weights []float64) float64 {
	_, std := MeanStdDev(x, weights)
	return std
}

// MeanStdDev returns the sample mean and standard deviation of x with the
// given weights. The standard deviation is the square root of the variance
// returned by MeanVariance.
func MeanStdDev(x []float64, weights []float64) (mean, std float64) {
	mean, variance := MeanVariance(x, weights)
	return mean, math.Sqrt(variance)
}

// StdErr returns the standard error in the mean with the given values, i.e.
//  std / sqrt(sampleSize)
func StdErr(std, sampleSize float64) float64 {
	return std / math.Sqrt(sampleSize)
}

// StdScore returns the standard score (a.k.a. z-score, z-value) for the value x
// with the given mean and standard deviation, i.e.
//  (x - mean) / std
func StdScore(x, mean, std float64) float64 {
	return (x - mean) / std
}

// Variance computes the weighted sample variance:
//  \sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
// It is the second result of MeanVariance; the mean is discarded.
func Variance(x, weights []float64) float64 {
	_, variance := MeanVariance(x, weights)
	return variance
}

// MeanVariance returns the sample mean and variance, defined as
//  \sum_i w_i * x_i / (sum_i w_i)
//  \sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
// respectively.
// If weights is nil then all of the weights are 1. If weights is not nil, then
// len(x) must equal len(weights).
func MeanVariance(x, weights []float64) (mean, variance float64) {
	// Corrected two-pass algorithm (1.7) from "Algorithms for computing the
	// sample variance: Analysis and recommendations" by Chan, Tony F.,
	// Gene H. Golub, and Randall J. LeVeque.

	// Mean panics if the slice lengths do not match.
	mean = Mean(x, weights)

	// sumSq is the (weighted) sum of squared deviations from the mean; drift
	// is the (weighted) sum of deviations, used as the correction term.
	var sumSq, drift float64

	if weights != nil {
		var total float64
		for i := range x {
			w := weights[i]
			d := x[i] - mean
			wd := w * d
			sumSq += wd * d
			drift += wd
			total += w
		}
		return mean, (sumSq - drift*drift/total) / (total - 1)
	}

	for i := range x {
		d := x[i] - mean
		sumSq += d * d
		drift += d
	}
	n := len(x)
	return mean, (sumSq - drift*drift/float64(n)) / float64(n-1)
}