mirror of
https://github.com/gonum/gonum.git
synced 2025-10-21 14:19:35 +08:00
Improved comments. Fixed the special-case documentation in the public comment, and added better documentation of the workings
of the algorithm
This commit is contained in:
25
stat.go
25
stat.go
@@ -421,8 +421,9 @@ func JensenShannon(p, q []float64) float64 {
|
|||||||
// x and y may have different lengths, though len(x) must equal len(xWeights), and
|
// x and y may have different lengths, though len(x) must equal len(xWeights), and
|
||||||
// len(y) must equal len(yWeights).
|
// len(y) must equal len(yWeights).
|
||||||
//
|
//
|
||||||
// If len(x) == len(y) == 0, the function returns 0. Otherwise, it returns 1 if
|
// Special cases are:
|
||||||
// one of the lengths is zero but not the other.
|
// = 0 if len(x) == len(y) == 0
|
||||||
|
// = 1 if len(x) == 0 and len(y) != 0, or len(x) != 0 and len(y) == 0
|
||||||
func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 {
|
func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 {
|
||||||
if xWeights != nil && len(x) != len(xWeights) {
|
if xWeights != nil && len(x) != len(xWeights) {
|
||||||
panic("stat: slice length mismatch")
|
panic("stat: slice length mismatch")
|
||||||
@@ -477,10 +478,15 @@ func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 {
|
|||||||
xVal := x[0]
|
xVal := x[0]
|
||||||
yVal := y[0]
|
yVal := y[0]
|
||||||
|
|
||||||
// How to deal with the first case well?
|
// Algorithm description:
|
||||||
|
// The goal is to find the maximum difference in the empirical CDFs for the
|
||||||
// Step through all the entries in the two vectors. The difference between
|
// two datasets. The CDFs are piecewise-constant, and thus the distance
|
||||||
// the empirical cdfs only changes when a new x or y comes up.
|
// between the CDFs will only change at the values themselves.
|
||||||
|
//
|
||||||
|
// To find the maximum distance, step through the data in ascending order
|
||||||
|
// of value between the two datasets. At each step, compute the empirical CDF
|
||||||
|
// and compare the local distance with the maximum distance.
|
||||||
|
// Due to some corner cases, equal data entries must be tallied simultaneously.
|
||||||
for {
|
for {
|
||||||
switch {
|
switch {
|
||||||
case xVal < yVal:
|
case xVal < yVal:
|
||||||
@@ -510,14 +516,17 @@ func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Both xCdf and yCdf will equal 1 at the end, so if we have reached the
|
// Both xCdf and yCdf will equal 1 at the end, so if we have reached the
|
||||||
// end of either sample, the distance is as large as it can be, so we can
|
// end of either sample list, the distance is as large as it can be.
|
||||||
// just leave
|
|
||||||
if xIdx == len(x) || yIdx == len(y) {
|
if xIdx == len(x) || yIdx == len(y) {
|
||||||
return maxDist
|
return maxDist
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// updateKS gets the next data point from one of the sets. In doing so, it combines
|
||||||
|
// the weight of all the data points of equal value. Upon return, val is the new
|
||||||
|
// value of the data set, newCdf is the total combined CDF up until this point,
|
||||||
|
// and newIdx is the index of the next location in that sample to examine.
|
||||||
func updateKS(idx int, cdf, sum float64, values, weights []float64, isNil bool) (val, newCdf float64, newIdx int) {
|
func updateKS(idx int, cdf, sum float64, values, weights []float64, isNil bool) (val, newCdf float64, newIdx int) {
|
||||||
// Sum up all the weights of consecutive values that are equal
|
// Sum up all the weights of consecutive values that are equal
|
||||||
if isNil {
|
if isNil {
|
||||||
|
Reference in New Issue
Block a user