mirror of
https://github.com/gonum/gonum.git
synced 2025-10-19 05:24:52 +08:00
Fix Histogram implementation.
The former behavior of Histogram did not agree with its documentation. The documentation matched the spirit of floats.Within, so the documentation is kept and the behavior is fixed. This change updates the function's behavior and corrects the test and the example to match.
This commit is contained in:
55
stat.go
55
stat.go
@@ -385,7 +385,7 @@ func Hellinger(p, q []float64) float64 {
|
|||||||
|
|
||||||
// Histogram sums up the weighted number of data points in each bin.
|
// Histogram sums up the weighted number of data points in each bin.
|
||||||
// The weight of data point x[i] will be placed into count[j] if
|
// The weight of data point x[i] will be placed into count[j] if
|
||||||
// dividers[j-1] <= x < dividers[j]. The "span" function in the floats package can assist
|
// dividers[j] <= x < dividers[j+1]. The "span" function in the floats package can assist
|
||||||
// with bin creation.
|
// with bin creation.
|
||||||
//
|
//
|
||||||
// The following conditions on the inputs apply:
|
// The following conditions on the inputs apply:
|
||||||
@@ -399,37 +399,47 @@ func Histogram(count, dividers, x, weights []float64) []float64 {
|
|||||||
panic("stat: slice length mismatch")
|
panic("stat: slice length mismatch")
|
||||||
}
|
}
|
||||||
if count == nil {
|
if count == nil {
|
||||||
count = make([]float64, len(dividers)+1)
|
count = make([]float64, len(dividers)-1)
|
||||||
}
|
}
|
||||||
if len(count) != len(dividers)+1 {
|
if len(dividers) < 2 {
|
||||||
|
panic("histogram: fewer than two dividers")
|
||||||
|
}
|
||||||
|
if len(count) != len(dividers)-1 {
|
||||||
panic("histogram: bin count mismatch")
|
panic("histogram: bin count mismatch")
|
||||||
}
|
}
|
||||||
if !sort.Float64sAreSorted(dividers) {
|
if !sort.Float64sAreSorted(dividers) {
|
||||||
panic("dividers are not sorted")
|
panic("histogram: dividers are not sorted")
|
||||||
}
|
}
|
||||||
if !sort.Float64sAreSorted(x) {
|
if !sort.Float64sAreSorted(x) {
|
||||||
panic("x data are not sorted")
|
panic("histogram: x data are not sorted")
|
||||||
|
}
|
||||||
|
if len(x) == 0 {
|
||||||
|
for i := range count {
|
||||||
|
count[i] = 0
|
||||||
|
}
|
||||||
|
return count
|
||||||
|
}
|
||||||
|
if x[0] < dividers[0] {
|
||||||
|
panic("histogram: minimum x value is less than lowest divider")
|
||||||
|
}
|
||||||
|
if x[len(x)-1] >= dividers[len(dividers)-1] {
|
||||||
|
panic("histogram: minimum x value is greater than highest divider")
|
||||||
}
|
}
|
||||||
|
|
||||||
idx := 0
|
idx := 0
|
||||||
comp := dividers[idx]
|
comp := dividers[idx+1]
|
||||||
if weights == nil {
|
if weights == nil {
|
||||||
for _, v := range x {
|
for _, v := range x {
|
||||||
if v < comp || idx == len(count)-1 {
|
if v < comp {
|
||||||
// Still in the current bucket
|
// Still in the current bucket
|
||||||
count[idx]++
|
count[idx]++
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Need to find the next divider where v is less than the divider
|
// Find the next divider where v is less than the divider
|
||||||
// or to set the maximum divider if no such exists
|
for j := idx + 1; j < len(dividers); j++ {
|
||||||
for j := idx + 1; j < len(count); j++ {
|
if v < dividers[j+1] {
|
||||||
if j == len(dividers) {
|
|
||||||
idx = len(dividers)
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if v < dividers[j] {
|
|
||||||
idx = j
|
idx = j
|
||||||
comp = dividers[j]
|
comp = dividers[j+1]
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -439,21 +449,16 @@ func Histogram(count, dividers, x, weights []float64) []float64 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for i, v := range x {
|
for i, v := range x {
|
||||||
if v < comp || idx == len(count)-1 {
|
if v < comp {
|
||||||
// Still in the current bucket
|
// Still in the current bucket
|
||||||
count[idx] += weights[i]
|
count[idx] += weights[i]
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Need to find the next divider where v is less than the divider
|
// Need to find the next divider where v is less than the divider.
|
||||||
// or to set the maximum divider if no such exists
|
|
||||||
for j := idx + 1; j < len(count); j++ {
|
for j := idx + 1; j < len(count); j++ {
|
||||||
if j == len(dividers) {
|
if v < dividers[j+1] {
|
||||||
idx = len(dividers)
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if v < dividers[j] {
|
|
||||||
idx = j
|
idx = j
|
||||||
comp = dividers[j]
|
comp = dividers[j+1]
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
58
stat_test.go
58
stat_test.go
@@ -332,26 +332,31 @@ func TestHistogram(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
x: []float64{1, 3, 5, 6, 7, 8},
|
x: []float64{1, 3, 5, 6, 7, 8},
|
||||||
dividers: []float64{2, 4, 6, 7},
|
dividers: []float64{0, 2, 4, 6, 7, 9},
|
||||||
ans: []float64{1, 1, 1, 1, 2},
|
ans: []float64{1, 1, 1, 1, 2},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
x: []float64{1, 3, 5, 6, 7, 8},
|
x: []float64{1, 3, 5, 6, 7, 8},
|
||||||
dividers: []float64{2, 4, 6, 7},
|
dividers: []float64{1, 2, 4, 6, 7, 9},
|
||||||
weights: []float64{1, 2, 1, 1, 1, 2},
|
weights: []float64{1, 2, 1, 1, 1, 2},
|
||||||
ans: []float64{1, 2, 1, 1, 3},
|
ans: []float64{1, 2, 1, 1, 3},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
x: []float64{1, 8},
|
x: []float64{1, 8},
|
||||||
dividers: []float64{2, 4, 6, 7},
|
dividers: []float64{0, 2, 4, 6, 7, 9},
|
||||||
weights: []float64{1, 2},
|
weights: []float64{1, 2},
|
||||||
ans: []float64{1, 0, 0, 0, 2},
|
ans: []float64{1, 0, 0, 0, 2},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
x: []float64{1, 8},
|
x: []float64{1, 8},
|
||||||
dividers: []float64{2, 4, 6, 7},
|
dividers: []float64{0, 2, 4, 6, 7, 9},
|
||||||
ans: []float64{1, 0, 0, 0, 1},
|
ans: []float64{1, 0, 0, 0, 1},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
x: []float64{},
|
||||||
|
dividers: []float64{1, 3},
|
||||||
|
ans: []float64{0},
|
||||||
|
},
|
||||||
} {
|
} {
|
||||||
hist := Histogram(nil, test.dividers, test.x, test.weights)
|
hist := Histogram(nil, test.dividers, test.x, test.weights)
|
||||||
if !floats.Equal(hist, test.ans) {
|
if !floats.Equal(hist, test.ans) {
|
||||||
@@ -372,7 +377,7 @@ func TestHistogram(t *testing.T) {
|
|||||||
weights: []float64{1, 1, 1, 1},
|
weights: []float64{1, 1, 1, 1},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "len(dividers) != len(count)",
|
name: "len(count) != len(dividers) - 1",
|
||||||
x: []float64{1, 3, 5, 6, 7, 8},
|
x: []float64{1, 3, 5, 6, 7, 8},
|
||||||
dividers: []float64{1, 4, 9},
|
dividers: []float64{1, 4, 9},
|
||||||
count: make([]float64, 6),
|
count: make([]float64, 6),
|
||||||
@@ -387,6 +392,21 @@ func TestHistogram(t *testing.T) {
|
|||||||
x: []float64{1, 5, 2, 9, 7, 8},
|
x: []float64{1, 5, 2, 9, 7, 8},
|
||||||
dividers: []float64{1, 4, 9},
|
dividers: []float64{1, 4, 9},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "fewer than 2 dividers",
|
||||||
|
x: []float64{1, 2, 3},
|
||||||
|
dividers: []float64{5},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "x too large",
|
||||||
|
x: []float64{1, 2, 3},
|
||||||
|
dividers: []float64{1, 3},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "x too small",
|
||||||
|
x: []float64{1, 2, 3},
|
||||||
|
dividers: []float64{2, 3},
|
||||||
|
},
|
||||||
} {
|
} {
|
||||||
if !Panics(func() { Histogram(test.count, test.dividers, test.x, test.weights) }) {
|
if !Panics(func() { Histogram(test.count, test.dividers, test.x, test.weights) }) {
|
||||||
t.Errorf("Histogram did not panic when %s", test.name)
|
t.Errorf("Histogram did not panic when %s", test.name)
|
||||||
@@ -399,26 +419,25 @@ func ExampleHistogram() {
|
|||||||
for i := range x {
|
for i := range x {
|
||||||
x[i] = 1.1 * float64(i) // x data ranges from 0 to 110
|
x[i] = 1.1 * float64(i) // x data ranges from 0 to 110
|
||||||
}
|
}
|
||||||
dividers := []float64{7, 20, 100, 1000}
|
dividers := []float64{0, 7, 20, 100, 1000}
|
||||||
fmt.Println(`Histogram counts the amount of data in the bins specified by
|
fmt.Println(`Histogram counts the amount of data in the bins specified by
|
||||||
the dividers. In this data set, there are 7 data points less than 7 (dividers[0]),
|
the dividers. In this data set, there are 7 data points less than 7 (between dividers[0]
|
||||||
12 data points between 7 and 20 (dividers[2] and dividers[1]), and 0 data points
|
and dividers[1]), 12 data points between 7 and 20 (dividers[1] and dividers[2]),
|
||||||
above 1000. Since dividers has length 4, there will be 5 bins.`)
|
and 0 data points above 1000. Since dividers has length 5, there will be 4 bins.`)
|
||||||
hist := Histogram(nil, dividers, x, nil)
|
hist := Histogram(nil, dividers, x, nil)
|
||||||
fmt.Printf("Hist = %v\n", hist)
|
fmt.Printf("Hist = %v\n", hist)
|
||||||
|
|
||||||
fmt.Println()
|
fmt.Println()
|
||||||
fmt.Println("For ease, the floats Span function can be used to set the dividers")
|
fmt.Println("For ease, the floats Span function can be used to set the dividers")
|
||||||
nBins := 10
|
nBins := 10
|
||||||
// Create one fewer divider than bins, but add two to work with Span (see
|
|
||||||
// note below)
|
|
||||||
dividers = make([]float64, nBins+1)
|
dividers = make([]float64, nBins+1)
|
||||||
min, _ := floats.Min(x)
|
min, _ := floats.Min(x)
|
||||||
max, _ := floats.Max(x)
|
max, _ := floats.Max(x)
|
||||||
|
// Increase the maximum divider so that the maximum value of x is contained
|
||||||
|
// within the last bucket.
|
||||||
|
max += 1
|
||||||
floats.Span(dividers, min, max)
|
floats.Span(dividers, min, max)
|
||||||
// Span includes the min and the max. Trim the dividers to create 10 buckets
|
// Span includes the min and the max. Trim the dividers to create 10 buckets
|
||||||
dividers = dividers[1 : len(dividers)-1]
|
|
||||||
fmt.Println("len dividers = ", len(dividers))
|
|
||||||
hist = Histogram(nil, dividers, x, nil)
|
hist = Histogram(nil, dividers, x, nil)
|
||||||
fmt.Printf("Hist = %v\n", hist)
|
fmt.Printf("Hist = %v\n", hist)
|
||||||
fmt.Println()
|
fmt.Println()
|
||||||
@@ -433,18 +452,17 @@ the count field in order to avoid extra garbage`)
|
|||||||
|
|
||||||
// Output:
|
// Output:
|
||||||
// Histogram counts the amount of data in the bins specified by
|
// Histogram counts the amount of data in the bins specified by
|
||||||
// the dividers. In this data set, there are 7 data points less than 7 (dividers[0]),
|
// the dividers. In this data set, there are 7 data points less than 7 (between dividers[0]
|
||||||
// 12 data points between 7 and 20 (dividers[2] and dividers[1]), and 0 data points
|
// and dividers[1]), 12 data points between 7 and 20 (dividers[1] and dividers[2]),
|
||||||
// above 1000. Since dividers has length 4, there will be 5 bins.
|
// and 0 data points above 1000. Since dividers has length 5, there will be 4 bins.
|
||||||
// Hist = [7 12 72 10 0]
|
// Hist = [7 12 72 10]
|
||||||
//
|
//
|
||||||
// For ease, the floats Span function can be used to set the dividers
|
// For ease, the floats Span function can be used to set the dividers
|
||||||
// len dividers = 9
|
// Hist = [11 10 10 10 10 10 10 10 10 10]
|
||||||
// Hist = [11 10 10 10 9 11 10 10 9 11]
|
|
||||||
//
|
//
|
||||||
// Histogram also works with weighted data, and allows reusing of
|
// Histogram also works with weighted data, and allows reusing of
|
||||||
// the count field in order to avoid extra garbage
|
// the count field in order to avoid extra garbage
|
||||||
// Weighted Hist = [77 175 275 375 423 627 675 775 783 1067]
|
// Weighted Hist = [77 175 275 375 475 575 675 775 875 975]
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestJensenShannon(t *testing.T) {
|
func TestJensenShannon(t *testing.T) {
|
||||||
|
Reference in New Issue
Block a user