diff --git a/stat.go b/stat.go index 4c76fbb3..720d9c08 100644 --- a/stat.go +++ b/stat.go @@ -385,7 +385,7 @@ func Hellinger(p, q []float64) float64 { // Histogram sums up the weighted number of data points in each bin. // The weight of data point x[i] will be placed into count[j] if -// dividers[j-1] <= x < dividers[j]. The "span" function in the floats package can assist +// dividers[j] <= x < dividers[j+1]. The "span" function in the floats package can assist // with bin creation. // // The following conditions on the inputs apply: @@ -399,37 +399,47 @@ func Histogram(count, dividers, x, weights []float64) []float64 { panic("stat: slice length mismatch") } if count == nil { - count = make([]float64, len(dividers)+1) + count = make([]float64, len(dividers)-1) } - if len(count) != len(dividers)+1 { + if len(dividers) < 2 { + panic("histogram: fewer than two dividers") + } + if len(count) != len(dividers)-1 { panic("histogram: bin count mismatch") } if !sort.Float64sAreSorted(dividers) { - panic("dividers are not sorted") + panic("histogram: dividers are not sorted") } if !sort.Float64sAreSorted(x) { - panic("x data are not sorted") + panic("histogram: x data are not sorted") + } + if len(x) == 0 { + for i := range count { + count[i] = 0 + } + return count + } + if x[0] < dividers[0] { + panic("histogram: minimum x value is less than lowest divider") + } + if x[len(x)-1] >= dividers[len(dividers)-1] { + panic("histogram: minimum x value is greater than highest divider") } idx := 0 - comp := dividers[idx] + comp := dividers[idx+1] if weights == nil { for _, v := range x { - if v < comp || idx == len(count)-1 { + if v < comp { // Still in the current bucket count[idx]++ continue } - // Need to find the next divider where v is less than the divider - // or to set the maximum divider if no such exists - for j := idx + 1; j < len(count); j++ { - if j == len(dividers) { - idx = len(dividers) - break - } - if v < dividers[j] { + // Find the next divider where v is less than the divider + for j := idx + 1; j < len(dividers); j++ { + if v < dividers[j+1] { idx = j - comp = dividers[j] + comp = dividers[j+1] break } } @@ -439,21 +449,16 @@ func Histogram(count, dividers, x, weights []float64) []float64 { } for i, v := range x { - if v < comp || idx == len(count)-1 { + if v < comp { // Still in the current bucket count[idx] += weights[i] continue } - // Need to find the next divider where v is less than the divider - // or to set the maximum divider if no such exists + // Need to find the next divider where v is less than the divider. for j := idx + 1; j < len(count); j++ { - if j == len(dividers) { - idx = len(dividers) - break - } - if v < dividers[j] { + if v < dividers[j+1] { idx = j - comp = dividers[j] + comp = dividers[j+1] break } } diff --git a/stat_test.go b/stat_test.go index 51e9a54c..03480300 100644 --- a/stat_test.go +++ b/stat_test.go @@ -332,26 +332,31 @@ func TestHistogram(t *testing.T) { }{ { x: []float64{1, 3, 5, 6, 7, 8}, - dividers: []float64{2, 4, 6, 7}, + dividers: []float64{0, 2, 4, 6, 7, 9}, ans: []float64{1, 1, 1, 1, 2}, }, { x: []float64{1, 3, 5, 6, 7, 8}, - dividers: []float64{2, 4, 6, 7}, + dividers: []float64{1, 2, 4, 6, 7, 9}, weights: []float64{1, 2, 1, 1, 1, 2}, ans: []float64{1, 2, 1, 1, 3}, }, { x: []float64{1, 8}, - dividers: []float64{2, 4, 6, 7}, + dividers: []float64{0, 2, 4, 6, 7, 9}, weights: []float64{1, 2}, ans: []float64{1, 0, 0, 0, 2}, }, { x: []float64{1, 8}, - dividers: []float64{2, 4, 6, 7}, + dividers: []float64{0, 2, 4, 6, 7, 9}, ans: []float64{1, 0, 0, 0, 1}, }, + { + x: []float64{}, + dividers: []float64{1, 3}, + ans: []float64{0}, + }, } { hist := Histogram(nil, test.dividers, test.x, test.weights) if !floats.Equal(hist, test.ans) { @@ -372,7 +377,7 @@ func TestHistogram(t *testing.T) { weights: []float64{1, 1, 1, 1}, }, { - name: "len(dividers) != len(count)", + name: "len(count) != len(dividers) - 1", x: []float64{1, 3, 5, 6, 7, 8}, dividers: []float64{1, 4, 9}, count: make([]float64, 6), @@ -387,6 +392,21 @@ func TestHistogram(t *testing.T) { x: []float64{1, 5, 2, 9, 7, 8}, dividers: []float64{1, 4, 9}, }, + { + name: "fewer than 2 dividers", + x: []float64{1, 2, 3}, + dividers: []float64{5}, + }, + { + name: "x too large", + x: []float64{1, 2, 3}, + dividers: []float64{1, 3}, + }, + { + name: "x too small", + x: []float64{1, 2, 3}, + dividers: []float64{2, 3}, + }, } { if !Panics(func() { Histogram(test.count, test.dividers, test.x, test.weights) }) { t.Errorf("Histogram did not panic when %s", test.name) @@ -399,26 +419,25 @@ func ExampleHistogram() { for i := range x { x[i] = 1.1 * float64(i) // x data ranges from 0 to 110 } - dividers := []float64{7, 20, 100, 1000} + dividers := []float64{0, 7, 20, 100, 1000} fmt.Println(`Histogram counts the amount of data in the bins specified by -the dividers. In this data set, there are 7 data points less than 7 (dividers[0]), -12 data points between 7 and 20 (dividers[2] and dividers[1]), and 0 data points -above 1000. Since dividers has length 4, there will be 5 bins.`) +the dividers. In this data set, there are 7 data points less than 7 (between dividers[0] +and dividers[1]), 12 data points between 7 and 20 (dividers[1] and dividers[2]), +and 0 data points above 1000. Since dividers has length 5, there will be 4 bins.`) hist := Histogram(nil, dividers, x, nil) fmt.Printf("Hist = %v\n", hist) fmt.Println() fmt.Println("For ease, the floats Span function can be used to set the dividers") nBins := 10 - // Create one fewer divider than bins, but add two to work with Span (see - // note below) dividers = make([]float64, nBins+1) min, _ := floats.Min(x) max, _ := floats.Max(x) + // Increase the maximum divider so that the maximum value of x is contained + // within the last bucket. + max += 1 floats.Span(dividers, min, max) // Span includes the min and the max. Trim the dividers to create 10 buckets - dividers = dividers[1 : len(dividers)-1] - fmt.Println("len dividers = ", len(dividers)) hist = Histogram(nil, dividers, x, nil) fmt.Printf("Hist = %v\n", hist) fmt.Println() @@ -433,18 +452,17 @@ the count field in order to avoid extra garbage`) // Output: // Histogram counts the amount of data in the bins specified by - // the dividers. In this data set, there are 7 data points less than 7 (dividers[0]), - // 12 data points between 7 and 20 (dividers[2] and dividers[1]), and 0 data points - // above 1000. Since dividers has length 4, there will be 5 bins. - // Hist = [7 12 72 10 0] + // the dividers. In this data set, there are 7 data points less than 7 (between dividers[0] + // and dividers[1]), 12 data points between 7 and 20 (dividers[1] and dividers[2]), + // and 0 data points above 1000. Since dividers has length 5, there will be 4 bins. + // Hist = [7 12 72 10] // // For ease, the floats Span function can be used to set the dividers - // len dividers = 9 - // Hist = [11 10 10 10 9 11 10 10 9 11] + // Hist = [11 10 10 10 10 10 10 10 10 10] // // Histogram also works with weighted data, and allows reusing of // the count field in order to avoid extra garbage - // Weighted Hist = [77 175 275 375 423 627 675 775 783 1067] + // Weighted Hist = [77 175 275 375 475 575 675 775 875 975] } func TestJensenShannon(t *testing.T) {