diff --git a/stat/.travis.yml b/stat/.travis.yml
new file mode 100644
index 00000000..3a8895eb
--- /dev/null
+++ b/stat/.travis.yml
@@ -0,0 +1,23 @@
+sudo: false
+
+language: go
+
+# Versions of Go that are explicitly supported by gonum.
+go:
+ - 1.5.4
+ - 1.6.3
+ - 1.7.3
+
+# Required for coverage.
+before_install:
+ - go get golang.org/x/tools/cmd/cover
+ - go get github.com/mattn/goveralls
+
+# Get deps, build, test, and ensure the code is gofmt'ed.
+# If we are building as gonum, then we have access to the Coveralls API key, so we can run coverage as well.
+script:
+ - go get -d -t -v ./...
+ - go build -v ./...
+ - go test -v ./...
+ - test -z "$(gofmt -d .)"
+ - if [[ $TRAVIS_SECURE_ENV_VARS = "true" ]]; then bash ./.travis/test-coverage.sh; fi
diff --git a/stat/.travis/test-coverage.sh b/stat/.travis/test-coverage.sh
new file mode 100755
index 00000000..7df8aa6a
--- /dev/null
+++ b/stat/.travis/test-coverage.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+PROFILE_OUT=$PWD/profile.out
+ACC_OUT=$PWD/acc.out
+
+testCover() {
+ # set the return value to 0 (successful)
+ retval=0
+ # get the directory to check from the parameter. Default to '.'
+ d=${1:-.}
+ # skip if there are no Go files here
+ ls $d/*.go &> /dev/null || return $retval
+ # switch to the directory to check
+ pushd $d > /dev/null
+ # create the coverage profile
+ coverageresult=`go test -v -coverprofile=$PROFILE_OUT`
+ # output the result so we can check the shell output
+ echo ${coverageresult}
+ # append the results to acc.out if coverage didn't fail, else set the retval to 1 (failed)
+ ( [[ ${coverageresult} == *FAIL* ]] && retval=1 ) || ( [ -f $PROFILE_OUT ] && grep -v "mode: set" $PROFILE_OUT >> $ACC_OUT )
+ # return to our working dir
+ popd > /dev/null
+ # return our return value
+ return $retval
+}
+
+# Init acc.out
+echo "mode: set" > $ACC_OUT
+
+# Run test coverage on all directories containing Go files
+find . -maxdepth 10 -type d | while read d; do testCover $d || exit; done
+
+# Upload the coverage profile to coveralls.io
+[ -n "$COVERALLS_TOKEN" ] && goveralls -coverprofile=$ACC_OUT -service=travis-ci -repotoken $COVERALLS_TOKEN
+
diff --git a/stat/README.md b/stat/README.md
new file mode 100644
index 00000000..4efcc125
--- /dev/null
+++ b/stat/README.md
@@ -0,0 +1,13 @@
+# Gonum Stat [![Build Status](https://travis-ci.org/gonum/stat.svg?branch=master)](https://travis-ci.org/gonum/stat) [![Coverage Status](https://coveralls.io/repos/gonum/stat/badge.svg?branch=master&service=github)](https://coveralls.io/github/gonum/stat?branch=master) [![GoDoc](https://godoc.org/github.com/gonum/stat?status.svg)](https://godoc.org/github.com/gonum/stat)
+
+This is a statistics package for the Go language.
+
+## Issues
+
+If you find any bugs, feel free to file an issue on the GitHub issue tracker. Discussions on API changes, added features, code review, or similar requests are preferred on the gonum-dev Google Group.
+
+https://groups.google.com/forum/#!forum/gonum-dev
+
+## License
+
+Please see github.com/gonum/license for general license information, contributors, authors, etc. on the Gonum suite of packages.
diff --git a/stat/boston_data_test.go b/stat/boston_data_test.go
new file mode 100644
index 00000000..c7b33146
--- /dev/null
+++ b/stat/boston_data_test.go
@@ -0,0 +1,531 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stat_test
+
+import "github.com/gonum/matrix/mat64"
+
+// Boston Housing Data of Harrison and Rubinfeld (1978)
+// http://dx.doi.org/10.1016/0095-0696(78)90006-2
+// http://lib.stat.cmu.edu/datasets/boston
+// Columns are:
+// per capita crime rate by town,
+// proportion of non-retail business acres per town,
+// nitric oxide concentration (parts per 10 million),
+// weighted distances to Boston employment centers,
+// index of accessibility to radial highways,
+// pupil-teacher ratio by town,
+// proportion of blacks by town,
+// average number of rooms per dwelling,
+// proportion of owner-occupied units built prior to 1940,
+// full-value property-tax rate per $10,000,
+// median value of owner-occupied homes in $1000s.
+var bostonData = mat64.NewDense(506, 11, []float64{
+ 0.00632, 2.31000, 0.53800, 4.09000, 1.00000, 15.30000, 396.90000, 6.57500, 65.20000, 296.00000, 24.00000,
+ 0.02731, 7.07000, 0.46900, 4.96710, 2.00000, 17.80000, 396.90000, 6.42100, 78.90000, 242.00000, 21.60000,
+ 0.02729, 7.07000, 0.46900, 4.96710, 2.00000, 17.80000, 392.83000, 7.18500, 61.10000, 242.00000, 34.70000,
+ 0.03237, 2.18000, 0.45800, 6.06220, 3.00000, 18.70000, 394.63000, 6.99800, 45.80000, 222.00000, 33.40000,
+ 0.06905, 2.18000, 0.45800, 6.06220, 3.00000, 18.70000, 396.90000, 7.14700, 54.20000, 222.00000, 36.20000,
+ 0.02985, 2.18000, 0.45800, 6.06220, 3.00000, 18.70000, 394.12000, 6.43000, 58.70000, 222.00000, 28.70000,
+ 0.08829, 7.87000, 0.52400, 5.56050, 5.00000, 15.20000, 395.60000, 6.01200, 66.60000, 311.00000, 22.90000,
+ 0.14455, 7.87000, 0.52400, 5.95050, 5.00000, 15.20000, 396.90000, 6.17200, 96.10000, 311.00000, 27.10000,
+ 0.21124, 7.87000, 0.52400, 6.08210, 5.00000, 15.20000, 386.63000, 5.63100, 100.00000, 311.00000, 16.50000,
+ 0.17004, 7.87000, 0.52400, 6.59210, 5.00000, 15.20000, 386.71000, 6.00400, 85.90000, 311.00000, 18.90000,
+ 0.22489, 7.87000, 0.52400, 6.34670, 5.00000, 15.20000, 392.52000, 6.37700, 94.30000, 311.00000, 15.00000,
+ 0.11747, 7.87000, 0.52400, 6.22670, 5.00000, 15.20000, 396.90000, 6.00900, 82.90000, 311.00000, 18.90000,
+ 0.09378, 7.87000, 0.52400, 5.45090, 5.00000, 15.20000, 390.50000, 5.88900, 39.00000, 311.00000, 21.70000,
+ 0.62976, 8.14000, 0.53800, 4.70750, 4.00000, 21.00000, 396.90000, 5.94900, 61.80000, 307.00000, 20.40000,
+ 0.63796, 8.14000, 0.53800, 4.46190, 4.00000, 21.00000, 380.02000, 6.09600, 84.50000, 307.00000, 18.20000,
+ 0.62739, 8.14000, 0.53800, 4.49860, 4.00000, 21.00000, 395.62000, 5.83400, 56.50000, 307.00000, 19.90000,
+ 1.05393, 8.14000, 0.53800, 4.49860, 4.00000, 21.00000, 386.85000, 5.93500, 29.30000, 307.00000, 23.10000,
+ 0.78420, 8.14000, 0.53800, 4.25790, 4.00000, 21.00000, 386.75000, 5.99000, 81.70000, 307.00000, 17.50000,
+ 0.80271, 8.14000, 0.53800, 3.79650, 4.00000, 21.00000, 288.99000, 5.45600, 36.60000, 307.00000, 20.20000,
+ 0.72580, 8.14000, 0.53800, 3.79650, 4.00000, 21.00000, 390.95000, 5.72700, 69.50000, 307.00000, 18.20000,
+ 1.25179, 8.14000, 0.53800, 3.79790, 4.00000, 21.00000, 376.57000, 5.57000, 98.10000, 307.00000, 13.60000,
+ 0.85204, 8.14000, 0.53800, 4.01230, 4.00000, 21.00000, 392.53000, 5.96500, 89.20000, 307.00000, 19.60000,
+ 1.23247, 8.14000, 0.53800, 3.97690, 4.00000, 21.00000, 396.90000, 6.14200, 91.70000, 307.00000, 15.20000,
+ 0.98843, 8.14000, 0.53800, 4.09520, 4.00000, 21.00000, 394.54000, 5.81300, 100.00000, 307.00000, 14.50000,
+ 0.75026, 8.14000, 0.53800, 4.39960, 4.00000, 21.00000, 394.33000, 5.92400, 94.10000, 307.00000, 15.60000,
+ 0.84054, 8.14000, 0.53800, 4.45460, 4.00000,
21.00000, 303.42000, 5.59900, 85.70000, 307.00000, 13.90000, + 0.67191, 8.14000, 0.53800, 4.68200, 4.00000, 21.00000, 376.88000, 5.81300, 90.30000, 307.00000, 16.60000, + 0.95577, 8.14000, 0.53800, 4.45340, 4.00000, 21.00000, 306.38000, 6.04700, 88.80000, 307.00000, 14.80000, + 0.77299, 8.14000, 0.53800, 4.45470, 4.00000, 21.00000, 387.94000, 6.49500, 94.40000, 307.00000, 18.40000, + 1.00245, 8.14000, 0.53800, 4.23900, 4.00000, 21.00000, 380.23000, 6.67400, 87.30000, 307.00000, 21.00000, + 1.13081, 8.14000, 0.53800, 4.23300, 4.00000, 21.00000, 360.17000, 5.71300, 94.10000, 307.00000, 12.70000, + 1.35472, 8.14000, 0.53800, 4.17500, 4.00000, 21.00000, 376.73000, 6.07200, 100.00000, 307.00000, 14.50000, + 1.38799, 8.14000, 0.53800, 3.99000, 4.00000, 21.00000, 232.60000, 5.95000, 82.00000, 307.00000, 13.20000, + 1.15172, 8.14000, 0.53800, 3.78720, 4.00000, 21.00000, 358.77000, 5.70100, 95.00000, 307.00000, 13.10000, + 1.61282, 8.14000, 0.53800, 3.75980, 4.00000, 21.00000, 248.31000, 6.09600, 96.90000, 307.00000, 13.50000, + 0.06417, 5.96000, 0.49900, 3.36030, 5.00000, 19.20000, 396.90000, 5.93300, 68.20000, 279.00000, 18.90000, + 0.09744, 5.96000, 0.49900, 3.37790, 5.00000, 19.20000, 377.56000, 5.84100, 61.40000, 279.00000, 20.00000, + 0.08014, 5.96000, 0.49900, 3.93420, 5.00000, 19.20000, 396.90000, 5.85000, 41.50000, 279.00000, 21.00000, + 0.17505, 5.96000, 0.49900, 3.84730, 5.00000, 19.20000, 393.43000, 5.96600, 30.20000, 279.00000, 24.70000, + 0.02763, 2.95000, 0.42800, 5.40110, 3.00000, 18.30000, 395.63000, 6.59500, 21.80000, 252.00000, 30.80000, + 0.03359, 2.95000, 0.42800, 5.40110, 3.00000, 18.30000, 395.62000, 7.02400, 15.80000, 252.00000, 34.90000, + 0.12744, 6.91000, 0.44800, 5.72090, 3.00000, 17.90000, 385.41000, 6.77000, 2.90000, 233.00000, 26.60000, + 0.14150, 6.91000, 0.44800, 5.72090, 3.00000, 17.90000, 383.37000, 6.16900, 6.60000, 233.00000, 25.30000, + 0.15936, 6.91000, 0.44800, 5.72090, 3.00000, 17.90000, 394.46000, 6.21100, 6.50000, 233.00000, 24.70000, + 0.12269, 6.91000, 0.44800, 5.72090, 3.00000, 17.90000, 389.39000, 6.06900, 40.00000, 233.00000, 21.20000, + 0.17142, 6.91000, 0.44800, 5.10040, 3.00000, 17.90000, 396.90000, 5.68200, 33.80000, 233.00000, 19.30000, + 0.18836, 6.91000, 0.44800, 5.10040, 3.00000, 17.90000, 396.90000, 5.78600, 33.30000, 233.00000, 20.00000, + 0.22927, 6.91000, 0.44800, 5.68940, 3.00000, 17.90000, 392.74000, 6.03000, 85.50000, 233.00000, 16.60000, + 0.25387, 6.91000, 0.44800, 5.87000, 3.00000, 17.90000, 396.90000, 5.39900, 95.30000, 233.00000, 14.40000, + 0.21977, 6.91000, 0.44800, 6.08770, 3.00000, 17.90000, 396.90000, 5.60200, 62.00000, 233.00000, 19.40000, + 0.08873, 5.64000, 0.43900, 6.81470, 4.00000, 16.80000, 395.56000, 5.96300, 45.70000, 243.00000, 19.70000, + 0.04337, 5.64000, 0.43900, 6.81470, 4.00000, 16.80000, 393.97000, 6.11500, 63.00000, 243.00000, 20.50000, + 0.05360, 5.64000, 0.43900, 6.81470, 4.00000, 16.80000, 396.90000, 6.51100, 21.10000, 243.00000, 25.00000, + 0.04981, 5.64000, 0.43900, 6.81470, 4.00000, 16.80000, 396.90000, 5.99800, 21.40000, 243.00000, 23.40000, + 0.01360, 4.00000, 0.41000, 7.31970, 3.00000, 21.10000, 396.90000, 5.88800, 47.60000, 469.00000, 18.90000, + 0.01311, 1.22000, 0.40300, 8.69660, 5.00000, 17.90000, 395.93000, 7.24900, 21.90000, 226.00000, 35.40000, + 0.02055, 0.74000, 0.41000, 9.18760, 2.00000, 17.30000, 396.90000, 6.38300, 35.70000, 313.00000, 24.70000, + 0.01432, 1.32000, 0.41100, 8.32480, 5.00000, 15.10000, 392.90000, 6.81600, 40.50000, 256.00000, 31.60000, + 0.15445, 5.13000, 0.45300, 7.81480, 
8.00000, 19.70000, 390.68000, 6.14500, 29.20000, 284.00000, 23.30000, + 0.10328, 5.13000, 0.45300, 6.93200, 8.00000, 19.70000, 396.90000, 5.92700, 47.20000, 284.00000, 19.60000, + 0.14932, 5.13000, 0.45300, 7.22540, 8.00000, 19.70000, 395.11000, 5.74100, 66.20000, 284.00000, 18.70000, + 0.17171, 5.13000, 0.45300, 6.81850, 8.00000, 19.70000, 378.08000, 5.96600, 93.40000, 284.00000, 16.00000, + 0.11027, 5.13000, 0.45300, 7.22550, 8.00000, 19.70000, 396.90000, 6.45600, 67.80000, 284.00000, 22.20000, + 0.12650, 5.13000, 0.45300, 7.98090, 8.00000, 19.70000, 395.58000, 6.76200, 43.40000, 284.00000, 25.00000, + 0.01951, 1.38000, 0.41610, 9.22290, 3.00000, 18.60000, 393.24000, 7.10400, 59.50000, 216.00000, 33.00000, + 0.03584, 3.37000, 0.39800, 6.61150, 4.00000, 16.10000, 396.90000, 6.29000, 17.80000, 337.00000, 23.50000, + 0.04379, 3.37000, 0.39800, 6.61150, 4.00000, 16.10000, 396.90000, 5.78700, 31.10000, 337.00000, 19.40000, + 0.05789, 6.07000, 0.40900, 6.49800, 4.00000, 18.90000, 396.21000, 5.87800, 21.40000, 345.00000, 22.00000, + 0.13554, 6.07000, 0.40900, 6.49800, 4.00000, 18.90000, 396.90000, 5.59400, 36.80000, 345.00000, 17.40000, + 0.12816, 6.07000, 0.40900, 6.49800, 4.00000, 18.90000, 396.90000, 5.88500, 33.00000, 345.00000, 20.90000, + 0.08826, 10.81000, 0.41300, 5.28730, 4.00000, 19.20000, 383.73000, 6.41700, 6.60000, 305.00000, 24.20000, + 0.15876, 10.81000, 0.41300, 5.28730, 4.00000, 19.20000, 376.94000, 5.96100, 17.50000, 305.00000, 21.70000, + 0.09164, 10.81000, 0.41300, 5.28730, 4.00000, 19.20000, 390.91000, 6.06500, 7.80000, 305.00000, 22.80000, + 0.19539, 10.81000, 0.41300, 5.28730, 4.00000, 19.20000, 377.17000, 6.24500, 6.20000, 305.00000, 23.40000, + 0.07896, 12.83000, 0.43700, 4.25150, 5.00000, 18.70000, 394.92000, 6.27300, 6.00000, 398.00000, 24.10000, + 0.09512, 12.83000, 0.43700, 4.50260, 5.00000, 18.70000, 383.23000, 6.28600, 45.00000, 398.00000, 21.40000, + 0.10153, 12.83000, 0.43700, 4.05220, 5.00000, 18.70000, 373.66000, 6.27900, 74.50000, 398.00000, 20.00000, + 0.08707, 12.83000, 0.43700, 4.09050, 5.00000, 18.70000, 386.96000, 6.14000, 45.80000, 398.00000, 20.80000, + 0.05646, 12.83000, 0.43700, 5.01410, 5.00000, 18.70000, 386.40000, 6.23200, 53.70000, 398.00000, 21.20000, + 0.08387, 12.83000, 0.43700, 4.50260, 5.00000, 18.70000, 396.06000, 5.87400, 36.60000, 398.00000, 20.30000, + 0.04113, 4.86000, 0.42600, 5.40070, 4.00000, 19.00000, 396.90000, 6.72700, 33.50000, 281.00000, 28.00000, + 0.04462, 4.86000, 0.42600, 5.40070, 4.00000, 19.00000, 395.63000, 6.61900, 70.40000, 281.00000, 23.90000, + 0.03659, 4.86000, 0.42600, 5.40070, 4.00000, 19.00000, 396.90000, 6.30200, 32.20000, 281.00000, 24.80000, + 0.03551, 4.86000, 0.42600, 5.40070, 4.00000, 19.00000, 390.64000, 6.16700, 46.70000, 281.00000, 22.90000, + 0.05059, 4.49000, 0.44900, 4.77940, 3.00000, 18.50000, 396.90000, 6.38900, 48.00000, 247.00000, 23.90000, + 0.05735, 4.49000, 0.44900, 4.43770, 3.00000, 18.50000, 392.30000, 6.63000, 56.10000, 247.00000, 26.60000, + 0.05188, 4.49000, 0.44900, 4.42720, 3.00000, 18.50000, 395.99000, 6.01500, 45.10000, 247.00000, 22.50000, + 0.07151, 4.49000, 0.44900, 3.74760, 3.00000, 18.50000, 395.15000, 6.12100, 56.80000, 247.00000, 22.20000, + 0.05660, 3.41000, 0.48900, 3.42170, 2.00000, 17.80000, 396.90000, 7.00700, 86.30000, 270.00000, 23.60000, + 0.05302, 3.41000, 0.48900, 3.41450, 2.00000, 17.80000, 396.06000, 7.07900, 63.10000, 270.00000, 28.70000, + 0.04684, 3.41000, 0.48900, 3.09230, 2.00000, 17.80000, 392.18000, 6.41700, 66.10000, 270.00000, 22.60000, + 0.03932, 3.41000, 
0.48900, 3.09210, 2.00000, 17.80000, 393.55000, 6.40500, 73.90000, 270.00000, 22.00000, + 0.04203, 15.04000, 0.46400, 3.66590, 4.00000, 18.20000, 395.01000, 6.44200, 53.60000, 270.00000, 22.90000, + 0.02875, 15.04000, 0.46400, 3.66590, 4.00000, 18.20000, 396.33000, 6.21100, 28.90000, 270.00000, 25.00000, + 0.04294, 15.04000, 0.46400, 3.61500, 4.00000, 18.20000, 396.90000, 6.24900, 77.30000, 270.00000, 20.60000, + 0.12204, 2.89000, 0.44500, 3.49520, 2.00000, 18.00000, 357.98000, 6.62500, 57.80000, 276.00000, 28.40000, + 0.11504, 2.89000, 0.44500, 3.49520, 2.00000, 18.00000, 391.83000, 6.16300, 69.60000, 276.00000, 21.40000, + 0.12083, 2.89000, 0.44500, 3.49520, 2.00000, 18.00000, 396.90000, 8.06900, 76.00000, 276.00000, 38.70000, + 0.08187, 2.89000, 0.44500, 3.49520, 2.00000, 18.00000, 393.53000, 7.82000, 36.90000, 276.00000, 43.80000, + 0.06860, 2.89000, 0.44500, 3.49520, 2.00000, 18.00000, 396.90000, 7.41600, 62.50000, 276.00000, 33.20000, + 0.14866, 8.56000, 0.52000, 2.77780, 5.00000, 20.90000, 394.76000, 6.72700, 79.90000, 384.00000, 27.50000, + 0.11432, 8.56000, 0.52000, 2.85610, 5.00000, 20.90000, 395.58000, 6.78100, 71.30000, 384.00000, 26.50000, + 0.22876, 8.56000, 0.52000, 2.71470, 5.00000, 20.90000, 70.80000, 6.40500, 85.40000, 384.00000, 18.60000, + 0.21161, 8.56000, 0.52000, 2.71470, 5.00000, 20.90000, 394.47000, 6.13700, 87.40000, 384.00000, 19.30000, + 0.13960, 8.56000, 0.52000, 2.42100, 5.00000, 20.90000, 392.69000, 6.16700, 90.00000, 384.00000, 20.10000, + 0.13262, 8.56000, 0.52000, 2.10690, 5.00000, 20.90000, 394.05000, 5.85100, 96.70000, 384.00000, 19.50000, + 0.17120, 8.56000, 0.52000, 2.21100, 5.00000, 20.90000, 395.67000, 5.83600, 91.90000, 384.00000, 19.50000, + 0.13117, 8.56000, 0.52000, 2.12240, 5.00000, 20.90000, 387.69000, 6.12700, 85.20000, 384.00000, 20.40000, + 0.12802, 8.56000, 0.52000, 2.43290, 5.00000, 20.90000, 395.24000, 6.47400, 97.10000, 384.00000, 19.80000, + 0.26363, 8.56000, 0.52000, 2.54510, 5.00000, 20.90000, 391.23000, 6.22900, 91.20000, 384.00000, 19.40000, + 0.10793, 8.56000, 0.52000, 2.77780, 5.00000, 20.90000, 393.49000, 6.19500, 54.40000, 384.00000, 21.70000, + 0.10084, 10.01000, 0.54700, 2.67750, 6.00000, 17.80000, 395.59000, 6.71500, 81.60000, 432.00000, 22.80000, + 0.12329, 10.01000, 0.54700, 2.35340, 6.00000, 17.80000, 394.95000, 5.91300, 92.90000, 432.00000, 18.80000, + 0.22212, 10.01000, 0.54700, 2.54800, 6.00000, 17.80000, 396.90000, 6.09200, 95.40000, 432.00000, 18.70000, + 0.14231, 10.01000, 0.54700, 2.25650, 6.00000, 17.80000, 388.74000, 6.25400, 84.20000, 432.00000, 18.50000, + 0.17134, 10.01000, 0.54700, 2.46310, 6.00000, 17.80000, 344.91000, 5.92800, 88.20000, 432.00000, 18.30000, + 0.13158, 10.01000, 0.54700, 2.73010, 6.00000, 17.80000, 393.30000, 6.17600, 72.50000, 432.00000, 21.20000, + 0.15098, 10.01000, 0.54700, 2.74740, 6.00000, 17.80000, 394.51000, 6.02100, 82.60000, 432.00000, 19.20000, + 0.13058, 10.01000, 0.54700, 2.47750, 6.00000, 17.80000, 338.63000, 5.87200, 73.10000, 432.00000, 20.40000, + 0.14476, 10.01000, 0.54700, 2.75920, 6.00000, 17.80000, 391.50000, 5.73100, 65.20000, 432.00000, 19.30000, + 0.06899, 25.65000, 0.58100, 2.25770, 2.00000, 19.10000, 389.15000, 5.87000, 69.70000, 188.00000, 22.00000, + 0.07165, 25.65000, 0.58100, 2.19740, 2.00000, 19.10000, 377.67000, 6.00400, 84.10000, 188.00000, 20.30000, + 0.09299, 25.65000, 0.58100, 2.08690, 2.00000, 19.10000, 378.09000, 5.96100, 92.90000, 188.00000, 20.50000, + 0.15038, 25.65000, 0.58100, 1.94440, 2.00000, 19.10000, 370.31000, 5.85600, 97.00000, 188.00000, 
17.30000, + 0.09849, 25.65000, 0.58100, 2.00630, 2.00000, 19.10000, 379.38000, 5.87900, 95.80000, 188.00000, 18.80000, + 0.16902, 25.65000, 0.58100, 1.99290, 2.00000, 19.10000, 385.02000, 5.98600, 88.40000, 188.00000, 21.40000, + 0.38735, 25.65000, 0.58100, 1.75720, 2.00000, 19.10000, 359.29000, 5.61300, 95.60000, 188.00000, 15.70000, + 0.25915, 21.89000, 0.62400, 1.78830, 4.00000, 21.20000, 392.11000, 5.69300, 96.00000, 437.00000, 16.20000, + 0.32543, 21.89000, 0.62400, 1.81250, 4.00000, 21.20000, 396.90000, 6.43100, 98.80000, 437.00000, 18.00000, + 0.88125, 21.89000, 0.62400, 1.97990, 4.00000, 21.20000, 396.90000, 5.63700, 94.70000, 437.00000, 14.30000, + 0.34006, 21.89000, 0.62400, 2.11850, 4.00000, 21.20000, 395.04000, 6.45800, 98.90000, 437.00000, 19.20000, + 1.19294, 21.89000, 0.62400, 2.27100, 4.00000, 21.20000, 396.90000, 6.32600, 97.70000, 437.00000, 19.60000, + 0.59005, 21.89000, 0.62400, 2.32740, 4.00000, 21.20000, 385.76000, 6.37200, 97.90000, 437.00000, 23.00000, + 0.32982, 21.89000, 0.62400, 2.46990, 4.00000, 21.20000, 388.69000, 5.82200, 95.40000, 437.00000, 18.40000, + 0.97617, 21.89000, 0.62400, 2.34600, 4.00000, 21.20000, 262.76000, 5.75700, 98.40000, 437.00000, 15.60000, + 0.55778, 21.89000, 0.62400, 2.11070, 4.00000, 21.20000, 394.67000, 6.33500, 98.20000, 437.00000, 18.10000, + 0.32264, 21.89000, 0.62400, 1.96690, 4.00000, 21.20000, 378.25000, 5.94200, 93.50000, 437.00000, 17.40000, + 0.35233, 21.89000, 0.62400, 1.84980, 4.00000, 21.20000, 394.08000, 6.45400, 98.40000, 437.00000, 17.10000, + 0.24980, 21.89000, 0.62400, 1.66860, 4.00000, 21.20000, 392.04000, 5.85700, 98.20000, 437.00000, 13.30000, + 0.54452, 21.89000, 0.62400, 1.66870, 4.00000, 21.20000, 396.90000, 6.15100, 97.90000, 437.00000, 17.80000, + 0.29090, 21.89000, 0.62400, 1.61190, 4.00000, 21.20000, 388.08000, 6.17400, 93.60000, 437.00000, 14.00000, + 1.62864, 21.89000, 0.62400, 1.43940, 4.00000, 21.20000, 396.90000, 5.01900, 100.00000, 437.00000, 14.40000, + 3.32105, 19.58000, 0.87100, 1.32160, 5.00000, 14.70000, 396.90000, 5.40300, 100.00000, 403.00000, 13.40000, + 4.09740, 19.58000, 0.87100, 1.41180, 5.00000, 14.70000, 396.90000, 5.46800, 100.00000, 403.00000, 15.60000, + 2.77974, 19.58000, 0.87100, 1.34590, 5.00000, 14.70000, 396.90000, 4.90300, 97.80000, 403.00000, 11.80000, + 2.37934, 19.58000, 0.87100, 1.41910, 5.00000, 14.70000, 172.91000, 6.13000, 100.00000, 403.00000, 13.80000, + 2.15505, 19.58000, 0.87100, 1.51660, 5.00000, 14.70000, 169.27000, 5.62800, 100.00000, 403.00000, 15.60000, + 2.36862, 19.58000, 0.87100, 1.46080, 5.00000, 14.70000, 391.71000, 4.92600, 95.70000, 403.00000, 14.60000, + 2.33099, 19.58000, 0.87100, 1.52960, 5.00000, 14.70000, 356.99000, 5.18600, 93.80000, 403.00000, 17.80000, + 2.73397, 19.58000, 0.87100, 1.52570, 5.00000, 14.70000, 351.85000, 5.59700, 94.90000, 403.00000, 15.40000, + 1.65660, 19.58000, 0.87100, 1.61800, 5.00000, 14.70000, 372.80000, 6.12200, 97.30000, 403.00000, 21.50000, + 1.49632, 19.58000, 0.87100, 1.59160, 5.00000, 14.70000, 341.60000, 5.40400, 100.00000, 403.00000, 19.60000, + 1.12658, 19.58000, 0.87100, 1.61020, 5.00000, 14.70000, 343.28000, 5.01200, 88.00000, 403.00000, 15.30000, + 2.14918, 19.58000, 0.87100, 1.62320, 5.00000, 14.70000, 261.95000, 5.70900, 98.50000, 403.00000, 19.40000, + 1.41385, 19.58000, 0.87100, 1.74940, 5.00000, 14.70000, 321.02000, 6.12900, 96.00000, 403.00000, 17.00000, + 3.53501, 19.58000, 0.87100, 1.74550, 5.00000, 14.70000, 88.01000, 6.15200, 82.60000, 403.00000, 15.60000, + 2.44668, 19.58000, 0.87100, 1.73640, 5.00000, 
14.70000, 88.63000, 5.27200, 94.00000, 403.00000, 13.10000, + 1.22358, 19.58000, 0.60500, 1.87730, 5.00000, 14.70000, 363.43000, 6.94300, 97.40000, 403.00000, 41.30000, + 1.34284, 19.58000, 0.60500, 1.75730, 5.00000, 14.70000, 353.89000, 6.06600, 100.00000, 403.00000, 24.30000, + 1.42502, 19.58000, 0.87100, 1.76590, 5.00000, 14.70000, 364.31000, 6.51000, 100.00000, 403.00000, 23.30000, + 1.27346, 19.58000, 0.60500, 1.79840, 5.00000, 14.70000, 338.92000, 6.25000, 92.60000, 403.00000, 27.00000, + 1.46336, 19.58000, 0.60500, 1.97090, 5.00000, 14.70000, 374.43000, 7.48900, 90.80000, 403.00000, 50.00000, + 1.83377, 19.58000, 0.60500, 2.04070, 5.00000, 14.70000, 389.61000, 7.80200, 98.20000, 403.00000, 50.00000, + 1.51902, 19.58000, 0.60500, 2.16200, 5.00000, 14.70000, 388.45000, 8.37500, 93.90000, 403.00000, 50.00000, + 2.24236, 19.58000, 0.60500, 2.42200, 5.00000, 14.70000, 395.11000, 5.85400, 91.80000, 403.00000, 22.70000, + 2.92400, 19.58000, 0.60500, 2.28340, 5.00000, 14.70000, 240.16000, 6.10100, 93.00000, 403.00000, 25.00000, + 2.01019, 19.58000, 0.60500, 2.04590, 5.00000, 14.70000, 369.30000, 7.92900, 96.20000, 403.00000, 50.00000, + 1.80028, 19.58000, 0.60500, 2.42590, 5.00000, 14.70000, 227.61000, 5.87700, 79.20000, 403.00000, 23.80000, + 2.30040, 19.58000, 0.60500, 2.10000, 5.00000, 14.70000, 297.09000, 6.31900, 96.10000, 403.00000, 23.80000, + 2.44953, 19.58000, 0.60500, 2.26250, 5.00000, 14.70000, 330.04000, 6.40200, 95.20000, 403.00000, 22.30000, + 1.20742, 19.58000, 0.60500, 2.42590, 5.00000, 14.70000, 292.29000, 5.87500, 94.60000, 403.00000, 17.40000, + 2.31390, 19.58000, 0.60500, 2.38870, 5.00000, 14.70000, 348.13000, 5.88000, 97.30000, 403.00000, 19.10000, + 0.13914, 4.05000, 0.51000, 2.59610, 5.00000, 16.60000, 396.90000, 5.57200, 88.50000, 296.00000, 23.10000, + 0.09178, 4.05000, 0.51000, 2.64630, 5.00000, 16.60000, 395.50000, 6.41600, 84.10000, 296.00000, 23.60000, + 0.08447, 4.05000, 0.51000, 2.70190, 5.00000, 16.60000, 393.23000, 5.85900, 68.70000, 296.00000, 22.60000, + 0.06664, 4.05000, 0.51000, 3.13230, 5.00000, 16.60000, 390.96000, 6.54600, 33.10000, 296.00000, 29.40000, + 0.07022, 4.05000, 0.51000, 3.55490, 5.00000, 16.60000, 393.23000, 6.02000, 47.20000, 296.00000, 23.20000, + 0.05425, 4.05000, 0.51000, 3.31750, 5.00000, 16.60000, 395.60000, 6.31500, 73.40000, 296.00000, 24.60000, + 0.06642, 4.05000, 0.51000, 2.91530, 5.00000, 16.60000, 391.27000, 6.86000, 74.40000, 296.00000, 29.90000, + 0.05780, 2.46000, 0.48800, 2.82900, 3.00000, 17.80000, 396.90000, 6.98000, 58.40000, 193.00000, 37.20000, + 0.06588, 2.46000, 0.48800, 2.74100, 3.00000, 17.80000, 395.56000, 7.76500, 83.30000, 193.00000, 39.80000, + 0.06888, 2.46000, 0.48800, 2.59790, 3.00000, 17.80000, 396.90000, 6.14400, 62.20000, 193.00000, 36.20000, + 0.09103, 2.46000, 0.48800, 2.70060, 3.00000, 17.80000, 394.12000, 7.15500, 92.20000, 193.00000, 37.90000, + 0.10008, 2.46000, 0.48800, 2.84700, 3.00000, 17.80000, 396.90000, 6.56300, 95.60000, 193.00000, 32.50000, + 0.08308, 2.46000, 0.48800, 2.98790, 3.00000, 17.80000, 391.00000, 5.60400, 89.80000, 193.00000, 26.40000, + 0.06047, 2.46000, 0.48800, 3.27970, 3.00000, 17.80000, 387.11000, 6.15300, 68.80000, 193.00000, 29.60000, + 0.05602, 2.46000, 0.48800, 3.19920, 3.00000, 17.80000, 392.63000, 7.83100, 53.60000, 193.00000, 50.00000, + 0.07875, 3.44000, 0.43700, 3.78860, 5.00000, 15.20000, 393.87000, 6.78200, 41.10000, 398.00000, 32.00000, + 0.12579, 3.44000, 0.43700, 4.56670, 5.00000, 15.20000, 382.84000, 6.55600, 29.10000, 398.00000, 29.80000, + 0.08370, 3.44000, 
0.43700, 4.56670, 5.00000, 15.20000, 396.90000, 7.18500, 38.90000, 398.00000, 34.90000, + 0.09068, 3.44000, 0.43700, 6.47980, 5.00000, 15.20000, 377.68000, 6.95100, 21.50000, 398.00000, 37.00000, + 0.06911, 3.44000, 0.43700, 6.47980, 5.00000, 15.20000, 389.71000, 6.73900, 30.80000, 398.00000, 30.50000, + 0.08664, 3.44000, 0.43700, 6.47980, 5.00000, 15.20000, 390.49000, 7.17800, 26.30000, 398.00000, 36.40000, + 0.02187, 2.93000, 0.40100, 6.21960, 1.00000, 15.60000, 393.37000, 6.80000, 9.90000, 265.00000, 31.10000, + 0.01439, 2.93000, 0.40100, 6.21960, 1.00000, 15.60000, 376.70000, 6.60400, 18.80000, 265.00000, 29.10000, + 0.01381, 0.46000, 0.42200, 5.64840, 4.00000, 14.40000, 394.23000, 7.87500, 32.00000, 255.00000, 50.00000, + 0.04011, 1.52000, 0.40400, 7.30900, 2.00000, 12.60000, 396.90000, 7.28700, 34.10000, 329.00000, 33.30000, + 0.04666, 1.52000, 0.40400, 7.30900, 2.00000, 12.60000, 354.31000, 7.10700, 36.60000, 329.00000, 30.30000, + 0.03768, 1.52000, 0.40400, 7.30900, 2.00000, 12.60000, 392.20000, 7.27400, 38.30000, 329.00000, 34.60000, + 0.03150, 1.47000, 0.40300, 7.65340, 3.00000, 17.00000, 396.90000, 6.97500, 15.30000, 402.00000, 34.90000, + 0.01778, 1.47000, 0.40300, 7.65340, 3.00000, 17.00000, 384.30000, 7.13500, 13.90000, 402.00000, 32.90000, + 0.03445, 2.03000, 0.41500, 6.27000, 2.00000, 14.70000, 393.77000, 6.16200, 38.40000, 348.00000, 24.10000, + 0.02177, 2.03000, 0.41500, 6.27000, 2.00000, 14.70000, 395.38000, 7.61000, 15.70000, 348.00000, 42.30000, + 0.03510, 2.68000, 0.41610, 5.11800, 4.00000, 14.70000, 392.78000, 7.85300, 33.20000, 224.00000, 48.50000, + 0.02009, 2.68000, 0.41610, 5.11800, 4.00000, 14.70000, 390.55000, 8.03400, 31.90000, 224.00000, 50.00000, + 0.13642, 10.59000, 0.48900, 3.94540, 4.00000, 18.60000, 396.90000, 5.89100, 22.30000, 277.00000, 22.60000, + 0.22969, 10.59000, 0.48900, 4.35490, 4.00000, 18.60000, 394.87000, 6.32600, 52.50000, 277.00000, 24.40000, + 0.25199, 10.59000, 0.48900, 4.35490, 4.00000, 18.60000, 389.43000, 5.78300, 72.70000, 277.00000, 22.50000, + 0.13587, 10.59000, 0.48900, 4.23920, 4.00000, 18.60000, 381.32000, 6.06400, 59.10000, 277.00000, 24.40000, + 0.43571, 10.59000, 0.48900, 3.87500, 4.00000, 18.60000, 396.90000, 5.34400, 100.00000, 277.00000, 20.00000, + 0.17446, 10.59000, 0.48900, 3.87710, 4.00000, 18.60000, 393.25000, 5.96000, 92.10000, 277.00000, 21.70000, + 0.37578, 10.59000, 0.48900, 3.66500, 4.00000, 18.60000, 395.24000, 5.40400, 88.60000, 277.00000, 19.30000, + 0.21719, 10.59000, 0.48900, 3.65260, 4.00000, 18.60000, 390.94000, 5.80700, 53.80000, 277.00000, 22.40000, + 0.14052, 10.59000, 0.48900, 3.94540, 4.00000, 18.60000, 385.81000, 6.37500, 32.30000, 277.00000, 28.10000, + 0.28955, 10.59000, 0.48900, 3.58750, 4.00000, 18.60000, 348.93000, 5.41200, 9.80000, 277.00000, 23.70000, + 0.19802, 10.59000, 0.48900, 3.94540, 4.00000, 18.60000, 393.63000, 6.18200, 42.40000, 277.00000, 25.00000, + 0.04560, 13.89000, 0.55000, 3.11210, 5.00000, 16.40000, 392.80000, 5.88800, 56.00000, 276.00000, 23.30000, + 0.07013, 13.89000, 0.55000, 3.42110, 5.00000, 16.40000, 392.78000, 6.64200, 85.10000, 276.00000, 28.70000, + 0.11069, 13.89000, 0.55000, 2.88930, 5.00000, 16.40000, 396.90000, 5.95100, 93.80000, 276.00000, 21.50000, + 0.11425, 13.89000, 0.55000, 3.36330, 5.00000, 16.40000, 393.74000, 6.37300, 92.40000, 276.00000, 23.00000, + 0.35809, 6.20000, 0.50700, 2.86170, 8.00000, 17.40000, 391.70000, 6.95100, 88.50000, 307.00000, 26.70000, + 0.40771, 6.20000, 0.50700, 3.04800, 8.00000, 17.40000, 395.24000, 6.16400, 91.30000, 307.00000, 
21.70000, + 0.62356, 6.20000, 0.50700, 3.27210, 8.00000, 17.40000, 390.39000, 6.87900, 77.70000, 307.00000, 27.50000, + 0.61470, 6.20000, 0.50700, 3.27210, 8.00000, 17.40000, 396.90000, 6.61800, 80.80000, 307.00000, 30.10000, + 0.31533, 6.20000, 0.50400, 2.89440, 8.00000, 17.40000, 385.05000, 8.26600, 78.30000, 307.00000, 44.80000, + 0.52693, 6.20000, 0.50400, 2.89440, 8.00000, 17.40000, 382.00000, 8.72500, 83.00000, 307.00000, 50.00000, + 0.38214, 6.20000, 0.50400, 3.21570, 8.00000, 17.40000, 387.38000, 8.04000, 86.50000, 307.00000, 37.60000, + 0.41238, 6.20000, 0.50400, 3.21570, 8.00000, 17.40000, 372.08000, 7.16300, 79.90000, 307.00000, 31.60000, + 0.29819, 6.20000, 0.50400, 3.37510, 8.00000, 17.40000, 377.51000, 7.68600, 17.00000, 307.00000, 46.70000, + 0.44178, 6.20000, 0.50400, 3.37510, 8.00000, 17.40000, 380.34000, 6.55200, 21.40000, 307.00000, 31.50000, + 0.53700, 6.20000, 0.50400, 3.67150, 8.00000, 17.40000, 378.35000, 5.98100, 68.10000, 307.00000, 24.30000, + 0.46296, 6.20000, 0.50400, 3.67150, 8.00000, 17.40000, 376.14000, 7.41200, 76.90000, 307.00000, 31.70000, + 0.57529, 6.20000, 0.50700, 3.83840, 8.00000, 17.40000, 385.91000, 8.33700, 73.30000, 307.00000, 41.70000, + 0.33147, 6.20000, 0.50700, 3.65190, 8.00000, 17.40000, 378.95000, 8.24700, 70.40000, 307.00000, 48.30000, + 0.44791, 6.20000, 0.50700, 3.65190, 8.00000, 17.40000, 360.20000, 6.72600, 66.50000, 307.00000, 29.00000, + 0.33045, 6.20000, 0.50700, 3.65190, 8.00000, 17.40000, 376.75000, 6.08600, 61.50000, 307.00000, 24.00000, + 0.52058, 6.20000, 0.50700, 4.14800, 8.00000, 17.40000, 388.45000, 6.63100, 76.50000, 307.00000, 25.10000, + 0.51183, 6.20000, 0.50700, 4.14800, 8.00000, 17.40000, 390.07000, 7.35800, 71.60000, 307.00000, 31.50000, + 0.08244, 4.93000, 0.42800, 6.18990, 6.00000, 16.60000, 379.41000, 6.48100, 18.50000, 300.00000, 23.70000, + 0.09252, 4.93000, 0.42800, 6.18990, 6.00000, 16.60000, 383.78000, 6.60600, 42.20000, 300.00000, 23.30000, + 0.11329, 4.93000, 0.42800, 6.33610, 6.00000, 16.60000, 391.25000, 6.89700, 54.30000, 300.00000, 22.00000, + 0.10612, 4.93000, 0.42800, 6.33610, 6.00000, 16.60000, 394.62000, 6.09500, 65.10000, 300.00000, 20.10000, + 0.10290, 4.93000, 0.42800, 7.03550, 6.00000, 16.60000, 372.75000, 6.35800, 52.90000, 300.00000, 22.20000, + 0.12757, 4.93000, 0.42800, 7.03550, 6.00000, 16.60000, 374.71000, 6.39300, 7.80000, 300.00000, 23.70000, + 0.20608, 5.86000, 0.43100, 7.95490, 7.00000, 19.10000, 372.49000, 5.59300, 76.50000, 330.00000, 17.60000, + 0.19133, 5.86000, 0.43100, 7.95490, 7.00000, 19.10000, 389.13000, 5.60500, 70.20000, 330.00000, 18.50000, + 0.33983, 5.86000, 0.43100, 8.05550, 7.00000, 19.10000, 390.18000, 6.10800, 34.90000, 330.00000, 24.30000, + 0.19657, 5.86000, 0.43100, 8.05550, 7.00000, 19.10000, 376.14000, 6.22600, 79.20000, 330.00000, 20.50000, + 0.16439, 5.86000, 0.43100, 7.82650, 7.00000, 19.10000, 374.71000, 6.43300, 49.10000, 330.00000, 24.50000, + 0.19073, 5.86000, 0.43100, 7.82650, 7.00000, 19.10000, 393.74000, 6.71800, 17.50000, 330.00000, 26.20000, + 0.14030, 5.86000, 0.43100, 7.39670, 7.00000, 19.10000, 396.28000, 6.48700, 13.00000, 330.00000, 24.40000, + 0.21409, 5.86000, 0.43100, 7.39670, 7.00000, 19.10000, 377.07000, 6.43800, 8.90000, 330.00000, 24.80000, + 0.08221, 5.86000, 0.43100, 8.90670, 7.00000, 19.10000, 386.09000, 6.95700, 6.80000, 330.00000, 29.60000, + 0.36894, 5.86000, 0.43100, 8.90670, 7.00000, 19.10000, 396.90000, 8.25900, 8.40000, 330.00000, 42.80000, + 0.04819, 3.64000, 0.39200, 9.22030, 1.00000, 16.40000, 392.89000, 6.10800, 32.00000, 
315.00000, 21.90000, + 0.03548, 3.64000, 0.39200, 9.22030, 1.00000, 16.40000, 395.18000, 5.87600, 19.10000, 315.00000, 20.90000, + 0.01538, 3.75000, 0.39400, 6.33610, 3.00000, 15.90000, 386.34000, 7.45400, 34.20000, 244.00000, 44.00000, + 0.61154, 3.97000, 0.64700, 1.80100, 5.00000, 13.00000, 389.70000, 8.70400, 86.90000, 264.00000, 50.00000, + 0.66351, 3.97000, 0.64700, 1.89460, 5.00000, 13.00000, 383.29000, 7.33300, 100.00000, 264.00000, 36.00000, + 0.65665, 3.97000, 0.64700, 2.01070, 5.00000, 13.00000, 391.93000, 6.84200, 100.00000, 264.00000, 30.10000, + 0.54011, 3.97000, 0.64700, 2.11210, 5.00000, 13.00000, 392.80000, 7.20300, 81.80000, 264.00000, 33.80000, + 0.53412, 3.97000, 0.64700, 2.13980, 5.00000, 13.00000, 388.37000, 7.52000, 89.40000, 264.00000, 43.10000, + 0.52014, 3.97000, 0.64700, 2.28850, 5.00000, 13.00000, 386.86000, 8.39800, 91.50000, 264.00000, 48.80000, + 0.82526, 3.97000, 0.64700, 2.07880, 5.00000, 13.00000, 393.42000, 7.32700, 94.50000, 264.00000, 31.00000, + 0.55007, 3.97000, 0.64700, 1.93010, 5.00000, 13.00000, 387.89000, 7.20600, 91.60000, 264.00000, 36.50000, + 0.76162, 3.97000, 0.64700, 1.98650, 5.00000, 13.00000, 392.40000, 5.56000, 62.80000, 264.00000, 22.80000, + 0.78570, 3.97000, 0.64700, 2.13290, 5.00000, 13.00000, 384.07000, 7.01400, 84.60000, 264.00000, 30.70000, + 0.57834, 3.97000, 0.57500, 2.42160, 5.00000, 13.00000, 384.54000, 8.29700, 67.00000, 264.00000, 50.00000, + 0.54050, 3.97000, 0.57500, 2.87200, 5.00000, 13.00000, 390.30000, 7.47000, 52.60000, 264.00000, 43.50000, + 0.09065, 6.96000, 0.46400, 3.91750, 3.00000, 18.60000, 391.34000, 5.92000, 61.50000, 223.00000, 20.70000, + 0.29916, 6.96000, 0.46400, 4.42900, 3.00000, 18.60000, 388.65000, 5.85600, 42.10000, 223.00000, 21.10000, + 0.16211, 6.96000, 0.46400, 4.42900, 3.00000, 18.60000, 396.90000, 6.24000, 16.30000, 223.00000, 25.20000, + 0.11460, 6.96000, 0.46400, 3.91750, 3.00000, 18.60000, 394.96000, 6.53800, 58.70000, 223.00000, 24.40000, + 0.22188, 6.96000, 0.46400, 4.36650, 3.00000, 18.60000, 390.77000, 7.69100, 51.80000, 223.00000, 35.20000, + 0.05644, 6.41000, 0.44700, 4.07760, 4.00000, 17.60000, 396.90000, 6.75800, 32.90000, 254.00000, 32.40000, + 0.09604, 6.41000, 0.44700, 4.26730, 4.00000, 17.60000, 396.90000, 6.85400, 42.80000, 254.00000, 32.00000, + 0.10469, 6.41000, 0.44700, 4.78720, 4.00000, 17.60000, 389.25000, 7.26700, 49.00000, 254.00000, 33.20000, + 0.06127, 6.41000, 0.44700, 4.86280, 4.00000, 17.60000, 393.45000, 6.82600, 27.60000, 254.00000, 33.10000, + 0.07978, 6.41000, 0.44700, 4.14030, 4.00000, 17.60000, 396.90000, 6.48200, 32.10000, 254.00000, 29.10000, + 0.21038, 3.33000, 0.44290, 4.10070, 5.00000, 14.90000, 396.90000, 6.81200, 32.20000, 216.00000, 35.10000, + 0.03578, 3.33000, 0.44290, 4.69470, 5.00000, 14.90000, 387.31000, 7.82000, 64.50000, 216.00000, 45.40000, + 0.03705, 3.33000, 0.44290, 5.24470, 5.00000, 14.90000, 392.23000, 6.96800, 37.20000, 216.00000, 35.40000, + 0.06129, 3.33000, 0.44290, 5.21190, 5.00000, 14.90000, 377.07000, 7.64500, 49.70000, 216.00000, 46.00000, + 0.01501, 1.21000, 0.40100, 5.88500, 1.00000, 13.60000, 395.52000, 7.92300, 24.80000, 198.00000, 50.00000, + 0.00906, 2.97000, 0.40000, 7.30730, 1.00000, 15.30000, 394.72000, 7.08800, 20.80000, 285.00000, 32.20000, + 0.01096, 2.25000, 0.38900, 7.30730, 1.00000, 15.30000, 394.72000, 6.45300, 31.90000, 300.00000, 22.00000, + 0.01965, 1.76000, 0.38500, 9.08920, 1.00000, 18.20000, 341.60000, 6.23000, 31.50000, 241.00000, 20.10000, + 0.03871, 5.32000, 0.40500, 7.31720, 6.00000, 16.60000, 396.90000, 
6.20900, 31.30000, 293.00000, 23.20000, + 0.04590, 5.32000, 0.40500, 7.31720, 6.00000, 16.60000, 396.90000, 6.31500, 45.60000, 293.00000, 22.30000, + 0.04297, 5.32000, 0.40500, 7.31720, 6.00000, 16.60000, 371.72000, 6.56500, 22.90000, 293.00000, 24.80000, + 0.03502, 4.95000, 0.41100, 5.11670, 4.00000, 19.20000, 396.90000, 6.86100, 27.90000, 245.00000, 28.50000, + 0.07886, 4.95000, 0.41100, 5.11670, 4.00000, 19.20000, 396.90000, 7.14800, 27.70000, 245.00000, 37.30000, + 0.03615, 4.95000, 0.41100, 5.11670, 4.00000, 19.20000, 396.90000, 6.63000, 23.40000, 245.00000, 27.90000, + 0.08265, 13.92000, 0.43700, 5.50270, 4.00000, 16.00000, 396.90000, 6.12700, 18.40000, 289.00000, 23.90000, + 0.08199, 13.92000, 0.43700, 5.50270, 4.00000, 16.00000, 396.90000, 6.00900, 42.30000, 289.00000, 21.70000, + 0.12932, 13.92000, 0.43700, 5.96040, 4.00000, 16.00000, 396.90000, 6.67800, 31.10000, 289.00000, 28.60000, + 0.05372, 13.92000, 0.43700, 5.96040, 4.00000, 16.00000, 392.85000, 6.54900, 51.00000, 289.00000, 27.10000, + 0.14103, 13.92000, 0.43700, 6.32000, 4.00000, 16.00000, 396.90000, 5.79000, 58.00000, 289.00000, 20.30000, + 0.06466, 2.24000, 0.40000, 7.82780, 5.00000, 14.80000, 368.24000, 6.34500, 20.10000, 358.00000, 22.50000, + 0.05561, 2.24000, 0.40000, 7.82780, 5.00000, 14.80000, 371.58000, 7.04100, 10.00000, 358.00000, 29.00000, + 0.04417, 2.24000, 0.40000, 7.82780, 5.00000, 14.80000, 390.86000, 6.87100, 47.40000, 358.00000, 24.80000, + 0.03537, 6.09000, 0.43300, 5.49170, 7.00000, 16.10000, 395.75000, 6.59000, 40.40000, 329.00000, 22.00000, + 0.09266, 6.09000, 0.43300, 5.49170, 7.00000, 16.10000, 383.61000, 6.49500, 18.40000, 329.00000, 26.40000, + 0.10000, 6.09000, 0.43300, 5.49170, 7.00000, 16.10000, 390.43000, 6.98200, 17.70000, 329.00000, 33.10000, + 0.05515, 2.18000, 0.47200, 4.02200, 7.00000, 18.40000, 393.68000, 7.23600, 41.10000, 222.00000, 36.10000, + 0.05479, 2.18000, 0.47200, 3.37000, 7.00000, 18.40000, 393.36000, 6.61600, 58.10000, 222.00000, 28.40000, + 0.07503, 2.18000, 0.47200, 3.09920, 7.00000, 18.40000, 396.90000, 7.42000, 71.90000, 222.00000, 33.40000, + 0.04932, 2.18000, 0.47200, 3.18270, 7.00000, 18.40000, 396.90000, 6.84900, 70.30000, 222.00000, 28.20000, + 0.49298, 9.90000, 0.54400, 3.31750, 4.00000, 18.40000, 396.90000, 6.63500, 82.50000, 304.00000, 22.80000, + 0.34940, 9.90000, 0.54400, 3.10250, 4.00000, 18.40000, 396.24000, 5.97200, 76.70000, 304.00000, 20.30000, + 2.63548, 9.90000, 0.54400, 2.51940, 4.00000, 18.40000, 350.45000, 4.97300, 37.80000, 304.00000, 16.10000, + 0.79041, 9.90000, 0.54400, 2.64030, 4.00000, 18.40000, 396.90000, 6.12200, 52.80000, 304.00000, 22.10000, + 0.26169, 9.90000, 0.54400, 2.83400, 4.00000, 18.40000, 396.30000, 6.02300, 90.40000, 304.00000, 19.40000, + 0.26938, 9.90000, 0.54400, 3.26280, 4.00000, 18.40000, 393.39000, 6.26600, 82.80000, 304.00000, 21.60000, + 0.36920, 9.90000, 0.54400, 3.60230, 4.00000, 18.40000, 395.69000, 6.56700, 87.30000, 304.00000, 23.80000, + 0.25356, 9.90000, 0.54400, 3.94500, 4.00000, 18.40000, 396.42000, 5.70500, 77.70000, 304.00000, 16.20000, + 0.31827, 9.90000, 0.54400, 3.99860, 4.00000, 18.40000, 390.70000, 5.91400, 83.20000, 304.00000, 17.80000, + 0.24522, 9.90000, 0.54400, 4.03170, 4.00000, 18.40000, 396.90000, 5.78200, 71.70000, 304.00000, 19.80000, + 0.40202, 9.90000, 0.54400, 3.53250, 4.00000, 18.40000, 395.21000, 6.38200, 67.20000, 304.00000, 23.10000, + 0.47547, 9.90000, 0.54400, 4.00190, 4.00000, 18.40000, 396.23000, 6.11300, 58.80000, 304.00000, 21.00000, + 0.16760, 7.38000, 0.49300, 4.54040, 5.00000, 
19.60000, 396.90000, 6.42600, 52.30000, 287.00000, 23.80000, + 0.18159, 7.38000, 0.49300, 4.54040, 5.00000, 19.60000, 396.90000, 6.37600, 54.30000, 287.00000, 23.10000, + 0.35114, 7.38000, 0.49300, 4.72110, 5.00000, 19.60000, 396.90000, 6.04100, 49.90000, 287.00000, 20.40000, + 0.28392, 7.38000, 0.49300, 4.72110, 5.00000, 19.60000, 391.13000, 5.70800, 74.30000, 287.00000, 18.50000, + 0.34109, 7.38000, 0.49300, 4.72110, 5.00000, 19.60000, 396.90000, 6.41500, 40.10000, 287.00000, 25.00000, + 0.19186, 7.38000, 0.49300, 5.41590, 5.00000, 19.60000, 393.68000, 6.43100, 14.70000, 287.00000, 24.60000, + 0.30347, 7.38000, 0.49300, 5.41590, 5.00000, 19.60000, 396.90000, 6.31200, 28.90000, 287.00000, 23.00000, + 0.24103, 7.38000, 0.49300, 5.41590, 5.00000, 19.60000, 396.90000, 6.08300, 43.70000, 287.00000, 22.20000, + 0.06617, 3.24000, 0.46000, 5.21460, 4.00000, 16.90000, 382.44000, 5.86800, 25.80000, 430.00000, 19.30000, + 0.06724, 3.24000, 0.46000, 5.21460, 4.00000, 16.90000, 375.21000, 6.33300, 17.20000, 430.00000, 22.60000, + 0.04544, 3.24000, 0.46000, 5.87360, 4.00000, 16.90000, 368.57000, 6.14400, 32.20000, 430.00000, 19.80000, + 0.05023, 6.06000, 0.43790, 6.64070, 1.00000, 16.90000, 394.02000, 5.70600, 28.40000, 304.00000, 17.10000, + 0.03466, 6.06000, 0.43790, 6.64070, 1.00000, 16.90000, 362.25000, 6.03100, 23.30000, 304.00000, 19.40000, + 0.05083, 5.19000, 0.51500, 6.45840, 5.00000, 20.20000, 389.71000, 6.31600, 38.10000, 224.00000, 22.20000, + 0.03738, 5.19000, 0.51500, 6.45840, 5.00000, 20.20000, 389.40000, 6.31000, 38.50000, 224.00000, 20.70000, + 0.03961, 5.19000, 0.51500, 5.98530, 5.00000, 20.20000, 396.90000, 6.03700, 34.50000, 224.00000, 21.10000, + 0.03427, 5.19000, 0.51500, 5.23110, 5.00000, 20.20000, 396.90000, 5.86900, 46.30000, 224.00000, 19.50000, + 0.03041, 5.19000, 0.51500, 5.61500, 5.00000, 20.20000, 394.81000, 5.89500, 59.60000, 224.00000, 18.50000, + 0.03306, 5.19000, 0.51500, 4.81220, 5.00000, 20.20000, 396.14000, 6.05900, 37.30000, 224.00000, 20.60000, + 0.05497, 5.19000, 0.51500, 4.81220, 5.00000, 20.20000, 396.90000, 5.98500, 45.40000, 224.00000, 19.00000, + 0.06151, 5.19000, 0.51500, 4.81220, 5.00000, 20.20000, 396.90000, 5.96800, 58.50000, 224.00000, 18.70000, + 0.01301, 1.52000, 0.44200, 7.03790, 1.00000, 15.50000, 394.74000, 7.24100, 49.30000, 284.00000, 32.70000, + 0.02498, 1.89000, 0.51800, 6.26690, 1.00000, 15.90000, 389.96000, 6.54000, 59.70000, 422.00000, 16.50000, + 0.02543, 3.78000, 0.48400, 5.73210, 5.00000, 17.60000, 396.90000, 6.69600, 56.40000, 370.00000, 23.90000, + 0.03049, 3.78000, 0.48400, 6.46540, 5.00000, 17.60000, 387.97000, 6.87400, 28.10000, 370.00000, 31.20000, + 0.03113, 4.39000, 0.44200, 8.01360, 3.00000, 18.80000, 385.64000, 6.01400, 48.50000, 352.00000, 17.50000, + 0.06162, 4.39000, 0.44200, 8.01360, 3.00000, 18.80000, 364.61000, 5.89800, 52.30000, 352.00000, 17.20000, + 0.01870, 4.15000, 0.42900, 8.53530, 4.00000, 17.90000, 392.43000, 6.51600, 27.70000, 351.00000, 23.10000, + 0.01501, 2.01000, 0.43500, 8.34400, 4.00000, 17.00000, 390.94000, 6.63500, 29.70000, 280.00000, 24.50000, + 0.02899, 1.25000, 0.42900, 8.79210, 1.00000, 19.70000, 389.85000, 6.93900, 34.50000, 335.00000, 26.60000, + 0.06211, 1.25000, 0.42900, 8.79210, 1.00000, 19.70000, 396.90000, 6.49000, 44.40000, 335.00000, 22.90000, + 0.07950, 1.69000, 0.41100, 10.71030, 4.00000, 18.30000, 370.78000, 6.57900, 35.90000, 411.00000, 24.10000, + 0.07244, 1.69000, 0.41100, 10.71030, 4.00000, 18.30000, 392.33000, 5.88400, 18.50000, 411.00000, 18.60000, + 0.01709, 2.02000, 0.41000, 
12.12650, 5.00000, 17.00000, 384.46000, 6.72800, 36.10000, 187.00000, 30.10000, + 0.04301, 1.91000, 0.41300, 10.58570, 4.00000, 22.00000, 382.80000, 5.66300, 21.90000, 334.00000, 18.20000, + 0.10659, 1.91000, 0.41300, 10.58570, 4.00000, 22.00000, 376.04000, 5.93600, 19.50000, 334.00000, 20.60000, + 8.98296, 18.10000, 0.77000, 2.12220, 24.00000, 20.20000, 377.73000, 6.21200, 97.40000, 666.00000, 17.80000, + 3.84970, 18.10000, 0.77000, 2.50520, 24.00000, 20.20000, 391.34000, 6.39500, 91.00000, 666.00000, 21.70000, + 5.20177, 18.10000, 0.77000, 2.72270, 24.00000, 20.20000, 395.43000, 6.12700, 83.40000, 666.00000, 22.70000, + 4.26131, 18.10000, 0.77000, 2.50910, 24.00000, 20.20000, 390.74000, 6.11200, 81.30000, 666.00000, 22.60000, + 4.54192, 18.10000, 0.77000, 2.51820, 24.00000, 20.20000, 374.56000, 6.39800, 88.00000, 666.00000, 25.00000, + 3.83684, 18.10000, 0.77000, 2.29550, 24.00000, 20.20000, 350.65000, 6.25100, 91.10000, 666.00000, 19.90000, + 3.67822, 18.10000, 0.77000, 2.10360, 24.00000, 20.20000, 380.79000, 5.36200, 96.20000, 666.00000, 20.80000, + 4.22239, 18.10000, 0.77000, 1.90470, 24.00000, 20.20000, 353.04000, 5.80300, 89.00000, 666.00000, 16.80000, + 3.47428, 18.10000, 0.71800, 1.90470, 24.00000, 20.20000, 354.55000, 8.78000, 82.90000, 666.00000, 21.90000, + 4.55587, 18.10000, 0.71800, 1.61320, 24.00000, 20.20000, 354.70000, 3.56100, 87.90000, 666.00000, 27.50000, + 3.69695, 18.10000, 0.71800, 1.75230, 24.00000, 20.20000, 316.03000, 4.96300, 91.40000, 666.00000, 21.90000, + 13.52220, 18.10000, 0.63100, 1.51060, 24.00000, 20.20000, 131.42000, 3.86300, 100.00000, 666.00000, 23.10000, + 4.89822, 18.10000, 0.63100, 1.33250, 24.00000, 20.20000, 375.52000, 4.97000, 100.00000, 666.00000, 50.00000, + 5.66998, 18.10000, 0.63100, 1.35670, 24.00000, 20.20000, 375.33000, 6.68300, 96.80000, 666.00000, 50.00000, + 6.53876, 18.10000, 0.63100, 1.20240, 24.00000, 20.20000, 392.05000, 7.01600, 97.50000, 666.00000, 50.00000, + 9.23230, 18.10000, 0.63100, 1.16910, 24.00000, 20.20000, 366.15000, 6.21600, 100.00000, 666.00000, 50.00000, + 8.26725, 18.10000, 0.66800, 1.12960, 24.00000, 20.20000, 347.88000, 5.87500, 89.60000, 666.00000, 50.00000, + 11.10810, 18.10000, 0.66800, 1.17420, 24.00000, 20.20000, 396.90000, 4.90600, 100.00000, 666.00000, 13.80000, + 18.49820, 18.10000, 0.66800, 1.13700, 24.00000, 20.20000, 396.90000, 4.13800, 100.00000, 666.00000, 13.80000, + 19.60910, 18.10000, 0.67100, 1.31630, 24.00000, 20.20000, 396.90000, 7.31300, 97.90000, 666.00000, 15.00000, + 15.28800, 18.10000, 0.67100, 1.34490, 24.00000, 20.20000, 363.02000, 6.64900, 93.30000, 666.00000, 13.90000, + 9.82349, 18.10000, 0.67100, 1.35800, 24.00000, 20.20000, 396.90000, 6.79400, 98.80000, 666.00000, 13.30000, + 23.64820, 18.10000, 0.67100, 1.38610, 24.00000, 20.20000, 396.90000, 6.38000, 96.20000, 666.00000, 13.10000, + 17.86670, 18.10000, 0.67100, 1.38610, 24.00000, 20.20000, 393.74000, 6.22300, 100.00000, 666.00000, 10.20000, + 88.97620, 18.10000, 0.67100, 1.41650, 24.00000, 20.20000, 396.90000, 6.96800, 91.90000, 666.00000, 10.40000, + 15.87440, 18.10000, 0.67100, 1.51920, 24.00000, 20.20000, 396.90000, 6.54500, 99.10000, 666.00000, 10.90000, + 9.18702, 18.10000, 0.70000, 1.58040, 24.00000, 20.20000, 396.90000, 5.53600, 100.00000, 666.00000, 11.30000, + 7.99248, 18.10000, 0.70000, 1.53310, 24.00000, 20.20000, 396.90000, 5.52000, 100.00000, 666.00000, 12.30000, + 20.08490, 18.10000, 0.70000, 1.43950, 24.00000, 20.20000, 285.83000, 4.36800, 91.20000, 666.00000, 8.80000, + 16.81180, 18.10000, 0.70000, 1.42610, 24.00000, 
20.20000, 396.90000, 5.27700, 98.10000, 666.00000, 7.20000, + 24.39380, 18.10000, 0.70000, 1.46720, 24.00000, 20.20000, 396.90000, 4.65200, 100.00000, 666.00000, 10.50000, + 22.59710, 18.10000, 0.70000, 1.51840, 24.00000, 20.20000, 396.90000, 5.00000, 89.50000, 666.00000, 7.40000, + 14.33370, 18.10000, 0.70000, 1.58950, 24.00000, 20.20000, 372.92000, 4.88000, 100.00000, 666.00000, 10.20000, + 8.15174, 18.10000, 0.70000, 1.72810, 24.00000, 20.20000, 396.90000, 5.39000, 98.90000, 666.00000, 11.50000, + 6.96215, 18.10000, 0.70000, 1.92650, 24.00000, 20.20000, 394.43000, 5.71300, 97.00000, 666.00000, 15.10000, + 5.29305, 18.10000, 0.70000, 2.16780, 24.00000, 20.20000, 378.38000, 6.05100, 82.50000, 666.00000, 23.20000, + 11.57790, 18.10000, 0.70000, 1.77000, 24.00000, 20.20000, 396.90000, 5.03600, 97.00000, 666.00000, 9.70000, + 8.64476, 18.10000, 0.69300, 1.79120, 24.00000, 20.20000, 396.90000, 6.19300, 92.60000, 666.00000, 13.80000, + 13.35980, 18.10000, 0.69300, 1.78210, 24.00000, 20.20000, 396.90000, 5.88700, 94.70000, 666.00000, 12.70000, + 8.71675, 18.10000, 0.69300, 1.72570, 24.00000, 20.20000, 391.98000, 6.47100, 98.80000, 666.00000, 13.10000, + 5.87205, 18.10000, 0.69300, 1.67680, 24.00000, 20.20000, 396.90000, 6.40500, 96.00000, 666.00000, 12.50000, + 7.67202, 18.10000, 0.69300, 1.63340, 24.00000, 20.20000, 393.10000, 5.74700, 98.90000, 666.00000, 8.50000, + 38.35180, 18.10000, 0.69300, 1.48960, 24.00000, 20.20000, 396.90000, 5.45300, 100.00000, 666.00000, 5.00000, + 9.91655, 18.10000, 0.69300, 1.50040, 24.00000, 20.20000, 338.16000, 5.85200, 77.80000, 666.00000, 6.30000, + 25.04610, 18.10000, 0.69300, 1.58880, 24.00000, 20.20000, 396.90000, 5.98700, 100.00000, 666.00000, 5.60000, + 14.23620, 18.10000, 0.69300, 1.57410, 24.00000, 20.20000, 396.90000, 6.34300, 100.00000, 666.00000, 7.20000, + 9.59571, 18.10000, 0.69300, 1.63900, 24.00000, 20.20000, 376.11000, 6.40400, 100.00000, 666.00000, 12.10000, + 24.80170, 18.10000, 0.69300, 1.70280, 24.00000, 20.20000, 396.90000, 5.34900, 96.00000, 666.00000, 8.30000, + 41.52920, 18.10000, 0.69300, 1.60740, 24.00000, 20.20000, 329.46000, 5.53100, 85.40000, 666.00000, 8.50000, + 67.92080, 18.10000, 0.69300, 1.42540, 24.00000, 20.20000, 384.97000, 5.68300, 100.00000, 666.00000, 5.00000, + 20.71620, 18.10000, 0.65900, 1.17810, 24.00000, 20.20000, 370.22000, 4.13800, 100.00000, 666.00000, 11.90000, + 11.95110, 18.10000, 0.65900, 1.28520, 24.00000, 20.20000, 332.09000, 5.60800, 100.00000, 666.00000, 27.90000, + 7.40389, 18.10000, 0.59700, 1.45470, 24.00000, 20.20000, 314.64000, 5.61700, 97.90000, 666.00000, 17.20000, + 14.43830, 18.10000, 0.59700, 1.46550, 24.00000, 20.20000, 179.36000, 6.85200, 100.00000, 666.00000, 27.50000, + 51.13580, 18.10000, 0.59700, 1.41300, 24.00000, 20.20000, 2.60000, 5.75700, 100.00000, 666.00000, 15.00000, + 14.05070, 18.10000, 0.59700, 1.52750, 24.00000, 20.20000, 35.05000, 6.65700, 100.00000, 666.00000, 17.20000, + 18.81100, 18.10000, 0.59700, 1.55390, 24.00000, 20.20000, 28.79000, 4.62800, 100.00000, 666.00000, 17.90000, + 28.65580, 18.10000, 0.59700, 1.58940, 24.00000, 20.20000, 210.97000, 5.15500, 100.00000, 666.00000, 16.30000, + 45.74610, 18.10000, 0.69300, 1.65820, 24.00000, 20.20000, 88.27000, 4.51900, 100.00000, 666.00000, 7.00000, + 18.08460, 18.10000, 0.67900, 1.83470, 24.00000, 20.20000, 27.25000, 6.43400, 100.00000, 666.00000, 7.20000, + 10.83420, 18.10000, 0.67900, 1.81950, 24.00000, 20.20000, 21.57000, 6.78200, 90.80000, 666.00000, 7.50000, + 25.94060, 18.10000, 0.67900, 1.64750, 24.00000, 20.20000, 
127.36000, 5.30400, 89.10000, 666.00000, 10.40000, + 73.53410, 18.10000, 0.67900, 1.80260, 24.00000, 20.20000, 16.45000, 5.95700, 100.00000, 666.00000, 8.80000, + 11.81230, 18.10000, 0.71800, 1.79400, 24.00000, 20.20000, 48.45000, 6.82400, 76.50000, 666.00000, 8.40000, + 11.08740, 18.10000, 0.71800, 1.85890, 24.00000, 20.20000, 318.75000, 6.41100, 100.00000, 666.00000, 16.70000, + 7.02259, 18.10000, 0.71800, 1.87460, 24.00000, 20.20000, 319.98000, 6.00600, 95.30000, 666.00000, 14.20000, + 12.04820, 18.10000, 0.61400, 1.95120, 24.00000, 20.20000, 291.55000, 5.64800, 87.60000, 666.00000, 20.80000, + 7.05042, 18.10000, 0.61400, 2.02180, 24.00000, 20.20000, 2.52000, 6.10300, 85.10000, 666.00000, 13.40000, + 8.79212, 18.10000, 0.58400, 2.06350, 24.00000, 20.20000, 3.65000, 5.56500, 70.60000, 666.00000, 11.70000, + 15.86030, 18.10000, 0.67900, 1.90960, 24.00000, 20.20000, 7.68000, 5.89600, 95.40000, 666.00000, 8.30000, + 12.24720, 18.10000, 0.58400, 1.99760, 24.00000, 20.20000, 24.65000, 5.83700, 59.70000, 666.00000, 10.20000, + 37.66190, 18.10000, 0.67900, 1.86290, 24.00000, 20.20000, 18.82000, 6.20200, 78.70000, 666.00000, 10.90000, + 7.36711, 18.10000, 0.67900, 1.93560, 24.00000, 20.20000, 96.73000, 6.19300, 78.10000, 666.00000, 11.00000, + 9.33889, 18.10000, 0.67900, 1.96820, 24.00000, 20.20000, 60.72000, 6.38000, 95.60000, 666.00000, 9.50000, + 8.49213, 18.10000, 0.58400, 2.05270, 24.00000, 20.20000, 83.45000, 6.34800, 86.10000, 666.00000, 14.50000, + 10.06230, 18.10000, 0.58400, 2.08820, 24.00000, 20.20000, 81.33000, 6.83300, 94.30000, 666.00000, 14.10000, + 6.44405, 18.10000, 0.58400, 2.20040, 24.00000, 20.20000, 97.95000, 6.42500, 74.80000, 666.00000, 16.10000, + 5.58107, 18.10000, 0.71300, 2.31580, 24.00000, 20.20000, 100.19000, 6.43600, 87.90000, 666.00000, 14.30000, + 13.91340, 18.10000, 0.71300, 2.22220, 24.00000, 20.20000, 100.63000, 6.20800, 95.00000, 666.00000, 11.70000, + 11.16040, 18.10000, 0.74000, 2.12470, 24.00000, 20.20000, 109.85000, 6.62900, 94.60000, 666.00000, 13.40000, + 14.42080, 18.10000, 0.74000, 2.00260, 24.00000, 20.20000, 27.49000, 6.46100, 93.30000, 666.00000, 9.60000, + 15.17720, 18.10000, 0.74000, 1.91420, 24.00000, 20.20000, 9.32000, 6.15200, 100.00000, 666.00000, 8.70000, + 13.67810, 18.10000, 0.74000, 1.82060, 24.00000, 20.20000, 68.95000, 5.93500, 87.90000, 666.00000, 8.40000, + 9.39063, 18.10000, 0.74000, 1.81720, 24.00000, 20.20000, 396.90000, 5.62700, 93.90000, 666.00000, 12.80000, + 22.05110, 18.10000, 0.74000, 1.86620, 24.00000, 20.20000, 391.45000, 5.81800, 92.40000, 666.00000, 10.50000, + 9.72418, 18.10000, 0.74000, 2.06510, 24.00000, 20.20000, 385.96000, 6.40600, 97.20000, 666.00000, 17.10000, + 5.66637, 18.10000, 0.74000, 2.00480, 24.00000, 20.20000, 395.69000, 6.21900, 100.00000, 666.00000, 18.40000, + 9.96654, 18.10000, 0.74000, 1.97840, 24.00000, 20.20000, 386.73000, 6.48500, 100.00000, 666.00000, 15.40000, + 12.80230, 18.10000, 0.74000, 1.89560, 24.00000, 20.20000, 240.52000, 5.85400, 96.60000, 666.00000, 10.80000, + 10.67180, 18.10000, 0.74000, 1.98790, 24.00000, 20.20000, 43.06000, 6.45900, 94.80000, 666.00000, 11.80000, + 6.28807, 18.10000, 0.74000, 2.07200, 24.00000, 20.20000, 318.01000, 6.34100, 96.40000, 666.00000, 14.90000, + 9.92485, 18.10000, 0.74000, 2.19800, 24.00000, 20.20000, 388.52000, 6.25100, 96.60000, 666.00000, 12.60000, + 9.32909, 18.10000, 0.71300, 2.26160, 24.00000, 20.20000, 396.90000, 6.18500, 98.70000, 666.00000, 14.10000, + 7.52601, 18.10000, 0.71300, 2.18500, 24.00000, 20.20000, 304.21000, 6.41700, 98.30000, 666.00000, 
13.00000, + 6.71772, 18.10000, 0.71300, 2.32360, 24.00000, 20.20000, 0.32000, 6.74900, 92.60000, 666.00000, 13.40000, + 5.44114, 18.10000, 0.71300, 2.35520, 24.00000, 20.20000, 355.29000, 6.65500, 98.20000, 666.00000, 15.20000, + 5.09017, 18.10000, 0.71300, 2.36820, 24.00000, 20.20000, 385.09000, 6.29700, 91.80000, 666.00000, 16.10000, + 8.24809, 18.10000, 0.71300, 2.45270, 24.00000, 20.20000, 375.87000, 7.39300, 99.30000, 666.00000, 17.80000, + 9.51363, 18.10000, 0.71300, 2.49610, 24.00000, 20.20000, 6.68000, 6.72800, 94.10000, 666.00000, 14.90000, + 4.75237, 18.10000, 0.71300, 2.43580, 24.00000, 20.20000, 50.92000, 6.52500, 86.50000, 666.00000, 14.10000, + 4.66883, 18.10000, 0.71300, 2.58060, 24.00000, 20.20000, 10.48000, 5.97600, 87.90000, 666.00000, 12.70000, + 8.20058, 18.10000, 0.71300, 2.77920, 24.00000, 20.20000, 3.50000, 5.93600, 80.30000, 666.00000, 13.50000, + 7.75223, 18.10000, 0.71300, 2.78310, 24.00000, 20.20000, 272.21000, 6.30100, 83.70000, 666.00000, 14.90000, + 6.80117, 18.10000, 0.71300, 2.71750, 24.00000, 20.20000, 396.90000, 6.08100, 84.40000, 666.00000, 20.00000, + 4.81213, 18.10000, 0.71300, 2.59750, 24.00000, 20.20000, 255.23000, 6.70100, 90.00000, 666.00000, 16.40000, + 3.69311, 18.10000, 0.71300, 2.56710, 24.00000, 20.20000, 391.43000, 6.37600, 88.40000, 666.00000, 17.70000, + 6.65492, 18.10000, 0.71300, 2.73440, 24.00000, 20.20000, 396.90000, 6.31700, 83.00000, 666.00000, 19.50000, + 5.82115, 18.10000, 0.71300, 2.80160, 24.00000, 20.20000, 393.82000, 6.51300, 89.90000, 666.00000, 20.20000, + 7.83932, 18.10000, 0.65500, 2.96340, 24.00000, 20.20000, 396.90000, 6.20900, 65.40000, 666.00000, 21.40000, + 3.16360, 18.10000, 0.65500, 3.06650, 24.00000, 20.20000, 334.40000, 5.75900, 48.20000, 666.00000, 19.90000, + 3.77498, 18.10000, 0.65500, 2.87150, 24.00000, 20.20000, 22.01000, 5.95200, 84.70000, 666.00000, 19.00000, + 4.42228, 18.10000, 0.58400, 2.54030, 24.00000, 20.20000, 331.29000, 6.00300, 94.50000, 666.00000, 19.10000, + 15.57570, 18.10000, 0.58000, 2.90840, 24.00000, 20.20000, 368.74000, 5.92600, 71.00000, 666.00000, 19.10000, + 13.07510, 18.10000, 0.58000, 2.82370, 24.00000, 20.20000, 396.90000, 5.71300, 56.70000, 666.00000, 20.10000, + 4.34879, 18.10000, 0.58000, 3.03340, 24.00000, 20.20000, 396.90000, 6.16700, 84.00000, 666.00000, 19.90000, + 4.03841, 18.10000, 0.53200, 3.09930, 24.00000, 20.20000, 395.33000, 6.22900, 90.70000, 666.00000, 19.60000, + 3.56868, 18.10000, 0.58000, 2.89650, 24.00000, 20.20000, 393.37000, 6.43700, 75.00000, 666.00000, 23.20000, + 4.64689, 18.10000, 0.61400, 2.53290, 24.00000, 20.20000, 374.68000, 6.98000, 67.60000, 666.00000, 29.80000, + 8.05579, 18.10000, 0.58400, 2.42980, 24.00000, 20.20000, 352.58000, 5.42700, 95.40000, 666.00000, 13.80000, + 6.39312, 18.10000, 0.58400, 2.20600, 24.00000, 20.20000, 302.76000, 6.16200, 97.40000, 666.00000, 13.30000, + 4.87141, 18.10000, 0.61400, 2.30530, 24.00000, 20.20000, 396.21000, 6.48400, 93.60000, 666.00000, 16.70000, + 15.02340, 18.10000, 0.61400, 2.10070, 24.00000, 20.20000, 349.48000, 5.30400, 97.30000, 666.00000, 12.00000, + 10.23300, 18.10000, 0.61400, 2.17050, 24.00000, 20.20000, 379.70000, 6.18500, 96.70000, 666.00000, 14.60000, + 14.33370, 18.10000, 0.61400, 1.95120, 24.00000, 20.20000, 383.32000, 6.22900, 88.00000, 666.00000, 21.40000, + 5.82401, 18.10000, 0.53200, 3.42420, 24.00000, 20.20000, 396.90000, 6.24200, 64.70000, 666.00000, 23.00000, + 5.70818, 18.10000, 0.53200, 3.33170, 24.00000, 20.20000, 393.07000, 6.75000, 74.90000, 666.00000, 23.70000, + 5.73116, 18.10000, 
0.53200, 3.41060, 24.00000, 20.20000, 395.28000, 7.06100, 77.00000, 666.00000, 25.00000, + 2.81838, 18.10000, 0.53200, 4.09830, 24.00000, 20.20000, 392.92000, 5.76200, 40.30000, 666.00000, 21.80000, + 2.37857, 18.10000, 0.58300, 3.72400, 24.00000, 20.20000, 370.73000, 5.87100, 41.90000, 666.00000, 20.60000, + 3.67367, 18.10000, 0.58300, 3.99170, 24.00000, 20.20000, 388.62000, 6.31200, 51.90000, 666.00000, 21.20000, + 5.69175, 18.10000, 0.58300, 3.54590, 24.00000, 20.20000, 392.68000, 6.11400, 79.80000, 666.00000, 19.10000, + 4.83567, 18.10000, 0.58300, 3.15230, 24.00000, 20.20000, 388.22000, 5.90500, 53.20000, 666.00000, 20.60000, + 0.15086, 27.74000, 0.60900, 1.82090, 4.00000, 20.10000, 395.09000, 5.45400, 92.70000, 711.00000, 15.20000, + 0.18337, 27.74000, 0.60900, 1.75540, 4.00000, 20.10000, 344.05000, 5.41400, 98.30000, 711.00000, 7.00000, + 0.20746, 27.74000, 0.60900, 1.82260, 4.00000, 20.10000, 318.43000, 5.09300, 98.00000, 711.00000, 8.10000, + 0.10574, 27.74000, 0.60900, 1.86810, 4.00000, 20.10000, 390.11000, 5.98300, 98.80000, 711.00000, 13.60000, + 0.11132, 27.74000, 0.60900, 2.10990, 4.00000, 20.10000, 396.90000, 5.98300, 83.50000, 711.00000, 20.10000, + 0.17331, 9.69000, 0.58500, 2.38170, 6.00000, 19.20000, 396.90000, 5.70700, 54.00000, 391.00000, 21.80000, + 0.27957, 9.69000, 0.58500, 2.38170, 6.00000, 19.20000, 396.90000, 5.92600, 42.60000, 391.00000, 24.50000, + 0.17899, 9.69000, 0.58500, 2.79860, 6.00000, 19.20000, 393.29000, 5.67000, 28.80000, 391.00000, 23.10000, + 0.28960, 9.69000, 0.58500, 2.79860, 6.00000, 19.20000, 396.90000, 5.39000, 72.90000, 391.00000, 19.70000, + 0.26838, 9.69000, 0.58500, 2.89270, 6.00000, 19.20000, 396.90000, 5.79400, 70.60000, 391.00000, 18.30000, + 0.23912, 9.69000, 0.58500, 2.40910, 6.00000, 19.20000, 396.90000, 6.01900, 65.30000, 391.00000, 21.20000, + 0.17783, 9.69000, 0.58500, 2.39990, 6.00000, 19.20000, 395.77000, 5.56900, 73.50000, 391.00000, 17.50000, + 0.22438, 9.69000, 0.58500, 2.49820, 6.00000, 19.20000, 396.90000, 6.02700, 79.70000, 391.00000, 16.80000, + 0.06263, 11.93000, 0.57300, 2.47860, 1.00000, 21.00000, 391.99000, 6.59300, 69.10000, 273.00000, 22.40000, + 0.04527, 11.93000, 0.57300, 2.28750, 1.00000, 21.00000, 396.90000, 6.12000, 76.70000, 273.00000, 20.60000, + 0.06076, 11.93000, 0.57300, 2.16750, 1.00000, 21.00000, 396.90000, 6.97600, 91.00000, 273.00000, 23.90000, + 0.10959, 11.93000, 0.57300, 2.38890, 1.00000, 21.00000, 393.45000, 6.79400, 89.30000, 273.00000, 22.00000, + 0.04741, 11.93000, 0.57300, 2.50500, 1.00000, 21.00000, 396.90000, 6.03000, 80.80000, 273.00000, 11.90000, +}) diff --git a/stat/car_data_test.go b/stat/car_data_test.go new file mode 100644 index 00000000..ea83b9c8 --- /dev/null +++ b/stat/car_data_test.go @@ -0,0 +1,406 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package stat_test + +import "github.com/gonum/matrix/mat64" + +// ASA Car Exposition Data of Ramos and Donoho (1983) +// http://lib.stat.cmu.edu/datasets/cars.desc +// http://lib.stat.cmu.edu/datasets/cars.data +// Columns are: displacement, horsepower, weight, acceleration, MPG. 
+var carData = mat64.NewDense(392, 5, []float64{ + 307.0, 130.0, 3504.0, 12.0, 18.0, + 350.0, 165.0, 3693.0, 11.5, 15.0, + 318.0, 150.0, 3436.0, 11.0, 18.0, + 304.0, 150.0, 3433.0, 12.0, 16.0, + 302.0, 140.0, 3449.0, 10.5, 17.0, + 429.0, 198.0, 4341.0, 10.0, 15.0, + 454.0, 220.0, 4354.0, 9.0, 14.0, + 440.0, 215.0, 4312.0, 8.5, 14.0, + 455.0, 225.0, 4425.0, 10.0, 14.0, + 390.0, 190.0, 3850.0, 8.5, 15.0, + 383.0, 170.0, 3563.0, 10.0, 15.0, + 340.0, 160.0, 3609.0, 8.0, 14.0, + 400.0, 150.0, 3761.0, 9.5, 15.0, + 455.0, 225.0, 3086.0, 10.0, 14.0, + 113.0, 95.0, 2372.0, 15.0, 24.0, + 198.0, 95.0, 2833.0, 15.5, 22.0, + 199.0, 97.0, 2774.0, 15.5, 18.0, + 200.0, 85.0, 2587.0, 16.0, 21.0, + 97.0, 88.0, 2130.0, 14.5, 27.0, + 97.0, 46.0, 1835.0, 20.5, 26.0, + 110.0, 87.0, 2672.0, 17.5, 25.0, + 107.0, 90.0, 2430.0, 14.5, 24.0, + 104.0, 95.0, 2375.0, 17.5, 25.0, + 121.0, 113.0, 2234.0, 12.5, 26.0, + 199.0, 90.0, 2648.0, 15.0, 21.0, + 360.0, 215.0, 4615.0, 14.0, 10.0, + 307.0, 200.0, 4376.0, 15.0, 10.0, + 318.0, 210.0, 4382.0, 13.5, 11.0, + 304.0, 193.0, 4732.0, 18.5, 9.0, + 97.0, 88.0, 2130.0, 14.5, 27.0, + 140.0, 90.0, 2264.0, 15.5, 28.0, + 113.0, 95.0, 2228.0, 14.0, 25.0, + 232.0, 100.0, 2634.0, 13.0, 19.0, + 225.0, 105.0, 3439.0, 15.5, 16.0, + 250.0, 100.0, 3329.0, 15.5, 17.0, + 250.0, 88.0, 3302.0, 15.5, 19.0, + 232.0, 100.0, 3288.0, 15.5, 18.0, + 350.0, 165.0, 4209.0, 12.0, 14.0, + 400.0, 175.0, 4464.0, 11.5, 14.0, + 351.0, 153.0, 4154.0, 13.5, 14.0, + 318.0, 150.0, 4096.0, 13.0, 14.0, + 383.0, 180.0, 4955.0, 11.5, 12.0, + 400.0, 170.0, 4746.0, 12.0, 13.0, + 400.0, 175.0, 5140.0, 12.0, 13.0, + 258.0, 110.0, 2962.0, 13.5, 18.0, + 140.0, 72.0, 2408.0, 19.0, 22.0, + 250.0, 100.0, 3282.0, 15.0, 19.0, + 250.0, 88.0, 3139.0, 14.5, 18.0, + 122.0, 86.0, 2220.0, 14.0, 23.0, + 116.0, 90.0, 2123.0, 14.0, 28.0, + 79.0, 70.0, 2074.0, 19.5, 30.0, + 88.0, 76.0, 2065.0, 14.5, 30.0, + 71.0, 65.0, 1773.0, 19.0, 31.0, + 72.0, 69.0, 1613.0, 18.0, 35.0, + 97.0, 60.0, 1834.0, 19.0, 27.0, + 91.0, 70.0, 1955.0, 20.5, 26.0, + 113.0, 95.0, 2278.0, 15.5, 24.0, + 97.5, 80.0, 2126.0, 17.0, 25.0, + 97.0, 54.0, 2254.0, 23.5, 23.0, + 140.0, 90.0, 2408.0, 19.5, 20.0, + 122.0, 86.0, 2226.0, 16.5, 21.0, + 350.0, 165.0, 4274.0, 12.0, 13.0, + 400.0, 175.0, 4385.0, 12.0, 14.0, + 318.0, 150.0, 4135.0, 13.5, 15.0, + 351.0, 153.0, 4129.0, 13.0, 14.0, + 304.0, 150.0, 3672.0, 11.5, 17.0, + 429.0, 208.0, 4633.0, 11.0, 11.0, + 350.0, 155.0, 4502.0, 13.5, 13.0, + 350.0, 160.0, 4456.0, 13.5, 12.0, + 400.0, 190.0, 4422.0, 12.5, 13.0, + 70.0, 97.0, 2330.0, 13.5, 19.0, + 304.0, 150.0, 3892.0, 12.5, 15.0, + 307.0, 130.0, 4098.0, 14.0, 13.0, + 302.0, 140.0, 4294.0, 16.0, 13.0, + 318.0, 150.0, 4077.0, 14.0, 14.0, + 121.0, 112.0, 2933.0, 14.5, 18.0, + 121.0, 76.0, 2511.0, 18.0, 22.0, + 120.0, 87.0, 2979.0, 19.5, 21.0, + 96.0, 69.0, 2189.0, 18.0, 26.0, + 122.0, 86.0, 2395.0, 16.0, 22.0, + 97.0, 92.0, 2288.0, 17.0, 28.0, + 120.0, 97.0, 2506.0, 14.5, 23.0, + 98.0, 80.0, 2164.0, 15.0, 28.0, + 97.0, 88.0, 2100.0, 16.5, 27.0, + 350.0, 175.0, 4100.0, 13.0, 13.0, + 304.0, 150.0, 3672.0, 11.5, 14.0, + 350.0, 145.0, 3988.0, 13.0, 13.0, + 302.0, 137.0, 4042.0, 14.5, 14.0, + 318.0, 150.0, 3777.0, 12.5, 15.0, + 429.0, 198.0, 4952.0, 11.5, 12.0, + 400.0, 150.0, 4464.0, 12.0, 13.0, + 351.0, 158.0, 4363.0, 13.0, 13.0, + 318.0, 150.0, 4237.0, 14.5, 14.0, + 440.0, 215.0, 4735.0, 11.0, 13.0, + 455.0, 225.0, 4951.0, 11.0, 12.0, + 360.0, 175.0, 3821.0, 11.0, 13.0, + 225.0, 105.0, 3121.0, 16.5, 18.0, + 250.0, 100.0, 3278.0, 18.0, 16.0, + 232.0, 100.0, 2945.0, 16.0, 18.0, + 
250.0, 88.0, 3021.0, 16.5, 18.0, + 198.0, 95.0, 2904.0, 16.0, 23.0, + 97.0, 46.0, 1950.0, 21.0, 26.0, + 400.0, 150.0, 4997.0, 14.0, 11.0, + 400.0, 167.0, 4906.0, 12.5, 12.0, + 360.0, 170.0, 4654.0, 13.0, 13.0, + 350.0, 180.0, 4499.0, 12.5, 12.0, + 232.0, 100.0, 2789.0, 15.0, 18.0, + 97.0, 88.0, 2279.0, 19.0, 20.0, + 140.0, 72.0, 2401.0, 19.5, 21.0, + 108.0, 94.0, 2379.0, 16.5, 22.0, + 70.0, 90.0, 2124.0, 13.5, 18.0, + 122.0, 85.0, 2310.0, 18.5, 19.0, + 155.0, 107.0, 2472.0, 14.0, 21.0, + 98.0, 90.0, 2265.0, 15.5, 26.0, + 350.0, 145.0, 4082.0, 13.0, 15.0, + 400.0, 230.0, 4278.0, 9.5, 16.0, + 68.0, 49.0, 1867.0, 19.5, 29.0, + 116.0, 75.0, 2158.0, 15.5, 24.0, + 114.0, 91.0, 2582.0, 14.0, 20.0, + 121.0, 112.0, 2868.0, 15.5, 19.0, + 318.0, 150.0, 3399.0, 11.0, 15.0, + 121.0, 110.0, 2660.0, 14.0, 24.0, + 156.0, 122.0, 2807.0, 13.5, 20.0, + 350.0, 180.0, 3664.0, 11.0, 11.0, + 198.0, 95.0, 3102.0, 16.5, 20.0, + 232.0, 100.0, 2901.0, 16.0, 19.0, + 250.0, 100.0, 3336.0, 17.0, 15.0, + 79.0, 67.0, 1950.0, 19.0, 31.0, + 122.0, 80.0, 2451.0, 16.5, 26.0, + 71.0, 65.0, 1836.0, 21.0, 32.0, + 140.0, 75.0, 2542.0, 17.0, 25.0, + 250.0, 100.0, 3781.0, 17.0, 16.0, + 258.0, 110.0, 3632.0, 18.0, 16.0, + 225.0, 105.0, 3613.0, 16.5, 18.0, + 302.0, 140.0, 4141.0, 14.0, 16.0, + 350.0, 150.0, 4699.0, 14.5, 13.0, + 318.0, 150.0, 4457.0, 13.5, 14.0, + 302.0, 140.0, 4638.0, 16.0, 14.0, + 304.0, 150.0, 4257.0, 15.5, 14.0, + 98.0, 83.0, 2219.0, 16.5, 29.0, + 79.0, 67.0, 1963.0, 15.5, 26.0, + 97.0, 78.0, 2300.0, 14.5, 26.0, + 76.0, 52.0, 1649.0, 16.5, 31.0, + 83.0, 61.0, 2003.0, 19.0, 32.0, + 90.0, 75.0, 2125.0, 14.5, 28.0, + 90.0, 75.0, 2108.0, 15.5, 24.0, + 116.0, 75.0, 2246.0, 14.0, 26.0, + 120.0, 97.0, 2489.0, 15.0, 24.0, + 108.0, 93.0, 2391.0, 15.5, 26.0, + 79.0, 67.0, 2000.0, 16.0, 31.0, + 225.0, 95.0, 3264.0, 16.0, 19.0, + 250.0, 105.0, 3459.0, 16.0, 18.0, + 250.0, 72.0, 3432.0, 21.0, 15.0, + 250.0, 72.0, 3158.0, 19.5, 15.0, + 400.0, 170.0, 4668.0, 11.5, 16.0, + 350.0, 145.0, 4440.0, 14.0, 15.0, + 318.0, 150.0, 4498.0, 14.5, 16.0, + 351.0, 148.0, 4657.0, 13.5, 14.0, + 231.0, 110.0, 3907.0, 21.0, 17.0, + 250.0, 105.0, 3897.0, 18.5, 16.0, + 258.0, 110.0, 3730.0, 19.0, 15.0, + 225.0, 95.0, 3785.0, 19.0, 18.0, + 231.0, 110.0, 3039.0, 15.0, 21.0, + 262.0, 110.0, 3221.0, 13.5, 20.0, + 302.0, 129.0, 3169.0, 12.0, 13.0, + 97.0, 75.0, 2171.0, 16.0, 29.0, + 140.0, 83.0, 2639.0, 17.0, 23.0, + 232.0, 100.0, 2914.0, 16.0, 20.0, + 140.0, 78.0, 2592.0, 18.5, 23.0, + 134.0, 96.0, 2702.0, 13.5, 24.0, + 90.0, 71.0, 2223.0, 16.5, 25.0, + 119.0, 97.0, 2545.0, 17.0, 24.0, + 171.0, 97.0, 2984.0, 14.5, 18.0, + 90.0, 70.0, 1937.0, 14.0, 29.0, + 232.0, 90.0, 3211.0, 17.0, 19.0, + 115.0, 95.0, 2694.0, 15.0, 23.0, + 120.0, 88.0, 2957.0, 17.0, 23.0, + 121.0, 98.0, 2945.0, 14.5, 22.0, + 121.0, 115.0, 2671.0, 13.5, 25.0, + 91.0, 53.0, 1795.0, 17.5, 33.0, + 107.0, 86.0, 2464.0, 15.5, 28.0, + 116.0, 81.0, 2220.0, 16.9, 25.0, + 140.0, 92.0, 2572.0, 14.9, 25.0, + 98.0, 79.0, 2255.0, 17.7, 26.0, + 101.0, 83.0, 2202.0, 15.3, 27.0, + 305.0, 140.0, 4215.0, 13.0, 17.5, + 318.0, 150.0, 4190.0, 13.0, 16.0, + 304.0, 120.0, 3962.0, 13.9, 15.5, + 351.0, 152.0, 4215.0, 12.8, 14.5, + 225.0, 100.0, 3233.0, 15.4, 22.0, + 250.0, 105.0, 3353.0, 14.5, 22.0, + 200.0, 81.0, 3012.0, 17.6, 24.0, + 232.0, 90.0, 3085.0, 17.6, 22.5, + 85.0, 52.0, 2035.0, 22.2, 29.0, + 98.0, 60.0, 2164.0, 22.1, 24.5, + 90.0, 70.0, 1937.0, 14.2, 29.0, + 91.0, 53.0, 1795.0, 17.4, 33.0, + 225.0, 100.0, 3651.0, 17.7, 20.0, + 250.0, 78.0, 3574.0, 21.0, 18.0, + 250.0, 110.0, 3645.0, 16.2, 18.5, + 
258.0, 95.0, 3193.0, 17.8, 17.5, + 97.0, 71.0, 1825.0, 12.2, 29.5, + 85.0, 70.0, 1990.0, 17.0, 32.0, + 97.0, 75.0, 2155.0, 16.4, 28.0, + 140.0, 72.0, 2565.0, 13.6, 26.5, + 130.0, 102.0, 3150.0, 15.7, 20.0, + 318.0, 150.0, 3940.0, 13.2, 13.0, + 120.0, 88.0, 3270.0, 21.9, 19.0, + 156.0, 108.0, 2930.0, 15.5, 19.0, + 168.0, 120.0, 3820.0, 16.7, 16.5, + 350.0, 180.0, 4380.0, 12.1, 16.5, + 350.0, 145.0, 4055.0, 12.0, 13.0, + 302.0, 130.0, 3870.0, 15.0, 13.0, + 318.0, 150.0, 3755.0, 14.0, 13.0, + 98.0, 68.0, 2045.0, 18.5, 31.5, + 111.0, 80.0, 2155.0, 14.8, 30.0, + 79.0, 58.0, 1825.0, 18.6, 36.0, + 122.0, 96.0, 2300.0, 15.5, 25.5, + 85.0, 70.0, 1945.0, 16.8, 33.5, + 305.0, 145.0, 3880.0, 12.5, 17.5, + 260.0, 110.0, 4060.0, 19.0, 17.0, + 318.0, 145.0, 4140.0, 13.7, 15.5, + 302.0, 130.0, 4295.0, 14.9, 15.0, + 250.0, 110.0, 3520.0, 16.4, 17.5, + 231.0, 105.0, 3425.0, 16.9, 20.5, + 225.0, 100.0, 3630.0, 17.7, 19.0, + 250.0, 98.0, 3525.0, 19.0, 18.5, + 400.0, 180.0, 4220.0, 11.1, 16.0, + 350.0, 170.0, 4165.0, 11.4, 15.5, + 400.0, 190.0, 4325.0, 12.2, 15.5, + 351.0, 149.0, 4335.0, 14.5, 16.0, + 97.0, 78.0, 1940.0, 14.5, 29.0, + 151.0, 88.0, 2740.0, 16.0, 24.5, + 97.0, 75.0, 2265.0, 18.2, 26.0, + 140.0, 89.0, 2755.0, 15.8, 25.5, + 98.0, 63.0, 2051.0, 17.0, 30.5, + 98.0, 83.0, 2075.0, 15.9, 33.5, + 97.0, 67.0, 1985.0, 16.4, 30.0, + 97.0, 78.0, 2190.0, 14.1, 30.5, + 146.0, 97.0, 2815.0, 14.5, 22.0, + 121.0, 110.0, 2600.0, 12.8, 21.5, + 80.0, 110.0, 2720.0, 13.5, 21.5, + 90.0, 48.0, 1985.0, 21.5, 43.1, + 98.0, 66.0, 1800.0, 14.4, 36.1, + 78.0, 52.0, 1985.0, 19.4, 32.8, + 85.0, 70.0, 2070.0, 18.6, 39.4, + 91.0, 60.0, 1800.0, 16.4, 36.1, + 260.0, 110.0, 3365.0, 15.5, 19.9, + 318.0, 140.0, 3735.0, 13.2, 19.4, + 302.0, 139.0, 3570.0, 12.8, 20.2, + 231.0, 105.0, 3535.0, 19.2, 19.2, + 200.0, 95.0, 3155.0, 18.2, 20.5, + 200.0, 85.0, 2965.0, 15.8, 20.2, + 140.0, 88.0, 2720.0, 15.4, 25.1, + 225.0, 100.0, 3430.0, 17.2, 20.5, + 232.0, 90.0, 3210.0, 17.2, 19.4, + 231.0, 105.0, 3380.0, 15.8, 20.6, + 200.0, 85.0, 3070.0, 16.7, 20.8, + 225.0, 110.0, 3620.0, 18.7, 18.6, + 258.0, 120.0, 3410.0, 15.1, 18.1, + 305.0, 145.0, 3425.0, 13.2, 19.2, + 231.0, 165.0, 3445.0, 13.4, 17.7, + 302.0, 139.0, 3205.0, 11.2, 18.1, + 318.0, 140.0, 4080.0, 13.7, 17.5, + 98.0, 68.0, 2155.0, 16.5, 30.0, + 134.0, 95.0, 2560.0, 14.2, 27.5, + 119.0, 97.0, 2300.0, 14.7, 27.2, + 105.0, 75.0, 2230.0, 14.5, 30.9, + 134.0, 95.0, 2515.0, 14.8, 21.1, + 156.0, 105.0, 2745.0, 16.7, 23.2, + 151.0, 85.0, 2855.0, 17.6, 23.8, + 119.0, 97.0, 2405.0, 14.9, 23.9, + 131.0, 103.0, 2830.0, 15.9, 20.3, + 163.0, 125.0, 3140.0, 13.6, 17.0, + 121.0, 115.0, 2795.0, 15.7, 21.6, + 163.0, 133.0, 3410.0, 15.8, 16.2, + 89.0, 71.0, 1990.0, 14.9, 31.5, + 98.0, 68.0, 2135.0, 16.6, 29.5, + 231.0, 115.0, 3245.0, 15.4, 21.5, + 200.0, 85.0, 2990.0, 18.2, 19.8, + 140.0, 88.0, 2890.0, 17.3, 22.3, + 232.0, 90.0, 3265.0, 18.2, 20.2, + 225.0, 110.0, 3360.0, 16.6, 20.6, + 305.0, 130.0, 3840.0, 15.4, 17.0, + 302.0, 129.0, 3725.0, 13.4, 17.6, + 351.0, 138.0, 3955.0, 13.2, 16.5, + 318.0, 135.0, 3830.0, 15.2, 18.2, + 350.0, 155.0, 4360.0, 14.9, 16.9, + 351.0, 142.0, 4054.0, 14.3, 15.5, + 267.0, 125.0, 3605.0, 15.0, 19.2, + 360.0, 150.0, 3940.0, 13.0, 18.5, + 89.0, 71.0, 1925.0, 14.0, 31.9, + 86.0, 65.0, 1975.0, 15.2, 34.1, + 98.0, 80.0, 1915.0, 14.4, 35.7, + 121.0, 80.0, 2670.0, 15.0, 27.4, + 183.0, 77.0, 3530.0, 20.1, 25.4, + 350.0, 125.0, 3900.0, 17.4, 23.0, + 141.0, 71.0, 3190.0, 24.8, 27.2, + 260.0, 90.0, 3420.0, 22.2, 23.9, + 105.0, 70.0, 2200.0, 13.2, 34.2, + 105.0, 70.0, 2150.0, 14.9, 
34.5, + 85.0, 65.0, 2020.0, 19.2, 31.8, + 91.0, 69.0, 2130.0, 14.7, 37.3, + 151.0, 90.0, 2670.0, 16.0, 28.4, + 173.0, 115.0, 2595.0, 11.3, 28.8, + 173.0, 115.0, 2700.0, 12.9, 26.8, + 151.0, 90.0, 2556.0, 13.2, 33.5, + 98.0, 76.0, 2144.0, 14.7, 41.5, + 89.0, 60.0, 1968.0, 18.8, 38.1, + 98.0, 70.0, 2120.0, 15.5, 32.1, + 86.0, 65.0, 2019.0, 16.4, 37.2, + 151.0, 90.0, 2678.0, 16.5, 28.0, + 140.0, 88.0, 2870.0, 18.1, 26.4, + 151.0, 90.0, 3003.0, 20.1, 24.3, + 225.0, 90.0, 3381.0, 18.7, 19.1, + 97.0, 78.0, 2188.0, 15.8, 34.3, + 134.0, 90.0, 2711.0, 15.5, 29.8, + 120.0, 75.0, 2542.0, 17.5, 31.3, + 119.0, 92.0, 2434.0, 15.0, 37.0, + 108.0, 75.0, 2265.0, 15.2, 32.2, + 86.0, 65.0, 2110.0, 17.9, 46.6, + 156.0, 105.0, 2800.0, 14.4, 27.9, + 85.0, 65.0, 2110.0, 19.2, 40.8, + 90.0, 48.0, 2085.0, 21.7, 44.3, + 90.0, 48.0, 2335.0, 23.7, 43.4, + 121.0, 67.0, 2950.0, 19.9, 36.4, + 146.0, 67.0, 3250.0, 21.8, 30.0, + 91.0, 67.0, 1850.0, 13.8, 44.6, + 97.0, 67.0, 2145.0, 18.0, 33.8, + 89.0, 62.0, 1845.0, 15.3, 29.8, + 168.0, 132.0, 2910.0, 11.4, 32.7, + 70.0, 100.0, 2420.0, 12.5, 23.7, + 122.0, 88.0, 2500.0, 15.1, 35.0, + 107.0, 72.0, 2290.0, 17.0, 32.4, + 135.0, 84.0, 2490.0, 15.7, 27.2, + 151.0, 84.0, 2635.0, 16.4, 26.6, + 156.0, 92.0, 2620.0, 14.4, 25.8, + 173.0, 110.0, 2725.0, 12.6, 23.5, + 135.0, 84.0, 2385.0, 12.9, 30.0, + 79.0, 58.0, 1755.0, 16.9, 39.1, + 86.0, 64.0, 1875.0, 16.4, 39.0, + 81.0, 60.0, 1760.0, 16.1, 35.1, + 97.0, 67.0, 2065.0, 17.8, 32.3, + 85.0, 65.0, 1975.0, 19.4, 37.0, + 89.0, 62.0, 2050.0, 17.3, 37.7, + 91.0, 68.0, 1985.0, 16.0, 34.1, + 105.0, 63.0, 2215.0, 14.9, 34.7, + 98.0, 65.0, 2045.0, 16.2, 34.4, + 98.0, 65.0, 2380.0, 20.7, 29.9, + 105.0, 74.0, 2190.0, 14.2, 33.0, + 107.0, 75.0, 2210.0, 14.4, 33.7, + 108.0, 75.0, 2350.0, 16.8, 32.4, + 119.0, 100.0, 2615.0, 14.8, 32.9, + 120.0, 74.0, 2635.0, 18.3, 31.6, + 141.0, 80.0, 3230.0, 20.4, 28.1, + 145.0, 76.0, 3160.0, 19.6, 30.7, + 168.0, 116.0, 2900.0, 12.6, 25.4, + 146.0, 120.0, 2930.0, 13.8, 24.2, + 231.0, 110.0, 3415.0, 15.8, 22.4, + 350.0, 105.0, 3725.0, 19.0, 26.6, + 200.0, 88.0, 3060.0, 17.1, 20.2, + 225.0, 85.0, 3465.0, 16.6, 17.6, + 112.0, 88.0, 2605.0, 19.6, 28.0, + 112.0, 88.0, 2640.0, 18.6, 27.0, + 112.0, 88.0, 2395.0, 18.0, 34.0, + 112.0, 85.0, 2575.0, 16.2, 31.0, + 135.0, 84.0, 2525.0, 16.0, 29.0, + 151.0, 90.0, 2735.0, 18.0, 27.0, + 140.0, 92.0, 2865.0, 16.4, 24.0, + 105.0, 74.0, 1980.0, 15.3, 36.0, + 91.0, 68.0, 2025.0, 18.2, 37.0, + 91.0, 68.0, 1970.0, 17.6, 31.0, + 105.0, 63.0, 2125.0, 14.7, 38.0, + 98.0, 70.0, 2125.0, 17.3, 36.0, + 120.0, 88.0, 2160.0, 14.5, 36.0, + 107.0, 75.0, 2205.0, 14.5, 36.0, + 108.0, 70.0, 2245.0, 16.9, 34.0, + 91.0, 67.0, 1965.0, 15.0, 38.0, + 91.0, 67.0, 1965.0, 15.7, 32.0, + 91.0, 67.0, 1995.0, 16.2, 38.0, + 181.0, 110.0, 2945.0, 16.4, 25.0, + 262.0, 85.0, 3015.0, 17.0, 38.0, + 156.0, 92.0, 2585.0, 14.5, 26.0, + 232.0, 112.0, 2835.0, 14.7, 22.0, + 144.0, 96.0, 2665.0, 13.9, 32.0, + 135.0, 84.0, 2370.0, 13.0, 36.0, + 151.0, 90.0, 2950.0, 17.3, 27.0, + 140.0, 86.0, 2790.0, 15.6, 27.0, + 97.0, 52.0, 2130.0, 24.6, 44.0, + 135.0, 84.0, 2295.0, 11.6, 32.0, + 120.0, 79.0, 2625.0, 18.6, 28.0, + 119.0, 82.0, 2720.0, 19.4, 31.0, +}) diff --git a/stat/cca_example_test.go b/stat/cca_example_test.go new file mode 100644 index 00000000..a7abc013 --- /dev/null +++ b/stat/cca_example_test.go @@ -0,0 +1,165 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package stat_test + +import ( + "fmt" + "log" + + "github.com/gonum/floats" + "github.com/gonum/matrix/mat64" + "github.com/gonum/stat" +) + +// symView is a helper for getting a View of a SymDense. +type symView struct { + sym *mat64.SymDense + + i, j, r, c int +} + +func (s symView) Dims() (r, c int) { return s.r, s.c } + +func (s symView) At(i, j int) float64 { + if i < 0 || s.r <= i { + panic("i out of bounds") + } + if j < 0 || s.c <= j { + panic("j out of bounds") + } + return s.sym.At(s.i+i, s.j+j) +} + +func (s symView) T() mat64.Matrix { return mat64.Transpose{s} } + +func ExampleCC() { + // This example is directly analogous to Example 3.5 on page 87 of + // Koch, Inge. Analysis of multivariate and high-dimensional data. + // Vol. 32. Cambridge University Press, 2013. ISBN: 9780521887939 + + // bostonData is the Boston Housing Data of Harrison and Rubinfeld (1978) + n, _ := bostonData.Dims() + var xd, yd = 7, 4 + // The variables (columns) of bostonData can be partitioned into two sets: + // those that deal with environmental/social variables (xdata), and those + // that contain information regarding the individual (ydata). Because the + // variables can be naturally partitioned in this way, these data are + // appropriate for canonical correlation analysis. The columns (variables) + // of xdata are, in order: + // per capita crime rate by town, + // proportion of non-retail business acres per town, + // nitric oxide concentration (parts per 10 million), + // weighted distances to Boston employment centres, + // index of accessibility to radial highways, + // pupil-teacher ratio by town, and + // proportion of blacks by town. + xdata := bostonData.Slice(0, n, 0, xd) + + // The columns (variables) of ydata are, in order: + // average number of rooms per dwelling, + // proportion of owner-occupied units built prior to 1940, + // full-value property-tax rate per $10000, and + // median value of owner-occupied homes in $1000s. + ydata := bostonData.Slice(0, n, xd, xd+yd) + + // For comparison, calculate the correlation matrix for the original data. + var cor mat64.SymDense + stat.CorrelationMatrix(&cor, bostonData, nil) + + // Extract just those correlations that are between xdata and ydata. + var corRaw = symView{sym: &cor, i: 0, j: xd, r: xd, c: yd} + + // Note that the strongest correlation between individual variables is 0.91 + // between the 5th variable of xdata (index of accessibility to radial + // highways) and the 3rd variable of ydata (full-value property-tax rate per + // $10000). + fmt.Printf("corRaw = %.4f", mat64.Formatted(corRaw, mat64.Prefix(" "))) + + // Calculate the canonical correlations. + var cc stat.CC + err := cc.CanonicalCorrelations(xdata, ydata, nil) + if err != nil { + log.Fatal(err) + } + + // Unpack cc. + ccors := cc.Corrs(nil) + pVecs := cc.Left(nil, true) + qVecs := cc.Right(nil, true) + phiVs := cc.Left(nil, false) + psiVs := cc.Right(nil, false) + + // Canonical Correlation Matrix, or the correlations between the sphered + // data. + var corSph mat64.Dense + corSph.Clone(pVecs) + col := make([]float64, xd) + for j := 0; j < yd; j++ { + mat64.Col(col, j, &corSph) + floats.Scale(ccors[j], col) + corSph.SetCol(j, col) + } + corSph.Product(&corSph, qVecs.T()) + fmt.Printf("\n\ncorSph = %.4f", mat64.Formatted(&corSph, mat64.Prefix(" "))) + + // Canonical Correlations. 
Note that the first canonical correlation is + // 0.95, stronger than the greatest correlation in the original data, and + // much stronger than the greatest correlation in the sphered data. + fmt.Printf("\n\nccors = %.4f", ccors) + + // Left and right eigenvectors of the canonical correlation matrix. + fmt.Printf("\n\npVecs = %.4f", mat64.Formatted(pVecs, mat64.Prefix(" "))) + fmt.Printf("\n\nqVecs = %.4f", mat64.Formatted(qVecs, mat64.Prefix(" "))) + + // Canonical Correlation Transforms. These can be useful as they represent + // the canonical variables as linear combinations of the original variables. + fmt.Printf("\n\nphiVs = %.4f", mat64.Formatted(phiVs, mat64.Prefix(" "))) + fmt.Printf("\n\npsiVs = %.4f", mat64.Formatted(psiVs, mat64.Prefix(" "))) + + // Output: + // corRaw = ⎡-0.2192 0.3527 0.5828 -0.3883⎤ + // ⎢-0.3917 0.6448 0.7208 -0.4837⎥ + // ⎢-0.3022 0.7315 0.6680 -0.4273⎥ + // ⎢ 0.2052 -0.7479 -0.5344 0.2499⎥ + // ⎢-0.2098 0.4560 0.9102 -0.3816⎥ + // ⎢-0.3555 0.2615 0.4609 -0.5078⎥ + // ⎣ 0.1281 -0.2735 -0.4418 0.3335⎦ + // + // corSph = ⎡ 0.0118 0.0525 0.2300 -0.1363⎤ + // ⎢-0.1810 0.3213 0.3814 -0.1412⎥ + // ⎢ 0.0166 0.2241 0.0104 -0.2235⎥ + // ⎢ 0.0346 -0.5481 -0.0034 -0.1994⎥ + // ⎢ 0.0303 -0.0956 0.7152 0.2039⎥ + // ⎢-0.0298 -0.0022 0.0739 -0.3703⎥ + // ⎣-0.1226 -0.0746 -0.3899 0.1541⎦ + // + // ccors = [0.9451 0.6787 0.5714 0.2010] + // + // pVecs = ⎡-0.2574 0.0158 0.2122 -0.0946⎤ + // ⎢-0.4837 0.3837 0.1474 0.6597⎥ + // ⎢-0.0801 0.3494 0.3287 -0.2862⎥ + // ⎢ 0.1278 -0.7337 0.4851 0.2248⎥ + // ⎢-0.6969 -0.4342 -0.3603 0.0291⎥ + // ⎢-0.0991 0.0503 0.6384 0.1022⎥ + // ⎣ 0.4260 0.0323 -0.2290 0.6419⎦ + // + // qVecs = ⎡ 0.0182 -0.1583 -0.0067 -0.9872⎤ + // ⎢-0.2348 0.9483 -0.1462 -0.1554⎥ + // ⎢-0.9701 -0.2406 -0.0252 0.0209⎥ + // ⎣ 0.0593 -0.1330 -0.9889 0.0291⎦ + // + // phiVs = ⎡-0.0027 0.0093 0.0490 -0.0155⎤ + // ⎢-0.0429 -0.0242 0.0361 0.1839⎥ + // ⎢-1.2248 5.6031 5.8094 -4.7927⎥ + // ⎢-0.0044 -0.3424 0.4470 0.1150⎥ + // ⎢-0.0742 -0.1193 -0.1116 0.0022⎥ + // ⎢-0.0233 0.1046 0.3853 -0.0161⎥ + // ⎣ 0.0001 0.0005 -0.0030 0.0082⎦ + // + // psiVs = ⎡ 0.0302 -0.3002 0.0878 -1.9583⎤ + // ⎢-0.0065 0.0392 -0.0118 -0.0061⎥ + // ⎢-0.0052 -0.0046 -0.0023 0.0008⎥ + // ⎣ 0.0020 0.0037 -0.1293 0.1038⎦ +} diff --git a/stat/cca_test.go b/stat/cca_test.go new file mode 100644 index 00000000..92c2dbf0 --- /dev/null +++ b/stat/cca_test.go @@ -0,0 +1,191 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package stat_test + +import ( + "testing" + + "github.com/gonum/floats" + "github.com/gonum/matrix/mat64" + "github.com/gonum/stat" +) + +func TestCanonicalCorrelations(t *testing.T) { +tests: + for i, test := range []struct { + xdata mat64.Matrix + ydata mat64.Matrix + weights []float64 + wantCorrs []float64 + wantpVecs *mat64.Dense + wantqVecs *mat64.Dense + wantphiVs *mat64.Dense + wantpsiVs *mat64.Dense + epsilon float64 + }{ + // Test results verified using R. + { // Truncated iris data, Sepal vs Petal measurements. 
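+			// The rows are the first ten observations of Fisher's iris data
+			// set: sepal length and width in xdata, petal length and width
+			// in ydata.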
+ xdata: mat64.NewDense(10, 2, []float64{ + 5.1, 3.5, + 4.9, 3.0, + 4.7, 3.2, + 4.6, 3.1, + 5.0, 3.6, + 5.4, 3.9, + 4.6, 3.4, + 5.0, 3.4, + 4.4, 2.9, + 4.9, 3.1, + }), + ydata: mat64.NewDense(10, 2, []float64{ + 1.4, 0.2, + 1.4, 0.2, + 1.3, 0.2, + 1.5, 0.2, + 1.4, 0.2, + 1.7, 0.4, + 1.4, 0.3, + 1.5, 0.2, + 1.4, 0.2, + 1.5, 0.1, + }), + wantCorrs: []float64{0.7250624174504773, 0.5547679185730191}, + wantpVecs: mat64.NewDense(2, 2, []float64{ + 0.0765914610875867, 0.9970625597666721, + 0.9970625597666721, -0.0765914610875868, + }), + wantqVecs: mat64.NewDense(2, 2, []float64{ + 0.3075184850910837, 0.9515421069649439, + 0.9515421069649439, -0.3075184850910837, + }), + wantphiVs: mat64.NewDense(2, 2, []float64{ + -1.9794877596804641, 5.2016325219025124, + 4.5211829944066553, -2.7263663170835697, + }), + wantpsiVs: mat64.NewDense(2, 2, []float64{ + -0.0613084818030103, 10.8514169865438941, + 12.7209032660734298, -7.6793888180353775, + }), + epsilon: 1e-12, + }, + // Test results compared to those results presented in examples by + // Koch, Inge. Analysis of multivariate and high-dimensional data. + // Vol. 32. Cambridge University Press, 2013. ISBN: 9780521887939 + { // ASA Car Exposition Data of Ramos and Donoho (1983) + // Displacement, Horsepower, Weight + xdata: carData.Slice(0, 392, 0, 3), + // Acceleration, MPG + ydata: carData.Slice(0, 392, 3, 5), + wantCorrs: []float64{0.8782187384352336, 0.6328187219216761}, + wantpVecs: mat64.NewDense(3, 2, []float64{ + 0.3218296374829181, 0.3947540257657075, + 0.4162807660635797, 0.7573719053303306, + 0.8503740401982725, -0.5201509936144236, + }), + wantqVecs: mat64.NewDense(2, 2, []float64{ + -0.5161984172278830, -0.8564690269072364, + -0.8564690269072364, 0.5161984172278830, + }), + wantphiVs: mat64.NewDense(3, 2, []float64{ + 0.0025033152994308, 0.0047795464118615, + 0.0201923608080173, 0.0409150208725958, + -0.0000247374128745, -0.0026766435161875, + }), + wantpsiVs: mat64.NewDense(2, 2, []float64{ + -0.1666196759760772, -0.3637393866139658, + -0.0915512109649727, 0.1077863777929168, + }), + epsilon: 1e-12, + }, + // Test results compared to those results presented in examples by + // Koch, Inge. Analysis of multivariate and high-dimensional data. + // Vol. 32. Cambridge University Press, 2013. 
ISBN: 9780521887939 + { // Boston Housing Data of Harrison and Rubinfeld (1978) + // Per capita crime rate by town, + // Proportion of non-retail business acres per town, + // Nitric oxide concentration (parts per 10 million), + // Weighted distances to Boston employment centres, + // Index of accessibility to radial highways, + // Pupil-teacher ratio by town, Proportion of blacks by town + xdata: bostonData.Slice(0, 506, 0, 7), + // Average number of rooms per dwelling, + // Proportion of owner-occupied units built prior to 1940, + // Full-value property-tax rate per $10000, + // Median value of owner-occupied homes in $1000s + ydata: bostonData.Slice(0, 506, 7, 11), + wantCorrs: []float64{0.9451239443886021, 0.6786622733370654, 0.5714338361583764, 0.2009739704710440}, + wantpVecs: mat64.NewDense(7, 4, []float64{ + -0.2574391924541903, 0.0158477516621194, 0.2122169934631024, -0.0945733803894706, + -0.4836594430018478, 0.3837101908138468, 0.1474448317415911, 0.6597324886718275, + -0.0800776365873296, 0.3493556742809252, 0.3287336458109373, -0.2862040444334655, + 0.1277586360386374, -0.7337427663667596, 0.4851134819037011, 0.2247964865970192, + -0.6969432006136684, -0.4341748776002893, -0.3602872887636357, 0.0290661608626292, + -0.0990903250057199, 0.0503411215453873, 0.6384330631742202, 0.1022367136218303, + 0.4260459963765036, 0.0323334351308141, -0.2289527516030810, 0.6419232947608805, + }), + wantqVecs: mat64.NewDense(4, 4, []float64{ + 0.0181660502363264, -0.1583489460479038, -0.0066723577642883, -0.9871935400650649, + -0.2347699045986119, 0.9483314614936594, -0.1462420505631345, -0.1554470767919033, + -0.9700704038477141, -0.2406071741000039, -0.0251838984227037, 0.0209134074358349, + 0.0593000682318482, -0.1330460003097728, -0.9889057151969489, 0.0291161494720761, + }), + wantphiVs: mat64.NewDense(7, 4, []float64{ + -0.0027462234108197, 0.0093444513500898, 0.0489643932714296, -0.0154967189805819, + -0.0428564455279537, -0.0241708702119420, 0.0360723472093996, 0.1838983230588095, + -1.2248435648802380, 5.6030921364723980, 5.8094144583797025, -4.7926812190419676, + -0.0043684825094649, -0.3424101164977618, 0.4469961215717917, 0.1150161814353696, + -0.0741534069521954, -0.1193135794923700, -0.1115518305471460, 0.0021638758323088, + -0.0233270323101624, 0.1046330818178399, 0.3853045975077387, -0.0160927870102877, + 0.0001293051387859, 0.0004540746921446, -0.0030296315865440, 0.0081895477974654, + }), + wantpsiVs: mat64.NewDense(4, 4, []float64{ + 0.0301593362017375, -0.3002219289647127, 0.0878217377593682, -1.9583226531517062, + -0.0065483104073892, 0.0392212086716247, -0.0117570776209991, -0.0061113064481860, + -0.0052075523350125, -0.0045770200452960, -0.0022762313289592, 0.0008441873006821, + 0.0020111735096327, 0.0037352799829930, -0.1292578071621794, 0.1037709056329765, + }), + epsilon: 1e-12, + }, + } { + var cc stat.CC + var corrs []float64 + var pVecs, qVecs *mat64.Dense + var phiVs, psiVs *mat64.Dense + for j := 0; j < 2; j++ { + err := cc.CanonicalCorrelations(test.xdata, test.ydata, test.weights) + if err != nil { + t.Errorf("%d use %d: unexpected error: %v", i, j, err) + continue tests + } + + corrs = cc.Corrs(corrs) + pVecs = cc.Left(pVecs, true) + qVecs = cc.Right(qVecs, true) + phiVs = cc.Left(phiVs, false) + psiVs = cc.Right(psiVs, false) + + if !floats.EqualApprox(corrs, test.wantCorrs, test.epsilon) { + t.Errorf("%d use %d: unexpected variance result got:%v, want:%v", + i, j, corrs, test.wantCorrs) + } + if !mat64.EqualApprox(pVecs, test.wantpVecs, test.epsilon) { + 
t.Errorf("%d use %d: unexpected CCA result got:\n%v\nwant:\n%v", + i, j, mat64.Formatted(pVecs), mat64.Formatted(test.wantpVecs)) + } + if !mat64.EqualApprox(qVecs, test.wantqVecs, test.epsilon) { + t.Errorf("%d use %d: unexpected CCA result got:\n%v\nwant:\n%v", + i, j, mat64.Formatted(qVecs), mat64.Formatted(test.wantqVecs)) + } + if !mat64.EqualApprox(phiVs, test.wantphiVs, test.epsilon) { + t.Errorf("%d use %d: unexpected CCA result got:\n%v\nwant:\n%v", + i, j, mat64.Formatted(phiVs), mat64.Formatted(test.wantphiVs)) + } + if !mat64.EqualApprox(psiVs, test.wantpsiVs, test.epsilon) { + t.Errorf("%d use %d: unexpected CCA result got:\n%v\nwant:\n%v", + i, j, mat64.Formatted(psiVs), mat64.Formatted(test.wantpsiVs)) + } + } + } +} diff --git a/stat/combin/combin.go b/stat/combin/combin.go new file mode 100644 index 00000000..4dba68f2 --- /dev/null +++ b/stat/combin/combin.go @@ -0,0 +1,183 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package combin implements routines involving combinatorics (permutations, +// combinations, etc.). +package combin + +import "math" + +const ( + badNegInput = "combin: negative input" + badSetSize = "combin: n < k" + badInput = "combin: wrong input slice length" +) + +// Binomial returns the binomial coefficient of (n,k), also commonly referred to +// as "n choose k". +// +// The binomial coefficient, C(n,k), is the number of unordered combinations of +// k elements in a set that is n elements big, and is defined as +// +// C(n,k) = n!/((n-k)!k!) +// +// n and k must be non-negative with n >= k, otherwise Binomial will panic. +// No check is made for overflow. +func Binomial(n, k int) int { + if n < 0 || k < 0 { + panic(badNegInput) + } + if n < k { + panic(badSetSize) + } + // (n,k) = (n, n-k) + if k > n/2 { + k = n - k + } + b := 1 + for i := 1; i <= k; i++ { + b = (n - k + i) * b / i + } + return b +} + +// GeneralizedBinomial returns the generalized binomial coefficient of (n, k), +// defined as +// Γ(n+1) / (Γ(k+1) Γ(n-k+1)) +// where Γ is the Gamma function. GeneralizedBinomial is useful for continuous +// relaxations of the binomial coefficient, or when the binomial coefficient value +// may overflow int. In the latter case, one may use math/big for an exact +// computation. +// +// n and k must be non-negative with n >= k, otherwise GeneralizedBinomial will panic. +func GeneralizedBinomial(n, k float64) float64 { + return math.Exp(LogGeneralizedBinomial(n, k)) +} + +// LogGeneralizedBinomial returns the log of the generalized binomial coefficient. +// See GeneralizedBinomial for more information. +func LogGeneralizedBinomial(n, k float64) float64 { + if n < 0 || k < 0 { + panic(badNegInput) + } + if n < k { + panic(badSetSize) + } + a, _ := math.Lgamma(n + 1) + b, _ := math.Lgamma(k + 1) + c, _ := math.Lgamma(n - k + 1) + return a - b - c +} + +// CombinationGenerator generates combinations iteratively. Combinations may be +// called to generate all combinations collectively. +type CombinationGenerator struct { + n int + k int + previous []int + remaining int +} + +// NewCombinationGenerator returns a CombinationGenerator for generating the +// combinations of k elements from a set of size n. +// +// n and k must be non-negative with n >= k, otherwise NewCombinationGenerator +// will panic. 
+func NewCombinationGenerator(n, k int) *CombinationGenerator {
+	return &CombinationGenerator{
+		n:         n,
+		k:         k,
+		remaining: Binomial(n, k),
+	}
+}
+
+// Next advances the iterator if there are combinations remaining to be generated,
+// and returns false if all combinations have been generated. Next must be called
+// to initialize the first value before calling Combination or Combination will
+// panic. The value returned by Combination is only changed during calls to Next.
+func (c *CombinationGenerator) Next() bool {
+	if c.remaining <= 0 {
+		// Next is called before Combination, so c.remaining is set to zero before
+		// Combination is called. Thus, Combination cannot panic on zero, and a
+		// second sentinel value is needed.
+		c.remaining = -1
+		return false
+	}
+	if c.previous == nil {
+		c.previous = make([]int, c.k)
+		for i := range c.previous {
+			c.previous[i] = i
+		}
+	} else {
+		nextCombination(c.previous, c.n, c.k)
+	}
+	c.remaining--
+	return true
+}
+
+// Combination generates the next combination. If combination is non-nil, it must
+// have length k and the result will be stored in-place. If combination is nil a
+// new slice will be allocated and returned. If all of the combinations have
+// already been constructed (Next() returns false), Combination will panic.
+//
+// Next must be called to initialize the first value before calling Combination
+// or Combination will panic. The value returned by Combination is only changed
+// during calls to Next.
+func (c *CombinationGenerator) Combination(combination []int) []int {
+	if c.remaining == -1 {
+		panic("combin: all combinations have been generated")
+	}
+	if c.previous == nil {
+		panic("combin: Combination called before Next")
+	}
+	if combination == nil {
+		combination = make([]int, c.k)
+	}
+	if len(combination) != c.k {
+		panic(badInput)
+	}
+	copy(combination, c.previous)
+	return combination
+}
+
+// Combinations generates all of the combinations of k elements from a
+// set of size n. The returned slice has length Binomial(n,k) and each inner slice
+// has length k.
+//
+// n and k must be non-negative with n >= k, otherwise Combinations will panic.
+//
+// CombinationGenerator may alternatively be used to generate the combinations
+// iteratively instead of collectively.
+func Combinations(n, k int) [][]int {
+	combins := Binomial(n, k)
+	data := make([][]int, combins)
+	if len(data) == 0 {
+		return data
+	}
+	data[0] = make([]int, k)
+	for i := range data[0] {
+		data[0][i] = i
+	}
+	for i := 1; i < combins; i++ {
+		next := make([]int, k)
+		copy(next, data[i-1])
+		nextCombination(next, n, k)
+		data[i] = next
+	}
+	return data
+}
+
+// nextCombination generates the combination after s, overwriting the input value.
+func nextCombination(s []int, n, k int) {
+	for j := k - 1; j >= 0; j-- {
+		if s[j] == n+j-k {
+			continue
+		}
+		s[j]++
+		for l := j + 1; l < k; l++ {
+			s[l] = s[j] + l - j
+		}
+		break
+	}
+}
diff --git a/stat/combin/combin_test.go b/stat/combin/combin_test.go
new file mode 100644
index 00000000..791e9391
--- /dev/null
+++ b/stat/combin/combin_test.go
@@ -0,0 +1,181 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package combin
+
+import (
+	"math/big"
+	"testing"
+
+	"github.com/gonum/floats"
+)
+
+// intSosMatch returns true if the two slices of slices are equal.
+func intSosMatch(a, b [][]int) bool { + if len(a) != len(b) { + return false + } + for i, s := range a { + if len(s) != len(b[i]) { + return false + } + for j, v := range s { + if v != b[i][j] { + return false + } + } + } + return true +} + +var binomialTests = []struct { + n, k, ans int +}{ + {0, 0, 1}, + {5, 0, 1}, + {5, 1, 5}, + {5, 2, 10}, + {5, 3, 10}, + {5, 4, 5}, + {5, 5, 1}, + + {6, 0, 1}, + {6, 1, 6}, + {6, 2, 15}, + {6, 3, 20}, + {6, 4, 15}, + {6, 5, 6}, + {6, 6, 1}, + + {20, 0, 1}, + {20, 1, 20}, + {20, 2, 190}, + {20, 3, 1140}, + {20, 4, 4845}, + {20, 5, 15504}, + {20, 6, 38760}, + {20, 7, 77520}, + {20, 8, 125970}, + {20, 9, 167960}, + {20, 10, 184756}, + {20, 11, 167960}, + {20, 12, 125970}, + {20, 13, 77520}, + {20, 14, 38760}, + {20, 15, 15504}, + {20, 16, 4845}, + {20, 17, 1140}, + {20, 18, 190}, + {20, 19, 20}, + {20, 20, 1}, +} + +func TestBinomial(t *testing.T) { + for cas, test := range binomialTests { + ans := Binomial(test.n, test.k) + if ans != test.ans { + t.Errorf("Case %v: Binomial mismatch. Got %v, want %v.", cas, ans, test.ans) + } + } + var ( + n = 61 + want big.Int + got big.Int + ) + for k := 0; k <= n; k++ { + want.Binomial(int64(n), int64(k)) + got.SetInt64(int64(Binomial(n, k))) + if want.Cmp(&got) != 0 { + t.Errorf("Case n=%v,k=%v: Binomial mismatch for large n. Got %v, want %v.", n, k, got, want) + } + } +} + +func TestGeneralizedBinomial(t *testing.T) { + for cas, test := range binomialTests { + ans := GeneralizedBinomial(float64(test.n), float64(test.k)) + if !floats.EqualWithinAbsOrRel(ans, float64(test.ans), 1e-14, 1e-14) { + t.Errorf("Case %v: Binomial mismatch. Got %v, want %v.", cas, ans, test.ans) + } + } +} + +func TestCombinations(t *testing.T) { + for cas, test := range []struct { + n, k int + data [][]int + }{ + { + n: 1, + k: 1, + data: [][]int{{0}}, + }, + { + n: 2, + k: 1, + data: [][]int{{0}, {1}}, + }, + { + n: 2, + k: 2, + data: [][]int{{0, 1}}, + }, + { + n: 3, + k: 1, + data: [][]int{{0}, {1}, {2}}, + }, + { + n: 3, + k: 2, + data: [][]int{{0, 1}, {0, 2}, {1, 2}}, + }, + { + n: 3, + k: 3, + data: [][]int{{0, 1, 2}}, + }, + { + n: 4, + k: 1, + data: [][]int{{0}, {1}, {2}, {3}}, + }, + { + n: 4, + k: 2, + data: [][]int{{0, 1}, {0, 2}, {0, 3}, {1, 2}, {1, 3}, {2, 3}}, + }, + { + n: 4, + k: 3, + data: [][]int{{0, 1, 2}, {0, 1, 3}, {0, 2, 3}, {1, 2, 3}}, + }, + { + n: 4, + k: 4, + data: [][]int{{0, 1, 2, 3}}, + }, + } { + data := Combinations(test.n, test.k) + if !intSosMatch(data, test.data) { + t.Errorf("Cas %v: Generated combinations mismatch. Got %v, want %v.", cas, data, test.data) + } + } +} + +func TestCombinationGenerator(t *testing.T) { + for n := 0; n <= 10; n++ { + for k := 1; k <= n; k++ { + combinations := Combinations(n, k) + cg := NewCombinationGenerator(n, k) + genCombs := make([][]int, 0, len(combinations)) + for cg.Next() { + genCombs = append(genCombs, cg.Combination(nil)) + } + if !intSosMatch(combinations, genCombs) { + t.Errorf("Combinations and generated combinations do not match. n = %v, k = %v", n, k) + } + } + } +} diff --git a/stat/distmat/general.go b/stat/distmat/general.go new file mode 100644 index 00000000..9e778ef6 --- /dev/null +++ b/stat/distmat/general.go @@ -0,0 +1,8 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package distmat provides probability distributions over matrices. 
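+//
+// A sketch of drawing from the Wishart distribution defined in this package
+// (names are illustrative; the mat64 import is assumed in the caller):
+//  v := mat64.NewSymDense(2, []float64{1, 0, 0, 1})
+//  w, ok := distmat.NewWishart(v, 4, nil)
+//  if ok {
+//  	x := w.RandSym(nil) // a random 2×2 symmetric positive definite matrix
+//  	_ = x
+//  }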
+package distmat
+
+var badDim = "distmat: dimension mismatch"
diff --git a/stat/distmat/wishart.go b/stat/distmat/wishart.go
new file mode 100644
index 00000000..0f435aa0
--- /dev/null
+++ b/stat/distmat/wishart.go
@@ -0,0 +1,210 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distmat
+
+import (
+	"math"
+	"math/rand"
+	"sync"
+
+	"github.com/gonum/mathext"
+	"github.com/gonum/matrix"
+	"github.com/gonum/matrix/mat64"
+	"github.com/gonum/stat/distuv"
+)
+
+// Wishart is a distribution over d×d symmetric positive definite matrices. It
+// is parametrized by a scalar degrees of freedom parameter ν and a d×d positive
+// definite matrix V.
+//
+// The Wishart PDF is given by
+//  p(X) = [|X|^((ν-d-1)/2) * exp(-tr(V^-1 * X)/2)] / [2^(ν*d/2) * |V|^(ν/2) * Γ_d(ν/2)]
+// where X is a d×d PSD matrix, ν > d-1, |·| denotes the determinant, tr is the
+// trace and Γ_d is the multivariate gamma function.
+//
+// See https://en.wikipedia.org/wiki/Wishart_distribution for more information.
type Wishart struct {
+	nu  float64
+	src *rand.Rand
+
+	dim     int
+	cholv   mat64.Cholesky
+	logdetv float64
+	upper   mat64.TriDense
+
+	once sync.Once
+	v    *mat64.SymDense // only stored if needed
+}
+
+// NewWishart returns a new Wishart distribution with the given shape matrix and
+// degrees of freedom parameter. NewWishart returns whether the creation was
+// successful.
+//
+// NewWishart panics if nu <= d - 1 where d is the order of v.
+func NewWishart(v mat64.Symmetric, nu float64, src *rand.Rand) (*Wishart, bool) {
+	dim := v.Symmetric()
+	if nu <= float64(dim-1) {
+		panic("wishart: nu must be greater than dim-1")
+	}
+	var chol mat64.Cholesky
+	ok := chol.Factorize(v)
+	if !ok {
+		return nil, false
+	}
+
+	var u mat64.TriDense
+	u.UFromCholesky(&chol)
+
+	w := &Wishart{
+		nu:  nu,
+		src: src,
+
+		dim:     dim,
+		cholv:   chol,
+		logdetv: chol.LogDet(),
+		upper:   u,
+	}
+	return w, true
+}
+
+// MeanSym returns the mean matrix of the distribution as a symmetric matrix.
+// If x is nil, a new matrix is allocated and returned. If x is not nil, the
+// result is stored in-place into x and MeanSym will panic if the order of x
+// is not equal to the order of the receiver.
+func (w *Wishart) MeanSym(x *mat64.SymDense) *mat64.SymDense {
+	if x == nil {
+		x = mat64.NewSymDense(w.dim, nil)
+	}
+	d := x.Symmetric()
+	if d != w.dim {
+		panic(badDim)
+	}
+	w.setV()
+	x.CopySym(w.v)
+	x.ScaleSym(w.nu, x)
+	return x
+}
+
+// ProbSym returns the probability of the symmetric matrix x. If x is not positive
+// definite (the Cholesky decomposition fails), it has 0 probability.
+func (w *Wishart) ProbSym(x mat64.Symmetric) float64 {
+	return math.Exp(w.LogProbSym(x))
+}
+
+// LogProbSym returns the log of the probability of the input symmetric matrix.
+//
+// LogProbSym returns -∞ if the input matrix is not positive definite (the Cholesky
+// decomposition fails).
+func (w *Wishart) LogProbSym(x mat64.Symmetric) float64 {
+	dim := x.Symmetric()
+	if dim != w.dim {
+		panic(badDim)
+	}
+	var chol mat64.Cholesky
+	ok := chol.Factorize(x)
+	if !ok {
+		return math.Inf(-1)
+	}
+	return w.logProbSymChol(&chol)
+}
+
+// LogProbSymChol returns the log of the probability of the input symmetric matrix
+// given its Cholesky decomposition.
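+//
+// A sketch of the intended use, when a factorization of the mat64.Symmetric
+// matrix x is already at hand (names are illustrative):
+//  var chol mat64.Cholesky
+//  if chol.Factorize(x) {
+//  	lp := w.LogProbSymChol(&chol)
+//  	_ = lp
+//  }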
+func (w *Wishart) LogProbSymChol(cholX *mat64.Cholesky) float64 {
+	dim := cholX.Size()
+	if dim != w.dim {
+		panic(badDim)
+	}
+	return w.logProbSymChol(cholX)
+}
+
+func (w *Wishart) logProbSymChol(cholX *mat64.Cholesky) float64 {
+	// The PDF is
+	//  p(X) = [|X|^((ν-d-1)/2) * exp(-tr(V^-1 * X)/2)] / [2^(ν*d/2) * |V|^(ν/2) * Γ_d(ν/2)]
+	// The LogPDF is thus
+	//  (ν-d-1)/2 * log(|X|) - tr(V^-1 * X)/2 - (ν*d/2)*log(2) - ν/2 * log(|V|) - log(Γ_d(ν/2))
+	logdetx := cholX.LogDet()
+
+	// Compute tr(V^-1 * X), using the fact that X = U^T * U.
+	var u mat64.TriDense
+	u.UFromCholesky(cholX)
+
+	var vinvx mat64.Dense
+	err := vinvx.SolveCholesky(&w.cholv, u.T())
+	if err != nil {
+		return math.Inf(-1)
+	}
+	vinvx.Mul(&vinvx, &u)
+	tr := mat64.Trace(&vinvx)
+
+	fnu := w.nu
+	fdim := float64(w.dim)
+
+	return 0.5*((fnu-fdim-1)*logdetx-tr-fnu*fdim*math.Ln2-fnu*w.logdetv) - mathext.MvLgamma(0.5*fnu, w.dim)
+}
+
+// RandSym generates a random symmetric matrix from the distribution.
+func (w *Wishart) RandSym(x *mat64.SymDense) *mat64.SymDense {
+	if x == nil {
+		x = &mat64.SymDense{}
+	}
+	var c mat64.Cholesky
+	w.RandChol(&c)
+	x.FromCholesky(&c)
+	return x
+}
+
+// RandChol generates the Cholesky decomposition of a random matrix from the distribution.
+func (w *Wishart) RandChol(c *mat64.Cholesky) *mat64.Cholesky {
+	// TODO(btracey): Modify the code if the underlying data from c is exposed
+	// to avoid the dim^2 allocation here.
+
+	// Use the Bartlett decomposition, which says that
+	//  X ~ L A A^T L^T
+	// where A is a lower triangular matrix in which the diagonal of A is
+	// generated from the square roots of χ^2 random variables, and the
+	// off-diagonals are generated from standard normal variables.
+	// The above gives the Cholesky decomposition of X, where L_x = L A.
+	//
+	// mat64 works with the upper triangular decomposition, so we would like to do
+	// the same. We can instead say that
+	//  U_x = L_x^T = (L * A)^T = A^T * L^T = A^T * U
+	// Instead, generate A^T by using the procedure above, except as an upper
+	// triangular matrix.
+	norm := distuv.Normal{
+		Mu:     0,
+		Sigma:  1,
+		Source: w.src,
+	}
+
+	t := mat64.NewTriDense(w.dim, matrix.Upper, nil)
+	for i := 0; i < w.dim; i++ {
+		v := distuv.ChiSquared{
+			K:   w.nu - float64(i),
+			Src: w.src,
+		}.Rand()
+		t.SetTri(i, i, math.Sqrt(v))
+	}
+	for i := 0; i < w.dim; i++ {
+		for j := i + 1; j < w.dim; j++ {
+			t.SetTri(i, j, norm.Rand())
+		}
+	}
+
+	t.MulTri(t, &w.upper)
+	if c == nil {
+		c = &mat64.Cholesky{}
+	}
+	c.SetFromU(t)
+	return c
+}
+
+// setV computes and stores the shape matrix V of the distribution.
+func (w *Wishart) setV() {
+	w.once.Do(func() {
+		w.v = mat64.NewSymDense(w.dim, nil)
+		w.v.FromCholesky(&w.cholv)
+	})
+}
diff --git a/stat/distmat/wishart_test.go b/stat/distmat/wishart_test.go
new file mode 100644
index 00000000..10c7c5ce
--- /dev/null
+++ b/stat/distmat/wishart_test.go
@@ -0,0 +1,129 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distmat
+
+import (
+	"math"
+	"math/rand"
+	"testing"
+
+	"github.com/gonum/floats"
+	"github.com/gonum/matrix/mat64"
+)
+
+func TestWishart(t *testing.T) {
+	for c, test := range []struct {
+		v   *mat64.SymDense
+		nu  float64
+		xs  []*mat64.SymDense
+		lps []float64
+	}{
+		// Logprob data compared with scipy.
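+		// A reference value can be reproduced with, e.g.,
+		// scipy.stats.wishart(df=nu, scale=v).logpdf(x), assuming that
+		// version of the scipy API.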
+ { + v: mat64.NewSymDense(2, []float64{1, 0, 0, 1}), + nu: 4, + xs: []*mat64.SymDense{ + mat64.NewSymDense(2, []float64{0.9, 0.1, 0.1, 0.9}), + }, + lps: []float64{-4.2357432031863409}, + }, + { + v: mat64.NewSymDense(2, []float64{0.8, -0.2, -0.2, 0.7}), + nu: 5, + xs: []*mat64.SymDense{ + mat64.NewSymDense(2, []float64{0.9, 0.1, 0.1, 0.9}), + mat64.NewSymDense(2, []float64{0.3, -0.1, -0.1, 0.7}), + }, + lps: []float64{-4.2476495605333575, -4.9993285370378633}, + }, + { + v: mat64.NewSymDense(3, []float64{0.8, 0.3, 0.1, 0.3, 0.7, -0.1, 0.1, -0.1, 7}), + nu: 5, + xs: []*mat64.SymDense{ + mat64.NewSymDense(3, []float64{1, 0.2, -0.3, 0.2, 0.6, -0.2, -0.3, -0.2, 6}), + }, + lps: []float64{-11.010982249229421}, + }, + } { + w, ok := NewWishart(test.v, test.nu, nil) + if !ok { + panic("bad test") + } + for i, x := range test.xs { + lp := w.LogProbSym(x) + + var chol mat64.Cholesky + ok := chol.Factorize(x) + if !ok { + panic("bad test") + } + lpc := w.LogProbSymChol(&chol) + + if math.Abs(lp-lpc) > 1e-14 { + t.Errorf("Case %d, test %d: probability mismatch between chol and not", c, i) + } + if !floats.EqualWithinAbsOrRel(lp, test.lps[i], 1e-14, 1e-14) { + t.Errorf("Case %d, test %d: got %v, want %v", c, i, lp, test.lps[i]) + } + } + + ch := w.RandChol(nil) + w.RandChol(ch) + + s := w.RandSym(nil) + w.RandSym(s) + + } +} + +func TestWishartRand(t *testing.T) { + for c, test := range []struct { + v *mat64.SymDense + nu float64 + samples int + tol float64 + }{ + { + v: mat64.NewSymDense(2, []float64{0.8, -0.2, -0.2, 0.7}), + nu: 5, + samples: 30000, + tol: 3e-2, + }, + { + v: mat64.NewSymDense(3, []float64{0.8, 0.3, 0.1, 0.3, 0.7, -0.1, 0.1, -0.1, 7}), + nu: 5, + samples: 300000, + tol: 3e-2, + }, + { + v: mat64.NewSymDense(4, []float64{ + 0.8, 0.3, 0.1, -0.2, + 0.3, 0.7, -0.1, 0.4, + 0.1, -0.1, 7, 1, + -0.2, -0.1, 1, 6}), + nu: 6, + samples: 300000, + tol: 3e-2, + }, + } { + rnd := rand.New(rand.NewSource(1)) + dim := test.v.Symmetric() + w, ok := NewWishart(test.v, test.nu, rnd) + if !ok { + panic("bad test") + } + mean := mat64.NewSymDense(dim, nil) + x := mat64.NewSymDense(dim, nil) + for i := 0; i < test.samples; i++ { + w.RandSym(x) + x.ScaleSym(1/float64(test.samples), x) + mean.AddSym(mean, x) + } + trueMean := w.MeanSym(nil) + if !mat64.EqualApprox(trueMean, mean, test.tol) { + t.Errorf("Case %d: Mismatch between estimated and true mean. Got\n%0.4v\nWant\n%0.4v\n", c, mat64.Formatted(mean), mat64.Formatted(trueMean)) + } + } +} diff --git a/stat/distmv/dirichlet.go b/stat/distmv/dirichlet.go new file mode 100644 index 00000000..0de747b5 --- /dev/null +++ b/stat/distmv/dirichlet.go @@ -0,0 +1,144 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distmv + +import ( + "math" + "math/rand" + + "github.com/gonum/floats" + "github.com/gonum/matrix/mat64" + "github.com/gonum/stat/distuv" +) + +// Dirichlet implements the Dirichlet probability distribution. +// +// The Dirichlet distribution is a continuous probability distribution that +// generates elements over the probability simplex, i.e. ||x||_1 = 1. The Dirichlet +// distribution is the conjugate prior to the categorical distribution and the +// multivariate version of the beta distribution. The probability of a point x is +// 1/Beta(α) \prod_i x_i^(α_i - 1) +// where Beta(α) is the multivariate Beta function (see the mathext package). 
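+// In the two-dimensional case the Dirichlet reduces to the Beta distribution:
+// if (x, 1-x) is distributed Dirichlet(α_1, α_2), then x is distributed
+// Beta(α_1, α_2).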
+//
+// For more information see https://en.wikipedia.org/wiki/Dirichlet_distribution
+type Dirichlet struct {
+	alpha []float64
+	dim   int
+	src   *rand.Rand
+
+	lbeta    float64
+	sumAlpha float64
+}
+
+// NewDirichlet creates a new Dirichlet distribution with the given parameters alpha.
+// NewDirichlet will panic if len(alpha) == 0, or if any alpha is <= 0.
+func NewDirichlet(alpha []float64, src *rand.Rand) *Dirichlet {
+	dim := len(alpha)
+	if dim == 0 {
+		panic(badZeroDimension)
+	}
+	for _, v := range alpha {
+		if v <= 0 {
+			panic("dirichlet: non-positive alpha")
+		}
+	}
+	a := make([]float64, len(alpha))
+	copy(a, alpha)
+	d := &Dirichlet{
+		alpha: a,
+		dim:   dim,
+		src:   src,
+	}
+	d.lbeta, d.sumAlpha = d.genLBeta(a)
+	return d
+}
+
+// CovarianceMatrix returns the covariance matrix of the distribution. Upon
+// return, the value at element {i, j} of the covariance matrix is equal to
+// the covariance of the i^th and j^th variables.
+//  covariance(i, j) = E[(x_i - E[x_i])(x_j - E[x_j])]
+// If the input matrix is nil a new matrix is allocated, otherwise the result
+// is stored in-place into the input.
+func (d *Dirichlet) CovarianceMatrix(cov *mat64.SymDense) *mat64.SymDense {
+	if cov == nil {
+		cov = mat64.NewSymDense(d.Dim(), nil)
+	} else if cov.Symmetric() == 0 {
+		*cov = *(cov.GrowSquare(d.dim).(*mat64.SymDense))
+	} else if cov.Symmetric() != d.dim {
+		panic("dirichlet: input matrix size mismatch")
+	}
+	scale := 1 / (d.sumAlpha * d.sumAlpha * (d.sumAlpha + 1))
+	for i := 0; i < d.dim; i++ {
+		ai := d.alpha[i]
+		v := ai * (d.sumAlpha - ai) * scale
+		cov.SetSym(i, i, v)
+		for j := i + 1; j < d.dim; j++ {
+			aj := d.alpha[j]
+			v := -ai * aj * scale
+			cov.SetSym(i, j, v)
+		}
+	}
+	return cov
+}
+
+// genLBeta computes the log of the multivariate Beta function for the given
+// alpha, and the sum of alpha.
+func (d *Dirichlet) genLBeta(alpha []float64) (lbeta, sumAlpha float64) {
+	for _, alpha := range d.alpha {
+		lg, _ := math.Lgamma(alpha)
+		lbeta += lg
+		sumAlpha += alpha
+	}
+	lg, _ := math.Lgamma(sumAlpha)
+	return lbeta - lg, sumAlpha
+}
+
+// Dim returns the dimension of the distribution.
+func (d *Dirichlet) Dim() int {
+	return d.dim
+}
+
+// LogProb computes the log of the pdf of the point x.
+//
+// It does not check that ||x||_1 = 1.
+func (d *Dirichlet) LogProb(x []float64) float64 {
+	dim := d.dim
+	if len(x) != dim {
+		panic(badSizeMismatch)
+	}
+	var lprob float64
+	for i, x := range x {
+		lprob += (d.alpha[i] - 1) * math.Log(x)
+	}
+	lprob -= d.lbeta
+	return lprob
+}
+
+// Mean returns the mean of the probability distribution. If the input
+// argument is nil, a new slice will be allocated, otherwise the result
+// will be stored in-place into the input.
+func (d *Dirichlet) Mean(x []float64) []float64 {
+	x = reuseAs(x, d.dim)
+	copy(x, d.alpha)
+	floats.Scale(1/d.sumAlpha, x)
+	return x
+}
+
+// Prob computes the value of the probability density function at x.
+func (d *Dirichlet) Prob(x []float64) float64 {
+	return math.Exp(d.LogProb(x))
+}
+
+// Rand generates a random vector according to the distribution.
+// If the input slice is nil, new memory is allocated, otherwise the result is stored
+// in place.
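+//
+// A usage sketch (names are illustrative):
+//  d := distmv.NewDirichlet([]float64{2, 3, 5}, nil)
+//  x := d.Rand(nil) // x has positive components summing to 1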
+func (d *Dirichlet) Rand(x []float64) []float64 {
+	x = reuseAs(x, d.dim)
+	for i := range x {
+		x[i] = distuv.Gamma{Alpha: d.alpha[i], Beta: 1, Source: d.src}.Rand()
+	}
+	sum := floats.Sum(x)
+	floats.Scale(1/sum, x)
+	return x
+}
diff --git a/stat/distmv/dirichlet_test.go b/stat/distmv/dirichlet_test.go
new file mode 100644
index 00000000..b696b9a4
--- /dev/null
+++ b/stat/distmv/dirichlet_test.go
@@ -0,0 +1,72 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distmv

+import (
+	"math"
+	"math/rand"
+	"testing"
+
+	"github.com/gonum/matrix/mat64"
+)
+
+func TestDirichlet(t *testing.T) {
+	// Data from Scipy.
+	for cas, test := range []struct {
+		Dir  *Dirichlet
+		x    []float64
+		prob float64
+	}{
+		{
+			NewDirichlet([]float64{1, 1, 1}, nil),
+			[]float64{0.2, 0.3, 0.5},
+			2.0,
+		},
+		{
+			NewDirichlet([]float64{0.6, 10, 8.7}, nil),
+			[]float64{0.2, 0.3, 0.5},
+			0.24079612737071665,
+		},
+	} {
+		p := test.Dir.Prob(test.x)
+		if math.Abs(p-test.prob) > 1e-14 {
+			t.Errorf("Probability mismatch. Case %v. Got %v, want %v", cas, p, test.prob)
+		}
+	}
+
+	rnd := rand.New(rand.NewSource(1))
+	for cas, test := range []struct {
+		Dir *Dirichlet
+		N   int
+	}{
+		{
+			NewDirichlet([]float64{1, 1, 1}, rnd),
+			1e6,
+		},
+		{
+			NewDirichlet([]float64{2, 3}, rnd),
+			1e6,
+		},
+		{
+			NewDirichlet([]float64{0.2, 0.3}, rnd),
+			1e6,
+		},
+		{
+			NewDirichlet([]float64{0.2, 4}, rnd),
+			1e6,
+		},
+		{
+			NewDirichlet([]float64{0.1, 4, 20}, rnd),
+			1e6,
+		},
+	} {
+		d := test.Dir
+		dim := d.Dim()
+		x := mat64.NewDense(test.N, dim, nil)
+		generateSamples(x, d)
+		checkMean(t, cas, x, d, 1e-3)
+		checkCov(t, cas, x, d, 1e-3)
+	}
+}
diff --git a/stat/distmv/general.go b/stat/distmv/general.go
new file mode 100644
index 00000000..a7734a1f
--- /dev/null
+++ b/stat/distmv/general.go
@@ -0,0 +1,31 @@
+// Copyright ©2015 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package distmv provides multivariate random distribution types.
+package distmv
+
+var (
+	badQuantile      = "distmv: quantile not between 0 and 1"
+	badReceiver      = "distmv: input slice is not nil or the correct length"
+	badSizeMismatch  = "distmv: size mismatch"
+	badZeroDimension = "distmv: zero dimensional input"
+	nonPosDimension  = "distmv: non-positive dimension input"
+)
+
+const logTwoPi = 1.8378770664093454835606594728112352797227949472755668
+
+// reuseAs returns a slice of length n. If len(x) == n, x is returned. If
+// len(x) == 0, a slice of length n is returned, reusing the capacity of x
+// where possible. Otherwise reuseAs panics.
+func reuseAs(x []float64, n int) []float64 {
+	if len(x) == n {
+		return x
+	}
+	if len(x) == 0 {
+		if cap(x) >= n {
+			return x[:n]
+		}
+		return make([]float64, n)
+	}
+	panic(badReceiver)
+}
diff --git a/stat/distmv/general_test.go b/stat/distmv/general_test.go
new file mode 100644
index 00000000..64369852
--- /dev/null
+++ b/stat/distmv/general_test.go
@@ -0,0 +1,96 @@
+// Copyright ©2015 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distmv
+
+import (
+	"math"
+	"testing"
+
+	"github.com/gonum/floats"
+	"github.com/gonum/matrix/mat64"
+	"github.com/gonum/stat"
+)
+
+type prober interface {
+	Prob(x []float64) float64
+	LogProb(x []float64) float64
+}
+
+type probCase struct {
+	dist    prober
+	loc     []float64
+	logProb float64
+}
+
+func testProbability(t *testing.T, cases []probCase) {
+	for _, test := range cases {
+		logProb := test.dist.LogProb(test.loc)
+		if math.Abs(logProb-test.logProb) > 1e-14 {
+			t.Errorf("LogProb mismatch: want: %v, got: %v", test.logProb, logProb)
+		}
+		prob := test.dist.Prob(test.loc)
+		if math.Abs(prob-math.Exp(test.logProb)) > 1e-14 {
+			t.Errorf("Prob mismatch: want: %v, got: %v", math.Exp(test.logProb), prob)
+		}
+	}
+}
+
+func generateSamples(x *mat64.Dense, r Rander) {
+	n, _ := x.Dims()
+	for i := 0; i < n; i++ {
+		r.Rand(x.RawRowView(i))
+	}
+}
+
+type Meaner interface {
+	Mean([]float64) []float64
+}
+
+func checkMean(t *testing.T, cas int, x *mat64.Dense, m Meaner, tol float64) {
+	mean := m.Mean(nil)
+
+	// Check that the answer is identical when using nil or non-nil.
+	mean2 := make([]float64, len(mean))
+	m.Mean(mean2)
+	if !floats.Equal(mean, mean2) {
+		t.Errorf("Mean mismatch when providing nil and slice. Case %v", cas)
+	}
+
+	// Check that the mean matches the samples.
+	r, _ := x.Dims()
+	col := make([]float64, r)
+	meanEst := make([]float64, len(mean))
+	for i := range meanEst {
+		meanEst[i] = stat.Mean(mat64.Col(col, i, x), nil)
+	}
+	if !floats.EqualApprox(mean, meanEst, tol) {
+		t.Errorf("Returned mean and sample mean mismatch. Case %v. Empirical %v, returned %v", cas, meanEst, mean)
+	}
+}
+
+type Cover interface {
+	CovarianceMatrix(*mat64.SymDense) *mat64.SymDense
+}
+
+func checkCov(t *testing.T, cas int, x *mat64.Dense, c Cover, tol float64) {
+	cov := c.CovarianceMatrix(nil)
+	n := cov.Symmetric()
+	cov2 := mat64.NewSymDense(n, nil)
+	c.CovarianceMatrix(cov2)
+	if !mat64.Equal(cov, cov2) {
+		t.Errorf("Cov mismatch when providing nil and matrix. Case %v", cas)
+	}
+	var cov3 mat64.SymDense
+	c.CovarianceMatrix(&cov3)
+	if !mat64.Equal(cov, &cov3) {
+		t.Errorf("Cov mismatch when providing zero matrix. Case %v", cas)
+	}
+
+	// Check that the covariance matrix matches the samples.
+	covEst := stat.CovarianceMatrix(nil, x, nil)
+	if !mat64.EqualApprox(covEst, cov, tol) {
+		t.Errorf("Returned cov and sample cov mismatch. Case %v.\nGot:\n%0.4v\nWant:\n%0.4v", cas, mat64.Formatted(cov), mat64.Formatted(covEst))
+	}
+}
diff --git a/stat/distmv/interfaces.go b/stat/distmv/interfaces.go
new file mode 100644
index 00000000..57411933
--- /dev/null
+++ b/stat/distmv/interfaces.go
@@ -0,0 +1,33 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distmv
+
+// Quantiler returns the multi-dimensional inverse cumulative distribution function.
+// If x is non-nil, len(x) must equal len(p) and the quantile will be stored
+// in-place into x. If x is nil, a new slice will be allocated and returned.
+// All of the values of p must be between 0 and 1, or Quantile will panic.
+type Quantiler interface {
+	Quantile(x, p []float64) []float64
+}
+
+// LogProber computes the log of the probability of the point x.
+type LogProber interface {
+	LogProb(x []float64) float64
+}
+
+// Rander generates a random vector according to the distribution.
+// If the input x is non-nil, len(x) must equal the dimension of the
+// distribution, otherwise Rand will panic.
+// If the input is nil, a new slice will be allocated and returned.
+type Rander interface {
+	Rand(x []float64) []float64
+}
+
+// RandLogProber is both a Rander and a LogProber.
+type RandLogProber interface {
+	Rander
+	LogProber
+}
diff --git a/stat/distmv/normal.go b/stat/distmv/normal.go
new file mode 100644
index 00000000..1a20af2c
--- /dev/null
+++ b/stat/distmv/normal.go
@@ -0,0 +1,316 @@
+// Copyright ©2015 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distmv
+
+import (
+	"math"
+	"math/rand"
+
+	"github.com/gonum/floats"
+	"github.com/gonum/matrix/mat64"
+	"github.com/gonum/stat"
+	"github.com/gonum/stat/distuv"
+)
+
+var (
+	badInputLength = "distmv: input slice length mismatch"
+)
+
+// Normal is a multivariate normal distribution (also known as the multivariate
+// Gaussian distribution). Its pdf in k dimensions is given by
+//  (2 π)^(-k/2) |Σ|^(-1/2) exp(-1/2 (x-μ)'Σ^-1(x-μ))
+// where μ is the mean vector and Σ the covariance matrix. Σ must be symmetric
+// and positive definite. Use NewNormal to construct.
+type Normal struct {
+	mu []float64
+
+	sigma mat64.SymDense
+
+	chol       mat64.Cholesky
+	lower      mat64.TriDense
+	logSqrtDet float64
+	dim        int
+
+	src *rand.Rand
+}
+
+// NewNormal creates a new Normal with the given mean and covariance matrix.
+// NewNormal panics if len(mu) == 0, or if len(mu) != sigma.Symmetric(). If the
+// covariance matrix is not positive-definite, the returned boolean is false.
+func NewNormal(mu []float64, sigma mat64.Symmetric, src *rand.Rand) (*Normal, bool) {
+	if len(mu) == 0 {
+		panic(badZeroDimension)
+	}
+	dim := sigma.Symmetric()
+	if dim != len(mu) {
+		panic(badSizeMismatch)
+	}
+	n := &Normal{
+		src: src,
+		dim: dim,
+		mu:  make([]float64, dim),
+	}
+	copy(n.mu, mu)
+	ok := n.chol.Factorize(sigma)
+	if !ok {
+		return nil, false
+	}
+	n.sigma = *mat64.NewSymDense(dim, nil)
+	n.sigma.CopySym(sigma)
+	n.lower.LFromCholesky(&n.chol)
+	n.logSqrtDet = 0.5 * n.chol.LogDet()
+	return n, true
+}
+
+// NewNormalChol creates a new Normal distribution with the given mean and
+// covariance matrix represented by its Cholesky decomposition. NewNormalChol
+// panics if len(mu) is not equal to chol.Size().
+func NewNormalChol(mu []float64, chol *mat64.Cholesky, src *rand.Rand) *Normal {
+	dim := len(mu)
+	if dim != chol.Size() {
+		panic(badSizeMismatch)
+	}
+	n := &Normal{
+		src: src,
+		dim: dim,
+		mu:  make([]float64, dim),
+	}
+	n.chol.Clone(chol)
+	copy(n.mu, mu)
+	n.lower.LFromCholesky(chol)
+	n.logSqrtDet = 0.5 * n.chol.LogDet()
+	return n
+}
+
+// NewNormalPrecision creates a new Normal distribution with the given mean and
+// precision matrix (inverse of the covariance matrix). NewNormalPrecision
+// panics if len(mu) is not equal to prec.Symmetric(). If the precision matrix
+// is not positive-definite, NewNormalPrecision returns nil for norm and false
+// for ok.
+func NewNormalPrecision(mu []float64, prec *mat64.SymDense, src *rand.Rand) (norm *Normal, ok bool) {
+	if len(mu) == 0 {
+		panic(badZeroDimension)
+	}
+	dim := prec.Symmetric()
+	if dim != len(mu) {
+		panic(badSizeMismatch)
+	}
+	// TODO(btracey): Computing a matrix inverse is generally numerically unstable.
+	// This only has to compute the inverse of a positive definite matrix, which
+	// is much better, but this still loses precision. 
It is worth considering if + // instead the precision matrix should be stored explicitly and used instead + // of the Cholesky decomposition of the covariance matrix where appropriate. + var chol mat64.Cholesky + ok = chol.Factorize(prec) + if !ok { + return nil, false + } + var sigma mat64.SymDense + sigma.InverseCholesky(&chol) + return NewNormal(mu, &sigma, src) +} + +// ConditionNormal returns the Normal distribution that is the receiver conditioned +// on the input evidence. The returned multivariate normal has dimension +// n - len(observed), where n is the dimension of the original receiver. The updated +// mean and covariance are +// mu = mu_un + sigma_{ob,un}^T * sigma_{ob,ob}^-1 (v - mu_ob) +// sigma = sigma_{un,un} - sigma_{ob,un}^T * sigma_{ob,ob}^-1 * sigma_{ob,un} +// where mu_un and mu_ob are the original means of the unobserved and observed +// variables respectively, sigma_{un,un} is the unobserved subset of the covariance +// matrix, sigma_{ob,ob} is the observed subset of the covariance matrix, and +// sigma_{un,ob} are the cross terms. The elements of x_2 have been observed with +// values v. The dimension order is preserved during conditioning, so if the value +// of dimension 1 is observed, the returned normal represents dimensions {0, 2, ...} +// of the original Normal distribution. +// +// ConditionNormal returns {nil, false} if there is a failure during the update. +// Mathematically this is impossible, but can occur with finite precision arithmetic. +func (n *Normal) ConditionNormal(observed []int, values []float64, src *rand.Rand) (*Normal, bool) { + if len(observed) == 0 { + panic("normal: no observed value") + } + if len(observed) != len(values) { + panic(badInputLength) + } + for _, v := range observed { + if v < 0 || v >= n.Dim() { + panic("normal: observed value out of bounds") + } + } + + _, mu1, sigma11 := studentsTConditional(observed, values, math.Inf(1), n.mu, &n.sigma) + if mu1 == nil { + return nil, false + } + return NewNormal(mu1, sigma11, src) +} + +// CovarianceMatrix returns the covariance matrix of the distribution. Upon +// return, the value at element {i, j} of the covariance matrix is equal to +// the covariance of the i^th and j^th variables. +// covariance(i, j) = E[(x_i - E[x_i])(x_j - E[x_j])] +// If the input matrix is nil a new matrix is allocated, otherwise the result +// is stored in-place into the input. +func (n *Normal) CovarianceMatrix(s *mat64.SymDense) *mat64.SymDense { + if s == nil { + s = mat64.NewSymDense(n.Dim(), nil) + } + sn := s.Symmetric() + if sn != n.Dim() { + panic("normal: input matrix size mismatch") + } + s.CopySym(&n.sigma) + return s +} + +// Dim returns the dimension of the distribution. +func (n *Normal) Dim() int { + return n.dim +} + +// Entropy returns the differential entropy of the distribution. +func (n *Normal) Entropy() float64 { + return float64(n.dim)/2*(1+logTwoPi) + n.logSqrtDet +} + +// LogProb computes the log of the pdf of the point x. +func (n *Normal) LogProb(x []float64) float64 { + dim := n.dim + if len(x) != dim { + panic(badSizeMismatch) + } + c := -0.5*float64(dim)*logTwoPi - n.logSqrtDet + dst := stat.Mahalanobis(mat64.NewVector(dim, x), mat64.NewVector(dim, n.mu), &n.chol) + return c - 0.5*dst*dst +} + +// MarginalNormal returns the marginal distribution of the given input variables. +// That is, MarginalNormal returns +// p(x_i) = \int_{x_o} p(x_i | x_o) p(x_o) dx_o +// where x_i are the dimensions in the input, and x_o are the remaining dimensions. 
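+// A minimal sketch of the intended use (the variable indices are
+// illustrative only):
+//
+//	m, ok := n.MarginalNormal([]int{0, 2}, nil)
+//	if ok {
+//		mean := m.Mean(nil) // entries 0 and 2 of n's mean
+//		_ = mean
+//	}
+//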
+// See https://en.wikipedia.org/wiki/Marginal_distribution for more information.
+//
+// The input src is passed to the call to NewNormal.
+func (n *Normal) MarginalNormal(vars []int, src *rand.Rand) (*Normal, bool) {
+	newMean := make([]float64, len(vars))
+	for i, v := range vars {
+		newMean[i] = n.mu[v]
+	}
+	var s mat64.SymDense
+	s.SubsetSym(&n.sigma, vars)
+	return NewNormal(newMean, &s, src)
+}
+
+// MarginalNormalSingle returns the marginal distribution of the given input variable.
+// That is, MarginalNormalSingle returns
+//  p(x_i) = \int_{x_¬i} p(x_i | x_¬i) p(x_¬i) dx_¬i
+// where i is the input index.
+// See https://en.wikipedia.org/wiki/Marginal_distribution for more information.
+//
+// The input src is passed to the constructed distuv.Normal.
+func (n *Normal) MarginalNormalSingle(i int, src *rand.Rand) distuv.Normal {
+	return distuv.Normal{
+		Mu:     n.mu[i],
+		Sigma:  math.Sqrt(n.sigma.At(i, i)),
+		Source: src,
+	}
+}
+
+// Mean returns the mean of the probability distribution. If the input argument
+// is nil, a new slice will be allocated and returned, otherwise the mean will
+// be stored in-place into x.
+func (n *Normal) Mean(x []float64) []float64 {
+	x = reuseAs(x, n.dim)
+	copy(x, n.mu)
+	return x
+}
+
+// Prob computes the value of the probability density function at x.
+func (n *Normal) Prob(x []float64) float64 {
+	return math.Exp(n.LogProb(x))
+}
+
+// Quantile returns the multi-dimensional inverse cumulative distribution function.
+// If x is nil, a new slice will be allocated and returned. If x is non-nil,
+// len(x) must equal len(p) and the quantile will be stored in-place into x.
+// All of the values of p must be between 0 and 1, inclusive, or Quantile will panic.
+func (n *Normal) Quantile(x, p []float64) []float64 {
+	dim := n.Dim()
+	if len(p) != dim {
+		panic(badInputLength)
+	}
+	if x == nil {
+		x = make([]float64, dim)
+	}
+	if len(x) != len(p) {
+		panic(badInputLength)
+	}
+
+	// Transform to a standard normal and then transform to a multivariate Gaussian.
+	tmp := make([]float64, len(x))
+	for i, v := range p {
+		tmp[i] = distuv.UnitNormal.Quantile(v)
+	}
+	n.TransformNormal(x, tmp)
+	return x
+}
+
+// Rand generates a random number according to the distribution.
+// If the input slice is nil, new memory is allocated, otherwise the result is stored
+// in place.
+func (n *Normal) Rand(x []float64) []float64 {
+	x = reuseAs(x, n.dim)
+	tmp := make([]float64, n.dim)
+	if n.src == nil {
+		for i := range x {
+			tmp[i] = rand.NormFloat64()
+		}
+	} else {
+		for i := range x {
+			tmp[i] = n.src.NormFloat64()
+		}
+	}
+	n.transformNormal(x, tmp)
+	return x
+}
+
+// SetMean changes the mean of the normal distribution. SetMean panics if len(mu)
+// does not equal the dimension of the normal distribution.
+func (n *Normal) SetMean(mu []float64) {
+	if len(mu) != n.Dim() {
+		panic(badSizeMismatch)
+	}
+	copy(n.mu, mu)
+}
+
+// TransformNormal transforms the vector, normal, generated from a standard
+// multidimensional normal into a vector that has been generated under the
+// distribution of the receiver.
+//
+// If dst is non-nil, the result will be stored into dst, otherwise a new slice
+// will be allocated. TransformNormal will panic if the length of normal is not
+// the dimension of the receiver, or if dst is non-nil and len(dst) != len(normal).
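+//
+// A brief sketch of the intended use; z here is an illustrative name for a
+// vector of independent standard normal draws:
+//
+//	z := make([]float64, n.Dim())
+//	for i := range z {
+//		z[i] = rand.NormFloat64()
+//	}
+//	y := n.TransformNormal(nil, z) // y is distributed according to n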
+func (n *Normal) TransformNormal(dst, normal []float64) []float64 { + if len(normal) != n.dim { + panic(badInputLength) + } + dst = reuseAs(dst, n.dim) + if len(dst) != len(normal) { + panic(badInputLength) + } + n.transformNormal(dst, normal) + return dst +} + +// transformNormal performs the same operation as TransformNormal except no +// safety checks are performed and both input slices must be non-nil. +func (n *Normal) transformNormal(dst, normal []float64) []float64 { + srcVec := mat64.NewVector(n.dim, normal) + dstVec := mat64.NewVector(n.dim, dst) + dstVec.MulVec(&n.lower, srcVec) + floats.Add(dst, n.mu) + return dst +} diff --git a/stat/distmv/normal_test.go b/stat/distmv/normal_test.go new file mode 100644 index 00000000..7cd837ed --- /dev/null +++ b/stat/distmv/normal_test.go @@ -0,0 +1,538 @@ +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distmv + +import ( + "math" + "math/rand" + "testing" + + "github.com/gonum/floats" + "github.com/gonum/matrix/mat64" + "github.com/gonum/stat" +) + +type mvTest struct { + Mu []float64 + Sigma *mat64.SymDense + Loc []float64 + Logprob float64 + Prob float64 +} + +func TestNormProbs(t *testing.T) { + dist1, ok := NewNormal([]float64{0, 0}, mat64.NewSymDense(2, []float64{1, 0, 0, 1}), nil) + if !ok { + t.Errorf("bad test") + } + dist2, ok := NewNormal([]float64{6, 7}, mat64.NewSymDense(2, []float64{8, 2, 0, 4}), nil) + if !ok { + t.Errorf("bad test") + } + testProbability(t, []probCase{ + { + dist: dist1, + loc: []float64{0, 0}, + logProb: -1.837877066409345, + }, + { + dist: dist2, + loc: []float64{6, 7}, + logProb: -3.503979321496947, + }, + { + dist: dist2, + loc: []float64{1, 2}, + logProb: -7.075407892925519, + }, + }) +} + +func TestNewNormalChol(t *testing.T) { + for _, test := range []struct { + mean []float64 + cov *mat64.SymDense + }{ + { + mean: []float64{2, 3}, + cov: mat64.NewSymDense(2, []float64{1, 0.1, 0.1, 1}), + }, + } { + var chol mat64.Cholesky + ok := chol.Factorize(test.cov) + if !ok { + panic("bad test") + } + n := NewNormalChol(test.mean, &chol, nil) + // Generate a random number and calculate probability to ensure things + // have been set properly. See issue #426. 
+ x := n.Rand(nil) + _ = n.Prob(x) + } +} + +func TestNormRand(t *testing.T) { + for _, test := range []struct { + mean []float64 + cov []float64 + }{ + { + mean: []float64{0, 0}, + cov: []float64{ + 1, 0, + 0, 1, + }, + }, + { + mean: []float64{0, 0}, + cov: []float64{ + 1, 0.9, + 0.9, 1, + }, + }, + { + mean: []float64{6, 7}, + cov: []float64{ + 5, 0.9, + 0.9, 2, + }, + }, + } { + dim := len(test.mean) + cov := mat64.NewSymDense(dim, test.cov) + n, ok := NewNormal(test.mean, cov, nil) + if !ok { + t.Errorf("bad covariance matrix") + } + + nSamples := 1000000 + samps := mat64.NewDense(nSamples, dim, nil) + for i := 0; i < nSamples; i++ { + n.Rand(samps.RawRowView(i)) + } + estMean := make([]float64, dim) + for i := range estMean { + estMean[i] = stat.Mean(mat64.Col(nil, i, samps), nil) + } + if !floats.EqualApprox(estMean, test.mean, 1e-2) { + t.Errorf("Mean mismatch: want: %v, got %v", test.mean, estMean) + } + estCov := stat.CovarianceMatrix(nil, samps, nil) + if !mat64.EqualApprox(estCov, cov, 1e-2) { + t.Errorf("Cov mismatch: want: %v, got %v", cov, estCov) + } + } +} + +func TestNormalQuantile(t *testing.T) { + for _, test := range []struct { + mean []float64 + cov []float64 + }{ + { + mean: []float64{6, 7}, + cov: []float64{ + 5, 0.9, + 0.9, 2, + }, + }, + } { + dim := len(test.mean) + cov := mat64.NewSymDense(dim, test.cov) + n, ok := NewNormal(test.mean, cov, nil) + if !ok { + t.Errorf("bad covariance matrix") + } + + nSamples := 1000000 + rnd := rand.New(rand.NewSource(1)) + samps := mat64.NewDense(nSamples, dim, nil) + tmp := make([]float64, dim) + for i := 0; i < nSamples; i++ { + for j := range tmp { + tmp[j] = rnd.Float64() + } + n.Quantile(samps.RawRowView(i), tmp) + } + estMean := make([]float64, dim) + for i := range estMean { + estMean[i] = stat.Mean(mat64.Col(nil, i, samps), nil) + } + if !floats.EqualApprox(estMean, test.mean, 1e-2) { + t.Errorf("Mean mismatch: want: %v, got %v", test.mean, estMean) + } + estCov := stat.CovarianceMatrix(nil, samps, nil) + if !mat64.EqualApprox(estCov, cov, 1e-2) { + t.Errorf("Cov mismatch: want: %v, got %v", cov, estCov) + } + } +} + +func TestConditionNormal(t *testing.T) { + // Uncorrelated values shouldn't influence the updated values. 
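+	// In each case below the observed and unobserved blocks of sigma are
+	// uncorrelated, so conditioning must leave the unobserved mean and
+	// covariance unchanged.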
+ for _, test := range []struct { + mu []float64 + sigma *mat64.SymDense + observed []int + values []float64 + + newMu []float64 + newSigma *mat64.SymDense + }{ + { + mu: []float64{2, 3}, + sigma: mat64.NewSymDense(2, []float64{2, 0, 0, 5}), + observed: []int{0}, + values: []float64{10}, + + newMu: []float64{3}, + newSigma: mat64.NewSymDense(1, []float64{5}), + }, + { + mu: []float64{2, 3}, + sigma: mat64.NewSymDense(2, []float64{2, 0, 0, 5}), + observed: []int{1}, + values: []float64{10}, + + newMu: []float64{2}, + newSigma: mat64.NewSymDense(1, []float64{2}), + }, + { + mu: []float64{2, 3, 4}, + sigma: mat64.NewSymDense(3, []float64{2, 0, 0, 0, 5, 0, 0, 0, 10}), + observed: []int{1}, + values: []float64{10}, + + newMu: []float64{2, 4}, + newSigma: mat64.NewSymDense(2, []float64{2, 0, 0, 10}), + }, + { + mu: []float64{2, 3, 4}, + sigma: mat64.NewSymDense(3, []float64{2, 0, 0, 0, 5, 0, 0, 0, 10}), + observed: []int{0, 1}, + values: []float64{10, 15}, + + newMu: []float64{4}, + newSigma: mat64.NewSymDense(1, []float64{10}), + }, + { + mu: []float64{2, 3, 4, 5}, + sigma: mat64.NewSymDense(4, []float64{2, 0.5, 0, 0, 0.5, 5, 0, 0, 0, 0, 10, 2, 0, 0, 2, 3}), + observed: []int{0, 1}, + values: []float64{10, 15}, + + newMu: []float64{4, 5}, + newSigma: mat64.NewSymDense(2, []float64{10, 2, 2, 3}), + }, + } { + normal, ok := NewNormal(test.mu, test.sigma, nil) + if !ok { + t.Fatalf("Bad test, original sigma not positive definite") + } + newNormal, ok := normal.ConditionNormal(test.observed, test.values, nil) + if !ok { + t.Fatalf("Bad test, update failure") + } + + if !floats.EqualApprox(test.newMu, newNormal.mu, 1e-12) { + t.Errorf("Updated mean mismatch. Want %v, got %v.", test.newMu, newNormal.mu) + } + + var sigma mat64.SymDense + sigma.FromCholesky(&newNormal.chol) + if !mat64.EqualApprox(test.newSigma, &sigma, 1e-12) { + t.Errorf("Updated sigma mismatch\n.Want:\n% v\nGot:\n% v\n", test.newSigma, sigma) + } + } + + // Test bivariate case where the update rule is analytic + for _, test := range []struct { + mu []float64 + std []float64 + rho float64 + value float64 + }{ + { + mu: []float64{2, 3}, + std: []float64{3, 5}, + rho: 0.9, + value: 1000, + }, + { + mu: []float64{2, 3}, + std: []float64{3, 5}, + rho: -0.9, + value: 1000, + }, + } { + std := test.std + rho := test.rho + sigma := mat64.NewSymDense(2, []float64{std[0] * std[0], std[0] * std[1] * rho, std[0] * std[1] * rho, std[1] * std[1]}) + normal, ok := NewNormal(test.mu, sigma, nil) + if !ok { + t.Fatalf("Bad test, original sigma not positive definite") + } + newNormal, ok := normal.ConditionNormal([]int{1}, []float64{test.value}, nil) + if !ok { + t.Fatalf("Bad test, update failed") + } + var newSigma mat64.SymDense + newSigma.FromCholesky(&newNormal.chol) + trueMean := test.mu[0] + rho*(std[0]/std[1])*(test.value-test.mu[1]) + if math.Abs(trueMean-newNormal.mu[0]) > 1e-14 { + t.Errorf("Mean mismatch. Want %v, got %v", trueMean, newNormal.mu[0]) + } + trueVar := (1 - rho*rho) * std[0] * std[0] + if math.Abs(trueVar-newSigma.At(0, 0)) > 1e-14 { + t.Errorf("Std mismatch. Want %v, got %v", trueMean, newNormal.mu[0]) + } + } + + // Test via sampling. + for _, test := range []struct { + mu []float64 + sigma *mat64.SymDense + observed []int + unobserved []int + value []float64 + }{ + // The indices in unobserved must be in ascending order for this test. 
+ { + mu: []float64{2, 3, 4}, + sigma: mat64.NewSymDense(3, []float64{2, 0.5, 3, 0.5, 1, 0.6, 3, 0.6, 10}), + + observed: []int{0}, + unobserved: []int{1, 2}, + value: []float64{1.9}, + }, + { + mu: []float64{2, 3, 4, 5}, + sigma: mat64.NewSymDense(4, []float64{2, 0.5, 3, 0.1, 0.5, 1, 0.6, 0.2, 3, 0.6, 10, 0.3, 0.1, 0.2, 0.3, 3}), + + observed: []int{0, 3}, + unobserved: []int{1, 2}, + value: []float64{1.9, 2.9}, + }, + } { + totalSamp := 4000000 + var nSamp int + samples := mat64.NewDense(totalSamp, len(test.mu), nil) + normal, ok := NewNormal(test.mu, test.sigma, nil) + if !ok { + t.Errorf("bad test") + } + sample := make([]float64, len(test.mu)) + for i := 0; i < totalSamp; i++ { + normal.Rand(sample) + isClose := true + for i, v := range test.observed { + if math.Abs(sample[v]-test.value[i]) > 1e-1 { + isClose = false + break + } + } + if isClose { + samples.SetRow(nSamp, sample) + nSamp++ + } + } + + if nSamp < 100 { + t.Errorf("bad test, not enough samples") + continue + } + samples = samples.View(0, 0, nSamp, len(test.mu)).(*mat64.Dense) + + // Compute mean and covariance matrix. + estMean := make([]float64, len(test.mu)) + for i := range estMean { + estMean[i] = stat.Mean(mat64.Col(nil, i, samples), nil) + } + estCov := stat.CovarianceMatrix(nil, samples, nil) + + // Compute update rule. + newNormal, ok := normal.ConditionNormal(test.observed, test.value, nil) + if !ok { + t.Fatalf("Bad test, update failure") + } + + var subEstMean []float64 + for _, v := range test.unobserved { + + subEstMean = append(subEstMean, estMean[v]) + } + subEstCov := mat64.NewSymDense(len(test.unobserved), nil) + for i := 0; i < len(test.unobserved); i++ { + for j := i; j < len(test.unobserved); j++ { + subEstCov.SetSym(i, j, estCov.At(test.unobserved[i], test.unobserved[j])) + } + } + + for i, v := range subEstMean { + if math.Abs(newNormal.mu[i]-v) > 5e-2 { + t.Errorf("Mean mismatch. Want %v, got %v.", newNormal.mu[i], v) + } + } + var sigma mat64.SymDense + sigma.FromCholesky(&newNormal.chol) + if !mat64.EqualApprox(&sigma, subEstCov, 1e-1) { + t.Errorf("Covariance mismatch. 
Want:\n%0.8v\nGot:\n%0.8v\n", subEstCov, sigma) + } + } +} + +func TestCovarianceMatrix(t *testing.T) { + for _, test := range []struct { + mu []float64 + sigma *mat64.SymDense + }{ + { + mu: []float64{2, 3, 4}, + sigma: mat64.NewSymDense(3, []float64{1, 0.5, 3, 0.5, 8, -1, 3, -1, 15}), + }, + } { + normal, ok := NewNormal(test.mu, test.sigma, nil) + if !ok { + t.Fatalf("Bad test, covariance matrix not positive definite") + } + cov := normal.CovarianceMatrix(nil) + if !mat64.EqualApprox(cov, test.sigma, 1e-14) { + t.Errorf("Covariance mismatch with nil input") + } + dim := test.sigma.Symmetric() + cov = mat64.NewSymDense(dim, nil) + normal.CovarianceMatrix(cov) + if !mat64.EqualApprox(cov, test.sigma, 1e-14) { + t.Errorf("Covariance mismatch with supplied input") + } + } +} + +func TestMarginal(t *testing.T) { + for _, test := range []struct { + mu []float64 + sigma *mat64.SymDense + marginal []int + }{ + { + mu: []float64{2, 3, 4}, + sigma: mat64.NewSymDense(3, []float64{2, 0.5, 3, 0.5, 1, 0.6, 3, 0.6, 10}), + marginal: []int{0}, + }, + { + mu: []float64{2, 3, 4}, + sigma: mat64.NewSymDense(3, []float64{2, 0.5, 3, 0.5, 1, 0.6, 3, 0.6, 10}), + marginal: []int{0, 2}, + }, + { + mu: []float64{2, 3, 4, 5}, + sigma: mat64.NewSymDense(4, []float64{2, 0.5, 3, 0.1, 0.5, 1, 0.6, 0.2, 3, 0.6, 10, 0.3, 0.1, 0.2, 0.3, 3}), + + marginal: []int{0, 3}, + }, + } { + normal, ok := NewNormal(test.mu, test.sigma, nil) + if !ok { + t.Fatalf("Bad test, covariance matrix not positive definite") + } + marginal, ok := normal.MarginalNormal(test.marginal, nil) + if !ok { + t.Fatalf("Bad test, marginal matrix not positive definite") + } + dim := normal.Dim() + nSamples := 1000000 + samps := mat64.NewDense(nSamples, dim, nil) + for i := 0; i < nSamples; i++ { + normal.Rand(samps.RawRowView(i)) + } + estMean := make([]float64, dim) + for i := range estMean { + estMean[i] = stat.Mean(mat64.Col(nil, i, samps), nil) + } + for i, v := range test.marginal { + if math.Abs(marginal.mu[i]-estMean[v]) > 1e-2 { + t.Errorf("Mean mismatch: want: %v, got %v", estMean[v], marginal.mu[i]) + } + } + + marginalCov := marginal.CovarianceMatrix(nil) + estCov := stat.CovarianceMatrix(nil, samps, nil) + for i, v1 := range test.marginal { + for j, v2 := range test.marginal { + c := marginalCov.At(i, j) + ec := estCov.At(v1, v2) + if math.Abs(c-ec) > 5e-2 { + t.Errorf("Cov mismatch element i = %d, j = %d: want: %v, got %v", i, j, c, ec) + } + } + } + } +} + +func TestMarginalSingle(t *testing.T) { + for _, test := range []struct { + mu []float64 + sigma *mat64.SymDense + }{ + { + mu: []float64{2, 3, 4}, + sigma: mat64.NewSymDense(3, []float64{2, 0.5, 3, 0.5, 1, 0.6, 3, 0.6, 10}), + }, + { + mu: []float64{2, 3, 4, 5}, + sigma: mat64.NewSymDense(4, []float64{2, 0.5, 3, 0.1, 0.5, 1, 0.6, 0.2, 3, 0.6, 10, 0.3, 0.1, 0.2, 0.3, 3}), + }, + } { + normal, ok := NewNormal(test.mu, test.sigma, nil) + if !ok { + t.Fatalf("Bad test, covariance matrix not positive definite") + } + for i, mean := range test.mu { + norm := normal.MarginalNormalSingle(i, nil) + if norm.Mean() != mean { + t.Errorf("Mean mismatch nil Sigma, idx %v: want %v, got %v.", i, mean, norm.Mean()) + } + std := math.Sqrt(test.sigma.At(i, i)) + if math.Abs(norm.StdDev()-std) > 1e-14 { + t.Errorf("StdDev mismatch nil Sigma, idx %v: want %v, got %v.", i, std, norm.StdDev()) + } + } + } + + // Test matching with TestMarginal. 
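+	// Construct a random covariance matrix as A*A^T, which is positive
+	// semi-definite by construction, so NewNormal should succeed.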
+ rnd := rand.New(rand.NewSource(1)) + for cas := 0; cas < 10; cas++ { + dim := rnd.Intn(10) + 1 + mu := make([]float64, dim) + for i := range mu { + mu[i] = rnd.Float64() + } + x := make([]float64, dim*dim) + for i := range x { + x[i] = rnd.Float64() + } + mat := mat64.NewDense(dim, dim, x) + var sigma mat64.SymDense + sigma.SymOuterK(1, mat) + + normal, ok := NewNormal(mu, &sigma, nil) + if !ok { + t.Fatal("bad test") + } + for i := 0; i < dim; i++ { + single := normal.MarginalNormalSingle(i, nil) + mult, ok := normal.MarginalNormal([]int{i}, nil) + if !ok { + t.Fatal("bad test") + } + if math.Abs(single.Mean()-mult.Mean(nil)[0]) > 1e-14 { + t.Errorf("Mean mismatch") + } + if math.Abs(single.Variance()-mult.CovarianceMatrix(nil).At(0, 0)) > 1e-14 { + t.Errorf("Variance mismatch") + } + } + } +} diff --git a/stat/distmv/normalbench_test.go b/stat/distmv/normalbench_test.go new file mode 100644 index 00000000..6677ea1c --- /dev/null +++ b/stat/distmv/normalbench_test.go @@ -0,0 +1,73 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distmv + +import ( + "log" + "math/rand" + "testing" + + "github.com/gonum/matrix/mat64" +) + +func BenchmarkMarginalNormal10(b *testing.B) { + sz := 10 + rnd := rand.New(rand.NewSource(1)) + normal := randomNormal(sz, rnd) + _ = normal.CovarianceMatrix(nil) // pre-compute sigma + b.ResetTimer() + for i := 0; i < b.N; i++ { + marg, ok := normal.MarginalNormal([]int{1}, nil) + if !ok { + b.Error("bad test") + } + _ = marg + } +} + +func BenchmarkMarginalNormalReset10(b *testing.B) { + sz := 10 + rnd := rand.New(rand.NewSource(1)) + normal := randomNormal(sz, rnd) + b.ResetTimer() + for i := 0; i < b.N; i++ { + marg, ok := normal.MarginalNormal([]int{1}, nil) + if !ok { + b.Error("bad test") + } + _ = marg + } +} + +func BenchmarkMarginalNormalSingle10(b *testing.B) { + sz := 10 + rnd := rand.New(rand.NewSource(1)) + normal := randomNormal(sz, rnd) + b.ResetTimer() + for i := 0; i < b.N; i++ { + marg := normal.MarginalNormalSingle(1, nil) + _ = marg + } +} + +func randomNormal(sz int, rnd *rand.Rand) *Normal { + mu := make([]float64, sz) + for i := range mu { + mu[i] = rnd.Float64() + } + data := make([]float64, sz*sz) + for i := range data { + data[i] = rnd.Float64() + } + dM := mat64.NewDense(sz, sz, data) + var sigma mat64.SymDense + sigma.SymOuterK(1, dM) + + normal, ok := NewNormal(mu, &sigma, nil) + if !ok { + log.Fatal("bad test, not pos def") + } + return normal +} diff --git a/stat/distmv/statdist.go b/stat/distmv/statdist.go new file mode 100644 index 00000000..cf4f37ed --- /dev/null +++ b/stat/distmv/statdist.go @@ -0,0 +1,252 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distmv + +import ( + "math" + + "github.com/gonum/floats" + "github.com/gonum/matrix/mat64" + "github.com/gonum/stat" +) + +// Bhattacharyya is a type for computing the Bhattacharyya distance between +// probability distributions. +// +// The Battachara distance is defined as +// D_B = -ln(BC(l,r)) +// BC = \int_x (p(x)q(x))^(1/2) dx +// Where BC is known as the Bhattacharyya coefficient. 
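+// BC lies in [0, 1]; it is 1 when the two distributions are identical and
+// tends to 0 as they become disjoint, so D_B ranges over [0, ∞).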
+// The Bhattacharyya distance is related to the Hellinger distance by +// H = sqrt(1-BC) +// For more information, see +// https://en.wikipedia.org/wiki/Bhattacharyya_distance +type Bhattacharyya struct{} + +// DistNormal computes the Bhattacharyya distance between normal distributions l and r. +// The dimensions of the input distributions must match or DistNormal will panic. +// +// For Normal distributions, the Bhattacharyya distance is +// Σ = (Σ_l + Σ_r)/2 +// D_B = (1/8)*(μ_l - μ_r)^T*Σ^-1*(μ_l - μ_r) + (1/2)*ln(det(Σ)/(det(Σ_l)*det(Σ_r))^(1/2)) +func (Bhattacharyya) DistNormal(l, r *Normal) float64 { + dim := l.Dim() + if dim != r.Dim() { + panic(badSizeMismatch) + } + + var sigma mat64.SymDense + sigma.AddSym(&l.sigma, &r.sigma) + sigma.ScaleSym(0.5, &sigma) + + var chol mat64.Cholesky + chol.Factorize(&sigma) + + mahalanobis := stat.Mahalanobis(mat64.NewVector(dim, l.mu), mat64.NewVector(dim, r.mu), &chol) + mahalanobisSq := mahalanobis * mahalanobis + + dl := l.chol.LogDet() + dr := r.chol.LogDet() + ds := chol.LogDet() + + return 0.125*mahalanobisSq + 0.5*ds - 0.25*dl - 0.25*dr +} + +// DistUniform computes the Bhattacharyya distance between uniform distributions l and r. +// The dimensions of the input distributions must match or DistUniform will panic. +func (Bhattacharyya) DistUniform(l, r *Uniform) float64 { + if len(l.bounds) != len(r.bounds) { + panic(badSizeMismatch) + } + // BC = \int \sqrt(p(x)q(x)), which for uniform distributions is a constant + // over the volume where both distributions have positive probability. + // Compute the overlap and the value of sqrt(p(x)q(x)). The entropy is the + // negative log probability of the distribution (use instead of LogProb so + // it is not necessary to construct an x value). + // + // BC = volume * sqrt(p(x)q(x)) + // logBC = log(volume) + 0.5*(logP + logQ) + // D_B = -logBC + return -unifLogVolOverlap(l.bounds, r.bounds) + 0.5*(l.Entropy()+r.Entropy()) +} + +// unifLogVolOverlap computes the log of the volume of the hyper-rectangle where +// both uniform distributions have positive probability. +func unifLogVolOverlap(b1, b2 []Bound) float64 { + var logVolOverlap float64 + for dim, v1 := range b1 { + v2 := b2[dim] + // If the surfaces don't overlap, then the volume is 0 + if v1.Max <= v2.Min || v2.Max <= v1.Min { + return math.Inf(-1) + } + vol := math.Min(v1.Max, v2.Max) - math.Max(v1.Min, v2.Min) + logVolOverlap += math.Log(vol) + } + return logVolOverlap +} + +// CrossEntropy is a type for computing the cross-entropy between probability +// distributions. +// +// The cross-entropy is defined as +// - \int_x l(x) log(r(x)) dx = KL(l || r) + H(l) +// where KL is the Kullback-Leibler divergence and H is the entropy. +// For more information, see +// https://en.wikipedia.org/wiki/Cross_entropy +type CrossEntropy struct{} + +// DistNormal returns the cross-entropy between normal distributions l and r. +// The dimensions of the input distributions must match or DistNormal will panic. +func (CrossEntropy) DistNormal(l, r *Normal) float64 { + if l.Dim() != r.Dim() { + panic(badSizeMismatch) + } + kl := KullbackLeibler{}.DistNormal(l, r) + return kl + l.Entropy() +} + +// Hellinger is a type for computing the Hellinger distance between probability +// distributions. +// +// The Hellinger distance is defined as +// H^2(l,r) = 1/2 * int_x (\sqrt(l(x)) - \sqrt(r(x)))^2 dx +// and is bounded between 0 and 1. 
+// The Hellinger distance is related to the Bhattacharyya distance by
+//  H^2 = 1 - exp(-D_B)
+// For more information, see
+// https://en.wikipedia.org/wiki/Hellinger_distance
+type Hellinger struct{}

+// DistNormal returns the Hellinger distance between normal distributions l and r.
+// The dimensions of the input distributions must match or DistNormal will panic.
+//
+// See the documentation of Bhattacharyya.DistNormal for the formula for Normal
+// distributions.
+func (Hellinger) DistNormal(l, r *Normal) float64 {
+	if l.Dim() != r.Dim() {
+		panic(badSizeMismatch)
+	}
+	db := Bhattacharyya{}.DistNormal(l, r)
+	bc := math.Exp(-db)
+	return math.Sqrt(1 - bc)
+}
+
+// KullbackLeibler is a type for computing the Kullback-Leibler divergence from l to r.
+// The dimensions of the input distributions must match or the function will panic.
+//
+// The Kullback-Leibler divergence is defined as
+//  D_KL(l || r) = \int_x l(x) log(l(x)/r(x)) dx
+// Note that the Kullback-Leibler divergence is not symmetric with respect to
+// the order of the input arguments.
+type KullbackLeibler struct{}
+
+// DistNormal returns the Kullback-Leibler divergence between normal distributions l and r.
+// The dimensions of the input distributions must match or DistNormal will panic.
+//
+// For two normal distributions, the KL divergence is computed as
+//  D_KL(l || r) = 0.5*[ln(|Σ_r|) - ln(|Σ_l|) + (μ_l - μ_r)^T*Σ_r^-1*(μ_l - μ_r) + tr(Σ_r^-1*Σ_l)-d]
+func (KullbackLeibler) DistNormal(l, r *Normal) float64 {
+	dim := l.Dim()
+	if dim != r.Dim() {
+		panic(badSizeMismatch)
+	}
+
+	mahalanobis := stat.Mahalanobis(mat64.NewVector(dim, l.mu), mat64.NewVector(dim, r.mu), &r.chol)
+	mahalanobisSq := mahalanobis * mahalanobis
+
+	// TODO(btracey): Optimize where there is a SolveCholeskySym
+	// TODO(btracey): There may be a more efficient way to just compute the trace
+	// Compute tr(Σ_r^-1*Σ_l) using the fact that Σ_l = U^T * U.
+	var u mat64.TriDense
+	u.UFromCholesky(&l.chol)
+	var m mat64.Dense
+	err := m.SolveCholesky(&r.chol, u.T())
+	if err != nil {
+		return math.NaN()
+	}
+	m.Mul(&m, &u)
+	tr := mat64.Trace(&m)
+
+	return r.logSqrtDet - l.logSqrtDet + 0.5*(mahalanobisSq+tr-float64(l.dim))
+}
+
+// DistUniform returns the Kullback-Leibler divergence between uniform distributions
+// l and r. The dimensions of the input distributions must match or DistUniform
+// will panic.
+func (KullbackLeibler) DistUniform(l, r *Uniform) float64 {
+	bl := l.Bounds(nil)
+	br := r.Bounds(nil)
+	if len(bl) != len(br) {
+		panic(badSizeMismatch)
+	}
+
+	// The KL is ∞ if l is not completely contained within r, because then
+	// r(x) is zero when l(x) is non-zero for some x.
+	contained := true
+	for i, v := range bl {
+		if v.Min < br[i].Min || br[i].Max < v.Max {
+			contained = false
+			break
+		}
+	}
+	if !contained {
+		return math.Inf(1)
+	}
+
+	// The KL divergence is finite.
+	//
+	// KL defines 0*ln(0) = 0, so there is no contribution to KL where l(x) = 0.
+	// Inside the region, l(x) and r(x) are constant (uniform distribution), and
+	// this constant is integrated over l(x), which integrates out to one.
+	// The entropy is -log(p(x)).
+	logPx := -l.Entropy()
+	logQx := -r.Entropy()
+	return logPx - logQx
+}
+
+// Wasserstein is a type for computing the Wasserstein distance between two
+// probability distributions.
+// +// The Wasserstein distance is defined as +// W(l,r) := inf 𝔼(||X-Y||_2^2)^1/2 +// For more information, see +// https://en.wikipedia.org/wiki/Wasserstein_metric +type Wasserstein struct{} + +// DistNormal returns the Wasserstein distance between normal distributions l and r. +// The dimensions of the input distributions must match or DistNormal will panic. +// +// The Wasserstein distance for Normal distributions is +// d^2 = ||m_l - m_r||_2^2 + Tr(Σ_l + Σ_r - 2(Σ_l^(1/2)*Σ_r*Σ_l^(1/2))^(1/2)) +// For more information, see +// http://djalil.chafai.net/blog/2010/04/30/wasserstein-distance-between-two-gaussians/ +func (Wasserstein) DistNormal(l, r *Normal) float64 { + dim := l.Dim() + if dim != r.Dim() { + panic(badSizeMismatch) + } + + d := floats.Distance(l.mu, r.mu, 2) + d = d * d + + // Compute Σ_l^(1/2) + var ssl mat64.SymDense + ssl.PowPSD(&l.sigma, 0.5) + // Compute Σ_l^(1/2)*Σ_r*Σ_l^(1/2) + var mean mat64.Dense + mean.Mul(&ssl, &r.sigma) + mean.Mul(&mean, &ssl) + + // Reinterpret as symdense, and take Σ^(1/2) + meanSym := mat64.NewSymDense(dim, mean.RawMatrix().Data) + ssl.PowPSD(meanSym, 0.5) + + tr := mat64.Trace(&r.sigma) + tl := mat64.Trace(&l.sigma) + tm := mat64.Trace(&ssl) + + return d + tl + tr - 2*tm +} diff --git a/stat/distmv/statdist_test.go b/stat/distmv/statdist_test.go new file mode 100644 index 00000000..9f146385 --- /dev/null +++ b/stat/distmv/statdist_test.go @@ -0,0 +1,261 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distmv + +import ( + "math" + "math/rand" + "testing" + + "github.com/gonum/floats" + "github.com/gonum/matrix/mat64" +) + +func TestBhattacharyyaNormal(t *testing.T) { + for cas, test := range []struct { + am, bm []float64 + ac, bc *mat64.SymDense + samples int + tol float64 + }{ + { + am: []float64{2, 3}, + ac: mat64.NewSymDense(2, []float64{3, -1, -1, 2}), + bm: []float64{-1, 1}, + bc: mat64.NewSymDense(2, []float64{1.5, 0.2, 0.2, 0.9}), + samples: 100000, + tol: 1e-2, + }, + } { + rnd := rand.New(rand.NewSource(1)) + a, ok := NewNormal(test.am, test.ac, rnd) + if !ok { + panic("bad test") + } + b, ok := NewNormal(test.bm, test.bc, rnd) + if !ok { + panic("bad test") + } + want := bhattacharyyaSample(a.Dim(), test.samples, a, b) + got := Bhattacharyya{}.DistNormal(a, b) + if math.Abs(want-got) > test.tol { + t.Errorf("Bhattacharyya mismatch, case %d: got %v, want %v", cas, got, want) + } + + // Bhattacharyya should by symmetric + got2 := Bhattacharyya{}.DistNormal(b, a) + if math.Abs(got-got2) > 1e-14 { + t.Errorf("Bhattacharyya distance not symmetric") + } + } +} + +func TestBhattacharyyaUniform(t *testing.T) { + rnd := rand.New(rand.NewSource(1)) + for cas, test := range []struct { + a, b *Uniform + samples int + tol float64 + }{ + { + a: NewUniform([]Bound{{-3, 2}, {-5, 8}}, rnd), + b: NewUniform([]Bound{{-4, 1}, {-7, 10}}, rnd), + samples: 100000, + tol: 1e-2, + }, + { + a: NewUniform([]Bound{{-3, 2}, {-5, 8}}, rnd), + b: NewUniform([]Bound{{-5, -4}, {-7, 10}}, rnd), + samples: 100000, + tol: 1e-2, + }, + } { + a, b := test.a, test.b + want := bhattacharyyaSample(a.Dim(), test.samples, a, b) + got := Bhattacharyya{}.DistUniform(a, b) + if math.Abs(want-got) > test.tol { + t.Errorf("Bhattacharyya mismatch, case %d: got %v, want %v", cas, got, want) + } + // Bhattacharyya should by symmetric + got2 := Bhattacharyya{}.DistUniform(b, a) + if math.Abs(got-got2) > 1e-14 { + t.Errorf("Bhattacharyya distance not 
symmetric") + } + } +} + +// bhattacharyyaSample finds an estimate of the Bhattacharyya coefficient through +// sampling. +func bhattacharyyaSample(dim, samples int, l RandLogProber, r LogProber) float64 { + lBhatt := make([]float64, samples) + x := make([]float64, dim) + for i := 0; i < samples; i++ { + // Do importance sampling over a: \int sqrt(a*b)/a * a dx + l.Rand(x) + pa := l.LogProb(x) + pb := r.LogProb(x) + lBhatt[i] = 0.5*pb - 0.5*pa + } + logBc := floats.LogSumExp(lBhatt) - math.Log(float64(samples)) + return -logBc +} + +func TestCrossEntropyNormal(t *testing.T) { + for cas, test := range []struct { + am, bm []float64 + ac, bc *mat64.SymDense + samples int + tol float64 + }{ + { + am: []float64{2, 3}, + ac: mat64.NewSymDense(2, []float64{3, -1, -1, 2}), + bm: []float64{-1, 1}, + bc: mat64.NewSymDense(2, []float64{1.5, 0.2, 0.2, 0.9}), + samples: 100000, + tol: 1e-2, + }, + } { + rnd := rand.New(rand.NewSource(1)) + a, ok := NewNormal(test.am, test.ac, rnd) + if !ok { + panic("bad test") + } + b, ok := NewNormal(test.bm, test.bc, rnd) + if !ok { + panic("bad test") + } + var ce float64 + x := make([]float64, a.Dim()) + for i := 0; i < test.samples; i++ { + a.Rand(x) + ce -= b.LogProb(x) + } + ce /= float64(test.samples) + got := CrossEntropy{}.DistNormal(a, b) + if math.Abs(ce-got) > test.tol { + t.Errorf("CrossEntropy mismatch, case %d: got %v, want %v", cas, got, ce) + } + } +} + +func TestHellingerNormal(t *testing.T) { + for cas, test := range []struct { + am, bm []float64 + ac, bc *mat64.SymDense + samples int + tol float64 + }{ + { + am: []float64{2, 3}, + ac: mat64.NewSymDense(2, []float64{3, -1, -1, 2}), + bm: []float64{-1, 1}, + bc: mat64.NewSymDense(2, []float64{1.5, 0.2, 0.2, 0.9}), + samples: 100000, + tol: 5e-1, + }, + } { + rnd := rand.New(rand.NewSource(1)) + a, ok := NewNormal(test.am, test.ac, rnd) + if !ok { + panic("bad test") + } + b, ok := NewNormal(test.bm, test.bc, rnd) + if !ok { + panic("bad test") + } + lAitchEDoubleHockeySticks := make([]float64, test.samples) + x := make([]float64, a.Dim()) + for i := 0; i < test.samples; i++ { + // Do importance sampling over a: \int (\sqrt(a)-\sqrt(b))^2/a * a dx + a.Rand(x) + pa := a.LogProb(x) + pb := b.LogProb(x) + d := math.Exp(0.5*pa) - math.Exp(0.5*pb) + d = d * d + lAitchEDoubleHockeySticks[i] = math.Log(d) - pa + } + want := math.Sqrt(0.5 * math.Exp(floats.LogSumExp(lAitchEDoubleHockeySticks)-math.Log(float64(test.samples)))) + got := Hellinger{}.DistNormal(a, b) + if math.Abs(want-got) > test.tol { + t.Errorf("Hellinger mismatch, case %d: got %v, want %v", cas, got, want) + } + } +} + +func TestKullbackLeiblerNormal(t *testing.T) { + for cas, test := range []struct { + am, bm []float64 + ac, bc *mat64.SymDense + samples int + tol float64 + }{ + { + am: []float64{2, 3}, + ac: mat64.NewSymDense(2, []float64{3, -1, -1, 2}), + bm: []float64{-1, 1}, + bc: mat64.NewSymDense(2, []float64{1.5, 0.2, 0.2, 0.9}), + samples: 10000, + tol: 1e-2, + }, + } { + rnd := rand.New(rand.NewSource(1)) + a, ok := NewNormal(test.am, test.ac, rnd) + if !ok { + panic("bad test") + } + b, ok := NewNormal(test.bm, test.bc, rnd) + if !ok { + panic("bad test") + } + want := klSample(a.Dim(), test.samples, a, b) + got := KullbackLeibler{}.DistNormal(a, b) + if !floats.EqualWithinAbsOrRel(want, got, test.tol, test.tol) { + t.Errorf("Case %d, KL mismatch: got %v, want %v", cas, got, want) + } + } +} + +func TestKullbackLeiblerUniform(t *testing.T) { + rnd := rand.New(rand.NewSource(1)) + for cas, test := range []struct { + a, b *Uniform + 
samples int + tol float64 + }{ + { + a: NewUniform([]Bound{{-5, 2}, {-7, 12}}, rnd), + b: NewUniform([]Bound{{-4, 1}, {-7, 10}}, rnd), + samples: 100000, + tol: 1e-2, + }, + { + a: NewUniform([]Bound{{-5, 2}, {-7, 12}}, rnd), + b: NewUniform([]Bound{{-9, -6}, {-7, 10}}, rnd), + samples: 100000, + tol: 1e-2, + }, + } { + a, b := test.a, test.b + want := klSample(a.Dim(), test.samples, a, b) + got := KullbackLeibler{}.DistUniform(a, b) + if math.Abs(want-got) > test.tol { + t.Errorf("Kullback-Leibler mismatch, case %d: got %v, want %v", cas, got, want) + } + } +} + +// klSample finds an estimate of the Kullback-Leibler Divergence through sampling. +func klSample(dim, samples int, l RandLogProber, r LogProber) float64 { + var klmc float64 + x := make([]float64, dim) + for i := 0; i < samples; i++ { + l.Rand(x) + pa := l.LogProb(x) + pb := r.LogProb(x) + klmc += pa - pb + } + return klmc / float64(samples) +} diff --git a/stat/distmv/studentst.go b/stat/distmv/studentst.go new file mode 100644 index 00000000..c691b96c --- /dev/null +++ b/stat/distmv/studentst.go @@ -0,0 +1,354 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distmv + +import ( + "math" + "math/rand" + "sort" + + "golang.org/x/tools/container/intsets" + + "github.com/gonum/floats" + "github.com/gonum/matrix/mat64" + "github.com/gonum/stat/distuv" +) + +// StudentsT is a multivariate Student's T distribution. It is a distribution over +// ℝ^n with the probability density +// p(y) = (Γ((ν+n)/2) / Γ(ν/2)) * (νπ)^(-n/2) * |Ʃ|^(-1/2) * +// (1 + 1/ν * (y-μ)^T * Ʃ^-1 * (y-μ))^(-(ν+n)/2) +// where ν is a scalar greater than 2, μ is a vector in ℝ^n, and Ʃ is an n×n +// symmetric positive definite matrix. +// +// In this distribution, ν sets the spread of the distribution, similar to +// the degrees of freedom in a univariate Student's T distribution. As ν → ∞, +// the distribution approaches a multi-variate normal distribution. +// μ is the mean of the distribution, and the covariance is ν/(ν-2)*Ʃ. +// +// See https://en.wikipedia.org/wiki/Student%27s_t-distribution and +// http://users.isy.liu.se/en/rt/roth/student.pdf for more information. +type StudentsT struct { + nu float64 + mu []float64 + src *rand.Rand + + sigma mat64.SymDense // only stored if needed + + chol mat64.Cholesky + lower mat64.TriDense + logSqrtDet float64 + dim int +} + +// NewStudentsT creates a new StudentsT with the given nu, mu, and sigma +// parameters. +// +// NewStudentsT panics if len(mu) == 0, or if len(mu) != sigma.Symmetric(). If +// the covariance matrix is not positive-definite, nil is returned and ok is false. +func NewStudentsT(mu []float64, sigma mat64.Symmetric, nu float64, src *rand.Rand) (dist *StudentsT, ok bool) { + if len(mu) == 0 { + panic(badZeroDimension) + } + dim := sigma.Symmetric() + if dim != len(mu) { + panic(badSizeMismatch) + } + + s := &StudentsT{ + nu: nu, + mu: make([]float64, dim), + dim: dim, + src: src, + } + copy(s.mu, mu) + + ok = s.chol.Factorize(sigma) + if !ok { + return nil, false + } + s.sigma = *mat64.NewSymDense(dim, nil) + s.sigma.CopySym(sigma) + s.lower.LFromCholesky(&s.chol) + s.logSqrtDet = 0.5 * s.chol.LogDet() + return s, true +} + +// ConditionStudentsT returns the Student's T distribution that is the receiver +// conditioned on the input evidence, and the success of the operation. 
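+// For example, observing variable 0 of a three-dimensional distribution at
+// the value 6 (the numbers are illustrative only):
+//
+//	cond, ok := s.ConditionStudentsT([]int{0}, []float64{6}, nil)
+//	// If ok, cond has dimension 2 and its nu is s's nu plus one.
+//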
+// The returned Student's T has dimension +// n - len(observed), where n is the dimension of the original receiver. +// The dimension order is preserved during conditioning, so if the value +// of dimension 1 is observed, the returned normal represents dimensions {0, 2, ...} +// of the original Student's T distribution. +// +// ok indicates whether there was a failure during the update. If ok is false +// the operation failed and dist is not usable. +// Mathematically this is impossible, but can occur with finite precision arithmetic. +func (s *StudentsT) ConditionStudentsT(observed []int, values []float64, src *rand.Rand) (dist *StudentsT, ok bool) { + if len(observed) == 0 { + panic("studentst: no observed value") + } + if len(observed) != len(values) { + panic(badInputLength) + } + + for _, v := range observed { + if v < 0 || v >= s.dim { + panic("studentst: observed value out of bounds") + } + } + + newNu, newMean, newSigma := studentsTConditional(observed, values, s.nu, s.mu, &s.sigma) + if newMean == nil { + return nil, false + } + + return NewStudentsT(newMean, newSigma, newNu, src) + +} + +// studentsTConditional updates a Student's T distribution based on the observed samples +// (see documentation for the public function). The Gaussian conditional update +// is treated as a special case when nu == math.Inf(1). +func studentsTConditional(observed []int, values []float64, nu float64, mu []float64, sigma mat64.Symmetric) (newNu float64, newMean []float64, newSigma *mat64.SymDense) { + dim := len(mu) + ob := len(observed) + + unobserved := findUnob(observed, dim) + + unob := len(unobserved) + if unob == 0 { + panic("stat: all dimensions observed") + } + + mu1 := make([]float64, unob) + for i, v := range unobserved { + mu1[i] = mu[v] + } + mu2 := make([]float64, ob) // really v - mu2 + for i, v := range observed { + mu2[i] = values[i] - mu[v] + } + + var sigma11, sigma22 mat64.SymDense + sigma11.SubsetSym(sigma, unobserved) + sigma22.SubsetSym(sigma, observed) + + sigma21 := mat64.NewDense(ob, unob, nil) + for i, r := range observed { + for j, c := range unobserved { + v := sigma.At(r, c) + sigma21.Set(i, j, v) + } + } + + var chol mat64.Cholesky + ok := chol.Factorize(&sigma22) + if !ok { + return math.NaN(), nil, nil + } + + // Compute mu_1 + sigma_{2,1}^T * sigma_{2,2}^-1 (v - mu_2). + v := mat64.NewVector(ob, mu2) + var tmp, tmp2 mat64.Vector + err := tmp.SolveCholeskyVec(&chol, v) + if err != nil { + return math.NaN(), nil, nil + } + tmp2.MulVec(sigma21.T(), &tmp) + + for i := range mu1 { + mu1[i] += tmp2.At(i, 0) + } + + // Compute tmp4 = sigma_{2,1}^T * sigma_{2,2}^-1 * sigma_{2,1}. + // TODO(btracey): Should this be a method of SymDense? + var tmp3, tmp4 mat64.Dense + err = tmp3.SolveCholesky(&chol, sigma21) + if err != nil { + return math.NaN(), nil, nil + } + tmp4.Mul(sigma21.T(), &tmp3) + + // Compute sigma_{1,1} - tmp4 + // TODO(btracey): If tmp4 can constructed with a method, then this can be + // replaced with SubSym. + for i := 0; i < len(unobserved); i++ { + for j := i; j < len(unobserved); j++ { + v := sigma11.At(i, j) + sigma11.SetSym(i, j, v-tmp4.At(i, j)) + } + } + + // The computed variables are accurate for a Normal. 
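+	// For a finite nu the conditional covariance is additionally scaled by
+	// (nu + beta)/(nu + |observed|), where beta is the squared Mahalanobis
+	// distance of the observed values; this scaling is applied below.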
+ if math.IsInf(nu, 1) { + return nu, mu1, &sigma11 + } + + // Compute beta = (v - mu_2)^T * sigma_{2,2}^-1 * (v - mu_2)^T + beta := mat64.Dot(v, &tmp) + + // Scale the covariance matrix + sigma11.ScaleSym((nu+beta)/(nu+float64(ob)), &sigma11) + + return nu + float64(ob), mu1, &sigma11 +} + +// findUnob returns the unobserved variables (the complementary set to observed). +// findUnob panics if any value repeated in observed. +func findUnob(observed []int, dim int) (unobserved []int) { + var setOb intsets.Sparse + for _, v := range observed { + setOb.Insert(v) + } + var setAll intsets.Sparse + for i := 0; i < dim; i++ { + setAll.Insert(i) + } + var setUnob intsets.Sparse + setUnob.Difference(&setAll, &setOb) + unobserved = setUnob.AppendTo(nil) + sort.Ints(unobserved) + return unobserved +} + +// CovarianceMatrix returns the covariance matrix of the distribution. Upon +// return, the value at element {i, j} of the covariance matrix is equal to +// the covariance of the i^th and j^th variables. +// covariance(i, j) = E[(x_i - E[x_i])(x_j - E[x_j])] +// If the input matrix is nil a new matrix is allocated, otherwise the result +// is stored in-place into the input. +func (st *StudentsT) CovarianceMatrix(s *mat64.SymDense) *mat64.SymDense { + if s == nil { + s = mat64.NewSymDense(st.dim, nil) + } + sn := s.Symmetric() + if sn != st.dim { + panic("normal: input matrix size mismatch") + } + s.CopySym(&st.sigma) + s.ScaleSym(st.nu/(st.nu-2), s) + return s +} + +// Dim returns the dimension of the distribution. +func (s *StudentsT) Dim() int { + return s.dim +} + +// LogProb computes the log of the pdf of the point x. +func (s *StudentsT) LogProb(y []float64) float64 { + if len(y) != s.dim { + panic(badInputLength) + } + + nu := s.nu + n := float64(s.dim) + lg1, _ := math.Lgamma((nu + n) / 2) + lg2, _ := math.Lgamma(nu / 2) + + t1 := lg1 - lg2 - n/2*math.Log(nu*math.Pi) - s.logSqrtDet + + shift := make([]float64, len(y)) + copy(shift, y) + floats.Sub(shift, s.mu) + + x := mat64.NewVector(s.dim, shift) + + var tmp mat64.Vector + tmp.SolveCholeskyVec(&s.chol, x) + + dot := mat64.Dot(&tmp, x) + + return t1 - ((nu+n)/2)*math.Log(1+dot/nu) +} + +// MarginalStudentsT returns the marginal distribution of the given input variables, +// and the success of the operation. +// That is, MarginalStudentsT returns +// p(x_i) = \int_{x_o} p(x_i | x_o) p(x_o) dx_o +// where x_i are the dimensions in the input, and x_o are the remaining dimensions. +// See https://en.wikipedia.org/wiki/Marginal_distribution for more information. +// +// The input src is passed to the created StudentsT. +// +// ok indicates whether there was a failure during the marginalization. If ok is false +// the operation failed and dist is not usable. +// Mathematically this is impossible, but can occur with finite precision arithmetic. +func (s *StudentsT) MarginalStudentsT(vars []int, src *rand.Rand) (dist *StudentsT, ok bool) { + newMean := make([]float64, len(vars)) + for i, v := range vars { + newMean[i] = s.mu[v] + } + var newSigma mat64.SymDense + newSigma.SubsetSym(&s.sigma, vars) + return NewStudentsT(newMean, &newSigma, s.nu, src) +} + +// MarginalStudentsT returns the marginal distribution of the given input variable. +// That is, MarginalStudentsT returns +// p(x_i) = \int_{x_o} p(x_i | x_o) p(x_o) dx_o +// where i is the input index, and x_o are the remaining dimensions. +// See https://en.wikipedia.org/wiki/Marginal_distribution for more information. +// +// The input src is passed to the call to NewStudentsT. 
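+//
+// A brief illustrative use:
+//
+//	st := s.MarginalStudentsTSingle(0, nil)
+//	mean := st.Mean() // equals s.Mean(nil)[0]
+//	_ = mean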
+func (s *StudentsT) MarginalStudentsTSingle(i int, src *rand.Rand) distuv.StudentsT { + return distuv.StudentsT{ + Mu: s.mu[i], + Sigma: math.Sqrt(s.sigma.At(i, i)), + Nu: s.nu, + Src: src, + } +} + +// TODO(btracey): Implement marginal single. Need to modify univariate StudentsT +// to be three-parameter. + +// Mean returns the mean of the probability distribution at x. If the +// input argument is nil, a new slice will be allocated, otherwise the result +// will be put in-place into the receiver. +func (s *StudentsT) Mean(x []float64) []float64 { + x = reuseAs(x, s.dim) + copy(x, s.mu) + return x +} + +// Prob computes the value of the probability density function at x. +func (s *StudentsT) Prob(y []float64) float64 { + return math.Exp(s.LogProb(y)) +} + +// Rand generates a random number according to the distributon. +// If the input slice is nil, new memory is allocated, otherwise the result is stored +// in place. +func (s *StudentsT) Rand(x []float64) []float64 { + // If Y is distributed according to N(0,Sigma), and U is chi^2 with + // parameter ν, then + // X = mu + Y * sqrt(nu / U) + // X is distributed according to this distribution. + + // Generate Y. + x = reuseAs(x, s.dim) + tmp := make([]float64, s.dim) + if s.src == nil { + for i := range x { + tmp[i] = rand.NormFloat64() + } + } else { + for i := range x { + tmp[i] = s.src.NormFloat64() + } + } + xVec := mat64.NewVector(s.dim, x) + tmpVec := mat64.NewVector(s.dim, tmp) + xVec.MulVec(&s.lower, tmpVec) + + u := distuv.ChiSquared{K: s.nu, Src: s.src}.Rand() + floats.Scale(math.Sqrt(s.nu/u), x) + + floats.Add(x, s.mu) + return x +} diff --git a/stat/distmv/studentst_test.go b/stat/distmv/studentst_test.go new file mode 100644 index 00000000..a0f6fc6f --- /dev/null +++ b/stat/distmv/studentst_test.go @@ -0,0 +1,262 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distmv + +import ( + "math" + "math/rand" + "testing" + + "github.com/gonum/floats" + "github.com/gonum/matrix/mat64" + "github.com/gonum/stat" +) + +func TestStudentTProbs(t *testing.T) { + src := rand.New(rand.NewSource(1)) + for _, test := range []struct { + nu float64 + mu []float64 + sigma *mat64.SymDense + + x [][]float64 + probs []float64 + }{ + { + nu: 3, + mu: []float64{0, 0}, + sigma: mat64.NewSymDense(2, []float64{1, 0, 0, 1}), + + x: [][]float64{ + {0, 0}, + {1, -1}, + {3, 4}, + {-1, -2}, + }, + // Outputs compared with WolframAlpha. + probs: []float64{ + 0.159154943091895335768883, + 0.0443811199724279860006777747927, + 0.0005980371870904696541052658, + 0.01370560783418571283428283, + }, + }, + { + nu: 4, + mu: []float64{2, -3}, + sigma: mat64.NewSymDense(2, []float64{8, -1, -1, 5}), + + x: [][]float64{ + {0, 0}, + {1, -1}, + {3, 4}, + {-1, -2}, + {2, -3}, + }, + // Outputs compared with WolframAlpha. + probs: []float64{ + 0.007360810111491788657953608191001, + 0.0143309905845607117740440592999, + 0.0005307774290578041397794096037035009801668903, + 0.0115657422475668739943625904793879, + 0.0254851872062589062995305736215, + }, + }, + } { + s, ok := NewStudentsT(test.mu, test.sigma, test.nu, src) + if !ok { + t.Fatal("bad test") + } + for i, x := range test.x { + xcpy := make([]float64, len(x)) + copy(xcpy, x) + p := s.Prob(x) + if !floats.Same(x, xcpy) { + t.Errorf("X modified during call to prob, %v, %v", x, xcpy) + } + if !floats.EqualWithinAbsOrRel(p, test.probs[i], 1e-10, 1e-10) { + t.Errorf("Probability mismatch. X = %v. 
Got %v, want %v.", x, p, test.probs[i]) + } + } + } +} + +func TestStudentsTRand(t *testing.T) { + src := rand.New(rand.NewSource(1)) + for _, test := range []struct { + mean []float64 + cov *mat64.SymDense + nu float64 + tolcov float64 + }{ + { + mean: []float64{0, 0}, + cov: mat64.NewSymDense(2, []float64{1, 0, 0, 1}), + nu: 3, + tolcov: 1e-2, + }, + { + mean: []float64{3, 4}, + cov: mat64.NewSymDense(2, []float64{5, 1.2, 1.2, 6}), + nu: 8, + tolcov: 1e-2, + }, + { + mean: []float64{3, 4, -2}, + cov: mat64.NewSymDense(3, []float64{5, 1.2, -0.8, 1.2, 6, 0.4, -0.8, 0.4, 2}), + nu: 8, + tolcov: 1e-2, + }, + } { + s, ok := NewStudentsT(test.mean, test.cov, test.nu, src) + if !ok { + t.Fatal("bad test") + } + nSamples := 10000000 + dim := len(test.mean) + samps := mat64.NewDense(nSamples, dim, nil) + for i := 0; i < nSamples; i++ { + s.Rand(samps.RawRowView(i)) + } + estMean := make([]float64, dim) + for i := range estMean { + estMean[i] = stat.Mean(mat64.Col(nil, i, samps), nil) + } + mean := s.Mean(nil) + if !floats.EqualApprox(estMean, mean, 1e-2) { + t.Errorf("Mean mismatch: want: %v, got %v", test.mean, estMean) + } + cov := s.CovarianceMatrix(nil) + estCov := stat.CovarianceMatrix(nil, samps, nil) + if !mat64.EqualApprox(estCov, cov, test.tolcov) { + t.Errorf("Cov mismatch: want: %v, got %v", cov, estCov) + } + } +} + +func TestStudentsTConditional(t *testing.T) { + src := rand.New(rand.NewSource(1)) + for _, test := range []struct { + mean []float64 + cov *mat64.SymDense + nu float64 + + idx []int + value []float64 + tolcov float64 + }{ + { + mean: []float64{3, 4, -2}, + cov: mat64.NewSymDense(3, []float64{5, 1.2, -0.8, 1.2, 6, 0.4, -0.8, 0.4, 2}), + nu: 8, + idx: []int{0}, + value: []float64{6}, + + tolcov: 1e-2, + }, + } { + s, ok := NewStudentsT(test.mean, test.cov, test.nu, src) + if !ok { + t.Fatal("bad test") + } + + sUp, ok := s.ConditionStudentsT(test.idx, test.value, src) + + // Compute the other values by hand the inefficient way to compare + newNu := test.nu + float64(len(test.idx)) + if newNu != sUp.nu { + t.Errorf("Updated nu mismatch. Got %v, want %v", s.nu, newNu) + } + dim := len(test.mean) + unob := findUnob(test.idx, dim) + ob := test.idx + + muUnob := make([]float64, len(unob)) + for i, v := range unob { + muUnob[i] = test.mean[v] + } + muOb := make([]float64, len(ob)) + for i, v := range ob { + muOb[i] = test.mean[v] + } + + var sig11, sig22 mat64.SymDense + sig11.SubsetSym(&s.sigma, unob) + sig22.SubsetSym(&s.sigma, ob) + + sig12 := mat64.NewDense(len(unob), len(ob), nil) + for i := range unob { + for j := range ob { + sig12.Set(i, j, s.sigma.At(unob[i], ob[j])) + } + } + + shift := make([]float64, len(ob)) + copy(shift, test.value) + floats.Sub(shift, muOb) + + newMu := make([]float64, len(muUnob)) + newMuVec := mat64.NewVector(len(muUnob), newMu) + shiftVec := mat64.NewVector(len(shift), shift) + var tmp mat64.Vector + tmp.SolveVec(&sig22, shiftVec) + newMuVec.MulVec(sig12, &tmp) + floats.Add(newMu, muUnob) + + if !floats.EqualApprox(newMu, sUp.mu, 1e-10) { + t.Errorf("Mu mismatch. 
Got %v, want %v", sUp.mu, newMu) + } + + var tmp2 mat64.Dense + tmp2.Solve(&sig22, sig12.T()) + + var tmp3 mat64.Dense + tmp3.Mul(sig12, &tmp2) + tmp3.Sub(&sig11, &tmp3) + + dot := mat64.Dot(shiftVec, &tmp) + tmp3.Scale((test.nu+dot)/(test.nu+float64(len(ob))), &tmp3) + if !mat64.EqualApprox(&tmp3, &sUp.sigma, 1e-10) { + t.Errorf("Sigma mismatch") + } + } +} + +func TestStudentsTMarginalSingle(t *testing.T) { + for _, test := range []struct { + mu []float64 + sigma *mat64.SymDense + nu float64 + }{ + { + mu: []float64{2, 3, 4}, + sigma: mat64.NewSymDense(3, []float64{2, 0.5, 3, 0.5, 1, 0.6, 3, 0.6, 10}), + nu: 5, + }, + { + mu: []float64{2, 3, 4, 5}, + sigma: mat64.NewSymDense(4, []float64{2, 0.5, 3, 0.1, 0.5, 1, 0.6, 0.2, 3, 0.6, 10, 0.3, 0.1, 0.2, 0.3, 3}), + nu: 6, + }, + } { + studentst, ok := NewStudentsT(test.mu, test.sigma, test.nu, nil) + if !ok { + t.Fatalf("Bad test, covariance matrix not positive definite") + } + for i, mean := range test.mu { + st := studentst.MarginalStudentsTSingle(i, nil) + if st.Mean() != mean { + t.Errorf("Mean mismatch nil Sigma, idx %v: want %v, got %v.", i, mean, st.Mean()) + } + std := math.Sqrt(test.sigma.At(i, i)) + if math.Abs(st.Sigma-std) > 1e-14 { + t.Errorf("StdDev mismatch nil Sigma, idx %v: want %v, got %v.", i, std, st.StdDev()) + } + if st.Nu != test.nu { + t.Errorf("Nu mismatch nil Sigma, idx %v: want %v, got %v ", i, test.nu, st.Nu) + } + } + } +} diff --git a/stat/distmv/uniform.go b/stat/distmv/uniform.go new file mode 100644 index 00000000..c28a0232 --- /dev/null +++ b/stat/distmv/uniform.go @@ -0,0 +1,196 @@ +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distmv + +import ( + "math" + "math/rand" +) + +type Bound struct { + Min float64 + Max float64 +} + +// Uniform represents a multivariate uniform distribution. +type Uniform struct { + bounds []Bound + dim int + src *rand.Rand +} + +// NewUniform creates a new uniform distribution with the given bounds. +func NewUniform(bnds []Bound, src *rand.Rand) *Uniform { + dim := len(bnds) + if dim == 0 { + panic(badZeroDimension) + } + for _, b := range bnds { + if b.Max < b.Min { + panic("uniform: maximum less than minimum") + } + } + u := &Uniform{ + bounds: make([]Bound, dim), + dim: dim, + src: src, + } + for i, b := range bnds { + u.bounds[i].Min = b.Min + u.bounds[i].Max = b.Max + } + return u +} + +// NewUnitUniform creates a new Uniform distribution over the dim-dimensional +// unit hypercube. That is, a uniform distribution where each dimension has +// Min = 0 and Max = 1. +func NewUnitUniform(dim int, src *rand.Rand) *Uniform { + if dim <= 0 { + panic(nonPosDimension) + } + bounds := make([]Bound, dim) + for i := range bounds { + bounds[i].Min = 0 + bounds[i].Max = 1 + } + return &Uniform{ + bounds: bounds, + dim: dim, + src: src, + } +} + +// Bounds returns the bounds on the variables of the distribution. If the input +// is nil, a new slice is allocated and returned. If the input is non-nil, then +// the bounds are stored in-place into the input argument, and Bounds will panic +// if len(bounds) != u.Dim(). +func (u *Uniform) Bounds(bounds []Bound) []Bound { + if bounds == nil { + bounds = make([]Bound, u.Dim()) + } + if len(bounds) != u.Dim() { + panic(badInputLength) + } + copy(bounds, u.bounds) + return bounds +} + +// CDF returns the multidimensional cumulative distribution function of the +// probability distribution at the point x. 
If p is non-nil, the CDF is stored
+// in-place into the first argument, otherwise a new slice is allocated and
+// returned.
+//
+// CDF will panic if len(x) is not equal to the dimension of the distribution,
+// or if p is non-nil and len(p) is not equal to the dimension of the distribution.
+func (u *Uniform) CDF(p, x []float64) []float64 {
+	if len(x) != u.dim {
+		panic(badSizeMismatch)
+	}
+	if p == nil {
+		p = make([]float64, u.dim)
+	}
+	if len(p) != u.dim {
+		panic(badSizeMismatch)
+	}
+	for i, v := range x {
+		if v < u.bounds[i].Min {
+			p[i] = 0
+		} else if v > u.bounds[i].Max {
+			p[i] = 1
+		} else {
+			p[i] = (v - u.bounds[i].Min) / (u.bounds[i].Max - u.bounds[i].Min)
+		}
+	}
+	return p
+}
+
+// Dim returns the dimension of the distribution.
+func (u *Uniform) Dim() int {
+	return u.dim
+}
+
+// Entropy returns the differential entropy of the distribution.
+func (u *Uniform) Entropy() float64 {
+	// Entropy is log of the volume.
+	var logVol float64
+	for _, b := range u.bounds {
+		logVol += math.Log(b.Max - b.Min)
+	}
+	return logVol
+}
+
+// LogProb computes the log of the pdf of the point x.
+func (u *Uniform) LogProb(x []float64) float64 {
+	dim := u.dim
+	if len(x) != dim {
+		panic(badSizeMismatch)
+	}
+	var logprob float64
+	for i, b := range u.bounds {
+		if x[i] < b.Min || x[i] > b.Max {
+			return math.Inf(-1)
+		}
+		logprob -= math.Log(b.Max - b.Min)
+	}
+	return logprob
+}
+
+// Mean returns the mean of the probability distribution. If the input
+// argument is nil, a new slice will be allocated, otherwise the result
+// will be stored in-place into the input argument.
+func (u *Uniform) Mean(x []float64) []float64 {
+	x = reuseAs(x, u.dim)
+	for i, b := range u.bounds {
+		x[i] = (b.Max + b.Min) / 2
+	}
+	return x
+}
+
+// Prob computes the value of the probability density function at x.
+func (u *Uniform) Prob(x []float64) float64 {
+	return math.Exp(u.LogProb(x))
+}
+
+// Rand generates a random sample according to the distribution.
+// If the input slice is nil, new memory is allocated, otherwise the result is stored
+// in place.
+func (u *Uniform) Rand(x []float64) []float64 {
+	x = reuseAs(x, u.dim)
+	if u.src == nil {
+		for i, b := range u.bounds {
+			x[i] = rand.Float64()*(b.Max-b.Min) + b.Min
+		}
+		return x
+	}
+	for i, b := range u.bounds {
+		x[i] = u.src.Float64()*(b.Max-b.Min) + b.Min
+	}
+	return x
+}
+
+// Quantile returns the multi-dimensional inverse cumulative distribution function.
+// len(p) must equal the dimension of the distribution; if x is non-nil, len(x) must too.
+// If x is nil, a new slice will be allocated and returned, otherwise the quantile
+// will be stored in-place into x. All of the values of p must be between 0 and 1,
+// or Quantile will panic.
+func (u *Uniform) Quantile(x, p []float64) []float64 {
+	if len(p) != u.dim {
+		panic(badSizeMismatch)
+	}
+	if x == nil {
+		x = make([]float64, u.dim)
+	}
+	if len(x) != u.dim {
+		panic(badSizeMismatch)
+	}
+	for i, v := range p {
+		if v < 0 || v > 1 {
+			panic(badQuantile)
+		}
+		x[i] = v*(u.bounds[i].Max-u.bounds[i].Min) + u.bounds[i].Min
+	}
+	return x
+}
diff --git a/stat/distmv/uniform_test.go b/stat/distmv/uniform_test.go
new file mode 100644
index 00000000..c52e09a9
--- /dev/null
+++ b/stat/distmv/uniform_test.go
@@ -0,0 +1,31 @@
+// Copyright ©2017 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distmv
+
+import (
+	"math"
+	"testing"
+)
+
+func TestUniformEntropy(t *testing.T) {
+	for _, test := range []struct {
+		Uniform *Uniform
+		Entropy float64
+	}{
+		{
+			NewUniform([]Bound{{0, 1}, {0, 1}}, nil),
+			0,
+		},
+		{
+			NewUniform([]Bound{{-1, 3}, {2, 8}, {-5, -3}}, nil),
+			math.Log(48),
+		},
+	} {
+		ent := test.Uniform.Entropy()
+		if math.Abs(ent-test.Entropy) > 1e-14 {
+			t.Errorf("Entropy mismatch. Got %v, want %v", ent, test.Entropy)
+		}
+	}
+}
diff --git a/stat/distuv/bernoulli.go b/stat/distuv/bernoulli.go
new file mode 100644
index 00000000..c03578f1
--- /dev/null
+++ b/stat/distuv/bernoulli.go
@@ -0,0 +1,131 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math"
+	"math/rand"
+)
+
+// Bernoulli represents a random variable whose value is 1 with probability P
+// and 0 with probability 1-P. The value of P must be between 0 and 1.
+// More information at https://en.wikipedia.org/wiki/Bernoulli_distribution.
+type Bernoulli struct {
+	P      float64
+	Source *rand.Rand
+}
+
+// CDF computes the value of the cumulative density function at x.
+func (b Bernoulli) CDF(x float64) float64 {
+	if x < 0 {
+		return 0
+	}
+	if x < 1 {
+		return 1 - b.P
+	}
+	return 1
+}
+
+// Entropy returns the entropy of the distribution.
+func (b Bernoulli) Entropy() float64 {
+	if b.P == 0 {
+		return 0
+	}
+	if b.P == 1 {
+		return 0
+	}
+	q := 1 - b.P
+	return -b.P*math.Log(b.P) - q*math.Log(q)
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (b Bernoulli) ExKurtosis() float64 {
+	pq := b.P * (1 - b.P)
+	return (1 - 6*pq) / pq
+}
+
+// LogProb computes the natural logarithm of the value of the probability density function at x.
+func (b Bernoulli) LogProb(x float64) float64 {
+	if x == 0 {
+		return math.Log(1 - b.P)
+	}
+	if x == 1 {
+		return math.Log(b.P)
+	}
+	return math.Inf(-1)
+}
+
+// Mean returns the mean of the probability distribution.
+func (b Bernoulli) Mean() float64 {
+	return b.P
+}
+
+// Median returns the median of the probability distribution.
+func (b Bernoulli) Median() float64 {
+	p := b.P
+	switch {
+	case p < 0.5:
+		return 0
+	case p > 0.5:
+		return 1
+	default:
+		return 0.5
+	}
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (Bernoulli) NumParameters() int {
+	return 1
+}
+
+// Prob computes the value of the probability distribution at x.
+func (b Bernoulli) Prob(x float64) float64 {
+	return math.Exp(b.LogProb(x))
+}
+
+// Quantile returns the inverse of the cumulative probability distribution.
+func (b Bernoulli) Quantile(p float64) float64 {
+	if p < 0 || 1 < p {
+		panic(badPercentile)
+	}
+	if p < 1-b.P {
+		return 0
+	}
+	return 1
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (b Bernoulli) Rand() float64 {
+	var rnd float64
+	if b.Source == nil {
+		rnd = rand.Float64()
+	} else {
+		rnd = b.Source.Float64()
+	}
+	if rnd < b.P {
+		return 1
+	}
+	return 0
+}
+
+// Skewness returns the skewness of the distribution.
+func (b Bernoulli) Skewness() float64 {
+	return (1 - 2*b.P) / math.Sqrt(b.P*(1-b.P))
+}
+
+// StdDev returns the standard deviation of the probability distribution.
+func (b Bernoulli) StdDev() float64 {
+	return math.Sqrt(b.Variance())
+}
+
+// Survival returns the survival function (complementary CDF) at x.
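+//
+// As a small illustration (the value follows directly from the definition):
+//	b := Bernoulli{P: 0.5}
+//	s := b.Survival(0) // s == 0.5, the probability that X == 1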
+func (b Bernoulli) Survival(x float64) float64 { + return 1 - b.CDF(x) +} + +// Variance returns the variance of the probability distribution. +func (b Bernoulli) Variance() float64 { + return b.P * (1 - b.P) +} diff --git a/stat/distuv/bernoulli_test.go b/stat/distuv/bernoulli_test.go new file mode 100644 index 00000000..107135ec --- /dev/null +++ b/stat/distuv/bernoulli_test.go @@ -0,0 +1,23 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import "testing" + +func TestBernoulli(t *testing.T) { + for i, dist := range []Bernoulli{ + { + P: 0.5, + }, + { + P: 0.9, + }, + { + P: 0.2, + }, + } { + testFullDist(t, dist, i, false) + } +} diff --git a/stat/distuv/beta.go b/stat/distuv/beta.go new file mode 100644 index 00000000..54f2c3ed --- /dev/null +++ b/stat/distuv/beta.go @@ -0,0 +1,126 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand" + + "github.com/gonum/mathext" +) + +// Beta implements the Beta distribution, a two-parameter continuous distribution +// with support between 0 and 1. +// +// The beta distribution has density function +// x^(α-1) * (1-x)^(β-1) * Γ(α+β) / (Γ(α)*Γ(β)) +// +// For more information, see https://en.wikipedia.org/wiki/Beta_distribution +type Beta struct { + // Alpha is the left shape parameter of the distribution. Alpha must be greater + // than 0. + Alpha float64 + // Beta is the right shape parameter of the distribution. Beta must be greater + // than 0. + Beta float64 + + Source *rand.Rand +} + +// CDF computes the value of the cumulative distribution function at x. +func (b Beta) CDF(x float64) float64 { + if x <= 0 { + return 0 + } + if x >= 1 { + return 1 + } + return mathext.RegIncBeta(b.Alpha, b.Beta, x) +} + +// ExKurtosis returns the excess kurtosis of the distribution. +func (b Beta) ExKurtosis() float64 { + num := 6 * ((b.Alpha-b.Beta)*(b.Alpha-b.Beta)*(b.Alpha+b.Beta+1) - b.Alpha*b.Beta*(b.Alpha+b.Beta+2)) + den := b.Alpha * b.Beta * (b.Alpha + b.Beta + 2) * (b.Alpha + b.Beta + 3) + return num / den +} + +// LogProb computes the natural logarithm of the value of the probability +// density function at x. +func (b Beta) LogProb(x float64) float64 { + if x < 0 || x > 1 { + return math.Inf(-1) + } + + if b.Alpha <= 0 || b.Beta <= 0 { + panic("beta: negative parameters") + } + + lab, _ := math.Lgamma(b.Alpha + b.Beta) + la, _ := math.Lgamma(b.Alpha) + lb, _ := math.Lgamma(b.Beta) + return lab - la - lb + (b.Alpha-1)*math.Log(x) + (b.Beta-1)*math.Log(1-x) +} + +// Mean returns the mean of the probability distribution. +func (b Beta) Mean() float64 { + return b.Alpha / (b.Alpha + b.Beta) +} + +// Mode returns the mode of the distribution. +// +// Mode returns NaN if either parameter is less than or equal to 1 as a special case. +func (b Beta) Mode() float64 { + if b.Alpha <= 1 || b.Beta <= 1 { + return math.NaN() + } + return (b.Alpha - 1) / (b.Alpha + b.Beta - 2) +} + +// NumParameters returns the number of parameters in the distribution. +func (b Beta) NumParameters() int { + return 2 +} + +// Prob computes the value of the probability density function at x. +func (b Beta) Prob(x float64) float64 { + return math.Exp(b.LogProb(x)) +} + +// Quantile returns the inverse of the cumulative distribution function. 
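+//
+// As a small illustration of the CDF/Quantile round trip (parameter values
+// chosen arbitrarily):
+//	b := Beta{Alpha: 2, Beta: 3}
+//	x := b.Quantile(0.5) // median of Beta(2, 3)
+//	p := b.CDF(x)        // p recovers 0.5 up to floating-point error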
+func (b Beta) Quantile(p float64) float64 { + if p < 0 || p > 1 { + panic(badPercentile) + } + return mathext.InvRegIncBeta(b.Alpha, b.Beta, p) +} + +// Rand returns a random sample drawn from the distribution. +func (b Beta) Rand() float64 { + ga := Gamma{Alpha: b.Alpha, Beta: 1, Source: b.Source}.Rand() + gb := Gamma{Alpha: b.Beta, Beta: 1, Source: b.Source}.Rand() + return ga / (ga + gb) +} + +// StdDev returns the standard deviation of the probability distribution. +func (b Beta) StdDev() float64 { + return math.Sqrt(b.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (b Beta) Survival(x float64) float64 { + switch { + case x <= 0: + return 1 + case x >= 1: + return 0 + } + return mathext.RegIncBeta(b.Beta, b.Alpha, 1-x) +} + +// Variance returns the variance of the probability distribution. +func (b Beta) Variance() float64 { + return b.Alpha * b.Beta / ((b.Alpha + b.Beta) * (b.Alpha + b.Beta) * (b.Alpha + b.Beta + 1)) +} diff --git a/stat/distuv/beta_test.go b/stat/distuv/beta_test.go new file mode 100644 index 00000000..d2128bcb --- /dev/null +++ b/stat/distuv/beta_test.go @@ -0,0 +1,61 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand" + "sort" + "testing" + + "github.com/gonum/floats" +) + +func TestBetaProb(t *testing.T) { + // Values a comparison with scipy + for _, test := range []struct { + x, alpha, beta, want float64 + }{ + {0.1, 2, 0.5, 0.079056941504209499}, + {0.5, 1, 5.1, 0.29740426605235754}, + {0.1, 0.5, 0.5, 1.0610329539459691}, + {1, 0.5, 0.5, math.Inf(1)}, + {-1, 0.5, 0.5, 0}, + } { + pdf := Beta{Alpha: test.alpha, Beta: test.beta}.Prob(test.x) + if !floats.EqualWithinAbsOrRel(pdf, test.want, 1e-10, 1e-10) { + t.Errorf("Pdf mismatch. Got %v, want %v", pdf, test.want) + } + } +} + +func TestBetaRand(t *testing.T) { + src := rand.New(rand.NewSource(1)) + for i, b := range []Beta{ + {Alpha: 0.5, Beta: 0.5, Source: src}, + {Alpha: 5, Beta: 1, Source: src}, + {Alpha: 2, Beta: 2, Source: src}, + {Alpha: 2, Beta: 5, Source: src}, + } { + testBeta(t, b, i) + } +} + +func testBeta(t *testing.T, b Beta, i int) { + tol := 1e-2 + const n = 1e6 + const bins = 10 + x := make([]float64, n) + generateSamples(x, b) + sort.Float64s(x) + + testRandLogProbContinuous(t, i, 0, x, b, tol, bins) + checkMean(t, i, x, b, tol) + checkVarAndStd(t, i, x, b, tol) + checkExKurtosis(t, i, x, b, 5e-2) + checkProbContinuous(t, i, x, b, 1e-3) + checkQuantileCDFSurvival(t, i, x, b, tol) + checkProbQuantContinuous(t, i, x, b, tol) +} diff --git a/stat/distuv/categorical.go b/stat/distuv/categorical.go new file mode 100644 index 00000000..165a0565 --- /dev/null +++ b/stat/distuv/categorical.go @@ -0,0 +1,184 @@ +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand" +) + +// Categorical is an extension of the Bernouilli distribution where x takes +// values {0, 1, ..., len(w)-1} where w is the weight vector. Categorical must +// be initialized with NewCategorical. +type Categorical struct { + weights []float64 + + // heap is a weight heap. + // + // It keeps a heap-organised sum of remaining + // index weights that are available to be taken + // from. 
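	//
	// Keeping the running sums in a heap
	// makes both sampling (Rand) and
	// reweighting (Reweight) O(log n) in
	// the number of weights.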
+ // + // Each element holds the sum of weights for + // the corresponding index, plus the sum of + // of its children's weights; the children + // of an element i can be found at positions + // 2*(i+1)-1 and 2*(i+1). The root of the + // weight heap is at element 0. + // + // See comments in container/heap for an + // explanation of the layout of a heap. + heap []float64 + + src *rand.Rand +} + +// NewCategorical constructs a new categorical distribution where the probability +// that x equals i is proportional to w[i]. All of the weights must be +// nonnegative, and at least one of the weights must be positive. +func NewCategorical(w []float64, src *rand.Rand) Categorical { + c := Categorical{ + weights: make([]float64, len(w)), + heap: make([]float64, len(w)), + src: src, + } + c.ReweightAll(w) + return c +} + +// CDF computes the value of the cumulative density function at x. +func (c Categorical) CDF(x float64) float64 { + var cdf float64 + for i, w := range c.weights { + if x < float64(i) { + break + } + cdf += w + } + return cdf / c.heap[0] +} + +// Entropy returns the entropy of the distribution. +func (c Categorical) Entropy() float64 { + var ent float64 + for _, w := range c.weights { + if w == 0 { + continue + } + p := w / c.heap[0] + ent += p * math.Log(p) + } + return -ent +} + +// Len returns the number of values x could possibly take (the length of the +// initial supplied weight vector). +func (c Categorical) Len() int { + return len(c.weights) +} + +// Mean returns the mean of the probability distribution. +func (c Categorical) Mean() float64 { + var mean float64 + for i, v := range c.weights { + mean += float64(i) * v + } + return mean / c.heap[0] +} + +// Prob computes the value of the probability density function at x. +func (c Categorical) Prob(x float64) float64 { + xi := int(x) + if float64(xi) != x { + return 0 + } + if xi < 0 || xi > len(c.weights)-1 { + return 0 + } + return c.weights[xi] / c.heap[0] +} + +// LogProb computes the natural logarithm of the value of the probability density function at x. +func (c Categorical) LogProb(x float64) float64 { + return math.Log(c.Prob(x)) +} + +// Rand returns a random draw from the categorical distribution. +func (c Categorical) Rand() float64 { + var r float64 + if c.src == nil { + r = c.heap[0] * rand.Float64() + } else { + r = c.heap[0] * c.src.Float64() + } + i := 1 + last := -1 + left := len(c.weights) + for { + if r -= c.weights[i-1]; r <= 0 { + break // Fall within item i-1. + } + i <<= 1 // Move to left child. + if d := c.heap[i-1]; r > d { + r -= d + // If enough r to pass left child, + // move to right child state will + // be caught at break above. + i++ + } + if i == last || left < 0 { + panic("categorical: bad sample") + } + last = i + left-- + } + return float64(i - 1) +} + +// Reweight sets the weight of item idx to w. The input weight must be +// non-negative, and after reweighting at least one of the weights must be +// positive. +func (c Categorical) Reweight(idx int, w float64) { + if w < 0 { + panic("categorical: negative weight") + } + w, c.weights[idx] = c.weights[idx]-w, w + idx++ + for idx > 0 { + c.heap[idx-1] -= w + idx >>= 1 + } + if c.heap[0] <= 0 { + panic("categorical: sum of the weights non-positive") + } +} + +// ReweightAll resets the weights of the distribution. ReweightAll panics if +// len(w) != c.Len. All of the weights must be nonnegative, and at least one of +// the weights must be positive. 
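+//
+// As a small illustration (weights chosen arbitrarily):
+//	c := NewCategorical([]float64{1, 1, 2}, nil)
+//	c.ReweightAll([]float64{3, 1, 0})
+//	p := c.Prob(0) // p == 0.75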
+func (c Categorical) ReweightAll(w []float64) { + if len(w) != c.Len() { + panic("categorical: length of the slices do not match") + } + for _, v := range w { + if v < 0 { + panic("categorical: negative weight") + } + } + copy(c.weights, w) + c.reset() +} + +func (c Categorical) reset() { + copy(c.heap, c.weights) + for i := len(c.heap) - 1; i > 0; i-- { + // Sometimes 1-based counting makes sense. + c.heap[((i+1)>>1)-1] += c.heap[i] + } + // TODO(btracey): Renormalization for weird weights? + if c.heap[0] <= 0 { + panic("categorical: sum of the weights non-positive") + } +} diff --git a/stat/distuv/categorical_test.go b/stat/distuv/categorical_test.go new file mode 100644 index 00000000..da6ae6aa --- /dev/null +++ b/stat/distuv/categorical_test.go @@ -0,0 +1,196 @@ +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand" + "testing" + + "github.com/gonum/floats" +) + +func TestCategoricalProb(t *testing.T) { + for _, test := range [][]float64{ + {1, 2, 3, 0}, + } { + dist := NewCategorical(test, nil) + norm := make([]float64, len(test)) + floats.Scale(1/floats.Sum(norm), norm) + for i, v := range norm { + p := dist.Prob(float64(i)) + if math.Abs(p-v) > 1e-14 { + t.Errorf("Probability mismatch element %d", i) + } + p = dist.Prob(float64(i) + 0.5) + if p != 0 { + t.Errorf("Non-zero probability for non-integer x") + } + } + p := dist.Prob(-1) + if p != 0 { + t.Errorf("Non-zero probability for -1") + } + p = dist.Prob(float64(len(test))) + if p != 0 { + t.Errorf("Non-zero probability for len(test)") + } + } +} + +func TestCategoricalRand(t *testing.T) { + for _, test := range [][]float64{ + {1, 2, 3, 0}, + } { + dist := NewCategorical(test, nil) + nSamples := 2000000 + counts := sampleCategorical(t, dist, nSamples) + + probs := make([]float64, len(test)) + for i := range probs { + probs[i] = dist.Prob(float64(i)) + } + same := samedDistCategorical(dist, counts, probs, 1e-2) + if !same { + t.Errorf("Probability mismatch. Want %v, got %v", probs, counts) + } + + dist.Reweight(len(test)-1, 10) + counts = sampleCategorical(t, dist, nSamples) + probs = make([]float64, len(test)) + for i := range probs { + probs[i] = dist.Prob(float64(i)) + } + same = samedDistCategorical(dist, counts, probs, 1e-2) + if !same { + t.Errorf("Probability mismatch after Reweight. Want %v, got %v", probs, counts) + } + + w := make([]float64, len(test)) + for i := range w { + w[i] = rand.Float64() + } + + dist.ReweightAll(w) + counts = sampleCategorical(t, dist, nSamples) + probs = make([]float64, len(test)) + for i := range probs { + probs[i] = dist.Prob(float64(i)) + } + same = samedDistCategorical(dist, counts, probs, 1e-2) + if !same { + t.Errorf("Probability mismatch after ReweightAll. 
Want %v, got %v", probs, counts) + } + } +} + +func sampleCategorical(t *testing.T, dist Categorical, nSamples int) []float64 { + counts := make([]float64, dist.Len()) + for i := 0; i < nSamples; i++ { + v := dist.Rand() + if float64(int(v)) != v { + t.Fatalf("Random number is not an integer") + } + counts[int(v)]++ + } + sum := floats.Sum(counts) + floats.Scale(1/sum, counts) + return counts +} + +func samedDistCategorical(dist Categorical, counts, probs []float64, tol float64) bool { + same := true + for i, prob := range probs { + if prob == 0 && counts[i] != 0 { + same = false + break + } + if !floats.EqualWithinAbsOrRel(prob, counts[i], tol, tol) { + same = false + break + } + } + return same +} + +func TestCategoricalCDF(t *testing.T) { + for _, test := range [][]float64{ + {1, 2, 3, 0, 4}, + } { + c := make([]float64, len(test)) + copy(c, test) + floats.Scale(1/floats.Sum(c), c) + sum := make([]float64, len(test)) + floats.CumSum(sum, c) + + dist := NewCategorical(test, nil) + cdf := dist.CDF(-0.5) + if cdf != 0 { + t.Errorf("CDF of negative number not zero") + } + for i := range c { + cdf := dist.CDF(float64(i)) + if math.Abs(cdf-sum[i]) > 1e-14 { + t.Errorf("CDF mismatch %v. Want %v, got %v.", float64(i), sum[i], cdf) + } + cdfp := dist.CDF(float64(i) + 0.5) + if cdfp != cdf { + t.Errorf("CDF mismatch for non-integer input") + } + } + } +} + +func TestCategoricalEntropy(t *testing.T) { + for _, test := range []struct { + weights []float64 + entropy float64 + }{ + { + weights: []float64{1, 1}, + entropy: math.Ln2, + }, + { + weights: []float64{1, 1, 1, 1}, + entropy: math.Log(4), + }, + { + weights: []float64{0, 0, 1, 1, 0, 0}, + entropy: math.Ln2, + }, + } { + dist := NewCategorical(test.weights, nil) + entropy := dist.Entropy() + if math.IsNaN(entropy) || math.Abs(entropy-test.entropy) > 1e-14 { + t.Errorf("Entropy mismatch. Want %v, got %v.", test.entropy, entropy) + } + } +} + +func TestCategoricalMean(t *testing.T) { + for _, test := range []struct { + weights []float64 + mean float64 + }{ + { + weights: []float64{10, 0, 0, 0}, + mean: 0, + }, + { + weights: []float64{0, 10, 0, 0}, + mean: 1, + }, + { + weights: []float64{1, 2, 3, 4}, + mean: 2, + }, + } { + dist := NewCategorical(test.weights, nil) + mean := dist.Mean() + if math.IsNaN(mean) || math.Abs(mean-test.mean) > 1e-14 { + t.Errorf("Entropy mismatch. Want %v, got %v.", test.mean, mean) + } + } +} diff --git a/stat/distuv/chisquared.go b/stat/distuv/chisquared.go new file mode 100644 index 00000000..9e7ef8d3 --- /dev/null +++ b/stat/distuv/chisquared.go @@ -0,0 +1,99 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand" + + "github.com/gonum/mathext" +) + +// ChiSquared implements the χ² distribution, a one parameter distribution +// with support on the positive numbers. +// +// The density function is given by +// 1/(2^{k/2} * Γ(k/2)) * x^{k/2 - 1} * e^{-x/2} +// It is a special case of the Gamma distribution, Γ(k/2, 1/2). +// +// For more information, see https://en.wikipedia.org/wiki/Chi-squared_distribution. +type ChiSquared struct { + // K is the shape parameter, corresponding to the degrees of freedom. Must + // be greater than 0. + K float64 + + Src *rand.Rand +} + +// CDF computes the value of the cumulative density function at x. 
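+// For the χ² distribution this is the regularized lower incomplete gamma
+// function P(k/2, x/2).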
+func (c ChiSquared) CDF(x float64) float64 {
+	return mathext.GammaInc(c.K/2, x/2)
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (c ChiSquared) ExKurtosis() float64 {
+	return 12 / c.K
+}
+
+// LogProb computes the natural logarithm of the value of the probability
+// density function at x.
+func (c ChiSquared) LogProb(x float64) float64 {
+	if x < 0 {
+		return math.Inf(-1)
+	}
+	lg, _ := math.Lgamma(c.K / 2)
+	return (c.K/2-1)*math.Log(x) - x/2 - (c.K/2)*math.Ln2 - lg
+}
+
+// Mean returns the mean of the probability distribution.
+func (c ChiSquared) Mean() float64 {
+	return c.K
+}
+
+// Mode returns the mode of the distribution, max(K-2, 0).
+func (c ChiSquared) Mode() float64 {
+	return math.Max(c.K-2, 0)
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (c ChiSquared) NumParameters() int {
+	return 1
+}
+
+// Prob computes the value of the probability density function at x.
+func (c ChiSquared) Prob(x float64) float64 {
+	return math.Exp(c.LogProb(x))
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (c ChiSquared) Rand() float64 {
+	return Gamma{c.K / 2, 0.5, c.Src}.Rand()
+}
+
+// Quantile returns the inverse of the cumulative distribution function.
+func (c ChiSquared) Quantile(p float64) float64 {
+	if p < 0 || p > 1 {
+		panic(badPercentile)
+	}
+	return mathext.GammaIncInv(0.5*c.K, p) * 2
+}
+
+// StdDev returns the standard deviation of the probability distribution.
+func (c ChiSquared) StdDev() float64 {
+	return math.Sqrt(c.Variance())
+}
+
+// Survival returns the survival function (complementary CDF) at x.
+func (c ChiSquared) Survival(x float64) float64 {
+	if x < 0 {
+		return 1
+	}
+	return mathext.GammaIncComp(0.5*c.K, 0.5*x)
+}
+
+// Variance returns the variance of the probability distribution.
+func (c ChiSquared) Variance() float64 {
+	return 2 * c.K
+}
diff --git a/stat/distuv/chisquared_test.go b/stat/distuv/chisquared_test.go
new file mode 100644
index 00000000..6cf20525
--- /dev/null
+++ b/stat/distuv/chisquared_test.go
@@ -0,0 +1,78 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math/rand"
+	"sort"
+	"testing"
+
+	"github.com/gonum/floats"
+)
+
+func TestChiSquaredProb(t *testing.T) {
+	for _, test := range []struct {
+		x, k, want float64
+	}{
+		{10, 3, 0.0085003666025203432},
+		{2.3, 3, 0.19157345407042367},
+		{0.8, 0.2, 0.080363259903912673},
+	} {
+		pdf := ChiSquared{test.k, nil}.Prob(test.x)
+		if !floats.EqualWithinAbsOrRel(pdf, test.want, 1e-10, 1e-10) {
+			t.Errorf("Pdf mismatch, x = %v, K = %v. Got %v, want %v", test.x, test.k, pdf, test.want)
+		}
+	}
+}
+
+func TestChiSquaredCDF(t *testing.T) {
+	for _, test := range []struct {
+		x, k, want float64
+	}{
+		// Values calculated with scipy.stats.chi2.cdf
+		{0, 1, 0},
+		{0.01, 5, 5.3002700426865167e-07},
+		{0.05, 3, 0.002929332764619924},
+		{0.5, 2, 0.22119921692859512},
+		{0.95, 3, 0.1866520918701263},
+		{0.99, 5, 0.036631697220869196},
+		{1, 1, 0.68268949213708596},
+		{1.5, 4, 0.17335853270322427},
+		{10, 10, 0.55950671493478743},
+		{25, 15, 0.95005656637357172},
+	} {
+		cdf := ChiSquared{test.k, nil}.CDF(test.x)
+		if !floats.EqualWithinAbsOrRel(cdf, test.want, 1e-10, 1e-10) {
+			t.Errorf("CDF mismatch, x = %v, K = %v. 
Got %v, want %v", test.x, test.k, cdf, test.want) + } + } +} + +func TestChiSquared(t *testing.T) { + src := rand.New(rand.NewSource(1)) + for i, b := range []ChiSquared{ + {3, src}, + {1.5, src}, + {0.9, src}, + } { + testChiSquared(t, b, i) + } +} + +func testChiSquared(t *testing.T, c ChiSquared, i int) { + tol := 1e-2 + const n = 2e6 + const bins = 50 + x := make([]float64, n) + generateSamples(x, c) + sort.Float64s(x) + + testRandLogProbContinuous(t, i, 0, x, c, tol, bins) + checkMean(t, i, x, c, tol) + checkVarAndStd(t, i, x, c, tol) + checkExKurtosis(t, i, x, c, 5e-2) + checkProbContinuous(t, i, x, c, 1e-3) + checkQuantileCDFSurvival(t, i, x, c, 1e-3) +} diff --git a/stat/distuv/constants.go b/stat/distuv/constants.go new file mode 100644 index 00000000..9de7d4dc --- /dev/null +++ b/stat/distuv/constants.go @@ -0,0 +1,24 @@ +// Copyright ©2014 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +const ( + // oneOverRoot2Pi is the value of 1/(2Pi)^(1/2) + // http://www.wolframalpha.com/input/?i=1%2F%282+*+pi%29%5E%281%2F2%29 + oneOverRoot2Pi = 0.39894228040143267793994605993438186847585863116493465766592582967065792589930183850125233390730693643030255886263518268 + + //LogRoot2Pi is the value of log(sqrt(2*Pi)) + logRoot2Pi = 0.91893853320467274178032973640561763986139747363778341281715154048276569592726039769474329863595419762200564662463433744 + negLogRoot2Pi = -logRoot2Pi + log2Pi = 1.8378770664093454835606594728112352797227949472755668 + ln2 = 0.69314718055994530941723212145817656807550013436025525412068000949339362196969471560586332699641868754200148102057068573368552023 + + // Euler–Mascheroni constant. + eulerGamma = 0.5772156649015328606065120900824024310421593359399235988057672348848677267776646709369470632917467495146314472498070824809605 +) + +const ( + panicNameMismatch = "parameter name mismatch" +) diff --git a/stat/distuv/distribution_test.go b/stat/distuv/distribution_test.go new file mode 100644 index 00000000..2def59b3 --- /dev/null +++ b/stat/distuv/distribution_test.go @@ -0,0 +1,293 @@ +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package distuv + +import ( + "math" + "sort" + "testing" + + "github.com/gonum/floats" + "github.com/gonum/integrate/quad" + "github.com/gonum/stat" +) + +type meaner interface { + Mean() float64 +} + +type quantiler interface { + Quantile(float64) float64 +} + +type medianer interface { + quantiler + Median() float64 +} + +type varStder interface { + StdDev() float64 + Variance() float64 +} + +type entropyer interface { + LogProber + Entropy() float64 +} + +type exKurtosiser interface { + ExKurtosis() float64 + Mean() float64 +} + +type skewnesser interface { + StdDev() float64 + Mean() float64 + Skewness() float64 +} + +type cumulanter interface { + Quantiler + CDF(x float64) float64 + Survival(x float64) float64 +} + +func generateSamples(x []float64, r Rander) { + for i := range x { + x[i] = r.Rand() + } +} + +type probLogprober interface { + Prob(x float64) float64 + LogProb(x float64) float64 +} + +type cumulantProber interface { + cumulanter + probLogprober +} + +func checkMean(t *testing.T, i int, x []float64, m meaner, tol float64) { + mean := stat.Mean(x, nil) + if !floats.EqualWithinAbsOrRel(mean, m.Mean(), tol, tol) { + t.Errorf("Mean mismatch case %v: want: %v, got: %v", i, mean, m.Mean()) + } +} + +func checkMedian(t *testing.T, i int, x []float64, m medianer, tol float64) { + median := stat.Quantile(0.5, stat.Empirical, x, nil) + if !floats.EqualWithinAbsOrRel(median, m.Median(), tol, tol) { + t.Errorf("Median mismatch case %v: want: %v, got: %v", i, median, m.Median()) + } +} + +func checkVarAndStd(t *testing.T, i int, x []float64, v varStder, tol float64) { + variance := stat.Variance(x, nil) + if !floats.EqualWithinAbsOrRel(variance, v.Variance(), tol, tol) { + t.Errorf("Variance mismatch case %v: want: %v, got: %v", i, variance, v.Variance()) + } + std := math.Sqrt(variance) + if !floats.EqualWithinAbsOrRel(std, v.StdDev(), tol, tol) { + t.Errorf("StdDev mismatch case %v: want: %v, got: %v", i, std, v.StdDev()) + } +} + +func checkEntropy(t *testing.T, i int, x []float64, e entropyer, tol float64) { + tmp := make([]float64, len(x)) + for i, v := range x { + tmp[i] = -e.LogProb(v) + } + entropy := stat.Mean(tmp, nil) + if !floats.EqualWithinAbsOrRel(entropy, e.Entropy(), tol, tol) { + t.Errorf("Entropy mismatch case %v: want: %v, got: %v", i, entropy, e.Entropy()) + } +} + +func checkExKurtosis(t *testing.T, i int, x []float64, e exKurtosiser, tol float64) { + mean := e.Mean() + tmp := make([]float64, len(x)) + for i, x := range x { + tmp[i] = math.Pow(x-mean, 4) + } + variance := stat.Variance(x, nil) + mu4 := stat.Mean(tmp, nil) + kurtosis := mu4/(variance*variance) - 3 + if !floats.EqualWithinAbsOrRel(kurtosis, e.ExKurtosis(), tol, tol) { + t.Errorf("ExKurtosis mismatch case %v: want: %v, got: %v", i, kurtosis, e.ExKurtosis()) + } +} + +func checkSkewness(t *testing.T, i int, x []float64, s skewnesser, tol float64) { + mean := s.Mean() + std := s.StdDev() + tmp := make([]float64, len(x)) + for i, v := range x { + tmp[i] = math.Pow(v-mean, 3) + } + mu3 := stat.Mean(tmp, nil) + skewness := mu3 / math.Pow(std, 3) + if !floats.EqualWithinAbsOrRel(skewness, s.Skewness(), tol, tol) { + t.Errorf("Skewness mismatch case %v: want: %v, got: %v", i, skewness, s.Skewness()) + } +} + +func checkQuantileCDFSurvival(t *testing.T, i int, xs []float64, c cumulanter, tol float64) { + // Quantile, CDF, and survival check. 
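	// For each probability p below, Quantile and CDF must be mutual
	// inverses, the analytic CDF must agree with the empirical CDF of
	// the samples, and Survival must equal 1-CDF at the same point.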
+ for i, p := range []float64{0.1, 0.25, 0.5, 0.75, 0.9} { + x := c.Quantile(p) + cdf := c.CDF(x) + estCDF := stat.CDF(x, stat.Empirical, xs, nil) + if !floats.EqualWithinAbsOrRel(cdf, estCDF, tol, tol) { + t.Errorf("CDF mismatch case %v: want: %v, got: %v", i, estCDF, cdf) + } + if !floats.EqualWithinAbsOrRel(cdf, p, tol, tol) { + t.Errorf("Quantile/CDF mismatch case %v: want: %v, got: %v", i, p, cdf) + } + if math.Abs(1-cdf-c.Survival(x)) > 1e-14 { + t.Errorf("Survival/CDF mismatch case %v: want: %v, got: %v", i, 1-cdf, c.Survival(x)) + } + } +} + +func checkProbContinuous(t *testing.T, i int, x []float64, p probLogprober, tol float64) { + // Check that the PDF is consistent (integrates to 1). + q := quad.Fixed(p.Prob, math.Inf(-1), math.Inf(1), 1000000, nil, 0) + if math.Abs(q-1) > tol { + t.Errorf("Probability distribution doesn't integrate to 1. Case %v: Got %v", i, q) + } + + // Check that PDF and LogPDF are consistent. + for i, v := range x { + if math.Abs(math.Log(p.Prob(v))-p.LogProb(v)) > 1e-14 { + t.Errorf("Prob and LogProb mismatch case %v at %v: want %v, got %v", i, v, math.Log(v), p.LogProb(v)) + break + } + } +} + +// checkProbQuantContinuous checks that the Prob, Rand, and Quantile are all consistent. +// checkProbContinuous only checks that Prob is a valid distribution (integrates +// to 1 and greater than 0). However, this is also true if the PDF of a different +// distribution is used. This checks that PDF is also consistent with the +// CDF implementation and the random samples. +func checkProbQuantContinuous(t *testing.T, i int, xs []float64, c cumulantProber, tol float64) { + ps := make([]float64, 101) + floats.Span(ps, 0, 1) + + var xp, x float64 + for i, p := range ps { + x = c.Quantile(p) + if p == 0 { + xp = x + if floats.Min(xs) < x { + t.Errorf("Sample of x less than Quantile(0). Case %v.", i) + break + } + continue + } + if p == 1 { + if floats.Max(xs) > x { + t.Errorf("Sample of x greater than Quantile(1). Case %v.", i) + break + } + } + + // The integral of the PDF between xp and x should be the difference in + // the quantiles. + q := quad.Fixed(c.Prob, xp, x, 1000, nil, 0) + if math.Abs(q-(p-ps[i-1])) > 1e-5 { + t.Errorf("Integral of PDF doesn't match quantile. Case %v. Want %v, got %v.", i, p-ps[i-1], q) + break + } + + pEst := stat.CDF(x, stat.Empirical, xs, nil) + if math.Abs(pEst-p) > tol { + t.Errorf("Empirical CDF doesn't match quantile. Case %v.", i) + } + xp = x + } +} + +// checkProbDiscrete confirms that PDF and Rand are consistent for discrete distributions. +func checkProbDiscrete(t *testing.T, i int, xs []float64, p probLogprober, tol float64) { + // Make a map of all of the unique samples. + m := make(map[float64]int) + for _, v := range xs { + m[v]++ + } + for x, count := range m { + prob := float64(count) / float64(len(xs)) + if math.Abs(prob-p.Prob(x)) > tol { + t.Errorf("PDF mismatch case %v at %v: want %v, got %v", i, x, prob, p.Prob(x)) + } + if math.Abs(math.Log(p.Prob(x))-p.LogProb(x)) > 1e-14 { + t.Errorf("Prob and LogProb mismatch case %v at %v: want %v, got %v", i, x, math.Log(x), p.LogProb(x)) + } + } +} + +// dist is a type that implements the standard set of routines. 
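+// It is exercised by testFullDist, which checks every method against
+// empirical estimates computed from the distribution's own samples.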
+type fullDist interface { + CDF(x float64) float64 + Entropy() float64 + ExKurtosis() float64 + LogProb(x float64) float64 + Mean() float64 + Median() float64 + NumParameters() int + Prob(x float64) float64 + Quantile(p float64) float64 + Rand() float64 + Skewness() float64 + StdDev() float64 + Survival(x float64) float64 + Variance() float64 +} + +// testFullDist tests all of the functions of a fullDist. +func testFullDist(t *testing.T, f fullDist, i int, continuous bool) { + tol := 1e-2 + const n = 1e6 + x := make([]float64, n) + generateSamples(x, f) + sort.Float64s(x) + + checkMean(t, i, x, f, tol) + checkVarAndStd(t, i, x, f, tol) + checkEntropy(t, i, x, f, tol) + checkExKurtosis(t, i, x, f, tol) + checkSkewness(t, i, x, f, tol) + if continuous { + // In a discrete distribution, the median may not have positive probability. + checkMedian(t, i, x, f, tol) + // In a discrete distribution, the CDF and Quantile may not be perfect mappings. + checkQuantileCDFSurvival(t, i, x, f, tol) + // Integrate over the PDF + checkProbContinuous(t, i, x, f, 1e-10) + checkProbQuantContinuous(t, i, x, f, tol) + } else { + // Check against empirical PDF. + checkProbDiscrete(t, i, x, f, tol) + } +} + +// testRandLogProb tests that LogProb and Rand give consistent results. This +// can be used when the distribution does not implement CDF. +func testRandLogProbContinuous(t *testing.T, i int, min float64, x []float64, f LogProber, tol float64, bins int) { + for cdf := 1 / float64(bins); cdf <= 1-1/float64(bins); cdf += 1 / float64(bins) { + // Get the estimated CDF from the samples + pt := stat.Quantile(cdf, stat.Empirical, x, nil) + + prob := func(x float64) float64 { + return math.Exp(f.LogProb(x)) + } + // Integrate the PDF to find the CDF + estCDF := quad.Fixed(prob, min, pt, 1000, nil, 0) + if !floats.EqualWithinAbsOrRel(cdf, estCDF, tol, tol) { + t.Errorf("Mismatch between integral of PDF and empirical CDF. Case %v. Want %v, got %v", i, cdf, estCDF) + } + } +} diff --git a/stat/distuv/exponential.go b/stat/distuv/exponential.go new file mode 100644 index 00000000..682caa9e --- /dev/null +++ b/stat/distuv/exponential.go @@ -0,0 +1,259 @@ +// Copyright ©2014 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand" + + "github.com/gonum/floats" + "github.com/gonum/stat" +) + +// Exponential represents the exponential distribution (https://en.wikipedia.org/wiki/Exponential_distribution). +type Exponential struct { + Rate float64 + Source *rand.Rand +} + +// CDF computes the value of the cumulative density function at x. +func (e Exponential) CDF(x float64) float64 { + if x < 0 { + return 0 + } + return 1 - math.Exp(-e.Rate*x) +} + +// ConjugateUpdate updates the parameters of the distribution from the sufficient +// statistics of a set of samples. The sufficient statistics, suffStat, have been +// observed with nSamples observations. The prior values of the distribution are those +// currently in the distribution, and have been observed with priorStrength samples. +// +// For the exponential distribution, the sufficient statistic is the inverse of +// the mean of the samples. +// The prior is having seen priorStrength[0] samples with inverse mean Exponential.Rate +// As a result of this function, Exponential.Rate is updated based on the weighted +// samples, and priorStrength is modified to include the new number of samples observed. 
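+//
+// For example (values chosen only for illustration): with e.Rate = 2 and
+// priorStrength[0] = 10, a call with nSamples = 10 and suffStat[0] = 1
+// (new samples with mean 1) gives a total sum of 10/1 + 10/2 = 15, so the
+// updated rate is (10+10)/15 = 4/3 and priorStrength[0] becomes 20.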
+// +// This function panics if len(suffStat) != 1 or len(priorStrength) != 1. +func (e *Exponential) ConjugateUpdate(suffStat []float64, nSamples float64, priorStrength []float64) { + if len(suffStat) != 1 { + panic("exponential: incorrect suffStat length") + } + if len(priorStrength) != 1 { + panic("exponential: incorrect priorStrength length") + } + + totalSamples := nSamples + priorStrength[0] + + totalSum := nSamples / suffStat[0] + if !(priorStrength[0] == 0) { + totalSum += priorStrength[0] / e.Rate + } + e.Rate = totalSamples / totalSum + priorStrength[0] = totalSamples +} + +// Entropy returns the entropy of the distribution. +func (e Exponential) Entropy() float64 { + return 1 - math.Log(e.Rate) +} + +// ExKurtosis returns the excess kurtosis of the distribution. +func (Exponential) ExKurtosis() float64 { + return 6 +} + +// Fit sets the parameters of the probability distribution from the +// data samples x with relative weights w. +// If weights is nil, then all the weights are 1. +// If weights is not nil, then the len(weights) must equal len(samples). +func (e *Exponential) Fit(samples, weights []float64) { + suffStat := make([]float64, e.NumSuffStat()) + nSamples := e.SuffStat(samples, weights, suffStat) + e.ConjugateUpdate(suffStat, nSamples, make([]float64, e.NumSuffStat())) +} + +// LogProb computes the natural logarithm of the value of the probability density function at x. +func (e Exponential) LogProb(x float64) float64 { + if x < 0 { + return math.Inf(-1) + } + return math.Log(e.Rate) - e.Rate*x +} + +// Mean returns the mean of the probability distribution. +func (e Exponential) Mean() float64 { + return 1 / e.Rate +} + +// Median returns the median of the probability distribution. +func (e Exponential) Median() float64 { + return math.Ln2 / e.Rate +} + +// Mode returns the mode of the probability distribution. +func (Exponential) Mode() float64 { + return 0 +} + +// NumParameters returns the number of parameters in the distribution. +func (Exponential) NumParameters() int { + return 1 +} + +// NumSuffStat returns the number of sufficient statistics for the distribution. +func (Exponential) NumSuffStat() int { + return 1 +} + +// Prob computes the value of the probability density function at x. +func (e Exponential) Prob(x float64) float64 { + return math.Exp(e.LogProb(x)) +} + +// Quantile returns the inverse of the cumulative probability distribution. +func (e Exponential) Quantile(p float64) float64 { + if p < 0 || p > 1 { + panic(badPercentile) + } + return -math.Log(1-p) / e.Rate +} + +// Rand returns a random sample drawn from the distribution. +func (e Exponential) Rand() float64 { + var rnd float64 + if e.Source == nil { + rnd = rand.ExpFloat64() + } else { + rnd = e.Source.ExpFloat64() + } + return rnd / e.Rate +} + +// Score returns the score function with respect to the parameters of the +// distribution at the input location x. The score function is the derivative +// of the log-likelihood at x with respect to the parameters +// (∂/∂θ) log(p(x;θ)) +// If deriv is non-nil, len(deriv) must equal the number of parameters otherwise +// Score will panic, and the derivative is stored in-place into deriv. If deriv +// is nil a new slice will be allocated and returned. +// +// The order is [∂LogProb / ∂Rate]. +// +// For more information, see https://en.wikipedia.org/wiki/Score_%28statistics%29. 
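+//
+// For example (an illustrative value): with Rate = 2 the log-likelihood is
+// log(2) - 2x, so its derivative with respect to Rate at x = 3 is
+// 1/2 - 3 = -2.5, and Score(nil, 3) returns [-2.5].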
+// +// Special cases: +// Score(0) = [NaN] +func (e Exponential) Score(deriv []float64, x float64) []float64 { + if deriv == nil { + deriv = make([]float64, e.NumParameters()) + } + if len(deriv) != e.NumParameters() { + panic(badLength) + } + if x > 0 { + deriv[0] = 1/e.Rate - x + return deriv + } + if x < 0 { + deriv[0] = 0 + return deriv + } + deriv[0] = math.NaN() + return deriv +} + +// ScoreInput returns the score function with respect to the input of the +// distribution at the input location specified by x. The score function is the +// derivative of the log-likelihood +// (d/dx) log(p(x)) . +// Special cases: +// ScoreInput(0) = NaN +func (e Exponential) ScoreInput(x float64) float64 { + if x > 0 { + return -e.Rate + } + if x < 0 { + return 0 + } + return math.NaN() +} + +// Skewness returns the skewness of the distribution. +func (Exponential) Skewness() float64 { + return 2 +} + +// StdDev returns the standard deviation of the probability distribution. +func (e Exponential) StdDev() float64 { + return 1 / e.Rate +} + +// SuffStat computes the sufficient statistics of set of samples to update +// the distribution. The sufficient statistics are stored in place, and the +// effective number of samples are returned. +// +// The exponential distribution has one sufficient statistic, the average rate +// of the samples. +// +// If weights is nil, the weights are assumed to be 1, otherwise panics if +// len(samples) != len(weights). Panics if len(suffStat) != NumSuffStat(). +func (Exponential) SuffStat(samples, weights, suffStat []float64) (nSamples float64) { + if len(weights) != 0 && len(samples) != len(weights) { + panic(badLength) + } + + if len(suffStat) != (Exponential{}).NumSuffStat() { + panic(badSuffStat) + } + + if len(weights) == 0 { + nSamples = float64(len(samples)) + } else { + nSamples = floats.Sum(weights) + } + + mean := stat.Mean(samples, weights) + suffStat[0] = 1 / mean + return nSamples +} + +// Survival returns the survival function (complementary CDF) at x. +func (e Exponential) Survival(x float64) float64 { + if x < 0 { + return 1 + } + return math.Exp(-e.Rate * x) +} + +// setParameters modifies the parameters of the distribution. +func (e *Exponential) setParameters(p []Parameter) { + if len(p) != e.NumParameters() { + panic("exponential: incorrect number of parameters to set") + } + if p[0].Name != "Rate" { + panic("exponential: " + panicNameMismatch) + } + e.Rate = p[0].Value +} + +// Variance returns the variance of the probability distribution. +func (e Exponential) Variance() float64 { + return 1 / (e.Rate * e.Rate) +} + +// parameters returns the parameters of the distribution. +func (e Exponential) parameters(p []Parameter) []Parameter { + nParam := e.NumParameters() + if p == nil { + p = make([]Parameter, nParam) + } else if len(p) != nParam { + panic("exponential: improper parameter length") + } + p[0].Name = "Rate" + p[0].Value = e.Rate + return p +} diff --git a/stat/distuv/exponential_test.go b/stat/distuv/exponential_test.go new file mode 100644 index 00000000..9c389702 --- /dev/null +++ b/stat/distuv/exponential_test.go @@ -0,0 +1,71 @@ +// Copyright ©2014 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package distuv + +import ( + "math" + "testing" +) + +func TestExponentialProb(t *testing.T) { + pts := []univariateProbPoint{ + { + loc: 0, + prob: 1, + cumProb: 0, + logProb: 0, + }, + { + loc: -1, + prob: 0, + cumProb: 0, + logProb: math.Inf(-1), + }, + { + loc: 1, + prob: 1 / (math.E), + cumProb: 0.6321205588285576784044762298385391325541888689682321654921631983025385042551001966428527256540803563, + logProb: -1, + }, + { + loc: 20, + prob: math.Exp(-20), + cumProb: 0.999999997938846377561442172034059619844179023624192724400896307027755338370835976215440646720089072, + logProb: -20, + }, + } + testDistributionProbs(t, Exponential{Rate: 1}, "Exponential", pts) +} + +func TestExponentialFitPrior(t *testing.T) { + testConjugateUpdate(t, func() ConjugateUpdater { return &Exponential{Rate: 13.7} }) +} + +func TestExponentialScore(t *testing.T) { + for _, test := range []*Exponential{ + { + Rate: 1, + }, + { + Rate: 0.35, + }, + { + Rate: 4.6, + }, + } { + testDerivParam(t, test) + } +} + +func TestExponentialFitPanic(t *testing.T) { + e := Exponential{Rate: 2} + defer func() { + r := recover() + if r != nil { + t.Errorf("unexpected panic for Fit call: %v", r) + } + }() + e.Fit(make([]float64, 10), nil) +} diff --git a/stat/distuv/f.go b/stat/distuv/f.go new file mode 100644 index 00000000..8b4537c3 --- /dev/null +++ b/stat/distuv/f.go @@ -0,0 +1,132 @@ +// Copyright ©2017 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand" + + "github.com/gonum/mathext" +) + +// F implements the F-distribution, a two-parameter continuous distribution +// with support over the positive real numbers. +// +// The F-distribution has density function +// sqrt(((d1*x)^d1) * d2^d2 / ((d1*x+d2)^(d1+d2))) / (x * B(d1/2,d2/2)) +// where B is the beta function. +// +// For more information, see https://en.wikipedia.org/wiki/F-distribution +type F struct { + D1 float64 // Degrees of freedom for the numerator + D2 float64 // Degrees of freedom for the denominator + Source *rand.Rand +} + +// CDF computes the value of the cumulative density function at x. +func (f F) CDF(x float64) float64 { + return mathext.RegIncBeta(f.D1/2, f.D2/2, f.D1*x/(f.D1*x+f.D2)) +} + +// ExKurtosis returns the excess kurtosis of the distribution. +// +// ExKurtosis returns NaN if the D2 parameter is less or equal to 8. +func (f F) ExKurtosis() float64 { + if f.D2 <= 8 { + return math.NaN() + } + return (12 / (f.D2 - 6)) * ((5*f.D2-22)/(f.D2-8) + ((f.D2-4)/f.D1)*((f.D2-2)/(f.D2-8))*((f.D2-2)/(f.D1+f.D2-2))) +} + +// LogProb computes the natural logarithm of the value of the probability +// density function at x. +func (f F) LogProb(x float64) float64 { + return 0.5*(f.D1*math.Log(f.D1*x)+f.D2*math.Log(f.D2)-(f.D1+f.D2)*math.Log(f.D1*x+f.D2)) - math.Log(x) - mathext.Lbeta(f.D1/2, f.D2/2) +} + +// Mean returns the mean of the probability distribution. +// +// Mean returns NaN if the D2 parameter is less than or equal to 2. +func (f F) Mean() float64 { + if f.D2 <= 2 { + return math.NaN() + } + return f.D2 / (f.D2 - 2) +} + +// Mode returns the mode of the distribution. +// +// Mode returns NaN if the D1 parameter is less than or equal to 2. +func (f F) Mode() float64 { + if f.D1 <= 2 { + return math.NaN() + } + return ((f.D1 - 2) / f.D1) * (f.D2 / (f.D2 + 2)) +} + +// NumParameters returns the number of parameters in the distribution. 
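+// For the F-distribution these are the two degrees of freedom, D1 and D2.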
+func (f F) NumParameters() int { + return 2 +} + +// Prob computes the value of the probability density function at x. +func (f F) Prob(x float64) float64 { + return math.Exp(f.LogProb(x)) +} + +// Quantile returns the inverse of the cumulative distribution function. +func (f F) Quantile(p float64) float64 { + if p < 0 || p > 1 { + panic(badPercentile) + } + y := mathext.InvRegIncBeta(0.5*f.D1, 0.5*f.D2, p) + return f.D2 * y / (f.D1 * (1 - y)) +} + +// Rand returns a random sample drawn from the distribution. +func (f F) Rand() float64 { + u1 := ChiSquared{f.D1, f.Source}.Rand() + u2 := ChiSquared{f.D2, f.Source}.Rand() + return (u1 / f.D1) / (u2 / f.D2) +} + +// Skewness returns the skewness of the distribution. +// +// Skewness returns NaN if the D2 parameter is less than or equal to 6. +func (f F) Skewness() float64 { + if f.D2 <= 6 { + return math.NaN() + } + num := (2*f.D1 + f.D2 - 2) * math.Sqrt(8*(f.D2-4)) + den := (f.D2 - 6) * math.Sqrt(f.D1*(f.D1+f.D2-2)) + return num / den +} + +// StdDev returns the standard deviation of the probability distribution. +// +// StdDev returns NaN if the D2 parameter is less than or equal to 4. +func (f F) StdDev() float64 { + if f.D2 <= 4 { + return math.NaN() + } + return math.Sqrt(f.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (f F) Survival(x float64) float64 { + return 1 - f.CDF(x) +} + +// Variance returns the variance of the probability distribution. +// +// Variance returns NaN if the D2 parameter is less than or equal to 4. +func (f F) Variance() float64 { + if f.D2 <= 4 { + return math.NaN() + } + num := 2 * f.D2 * f.D2 * (f.D1 + f.D2 - 2) + den := f.D1 * (f.D2 - 2) * (f.D2 - 2) * (f.D2 - 4) + return num / den +} diff --git a/stat/distuv/f_test.go b/stat/distuv/f_test.go new file mode 100644 index 00000000..256cc029 --- /dev/null +++ b/stat/distuv/f_test.go @@ -0,0 +1,89 @@ +// Copyright ©2017 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math/rand" + "sort" + "testing" + + "github.com/gonum/floats" +) + +func TestFProb(t *testing.T) { + for _, test := range []struct { + x, d1, d2, want float64 + }{ + // Values calculated with scipy.stats.f + {0.0001, 4, 6, 0.00053315559110558126}, + {0.1, 1, 1, 0.91507658371794609}, + {0.5, 11, 7, 0.66644660411410883}, + {0.9, 20, 15, 0.88293424959522437}, + {1, 1, 1, 0.15915494309189535}, + {2, 15, 12, 0.16611971273429088}, + {5, 4, 8, 0.013599775603702537}, + {10, 12, 9, 0.00032922887567957289}, + {100, 7, 7, 6.08037637806889e-08}, + {1000, 2, 1, 1.1171959870312232e-05}, + } { + pdf := F{test.d1, test.d2, nil}.Prob(test.x) + if !floats.EqualWithinAbsOrRel(pdf, test.want, 1e-10, 1e-10) { + t.Errorf("Prob mismatch, x = %v, d1 = %v, d2 = %v. 
Got %v, want %v", test.x, test.d1, test.d2, pdf, test.want) + } + } +} + +func TestFCDF(t *testing.T) { + for _, test := range []struct { + x, d1, d2, want float64 + }{ + // Values calculated with scipy.stats.f + {0.0001, 4, 6, 2.6660741629519019e-08}, + {0.1, 1, 1, 0.19498222904213672}, + {0.5, 11, 7, 0.14625028471336987}, + {0.9, 20, 15, 0.40567939897287852}, + {1, 1, 1, 0.50000000000000011}, + {2, 15, 12, 0.8839384428956264}, + {5, 4, 8, 0.97429642410900219}, + {10, 12, 9, 0.99915733385467187}, + {100, 7, 7, 0.99999823560259171}, + {1000, 2, 1, 0.97764490829950534}, + } { + cdf := F{test.d1, test.d2, nil}.CDF(test.x) + if !floats.EqualWithinAbsOrRel(cdf, test.want, 1e-10, 1e-10) { + t.Errorf("CDF mismatch, x = %v, d1 = %v, d2 = %v. Got %v, want %v", test.x, test.d1, test.d2, cdf, test.want) + } + } +} + +func TestF(t *testing.T) { + src := rand.New(rand.NewSource(1)) + for i, b := range []F{ + {13, 16, src}, + {42, 31, src}, + {77, 92, src}, + } { + testF(t, b, i) + } +} + +func testF(t *testing.T, f F, i int) { + const ( + tol = 1e-2 + n = 2e6 + bins = 50 + ) + x := make([]float64, n) + generateSamples(x, f) + sort.Float64s(x) + + testRandLogProbContinuous(t, i, 0, x, f, tol, bins) + checkProbContinuous(t, i, x, f, 1e-3) + checkMean(t, i, x, f, tol) + checkVarAndStd(t, i, x, f, tol) + checkExKurtosis(t, i, x, f, 5e-2) + checkSkewness(t, i, x, f, tol) + checkQuantileCDFSurvival(t, i, x, f, 1e-3) +} diff --git a/stat/distuv/gamma.go b/stat/distuv/gamma.go new file mode 100644 index 00000000..4c96a700 --- /dev/null +++ b/stat/distuv/gamma.go @@ -0,0 +1,244 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand" + + "github.com/gonum/mathext" +) + +// Gamma implements the Gamma distribution, a two-parameter continuous distribution +// with support over the positive real numbers. +// +// The gamma distribution has density function +// β^α / Γ(α) x^(α-1)e^(-βx) +// +// For more information, see https://en.wikipedia.org/wiki/Gamma_distribution +type Gamma struct { + // Alpha is the shape parameter of the distribution. Alpha must be greater + // than 0. If Alpha == 1, this is equivalent to an exponential distribution. + Alpha float64 + // Beta is the rate parameter of the distribution. Beta must be greater than 0. + // If Beta == 2, this is equivalent to a Chi-Squared distribution. + Beta float64 + + Source *rand.Rand +} + +// CDF computes the value of the cumulative distribution function at x. +func (g Gamma) CDF(x float64) float64 { + if x < 0 { + return 0 + } + return mathext.GammaInc(g.Alpha, g.Beta*x) +} + +// ExKurtosis returns the excess kurtosis of the distribution. +func (g Gamma) ExKurtosis() float64 { + return 6 / g.Alpha +} + +// LogProb computes the natural logarithm of the value of the probability +// density function at x. +func (g Gamma) LogProb(x float64) float64 { + if x <= 0 { + return math.Inf(-1) + } + a := g.Alpha + b := g.Beta + lg, _ := math.Lgamma(a) + return a*math.Log(b) - lg + (a-1)*math.Log(x) - b*x +} + +// Mean returns the mean of the probability distribution. +func (g Gamma) Mean() float64 { + return g.Alpha / g.Beta +} + +// Mode returns the mode of the normal distribution. +// +// The mode is NaN in the special case where the Alpha (shape) parameter +// is less than 1. 
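+// Otherwise the mode is (Alpha-1)/Beta.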
diff --git a/stat/distuv/gamma.go b/stat/distuv/gamma.go
new file mode 100644
index 00000000..4c96a700
--- /dev/null
+++ b/stat/distuv/gamma.go
@@ -0,0 +1,244 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math"
+	"math/rand"
+
+	"github.com/gonum/mathext"
+)
+
+// Gamma implements the Gamma distribution, a two-parameter continuous distribution
+// with support over the positive real numbers.
+//
+// The gamma distribution has density function
+//  β^α / Γ(α) x^(α-1)e^(-βx)
+//
+// For more information, see https://en.wikipedia.org/wiki/Gamma_distribution
+type Gamma struct {
+	// Alpha is the shape parameter of the distribution. Alpha must be greater
+	// than 0. If Alpha == 1, this is equivalent to an exponential distribution.
+	Alpha float64
+	// Beta is the rate parameter of the distribution. Beta must be greater than 0.
+	// If Beta == 0.5, this is equivalent to a Chi-Squared distribution with 2*Alpha degrees of freedom.
+	Beta float64
+
+	Source *rand.Rand
+}
+
+// CDF computes the value of the cumulative distribution function at x.
+func (g Gamma) CDF(x float64) float64 {
+	if x < 0 {
+		return 0
+	}
+	return mathext.GammaInc(g.Alpha, g.Beta*x)
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (g Gamma) ExKurtosis() float64 {
+	return 6 / g.Alpha
+}
+
+// LogProb computes the natural logarithm of the value of the probability
+// density function at x.
+func (g Gamma) LogProb(x float64) float64 {
+	if x <= 0 {
+		return math.Inf(-1)
+	}
+	a := g.Alpha
+	b := g.Beta
+	lg, _ := math.Lgamma(a)
+	return a*math.Log(b) - lg + (a-1)*math.Log(x) - b*x
+}
+
+// Mean returns the mean of the probability distribution.
+func (g Gamma) Mean() float64 {
+	return g.Alpha / g.Beta
+}
+
+// Mode returns the mode of the gamma distribution.
+//
+// The mode is NaN in the special case where the Alpha (shape) parameter
+// is less than 1.
+func (g Gamma) Mode() float64 {
+	if g.Alpha < 1 {
+		return math.NaN()
+	}
+	return (g.Alpha - 1) / g.Beta
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (Gamma) NumParameters() int {
+	return 2
+}
+
+// Prob computes the value of the probability density function at x.
+func (g Gamma) Prob(x float64) float64 {
+	return math.Exp(g.LogProb(x))
+}
+
+// Quantile returns the inverse of the cumulative distribution function.
+func (g Gamma) Quantile(p float64) float64 {
+	if p < 0 || p > 1 {
+		panic(badPercentile)
+	}
+	return mathext.GammaIncInv(g.Alpha, p) / g.Beta
+}
+
+// Rand returns a random sample drawn from the distribution.
+//
+// Rand panics if either Alpha or Beta is <= 0.
+func (g Gamma) Rand() float64 {
+	if g.Beta <= 0 {
+		panic("gamma: beta <= 0")
+	}
+
+	unifrnd := rand.Float64
+	exprnd := rand.ExpFloat64
+	normrnd := rand.NormFloat64
+	if g.Source != nil {
+		unifrnd = g.Source.Float64
+		exprnd = g.Source.ExpFloat64
+		normrnd = g.Source.NormFloat64
+	}
+
+	a := g.Alpha
+	b := g.Beta
+	switch {
+	case a <= 0:
+		panic("gamma: alpha <= 0")
+	case a == 1:
+		// Generate from exponential
+		return exprnd() / b
+	case a < 0.3:
+		// Generate using:
+		//  Liu, Chuanhai, Martin, Ryan and Syring, Nick. "Simulating from a
+		//  gamma distribution with small shape parameter."
+		//  https://arxiv.org/abs/1302.1884
+		//  (published version: http://link.springer.com/article/10.1007/s00180-016-0692-0)
+
+		// Algorithm adjusted to work in log space as much as possible.
+		lambda := 1/a - 1
+		lw := math.Log(a) - 1 - math.Log(1-a)
+		lr := -math.Log(1 + math.Exp(lw))
+		lc, _ := math.Lgamma(a + 1)
+		for {
+			e := exprnd()
+			var z float64
+			if e >= -lr {
+				z = e + lr
+			} else {
+				z = -exprnd() / lambda
+			}
+			lh := lc - z - math.Exp(-z/a)
+			var lEta float64
+			if z >= 0 {
+				lEta = lc - z
+			} else {
+				lEta = lc + lw + math.Log(lambda) + lambda*z
+			}
+			if lh-lEta > -exprnd() {
+				return math.Exp(-z/a) / b
+			}
+		}
+	case a >= 0.3 && a < 1:
+		// Generate using:
+		//  Kundu, Debasis, and Rameshwar D. Gupta. "A convenient way of generating
+		//  gamma random variables using generalized exponential distribution."
+		//  Computational Statistics & Data Analysis 51.6 (2007): 2796-2802.
+
+		// TODO(btracey): Change to using Algorithm 3 if we can find the bug in
+		// the implementation below.
+
+		// Algorithm 2.
+		alpha := g.Alpha
+		a := math.Pow(1-expNegOneHalf, alpha) / (math.Pow(1-expNegOneHalf, alpha) + alpha*math.Exp(-1)/math.Pow(2, alpha))
+		b := math.Pow(1-expNegOneHalf, alpha) + alpha/math.E/math.Pow(2, alpha)
+		var x float64
+		for {
+			u := unifrnd()
+			if u <= a {
+				x = -2 * math.Log(1-math.Pow(u*b, 1/alpha))
+			} else {
+				x = -math.Log(math.Pow(2, alpha) / alpha * b * (1 - u))
+			}
+			v := unifrnd()
+			if x <= 1 {
+				if v <= math.Pow(x, alpha-1)*math.Exp(-x/2)/(math.Pow(2, alpha-1)*math.Pow(1-math.Exp(-x/2), alpha-1)) {
+					break
+				}
+			} else {
+				if v <= math.Pow(x, alpha-1) {
+					break
+				}
+			}
+		}
+		return x / g.Beta
+
+		/*
+			// Algorithm 3.
+			d := 1.0334 - 0.0766*math.Exp(2.2942*alpha)
+			a := math.Pow(2, alpha) * math.Pow(1-math.Exp(-d/2), alpha)
+			b := alpha * math.Pow(d, alpha-1) * math.Exp(-d)
+			c := a + b
+			var x float64
+			for {
+				u := unifrnd()
+				if u <= a/(a+b) {
+					x = -2 * math.Log(1-math.Pow(c*u, 1/a)/2)
+				} else {
+					x = -math.Log(c * (1 - u) / (alpha * math.Pow(d, alpha-1)))
+				}
+				v := unifrnd()
+				if x <= d {
+					if v <= (math.Pow(x, alpha-1)*math.Exp(-x/2))/(math.Pow(2, alpha-1)*math.Pow(1-math.Exp(-x/2), alpha-1)) {
+						break
+					}
+				} else {
+					if v <= math.Pow(d/x, 1-alpha) {
+						break
+					}
+				}
+			}
+			return x / g.Beta
+		*/
+	case a > 1:
+		// Generate using:
+		//  Marsaglia, George, and Wai Wan Tsang. "A simple method for generating
+		//  gamma variables." ACM Transactions on Mathematical Software (TOMS)
+		//  26.3 (2000): 363-372.
+		d := a - 1.0/3
+		c := 1 / (3 * math.Sqrt(d))
+		for {
+			u := -exprnd()
+			x := normrnd()
+			v := 1 + x*c
+			v = v * v * v
+			if u < 0.5*x*x+d*(1-v+math.Log(v)) {
+				return d * v / b
+			}
+		}
+	}
+	panic("unreachable")
+}
+
+// Survival returns the survival function (complementary CDF) at x.
+func (g Gamma) Survival(x float64) float64 {
+	if x < 0 {
+		return 1
+	}
+	return mathext.GammaIncComp(g.Alpha, g.Beta*x)
+}
+
+// StdDev returns the standard deviation of the probability distribution.
+func (g Gamma) StdDev() float64 {
+	return math.Sqrt(g.Variance())
+}
+
+// Variance returns the variance of the probability distribution.
+func (g Gamma) Variance() float64 {
+	return g.Alpha / g.Beta / g.Beta
+}
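Editorial aside (not part of the patch): the a > 1 branch above is the Marsaglia–Tsang rejection sampler, with the acceptance test done against an exponential draw (-exprnd() is distributed like log U). The textbook form of the same algorithm, including the cheap squeeze step the patch omits, can be sketched standalone:

```go
package main

import (
	"fmt"
	"math"
	"math/rand"
)

// marsagliaTsang draws one Gamma(alpha, beta) variate for alpha > 1 using the
// method of Marsaglia and Tsang (TOMS 2000), with beta a rate parameter.
func marsagliaTsang(rng *rand.Rand, alpha, beta float64) float64 {
	d := alpha - 1.0/3
	c := 1 / (3 * math.Sqrt(d))
	for {
		x := rng.NormFloat64()
		v := 1 + c*x
		if v <= 0 {
			continue // proposal outside the support
		}
		v = v * v * v
		u := rng.Float64()
		// Squeeze step: accepts the vast majority of proposals cheaply.
		if u < 1-0.0331*x*x*x*x {
			return d * v / beta
		}
		// Full acceptance test in log space.
		if math.Log(u) < 0.5*x*x+d*(1-v+math.Log(v)) {
			return d * v / beta
		}
	}
}

func main() {
	rng := rand.New(rand.NewSource(1))
	var sum float64
	const n = 200000
	for i := 0; i < n; i++ {
		sum += marsagliaTsang(rng, 2.5, 1.5)
	}
	fmt.Printf("sample mean %.3f, want alpha/beta = %.3f\n", sum/n, 2.5/1.5)
}
```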
diff --git a/stat/distuv/gamma_test.go b/stat/distuv/gamma_test.go
new file mode 100644
index 00000000..9321fc42
--- /dev/null
+++ b/stat/distuv/gamma_test.go
@@ -0,0 +1,63 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math/rand"
+	"sort"
+	"testing"
+
+	"github.com/gonum/floats"
+)
+
+func TestGamma(t *testing.T) {
+	// Values computed with scipy for comparison.
+	for _, test := range []struct {
+		x, alpha, want float64
+	}{
+		{0.9, 0.1, 0.046986817861555757},
+		{0.9, 0.01, 0.0045384353289090401},
+		{0.45, 0.01, 0.014137035997241795},
+	} {
+		pdf := Gamma{Alpha: test.alpha, Beta: 1}.Prob(test.x)
+		if !floats.EqualWithinAbsOrRel(pdf, test.want, 1e-10, 1e-10) {
+			t.Errorf("Pdf mismatch. Got %v, want %v", pdf, test.want)
+		}
+	}
+	src := rand.New(rand.NewSource(1))
+	for i, g := range []Gamma{
+
+		{Alpha: 0.5, Beta: 0.8, Source: src},
+		{Alpha: 0.9, Beta: 6, Source: src},
+		{Alpha: 0.9, Beta: 500, Source: src},
+
+		{Alpha: 1, Beta: 1, Source: src},
+
+		{Alpha: 1.6, Beta: 0.4, Source: src},
+		{Alpha: 2.6, Beta: 1.5, Source: src},
+		{Alpha: 5.6, Beta: 0.5, Source: src},
+		{Alpha: 30, Beta: 1.7, Source: src},
+		{Alpha: 30.2, Beta: 1.7, Source: src},
+	} {
+		testGamma(t, g, i)
+	}
+}
+
+func testGamma(t *testing.T, g Gamma, i int) {
+	// TODO(btracey): Replace this when Gamma implements FullDist.
+	tol := 2e-3
+	const n = 1e6
+	const bins = 50
+	x := make([]float64, n)
+	generateSamples(x, g)
+	sort.Float64s(x)
+
+	testRandLogProbContinuous(t, i, 0, x, g, tol, bins)
+	checkMean(t, i, x, g, tol)
+	checkVarAndStd(t, i, x, g, 2e-2)
+	checkExKurtosis(t, i, x, g, 5e-2)
+	checkProbContinuous(t, i, x, g, 1e-3)
+	checkQuantileCDFSurvival(t, i, x, g, 1e-2)
+}
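Editorial aside (not part of the patch): testGamma drives shared helpers (generateSamples, checkMean, checkProbContinuous, ...) defined elsewhere in the package. The core idea behind a check like testRandLogProbContinuous can be sketched standalone: bin a large sample and compare each bin's empirical mass against the mass the CDF assigns to it. A minimal version under illustrative assumptions (50 bins on [0, 5], Exp(1) via the standard library):

```go
package main

import (
	"fmt"
	"math"
	"math/rand"
)

func main() {
	rng := rand.New(rand.NewSource(1))
	const (
		n    = 1000000
		bins = 50
		hi   = 5.0
	)
	counts := make([]float64, bins)
	for i := 0; i < n; i++ {
		x := rng.ExpFloat64() // Exp(1) sample
		if x < hi {
			counts[int(x/hi*bins)]++
		}
	}
	cdf := func(x float64) float64 { return 1 - math.Exp(-x) }
	var worst float64
	for b := 0; b < bins; b++ {
		lo, up := float64(b)*hi/bins, float64(b+1)*hi/bins
		want := cdf(up) - cdf(lo) // mass the distribution puts in the bin
		got := counts[b] / n      // empirical mass
		worst = math.Max(worst, math.Abs(got-want))
	}
	fmt.Printf("worst bin error: %.2e\n", worst) // should be ~1e-3 or smaller
}
```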
diff --git a/stat/distuv/general.go b/stat/distuv/general.go
new file mode 100644
index 00000000..8411c85e
--- /dev/null
+++ b/stat/distuv/general.go
@@ -0,0 +1,25 @@
+// Copyright ©2014 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package distuv provides univariate random distribution types.
+package distuv
+
+import "math"
+
+// Parameter represents a parameter of a probability distribution.
+type Parameter struct {
+	Name  string
+	Value float64
+}
+
+var (
+	badPercentile = "distuv: percentile out of bounds"
+	badLength     = "distuv: slice length mismatch"
+	badSuffStat   = "distuv: wrong suffStat length"
+	badNoSamples  = "distuv: must have at least one sample"
+	// panicNameMismatch is prefixed with the distribution name at the
+	// panic site, e.g. panic("laplace: " + panicNameMismatch).
+	panicNameMismatch = "parameter name mismatch"
+)
+
+var (
+	expNegOneHalf = math.Exp(-0.5)
+)
diff --git a/stat/distuv/general_test.go b/stat/distuv/general_test.go
new file mode 100644
index 00000000..d0da4707
--- /dev/null
+++ b/stat/distuv/general_test.go
@@ -0,0 +1,205 @@
+// Copyright ©2014 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"fmt"
+	"math"
+	"testing"
+
+	"github.com/gonum/diff/fd"
+	"github.com/gonum/floats"
+)
+
+type univariateProbPoint struct {
+	loc     float64
+	logProb float64
+	cumProb float64
+	prob    float64
+}
+
+type UniProbDist interface {
+	Prob(float64) float64
+	CDF(float64) float64
+	LogProb(float64) float64
+	Quantile(float64) float64
+	Survival(float64) float64
+}
+
+// absEq reports whether a and b agree to within 1e-14. The comparison is
+// written so that matching infinite expectations also report true.
+func absEq(a, b float64) bool {
+	if math.Abs(a-b) > 1e-14 {
+		return false
+	}
+	return true
+}
+
+// TODO: Implement a better test for Quantile
+func testDistributionProbs(t *testing.T, dist UniProbDist, name string, pts []univariateProbPoint) {
+	for _, pt := range pts {
+		logProb := dist.LogProb(pt.loc)
+		if !absEq(logProb, pt.logProb) {
+			t.Errorf("Log probability doesn't match for "+name+". Expected %v. Found %v", pt.logProb, logProb)
+		}
+		prob := dist.Prob(pt.loc)
+		if !absEq(prob, pt.prob) {
+			t.Errorf("Probability doesn't match for "+name+". Expected %v. Found %v", pt.prob, prob)
+		}
+		cumProb := dist.CDF(pt.loc)
+		if !absEq(cumProb, pt.cumProb) {
+			t.Errorf("Cumulative Probability doesn't match for "+name+". Expected %v. Found %v", pt.cumProb, cumProb)
+		}
+		if !absEq(dist.Survival(pt.loc), 1-pt.cumProb) {
+			t.Errorf("Survival doesn't match for %v. Expected %v, Found %v", name, 1-pt.cumProb, dist.Survival(pt.loc))
+		}
+		if pt.prob != 0 {
+			if math.Abs(dist.Quantile(pt.cumProb)-pt.loc) > 1e-4 {
+				fmt.Println("true =", pt.loc)
+				fmt.Println("calculated =", dist.Quantile(pt.cumProb))
+				t.Errorf("Quantile doesn't match for "+name+", loc = %v", pt.loc)
+			}
+		}
+	}
+}
Expected %v, Found %v", name, 1-pt.cumProb, dist.Survival(pt.loc)) + } + if pt.prob != 0 { + if math.Abs(dist.Quantile(pt.cumProb)-pt.loc) > 1e-4 { + fmt.Println("true =", pt.loc) + fmt.Println("calculated=", dist.Quantile(pt.cumProb)) + t.Errorf("Quantile doesn't match for "+name+", loc = %v", pt.loc) + } + } + } +} + +type ConjugateUpdater interface { + NumParameters() int + parameters([]Parameter) []Parameter + + NumSuffStat() int + SuffStat([]float64, []float64, []float64) float64 + ConjugateUpdate([]float64, float64, []float64) + + Rand() float64 +} + +func testConjugateUpdate(t *testing.T, newFittable func() ConjugateUpdater) { + for i, test := range []struct { + samps []float64 + weights []float64 + }{ + { + samps: randn(newFittable(), 10), + weights: nil, + }, + { + samps: randn(newFittable(), 10), + weights: ones(10), + }, + { + samps: randn(newFittable(), 10), + weights: randn(&Exponential{Rate: 1}, 10), + }, + } { + // ensure that conjugate produces the same result both incrementally and all at once + incDist := newFittable() + stats := make([]float64, incDist.NumSuffStat()) + prior := make([]float64, incDist.NumParameters()) + for j := range test.samps { + var incWeights, allWeights []float64 + if test.weights != nil { + incWeights = test.weights[j : j+1] + allWeights = test.weights[0 : j+1] + } + nsInc := incDist.SuffStat(test.samps[j:j+1], incWeights, stats) + incDist.ConjugateUpdate(stats, nsInc, prior) + + allDist := newFittable() + nsAll := allDist.SuffStat(test.samps[0:j+1], allWeights, stats) + allDist.ConjugateUpdate(stats, nsAll, make([]float64, allDist.NumParameters())) + if !parametersEqual(incDist.parameters(nil), allDist.parameters(nil), 1e-12) { + t.Errorf("prior doesn't match after incremental update for (%d, %d). Incremental is %v, all at once is %v", i, j, incDist, allDist) + } + + if test.weights == nil { + onesDist := newFittable() + nsOnes := onesDist.SuffStat(test.samps[0:j+1], ones(j+1), stats) + onesDist.ConjugateUpdate(stats, nsOnes, make([]float64, onesDist.NumParameters())) + if !parametersEqual(onesDist.parameters(nil), incDist.parameters(nil), 1e-14) { + t.Errorf("nil and uniform weighted prior doesn't match for incremental update for (%d, %d). Uniform weighted is %v, nil is %v", i, j, onesDist, incDist) + } + if !parametersEqual(onesDist.parameters(nil), allDist.parameters(nil), 1e-14) { + t.Errorf("nil and uniform weighted prior doesn't match for all at once update for (%d, %d). Uniform weighted is %v, nil is %v", i, j, onesDist, incDist) + } + } + } + } +} + +// randn generates a specified number of random samples +func randn(dist Rander, n int) []float64 { + x := make([]float64, n) + for i := range x { + x[i] = dist.Rand() + } + return x +} + +func ones(n int) []float64 { + x := make([]float64, n) + for i := range x { + x[i] = 1 + } + return x +} + +func parametersEqual(p1, p2 []Parameter, tol float64) bool { + for i, p := range p1 { + if p.Name != p2[i].Name { + return false + } + if math.Abs(p.Value-p2[i].Value) > tol { + return false + } + } + return true +} + +type derivParamTester interface { + LogProb(x float64) float64 + Score(deriv []float64, x float64) []float64 + Quantile(p float64) float64 + NumParameters() int + parameters([]Parameter) []Parameter + setParameters([]Parameter) +} + +func testDerivParam(t *testing.T, d derivParamTester) { + // Tests that the derivative matches for a number of different quantiles + // along the distribution. 
diff --git a/stat/distuv/interfaces.go b/stat/distuv/interfaces.go
new file mode 100644
index 00000000..b2eb80ba
--- /dev/null
+++ b/stat/distuv/interfaces.go
@@ -0,0 +1,22 @@
+// Copyright ©2015 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+// LogProber wraps the LogProb method, which computes the natural logarithm of
+// the value of the probability density function at x.
+type LogProber interface {
+	LogProb(float64) float64
+}
+
+// Rander wraps the Rand method, which returns a random sample drawn from the
+// distribution.
+type Rander interface {
+	Rand() float64
+}
+
+// RandLogProber is the union of the Rander and LogProber interfaces.
+type RandLogProber interface {
+	Rander
+	LogProber
+}
+
+// Quantiler wraps the Quantile method, which returns the inverse of the
+// cumulative distribution function at p.
+type Quantiler interface {
+	Quantile(p float64) float64
+}
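Editorial aside (not part of the patch): these small interfaces compose, so generic Monte Carlo helpers can be written once against Rander. A standalone sketch; the Rander definition is repeated here (and the expDist type is invented) purely so the snippet compiles on its own.

```go
package main

import (
	"fmt"
	"math/rand"
)

// Rander matches the interface defined in distuv.
type Rander interface {
	Rand() float64
}

// expDist is a stand-in Exp(1) distribution for the example.
type expDist struct{ rng *rand.Rand }

func (e expDist) Rand() float64 { return e.rng.ExpFloat64() }

// monteCarloMean estimates E[X] for any Rander.
func monteCarloMean(r Rander, n int) float64 {
	var sum float64
	for i := 0; i < n; i++ {
		sum += r.Rand()
	}
	return sum / float64(n)
}

func main() {
	e := expDist{rng: rand.New(rand.NewSource(1))}
	fmt.Printf("estimated mean: %.3f (true mean 1)\n", monteCarloMean(e, 1000000))
}
```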
diff --git a/stat/distuv/laplace.go b/stat/distuv/laplace.go
new file mode 100644
index 00000000..75f516eb
--- /dev/null
+++ b/stat/distuv/laplace.go
@@ -0,0 +1,252 @@
+// Copyright ©2014 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math"
+	"math/rand"
+	"sort"
+
+	"github.com/gonum/floats"
+	"github.com/gonum/stat"
+)
+
+// Laplace represents the Laplace distribution (https://en.wikipedia.org/wiki/Laplace_distribution).
+type Laplace struct {
+	Mu     float64 // Mean of the Laplace distribution
+	Scale  float64 // Scale of the Laplace distribution
+	Source *rand.Rand
+}
+
+// CDF computes the value of the cumulative distribution function at x.
+func (l Laplace) CDF(x float64) float64 {
+	if x < l.Mu {
+		return 0.5 * math.Exp((x-l.Mu)/l.Scale)
+	}
+	return 1 - 0.5*math.Exp(-(x-l.Mu)/l.Scale)
+}
+
+// Entropy returns the entropy of the distribution.
+func (l Laplace) Entropy() float64 {
+	return 1 + math.Log(2*l.Scale)
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (l Laplace) ExKurtosis() float64 {
+	return 3
+}
+
+// Fit sets the parameters of the probability distribution from the
+// data samples x with relative weights w.
+// If weights is nil, then all the weights are 1.
+// If weights is not nil, then len(weights) must equal len(samples).
+//
+// Note: Laplace distribution has no FitPrior because it has no sufficient
+// statistics.
+func (l *Laplace) Fit(samples, weights []float64) {
+	if weights != nil && len(samples) != len(weights) {
+		panic(badLength)
+	}
+
+	if len(samples) == 0 {
+		panic(badNoSamples)
+	}
+	if len(samples) == 1 {
+		l.Mu = samples[0]
+		l.Scale = 0
+		return
+	}
+
+	var (
+		sortedSamples []float64
+		sortedWeights []float64
+	)
+	if sort.Float64sAreSorted(samples) {
+		sortedSamples = samples
+		sortedWeights = weights
+	} else {
+		// Need to copy the inputs so they aren't affected by the sorting.
+		sortedSamples = make([]float64, len(samples))
+		copy(sortedSamples, samples)
+		if weights != nil {
+			sortedWeights = make([]float64, len(weights))
+			copy(sortedWeights, weights)
+		}
+
+		stat.SortWeighted(sortedSamples, sortedWeights)
+	}
+
+	// The (weighted) median of the samples is the maximum likelihood estimate
+	// of the location parameter.
+	// TODO: Rethink quantile type when stat has more options.
+	l.Mu = stat.Quantile(0.5, stat.Empirical, sortedSamples, sortedWeights)
+
+	// The maximum likelihood estimate of the scale parameter is the weighted
+	// average absolute distance between the samples and the location.
+	var absError, sumWeights float64
+	if weights == nil {
+		for _, v := range samples {
+			absError += math.Abs(v - l.Mu)
+		}
+		sumWeights = float64(len(samples))
+	} else {
+		for i, v := range samples {
+			absError += weights[i] * math.Abs(v-l.Mu)
+		}
+		sumWeights = floats.Sum(weights)
+	}
+	l.Scale = absError / sumWeights
+}
+
+// LogProb computes the natural logarithm of the value of the probability density
+// function at x.
+func (l Laplace) LogProb(x float64) float64 {
+	return -math.Ln2 - math.Log(l.Scale) - math.Abs(x-l.Mu)/l.Scale
+}
+
+// MarshalParameters implements the ParameterMarshaler interface.
+func (l Laplace) MarshalParameters(p []Parameter) {
+	if len(p) != l.NumParameters() {
+		panic(badLength)
+	}
+	p[0].Name = "Mu"
+	p[0].Value = l.Mu
+	p[1].Name = "Scale"
+	p[1].Value = l.Scale
+}
+
+// Mean returns the mean of the probability distribution.
+func (l Laplace) Mean() float64 {
+	return l.Mu
+}
+
+// Median returns the median of the Laplace distribution.
+func (l Laplace) Median() float64 {
+	return l.Mu
+}
+
+// Mode returns the mode of the Laplace distribution.
+func (l Laplace) Mode() float64 {
+	return l.Mu
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (l Laplace) NumParameters() int {
+	return 2
+}
+
+// Quantile returns the inverse of the cumulative distribution function.
+func (l Laplace) Quantile(p float64) float64 {
+	if p < 0 || p > 1 {
+		panic(badPercentile)
+	}
+	if p < 0.5 {
+		return l.Mu + l.Scale*math.Log(1+2*(p-0.5))
+	}
+	return l.Mu - l.Scale*math.Log(1-2*(p-0.5))
+}
+
+// Prob computes the value of the probability density function at x.
+func (l Laplace) Prob(x float64) float64 {
+	return math.Exp(l.LogProb(x))
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (l Laplace) Rand() float64 {
+	var rnd float64
+	if l.Source == nil {
+		rnd = rand.Float64()
+	} else {
+		rnd = l.Source.Float64()
+	}
+	u := rnd - 0.5
+	if u < 0 {
+		return l.Mu + l.Scale*math.Log(1+2*u)
+	}
+	return l.Mu - l.Scale*math.Log(1-2*u)
+}
+
+// Score returns the score function with respect to the parameters of the
+// distribution at the input location x. The score function is the derivative
+// of the log-likelihood at x with respect to the parameters
+//  (∂/∂θ) log(p(x;θ))
+// If deriv is non-nil, len(deriv) must equal the number of parameters otherwise
+// Score will panic, and the derivative is stored in-place into deriv. If deriv
+// is nil a new slice will be allocated and returned.
+//
+// The order is [∂LogProb / ∂Mu, ∂LogProb / ∂Scale].
+//
+// For more information, see https://en.wikipedia.org/wiki/Score_%28statistics%29.
+//
+// Special cases:
+//  Score(l.Mu) = [0, -1/l.Scale]
+func (l Laplace) Score(deriv []float64, x float64) []float64 {
+	if deriv == nil {
+		deriv = make([]float64, l.NumParameters())
+	}
+	if len(deriv) != l.NumParameters() {
+		panic(badLength)
+	}
+	diff := x - l.Mu
+	if diff > 0 {
+		deriv[0] = 1 / l.Scale
+	} else if diff < 0 {
+		deriv[0] = -1 / l.Scale
+	} else if diff == 0 {
+		deriv[0] = 0
+	} else {
+		// must be NaN
+		deriv[0] = math.NaN()
+	}
+
+	deriv[1] = math.Abs(diff)/(l.Scale*l.Scale) - 1/l.Scale
+	return deriv
+}
+
+// ScoreInput returns the score function with respect to the input of the
+// distribution at the input location specified by x. The score function is the
+// derivative of the log-likelihood
+//  (d/dx) log(p(x)) .
+// Special cases:
+//  ScoreInput(l.Mu) = 0
+func (l Laplace) ScoreInput(x float64) float64 {
+	diff := x - l.Mu
+	if diff == 0 {
+		return 0
+	}
+	if diff > 0 {
+		return -1 / l.Scale
+	}
+	return 1 / l.Scale
+}
+
+// Skewness returns the skewness of the distribution.
+func (Laplace) Skewness() float64 {
+	return 0
+}
+
+// StdDev returns the standard deviation of the distribution.
+func (l Laplace) StdDev() float64 {
+	return math.Sqrt2 * l.Scale
+}
+
+// Survival returns the survival function (complementary CDF) at x.
+func (l Laplace) Survival(x float64) float64 {
+	if x < l.Mu {
+		return 1 - 0.5*math.Exp((x-l.Mu)/l.Scale)
+	}
+	return 0.5 * math.Exp(-(x-l.Mu)/l.Scale)
+}
+
+// UnmarshalParameters implements the ParameterMarshaler interface.
+func (l *Laplace) UnmarshalParameters(p []Parameter) {
+	if len(p) != l.NumParameters() {
+		panic(badLength)
+	}
+	if p[0].Name != "Mu" {
+		panic("laplace: " + panicNameMismatch)
+	}
+	if p[1].Name != "Scale" {
+		panic("laplace: " + panicNameMismatch)
+	}
+	l.Mu = p[0].Value
+	l.Scale = p[1].Value
+}
+
+// Variance returns the variance of the probability distribution.
+func (l Laplace) Variance() float64 {
+	return 2 * l.Scale * l.Scale
+}
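Editorial aside (not part of the patch): the unweighted special case of Fit is easy to verify by hand. The location MLE is the sample median and the scale MLE is the mean absolute deviation about it; a minimal standalone sketch:

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

// laplaceMLE returns the maximum likelihood location (median) and scale
// (mean absolute deviation about the median) for unweighted samples.
func laplaceMLE(samples []float64) (mu, scale float64) {
	s := append([]float64(nil), samples...) // don't disturb the input
	sort.Float64s(s)
	mu = s[len(s)/2] // median (upper middle element for even lengths)
	for _, v := range s {
		scale += math.Abs(v - mu)
	}
	return mu, scale / float64(len(s))
}

func main() {
	mu, scale := laplaceMLE([]float64{-3, -1, 0, 1, 2, 4, 9})
	fmt.Printf("mu=%v scale=%.4f\n", mu, scale) // mu=1 scale=2.7143
}
```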
diff --git a/stat/distuv/laplace_test.go b/stat/distuv/laplace_test.go
new file mode 100644
index 00000000..0cbb2594
--- /dev/null
+++ b/stat/distuv/laplace_test.go
@@ -0,0 +1,58 @@
+// Copyright ©2014 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+ +package distuv + +import ( + "math" + "testing" +) + +func TestLaplaceProb(t *testing.T) { + pts := []univariateProbPoint{ + { + loc: 0, + prob: 0.5, + cumProb: 0.5, + logProb: math.Log(0.5), + }, + { + loc: -1, + prob: 1 / (2 * math.E), + cumProb: 0.1839397205857211607977618850807304337229055655158839172539184008487307478724499016785736371729598219, + logProb: math.Log(1 / (2 * math.E)), + }, + { + loc: 1, + prob: 1 / (2 * math.E), + cumProb: 0.8160602794142788392022381149192695662770944344841160827460815991512692521275500983214263628270401781, + logProb: math.Log(1 / (2 * math.E)), + }, + { + loc: -7, + prob: 1 / (2 * math.Pow(math.E, 7)), + cumProb: 0.0004559409827772581040015680422046413132368622637180269204080667109447399446551532646631395032324502210, + logProb: math.Log(1 / (2 * math.Pow(math.E, 7))), + }, + { + loc: 7, + prob: 1 / (2 * math.Pow(math.E, 7)), + cumProb: 0.9995440590172227418959984319577953586867631377362819730795919332890552600553448467353368604967675498, + logProb: math.Log(1 / (2 * math.Pow(math.E, 7))), + }, + { + loc: -20, + prob: math.Exp(-20.69314718055994530941723212145817656807550013436025525412068000949339362196969471560586332699641869), + cumProb: 1.030576811219278913982970190077910488187903637799551846486122330814582011892279676639955463952790684 * 1e-9, + logProb: -20.69314718055994530941723212145817656807550013436025525412068000949339362196969471560586332699641869, + }, + { + loc: 20, + prob: math.Exp(-20.69314718055994530941723212145817656807550013436025525412068000949339362196969471560586332699641869), + cumProb: 0.999999998969423188780721086017029809922089511812096362200448153513877669185417988107720323360044536, + logProb: -20.69314718055994530941723212145817656807550013436025525412068000949339362196969471560586332699641869, + }, + } + testDistributionProbs(t, Laplace{Mu: 0, Scale: 1}, "Laplace", pts) +} diff --git a/stat/distuv/lognormal.go b/stat/distuv/lognormal.go new file mode 100644 index 00000000..e13afce5 --- /dev/null +++ b/stat/distuv/lognormal.go @@ -0,0 +1,112 @@ +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand" +) + +// LogNormal represents a random variable whose log is normally distributed. +// The probability density function is given by +// 1/(x σ √2π) exp(-(ln(x)-μ)^2)/(2σ^2)) +type LogNormal struct { + Mu float64 + Sigma float64 + Source *rand.Rand +} + +// CDF computes the value of the cumulative density function at x. +func (l LogNormal) CDF(x float64) float64 { + return 0.5 + 0.5*math.Erf((math.Log(x)-l.Mu)/(math.Sqrt2*l.Sigma)) +} + +// Entropy returns the differential entropy of the distribution. +func (l LogNormal) Entropy() float64 { + return 0.5 + 0.5*math.Log(2*math.Pi*l.Sigma*l.Sigma) + l.Mu +} + +// ExKurtosis returns the excess kurtosis of the distribution. +func (l LogNormal) ExKurtosis() float64 { + s2 := l.Sigma * l.Sigma + return math.Exp(4*s2) + 2*math.Exp(3*s2) + 3*math.Exp(2*s2) - 6 +} + +// LogProb computes the natural logarithm of the value of the probability density function at x. +func (l LogNormal) LogProb(x float64) float64 { + if x < 0 { + return math.Inf(-1) + } + logx := math.Log(x) + normdiff := (logx - l.Mu) / l.Sigma + return -0.5*normdiff*normdiff - logx - math.Log(l.Sigma) - logRoot2Pi +} + +// Mean returns the mean of the probability distribution. 
+func (l LogNormal) Mean() float64 {
+	return math.Exp(l.Mu + 0.5*l.Sigma*l.Sigma)
+}
+
+// Median returns the median of the probability distribution.
+func (l LogNormal) Median() float64 {
+	return math.Exp(l.Mu)
+}
+
+// Mode returns the mode of the probability distribution.
+func (l LogNormal) Mode() float64 {
+	return math.Exp(l.Mu - l.Sigma*l.Sigma)
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (LogNormal) NumParameters() int {
+	return 2
+}
+
+// Prob computes the value of the probability density function at x.
+func (l LogNormal) Prob(x float64) float64 {
+	return math.Exp(l.LogProb(x))
+}
+
+// Quantile returns the inverse of the cumulative distribution function.
+func (l LogNormal) Quantile(p float64) float64 {
+	if p < 0 || p > 1 {
+		panic(badPercentile)
+	}
+	// Formula from http://www.math.uah.edu/stat/special/LogNormal.html.
+	return math.Exp(l.Mu + l.Sigma*UnitNormal.Quantile(p))
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (l LogNormal) Rand() float64 {
+	var rnd float64
+	if l.Source == nil {
+		rnd = rand.NormFloat64()
+	} else {
+		rnd = l.Source.NormFloat64()
+	}
+	return math.Exp(rnd*l.Sigma + l.Mu)
+}
+
+// Skewness returns the skewness of the distribution.
+func (l LogNormal) Skewness() float64 {
+	s2 := l.Sigma * l.Sigma
+	return (math.Exp(s2) + 2) * math.Sqrt(math.Exp(s2)-1)
+}
+
+// StdDev returns the standard deviation of the probability distribution.
+func (l LogNormal) StdDev() float64 {
+	return math.Sqrt(l.Variance())
+}
+
+// Survival returns the survival function (complementary CDF) at x.
+func (l LogNormal) Survival(x float64) float64 {
+	return 0.5 * (1 - math.Erf((math.Log(x)-l.Mu)/(math.Sqrt2*l.Sigma)))
+}
+
+// Variance returns the variance of the probability distribution.
+func (l LogNormal) Variance() float64 {
+	s2 := l.Sigma * l.Sigma
+	return (math.Exp(s2) - 1) * math.Exp(2*l.Mu+s2)
+}
diff --git a/stat/distuv/lognormal_test.go b/stat/distuv/lognormal_test.go
new file mode 100644
index 00000000..85872854
--- /dev/null
+++ b/stat/distuv/lognormal_test.go
@@ -0,0 +1,26 @@
+// Copyright ©2015 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import "testing"
+
+func TestLognormal(t *testing.T) {
+	for i, dist := range []LogNormal{
+		{
+			Mu:    0.1,
+			Sigma: 0.3,
+		},
+		{
+			Mu:    0.01,
+			Sigma: 0.01,
+		},
+		{
+			Mu:    2,
+			Sigma: 0.01,
+		},
+	} {
+		testFullDist(t, dist, i, true)
+	}
+}
diff --git a/stat/distuv/norm.go b/stat/distuv/norm.go
new file mode 100644
index 00000000..a1de60c2
--- /dev/null
+++ b/stat/distuv/norm.go
@@ -0,0 +1,254 @@
+// Copyright ©2014 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math"
+	"math/rand"
+
+	"github.com/gonum/floats"
+	"github.com/gonum/mathext"
+	"github.com/gonum/stat"
+)
+
+// UnitNormal is an instantiation of the normal distribution with Mu = 0 and Sigma = 1.
+var UnitNormal = Normal{Mu: 0, Sigma: 1}
+
+// Normal represents a normal (Gaussian) distribution (https://en.wikipedia.org/wiki/Normal_distribution).
+type Normal struct { + Mu float64 // Mean of the normal distribution + Sigma float64 // Standard deviation of the normal distribution + Source *rand.Rand + + // Needs to be Mu and Sigma and not Mean and StdDev because Normal has functions + // Mean and StdDev +} + +// CDF computes the value of the cumulative density function at x. +func (n Normal) CDF(x float64) float64 { + return 0.5 * (1 + math.Erf((x-n.Mu)/(n.Sigma*math.Sqrt2))) +} + +// ConjugateUpdate updates the parameters of the distribution from the sufficient +// statistics of a set of samples. The sufficient statistics, suffStat, have been +// observed with nSamples observations. The prior values of the distribution are those +// currently in the distribution, and have been observed with priorStrength samples. +// +// For the normal distribution, the sufficient statistics are the mean and +// uncorrected standard deviation of the samples. +// The prior is having seen strength[0] samples with mean Normal.Mu +// and strength[1] samples with standard deviation Normal.Sigma. As a result of +// this function, Normal.Mu and Normal.Sigma are updated based on the weighted +// samples, and strength is modified to include the new number of samples observed. +// +// This function panics if len(suffStat) != 2 or len(priorStrength) != 2. +func (n *Normal) ConjugateUpdate(suffStat []float64, nSamples float64, priorStrength []float64) { + + // TODO: Support prior strength with math.Inf(1) to allow updating with + // a known mean/standard deviation + + totalMeanSamples := nSamples + priorStrength[0] + totalSum := suffStat[0]*nSamples + n.Mu*priorStrength[0] + + totalVarianceSamples := nSamples + priorStrength[1] + // sample variance + totalVariance := nSamples * suffStat[1] * suffStat[1] + // add prior variance + totalVariance += priorStrength[1] * n.Sigma * n.Sigma + // add cross variance from the difference of the means + meanDiff := (suffStat[0] - n.Mu) + totalVariance += priorStrength[0] * nSamples * meanDiff * meanDiff / totalMeanSamples + + n.Mu = totalSum / totalMeanSamples + n.Sigma = math.Sqrt(totalVariance / totalVarianceSamples) + floats.AddConst(nSamples, priorStrength) +} + +// Entropy returns the differential entropy of the distribution. +func (n Normal) Entropy() float64 { + return 0.5 * (log2Pi + 1 + 2*math.Log(n.Sigma)) +} + +// ExKurtosis returns the excess kurtosis of the distribution. +func (Normal) ExKurtosis() float64 { + return 0 +} + +// Fit sets the parameters of the probability distribution from the +// data samples x with relative weights w. If weights is nil, then all the weights +// are 1. If weights is not nil, then the len(weights) must equal len(samples). +func (n *Normal) Fit(samples, weights []float64) { + suffStat := make([]float64, n.NumSuffStat()) + nSamples := n.SuffStat(samples, weights, suffStat) + n.ConjugateUpdate(suffStat, nSamples, make([]float64, n.NumSuffStat())) +} + +// LogProb computes the natural logarithm of the value of the probability density function at x. +func (n Normal) LogProb(x float64) float64 { + return negLogRoot2Pi - math.Log(n.Sigma) - (x-n.Mu)*(x-n.Mu)/(2*n.Sigma*n.Sigma) +} + +// Mean returns the mean of the probability distribution. +func (n Normal) Mean() float64 { + return n.Mu +} + +// Median returns the median of the normal distribution. +func (n Normal) Median() float64 { + return n.Mu +} + +// Mode returns the mode of the normal distribution. +func (n Normal) Mode() float64 { + return n.Mu +} + +// NumParameters returns the number of parameters in the distribution. 
+func (Normal) NumParameters() int { + return 2 +} + +// NumSuffStat returns the number of sufficient statistics for the distribution. +func (Normal) NumSuffStat() int { + return 2 +} + +// Prob computes the value of the probability density function at x. +func (n Normal) Prob(x float64) float64 { + return math.Exp(n.LogProb(x)) +} + +// Quantile returns the inverse of the cumulative probability distribution. +func (n Normal) Quantile(p float64) float64 { + if p < 0 || p > 1 { + panic(badPercentile) + } + return n.Mu + n.Sigma*mathext.NormalQuantile(p) +} + +// Rand returns a random sample drawn from the distribution. +func (n Normal) Rand() float64 { + var rnd float64 + if n.Source == nil { + rnd = rand.NormFloat64() + } else { + rnd = n.Source.NormFloat64() + } + return rnd*n.Sigma + n.Mu +} + +// Score returns the score function with respect to the parameters of the +// distribution at the input location x. The score function is the derivative +// of the log-likelihood at x with respect to the parameters +// (∂/∂θ) log(p(x;θ)) +// If deriv is non-nil, len(deriv) must equal the number of parameters otherwise +// Score will panic, and the derivative is stored in-place into deriv. If deriv +// is nil a new slice will be allocated and returned. +// +// The order is [∂LogProb / ∂Mu, ∂LogProb / ∂Sigma]. +// +// For more information, see https://en.wikipedia.org/wiki/Score_%28statistics%29. +func (n Normal) Score(deriv []float64, x float64) []float64 { + if deriv == nil { + deriv = make([]float64, n.NumParameters()) + } + if len(deriv) != n.NumParameters() { + panic(badLength) + } + deriv[0] = (x - n.Mu) / (n.Sigma * n.Sigma) + deriv[1] = 1 / n.Sigma * (-1 + ((x-n.Mu)/n.Sigma)*((x-n.Mu)/n.Sigma)) + return deriv +} + +// ScoreInput returns the score function with respect to the input of the +// distribution at the input location specified by x. The score function is the +// derivative of the log-likelihood +// (d/dx) log(p(x)) . +func (n Normal) ScoreInput(x float64) float64 { + return -(1 / (2 * n.Sigma * n.Sigma)) * 2 * (x - n.Mu) +} + +// Skewness returns the skewness of the distribution. +func (Normal) Skewness() float64 { + return 0 +} + +// StdDev returns the standard deviation of the probability distribution. +func (n Normal) StdDev() float64 { + return n.Sigma +} + +// SuffStat computes the sufficient statistics of a set of samples to update +// the distribution. The sufficient statistics are stored in place, and the +// effective number of samples are returned. +// +// The normal distribution has two sufficient statistics, the mean of the samples +// and the standard deviation of the samples. +// +// If weights is nil, the weights are assumed to be 1, otherwise panics if +// len(samples) != len(weights). Panics if len(suffStat) != NumSuffStat(). +func (Normal) SuffStat(samples, weights, suffStat []float64) (nSamples float64) { + lenSamp := len(samples) + if len(weights) != 0 && len(samples) != len(weights) { + panic(badLength) + } + if len(suffStat) != (Normal{}).NumSuffStat() { + panic(badSuffStat) + } + + if len(weights) == 0 { + nSamples = float64(lenSamp) + } else { + nSamples = floats.Sum(weights) + } + + mean := stat.Mean(samples, weights) + suffStat[0] = mean + + // Use Moment and not StdDev because we want it to be uncorrected + variance := stat.MomentAbout(2, samples, mean, weights) + suffStat[1] = math.Sqrt(variance) + return nSamples +} + +// Survival returns the survival function (complementary CDF) at x. 
+func (n Normal) Survival(x float64) float64 { + return 0.5 * (1 - math.Erf((x-n.Mu)/(n.Sigma*math.Sqrt2))) +} + +// setParameters modifies the parameters of the distribution. +func (n *Normal) setParameters(p []Parameter) { + if len(p) != n.NumParameters() { + panic("normal: incorrect number of parameters to set") + } + if p[0].Name != "Mu" { + panic("normal: " + panicNameMismatch) + } + if p[1].Name != "Sigma" { + panic("normal: " + panicNameMismatch) + } + n.Mu = p[0].Value + n.Sigma = p[1].Value +} + +// Variance returns the variance of the probability distribution. +func (n Normal) Variance() float64 { + return n.Sigma * n.Sigma +} + +// parameters returns the parameters of the distribution. +func (n Normal) parameters(p []Parameter) []Parameter { + nParam := n.NumParameters() + if p == nil { + p = make([]Parameter, nParam) + } else if len(p) != nParam { + panic("normal: improper parameter length") + } + p[0].Name = "Mu" + p[0].Value = n.Mu + p[1].Name = "Sigma" + p[1].Value = n.Sigma + return p +} diff --git a/stat/distuv/norm_example_test.go b/stat/distuv/norm_example_test.go new file mode 100644 index 00000000..e1eabd12 --- /dev/null +++ b/stat/distuv/norm_example_test.go @@ -0,0 +1,35 @@ +// Copyright ©2017 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv_test + +import ( + "fmt" + + "github.com/gonum/stat" + "github.com/gonum/stat/distuv" +) + +func ExampleNormal() { + // Create a normal distribution + dist := distuv.Normal{ + Mu: 2, + Sigma: 5, + } + + data := make([]float64, 1e5) + + // Draw some random values from the standard normal distribution + for i := range data { + data[i] = dist.Rand() + } + + mean, std := stat.MeanStdDev(data, nil) + meanErr := stat.StdErr(std, float64(len(data))) + + fmt.Printf("mean= %1.1f ± %0.1v\n", mean, meanErr) + + // Output: + // mean= 2.0 ± 0.02 +} diff --git a/stat/distuv/norm_test.go b/stat/distuv/norm_test.go new file mode 100644 index 00000000..337dc8af --- /dev/null +++ b/stat/distuv/norm_test.go @@ -0,0 +1,171 @@ +// Copyright ©2014 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package distuv + +import ( + "math" + "testing" + + "github.com/gonum/floats" +) + +// TestNormalProbs tests LogProb, Prob, CumProb, and Quantile +func TestNormalProbs(t *testing.T) { + pts := []univariateProbPoint{ + { + loc: 0, + prob: oneOverRoot2Pi, + cumProb: 0.5, + logProb: -0.91893853320467274178032973640561763986139747363778341281715, + }, + { + loc: -1, + prob: 0.2419707245191433497978301929355606548286719707374350254875550842811000635700832945083112946939424047, + cumProb: 0.158655253931457051414767454367962077522087033273395609012605, + logProb: math.Log(0.2419707245191433497978301929355606548286719707374350254875550842811000635700832945083112946939424047), + }, + { + loc: 1, + prob: 0.2419707245191433497978301929355606548286719707374350254875550842811000635700832945083112946939424047, + cumProb: 0.841344746068542948585232545632037922477912966726604390987394, + logProb: math.Log(0.2419707245191433497978301929355606548286719707374350254875550842811000635700832945083112946939424047), + }, + { + loc: -7, + prob: 9.134720408364593342868613916794233023000190834851937054490546361277622761970225469305158915808284566e-12, + cumProb: 1.279812543885835004383623690780832998032844154198717929e-12, + logProb: math.Log(9.134720408364593342868613916794233023000190834851937054490546361277622761970225469305158915808284566e-12), + }, + { + loc: 7, + prob: 9.134720408364593342868613916794233023000190834851937054490546361277622761970225469305158915808284566e-12, + cumProb: 0.99999999999872018745611416499561637630921916700196715584580, + logProb: math.Log(9.134720408364593342868613916794233023000190834851937054490546361277622761970225469305158915808284566e-12), + }, + } + testDistributionProbs(t, Normal{Mu: 0, Sigma: 1}, "normal", pts) + + pts = []univariateProbPoint{ + { + loc: 2, + prob: 0.07978845608028653558798921198687637369517172623298693153318516593413158517986036770025046678146138729, + cumProb: 0.5, + logProb: math.Log(0.07978845608028653558798921198687637369517172623298693153318516593413158517986036770025046678146138729), + }, + { + loc: -3, + prob: 0.04839414490382866995956603858711213096573439414748700509751101685622001271401665890166225893878848095, + cumProb: 0.158655253931457051414767454367962077522087033273395609012605, + logProb: math.Log(0.04839414490382866995956603858711213096573439414748700509751101685622001271401665890166225893878848095), + }, + { + loc: 7, + prob: 0.04839414490382866995956603858711213096573439414748700509751101685622001271401665890166225893878848095, + cumProb: 0.841344746068542948585232545632037922477912966726604390987394, + logProb: math.Log(0.04839414490382866995956603858711213096573439414748700509751101685622001271401665890166225893878848095), + }, + { + loc: -33, + prob: 1.826944081672918668573722783358846604600038166970387410898109272255524552394045093861031783161656913e-12, + cumProb: 1.279812543885835004383623690780832998032844154198717929e-12, + logProb: math.Log(1.826944081672918668573722783358846604600038166970387410898109272255524552394045093861031783161656913e-12), + }, + { + loc: 37, + prob: 1.826944081672918668573722783358846604600038166970387410898109272255524552394045093861031783161656913e-12, + cumProb: 0.99999999999872018745611416499561637630921916700196715584580, + logProb: math.Log(1.826944081672918668573722783358846604600038166970387410898109272255524552394045093861031783161656913e-12), + }, + } + testDistributionProbs(t, Normal{Mu: 2, Sigma: 5}, "normal", pts) +} + +func TestNormFitPrior(t *testing.T) { + testConjugateUpdate(t, func() 
ConjugateUpdater { return &Normal{Mu: -10, Sigma: 6} })
+}
+
+func TestNormScore(t *testing.T) {
+	for _, test := range []*Normal{
+		{
+			Mu:    0,
+			Sigma: 1,
+		},
+		{
+			Mu:    0.32238,
+			Sigma: 13.69,
+		},
+	} {
+		testDerivParam(t, test)
+	}
+}
+
+func TestNormalQuantile(t *testing.T) {
+	// Values from https://www.johndcook.com/blog/normal_cdf_inverse/
+	p := []float64{
+		0.0000001,
+		0.00001,
+		0.001,
+		0.05,
+		0.15,
+		0.25,
+		0.35,
+		0.45,
+		0.55,
+		0.65,
+		0.75,
+		0.85,
+		0.95,
+		0.999,
+		0.99999,
+		0.9999999,
+	}
+	ans := []float64{
+		-5.199337582187471,
+		-4.264890793922602,
+		-3.090232306167813,
+		-1.6448536269514729,
+		-1.0364333894937896,
+		-0.6744897501960817,
+		-0.38532046640756773,
+		-0.12566134685507402,
+		0.12566134685507402,
+		0.38532046640756773,
+		0.6744897501960817,
+		1.0364333894937896,
+		1.6448536269514729,
+		3.090232306167813,
+		4.264890793922602,
+		5.199337582187471,
+	}
+	for i, v := range p {
+		got := UnitNormal.Quantile(v)
+		if !floats.EqualWithinAbsOrRel(got, ans[i], 1e-10, 1e-10) {
+			t.Errorf("Quantile mismatch. Case %d, want: %v, got: %v", i, ans[i], got)
+		}
+	}
+}
+
+func TestNormFitPanic(t *testing.T) {
+	n := Normal{Mu: 0, Sigma: 1}
+	defer func() {
+		r := recover()
+		if r != nil {
+			t.Errorf("unexpected panic for Fit call: %v", r)
+		}
+	}()
+	n.Fit(make([]float64, 10), nil)
+}
+
+func BenchmarkNormalQuantile(b *testing.B) {
+	n := Normal{Mu: 2, Sigma: 3.1}
+	ps := make([]float64, 1000) // ensure there are small values
+	floats.Span(ps, 0, 1)
+	for i := 0; i < b.N; i++ {
+		for _, v := range ps {
+			x := n.Quantile(v)
+			_ = x
+		}
+	}
+}
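Editorial aside (not part of the patch): TestNormalQuantile pins UnitNormal.Quantile (backed by mathext.NormalQuantile) against reference values. The same quantiles can be cross-checked without any special function by bisecting the normal CDF, Φ(x) = 0.5(1 + erf(x/√2)). A standalone sketch:

```go
package main

import (
	"fmt"
	"math"
)

func stdNormalCDF(x float64) float64 {
	return 0.5 * (1 + math.Erf(x/math.Sqrt2))
}

// quantileBisect inverts the standard normal CDF by bisection on [-10, 10].
func quantileBisect(p float64) float64 {
	lo, hi := -10.0, 10.0
	for i := 0; i < 200; i++ {
		mid := 0.5 * (lo + hi)
		if stdNormalCDF(mid) < p {
			lo = mid
		} else {
			hi = mid
		}
	}
	return 0.5 * (lo + hi)
}

func main() {
	// Compare with the table value used in the test above.
	fmt.Printf("%.12f (want -1.6448536269514729)\n", quantileBisect(0.05))
}
```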
diff --git a/stat/distuv/studentst.go b/stat/distuv/studentst.go
new file mode 100644
index 00000000..1505ef98
--- /dev/null
+++ b/stat/distuv/studentst.go
@@ -0,0 +1,160 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package distuv
+
+import (
+	"math"
+	"math/rand"
+
+	"github.com/gonum/mathext"
+)
+
+const logPi = 1.1447298858494001741 // http://oeis.org/A053510
+
+// StudentsT implements the three-parameter Student's T distribution, a distribution
+// over the real numbers.
+//
+// The Student's T distribution has density function
+//  Γ((ν+1)/2) / (sqrt(νπ) Γ(ν/2) σ) (1 + 1/ν * ((x-μ)/σ)^2)^(-(ν+1)/2)
+//
+// The Student's T distribution approaches the normal distribution as ν → ∞.
+//
+// For more information, see https://en.wikipedia.org/wiki/Student%27s_t-distribution,
+// specifically https://en.wikipedia.org/wiki/Student%27s_t-distribution#Non-standardized_Student.27s_t-distribution .
+//
+// The standard Student's T distribution has Mu = 0 and Sigma = 1.
+type StudentsT struct {
+	// Mu is the location parameter of the distribution, and the mean of the
+	// distribution.
+	Mu float64
+
+	// Sigma is the scale parameter of the distribution. It is related to the
+	// standard deviation by std = Sigma * sqrt(Nu/(Nu-2)).
+	Sigma float64
+
+	// Nu is the shape parameter of the distribution, representing the number
+	// of degrees of freedom of the distribution, and one less than the number
+	// of observations from a Normal distribution.
+	Nu float64
+
+	Src *rand.Rand
+}
+
+// CDF computes the value of the cumulative distribution function at x.
+func (s StudentsT) CDF(x float64) float64 {
+	// Transform to the standardized variable.
+	y := (x - s.Mu) / s.Sigma
+	if y == 0 {
+		return 0.5
+	}
+	// For y > 0,
+	//  F(y) = 1 - 0.5 * I_t(y)(nu/2, 1/2)
+	// where
+	//  t(y) = nu/(y^2 + nu),
+	// and F(y) = 0.5 * I_t(y)(nu/2, 1/2) for y < 0.
+	t := s.Nu / (y*y + s.Nu)
+	if y > 0 {
+		return 1 - 0.5*mathext.RegIncBeta(0.5*s.Nu, 0.5, t)
+	}
+	return 0.5 * mathext.RegIncBeta(s.Nu/2, 0.5, t)
+}
+
+// LogProb computes the natural logarithm of the value of the probability
+// density function at x.
+func (s StudentsT) LogProb(x float64) float64 {
+	g1, _ := math.Lgamma((s.Nu + 1) / 2)
+	g2, _ := math.Lgamma(s.Nu / 2)
+	z := (x - s.Mu) / s.Sigma
+	return g1 - g2 - 0.5*math.Log(s.Nu) - 0.5*logPi - math.Log(s.Sigma) - ((s.Nu+1)/2)*math.Log(1+z*z/s.Nu)
+}
+
+// Mean returns the mean of the probability distribution.
+func (s StudentsT) Mean() float64 {
+	return s.Mu
+}
+
+// Mode returns the mode of the distribution.
+func (s StudentsT) Mode() float64 {
+	return s.Mu
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (StudentsT) NumParameters() int {
+	return 3
+}
+
+// Prob computes the value of the probability density function at x.
+func (s StudentsT) Prob(x float64) float64 {
+	return math.Exp(s.LogProb(x))
+}
+
+// Quantile returns the inverse of the cumulative distribution function.
+func (s StudentsT) Quantile(p float64) float64 {
+	if p < 0 || p > 1 {
+		panic(badPercentile)
+	}
+	// F(x) = 1 - 0.5 * I_t(x)(nu/2, 1/2)
+	//  t(x) = nu/(x^2 + nu)
+	if p == 0.5 {
+		return s.Mu
+	}
+	var y float64
+	if p > 0.5 {
+		// Know t > 0
+		t := mathext.InvRegIncBeta(s.Nu/2, 0.5, 2*(1-p))
+		y = math.Sqrt(s.Nu * (1 - t) / t)
+	} else {
+		t := mathext.InvRegIncBeta(s.Nu/2, 0.5, 2*p)
+		y = -math.Sqrt(s.Nu * (1 - t) / t)
+	}
+	// Convert back from the standardized variable.
+	return y*s.Sigma + s.Mu
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (s StudentsT) Rand() float64 {
+	// http://www.math.uah.edu/stat/special/Student.html
+	n := Normal{0, 1, s.Src}.Rand()
+	c := Gamma{s.Nu / 2, 0.5, s.Src}.Rand()
+	z := n / math.Sqrt(c/s.Nu)
+	return z*s.Sigma + s.Mu
+}
+
+// StdDev returns the standard deviation of the probability distribution.
+//
+// The standard deviation is undefined for ν <= 1, and this returns math.NaN().
+func (s StudentsT) StdDev() float64 {
+	return math.Sqrt(s.Variance())
+}
+
+// Survival returns the survival function (complementary CDF) at x.
+func (s StudentsT) Survival(x float64) float64 {
+	// Transform to the standardized variable.
+	y := (x - s.Mu) / s.Sigma
+	if y == 0 {
+		return 0.5
+	}
+	// For y > 0,
+	//  1 - F(y) = 0.5 * I_t(y)(nu/2, 1/2)
+	// where
+	//  t(y) = nu/(y^2 + nu),
+	// and 1 - F(y) = 1 - 0.5 * I_t(y)(nu/2, 1/2) for y < 0.
+	t := s.Nu / (y*y + s.Nu)
+	if y > 0 {
+		return 0.5 * mathext.RegIncBeta(s.Nu/2, 0.5, t)
+	}
+	return 1 - 0.5*mathext.RegIncBeta(s.Nu/2, 0.5, t)
+}
+
+// Variance returns the variance of the probability distribution.
+//
+// The variance is undefined for ν <= 1, and this returns math.NaN().
+// The variance is infinite for 1 < ν <= 2, and this returns math.Inf(1).
+func (s StudentsT) Variance() float64 {
+	if s.Nu <= 1 {
+		return math.NaN()
+	}
+	if s.Nu <= 2 {
+		return math.Inf(1)
+	}
+	return s.Sigma * s.Sigma * s.Nu / (s.Nu - 2)
+}
diff --git a/stat/distuv/studentst_test.go b/stat/distuv/studentst_test.go
new file mode 100644
index 00000000..13c620c5
--- /dev/null
+++ b/stat/distuv/studentst_test.go
@@ -0,0 +1,83 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+ +package distuv + +import ( + "math" + "math/rand" + "sort" + "testing" + + "github.com/gonum/floats" +) + +func TestStudentsTProb(t *testing.T) { + for _, test := range []struct { + x, mu, sigma, nu, want float64 + }{ + // Values comparison with scipy. + {0.01, 0, 1, 2.74, 0.364778548181318}, + {-0.01, 0, 1, 2.74, 0.364778548181318}, + {0.4, 0, 1, 1.6, 0.30376391362582678}, + {-0.4, 0, 1, 1.6, 0.30376391362582678}, + {0.2, 15, 5, 10, 0.0024440848858034393}, + } { + pdf := StudentsT{test.mu, test.sigma, test.nu, nil}.Prob(test.x) + if !floats.EqualWithinAbsOrRel(pdf, test.want, 1e-10, 1e-10) { + t.Errorf("Pdf mismatch, x = %v, Nu = %v. Got %v, want %v", test.x, test.nu, pdf, test.want) + } + } +} + +func TestStudentsT(t *testing.T) { + src := rand.New(rand.NewSource(1)) + for i, b := range []StudentsT{ + {0, 1, 3.3, src}, + {0, 1, 7.2, src}, + {0, 1, 12, src}, + {0.9, 0.8, 6, src}, + } { + testStudentsT(t, b, i) + } +} + +func testStudentsT(t *testing.T, c StudentsT, i int) { + tol := 1e-2 + const n = 1e6 + const bins = 50 + x := make([]float64, n) + generateSamples(x, c) + sort.Float64s(x) + + testRandLogProbContinuous(t, i, math.Inf(-1), x, c, tol, bins) + checkMean(t, i, x, c, tol) + if c.Nu > 2 { + checkVarAndStd(t, i, x, c, tol) + } + checkProbContinuous(t, i, x, c, 1e-3) + checkQuantileCDFSurvival(t, i, x, c, tol) + checkProbQuantContinuous(t, i, x, c, tol) +} + +func TestStudentsTQuantile(t *testing.T) { + nSteps := 101 + probs := make([]float64, nSteps) + floats.Span(probs, 0, 1) + for i, b := range []StudentsT{ + {0, 1, 3.3, nil}, + {0, 1, 7.2, nil}, + {0, 1, 12, nil}, + {0.9, 0.8, 6, nil}, + } { + for _, p := range probs { + x := b.Quantile(p) + p2 := b.CDF(x) + if !floats.EqualWithinAbsOrRel(p, p2, 1e-10, 1e-10) { + t.Errorf("mismatch between CDF and Quantile. Case %v. Want %v, got %v", i, p, p2) + break + } + } + } +} diff --git a/stat/distuv/uniform.go b/stat/distuv/uniform.go new file mode 100644 index 00000000..4c9dec96 --- /dev/null +++ b/stat/distuv/uniform.go @@ -0,0 +1,159 @@ +// Copyright ©2014 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/rand" +) + +// UnitUniform is an instantiation of the uniform distribution with Min = 0 +// and Max = 1. +var UnitUniform = Uniform{Min: 0, Max: 1} + +// Uniform represents a continuous uniform distribution (https://en.wikipedia.org/wiki/Uniform_distribution_%28continuous%29). +type Uniform struct { + Min float64 + Max float64 + Source *rand.Rand +} + +// CDF computes the value of the cumulative density function at x. +func (u Uniform) CDF(x float64) float64 { + if x < u.Min { + return 0 + } + if x > u.Max { + return 1 + } + return (x - u.Min) / (u.Max - u.Min) +} + +// Uniform doesn't have any of the DLogProbD? because the derivative is 0 everywhere +// except where it's undefined + +// Entropy returns the entropy of the distribution. +func (u Uniform) Entropy() float64 { + return math.Log(u.Max - u.Min) +} + +// ExKurtosis returns the excess kurtosis of the distribution. +func (Uniform) ExKurtosis() float64 { + return -6.0 / 5.0 +} + +// Uniform doesn't have Fit because it's a bad idea to fit a uniform from data. + +// LogProb computes the natural logarithm of the value of the probability density function at x. 
+func (u Uniform) LogProb(x float64) float64 { + if x < u.Min { + return math.Inf(-1) + } + if x > u.Max { + return math.Inf(-1) + } + return -math.Log(u.Max - u.Min) +} + +// MarshalParameters implements the ParameterMarshaler interface +func (u Uniform) MarshalParameters(p []Parameter) { + if len(p) != u.NumParameters() { + panic("uniform: improper parameter length") + } + p[0].Name = "Min" + p[0].Value = u.Min + p[1].Name = "Max" + p[1].Value = u.Max + return +} + +// Mean returns the mean of the probability distribution. +func (u Uniform) Mean() float64 { + return (u.Max + u.Min) / 2 +} + +// Median returns the median of the probability distribution. +func (u Uniform) Median() float64 { + return (u.Max + u.Min) / 2 +} + +// Uniform doesn't have a mode because it's any value in the distribution + +// NumParameters returns the number of parameters in the distribution. +func (Uniform) NumParameters() int { + return 2 +} + +// Prob computes the value of the probability density function at x. +func (u Uniform) Prob(x float64) float64 { + if x < u.Min { + return 0 + } + if x > u.Max { + return 0 + } + return 1 / (u.Max - u.Min) +} + +// Quantile returns the inverse of the cumulative probability distribution. +func (u Uniform) Quantile(p float64) float64 { + if p < 0 || p > 1 { + panic(badPercentile) + } + return p*(u.Max-u.Min) + u.Min +} + +// Rand returns a random sample drawn from the distribution. +func (u Uniform) Rand() float64 { + var rnd float64 + if u.Source == nil { + rnd = rand.Float64() + } else { + rnd = u.Source.Float64() + } + return rnd*(u.Max-u.Min) + u.Min +} + +// Skewness returns the skewness of the distribution. +func (Uniform) Skewness() float64 { + return 0 +} + +// StdDev returns the standard deviation of the probability distribution. +func (u Uniform) StdDev() float64 { + return math.Sqrt(u.Variance()) +} + +// Survival returns the survival function (complementary CDF) at x. +func (u Uniform) Survival(x float64) float64 { + if x < u.Min { + return 1 + } + if x > u.Max { + return 0 + } + return (u.Max - x) / (u.Max - u.Min) +} + +// UnmarshalParameters implements the ParameterMarshaler interface +func (u *Uniform) UnmarshalParameters(p []Parameter) { + if len(p) != u.NumParameters() { + panic("uniform: incorrect number of parameters to set") + } + if p[0].Name != "Min" { + panic("uniform: " + panicNameMismatch) + } + if p[1].Name != "Max" { + panic("uniform: " + panicNameMismatch) + } + + u.Min = p[0].Value + u.Max = p[1].Value +} + +// Variance returns the variance of the probability distribution. +func (u Uniform) Variance() float64 { + return 1.0 / 12.0 * (u.Max - u.Min) * (u.Max - u.Min) +} diff --git a/stat/distuv/weibull.go b/stat/distuv/weibull.go new file mode 100644 index 00000000..72f7f97a --- /dev/null +++ b/stat/distuv/weibull.go @@ -0,0 +1,247 @@ +// Copyright ©2014 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "math/cmplx" + "math/rand" +) + +// Weibull distribution. Valid range for x is [0,+∞). +type Weibull struct { + // Shape parameter of the distribution. A value of 1 represents + // the exponential distribution. A value of 2 represents the + // Rayleigh distribution. Valid range is (0,+∞). + K float64 + // Scale parameter of the distribution. Valid range is (0,+∞). + Lambda float64 + // Source of random numbers + Source *rand.Rand +} + +// CDF computes the value of the cumulative density function at x. 
+func (w Weibull) CDF(x float64) float64 {
+	if x < 0 {
+		return 0
+	}
+	return 1 - cmplx.Abs(cmplx.Exp(w.LogCDF(x)))
+}
+
+// Entropy returns the entropy of the distribution.
+func (w Weibull) Entropy() float64 {
+	return eulerGamma*(1-1/w.K) + math.Log(w.Lambda/w.K) + 1
+}
+
+// ExKurtosis returns the excess kurtosis of the distribution.
+func (w Weibull) ExKurtosis() float64 {
+	return (-6*w.gammaIPow(1, 4) + 12*w.gammaIPow(1, 2)*math.Gamma(1+2/w.K) - 3*w.gammaIPow(2, 2) - 4*math.Gamma(1+1/w.K)*math.Gamma(1+3/w.K) + math.Gamma(1+4/w.K)) / math.Pow(math.Gamma(1+2/w.K)-w.gammaIPow(1, 2), 2)
+}
+
+// gammaIPow is a shortcut for computing the gamma function to a power.
+func (w Weibull) gammaIPow(i, pow float64) float64 {
+	return math.Pow(math.Gamma(1+i/w.K), pow)
+}
+
+// LogCDF computes the value of the log of the cumulative distribution function at x.
+func (w Weibull) LogCDF(x float64) complex128 {
+	if x < 0 {
+		return 0
+	}
+	return cmplx.Log(-1) + complex(-math.Pow(x/w.Lambda, w.K), 0)
+}
+
+// LogProb computes the natural logarithm of the value of the probability
+// density function at x. Zero is returned if x is less than zero.
+//
+// Special cases occur when x == 0, and the result depends on the shape
+// parameter as follows:
+//  If 0 < K < 1, LogProb returns +Inf.
+//  If K == 1, LogProb returns -log(Lambda).
+//  If K > 1, LogProb returns -Inf.
+func (w Weibull) LogProb(x float64) float64 {
+	if x < 0 {
+		return 0
+	}
+	if x == 0 && w.K == 1 {
+		// Handle the exponential special case explicitly: the density at
+		// zero is 1/Lambda, but the general expression below evaluates to
+		// NaN because (K-1)*log(x) is 0*(-Inf).
+		return -math.Log(w.Lambda)
+	}
+	return math.Log(w.K) - math.Log(w.Lambda) + (w.K-1)*(math.Log(x)-math.Log(w.Lambda)) - math.Pow(x/w.Lambda, w.K)
+}
+
+// LogSurvival returns the log of the survival function (complementary CDF) at x.
+func (w Weibull) LogSurvival(x float64) float64 {
+	if x < 0 {
+		return 0
+	}
+	return -math.Pow(x/w.Lambda, w.K)
+}
+
+// Mean returns the mean of the probability distribution.
+func (w Weibull) Mean() float64 {
+	return w.Lambda * math.Gamma(1+1/w.K)
+}
+
+// Median returns the median of the Weibull distribution.
+func (w Weibull) Median() float64 {
+	return w.Lambda * math.Pow(ln2, 1/w.K)
+}
+
+// Mode returns the mode of the Weibull distribution.
+//
+// The mode is NaN in the special case where the K (shape) parameter
+// is less than 1.
+func (w Weibull) Mode() float64 {
+	switch {
+	case w.K > 1:
+		return w.Lambda * math.Pow((w.K-1)/w.K, 1/w.K)
+	case w.K == 1:
+		return 0
+	default:
+		return math.NaN()
+	}
+}
+
+// NumParameters returns the number of parameters in the distribution.
+func (Weibull) NumParameters() int {
+	return 2
+}
+
+// Prob computes the value of the probability density function at x.
+func (w Weibull) Prob(x float64) float64 {
+	if x < 0 {
+		return 0
+	}
+	return math.Exp(w.LogProb(x))
+}
+
+// Quantile returns the inverse of the cumulative distribution function.
+func (w Weibull) Quantile(p float64) float64 {
+	if p < 0 || p > 1 {
+		panic(badPercentile)
+	}
+	return w.Lambda * math.Pow(-math.Log(1-p), 1/w.K)
+}
+
+// Rand returns a random sample drawn from the distribution.
+func (w Weibull) Rand() float64 {
+	var rnd float64
+	if w.Source == nil {
+		rnd = rand.Float64()
+	} else {
+		rnd = w.Source.Float64()
+	}
+	return w.Quantile(rnd)
+}
+
+// Score returns the score function with respect to the parameters of the
+// distribution at the input location x. The score function is the derivative
+// of the log-likelihood at x with respect to the parameters
+//  (∂/∂θ) log(p(x;θ))
+// If deriv is non-nil, len(deriv) must equal the number of parameters otherwise
+// Score will panic, and the derivative is stored in-place into deriv. If deriv
+// is nil a new slice will be allocated and returned.
+//
+// The order is [∂LogProb / ∂K, ∂LogProb / ∂λ].
+//
+// For more information, see https://en.wikipedia.org/wiki/Score_%28statistics%29.
+//
+// Special cases:
+//  Score(0) = [NaN, NaN]
+func (w Weibull) Score(deriv []float64, x float64) []float64 {
+	if deriv == nil {
+		deriv = make([]float64, w.NumParameters())
+	}
+	if len(deriv) != w.NumParameters() {
+		panic(badLength)
+	}
+	if x > 0 {
+		deriv[0] = 1/w.K + math.Log(x) - math.Log(w.Lambda) - (math.Log(x)-math.Log(w.Lambda))*math.Pow(x/w.Lambda, w.K)
+		deriv[1] = (w.K * (math.Pow(x/w.Lambda, w.K) - 1)) / w.Lambda
+		return deriv
+	}
+	if x < 0 {
+		deriv[0] = 0
+		deriv[1] = 0
+		return deriv
+	}
+	deriv[0] = math.NaN()
+	deriv[1] = math.NaN()
+	return deriv
+}
+
+// ScoreInput returns the score function with respect to the input of the
+// distribution at the input location specified by x. The score function is the
+// derivative of the log-likelihood
+//  (d/dx) log(p(x)) .
+//
+// Special cases:
+//  ScoreInput(0) = NaN
+func (w Weibull) ScoreInput(x float64) float64 {
+	if x > 0 {
+		return (-w.K*math.Pow(x/w.Lambda, w.K) + w.K - 1) / x
+	}
+	if x < 0 {
+		return 0
+	}
+	return math.NaN()
+}
+
+// Skewness returns the skewness of the distribution.
+func (w Weibull) Skewness() float64 {
+	stdDev := w.StdDev()
+	firstGamma, firstGammaSign := math.Lgamma(1 + 3/w.K)
+	logFirst := firstGamma + 3*(math.Log(w.Lambda)-math.Log(stdDev))
+	logSecond := math.Log(3) + math.Log(w.Mean()) + 2*math.Log(stdDev) - 3*math.Log(stdDev)
+	logThird := 3 * (math.Log(w.Mean()) - math.Log(stdDev))
+	return float64(firstGammaSign)*math.Exp(logFirst) - math.Exp(logSecond) - math.Exp(logThird)
+}
+
+// StdDev returns the standard deviation of the probability distribution.
+func (w Weibull) StdDev() float64 {
+	return math.Sqrt(w.Variance())
+}
+
+// Survival returns the survival function (complementary CDF) at x.
+func (w Weibull) Survival(x float64) float64 {
+	return math.Exp(w.LogSurvival(x))
+}
+
+// setParameters modifies the parameters of the distribution.
+func (w *Weibull) setParameters(p []Parameter) {
+	if len(p) != w.NumParameters() {
+		panic("weibull: incorrect number of parameters to set")
+	}
+	if p[0].Name != "K" {
+		panic("weibull: " + panicNameMismatch)
+	}
+	if p[1].Name != "λ" {
+		panic("weibull: " + panicNameMismatch)
+	}
+	w.K = p[0].Value
+	w.Lambda = p[1].Value
+}
+
+// Variance returns the variance of the probability distribution.
+func (w Weibull) Variance() float64 {
+	return math.Pow(w.Lambda, 2) * (math.Gamma(1+2/w.K) - w.gammaIPow(1, 2))
+}
+
+// parameters returns the parameters of the distribution.
+func (w Weibull) parameters(p []Parameter) []Parameter {
+	nParam := w.NumParameters()
+	if p == nil {
+		p = make([]Parameter, nParam)
+	} else if len(p) != nParam {
+		panic("weibull: improper parameter length")
+	}
+	p[0].Name = "K"
+	p[0].Value = w.K
+	p[1].Name = "λ"
+	p[1].Value = w.Lambda
+	return p
+}
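Editorial aside (not part of the patch): Rand above is inverse-transform sampling, a uniform draw pushed through Quantile, x = λ(-ln(1-u))^(1/k). A standalone check that the resulting sample mean approaches λΓ(1+1/k):

```go
package main

import (
	"fmt"
	"math"
	"math/rand"
)

func main() {
	rng := rand.New(rand.NewSource(1))
	const (
		k      = 2.0 // shape (Rayleigh)
		lambda = 1.5 // scale
		n      = 500000
	)
	var sum float64
	for i := 0; i < n; i++ {
		u := rng.Float64()
		// Inverse-transform sample: the Weibull quantile function at u.
		sum += lambda * math.Pow(-math.Log(1-u), 1/k)
	}
	fmt.Printf("sample mean %.4f, want %.4f\n", sum/n, lambda*math.Gamma(1+1/k))
}
```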
+// of the log-likelihood at x with respect to the parameters
+//  (∂/∂θ) log(p(x;θ))
+// If deriv is non-nil, the derivative is stored in-place into deriv, and Score
+// will panic if len(deriv) does not equal the number of parameters. If deriv
+// is nil a new slice will be allocated and returned.
+//
+// The order is [∂LogProb / ∂K, ∂LogProb / ∂λ].
+//
+// For more information, see https://en.wikipedia.org/wiki/Score_%28statistics%29.
+//
+// Special cases:
+//  Score(0) = [NaN, NaN]
+func (w Weibull) Score(deriv []float64, x float64) []float64 {
+	if deriv == nil {
+		deriv = make([]float64, w.NumParameters())
+	}
+	if len(deriv) != w.NumParameters() {
+		panic(badLength)
+	}
+	if x > 0 {
+		deriv[0] = 1/w.K + math.Log(x) - math.Log(w.Lambda) - (math.Log(x)-math.Log(w.Lambda))*math.Pow(x/w.Lambda, w.K)
+		deriv[1] = (w.K * (math.Pow(x/w.Lambda, w.K) - 1)) / w.Lambda
+		return deriv
+	}
+	if x < 0 {
+		deriv[0] = 0
+		deriv[1] = 0
+		return deriv
+	}
+	deriv[0] = math.NaN()
+	deriv[1] = math.NaN()
+	return deriv
+}
+
+// ScoreInput returns the score function with respect to the input of the
+// distribution at the input location specified by x. The score function is the
+// derivative of the log-likelihood
+//  (d/dx) log(p(x)) .
+//
+// Special cases:
+//  ScoreInput(0) = NaN
+func (w Weibull) ScoreInput(x float64) float64 {
+	if x > 0 {
+		return (-w.K*math.Pow(x/w.Lambda, w.K) + w.K - 1) / x
+	}
+	if x < 0 {
+		return 0
+	}
+	return math.NaN()
+}
+
+// Skewness returns the skewness of the distribution.
+func (w Weibull) Skewness() float64 {
+	stdDev := w.StdDev()
+	firstGamma, firstGammaSign := math.Lgamma(1 + 3/w.K)
+	logFirst := firstGamma + 3*(math.Log(w.Lambda)-math.Log(stdDev))
+	logSecond := math.Log(3) + math.Log(w.Mean()) + 2*math.Log(stdDev) - 3*math.Log(stdDev)
+	logThird := 3 * (math.Log(w.Mean()) - math.Log(stdDev))
+	return float64(firstGammaSign)*math.Exp(logFirst) - math.Exp(logSecond) - math.Exp(logThird)
+}
+
+// StdDev returns the standard deviation of the probability distribution.
+func (w Weibull) StdDev() float64 {
+	return math.Sqrt(w.Variance())
+}
+
+// Survival returns the survival function (complementary CDF) at x.
+func (w Weibull) Survival(x float64) float64 {
+	return math.Exp(w.LogSurvival(x))
+}
+
+// setParameters modifies the parameters of the distribution.
+func (w *Weibull) setParameters(p []Parameter) {
+	if len(p) != w.NumParameters() {
+		panic("weibull: incorrect number of parameters to set")
+	}
+	if p[0].Name != "K" {
+		panic("weibull: " + panicNameMismatch)
+	}
+	if p[1].Name != "λ" {
+		panic("weibull: " + panicNameMismatch)
+	}
+	w.K = p[0].Value
+	w.Lambda = p[1].Value
+}
+
+// Variance returns the variance of the probability distribution.
+func (w Weibull) Variance() float64 {
+	return math.Pow(w.Lambda, 2) * (math.Gamma(1+2/w.K) - w.gammaIPow(1, 2))
+}
+
+// parameters returns the parameters of the distribution.
+func (w Weibull) parameters(p []Parameter) []Parameter {
+	nParam := w.NumParameters()
+	if p == nil {
+		p = make([]Parameter, nParam)
+	} else if len(p) != nParam {
+		panic("weibull: improper parameter length")
+	}
+	p[0].Name = "K"
+	p[0].Value = w.K
+	p[1].Name = "λ"
+	p[1].Value = w.Lambda
+	return p
+}
diff --git a/stat/distuv/weibull_test.go b/stat/distuv/weibull_test.go
new file mode 100644
index 00000000..cd78f49f
--- /dev/null
+++ b/stat/distuv/weibull_test.go
@@ -0,0 +1,209 @@
+// Copyright ©2014 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package distuv + +import ( + "math" + "testing" +) + +func TestHalfKStandardWeibullProb(t *testing.T) { + pts := []univariateProbPoint{ + { + loc: 0, + prob: math.Inf(1), + cumProb: 0, + logProb: math.Inf(1), + }, + { + loc: -1, + prob: 0, + cumProb: 0, + logProb: 0, + }, + { + loc: 1, + prob: 0.183939720585721, + cumProb: 0.632120558828558, + logProb: -1.693147180559950, + }, + { + loc: 20, + prob: 0.001277118038048, + cumProb: 0.988577109006533, + logProb: -6.663149272336520, + }, + } + testDistributionProbs(t, Weibull{K: 0.5, Lambda: 1}, "0.5K Standard Weibull", pts) +} + +func TestExponentialStandardWeibullProb(t *testing.T) { + pts := []univariateProbPoint{ + { + loc: 0, + prob: 1, + cumProb: 0, + logProb: math.Inf(1), + }, + { + loc: -1, + prob: 0, + cumProb: 0, + logProb: 0, + }, + { + loc: 1, + prob: 0.367879441171442, + cumProb: 0.632120558828558, + logProb: -1.0, + }, + { + loc: 20, + prob: 0.000000002061154, + cumProb: 0.999999997938846, + logProb: -20.0, + }, + } + testDistributionProbs(t, Weibull{K: 1, Lambda: 1}, "1K (Exponential) Standard Weibull", pts) +} + +func TestRayleighStandardWeibullProb(t *testing.T) { + pts := []univariateProbPoint{ + { + loc: 0, + prob: 0, + cumProb: 0, + logProb: math.Inf(-1), + }, + { + loc: -1, + prob: 0, + cumProb: 0, + logProb: 0, + }, + { + loc: 1, + prob: 0.735758882342885, + cumProb: 0.632120558828558, + logProb: -0.306852819440055, + }, + { + loc: 20, + prob: 0, + cumProb: 1, + logProb: -396.31112054588607, + }, + } + testDistributionProbs(t, Weibull{K: 2, Lambda: 1}, "2K (Rayleigh) Standard Weibull", pts) +} + +func TestFiveKStandardWeibullProb(t *testing.T) { + pts := []univariateProbPoint{ + { + loc: 0, + prob: 0, + cumProb: 0, + logProb: math.Inf(-1), + }, + { + loc: -1, + prob: 0, + cumProb: 0, + logProb: 0, + }, + { + loc: 1, + prob: 1.839397205857210, + cumProb: 0.632120558828558, + logProb: 0.609437912434100, + }, + { + loc: 20, + prob: 0, + cumProb: 1, + logProb: -3199986.4076329935, + }, + } + testDistributionProbs(t, Weibull{K: 5, Lambda: 1}, "5K Standard Weibull", pts) +} + +func TestScaledUpHalfKStandardWeibullProb(t *testing.T) { + pts := []univariateProbPoint{ + { + loc: 0, + prob: math.Inf(1), + cumProb: 0, + logProb: math.Inf(1), + }, + { + loc: -1, + prob: 0, + cumProb: 0, + logProb: 0, + }, + { + loc: 1, + prob: 0.180436508682207, + cumProb: 0.558022622759326, + logProb: -1.712376315541750, + }, + { + loc: 20, + prob: 0.002369136850928, + cumProb: 0.974047406098605, + logProb: -6.045229588092130, + }, + } + testDistributionProbs(t, Weibull{K: 0.5, Lambda: 1.5}, "0.5K 1.5λ Weibull", pts) +} + +func TestScaledDownHalfKStandardWeibullProb(t *testing.T) { + pts := []univariateProbPoint{ + { + loc: 0, + prob: math.Inf(1), + cumProb: 0, + logProb: math.Inf(1), + }, + { + loc: -1, + prob: 0, + cumProb: 0, + logProb: 0, + }, + { + loc: 1, + prob: 0.171909491538362, + cumProb: 0.756883265565786, + logProb: -1.760787152653070, + }, + { + loc: 20, + prob: 0.000283302579100, + cumProb: 0.998208237166091, + logProb: -8.168995047393730, + }, + } + testDistributionProbs(t, Weibull{K: 0.5, Lambda: 0.5}, "0.5K 0.5λ Weibull", pts) +} + +func TestWeibullScore(t *testing.T) { + for _, test := range []*Weibull{ + { + K: 1, + Lambda: 1, + }, + { + K: 2, + Lambda: 3.6, + }, + { + K: 3.4, + Lambda: 8, + }, + } { + testDerivParam(t, test) + } +} diff --git a/stat/faithful_test.go b/stat/faithful_test.go new file mode 100644 index 
00000000..981eff1c --- /dev/null +++ b/stat/faithful_test.go @@ -0,0 +1,81 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package stat + +// faithful is the faithful data set from R. +var faithful = struct{ waiting, eruptions []float64 }{ + waiting: []float64{ + 79, 54, 74, 62, 85, 55, 88, 85, + 51, 85, 54, 84, 78, 47, 83, 52, + 62, 84, 52, 79, 51, 47, 78, 69, + 74, 83, 55, 76, 78, 79, 73, 77, + 66, 80, 74, 52, 48, 80, 59, 90, + 80, 58, 84, 58, 73, 83, 64, 53, + 82, 59, 75, 90, 54, 80, 54, 83, + 71, 64, 77, 81, 59, 84, 48, 82, + 60, 92, 78, 78, 65, 73, 82, 56, + 79, 71, 62, 76, 60, 78, 76, 83, + 75, 82, 70, 65, 73, 88, 76, 80, + 48, 86, 60, 90, 50, 78, 63, 72, + 84, 75, 51, 82, 62, 88, 49, 83, + 81, 47, 84, 52, 86, 81, 75, 59, + 89, 79, 59, 81, 50, 85, 59, 87, + 53, 69, 77, 56, 88, 81, 45, 82, + 55, 90, 45, 83, 56, 89, 46, 82, + 51, 86, 53, 79, 81, 60, 82, 77, + 76, 59, 80, 49, 96, 53, 77, 77, + 65, 81, 71, 70, 81, 93, 53, 89, + 45, 86, 58, 78, 66, 76, 63, 88, + 52, 93, 49, 57, 77, 68, 81, 81, + 73, 50, 85, 74, 55, 77, 83, 83, + 51, 78, 84, 46, 83, 55, 81, 57, + 76, 84, 77, 81, 87, 77, 51, 78, + 60, 82, 91, 53, 78, 46, 77, 84, + 49, 83, 71, 80, 49, 75, 64, 76, + 53, 94, 55, 76, 50, 82, 54, 75, + 78, 79, 78, 78, 70, 79, 70, 54, + 86, 50, 90, 54, 54, 77, 79, 64, + 75, 47, 86, 63, 85, 82, 57, 82, + 67, 74, 54, 83, 73, 73, 88, 80, + 71, 83, 56, 79, 78, 84, 58, 83, + 43, 60, 75, 81, 46, 90, 46, 74, + }, + eruptions: []float64{ + 3.600, 1.800, 3.333, 2.283, 4.533, 2.883, 4.700, 3.600, + 1.950, 4.350, 1.833, 3.917, 4.200, 1.750, 4.700, 2.167, + 1.750, 4.800, 1.600, 4.250, 1.800, 1.750, 3.450, 3.067, + 4.533, 3.600, 1.967, 4.083, 3.850, 4.433, 4.300, 4.467, + 3.367, 4.033, 3.833, 2.017, 1.867, 4.833, 1.833, 4.783, + 4.350, 1.883, 4.567, 1.750, 4.533, 3.317, 3.833, 2.100, + 4.633, 2.000, 4.800, 4.716, 1.833, 4.833, 1.733, 4.883, + 3.717, 1.667, 4.567, 4.317, 2.233, 4.500, 1.750, 4.800, + 1.817, 4.400, 4.167, 4.700, 2.067, 4.700, 4.033, 1.967, + 4.500, 4.000, 1.983, 5.067, 2.017, 4.567, 3.883, 3.600, + 4.133, 4.333, 4.100, 2.633, 4.067, 4.933, 3.950, 4.517, + 2.167, 4.000, 2.200, 4.333, 1.867, 4.817, 1.833, 4.300, + 4.667, 3.750, 1.867, 4.900, 2.483, 4.367, 2.100, 4.500, + 4.050, 1.867, 4.700, 1.783, 4.850, 3.683, 4.733, 2.300, + 4.900, 4.417, 1.700, 4.633, 2.317, 4.600, 1.817, 4.417, + 2.617, 4.067, 4.250, 1.967, 4.600, 3.767, 1.917, 4.500, + 2.267, 4.650, 1.867, 4.167, 2.800, 4.333, 1.833, 4.383, + 1.883, 4.933, 2.033, 3.733, 4.233, 2.233, 4.533, 4.817, + 4.333, 1.983, 4.633, 2.017, 5.100, 1.800, 5.033, 4.000, + 2.400, 4.600, 3.567, 4.000, 4.500, 4.083, 1.800, 3.967, + 2.200, 4.150, 2.000, 3.833, 3.500, 4.583, 2.367, 5.000, + 1.933, 4.617, 1.917, 2.083, 4.583, 3.333, 4.167, 4.333, + 4.500, 2.417, 4.000, 4.167, 1.883, 4.583, 4.250, 3.767, + 2.033, 4.433, 4.083, 1.833, 4.417, 2.183, 4.800, 1.833, + 4.800, 4.100, 3.966, 4.233, 3.500, 4.366, 2.250, 4.667, + 2.100, 4.350, 4.133, 1.867, 4.600, 1.783, 4.367, 3.850, + 1.933, 4.500, 2.383, 4.700, 1.867, 3.833, 3.417, 4.233, + 2.400, 4.800, 2.000, 4.150, 1.867, 4.267, 1.750, 4.483, + 4.000, 4.117, 4.083, 4.267, 3.917, 4.550, 4.083, 2.417, + 4.183, 2.217, 4.450, 1.883, 1.850, 4.283, 3.950, 2.333, + 4.150, 2.350, 4.933, 2.900, 4.583, 3.833, 2.083, 4.367, + 2.133, 4.350, 2.200, 4.450, 3.567, 4.500, 4.150, 3.817, + 3.917, 4.450, 2.000, 4.283, 4.767, 4.533, 1.850, 4.250, + 1.983, 2.250, 4.750, 4.117, 2.150, 4.417, 1.817, 4.467, + }, +} 
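+
+// The two columns above pair one-to-one per eruption, so the summary
+// functions in this package apply directly. A minimal sketch (within this
+// package the functions are unqualified):
+//
+//	meanWait := Mean(faithful.waiting, nil)
+//	r := Correlation(faithful.eruptions, faithful.waiting, nil)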
diff --git a/stat/moments_bench_test.go b/stat/moments_bench_test.go new file mode 100644 index 00000000..6df41394 --- /dev/null +++ b/stat/moments_bench_test.go @@ -0,0 +1,611 @@ +// Copyright ©2014 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// a set of benchmarks to evaluate the performance of the various +// moment statistics: Mean, Variance, StdDev, MeanVariance, MeanStdDev, +// Covariance, Correlation, Skew, ExKurtosis, Moment, MomentAbout, ... +// +// It tests both weighted and unweighted versions by using a slice of +// all ones. + +package stat + +import ( + "math/rand" + "testing" +) + +const ( + small = 10 + medium = 1000 + large = 100000 + huge = 10000000 +) + +// tests for unweighted versions + +func RandomSlice(l int) []float64 { + s := make([]float64, l) + for i := range s { + s[i] = rand.Float64() + } + return s +} + +func benchmarkMean(b *testing.B, s, wts []float64) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + Mean(s, wts) + } +} + +func BenchmarkMeanSmall(b *testing.B) { + s := RandomSlice(small) + benchmarkMean(b, s, nil) +} + +func BenchmarkMeanMedium(b *testing.B) { + s := RandomSlice(medium) + benchmarkMean(b, s, nil) +} + +func BenchmarkMeanLarge(b *testing.B) { + s := RandomSlice(large) + benchmarkMean(b, s, nil) +} + +func BenchmarkMeanHuge(b *testing.B) { + s := RandomSlice(huge) + benchmarkMean(b, s, nil) +} + +func BenchmarkMeanSmallWeighted(b *testing.B) { + s := RandomSlice(small) + wts := RandomSlice(small) + benchmarkMean(b, s, wts) +} + +func BenchmarkMeanMediumWeighted(b *testing.B) { + s := RandomSlice(medium) + wts := RandomSlice(medium) + benchmarkMean(b, s, wts) +} + +func BenchmarkMeanLargeWeighted(b *testing.B) { + s := RandomSlice(large) + wts := RandomSlice(large) + benchmarkMean(b, s, wts) +} + +func BenchmarkMeanHugeWeighted(b *testing.B) { + s := RandomSlice(huge) + wts := RandomSlice(huge) + benchmarkMean(b, s, wts) +} + +func benchmarkVariance(b *testing.B, s, wts []float64) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + Variance(s, wts) + } +} + +func BenchmarkVarianceSmall(b *testing.B) { + s := RandomSlice(small) + benchmarkVariance(b, s, nil) +} + +func BenchmarkVarianceMedium(b *testing.B) { + s := RandomSlice(medium) + benchmarkVariance(b, s, nil) +} + +func BenchmarkVarianceLarge(b *testing.B) { + s := RandomSlice(large) + benchmarkVariance(b, s, nil) +} + +func BenchmarkVarianceHuge(b *testing.B) { + s := RandomSlice(huge) + benchmarkVariance(b, s, nil) +} + +func BenchmarkVarianceSmallWeighted(b *testing.B) { + s := RandomSlice(small) + wts := RandomSlice(small) + benchmarkVariance(b, s, wts) +} + +func BenchmarkVarianceMediumWeighted(b *testing.B) { + s := RandomSlice(medium) + wts := RandomSlice(medium) + benchmarkVariance(b, s, wts) +} + +func BenchmarkVarianceLargeWeighted(b *testing.B) { + s := RandomSlice(large) + wts := RandomSlice(large) + benchmarkVariance(b, s, wts) +} + +func BenchmarkVarianceHugeWeighted(b *testing.B) { + s := RandomSlice(huge) + wts := RandomSlice(huge) + benchmarkVariance(b, s, wts) +} + +func benchmarkStdDev(b *testing.B, s, wts []float64) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + StdDev(s, wts) + } +} + +func BenchmarkStdDevSmall(b *testing.B) { + s := RandomSlice(small) + benchmarkStdDev(b, s, nil) +} + +func BenchmarkStdDevMedium(b *testing.B) { + s := RandomSlice(medium) + benchmarkStdDev(b, s, nil) +} + +func BenchmarkStdDevLarge(b *testing.B) { + s := RandomSlice(large) + 
benchmarkStdDev(b, s, nil) +} + +func BenchmarkStdDevHuge(b *testing.B) { + s := RandomSlice(huge) + benchmarkStdDev(b, s, nil) +} + +func BenchmarkStdDevSmallWeighted(b *testing.B) { + s := RandomSlice(small) + wts := RandomSlice(small) + benchmarkStdDev(b, s, wts) +} + +func BenchmarkStdDevMediumWeighted(b *testing.B) { + s := RandomSlice(medium) + wts := RandomSlice(medium) + benchmarkStdDev(b, s, wts) +} + +func BenchmarkStdDevLargeWeighted(b *testing.B) { + s := RandomSlice(large) + wts := RandomSlice(large) + benchmarkStdDev(b, s, wts) +} + +func BenchmarkStdDevHugeWeighted(b *testing.B) { + s := RandomSlice(huge) + wts := RandomSlice(huge) + benchmarkStdDev(b, s, wts) +} + +func benchmarkMeanVariance(b *testing.B, s, wts []float64) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + MeanVariance(s, wts) + } +} + +func BenchmarkMeanVarianceSmall(b *testing.B) { + s := RandomSlice(small) + benchmarkMeanVariance(b, s, nil) +} + +func BenchmarkMeanVarianceMedium(b *testing.B) { + s := RandomSlice(medium) + benchmarkMeanVariance(b, s, nil) +} + +func BenchmarkMeanVarianceLarge(b *testing.B) { + s := RandomSlice(large) + benchmarkMeanVariance(b, s, nil) +} + +func BenchmarkMeanVarianceHuge(b *testing.B) { + s := RandomSlice(huge) + benchmarkMeanVariance(b, s, nil) +} + +func BenchmarkMeanVarianceSmallWeighted(b *testing.B) { + s := RandomSlice(small) + wts := RandomSlice(small) + benchmarkMeanVariance(b, s, wts) +} + +func BenchmarkMeanVarianceMediumWeighted(b *testing.B) { + s := RandomSlice(medium) + wts := RandomSlice(medium) + benchmarkMeanVariance(b, s, wts) +} + +func BenchmarkMeanVarianceLargeWeighted(b *testing.B) { + s := RandomSlice(large) + wts := RandomSlice(large) + benchmarkMeanVariance(b, s, wts) +} + +func BenchmarkMeanVarianceHugeWeighted(b *testing.B) { + s := RandomSlice(huge) + wts := RandomSlice(huge) + benchmarkMeanVariance(b, s, wts) +} + +func benchmarkMeanStdDev(b *testing.B, s, wts []float64) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + MeanStdDev(s, wts) + } +} + +func BenchmarkMeanStdDevSmall(b *testing.B) { + s := RandomSlice(small) + benchmarkMeanStdDev(b, s, nil) +} + +func BenchmarkMeanStdDevMedium(b *testing.B) { + s := RandomSlice(medium) + benchmarkMeanStdDev(b, s, nil) +} + +func BenchmarkMeanStdDevLarge(b *testing.B) { + s := RandomSlice(large) + benchmarkMeanStdDev(b, s, nil) +} + +func BenchmarkMeanStdDevHuge(b *testing.B) { + s := RandomSlice(huge) + benchmarkMeanStdDev(b, s, nil) +} + +func BenchmarkMeanStdDevSmallWeighted(b *testing.B) { + s := RandomSlice(small) + wts := RandomSlice(small) + benchmarkMeanStdDev(b, s, wts) +} + +func BenchmarkMeanStdDevMediumWeighted(b *testing.B) { + s := RandomSlice(medium) + wts := RandomSlice(medium) + benchmarkMeanStdDev(b, s, wts) +} + +func BenchmarkMeanStdDevLargeWeighted(b *testing.B) { + s := RandomSlice(large) + wts := RandomSlice(large) + benchmarkMeanStdDev(b, s, wts) +} + +func BenchmarkMeanStdDevHugeWeighted(b *testing.B) { + s := RandomSlice(huge) + wts := RandomSlice(huge) + benchmarkMeanStdDev(b, s, wts) +} + +func benchmarkCovariance(b *testing.B, s1, s2, wts []float64) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + Covariance(s1, s2, wts) + } +} + +func BenchmarkCovarianceSmall(b *testing.B) { + s1 := RandomSlice(small) + s2 := RandomSlice(small) + benchmarkCovariance(b, s1, s2, nil) +} + +func BenchmarkCovarianceMedium(b *testing.B) { + s1 := RandomSlice(medium) + s2 := RandomSlice(medium) + benchmarkCovariance(b, s1, s2, nil) +} + +func BenchmarkCovarianceLarge(b *testing.B) { + s1 := 
RandomSlice(large) + s2 := RandomSlice(large) + benchmarkCovariance(b, s1, s2, nil) +} + +func BenchmarkCovarianceHuge(b *testing.B) { + s1 := RandomSlice(huge) + s2 := RandomSlice(huge) + benchmarkCovariance(b, s1, s2, nil) +} + +func BenchmarkCovarianceSmallWeighted(b *testing.B) { + s1 := RandomSlice(small) + s2 := RandomSlice(small) + wts := RandomSlice(small) + benchmarkCovariance(b, s1, s2, wts) +} + +func BenchmarkCovarianceMediumWeighted(b *testing.B) { + s1 := RandomSlice(medium) + s2 := RandomSlice(medium) + wts := RandomSlice(medium) + benchmarkCovariance(b, s1, s2, wts) +} + +func BenchmarkCovarianceLargeWeighted(b *testing.B) { + s1 := RandomSlice(large) + s2 := RandomSlice(large) + wts := RandomSlice(large) + benchmarkCovariance(b, s1, s2, wts) +} + +func BenchmarkCovarianceHugeWeighted(b *testing.B) { + s1 := RandomSlice(huge) + s2 := RandomSlice(huge) + wts := RandomSlice(huge) + benchmarkCovariance(b, s1, s2, wts) +} + +func benchmarkCorrelation(b *testing.B, s1, s2, wts []float64) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + Correlation(s1, s2, wts) + } +} + +func BenchmarkCorrelationSmall(b *testing.B) { + s1 := RandomSlice(small) + s2 := RandomSlice(small) + benchmarkCorrelation(b, s1, s2, nil) +} + +func BenchmarkCorrelationMedium(b *testing.B) { + s1 := RandomSlice(medium) + s2 := RandomSlice(medium) + benchmarkCorrelation(b, s1, s2, nil) +} + +func BenchmarkCorrelationLarge(b *testing.B) { + s1 := RandomSlice(large) + s2 := RandomSlice(large) + benchmarkCorrelation(b, s1, s2, nil) +} + +func BenchmarkCorrelationHuge(b *testing.B) { + s1 := RandomSlice(huge) + s2 := RandomSlice(huge) + benchmarkCorrelation(b, s1, s2, nil) +} + +func BenchmarkCorrelationSmallWeighted(b *testing.B) { + s1 := RandomSlice(small) + s2 := RandomSlice(small) + wts := RandomSlice(small) + benchmarkCorrelation(b, s1, s2, wts) +} + +func BenchmarkCorrelationMediumWeighted(b *testing.B) { + s1 := RandomSlice(medium) + s2 := RandomSlice(medium) + wts := RandomSlice(medium) + benchmarkCorrelation(b, s1, s2, wts) +} + +func BenchmarkCorrelationLargeWeighted(b *testing.B) { + s1 := RandomSlice(large) + s2 := RandomSlice(large) + wts := RandomSlice(large) + benchmarkCorrelation(b, s1, s2, wts) +} + +func BenchmarkCorrelationHugeWeighted(b *testing.B) { + s1 := RandomSlice(huge) + s2 := RandomSlice(huge) + wts := RandomSlice(huge) + benchmarkCorrelation(b, s1, s2, wts) +} + +func benchmarkSkew(b *testing.B, s, wts []float64) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + Skew(s, wts) + } +} + +func BenchmarkSkewSmall(b *testing.B) { + s := RandomSlice(small) + benchmarkSkew(b, s, nil) +} + +func BenchmarkSkewMedium(b *testing.B) { + s := RandomSlice(medium) + benchmarkSkew(b, s, nil) +} + +func BenchmarkSkewLarge(b *testing.B) { + s := RandomSlice(large) + benchmarkSkew(b, s, nil) +} + +func BenchmarkSkewHuge(b *testing.B) { + s := RandomSlice(huge) + benchmarkSkew(b, s, nil) +} + +func BenchmarkSkewSmallWeighted(b *testing.B) { + s := RandomSlice(small) + wts := RandomSlice(small) + benchmarkSkew(b, s, wts) +} + +func BenchmarkSkewMediumWeighted(b *testing.B) { + s := RandomSlice(medium) + wts := RandomSlice(medium) + benchmarkSkew(b, s, wts) +} + +func BenchmarkSkewLargeWeighted(b *testing.B) { + s := RandomSlice(large) + wts := RandomSlice(large) + benchmarkSkew(b, s, wts) +} + +func BenchmarkSkewHugeWeighted(b *testing.B) { + s := RandomSlice(huge) + wts := RandomSlice(huge) + benchmarkSkew(b, s, wts) +} + +func benchmarkExKurtosis(b *testing.B, s, wts []float64) { + b.ResetTimer() + for i 
:= 0; i < b.N; i++ { + ExKurtosis(s, wts) + } +} + +func BenchmarkExKurtosisSmall(b *testing.B) { + s := RandomSlice(small) + benchmarkExKurtosis(b, s, nil) +} + +func BenchmarkExKurtosisMedium(b *testing.B) { + s := RandomSlice(medium) + benchmarkExKurtosis(b, s, nil) +} + +func BenchmarkExKurtosisLarge(b *testing.B) { + s := RandomSlice(large) + benchmarkExKurtosis(b, s, nil) +} + +func BenchmarkExKurtosisHuge(b *testing.B) { + s := RandomSlice(huge) + benchmarkExKurtosis(b, s, nil) +} + +func BenchmarkExKurtosisSmallWeighted(b *testing.B) { + s := RandomSlice(small) + wts := RandomSlice(small) + benchmarkExKurtosis(b, s, wts) +} + +func BenchmarkExKurtosisMediumWeighted(b *testing.B) { + s := RandomSlice(medium) + wts := RandomSlice(medium) + benchmarkExKurtosis(b, s, wts) +} + +func BenchmarkExKurtosisLargeWeighted(b *testing.B) { + s := RandomSlice(large) + wts := RandomSlice(large) + benchmarkExKurtosis(b, s, wts) +} + +func BenchmarkExKurtosisHugeWeighted(b *testing.B) { + s := RandomSlice(huge) + wts := RandomSlice(huge) + benchmarkExKurtosis(b, s, wts) +} + +func benchmarkMoment(b *testing.B, n float64, s, wts []float64) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + Moment(n, s, wts) + } +} + +func BenchmarkMomentSmall(b *testing.B) { + s := RandomSlice(small) + benchmarkMoment(b, 5, s, nil) +} + +func BenchmarkMomentMedium(b *testing.B) { + s := RandomSlice(medium) + benchmarkMoment(b, 5, s, nil) +} + +func BenchmarkMomentLarge(b *testing.B) { + s := RandomSlice(large) + benchmarkMoment(b, 5, s, nil) +} + +func BenchmarkMomentHuge(b *testing.B) { + s := RandomSlice(huge) + benchmarkMoment(b, 5, s, nil) +} + +func BenchmarkMomentSmallWeighted(b *testing.B) { + s := RandomSlice(small) + wts := RandomSlice(small) + benchmarkMoment(b, 5, s, wts) +} + +func BenchmarkMomentMediumWeighted(b *testing.B) { + s := RandomSlice(medium) + wts := RandomSlice(medium) + benchmarkMoment(b, 5, s, wts) +} + +func BenchmarkMomentLargeWeighted(b *testing.B) { + s := RandomSlice(large) + wts := RandomSlice(large) + benchmarkMoment(b, 5, s, wts) +} + +func BenchmarkMomentHugeWeighted(b *testing.B) { + s := RandomSlice(huge) + wts := RandomSlice(huge) + benchmarkMoment(b, 5, s, wts) +} + +func benchmarkMomentAbout(b *testing.B, n float64, s []float64, mean float64, wts []float64) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + MomentAbout(n, s, mean, wts) + } +} + +func BenchmarkMomentAboutSmall(b *testing.B) { + s := RandomSlice(small) + benchmarkMomentAbout(b, 5, s, 0, nil) +} + +func BenchmarkMomentAboutMedium(b *testing.B) { + s := RandomSlice(medium) + benchmarkMomentAbout(b, 5, s, 0, nil) +} + +func BenchmarkMomentAboutLarge(b *testing.B) { + s := RandomSlice(large) + benchmarkMomentAbout(b, 5, s, 0, nil) +} + +func BenchmarkMomentAboutHuge(b *testing.B) { + s := RandomSlice(huge) + benchmarkMomentAbout(b, 5, s, 0, nil) +} + +func BenchmarkMomentAboutSmallWeighted(b *testing.B) { + s := RandomSlice(small) + wts := RandomSlice(small) + benchmarkMomentAbout(b, 5, s, 0, wts) +} + +func BenchmarkMomentAboutMediumWeighted(b *testing.B) { + s := RandomSlice(medium) + wts := RandomSlice(medium) + benchmarkMomentAbout(b, 5, s, 0, wts) +} + +func BenchmarkMomentAboutLargeWeighted(b *testing.B) { + s := RandomSlice(large) + wts := RandomSlice(large) + benchmarkMomentAbout(b, 5, s, 0, wts) +} + +func BenchmarkMomentAboutHugeWeighted(b *testing.B) { + s := RandomSlice(huge) + wts := RandomSlice(huge) + benchmarkMomentAbout(b, 5, s, 0, wts) +} diff --git a/stat/pca_cca.go b/stat/pca_cca.go new file mode 
100644 index 00000000..c3fdef7f --- /dev/null +++ b/stat/pca_cca.go @@ -0,0 +1,320 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package stat + +import ( + "errors" + "math" + + "github.com/gonum/floats" + "github.com/gonum/matrix" + "github.com/gonum/matrix/mat64" +) + +// PC is a type for computing and extracting the principal components of a +// matrix. The results of the principal components analysis are only valid +// if the call to PrincipalComponents was successful. +type PC struct { + n, d int + weights []float64 + svd *mat64.SVD + ok bool +} + +// PrincipalComponents performs a weighted principal components analysis on the +// matrix of the input data which is represented as an n×d matrix a where each +// row is an observation and each column is a variable. +// +// PrincipalComponents centers the variables but does not scale the variance. +// +// The weights slice is used to weight the observations. If weights is nil, each +// weight is considered to have a value of one, otherwise the length of weights +// must match the number of observations or PrincipalComponents will panic. +// +// PrincipalComponents returns whether the analysis was successful. +func (c *PC) PrincipalComponents(a mat64.Matrix, weights []float64) (ok bool) { + c.n, c.d = a.Dims() + if weights != nil && len(weights) != c.n { + panic("stat: len(weights) != observations") + } + + c.svd, c.ok = svdFactorizeCentered(c.svd, a, weights) + if c.ok { + c.weights = append(c.weights[:0], weights...) + } + return c.ok +} + +// Vectors returns the component direction vectors of a principal components +// analysis. The vectors are returned in the columns of a d×min(n, d) matrix. +// If dst is not nil it must either be zero-sized or be a d×min(n, d) matrix. +// dst will be used as the destination for the direction vector data. If dst +// is nil, a new mat64.Dense is allocated for the destination. +func (c *PC) Vectors(dst *mat64.Dense) *mat64.Dense { + if !c.ok { + panic("stat: use of unsuccessful principal components analysis") + } + + if dst == nil { + dst = &mat64.Dense{} + } else if d, n := dst.Dims(); (n != 0 || d != 0) && (d != c.d || n != min(c.n, c.d)) { + panic(matrix.ErrShape) + } + dst.VFromSVD(c.svd) + return dst +} + +// Vars returns the column variances of the principal component scores, +// b * vecs, where b is a matrix with centered columns. Variances are returned +// in descending order. +// If dst is not nil it is used to store the variances and returned. +// Vars will panic if the receiver has not successfully performed a principal +// components analysis or dst is not nil and the length of dst is not min(n, d). +func (c *PC) Vars(dst []float64) []float64 { + if !c.ok { + panic("stat: use of unsuccessful principal components analysis") + } + if dst != nil && len(dst) != min(c.n, c.d) { + panic("stat: length of slice does not match analysis") + } + + dst = c.svd.Values(dst) + var f float64 + if c.weights == nil { + f = 1 / float64(c.n-1) + } else { + f = 1 / (floats.Sum(c.weights) - 1) + } + for i, v := range dst { + dst[i] = f * v * v + } + return dst +} + +func min(a, b int) int { + if a < b { + return a + } + return b +} + +// CC is a type for computing the canonical correlations of a pair of matrices. +// The results of the canonical correlation analysis are only valid +// if the call to CanonicalCorrelations was successful. 
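+//
+// A minimal usage sketch (x and y are mat64 matrices with one row per
+// observation; the variable names are illustrative only):
+//
+//	var cc CC
+//	if err := cc.CanonicalCorrelations(x, y, nil); err != nil {
+//		// Handle the failed factorization.
+//	}
+//	corrs := cc.Corrs(nil)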
+type CC struct { + // n is the number of observations used to + // construct the canonical correlations. + n int + + // xd and yd are used for size checks. + xd, yd int + + x, y, c *mat64.SVD + ok bool +} + +// CanonicalCorrelations returns a CC which can provide the results of canonical +// correlation analysis of the input data x and y, columns of which should be +// interpretable as two sets of measurements on the same observations (rows). +// These observations are optionally weighted by weights. +// +// Canonical correlation analysis finds associations between two sets of +// variables on the same observations by finding linear combinations of the two +// sphered datasets that maximize the correlation between them. +// +// Some notation: let Xc and Yc denote the centered input data matrices x +// and y (column means subtracted from each column), let Sx and Sy denote the +// sample covariance matrices within x and y respectively, and let Sxy denote +// the covariance matrix between x and y. The sphered data can then be expressed +// as Xc * Sx^{-1/2} and Yc * Sy^{-1/2} respectively, and the correlation matrix +// between the sphered data is called the canonical correlation matrix, +// Sx^{-1/2} * Sxy * Sy^{-1/2}. In cases where S^{-1/2} is ambiguous for some +// covariance matrix S, S^{-1/2} is taken to be E * D^{-1/2} * E^T where S can +// be eigendecomposed as S = E * D * E^T. +// +// The canonical correlations are the correlations between the corresponding +// pairs of canonical variables and can be obtained with c.Corrs(). Canonical +// variables can be obtained by projecting the sphered data into the left and +// right eigenvectors of the canonical correlation matrix, and these +// eigenvectors can be obtained with c.Left(m, true) and c.Right(m, true) +// respectively. The canonical variables can also be obtained directly from the +// centered raw data by using the back-transformed eigenvectors which can be +// obtained with c.Left(m, false) and c.Right(m, false) respectively. +// +// The first pair of left and right eigenvectors of the canonical correlation +// matrix can be interpreted as directions into which the respective sphered +// data can be projected such that the correlation between the two projections +// is maximized. The second pair and onwards solve the same optimization but +// under the constraint that they are uncorrelated (orthogonal in sphered space) +// to previous projections. +// +// CanonicalCorrelations will panic if the inputs x and y do not have the same +// number of rows. +// +// The slice weights is used to weight the observations. If weights is nil, each +// weight is considered to have a value of one, otherwise the length of weights +// must match the number of observations (rows of both x and y) or +// CanonicalCorrelations will panic. +// +// More details can be found at +// https://en.wikipedia.org/wiki/Canonical_correlation +// or in Chapter 3 of +// Koch, Inge. Analysis of multivariate and high-dimensional data. +// Vol. 32. Cambridge University Press, 2013. ISBN: 9780521887939 +func (c *CC) CanonicalCorrelations(x, y mat64.Matrix, weights []float64) error { + var yn int + c.n, c.xd = x.Dims() + yn, c.yd = y.Dims() + if c.n != yn { + panic("stat: unequal number of observations") + } + if weights != nil && len(weights) != c.n { + panic("stat: len(weights) != observations") + } + + // Center and factorize x and y. 
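+	// Each factorization below is an SVD of the column-centered (and, when
+	// weights are supplied, row-scaled) input, so the U and V factors
+	// extracted afterwards are enough to form the canonical correlation
+	// matrix without explicitly building the covariance matrices.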
+ c.x, c.ok = svdFactorizeCentered(c.x, x, weights) + if !c.ok { + return errors.New("stat: failed to factorize x") + } + c.y, c.ok = svdFactorizeCentered(c.y, y, weights) + if !c.ok { + return errors.New("stat: failed to factorize y") + } + var xu, xv, yu, yv mat64.Dense + xu.UFromSVD(c.x) + xv.VFromSVD(c.x) + yu.UFromSVD(c.y) + yv.VFromSVD(c.y) + + // Calculate and factorise the canonical correlation matrix. + var ccor mat64.Dense + ccor.Product(&xv, xu.T(), &yu, yv.T()) + if c.c == nil { + c.c = &mat64.SVD{} + } + c.ok = c.c.Factorize(&ccor, matrix.SVDThin) + if !c.ok { + return errors.New("stat: failed to factorize ccor") + } + return nil +} + +// Corrs returns the canonical correlations, using dst if it is not nil. +// If dst is not nil and len(dst) does not match the number of columns in +// the y input matrix, Corrs will panic. +func (c *CC) Corrs(dst []float64) []float64 { + if !c.ok { + panic("stat: canonical correlations missing or invalid") + } + + if dst != nil && len(dst) != c.yd { + panic("stat: length of destination does not match input dimension") + } + return c.c.Values(dst) +} + +// Left returns the left eigenvectors of the canonical correlation matrix if +// spheredSpace is true. If spheredSpace is false it returns these eigenvectors +// back-transformed to the original data space. +// If dst is not nil it must either be zero-sized or be an xd×yd matrix where xd +// and yd are the number of variables in the input x and y matrices. dst will +// be used as the destination for the vector data. If dst is nil, a new +// mat64.Dense is allocated for the destination. +func (c *CC) Left(dst *mat64.Dense, spheredSpace bool) *mat64.Dense { + if !c.ok || c.n < 2 { + panic("stat: canonical correlations missing or invalid") + } + + if dst == nil { + dst = &mat64.Dense{} + } else if d, n := dst.Dims(); (n != 0 || d != 0) && (n != c.yd || d != c.xd) { + panic(matrix.ErrShape) + } + dst.UFromSVD(c.c) + if spheredSpace { + return dst + } + + var xv mat64.Dense + xs := c.x.Values(nil) + xv.VFromSVD(c.x) + + scaleColsReciSqrt(&xv, xs) + + dst.Product(&xv, xv.T(), dst) + dst.Scale(math.Sqrt(float64(c.n-1)), dst) + return dst +} + +// Right returns the right eigenvectors of the canonical correlation matrix if +// spheredSpace is true. If spheredSpace is false it returns these eigenvectors +// back-transformed to the original data space. +// If dst is not nil it must either be zero-sized or be an yd×yd matrix where yd +// is the number of variables in the input y matrix. dst will +// be used as the destination for the vector data. If dst is nil, a new +// mat64.Dense is allocated for the destination. 
+func (c *CC) Right(dst *mat64.Dense, spheredSpace bool) *mat64.Dense { + if !c.ok || c.n < 2 { + panic("stat: canonical correlations missing or invalid") + } + + if dst == nil { + dst = &mat64.Dense{} + } else if d, n := dst.Dims(); (n != 0 || d != 0) && (n != c.yd || d != c.yd) { + panic(matrix.ErrShape) + } + dst.VFromSVD(c.c) + if spheredSpace { + return dst + } + + var yv mat64.Dense + ys := c.y.Values(nil) + yv.VFromSVD(c.y) + + scaleColsReciSqrt(&yv, ys) + + dst.Product(&yv, yv.T(), dst) + dst.Scale(math.Sqrt(float64(c.n-1)), dst) + return dst +} + +func svdFactorizeCentered(work *mat64.SVD, m mat64.Matrix, weights []float64) (svd *mat64.SVD, ok bool) { + n, d := m.Dims() + centered := mat64.NewDense(n, d, nil) + col := make([]float64, n) + for j := 0; j < d; j++ { + mat64.Col(col, j, m) + floats.AddConst(-Mean(col, weights), col) + centered.SetCol(j, col) + } + for i, w := range weights { + floats.Scale(math.Sqrt(w), centered.RawRowView(i)) + } + if work == nil { + work = &mat64.SVD{} + } + ok = work.Factorize(centered, matrix.SVDThin) + return work, ok +} + +// scaleColsReciSqrt scales the columns of cols +// by the reciprocal square-root of vals. +func scaleColsReciSqrt(cols *mat64.Dense, vals []float64) { + if cols == nil { + panic("stat: input nil") + } + n, d := cols.Dims() + if len(vals) != d { + panic("stat: input length mismatch") + } + col := make([]float64, n) + for j := 0; j < d; j++ { + mat64.Col(col, j, cols) + floats.Scale(math.Sqrt(1/vals[j]), col) + cols.SetCol(j, col) + } +} diff --git a/stat/pca_example_test.go b/stat/pca_example_test.go new file mode 100644 index 00000000..d7565cb0 --- /dev/null +++ b/stat/pca_example_test.go @@ -0,0 +1,60 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package stat_test + +import ( + "fmt" + + "github.com/gonum/matrix/mat64" + "github.com/gonum/stat" +) + +func ExamplePrincipalComponents() { + // iris is a truncated sample of the Fisher's Iris dataset. + n := 10 + d := 4 + iris := mat64.NewDense(n, d, []float64{ + 5.1, 3.5, 1.4, 0.2, + 4.9, 3.0, 1.4, 0.2, + 4.7, 3.2, 1.3, 0.2, + 4.6, 3.1, 1.5, 0.2, + 5.0, 3.6, 1.4, 0.2, + 5.4, 3.9, 1.7, 0.4, + 4.6, 3.4, 1.4, 0.3, + 5.0, 3.4, 1.5, 0.2, + 4.4, 2.9, 1.4, 0.2, + 4.9, 3.1, 1.5, 0.1, + }) + + // Calculate the principal component direction vectors + // and variances. + var pc stat.PC + ok := pc.PrincipalComponents(iris, nil) + if !ok { + return + } + fmt.Printf("variances = %.4f\n\n", pc.Vars(nil)) + + // Project the data onto the first 2 principal components. + k := 2 + var proj mat64.Dense + proj.Mul(iris, pc.Vectors(nil).Slice(0, d, 0, k)) + + fmt.Printf("proj = %.4f", mat64.Formatted(&proj, mat64.Prefix(" "))) + + // Output: + // variances = [0.1666 0.0207 0.0079 0.0019] + // + // proj = ⎡-6.1686 1.4659⎤ + // ⎢-5.6767 1.6459⎥ + // ⎢-5.6699 1.3642⎥ + // ⎢-5.5643 1.3816⎥ + // ⎢-6.1734 1.3309⎥ + // ⎢-6.7278 1.4021⎥ + // ⎢-5.7743 1.1498⎥ + // ⎢-6.0466 1.4714⎥ + // ⎢-5.2709 1.3570⎥ + // ⎣-5.7533 1.6207⎦ +} diff --git a/stat/pca_test.go b/stat/pca_test.go new file mode 100644 index 00000000..06618f5f --- /dev/null +++ b/stat/pca_test.go @@ -0,0 +1,183 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package stat + +import ( + "testing" + + "github.com/gonum/floats" + "github.com/gonum/matrix/mat64" +) + +func TestPrincipalComponents(t *testing.T) { +tests: + for i, test := range []struct { + data mat64.Matrix + weights []float64 + wantVecs *mat64.Dense + wantVars []float64 + epsilon float64 + }{ + // Test results verified using R. + { + data: mat64.NewDense(3, 3, []float64{ + 1, 2, 3, + 4, 5, 6, + 7, 8, 9, + }), + wantVecs: mat64.NewDense(3, 3, []float64{ + 0.5773502691896258, 0.8164965809277261, 0, + 0.577350269189626, -0.4082482904638632, -0.7071067811865476, + 0.5773502691896258, -0.4082482904638631, 0.7071067811865475, + }), + wantVars: []float64{27, 0, 0}, + epsilon: 1e-12, + }, + { // Truncated iris data. + data: mat64.NewDense(10, 4, []float64{ + 5.1, 3.5, 1.4, 0.2, + 4.9, 3.0, 1.4, 0.2, + 4.7, 3.2, 1.3, 0.2, + 4.6, 3.1, 1.5, 0.2, + 5.0, 3.6, 1.4, 0.2, + 5.4, 3.9, 1.7, 0.4, + 4.6, 3.4, 1.4, 0.3, + 5.0, 3.4, 1.5, 0.2, + 4.4, 2.9, 1.4, 0.2, + 4.9, 3.1, 1.5, 0.1, + }), + wantVecs: mat64.NewDense(4, 4, []float64{ + -0.6681110197952722, 0.7064764857539533, -0.14026590216895132, -0.18666578956412125, + -0.7166344774801547, -0.6427036135482664, -0.135650285905254, 0.23444848208629923, + -0.164411275166307, 0.11898477441068218, 0.9136367900709548, 0.35224901970831746, + -0.11415613655453069, -0.2714141920887426, 0.35664028439226514, -0.8866286823515034, + }), + wantVars: []float64{0.1665786313282786, 0.02065509475412993, 0.007944620317765855, 0.0019327647109368329}, + epsilon: 1e-12, + }, + { // Truncated iris data to form wide matrix. + data: mat64.NewDense(3, 4, []float64{ + 5.1, 3.5, 1.4, 0.2, + 4.9, 3.0, 1.4, 0.2, + 4.7, 3.2, 1.3, 0.2, + }), + wantVecs: mat64.NewDense(4, 3, []float64{ + -0.5705187254552365, -0.7505979435049239, 0.08084520834544455, + -0.8166537769529318, 0.5615147645527523, -0.032338083338177705, + -0.08709186238359454, -0.3482870890450082, -0.22636658336724505, + 0, 0, -0.9701425001453315, + }), + wantVars: []float64{0.0844692361537822, 0.022197430512884326, 0}, + epsilon: 1e-12, + }, + { // Truncated iris data transposed to check for operation on fat input. + data: mat64.NewDense(10, 4, []float64{ + 5.1, 3.5, 1.4, 0.2, + 4.9, 3.0, 1.4, 0.2, + 4.7, 3.2, 1.3, 0.2, + 4.6, 3.1, 1.5, 0.2, + 5.0, 3.6, 1.4, 0.2, + 5.4, 3.9, 1.7, 0.4, + 4.6, 3.4, 1.4, 0.3, + 5.0, 3.4, 1.5, 0.2, + 4.4, 2.9, 1.4, 0.2, + 4.9, 3.1, 1.5, 0.1, + }).T(), + wantVecs: mat64.NewDense(10, 4, []float64{ + -0.3366602459946619, -0.1373634006401213, 0.3465102523547623, -0.10290179303893479, + -0.31381852053861975, 0.5197145790632827, 0.5567296129086686, -0.15923062170153618, + -0.30857197637565165, -0.07670930360819002, 0.36159923003337235, 0.3342301027853355, + -0.29527124351656137, 0.16885455995353074, -0.5056204762881208, 0.32580913261444344, + -0.3327611073694004, -0.39365834489416474, 0.04900050959307464, 0.46812879383236555, + -0.34445484362044815, -0.2985206914561878, -0.1009714701361799, -0.16803618186050803, + -0.2986246350957691, -0.4222037823717799, -0.11838613462182519, -0.580283530375069, + -0.325911246223126, 0.024366468758217238, -0.12082035131864265, 0.16756027181337868, + -0.2814284432361538, 0.240812316260054, -0.24061437569068145, -0.365034616264623, + -0.31906138507685167, 0.4423912824105986, -0.2906412122303604, 0.027551046870337714, + }), + wantVars: []float64{41.8851906634233, 0.07762619213464989, 0.010516477775373585, 0}, + epsilon: 1e-12, + }, + { // Truncated iris data unitary weights. 
+ data: mat64.NewDense(10, 4, []float64{ + 5.1, 3.5, 1.4, 0.2, + 4.9, 3.0, 1.4, 0.2, + 4.7, 3.2, 1.3, 0.2, + 4.6, 3.1, 1.5, 0.2, + 5.0, 3.6, 1.4, 0.2, + 5.4, 3.9, 1.7, 0.4, + 4.6, 3.4, 1.4, 0.3, + 5.0, 3.4, 1.5, 0.2, + 4.4, 2.9, 1.4, 0.2, + 4.9, 3.1, 1.5, 0.1, + }), + weights: []float64{1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + wantVecs: mat64.NewDense(4, 4, []float64{ + -0.6681110197952722, 0.7064764857539533, -0.14026590216895132, -0.18666578956412125, + -0.7166344774801547, -0.6427036135482664, -0.135650285905254, 0.23444848208629923, + -0.164411275166307, 0.11898477441068218, 0.9136367900709548, 0.35224901970831746, + -0.11415613655453069, -0.2714141920887426, 0.35664028439226514, -0.8866286823515034, + }), + wantVars: []float64{0.1665786313282786, 0.02065509475412993, 0.007944620317765855, 0.0019327647109368329}, + epsilon: 1e-12, + }, + { // Truncated iris data non-unitary weights. + data: mat64.NewDense(10, 4, []float64{ + 5.1, 3.5, 1.4, 0.2, + 4.9, 3.0, 1.4, 0.2, + 4.7, 3.2, 1.3, 0.2, + 4.6, 3.1, 1.5, 0.2, + 5.0, 3.6, 1.4, 0.2, + 5.4, 3.9, 1.7, 0.4, + 4.6, 3.4, 1.4, 0.3, + 5.0, 3.4, 1.5, 0.2, + 4.4, 2.9, 1.4, 0.2, + 4.9, 3.1, 1.5, 0.1, + }), + weights: []float64{2, 3, 1, 1, 1, 1, 1, 1, 1, 2}, + wantVecs: mat64.NewDense(4, 4, []float64{ + -0.618936145422414, 0.763069301531647, 0.124857741232537, 0.138035623677211, + -0.763958271606519, -0.603881770702898, 0.118267155321333, -0.194184052457746, + -0.143552119754944, 0.090014599564871, -0.942209377020044, -0.289018426115945, + -0.112599271966947, -0.212012782487076, -0.287515067921680, 0.927203898682805, + }), + wantVars: []float64{0.129621985550623, 0.022417487771598, 0.006454461065715, 0.002495076601075}, + epsilon: 1e-12, + }, + } { + var pc PC + var vecs *mat64.Dense + var vars []float64 + for j := 0; j < 2; j++ { + ok := pc.PrincipalComponents(test.data, test.weights) + vecs = pc.Vectors(vecs) + vars = pc.Vars(vars) + if !ok { + t.Errorf("unexpected SVD failure for test %d use %d", i, j) + continue tests + } + if !mat64.EqualApprox(vecs, test.wantVecs, test.epsilon) { + t.Errorf("%d use %d: unexpected PCA result got:\n%v\nwant:\n%v", + i, j, mat64.Formatted(vecs), mat64.Formatted(test.wantVecs)) + } + if !approxEqual(vars, test.wantVars, test.epsilon) { + t.Errorf("%d use %d: unexpected variance result got:%v, want:%v", + i, j, vars, test.wantVars) + } + } + } +} + +func approxEqual(a, b []float64, epsilon float64) bool { + if len(a) != len(b) { + return false + } + for i, v := range a { + if !floats.EqualWithinAbsOrRel(v, b[i], epsilon, epsilon) { + return false + } + } + return true +} diff --git a/stat/roc.go b/stat/roc.go new file mode 100644 index 00000000..73584b61 --- /dev/null +++ b/stat/roc.go @@ -0,0 +1,121 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package stat + +import "sort" + +// ROC returns paired false positive rate (FPR) and true positive rate +// (TPR) values corresponding to n cutoffs spanning the relative +// (or receiver) operator characteristic (ROC) curve obtained when y is +// treated as a binary classifier for classes with weights. +// +// Cutoffs are equally spaced from eps less than the minimum value of y +// to the maximum value of y, including both endpoints meaning that the +// resulting ROC curve will always begin at (0,0) and end at (1,1). +// +// The input y must be sorted, and SortWeightedLabeled can be used in +// order to sort y together with classes and weights. 
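+// For example (a sketch of the call sequence):
+//  SortWeightedLabeled(y, classes, weights)
+//  tpr, fpr := ROC(0, y, classes, weights)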
+// +// For a given cutoff value, observations corresponding to entries in y +// greater than the cutoff value are classified as false, while those +// below (or equal to) the cutoff value are classified as true. These +// assigned class labels are compared with the true values in the classes +// slice and used to calculate the FPR and TPR. +// +// If weights is nil, all weights are treated as 1. +// +// When n is zero all possible cutoffs are calculated, resulting +// in fpr and tpr having length one greater than the number of unique +// values in y. When n is greater than one fpr and tpr will be returned +// with length n. ROC will panic if n is equal to one or less than 0. +// +// More details about ROC curves are available at +// https://en.wikipedia.org/wiki/Receiver_operating_characteristic +func ROC(n int, y []float64, classes []bool, weights []float64) (tpr, fpr []float64) { + if len(y) != len(classes) { + panic("stat: slice length mismatch") + } + if weights != nil && len(y) != len(weights) { + panic("stat: slice length mismatch") + } + if !sort.Float64sAreSorted(y) { + panic("stat: input must be sorted") + } + + var incWidth, tol float64 + if n == 0 { + if len(y) == 0 { + return nil, nil + } + tpr = make([]float64, len(y)+1) + fpr = make([]float64, len(y)+1) + } else { + if n < 2 { + panic("stat: cannot calculate fewer than 2 points on a ROC curve") + } + if len(y) == 0 { + return nil, nil + } + tpr = make([]float64, n) + fpr = make([]float64, n) + incWidth = (y[len(y)-1] - y[0]) / float64(n-1) + tol = y[0] + incWidth + if incWidth == 0 { + tpr[n-1] = 1 + fpr[n-1] = 1 + return + } + } + + var bin int = 1 // the initial bin is known to have 0 fpr and 0 tpr + var nPos, nNeg float64 + for i, u := range classes { + var posWeight, negWeight float64 = 0, 1 + if weights != nil { + negWeight = weights[i] + } + if u { + posWeight, negWeight = negWeight, posWeight + } + nPos += posWeight + nNeg += negWeight + tpr[bin] += posWeight + fpr[bin] += negWeight + + // Assess if the bin needs to be updated. If n is zero, + // the bin is always updated, unless consecutive y values + // are equal. Otherwise, the bin must be updated until it + // matches the next y value (skipping empty bins). + if n == 0 { + if i != (len(y)-1) && y[i] != y[i+1] { + bin++ + tpr[bin] = tpr[bin-1] + fpr[bin] = fpr[bin-1] + } + } else { + for i != (len(y)-1) && y[i+1] > tol { + tol += incWidth + bin++ + tpr[bin] = tpr[bin-1] + fpr[bin] = fpr[bin-1] + } + } + } + if n == 0 { + tpr = tpr[:(bin + 1)] + fpr = fpr[:(bin + 1)] + } + + invNeg := 1 / nNeg + invPos := 1 / nPos + for i := range tpr { + tpr[i] *= invPos + fpr[i] *= invNeg + } + tpr[len(tpr)-1] = 1 + fpr[len(fpr)-1] = 1 + + return tpr, fpr +} diff --git a/stat/roc_example_test.go b/stat/roc_example_test.go new file mode 100644 index 00000000..41cc3634 --- /dev/null +++ b/stat/roc_example_test.go @@ -0,0 +1,56 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+
+package stat_test
+
+import (
+	"fmt"
+
+	"github.com/gonum/integrate"
+	"github.com/gonum/stat"
+)
+
+func ExampleROC_weighted() {
+	y := []float64{0, 3, 5, 6, 7.5, 8}
+	classes := []bool{true, false, true, false, false, false}
+	weights := []float64{4, 1, 6, 3, 2, 2}
+
+	tpr, fpr := stat.ROC(0, y, classes, weights)
+	fmt.Printf("true positive rate: %v\n", tpr)
+	fmt.Printf("false positive rate: %v\n", fpr)
+
+	// Output:
+	// true positive rate: [0 0.4 0.4 1 1 1 1]
+	// false positive rate: [0 0 0.125 0.125 0.5 0.75 1]
+}
+
+func ExampleROC_unweighted() {
+	y := []float64{0, 3, 5, 6, 7.5, 8}
+	classes := []bool{true, false, true, false, false, false}
+
+	tpr, fpr := stat.ROC(0, y, classes, nil)
+	fmt.Printf("true positive rate: %v\n", tpr)
+	fmt.Printf("false positive rate: %v\n", fpr)
+
+	// Output:
+	// true positive rate: [0 0.5 0.5 1 1 1 1]
+	// false positive rate: [0 0 0.25 0.25 0.5 0.75 1]
+}
+
+func ExampleROC_aUC() {
+	y := []float64{0.1, 0.35, 0.4, 0.8}
+	classes := []bool{true, false, true, false}
+
+	tpr, fpr := stat.ROC(0, y, classes, nil)
+	// Compute the area under the ROC curve (AUC).
+	auc := integrate.Trapezoidal(fpr, tpr)
+	fmt.Printf("true positive rate: %v\n", tpr)
+	fmt.Printf("false positive rate: %v\n", fpr)
+	fmt.Printf("auc: %v\n", auc)
+
+	// Output:
+	// true positive rate: [0 0.5 0.5 1 1]
+	// false positive rate: [0 0 0.5 0.5 1]
+	// auc: 0.75
+}
diff --git a/stat/roc_test.go b/stat/roc_test.go
new file mode 100644
index 00000000..60d22c5b
--- /dev/null
+++ b/stat/roc_test.go
@@ -0,0 +1,178 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stat
+
+import (
+	"testing"
+
+	"github.com/gonum/floats"
+)
+
+// Test cases were calculated manually.
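+// Each case lists the sorted scores y, their true labels c, optional
+// observation weights w, and the requested number of cutoffs n (0 meaning
+// all cutoffs), together with the expected TPR and FPR curves.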
+func TestROC(t *testing.T) { + cases := []struct { + y []float64 + c []bool + w []float64 + n int + wantTPR []float64 + wantFPR []float64 + }{ + { + y: []float64{0, 3, 5, 6, 7.5, 8}, + c: []bool{true, false, true, false, false, false}, + wantTPR: []float64{0, 0.5, 0.5, 1, 1, 1, 1}, + wantFPR: []float64{0, 0, 0.25, 0.25, 0.5, 0.75, 1}, + }, + { + y: []float64{0, 3, 5, 6, 7.5, 8}, + c: []bool{true, false, true, false, false, false}, + w: []float64{4, 1, 6, 3, 2, 2}, + wantTPR: []float64{0, 0.4, 0.4, 1, 1, 1, 1}, + wantFPR: []float64{0, 0, 0.125, 0.125, 0.5, 0.75, 1}, + }, + { + y: []float64{0, 3, 5, 6, 7.5, 8}, + c: []bool{true, false, true, false, false, false}, + n: int(5), + wantTPR: []float64{0, 0.5, 0.5, 1, 1}, + wantFPR: []float64{0, 0, 0.25, 0.5, 1}, + }, + { + y: []float64{0, 3, 5, 6, 7.5, 8}, + c: []bool{true, false, true, false, false, false}, + n: int(9), + wantTPR: []float64{0, 0.5, 0.5, 0.5, 0.5, 1, 1, 1, 1}, + wantFPR: []float64{0, 0, 0, 0.25, 0.25, 0.25, 0.5, 0.5, 1}, + }, + { + y: []float64{0, 3, 5, 6, 7.5, 8}, + c: []bool{true, false, true, false, false, false}, + w: []float64{4, 1, 6, 3, 2, 2}, + n: int(5), + wantTPR: []float64{0, 0.4, 0.4, 1, 1}, + wantFPR: []float64{0, 0, 0.125, 0.5, 1}, + }, + { + y: []float64{0, 3, 5, 6, 7.5, 8}, + c: []bool{true, false, true, false, false, false}, + w: []float64{4, 1, 6, 3, 2, 2}, + n: int(9), + wantTPR: []float64{0, 0.4, 0.4, 0.4, 0.4, 1, 1, 1, 1}, + wantFPR: []float64{0, 0, 0, 0.125, 0.125, 0.125, 0.5, 0.5, 1}, + }, + { + y: []float64{0, 3, 6, 6, 6, 8}, + c: []bool{true, false, true, false, false, false}, + wantTPR: []float64{0, 0.5, 0.5, 1, 1}, + wantFPR: []float64{0, 0, 0.25, 0.75, 1}, + }, + { + y: []float64{0, 3, 6, 6, 6, 8}, + c: []bool{true, false, true, false, false, false}, + w: []float64{4, 1, 6, 3, 2, 2}, + wantTPR: []float64{0, 0.4, 0.4, 1, 1}, + wantFPR: []float64{0, 0, 0.125, 0.75, 1}, + }, + { + y: []float64{0, 3, 6, 6, 6, 8}, + c: []bool{true, false, true, false, false, false}, + n: int(5), + wantTPR: []float64{0, 0.5, 0.5, 1, 1}, + wantFPR: []float64{0, 0, 0.25, 0.75, 1}, + }, + { + y: []float64{0, 3, 6, 6, 6, 8}, + c: []bool{true, false, true, false, false, false}, + n: int(9), + wantTPR: []float64{0, 0.5, 0.5, 0.5, 0.5, 0.5, 1, 1, 1}, + wantFPR: []float64{0, 0, 0, 0.25, 0.25, 0.25, 0.75, 0.75, 1}, + }, + { + y: []float64{0, 3, 6, 6, 6, 8}, + c: []bool{true, false, true, false, false, false}, + w: []float64{4, 1, 6, 3, 2, 2}, + n: int(5), + wantTPR: []float64{0, 0.4, 0.4, 1, 1}, + wantFPR: []float64{0, 0, 0.125, 0.75, 1}, + }, + { + y: []float64{0, 3, 6, 6, 6, 8}, + c: []bool{true, false, true, false, false, false}, + w: []float64{4, 1, 6, 3, 2, 2}, + n: int(9), + wantTPR: []float64{0, 0.4, 0.4, 0.4, 0.4, 0.4, 1, 1, 1}, + wantFPR: []float64{0, 0, 0, 0.125, 0.125, 0.125, 0.75, 0.75, 1}, + }, + { + y: []float64{1, 2}, + c: []bool{true, true}, + wantTPR: []float64{0, 0.5, 1}, + wantFPR: []float64{0, 0, 1}, + }, + { + y: []float64{1, 2}, + c: []bool{true, true}, + n: int(2), + wantTPR: []float64{0, 1}, + wantFPR: []float64{0, 1}, + }, + { + y: []float64{1, 2}, + c: []bool{true, true}, + n: int(7), + wantTPR: []float64{0, 0.5, 0.5, 0.5, 0.5, 0.5, 1}, + wantFPR: []float64{0, 0, 0, 0, 0, 0, 1}, + }, + { + y: []float64{1}, + c: []bool{true}, + wantTPR: []float64{0, 1}, + wantFPR: []float64{0, 1}, + }, + { + y: []float64{1}, + c: []bool{true}, + n: int(2), + wantTPR: []float64{0, 1}, + wantFPR: []float64{0, 1}, + }, + { + y: []float64{1}, + c: []bool{false}, + wantTPR: []float64{0, 1}, + wantFPR: []float64{0, 1}, + }, + { + 
y: []float64{0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 10},
+			c:       []bool{true, false, true, true, false, false, true},
+			n:       int(5),
+			wantTPR: []float64{0, 0.75, 0.75, 0.75, 1},
+			wantFPR: []float64{0, 1, 1, 1, 1},
+		},
+		{
+			y:       []float64{},
+			c:       []bool{},
+			wantTPR: nil,
+			wantFPR: nil,
+		},
+		{
+			y:       []float64{},
+			c:       []bool{},
+			n:       int(5),
+			wantTPR: nil,
+			wantFPR: nil,
+		},
+	}
+	for i, test := range cases {
+		gotTPR, gotFPR := ROC(test.n, test.y, test.c, test.w)
+		if !floats.Same(gotTPR, test.wantTPR) {
+			t.Errorf("%d: unexpected TPR got:%v want:%v", i, gotTPR, test.wantTPR)
+		}
+		if !floats.Same(gotFPR, test.wantFPR) {
+			t.Errorf("%d: unexpected FPR got:%v want:%v", i, gotFPR, test.wantFPR)
+		}
+	}
+}
diff --git a/stat/samplemv/metropolishastings.go b/stat/samplemv/metropolishastings.go
new file mode 100644
index 00000000..991aaa40
--- /dev/null
+++ b/stat/samplemv/metropolishastings.go
@@ -0,0 +1,221 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package samplemv
+
+import (
+	"math"
+	"math/rand"
+
+	"github.com/gonum/matrix/mat64"
+	"github.com/gonum/stat/distmv"
+)
+
+var _ Sampler = MetropolisHastingser{}
+
+// MHProposal defines a proposal distribution for Metropolis Hastings.
+type MHProposal interface {
+	// ConditionalLogProb returns the probability of the first argument
+	// conditioned on being at the second argument.
+	//  p(x|y)
+	// ConditionalLogProb panics if the input slices are not the same length.
+	ConditionalLogProb(x, y []float64) (prob float64)
+
+	// ConditionalRand generates a new random location conditioned on being
+	// at the location y. If the first argument is nil, a new slice is
+	// allocated and returned. Otherwise, the random location is stored
+	// in-place into the first argument, and ConditionalRand will panic if
+	// the input slice lengths differ.
+	ConditionalRand(x, y []float64) []float64
+}
+
+// MetropolisHastingser is a wrapper around the MetropolisHastings sampling type.
+//
+// BurnIn sets the number of samples to discard before keeping the first sample.
+// A properly set BurnIn rate will decorrelate the sampling chain from the initial
+// location. The proper BurnIn value will depend on the mixing time of the
+// Markov chain defined by the target and proposal distributions.
+//
+// Rate sets the number of samples to discard in between each kept sample. A
+// higher rate will better approximate independently and identically distributed
+// samples, while a lower rate will keep more information (at the cost of
+// higher correlation between samples). If Rate is 0 it is defaulted to 1.
+//
+// The initial value is NOT changed during calls to Sample.
+type MetropolisHastingser struct {
+	Initial  []float64
+	Target   distmv.LogProber
+	Proposal MHProposal
+	Src      *rand.Rand
+
+	BurnIn int
+	Rate   int
+}
+
+// Sample generates rows(batch) samples using the Metropolis Hastings sample
+// generation method. The initial location is NOT updated during the call to Sample.
+//
+// The number of columns in batch must equal len(m.Initial), otherwise Sample
+// will panic.
+func (m MetropolisHastingser) Sample(batch *mat64.Dense) {
+	burnIn := m.BurnIn
+	rate := m.Rate
+	if rate == 0 {
+		rate = 1
+	}
+	r, c := batch.Dims()
+	if len(m.Initial) != c {
+		panic("metropolishastings: length mismatch")
+	}
+
+	// Use the optimal size for the temporary memory to allow the fewest calls
+	// to MetropolisHastings. tmp aliases batch whenever batch has at least
+	// rate rows; in that case a separate rate-sized buffer is allocated after
+	// burn-in so that generating the discarded rate samples never overwrites
+	// rows of batch that have already been kept.
+	tmp := batch
+	if rate > r {
+		tmp = mat64.NewDense(rate, c, nil)
+	}
+	rTmp, _ := tmp.Dims()
+
+	// Perform burn-in.
+	remaining := burnIn
+	initial := make([]float64, c)
+	copy(initial, m.Initial)
+	for remaining != 0 {
+		newSamp := min(rTmp, remaining)
+		MetropolisHastings(tmp.View(0, 0, newSamp, c).(*mat64.Dense), initial, m.Target, m.Proposal, m.Src)
+		copy(initial, tmp.RawRowView(newSamp-1))
+		remaining -= newSamp
+	}
+
+	if rate == 1 {
+		MetropolisHastings(batch, initial, m.Target, m.Proposal, m.Src)
+		return
+	}
+
+	if rTmp <= r {
+		tmp = mat64.NewDense(rate, c, nil)
+	}
+
+	// Take a single sample from the chain.
+	MetropolisHastings(batch.View(0, 0, 1, c).(*mat64.Dense), initial, m.Target, m.Proposal, m.Src)
+
+	copy(initial, batch.RawRowView(0))
+	// For all of the other samples, first generate Rate samples and then actually
+	// accept the last one.
+	for i := 1; i < r; i++ {
+		MetropolisHastings(tmp, initial, m.Target, m.Proposal, m.Src)
+		v := tmp.RawRowView(rate - 1)
+		batch.SetRow(i, v)
+		copy(initial, v)
+	}
+}
+
+// MetropolisHastings generates rows(batch) samples using the Metropolis Hastings
+// algorithm (http://en.wikipedia.org/wiki/Metropolis%E2%80%93Hastings_algorithm),
+// with the given target and proposal distributions, starting at the initial location
+// and storing the results in-place into batch. If src != nil, it will be used to
+// generate random numbers, otherwise rand.Float64 will be used.
+//
+// Metropolis-Hastings is a Markov-chain Monte Carlo algorithm that generates
+// samples according to the distribution specified by target by using the Markov
+// chain implicitly defined by the proposal distribution. At each
+// iteration, a proposal point is generated randomly from the current location.
+// This proposal point is accepted with probability
+//  p = min(1, (target(new) * proposal(current|new)) / (target(current) * proposal(new|current)))
+// If the new location is accepted, it is stored into batch and becomes the
+// new current location. If it is rejected, the current location remains and
+// is stored into batch. Thus, a location is stored into batch at every iteration.
+//
+// The samples in Metropolis Hastings are correlated with one another through the
+// Markov chain. As a result, the initial value can have a significant influence
+// on the early samples, and so, typically, the first samples generated by the chain
+// are ignored. This is known as "burn-in", and can be accomplished with slicing.
+// The best choice for burn-in length will depend on the sampling and target
+// distributions.
+//
+// Many choose to have a sampling "rate" where a number of samples
+// are ignored in between each kept sample. This helps decorrelate
+// the samples from one another, but also reduces the number of available samples.
+// A sampling rate can be implemented with successive calls to MetropolisHastings.
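+//
+// For example, to discard 100 burn-in samples and keep the next 1000
+// (a sketch; target, proposal, initial and dim are assumed to be defined):
+//  batch := mat64.NewDense(1100, dim, nil)
+//  MetropolisHastings(batch, initial, target, proposal, nil)
+//  kept := batch.View(100, 0, 1000, dim)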
+
+// MetropolisHastings generates rows(batch) samples using the Metropolis-Hastings
+// algorithm (http://en.wikipedia.org/wiki/Metropolis%E2%80%93Hastings_algorithm),
+// with the given target and proposal distributions, starting at the initial location
+// and storing the results in-place into batch. If src != nil, it will be used to generate random
+// numbers, otherwise rand.Float64 will be used.
+//
+// Metropolis-Hastings is a Markov-chain Monte Carlo algorithm that generates
+// samples according to the distribution specified by target by using the Markov
+// chain implicitly defined by the proposal distribution. At each
+// iteration, a proposal point is generated randomly from the current location.
+// This proposal point is accepted with probability
+//  p = min(1, (target(new) * proposal(current|new)) / (target(current) * proposal(new|current)))
+// If the new location is accepted, it is stored into batch and becomes the
+// new current location. If it is rejected, the current location remains and
+// is stored into batch. Thus, a location is stored into batch at every iteration.
+//
+// The samples in Metropolis-Hastings are correlated with one another through the
+// Markov chain. As a result, the initial value can have a significant influence
+// on the early samples, and so, typically, the first samples generated by the chain
+// are ignored. This is known as "burn-in", and can be accomplished with slicing.
+// The best choice for burn-in length will depend on the sampling and target
+// distributions.
+//
+// Many choose to have a sampling "rate" where a number of samples
+// are ignored in between each kept sample. This helps decorrelate
+// the samples from one another, but also reduces the number of available samples.
+// A sampling rate can be implemented with successive calls to MetropolisHastings.
+func MetropolisHastings(batch *mat64.Dense, initial []float64, target distmv.LogProber, proposal MHProposal, src *rand.Rand) {
+	f64 := rand.Float64
+	if src != nil {
+		f64 = src.Float64
+	}
+	if len(initial) == 0 {
+		panic("metropolishastings: zero length initial")
+	}
+	r, _ := batch.Dims()
+	current := make([]float64, len(initial))
+	copy(current, initial)
+	proposed := make([]float64, len(initial))
+	currentLogProb := target.LogProb(initial)
+	for i := 0; i < r; i++ {
+		proposal.ConditionalRand(proposed, current)
+		proposedLogProb := target.LogProb(proposed)
+		probTo := proposal.ConditionalLogProb(proposed, current)
+		probBack := proposal.ConditionalLogProb(current, proposed)
+
+		accept := math.Exp(proposedLogProb + probBack - probTo - currentLogProb)
+		if accept > f64() {
+			copy(current, proposed)
+			currentLogProb = proposedLogProb
+		}
+		batch.SetRow(i, current)
+	}
+}
+
+// ProposalNormal is a sampling distribution for Metropolis-Hastings. It has a
+// fixed covariance matrix and changes the mean based on the current sampling
+// location.
+type ProposalNormal struct {
+	normal *distmv.Normal
+}
+
+// NewProposalNormal constructs a new ProposalNormal for use as a proposal
+// distribution for Metropolis-Hastings. ProposalNormal is a multivariate normal
+// distribution (implemented by distmv.Normal) where the covariance matrix is fixed
+// and the mean of the distribution changes.
+//
+// NewProposalNormal returns {nil, false} if the covariance matrix is not positive-definite.
+func NewProposalNormal(sigma *mat64.SymDense, src *rand.Rand) (*ProposalNormal, bool) {
+	mu := make([]float64, sigma.Symmetric())
+	normal, ok := distmv.NewNormal(mu, sigma, src)
+	if !ok {
+		return nil, false
+	}
+	p := &ProposalNormal{
+		normal: normal,
+	}
+	return p, true
+}
+
+// ConditionalLogProb returns the probability of the first argument conditioned on
+// being at the second argument.
+//  p(x|y)
+// ConditionalLogProb panics if the input slices are not the same length or
+// are not equal to the dimension of the covariance matrix.
+func (p *ProposalNormal) ConditionalLogProb(x, y []float64) (prob float64) {
+	// Either SetMean or LogProb will panic if the slice lengths are incorrect.
+	p.normal.SetMean(y)
+	return p.normal.LogProb(x)
+}
+
+// ConditionalRand generates a new random location conditioned on being at the
+// location y. If the first argument is nil, a new slice is allocated and
+// returned. Otherwise, the random location is stored in-place into the first
+// argument, and ConditionalRand will panic if the input slice lengths differ or
+// if they are not equal to the dimension of the covariance matrix.
+func (p *ProposalNormal) ConditionalRand(x, y []float64) []float64 {
+	if x == nil {
+		x = make([]float64, p.normal.Dim())
+	}
+	if len(x) != len(y) {
+		panic(badLengthMismatch)
+	}
+	p.normal.SetMean(y)
+	p.normal.Rand(x)
+	return x
+}
diff --git a/stat/samplemv/sample_test.go b/stat/samplemv/sample_test.go
new file mode 100644
index 00000000..228f495d
--- /dev/null
+++ b/stat/samplemv/sample_test.go
@@ -0,0 +1,280 @@
+// Copyright ©2016 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+package samplemv + +import ( + "fmt" + "math" + "math/rand" + "testing" + + "github.com/gonum/floats" + "github.com/gonum/matrix/mat64" + "github.com/gonum/stat" + "github.com/gonum/stat/distmv" +) + +type lhDist interface { + Quantile(x, p []float64) []float64 + CDF(p, x []float64) []float64 + Dim() int +} + +func TestLatinHypercube(t *testing.T) { + for _, nSamples := range []int{1, 2, 5, 10, 20} { + for _, dist := range []lhDist{ + distmv.NewUniform([]distmv.Bound{{0, 3}}, nil), + distmv.NewUniform([]distmv.Bound{{0, 3}, {-1, 5}, {-4, -1}}, nil), + } { + dim := dist.Dim() + batch := mat64.NewDense(nSamples, dim, nil) + LatinHypercube(batch, dist, nil) + // Latin hypercube should have one entry per hyperrow. + present := make([][]bool, nSamples) + for i := range present { + present[i] = make([]bool, dim) + } + cdf := make([]float64, dim) + for i := 0; i < nSamples; i++ { + dist.CDF(cdf, batch.RawRowView(i)) + for j := 0; j < dim; j++ { + p := cdf[j] + quadrant := int(math.Floor(p * float64(nSamples))) + present[quadrant][j] = true + } + } + allPresent := true + for i := 0; i < nSamples; i++ { + for j := 0; j < dim; j++ { + if present[i][j] == false { + allPresent = false + } + } + } + if !allPresent { + t.Errorf("All quadrants not present") + } + } + } +} + +func TestImportance(t *testing.T) { + // Test by finding the expected value of a multi-variate normal. + dim := 3 + target, ok := randomNormal(dim) + if !ok { + t.Fatal("bad test, sigma not pos def") + } + + muImp := make([]float64, dim) + sigmaImp := mat64.NewSymDense(dim, nil) + for i := 0; i < dim; i++ { + sigmaImp.SetSym(i, i, 3) + } + proposal, ok := distmv.NewNormal(muImp, sigmaImp, nil) + if !ok { + t.Fatal("bad test, sigma not pos def") + } + + nSamples := 100000 + batch := mat64.NewDense(nSamples, dim, nil) + weights := make([]float64, nSamples) + Importance(batch, weights, target, proposal) + + compareNormal(t, target, batch, weights) +} + +func TestRejection(t *testing.T) { + // Test by finding the expected value of a uniform. + dim := 3 + bounds := make([]distmv.Bound, dim) + for i := 0; i < dim; i++ { + min := rand.NormFloat64() + max := rand.NormFloat64() + if min > max { + min, max = max, min + } + bounds[i].Min = min + bounds[i].Max = max + } + target := distmv.NewUniform(bounds, nil) + mu := target.Mean(nil) + + muImp := make([]float64, dim) + sigmaImp := mat64.NewSymDense(dim, nil) + for i := 0; i < dim; i++ { + sigmaImp.SetSym(i, i, 6) + } + proposal, ok := distmv.NewNormal(muImp, sigmaImp, nil) + if !ok { + t.Fatal("bad test, sigma not pos def") + } + + nSamples := 1000 + batch := mat64.NewDense(nSamples, dim, nil) + weights := make([]float64, nSamples) + _, ok = Rejection(batch, target, proposal, 1000, nil) + if !ok { + t.Error("Bad test, nan samples") + } + + for i := 0; i < dim; i++ { + col := mat64.Col(nil, i, batch) + ev := stat.Mean(col, weights) + if math.Abs(ev-mu[i]) > 1e-2 { + t.Errorf("Mean mismatch: Want %v, got %v", mu[i], ev) + } + } +} + +func TestMetropolisHastings(t *testing.T) { + // Test by finding the expected value of a normal distribution. 
+ dim := 3 + target, ok := randomNormal(dim) + if !ok { + t.Fatal("bad test, sigma not pos def") + } + + sigmaImp := mat64.NewSymDense(dim, nil) + for i := 0; i < dim; i++ { + sigmaImp.SetSym(i, i, 0.25) + } + proposal, ok := NewProposalNormal(sigmaImp, nil) + if !ok { + t.Fatal("bad test, sigma not pos def") + } + + nSamples := 1000000 + burnin := 5000 + batch := mat64.NewDense(nSamples, dim, nil) + initial := make([]float64, dim) + MetropolisHastings(batch, initial, target, proposal, nil) + batch = batch.View(burnin, 0, nSamples-burnin, dim).(*mat64.Dense) + + compareNormal(t, target, batch, nil) +} + +// randomNormal constructs a random Normal distribution. +func randomNormal(dim int) (*distmv.Normal, bool) { + data := make([]float64, dim*dim) + for i := range data { + data[i] = rand.Float64() + } + a := mat64.NewDense(dim, dim, data) + var sigma mat64.SymDense + sigma.SymOuterK(1, a) + mu := make([]float64, dim) + for i := range mu { + mu[i] = rand.NormFloat64() + } + return distmv.NewNormal(mu, &sigma, nil) +} + +func compareNormal(t *testing.T, want *distmv.Normal, batch *mat64.Dense, weights []float64) { + dim := want.Dim() + mu := want.Mean(nil) + sigma := want.CovarianceMatrix(nil) + n, _ := batch.Dims() + if weights == nil { + weights = make([]float64, n) + for i := range weights { + weights[i] = 1 + } + } + for i := 0; i < dim; i++ { + col := mat64.Col(nil, i, batch) + ev := stat.Mean(col, weights) + if math.Abs(ev-mu[i]) > 1e-2 { + t.Errorf("Mean mismatch: Want %v, got %v", mu[i], ev) + } + } + + cov := stat.CovarianceMatrix(nil, batch, weights) + if !mat64.EqualApprox(cov, sigma, 1.5e-1) { + t.Errorf("Covariance matrix mismatch") + } +} + +func TestMetropolisHastingser(t *testing.T) { + for seed, test := range []struct { + dim, burnin, rate, samples int + }{ + {3, 10, 1, 1}, + {3, 10, 2, 1}, + {3, 10, 1, 2}, + {3, 10, 3, 2}, + {3, 10, 7, 4}, + {3, 10, 7, 4}, + + {3, 11, 51, 103}, + {3, 11, 103, 51}, + {3, 51, 11, 103}, + {3, 51, 103, 11}, + {3, 103, 11, 51}, + {3, 103, 51, 11}, + } { + dim := test.dim + + initial := make([]float64, dim) + target, ok := randomNormal(dim) + if !ok { + t.Fatal("bad test, sigma not pos def") + } + + sigmaImp := mat64.NewSymDense(dim, nil) + for i := 0; i < dim; i++ { + sigmaImp.SetSym(i, i, 0.25) + } + proposal, ok := NewProposalNormal(sigmaImp, nil) + if !ok { + t.Fatal("bad test, sigma not pos def") + } + + // Test the Metropolis Hastingser by generating all the samples, then generating + // the same samples with a burnin and rate. 
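+		// With the random source reseeded identically for both runs, kept
+		// sample i of the thinned chain must equal sample burnin+i*rate of
+		// the full chain; the comparison loop below checks exactly that
+		// correspondence.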
+ rand.Seed(int64(seed)) + mh := MetropolisHastingser{ + Initial: initial, + Target: target, + Proposal: proposal, + Src: nil, + BurnIn: 0, + Rate: 0, + } + samples := test.samples + burnin := test.burnin + rate := test.rate + fullBatch := mat64.NewDense(1+burnin+rate*(samples-1), dim, nil) + mh.Sample(fullBatch) + mh = MetropolisHastingser{ + Initial: initial, + Target: target, + Proposal: proposal, + Src: nil, + BurnIn: burnin, + Rate: rate, + } + rand.Seed(int64(seed)) + batch := mat64.NewDense(samples, dim, nil) + mh.Sample(batch) + + same := true + count := burnin + for i := 0; i < samples; i++ { + if !floats.Equal(batch.RawRowView(i), fullBatch.RawRowView(count)) { + fmt.Println("sample ", i, "is different") + same = false + break + } + count += rate + } + + if !same { + fmt.Printf("%v\n", mat64.Formatted(batch)) + fmt.Printf("%v\n", mat64.Formatted(fullBatch)) + + t.Errorf("sampling mismatch: dim = %v, burnin = %v, rate = %v, samples = %v", dim, burnin, rate, samples) + } + } +} diff --git a/stat/samplemv/samplemv.go b/stat/samplemv/samplemv.go new file mode 100644 index 00000000..a4529bed --- /dev/null +++ b/stat/samplemv/samplemv.go @@ -0,0 +1,282 @@ +// Copyright ©2016 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package samplemv implements advanced sampling routines from explicit and implicit +// probability distributions. +// +// Each sampling routine is implemented as a stateless function with a +// complementary wrapper type. The wrapper types allow the sampling routines +// to implement interfaces. +package samplemv + +import ( + "errors" + "math" + "math/rand" + + "github.com/gonum/matrix/mat64" + "github.com/gonum/stat/distmv" +) + +var ( + badLengthMismatch = "samplemv: slice length mismatch" +) + +var ( + _ Sampler = LatinHypercuber{} + _ Sampler = (*Rejectioner)(nil) + _ Sampler = IIDer{} + + _ WeightedSampler = SampleUniformWeighted{} + _ WeightedSampler = Importancer{} +) + +func min(a, b int) int { + if a < b { + return a + } + return b +} + +// Sampler generates a batch of samples according to the rule specified by the +// implementing type. The number of samples generated is equal to rows(batch), +// and the samples are stored in-place into the input. +type Sampler interface { + Sample(batch *mat64.Dense) +} + +// WeightedSampler generates a batch of samples and their relative weights +// according to the rule specified by the implementing type. The number of samples +// generated is equal to rows(batch), and the samples and weights +// are stored in-place into the inputs. The length of weights must equal +// rows(batch), otherwise SampleWeighted will panic. +type WeightedSampler interface { + SampleWeighted(batch *mat64.Dense, weights []float64) +} + +// SampleUniformWeighted wraps a Sampler type to create a WeightedSampler where all +// weights are equal. +type SampleUniformWeighted struct { + Sampler +} + +// SampleWeighted generates rows(batch) samples from the embedded Sampler type +// and sets all of the weights equal to 1. If rows(batch) and len(weights) +// of weights are not equal, SampleWeighted will panic. +func (w SampleUniformWeighted) SampleWeighted(batch *mat64.Dense, weights []float64) { + r, _ := batch.Dims() + if r != len(weights) { + panic(badLengthMismatch) + } + w.Sample(batch) + for i := range weights { + weights[i] = 1 + } +} + +// LatinHypercuber is a wrapper around the LatinHypercube sampling generation +// method. 
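+//
+// A minimal usage sketch, where q is any distmv.Quantiler and n and dim are
+// the desired number of samples and the dimension of q:
+//  batch := mat64.NewDense(n, dim, nil)
+//  LatinHypercuber{Q: q, Src: nil}.Sample(batch)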
+type LatinHypercuber struct { + Q distmv.Quantiler + Src *rand.Rand +} + +// Sample generates rows(batch) samples using the LatinHypercube generation +// procedure. +func (l LatinHypercuber) Sample(batch *mat64.Dense) { + LatinHypercube(batch, l.Q, l.Src) +} + +// LatinHypercube generates rows(batch) samples using Latin hypercube sampling +// from the given distribution. If src is not nil, it will be used to generate +// random numbers, otherwise rand.Float64 will be used. +// +// Latin hypercube sampling divides the cumulative distribution function into equally +// spaced bins and guarantees that one sample is generated per bin. Within each bin, +// the location is randomly sampled. The distmv.NewUnitUniform function can be used +// for easy sampling from the unit hypercube. +func LatinHypercube(batch *mat64.Dense, q distmv.Quantiler, src *rand.Rand) { + r, c := batch.Dims() + var f64 func() float64 + var perm func(int) []int + if src != nil { + f64 = src.Float64 + perm = src.Perm + } else { + f64 = rand.Float64 + perm = rand.Perm + } + r64 := float64(r) + for i := 0; i < c; i++ { + p := perm(r) + for j := 0; j < r; j++ { + var v float64 + v = f64() + v = v/r64 + float64(j)/r64 + batch.Set(p[j], i, v) + } + } + p := make([]float64, c) + for i := 0; i < r; i++ { + copy(p, batch.RawRowView(i)) + q.Quantile(batch.RawRowView(i), p) + } +} + +// Importancer is a wrapper around the Importance sampling generation method. +type Importancer struct { + Target distmv.LogProber + Proposal distmv.RandLogProber +} + +// SampleWeighted generates rows(batch) samples using the Importance sampling +// generation procedure. +func (l Importancer) SampleWeighted(batch *mat64.Dense, weights []float64) { + Importance(batch, weights, l.Target, l.Proposal) +} + +// Importance sampling generates rows(batch) samples from the proposal distribution, +// and stores the locations and importance sampling weights in place. +// +// Importance sampling is a variance reduction technique where samples are +// generated from a proposal distribution, q(x), instead of the target distribution +// p(x). This allows relatively unlikely samples in p(x) to be generated more frequently. +// +// The importance sampling weight at x is given by p(x)/q(x). To reduce variance, +// a good proposal distribution will bound this sampling weight. This implies the +// support of q(x) should be at least as broad as p(x), and q(x) should be "fatter tailed" +// than p(x). +// +// If weights is nil, the weights are not stored. The length of weights must equal +// the length of batch, otherwise Importance will panic. +func Importance(batch *mat64.Dense, weights []float64, target distmv.LogProber, proposal distmv.RandLogProber) { + r, _ := batch.Dims() + if r != len(weights) { + panic(badLengthMismatch) + } + for i := 0; i < r; i++ { + v := batch.RawRowView(i) + proposal.Rand(v) + weights[i] = math.Exp(target.LogProb(v) - proposal.LogProb(v)) + } +} + +// ErrRejection is returned when the constant in Rejection is not sufficiently high. +var ErrRejection = errors.New("rejection: acceptance ratio above 1") + +// Rejectioner is a wrapper around the Rejection sampling generation procedure. +// If the rejection sampling fails during the call to Sample, all samples will +// be set to math.NaN() and a call to Err will return a non-nil value. 
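+//
+// A minimal usage sketch, where target and proposal are values satisfying
+// distmv.LogProber and distmv.RandLogProber, and the constant C is an
+// assumed bound on target(x)/proposal(x):
+//  r := &Rejectioner{C: 100, Target: target, Proposal: proposal}
+//  r.Sample(batch)
+//  if r.Err() != nil {
+//  	// C was too low; batch has been filled with NaN.
+//  }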
+type Rejectioner struct {
+	C        float64
+	Target   distmv.LogProber
+	Proposal distmv.RandLogProber
+	Src      *rand.Rand
+
+	err      error
+	proposed int
+}
+
+// Err returns nil if the most recent call to sample was successful, and returns
+// ErrRejection if it was not.
+func (r *Rejectioner) Err() error {
+	return r.err
+}
+
+// Proposed returns the number of samples proposed during the most recent call to
+// Sample.
+func (r *Rejectioner) Proposed() int {
+	return r.proposed
+}
+
+// Sample generates rows(batch) samples using the Rejection sampling generation procedure.
+// Rejection sampling may fail if the constant is insufficiently high, as described
+// in the function comment for Rejection. If the generation fails, the samples
+// are set to math.NaN(), and a call to Err will return a non-nil value.
+func (r *Rejectioner) Sample(batch *mat64.Dense) {
+	r.err = nil
+	r.proposed = 0
+	proposed, ok := Rejection(batch, r.Target, r.Proposal, r.C, r.Src)
+	if !ok {
+		r.err = ErrRejection
+	}
+	r.proposed = proposed
+}
+
+// Rejection generates rows(batch) samples using the rejection sampling algorithm and
+// stores them in-place into batch.
+// Sampling continues until batch is filled. Rejection returns the total number of proposed
+// locations and a boolean indicating if the rejection sampling assumption is
+// violated (see details below). If the returned boolean is false, all elements
+// of batch are set to NaN. If src != nil, it will be used to generate random
+// numbers, otherwise rand.Float64 will be used.
+//
+// Rejection sampling generates points from the target distribution by using
+// the proposal distribution. At each step of the algorithm, the proposed point
+// is accepted with probability
+//  p = target(x) / (proposal(x) * c)
+// where target(x) is the probability of the point according to the target distribution
+// and proposal(x) is the probability according to the proposal distribution.
+// The constant c must be chosen such that target(x) < proposal(x) * c for all x.
+// The expected number of proposed samples is rows(batch) * c.
+//
+// Target may return the true log-probability of the location, or it may return
+// a value that is proportional to the probability (logprob plus a constant). This is
+// useful for cases where the probability distribution is only known up to a normalization
+// constant.
+func Rejection(batch *mat64.Dense, target distmv.LogProber, proposal distmv.RandLogProber, c float64, src *rand.Rand) (nProposed int, ok bool) {
+	if c < 1 {
+		panic("rejection: acceptance constant must be at least 1")
+	}
+	f64 := rand.Float64
+	if src != nil {
+		f64 = src.Float64
+	}
+	r, dim := batch.Dims()
+	v := make([]float64, dim)
+	var idx int
+	for {
+		nProposed++
+		proposal.Rand(v)
+		qx := proposal.LogProb(v)
+		px := target.LogProb(v)
+		accept := math.Exp(px-qx) / c
+		if accept > 1 {
+			// Invalidate the whole result and return a failure.
+			for i := 0; i < r; i++ {
+				for j := 0; j < dim; j++ {
+					batch.Set(i, j, math.NaN())
+				}
+			}
+			return nProposed, false
+		}
+		if accept > f64() {
+			batch.SetRow(idx, v)
+			idx++
+			if idx == r {
+				break
+			}
+		}
+	}
+	return nProposed, true
+}
+
+// IIDer is a wrapper around the IID sample generation method.
+type IIDer struct {
+	Dist distmv.Rander
+}
+
+// Sample generates a set of identically and independently distributed samples.
+func (iid IIDer) Sample(batch *mat64.Dense) {
+	IID(batch, iid.Dist)
+}
+
+// IID generates a set of independently and identically distributed samples from
+// the input distribution.
+func IID(batch *mat64.Dense, d distmv.Rander) { + r, _ := batch.Dims() + for i := 0; i < r; i++ { + d.Rand(batch.RawRowView(i)) + } +} diff --git a/stat/sampleuv/example_burnin_test.go b/stat/sampleuv/example_burnin_test.go new file mode 100644 index 00000000..c8893c64 --- /dev/null +++ b/stat/sampleuv/example_burnin_test.go @@ -0,0 +1,39 @@ +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sampleuv + +import "github.com/gonum/stat/distuv" + +type ProposalDist struct { + Sigma float64 +} + +func (p ProposalDist) ConditionalRand(y float64) float64 { + return distuv.Normal{Mu: y, Sigma: p.Sigma}.Rand() +} + +func (p ProposalDist) ConditionalLogProb(x, y float64) float64 { + return distuv.Normal{Mu: y, Sigma: p.Sigma}.LogProb(x) +} + +func ExampleMetropolisHastings_burnin() { + n := 1000 // The number of samples to generate. + burnin := 50 // Number of samples to ignore at the start. + var initial float64 + // target is the distribution from which we would like to sample. + target := distuv.Weibull{K: 5, Lambda: 0.5} + // proposal is the proposal distribution. Here, we are choosing + // a tight Gaussian distribution around the current location. In + // typical problems, if Sigma is too small, it takes a lot of samples + // to move around the distribution. If Sigma is too large, it can be hard + // to find acceptable samples. + proposal := ProposalDist{Sigma: 0.2} + + samples := make([]float64, n+burnin) + MetropolisHastings(samples, initial, target, proposal, nil) + + // Remove the initial samples through slicing. + samples = samples[burnin:] +} diff --git a/stat/sampleuv/example_rate_test.go b/stat/sampleuv/example_rate_test.go new file mode 100644 index 00000000..790f0dd2 --- /dev/null +++ b/stat/sampleuv/example_rate_test.go @@ -0,0 +1,49 @@ +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sampleuv + +import "github.com/gonum/stat/distuv" + +func max(a, b int) int { + if a < b { + return b + } + return a +} + +func ExampleMetropolisHastings_samplingRate() { + // See Burnin example for a description of these quantities. + n := 1000 + burnin := 300 + var initial float64 + target := distuv.Weibull{K: 5, Lambda: 0.5} + proposal := ProposalDist{Sigma: 0.2} + + // Successive samples are correlated with one another through the + // Markov Chain defined by the proposal distribution. To get less + // correlated samples, one may use a sampling rate, in which only + // one sample from every few is accepted from the chain. This can + // be accomplished through a for loop. + rate := 50 + + tmp := make([]float64, max(rate, burnin)) + + // First deal with burnin. + tmp = tmp[:burnin] + MetropolisHastings(tmp, initial, target, proposal, nil) + // The final sample in tmp in the final point in the chain. + // Use it as the new initial location. + initial = tmp[len(tmp)-1] + + // Now, generate samples by using one every rate samples. 
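+	// Each MetropolisHastings call below advances the chain by rate steps
+	// and keeps only the final location, so successive entries of samples
+	// are rate steps apart in the underlying chain.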
+ tmp = tmp[:rate] + samples := make([]float64, n) + samples[0] = initial + for i := 1; i < len(samples); i++ { + MetropolisHastings(tmp, initial, target, proposal, nil) + initial = tmp[len(tmp)-1] + samples[i] = initial + } +} diff --git a/stat/sampleuv/sample.go b/stat/sampleuv/sample.go new file mode 100644 index 00000000..58b81b93 --- /dev/null +++ b/stat/sampleuv/sample.go @@ -0,0 +1,399 @@ +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package sampleuv implements advanced sampling routines from explicit and implicit +// probability distributions. +// +// Each sampling routine is implemented as a stateless function with a +// complementary wrapper type. The wrapper types allow the sampling routines +// to implement interfaces. +package sampleuv + +import ( + "errors" + "math" + "math/rand" + + "github.com/gonum/stat/distuv" +) + +var ( + badLengthMismatch = "sample: slice length mismatch" +) + +var ( + _ Sampler = LatinHypercuber{} + _ Sampler = MetropolisHastingser{} + _ Sampler = (*Rejectioner)(nil) + _ Sampler = IIDer{} + + _ WeightedSampler = SampleUniformWeighted{} + _ WeightedSampler = Importancer{} +) + +func min(a, b int) int { + if a < b { + return a + } + return b +} + +// Sampler generates a batch of samples according to the rule specified by the +// implementing type. The number of samples generated is equal to len(batch), +// and the samples are stored in-place into the input. +type Sampler interface { + Sample(batch []float64) +} + +// WeightedSampler generates a batch of samples and their relative weights +// according to the rule specified by the implementing type. The number of samples +// generated is equal to len(batch), and the samples and weights +// are stored in-place into the inputs. The length of weights must equal +// len(batch), otherwise SampleWeighted will panic. +type WeightedSampler interface { + SampleWeighted(batch, weights []float64) +} + +// SampleUniformWeighted wraps a Sampler type to create a WeightedSampler where all +// weights are equal. +type SampleUniformWeighted struct { + Sampler +} + +// SampleWeighted generates len(batch) samples from the embedded Sampler type +// and sets all of the weights equal to 1. If len(batch) and len(weights) +// are not equal, SampleWeighted will panic. +func (w SampleUniformWeighted) SampleWeighted(batch, weights []float64) { + if len(batch) != len(weights) { + panic(badLengthMismatch) + } + w.Sample(batch) + for i := range weights { + weights[i] = 1 + } +} + +// LatinHypercuber is a wrapper around the LatinHypercube sampling generation +// method. +type LatinHypercuber struct { + Q distuv.Quantiler + Src *rand.Rand +} + +// Sample generates len(batch) samples using the LatinHypercube generation +// procedure. +func (l LatinHypercuber) Sample(batch []float64) { + LatinHypercube(batch, l.Q, l.Src) +} + +// LatinHypercube generates len(batch) samples using Latin hypercube sampling +// from the given distribution. If src != nil, it will be used to generate +// random numbers, otherwise rand.Float64 will be used. +// +// Latin hypercube sampling divides the cumulative distribution function into equally +// spaced bins and guarantees that one sample is generated per bin. Within each bin, +// the location is randomly sampled. The distuv.UnitUniform variable can be used +// for easy generation from the unit interval. 
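+//
+// A minimal sketch using the distuv.UnitUniform variable mentioned above:
+//  batch := make([]float64, 100)
+//  LatinHypercube(batch, distuv.UnitUniform, nil)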
+func LatinHypercube(batch []float64, q distuv.Quantiler, src *rand.Rand) { + n := len(batch) + var perm []int + var f64 func() float64 + if src != nil { + f64 = src.Float64 + perm = src.Perm(n) + } else { + f64 = rand.Float64 + perm = rand.Perm(n) + } + for i := range batch { + v := f64()/float64(n) + float64(i)/float64(n) + batch[perm[i]] = q.Quantile(v) + } +} + +// Importancer is a wrapper around the Importance sampling generation method. +type Importancer struct { + Target distuv.LogProber + Proposal distuv.RandLogProber +} + +// Sample generates len(batch) samples using the Importance sampling generation +// procedure. +func (l Importancer) SampleWeighted(batch, weights []float64) { + Importance(batch, weights, l.Target, l.Proposal) +} + +// Importance sampling generates len(batch) samples from the proposal distribution, +// and stores the locations and importance sampling weights in place. +// +// Importance sampling is a variance reduction technique where samples are +// generated from a proposal distribution, q(x), instead of the target distribution +// p(x). This allows relatively unlikely samples in p(x) to be generated more frequently. +// +// The importance sampling weight at x is given by p(x)/q(x). To reduce variance, +// a good proposal distribution will bound this sampling weight. This implies the +// support of q(x) should be at least as broad as p(x), and q(x) should be "fatter tailed" +// than p(x). +// +// If weights is nil, the weights are not stored. The length of weights must equal +// the length of batch, otherwise Importance will panic. +func Importance(batch, weights []float64, target distuv.LogProber, proposal distuv.RandLogProber) { + if len(batch) != len(weights) { + panic(badLengthMismatch) + } + for i := range batch { + v := proposal.Rand() + batch[i] = v + weights[i] = math.Exp(target.LogProb(v) - proposal.LogProb(v)) + } +} + +// ErrRejection is returned when the constant in Rejection is not sufficiently high. +var ErrRejection = errors.New("rejection: acceptance ratio above 1") + +// Rejectioner is a wrapper around the Rejection sampling generation procedure. +// If the rejection sampling fails during the call to Sample, all samples will +// be set to math.NaN() and a call to Err will return a non-nil value. +type Rejectioner struct { + C float64 + Target distuv.LogProber + Proposal distuv.RandLogProber + Src *rand.Rand + + err error + proposed int +} + +// Err returns nil if the most recent call to sample was successful, and returns +// ErrRejection if it was not. +func (r *Rejectioner) Err() error { + return r.err +} + +// Proposed returns the number of samples proposed during the most recent call to +// Sample. +func (r *Rejectioner) Proposed() int { + return r.proposed +} + +// Sample generates len(batch) using the Rejection sampling generation procedure. +// Rejection sampling may fail if the constant is insufficiently high, as described +// in the function comment for Rejection. If the generation fails, the samples +// are set to math.NaN(), and a call to Err will return a non-nil value. +func (r *Rejectioner) Sample(batch []float64) { + r.err = nil + r.proposed = 0 + proposed, ok := Rejection(batch, r.Target, r.Proposal, r.C, r.Src) + if !ok { + r.err = ErrRejection + } + r.proposed = proposed +} + +// Rejection generates len(batch) samples using the rejection sampling algorithm +// and stores them in place into samples. Sampling continues until batch is +// filled. 
Rejection returns the total number of proposed
+// locations and a boolean indicating if the rejection sampling assumption is
+// violated (see details below). If the returned boolean is false, all elements
+// of batch are set to NaN. If src is not nil, it will be used to generate random
+// numbers, otherwise rand.Float64 will be used.
+//
+// Rejection sampling generates points from the target distribution by using
+// the proposal distribution. At each step of the algorithm, the proposed point
+// is accepted with probability
+//  p = target(x) / (proposal(x) * c)
+// where target(x) is the probability of the point according to the target distribution
+// and proposal(x) is the probability according to the proposal distribution.
+// The constant c must be chosen such that target(x) < proposal(x) * c for all x.
+// The expected number of proposed samples is len(batch) * c.
+//
+// Target may return the true log-probability of the location, or it may return
+// a value that is proportional to the probability (logprob plus a constant). This is
+// useful for cases where the probability distribution is only known up to a normalization
+// constant.
+func Rejection(batch []float64, target distuv.LogProber, proposal distuv.RandLogProber, c float64, src *rand.Rand) (nProposed int, ok bool) {
+	if c < 1 {
+		panic("rejection: acceptance constant must be at least 1")
+	}
+	f64 := rand.Float64
+	if src != nil {
+		f64 = src.Float64
+	}
+	var idx int
+	for {
+		nProposed++
+		v := proposal.Rand()
+		qx := proposal.LogProb(v)
+		px := target.LogProb(v)
+		accept := math.Exp(px-qx) / c
+		if accept > 1 {
+			// Invalidate the whole result and return a failure.
+			for i := range batch {
+				batch[i] = math.NaN()
+			}
+			return nProposed, false
+		}
+		if accept > f64() {
+			batch[idx] = v
+			idx++
+			if idx == len(batch) {
+				break
+			}
+		}
+	}
+	return nProposed, true
+}
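+
+// A minimal usage sketch of Rejection, where target and proposal are values
+// satisfying distuv.LogProber and distuv.RandLogProber, and 100 is an assumed
+// bound on target(x)/proposal(x):
+//  batch := make([]float64, 1000)
+//  nProposed, ok := Rejection(batch, target, proposal, 100, nil)
+//  if !ok {
+//  	// The constant was too low; batch has been filled with NaN.
+//  }
+//  // On success, nProposed/len(batch) estimates the effective value of c.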
+
+// MHProposal defines a proposal distribution for Metropolis Hastings.
+type MHProposal interface {
+	// ConditionalLogProb returns the probability of the first argument
+	// conditioned on being at the second argument.
+	//  p(x|y)
+	ConditionalLogProb(x, y float64) (prob float64)
+
+	// ConditionalRand generates a new random location conditioned on being
+	// at the location y.
+	ConditionalRand(y float64) (x float64)
+}
+
+// MetropolisHastingser is a wrapper around the MetropolisHastings sampling type.
+//
+// BurnIn sets the number of samples to discard before keeping the first sample.
+// A properly set BurnIn rate will decorrelate the sampling chain from the initial
+// location. The proper BurnIn value will depend on the mixing time of the
+// Markov chain defined by the target and proposal distributions.
+//
+// Rate sets the number of samples to discard in between each kept sample. A
+// higher rate will better approximate independently and identically distributed
+// samples, while a lower rate will keep more information (at the cost of
+// higher correlation between samples). If Rate is 0 it is defaulted to 1.
+//
+// The initial value is NOT changed during calls to Sample.
+type MetropolisHastingser struct {
+	Initial  float64
+	Target   distuv.LogProber
+	Proposal MHProposal
+	Src      *rand.Rand
+
+	BurnIn int
+	Rate   int
+}
+
+// Sample generates len(batch) samples using the Metropolis-Hastings sample
+// generation method. The initial location is NOT updated during the call to Sample.
+func (m MetropolisHastingser) Sample(batch []float64) {
+	burnIn := m.BurnIn
+	rate := m.Rate
+	if rate == 0 {
+		rate = 1
+	}
+
+	// Use the optimal size for the temporary memory to allow the fewest calls
+	// to MetropolisHastings. The case where tmp aliases batch (when
+	// rate <= len(batch)) must be handled: tmp is reallocated after burn-in,
+	// below, so that generating the intermediate rate samples does not
+	// overwrite elements of batch that have already been kept.
+	tmp := batch
+	if rate > len(batch) {
+		tmp = make([]float64, rate)
+	}
+
+	// Perform burn-in. The samples are generated into tmp[:newSamp] so that
+	// the last generated sample is available as the next initial location.
+	remaining := burnIn
+	initial := m.Initial
+	for remaining != 0 {
+		newSamp := min(len(tmp), remaining)
+		MetropolisHastings(tmp[:newSamp], initial, m.Target, m.Proposal, m.Src)
+		initial = tmp[newSamp-1]
+		remaining -= newSamp
+	}
+
+	if rate == 1 {
+		MetropolisHastings(batch, initial, m.Target, m.Proposal, m.Src)
+		return
+	}
+
+	if len(tmp) <= len(batch) {
+		tmp = make([]float64, rate)
+	}
+
+	// Take a single sample from the chain.
+	MetropolisHastings(batch[0:1], initial, m.Target, m.Proposal, m.Src)
+	initial = batch[0]
+
+	// For all of the other samples, first generate Rate samples and then actually
+	// accept the last one.
+	for i := 1; i < len(batch); i++ {
+		MetropolisHastings(tmp, initial, m.Target, m.Proposal, m.Src)
+		v := tmp[rate-1]
+		batch[i] = v
+		initial = v
+	}
+}
+
+// MetropolisHastings generates len(batch) samples using the Metropolis-Hastings
+// algorithm (http://en.wikipedia.org/wiki/Metropolis%E2%80%93Hastings_algorithm),
+// with the given target and proposal distributions, starting at the initial location
+// and storing the results in-place into batch. If src != nil, it will be used to generate random
+// numbers, otherwise rand.Float64 will be used.
+//
+// Metropolis-Hastings is a Markov-chain Monte Carlo algorithm that generates
+// samples according to the distribution specified by target by using the Markov
+// chain implicitly defined by the proposal distribution. At each
+// iteration, a proposal point is generated randomly from the current location.
+// This proposal point is accepted with probability
+//  p = min(1, (target(new) * proposal(current|new)) / (target(current) * proposal(new|current)))
+// If the new location is accepted, it is stored into batch and becomes the
+// new current location. If it is rejected, the current location remains and
+// is stored into batch. Thus, a location is stored into batch at every iteration.
+//
+// The samples in Metropolis-Hastings are correlated with one another through the
+// Markov chain. As a result, the initial value can have a significant influence
+// on the early samples, and so, typically, the first samples generated by the chain
+// are ignored. This is known as "burn-in", and can be accomplished with slicing.
+// The best choice for burn-in length will depend on the sampling and target
+// distributions.
+//
+// Many choose to have a sampling "rate" where a number of samples
+// are ignored in between each kept sample. This helps decorrelate
+// the samples from one another, but also reduces the number of available samples.
+// A sampling rate can be implemented with successive calls to MetropolisHastings.
+func MetropolisHastings(batch []float64, initial float64, target distuv.LogProber, proposal MHProposal, src *rand.Rand) { + f64 := rand.Float64 + if src != nil { + f64 = src.Float64 + } + current := initial + currentLogProb := target.LogProb(initial) + for i := range batch { + proposed := proposal.ConditionalRand(current) + proposedLogProb := target.LogProb(proposed) + probTo := proposal.ConditionalLogProb(proposed, current) + probBack := proposal.ConditionalLogProb(current, proposed) + + accept := math.Exp(proposedLogProb + probBack - probTo - currentLogProb) + if accept > f64() { + current = proposed + currentLogProb = proposedLogProb + } + batch[i] = current + } +} + +// IIDer is a wrapper around the IID sample generation method. +type IIDer struct { + Dist distuv.Rander +} + +// Sample generates a set of identically and independently distributed samples. +func (iid IIDer) Sample(batch []float64) { + IID(batch, iid.Dist) +} + +// IID generates a set of independently and identically distributed samples from +// the input distribution. +func IID(batch []float64, d distuv.Rander) { + for i := range batch { + batch[i] = d.Rand() + } +} diff --git a/stat/sampleuv/sample_test.go b/stat/sampleuv/sample_test.go new file mode 100644 index 00000000..9c5c5ad6 --- /dev/null +++ b/stat/sampleuv/sample_test.go @@ -0,0 +1,99 @@ +// Copyright ©2015 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sampleuv + +import ( + "math" + "sort" + "testing" + + "github.com/gonum/stat" + "github.com/gonum/stat/distuv" +) + +type lhDist interface { + Quantile(float64) float64 + CDF(float64) float64 +} + +func TestLatinHypercube(t *testing.T) { + for _, nSamples := range []int{1, 2, 5, 10, 20} { + samples := make([]float64, nSamples) + for _, dist := range []lhDist{ + distuv.Uniform{Min: 0, Max: 1}, + distuv.Uniform{Min: 0, Max: 10}, + distuv.Normal{Mu: 5, Sigma: 3}, + } { + LatinHypercube(samples, dist, nil) + sort.Float64s(samples) + for i, v := range samples { + p := dist.CDF(v) + if p < float64(i)/float64(nSamples) || p > float64(i+1)/float64(nSamples) { + t.Errorf("probability out of bounds") + } + } + } + } +} + +func TestImportance(t *testing.T) { + // Test by finding the expected value of a Normal. + trueMean := 3.0 + target := distuv.Normal{Mu: trueMean, Sigma: 2} + proposal := distuv.Normal{Mu: 0, Sigma: 5} + nSamples := 100000 + x := make([]float64, nSamples) + weights := make([]float64, nSamples) + Importance(x, weights, target, proposal) + ev := stat.Mean(x, weights) + if math.Abs(ev-trueMean) > 1e-2 { + t.Errorf("Mean mismatch: Want %v, got %v", trueMean, ev) + } +} + +func TestRejection(t *testing.T) { + // Test by finding the expected value of a Normal. + trueMean := 3.0 + target := distuv.Normal{Mu: trueMean, Sigma: 2} + proposal := distuv.Normal{Mu: 0, Sigma: 5} + + nSamples := 100000 + x := make([]float64, nSamples) + Rejection(x, target, proposal, 100, nil) + ev := stat.Mean(x, nil) + if math.Abs(ev-trueMean) > 1e-2 { + t.Errorf("Mean mismatch: Want %v, got %v", trueMean, ev) + } +} + +type condNorm struct { + Sigma float64 +} + +func (c condNorm) ConditionalRand(y float64) float64 { + return distuv.Normal{Mu: y, Sigma: c.Sigma}.Rand() +} + +func (c condNorm) ConditionalLogProb(x, y float64) float64 { + return distuv.Normal{Mu: y, Sigma: c.Sigma}.LogProb(x) +} + +func TestMetropolisHastings(t *testing.T) { + // Test by finding the expected value of a Normal. 
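+	// The chain is deliberately started far from the true mean (at 100), so
+	// discarding the burnin samples below is what makes the estimate accurate.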
+	trueMean := 3.0
+	target := distuv.Normal{Mu: trueMean, Sigma: 2}
+	proposal := condNorm{Sigma: 5}
+
+	burnin := 500
+	nSamples := 100000 + burnin
+	x := make([]float64, nSamples)
+	MetropolisHastings(x, 100, target, proposal, nil)
+	// Remove burnin.
+	x = x[burnin:]
+	ev := stat.Mean(x, nil)
+	if math.Abs(ev-trueMean) > 1e-2 {
+		t.Errorf("Mean mismatch: Want %v, got %v", trueMean, ev)
+	}
+}
diff --git a/stat/sampleuv/weighted.go b/stat/sampleuv/weighted.go
new file mode 100644
index 00000000..d094a01d
--- /dev/null
+++ b/stat/sampleuv/weighted.go
@@ -0,0 +1,137 @@
+// Copyright ©2015 The gonum Authors. All rights reserved.
+// Use of this code is governed by a BSD-style
+// license that can be found in the LICENSE file
+
+package sampleuv
+
+import (
+	"math/rand"
+
+	"github.com/gonum/floats"
+)
+
+// Weighted provides sampling without replacement from a collection of items with
+// non-uniform probability.
+type Weighted struct {
+	weights []float64
+	// heap is a weight heap.
+	//
+	// It keeps a heap-organised sum of remaining
+	// index weights that are available to be taken
+	// from.
+	//
+	// Each element holds the sum of weights for
+	// the corresponding index, plus the sum
+	// of its children's weights; the children
+	// of an element i can be found at positions
+	// 2*(i+1)-1 and 2*(i+1). The root of the
+	// weight heap is at element 0.
+	//
+	// See comments in container/heap for an
+	// explanation of the layout of a heap.
+	heap []float64
+	src  *rand.Rand
+}
+
+// NewWeighted returns a Weighted for the weights w. If src is nil, the global
+// source in math/rand is used.
+//
+// Note that sampling from weights with a high variance or overall low absolute
+// value sum may result in problems with numerical stability.
+func NewWeighted(w []float64, src *rand.Rand) Weighted {
+	s := Weighted{
+		weights: make([]float64, len(w)),
+		heap:    make([]float64, len(w)),
+		src:     src,
+	}
+	s.ReweightAll(w)
+	return s
+}
+
+// Len returns the number of items held by the Weighted, including items
+// already taken.
+func (s Weighted) Len() int { return len(s.weights) }
+
+// Take returns an index from the Weighted with probability proportional
+// to the weight of the item. The weight of the item is then set to zero.
+// Take returns false if there are no items remaining.
+func (s Weighted) Take() (idx int, ok bool) {
+	const small = 1e-12
+	if floats.EqualWithinAbsOrRel(s.heap[0], 0, small, small) {
+		return -1, false
+	}
+
+	var r float64
+	if s.src == nil {
+		r = s.heap[0] * rand.Float64()
+	} else {
+		r = s.heap[0] * s.src.Float64()
+	}
+	i := 1
+	last := -1
+	left := len(s.weights)
+	for {
+		if r -= s.weights[i-1]; r <= 0 {
+			break // Fall within item i-1.
+		}
+		i <<= 1 // Move to left child.
+		if d := s.heap[i-1]; r > d {
+			r -= d
+			// If enough r to pass left child
+			// move to right child state will
+			// be caught at break above.
+			i++
+		}
+		if i == last || left < 0 {
+			// No progression.
+			return -1, false
+		}
+		last = i
+		left--
+	}
+
+	w, idx := s.weights[i-1], i-1
+
+	s.weights[i-1] = 0
+	for i > 0 {
+		s.heap[i-1] -= w
+		// The following condition is necessary to
+		// handle floating point error. If we see
+		// a heap value below zero, we know we need
+		// to rebuild it.
+		if s.heap[i-1] < 0 {
+			s.reset()
+			return idx, true
+		}
+		i >>= 1
+	}
+
+	return idx, true
+}
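+
+// For illustration, with weights {1, 2, 4} the heap is
+//  heap[0] = 1 + 2 + 4 = 7 (weight of item 0 plus both children)
+//  heap[1] = 2              (item 1)
+//  heap[2] = 4              (item 2)
+// Take draws r uniformly from [0, heap[0]) and walks down from the root,
+// first subtracting the weight of the current item, then moving to the left
+// child, and crossing to the right child when r exceeds the left child's
+// subtree sum. Zeroing the taken weight and subtracting it along the path
+// back to the root keeps every subtree sum consistent for the next draw.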
+
+// Reweight sets the weight of item idx to w.
+func (s Weighted) Reweight(idx int, w float64) {
+	w, s.weights[idx] = s.weights[idx]-w, w
+	idx++
+	for idx > 0 {
+		s.heap[idx-1] -= w
+		idx >>= 1
+	}
+}
+
+// ReweightAll sets the weight of all items in the Weighted. ReweightAll
+// panics if len(w) != s.Len().
+func (s Weighted) ReweightAll(w []float64) {
+	if len(w) != s.Len() {
+		panic("sampleuv: length of the slices do not match")
+	}
+	copy(s.weights, w)
+	s.reset()
+}
+
+func (s Weighted) reset() {
+	copy(s.heap, s.weights)
+	for i := len(s.heap) - 1; i > 0; i-- {
+		// Sometimes 1-based counting makes sense.
+		s.heap[((i+1)>>1)-1] += s.heap[i]
+	}
+}
diff --git a/stat/sampleuv/weighted_test.go b/stat/sampleuv/weighted_test.go
new file mode 100644
index 00000000..d09a8d9b
--- /dev/null
+++ b/stat/sampleuv/weighted_test.go
@@ -0,0 +1,267 @@
+// Copyright ©2015 The gonum Authors. All rights reserved.
+// Use of this code is governed by a BSD-style
+// license that can be found in the LICENSE file
+
+package sampleuv
+
+import (
+	"flag"
+	"math/rand"
+	"reflect"
+	"testing"
+	"time"
+
+	"github.com/gonum/floats"
+)
+
+var prob = flag.Bool("prob", false, "enables probabilistic testing of the random weighted sampler")
+
+const sigChi2 = 16.92 // p = 0.05 df = 9
+
+var (
+	newExp = func() []float64 {
+		return []float64{1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9}
+	}
+	exp = newExp()
+
+	obt = []float64{973, 1937, 3898, 7897, 15769, 31284, 62176, 125408, 250295, 500363}
+)
+
+func newTestWeighted() Weighted {
+	weights := make([]float64, len(obt))
+	for i := range weights {
+		weights[i] = float64(int(1) << uint(i))
+	}
+	return NewWeighted(weights, nil)
+}
+
+func TestWeightedUnseeded(t *testing.T) {
+	rand.Seed(0)
+
+	want := Weighted{
+		weights: []float64{1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9},
+		heap: []float64{
+			exp[0] + exp[1] + exp[3] + exp[4] + exp[7] + exp[8] + exp[9] + exp[2] + exp[5] + exp[6],
+			exp[1] + exp[3] + exp[4] + exp[7] + exp[8] + exp[9],
+			exp[2] + exp[5] + exp[6],
+			exp[3] + exp[7] + exp[8],
+			exp[4] + exp[9],
+			exp[5],
+			exp[6],
+			exp[7],
+			exp[8],
+			exp[9],
+		},
+	}
+
+	ts := newTestWeighted()
+	if !reflect.DeepEqual(ts, want) {
+		t.Fatalf("unexpected new Weighted value:\ngot: %#v\nwant:%#v", ts, want)
+	}
+
+	f := make([]float64, len(obt))
+	for i := 0; i < 1e6; i++ {
+		item, ok := newTestWeighted().Take()
+		if !ok {
+			t.Fatal("Weighted unexpectedly empty")
+		}
+		f[item]++
+	}
+
+	exp := newExp()
+	fac := floats.Sum(f) / floats.Sum(exp)
+	for i := range f {
+		exp[i] *= fac
+	}
+
+	if !reflect.DeepEqual(f, obt) {
+		t.Fatalf("unexpected selection:\ngot: %#v\nwant:%#v", f, obt)
+	}
+
+	// Check that this is within statistical expectations - we know this is true for this set.
+	X := chi2(f, exp)
+	if X >= sigChi2 {
+		t.Errorf("H₀: d(Sample) = d(Expect), H₁: d(S) ≠ d(Expect). df = %d, p = 0.05, X² threshold = %.2f, X² = %f", len(f)-1, sigChi2, X)
+	}
+}
+
+func TestWeightedTimeSeeded(t *testing.T) {
+	if !*prob {
+		t.Skip("probabilistic testing not requested")
+	}
+	t.Log("Note: This test is stochastic and is expected to fail with probability ≈ 0.05.")
+
+	rand.Seed(time.Now().Unix())
+
+	f := make([]float64, len(obt))
+	for i := 0; i < 1e6; i++ {
+		item, ok := newTestWeighted().Take()
+		if !ok {
+			t.Fatal("Weighted unexpectedly empty")
+		}
+		f[item]++
+	}
+
+	exp := newExp()
+	fac := floats.Sum(f) / floats.Sum(exp)
+	for i := range f {
+		exp[i] *= fac
+	}
+
+	// Check that our obtained values are within statistical expectations for p = 0.05.
+ // This will not be true approximately 1 in 20 tests. + X := chi2(f, exp) + if X >= sigChi2 { + t.Errorf("H₀: d(Sample) = d(Expect), H₁: d(S) ≠ d(Expect). df = %d, p = 0.05, X² threshold = %.2f, X² = %f", len(f)-1, sigChi2, X) + } +} + +func TestWeightZero(t *testing.T) { + rand.Seed(0) + + want := Weighted{ + weights: []float64{1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 0, 1 << 7, 1 << 8, 1 << 9}, + heap: []float64{ + exp[0] + exp[1] + exp[3] + exp[4] + exp[7] + exp[8] + exp[9] + exp[2] + exp[5], + exp[1] + exp[3] + exp[4] + exp[7] + exp[8] + exp[9], + exp[2] + exp[5], + exp[3] + exp[7] + exp[8], + exp[4] + exp[9], + exp[5], + 0, + exp[7], + exp[8], + exp[9], + }, + } + + ts := newTestWeighted() + ts.Reweight(6, 0) + if !reflect.DeepEqual(ts, want) { + t.Fatalf("unexpected new Weighted value:\ngot: %#v\nwant:%#v", ts, want) + } + + f := make([]float64, len(obt)) + for i := 0; i < 1e6; i++ { + ts := newTestWeighted() + ts.Reweight(6, 0) + item, ok := ts.Take() + if !ok { + t.Fatal("Weighted unexpectedly empty") + } + f[item]++ + } + + exp := newExp() + fac := floats.Sum(f) / floats.Sum(exp) + for i := range f { + exp[i] *= fac + } + + if f[6] != 0 { + t.Errorf("unexpected selection rate for zero-weighted item: got: %v want:%v", f[6], 0) + } + if reflect.DeepEqual(f[:6], obt[:6]) { + t.Fatal("unexpected selection: too few elements chosen in range:\ngot: %v\nwant:%v", + f[:6], obt[:6]) + } + if reflect.DeepEqual(f[7:], obt[7:]) { + t.Fatal("unexpected selection: too few elements chosen in range:\ngot: %v\nwant:%v", + f[7:], obt[7:]) + } +} + +func TestWeightIncrease(t *testing.T) { + rand.Seed(0) + + want := Weighted{ + weights: []float64{1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 9 * 2, 1 << 7, 1 << 8, 1 << 9}, + heap: []float64{ + exp[0] + exp[1] + exp[3] + exp[4] + exp[7] + exp[8] + exp[9] + exp[2] + exp[5] + exp[9]*2, + exp[1] + exp[3] + exp[4] + exp[7] + exp[8] + exp[9], + exp[2] + exp[5] + exp[9]*2, + exp[3] + exp[7] + exp[8], + exp[4] + exp[9], + exp[5], + exp[9] * 2, + exp[7], + exp[8], + exp[9], + }, + } + + ts := newTestWeighted() + ts.Reweight(6, ts.weights[len(ts.weights)-1]*2) + if !reflect.DeepEqual(ts, want) { + t.Fatalf("unexpected new Weighted value:\ngot: %#v\nwant:%#v", ts, want) + } + + f := make([]float64, len(obt)) + for i := 0; i < 1e6; i++ { + ts := newTestWeighted() + ts.Reweight(6, ts.weights[len(ts.weights)-1]*2) + item, ok := ts.Take() + if !ok { + t.Fatal("Weighted unexpectedly empty") + } + f[item]++ + } + + exp := newExp() + fac := floats.Sum(f) / floats.Sum(exp) + for i := range f { + exp[i] *= fac + } + + if f[6] < f[9] { + t.Errorf("unexpected selection rate for re-weighted item: got: %v want:%v", f[6], f[9]) + } + if reflect.DeepEqual(f[:6], obt[:6]) { + t.Fatal("unexpected selection: too many elements chosen in range:\ngot: %v\nwant:%v", + f[:6], obt[:6]) + } + if reflect.DeepEqual(f[7:], obt[7:]) { + t.Fatal("unexpected selection: too many elements chosen in range:\ngot: %v\nwant:%v", + f[7:], obt[7:]) + } +} + +func chi2(ob, ex []float64) (sum float64) { + for i := range ob { + x := ob[i] - ex[i] + sum += (x * x) / ex[i] + } + + return sum +} + +func TestWeightedNoResample(t *testing.T) { + const ( + tries = 10 + n = 10e5 + ) + ts := NewWeighted(make([]float64, n), nil) + w := make([]float64, n) + for i := 0; i < tries; i++ { + for j := range w { + w[j] = rand.Float64() * n + } + ts.ReweightAll(w) + taken := make(map[int]struct{}) + var c int + for { + item, ok := ts.Take() + if !ok { + if c != n { + t.Errorf("unexpected number of items: 
got: %d want: %d", c, n) + } + break + } + c++ + if _, exists := taken[item]; exists { + t.Errorf("unexpected duplicate sample for item: %d", item) + } + taken[item] = struct{}{} + } + } +} diff --git a/stat/stat.go b/stat/stat.go new file mode 100644 index 00000000..a476ac91 --- /dev/null +++ b/stat/stat.go @@ -0,0 +1,1161 @@ +// Copyright ©2014 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package stat provides generalized statistical functions. +package stat + +import ( + "math" + "sort" + + "github.com/gonum/floats" +) + +// CumulantKind specifies the behavior for calculating the empirical CDF or Quantile +type CumulantKind int + +const ( + // Constant values should match the R nomenclature. See + // https://en.wikipedia.org/wiki/Quantile#Estimating_the_quantiles_of_a_population + + // Empirical treats the distribution as the actual empirical distribution. + Empirical CumulantKind = 1 +) + +// bhattacharyyaCoeff computes the Bhattacharyya Coefficient for probability distributions given by: +// \sum_i \sqrt{p_i q_i} +// +// It is assumed that p and q have equal length. +func bhattacharyyaCoeff(p, q []float64) float64 { + var bc float64 + for i, a := range p { + bc += math.Sqrt(a * q[i]) + } + return bc +} + +// Bhattacharyya computes the distance between the probability distributions p and q given by: +// -\ln ( \sum_i \sqrt{p_i q_i} ) +// +// The lengths of p and q must be equal. It is assumed that p and q sum to 1. +func Bhattacharyya(p, q []float64) float64 { + if len(p) != len(q) { + panic("stat: slice length mismatch") + } + bc := bhattacharyyaCoeff(p, q) + return -math.Log(bc) +} + +// CDF returns the empirical cumulative distribution function value of x, that is +// the fraction of the samples less than or equal to q. The +// exact behavior is determined by the CumulantKind. CDF is theoretically +// the inverse of the Quantile function, though it may not be the actual inverse +// for all values q and CumulantKinds. +// +// The x data must be sorted in increasing order. If weights is nil then all +// of the weights are 1. If weights is not nil, then len(x) must equal len(weights). +// +// CumulantKind behaviors: +// - Empirical: Returns the lowest fraction for which q is greater than or equal +// to that fraction of samples +func CDF(q float64, c CumulantKind, x, weights []float64) float64 { + if weights != nil && len(x) != len(weights) { + panic("stat: slice length mismatch") + } + if floats.HasNaN(x) { + return math.NaN() + } + if !sort.Float64sAreSorted(x) { + panic("x data are not sorted") + } + + if q < x[0] { + return 0 + } + if q >= x[len(x)-1] { + return 1 + } + + var sumWeights float64 + if weights == nil { + sumWeights = float64(len(x)) + } else { + sumWeights = floats.Sum(weights) + } + + // Calculate the index + switch c { + case Empirical: + // Find the smallest value that is greater than that percent of the samples + var w float64 + for i, v := range x { + if v > q { + return w / sumWeights + } + if weights == nil { + w++ + } else { + w += weights[i] + } + } + panic("impossible") + default: + panic("stat: bad cumulant kind") + } +} + +// ChiSquare computes the chi-square distance between the observed frequences 'obs' and +// expected frequences 'exp' given by: +// \sum_i (obs_i-exp_i)^2 / exp_i +// +// The lengths of obs and exp must be equal. 
+func ChiSquare(obs, exp []float64) float64 { + if len(obs) != len(exp) { + panic("stat: slice length mismatch") + } + var result float64 + for i, a := range obs { + b := exp[i] + if a == 0 && b == 0 { + continue + } + result += (a - b) * (a - b) / b + } + return result +} + +// CircularMean returns the circular mean of the dataset. +// atan2(\sum_i w_i * sin(alpha_i), \sum_i w_i * cos(alpha_i)) +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +func CircularMean(x, weights []float64) float64 { + if weights != nil && len(x) != len(weights) { + panic("stat: slice length mismatch") + } + + var aX, aY float64 + if weights != nil { + for i, v := range x { + aX += weights[i] * math.Cos(v) + aY += weights[i] * math.Sin(v) + } + } else { + for _, v := range x { + aX += math.Cos(v) + aY += math.Sin(v) + } + } + + return math.Atan2(aY, aX) +} + +// Correlation returns the weighted correlation between the samples of x and y +// with the given means. +// sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (stdX * stdY) +// The lengths of x and y must be equal. If weights is nil then all of the +// weights are 1. If weights is not nil, then len(x) must equal len(weights). +func Correlation(x, y, weights []float64) float64 { + // This is a two-pass corrected implementation. It is an adaptation of the + // algorithm used in the MeanVariance function, which applies a correction + // to the typical two pass approach. + + if len(x) != len(y) { + panic("stat: slice length mismatch") + } + xu := Mean(x, weights) + yu := Mean(y, weights) + var ( + sxx float64 + syy float64 + sxy float64 + xcompensation float64 + ycompensation float64 + ) + if weights == nil { + for i, xv := range x { + yv := y[i] + xd := xv - xu + yd := yv - yu + sxx += xd * xd + syy += yd * yd + sxy += xd * yd + xcompensation += xd + ycompensation += yd + } + // xcompensation and ycompensation are from Chan, et. al. + // referenced in the MeanVariance function. They are analogous + // to the second term in (1.7) in that paper. + sxx -= xcompensation * xcompensation / float64(len(x)) + syy -= ycompensation * ycompensation / float64(len(x)) + + return (sxy - xcompensation*ycompensation/float64(len(x))) / math.Sqrt(sxx*syy) + + } + + var sumWeights float64 + for i, xv := range x { + w := weights[i] + yv := y[i] + xd := xv - xu + wxd := w * xd + yd := yv - yu + wyd := w * yd + sxx += wxd * xd + syy += wyd * yd + sxy += wxd * yd + xcompensation += wxd + ycompensation += wyd + sumWeights += w + } + // xcompensation and ycompensation are from Chan, et. al. + // referenced in the MeanVariance function. They are analogous + // to the second term in (1.7) in that paper, except they use + // the sumWeights instead of the sample count. + sxx -= xcompensation * xcompensation / sumWeights + syy -= ycompensation * ycompensation / sumWeights + + return (sxy - xcompensation*ycompensation/sumWeights) / math.Sqrt(sxx*syy) +} + +// Covariance returns the weighted covariance between the samples of x and y. +// sum_i {w_i (x_i - meanX) * (y_i - meanY)} / (sum_j {w_j} - 1) +// The lengths of x and y must be equal. If weights is nil then all of the +// weights are 1. If weights is not nil, then len(x) must equal len(weights). +func Covariance(x, y, weights []float64) float64 { + // This is a two-pass corrected implementation. It is an adaptation of the + // algorithm used in the MeanVariance function, which applies a correction + // to the typical two pass approach. 
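+	// In the weighted case the value computed below is
+	//  (\sum_i {w_i xd_i yd_i} - \sum_i {w_i xd_i} * \sum_i {w_i yd_i} / \sum_j {w_j}) / (\sum_j {w_j} - 1)
+	// where xd_i = x_i - meanX and yd_i = y_i - meanY.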
+ + if len(x) != len(y) { + panic("stat: slice length mismatch") + } + xu := Mean(x, weights) + yu := Mean(y, weights) + var ( + ss float64 + xcompensation float64 + ycompensation float64 + ) + if weights == nil { + for i, xv := range x { + yv := y[i] + xd := xv - xu + yd := yv - yu + ss += xd * yd + xcompensation += xd + ycompensation += yd + } + // xcompensation and ycompensation are from Chan, et. al. + // referenced in the MeanVariance function. They are analogous + // to the second term in (1.7) in that paper. + return (ss - xcompensation*ycompensation/float64(len(x))) / float64(len(x)-1) + } + + var sumWeights float64 + + for i, xv := range x { + w := weights[i] + yv := y[i] + wxd := w * (xv - xu) + yd := (yv - yu) + ss += wxd * yd + xcompensation += wxd + ycompensation += w * yd + sumWeights += w + } + // xcompensation and ycompensation are from Chan, et. al. + // referenced in the MeanVariance function. They are analogous + // to the second term in (1.7) in that paper, except they use + // the sumWeights instead of the sample count. + return (ss - xcompensation*ycompensation/sumWeights) / (sumWeights - 1) +} + +// CrossEntropy computes the cross-entropy between the two distributions specified +// in p and q. +func CrossEntropy(p, q []float64) float64 { + if len(p) != len(q) { + panic("stat: slice length mismatch") + } + var ce float64 + for i, v := range p { + if v != 0 { + ce -= v * math.Log(q[i]) + } + } + return ce +} + +// Entropy computes the Shannon entropy of a distribution or the distance between +// two distributions. The natural logarithm is used. +// - sum_i (p_i * log_e(p_i)) +func Entropy(p []float64) float64 { + var e float64 + for _, v := range p { + if v != 0 { // Entropy needs 0 * log(0) == 0 + e -= v * math.Log(v) + } + } + return e +} + +// ExKurtosis returns the population excess kurtosis of the sample. +// The kurtosis is defined by the 4th moment of the mean divided by the squared +// variance. The excess kurtosis subtracts 3.0 so that the excess kurtosis of +// the normal distribution is zero. +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +func ExKurtosis(x, weights []float64) float64 { + mean, std := MeanStdDev(x, weights) + if weights == nil { + var e float64 + for _, v := range x { + z := (v - mean) / std + e += z * z * z * z + } + mul, offset := kurtosisCorrection(float64(len(x))) + return e*mul - offset + } + + var ( + e float64 + sumWeights float64 + ) + for i, v := range x { + z := (v - mean) / std + e += weights[i] * z * z * z * z + sumWeights += weights[i] + } + mul, offset := kurtosisCorrection(sumWeights) + return e*mul - offset +} + +// n is the number of samples +// see https://en.wikipedia.org/wiki/Kurtosis +func kurtosisCorrection(n float64) (mul, offset float64) { + return ((n + 1) / (n - 1)) * (n / (n - 2)) * (1 / (n - 3)), 3 * ((n - 1) / (n - 2)) * ((n - 1) / (n - 3)) +} + +// GeometricMean returns the weighted geometric mean of the dataset +// \prod_i {x_i ^ w_i} +// This only applies with positive x and positive weights. If weights is nil +// then all of the weights are 1. If weights is not nil, then len(x) must equal +// len(weights). 
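+//
+// For numerical stability the mean is computed in the log domain, as
+//  exp( \sum_i {w_i * log(x_i)} / \sum_i {w_i} )
+// which is what the implementation below does.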
+func GeometricMean(x, weights []float64) float64 { + if weights == nil { + var s float64 + for _, v := range x { + s += math.Log(v) + } + s /= float64(len(x)) + return math.Exp(s) + } + if len(x) != len(weights) { + panic("stat: slice length mismatch") + } + var ( + s float64 + sumWeights float64 + ) + for i, v := range x { + s += weights[i] * math.Log(v) + sumWeights += weights[i] + } + s /= sumWeights + return math.Exp(s) +} + +// HarmonicMean returns the weighted harmonic mean of the dataset +// \sum_i {w_i} / ( sum_i {w_i / x_i} ) +// This only applies with positive x and positive weights. +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +func HarmonicMean(x, weights []float64) float64 { + if weights != nil && len(x) != len(weights) { + panic("stat: slice length mismatch") + } + // TODO: Fix this to make it more efficient and avoid allocation + + // This can be numerically unstable (for example if x is very small) + // W = \sum_i {w_i} + // hm = exp(log(W) - log(\sum_i w_i / x_i)) + + logs := make([]float64, len(x)) + var W float64 + for i := range x { + if weights == nil { + logs[i] = -math.Log(x[i]) + W++ + continue + } + logs[i] = math.Log(weights[i]) - math.Log(x[i]) + W += weights[i] + } + + // Sum all of the logs + v := floats.LogSumExp(logs) // this computes log(\sum_i { w_i / x_i}) + return math.Exp(math.Log(W) - v) +} + +// Hellinger computes the distance between the probability distributions p and q given by: +// \sqrt{ 1 - \sum_i \sqrt{p_i q_i} } +// +// The lengths of p and q must be equal. It is assumed that p and q sum to 1. +func Hellinger(p, q []float64) float64 { + if len(p) != len(q) { + panic("stat: slice length mismatch") + } + bc := bhattacharyyaCoeff(p, q) + return math.Sqrt(1 - bc) +} + +// Histogram sums up the weighted number of data points in each bin. +// The weight of data point x[i] will be placed into count[j] if +// dividers[j] <= x < dividers[j+1]. The "span" function in the floats package can assist +// with bin creation. +// +// The following conditions on the inputs apply: +// - The count variable must either be nil or have length of one less than dividers. +// - The values in dividers must be sorted (use the sort package). +// - The x values must be sorted. +// - If weights is nil then all of the weights are 1. +// - If weights is not nil, then len(x) must equal len(weights). 
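+//
+// A short sketch of typical use:
+//  x := []float64{1, 3, 5, 6, 7, 8}
+//  dividers := []float64{0, 2, 4, 6, 7, 9}
+//  counts := Histogram(nil, dividers, x, nil)
+// which yields counts == [1 1 1 1 2], one weighted count per bin.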
+func Histogram(count, dividers, x, weights []float64) []float64 {
+	if weights != nil && len(x) != len(weights) {
+		panic("stat: slice length mismatch")
+	}
+	if count == nil {
+		count = make([]float64, len(dividers)-1)
+	}
+	if len(dividers) < 2 {
+		panic("histogram: fewer than two dividers")
+	}
+	if len(count) != len(dividers)-1 {
+		panic("histogram: bin count mismatch")
+	}
+	if !sort.Float64sAreSorted(dividers) {
+		panic("histogram: dividers are not sorted")
+	}
+	if !sort.Float64sAreSorted(x) {
+		panic("histogram: x data are not sorted")
+	}
+	for i := range count {
+		count[i] = 0
+	}
+	if len(x) == 0 {
+		return count
+	}
+	if x[0] < dividers[0] {
+		panic("histogram: minimum x value is less than lowest divider")
+	}
+	if x[len(x)-1] >= dividers[len(dividers)-1] {
+		panic("histogram: maximum x value is not less than highest divider")
+	}
+
+	idx := 0
+	comp := dividers[idx+1]
+	if weights == nil {
+		for _, v := range x {
+			if v < comp {
+				// Still in the current bucket.
+				count[idx]++
+				continue
+			}
+			// Find the next divider where v is less than the divider.
+			for j := idx + 1; j < len(count); j++ {
+				if v < dividers[j+1] {
+					idx = j
+					comp = dividers[j+1]
+					break
+				}
+			}
+			count[idx]++
+		}
+		return count
+	}
+
+	for i, v := range x {
+		if v < comp {
+			// Still in the current bucket.
+			count[idx] += weights[i]
+			continue
+		}
+		// Find the next divider where v is less than the divider.
+		for j := idx + 1; j < len(count); j++ {
+			if v < dividers[j+1] {
+				idx = j
+				comp = dividers[j+1]
+				break
+			}
+		}
+		count[idx] += weights[i]
+	}
+	return count
+}
+
+// JensenShannon computes the Jensen-Shannon divergence between the distributions
+// p and q. The Jensen-Shannon divergence is defined as
+//  m = 0.5 * (p + q)
+//  JS(p, q) = 0.5 ( KL(p, m) + KL(q, m) )
+// Unlike the Kullback-Leibler divergence, the Jensen-Shannon distance is
+// symmetric. The value is between 0 and ln(2).
+func JensenShannon(p, q []float64) float64 {
+	if len(p) != len(q) {
+		panic("stat: slice length mismatch")
+	}
+	var js float64
+	for i, v := range p {
+		qi := q[i]
+		m := 0.5 * (v + qi)
+		if v != 0 {
+			// Add the KL term from p to m.
+			js += 0.5 * v * (math.Log(v) - math.Log(m))
+		}
+		if qi != 0 {
+			// Add the KL term from q to m.
+			js += 0.5 * qi * (math.Log(qi) - math.Log(m))
+		}
+	}
+	return js
+}
+
+// KolmogorovSmirnov computes the largest distance between two empirical CDFs.
+// Each dataset x and y consists of sample locations and counts, xWeights and
+// yWeights, respectively.
+//
+// x and y may have different lengths, though len(x) must equal len(xWeights), and
+// len(y) must equal len(yWeights). Both x and y must be sorted.
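+//
+// For example, the empirical CDFs of x = {1, 2, 3} and y = {1, 1, 3} differ
+// most at the value 1, where they are 1/3 and 2/3 respectively, so
+//  KolmogorovSmirnov(x, nil, y, nil) == 1.0/3.0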
+// +// Special cases are: +// = 0 if len(x) == len(y) == 0 +// = 1 if len(x) == 0, len(y) != 0 or len(x) != 0 and len(y) == 0 +func KolmogorovSmirnov(x, xWeights, y, yWeights []float64) float64 { + if xWeights != nil && len(x) != len(xWeights) { + panic("stat: slice length mismatch") + } + if yWeights != nil && len(y) != len(yWeights) { + panic("stat: slice length mismatch") + } + if len(x) == 0 || len(y) == 0 { + if len(x) == 0 && len(y) == 0 { + return 0 + } + return 1 + } + + if floats.HasNaN(x) { + return math.NaN() + } + if floats.HasNaN(y) { + return math.NaN() + } + + if !sort.Float64sAreSorted(x) { + panic("x data are not sorted") + } + if !sort.Float64sAreSorted(y) { + panic("y data are not sorted") + } + + xWeightsNil := xWeights == nil + yWeightsNil := yWeights == nil + + var ( + maxDist float64 + xSum, ySum float64 + xCdf, yCdf float64 + xIdx, yIdx int + ) + + if xWeightsNil { + xSum = float64(len(x)) + } else { + xSum = floats.Sum(xWeights) + } + + if yWeightsNil { + ySum = float64(len(y)) + } else { + ySum = floats.Sum(yWeights) + } + + xVal := x[0] + yVal := y[0] + + // Algorithm description: + // The goal is to find the maximum difference in the empirical CDFs for the + // two datasets. The CDFs are piecewise-constant, and thus the distance + // between the CDFs will only change at the values themselves. + // + // To find the maximum distance, step through the data in ascending order + // of value between the two datasets. At each step, compute the empirical CDF + // and compare the local distance with the maximum distance. + // Due to some corner cases, equal data entries must be tallied simultaneously. + for { + switch { + case xVal < yVal: + xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil) + case yVal < xVal: + yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil) + case xVal == yVal: + newX := x[xIdx] + newY := y[yIdx] + if newX < newY { + xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil) + } else if newY < newX { + yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil) + } else { + // Update them both, they'll be equal next time and the right + // thing will happen + xVal, xCdf, xIdx = updateKS(xIdx, xCdf, xSum, x, xWeights, xWeightsNil) + yVal, yCdf, yIdx = updateKS(yIdx, yCdf, ySum, y, yWeights, yWeightsNil) + } + default: + panic("unreachable") + } + + dist := math.Abs(xCdf - yCdf) + if dist > maxDist { + maxDist = dist + } + + // Both xCdf and yCdf will equal 1 at the end, so if we have reached the + // end of either sample list, the distance is as large as it can be. + if xIdx == len(x) || yIdx == len(y) { + return maxDist + } + } +} + +// updateKS gets the next data point from one of the set. In doing so, it combines +// the weight of all the data points of equal value. Upon return, val is the new +// value of the data set, newCdf is the total combined CDF up until this point, +// and newIdx is the index of the next location in that sample to examine. 
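+//
+// For example, stepping from idx = 0 through values = {1, 1, 2} with nil
+// weights and sum = 3 consumes both tied 1s, returning val = 2,
+// newCdf = 2/3 and newIdx = 2.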
+func updateKS(idx int, cdf, sum float64, values, weights []float64, isNil bool) (val, newCdf float64, newIdx int) { + // Sum up all the weights of consecutive values that are equal + if isNil { + newCdf = cdf + 1/sum + } else { + newCdf = cdf + weights[idx]/sum + } + newIdx = idx + 1 + for { + if newIdx == len(values) { + return values[newIdx-1], newCdf, newIdx + } + if values[newIdx-1] != values[newIdx] { + return values[newIdx], newCdf, newIdx + } + if isNil { + newCdf += 1 / sum + } else { + newCdf += weights[newIdx] / sum + } + newIdx++ + } +} + +// KullbackLeibler computes the Kullback-Leibler distance between the +// distributions p and q. The natural logarithm is used. +// sum_i(p_i * log(p_i / q_i)) +// Note that the Kullback-Leibler distance is not symmetric; +// KullbackLeibler(p,q) != KullbackLeibler(q,p) +func KullbackLeibler(p, q []float64) float64 { + if len(p) != len(q) { + panic("stat: slice length mismatch") + } + var kl float64 + for i, v := range p { + if v != 0 { // Entropy needs 0 * log(0) == 0 + kl += v * (math.Log(v) - math.Log(q[i])) + } + } + return kl +} + +// LinearRegression computes the best-fit line +// y = alpha + beta*x +// to the data in x and y with the given weights. If origin is true, the +// regression is forced to pass through the origin. +// +// Specifically, LinearRegression computes the values of alpha and +// beta such that the total residual +// \sum_i w[i]*(y[i] - alpha - beta*x[i])^2 +// is minimized. If origin is true, then alpha is forced to be zero. +// +// The lengths of x and y must be equal. If weights is nil then all of the +// weights are 1. If weights is not nil, then len(x) must equal len(weights). +func LinearRegression(x, y, weights []float64, origin bool) (alpha, beta float64) { + if len(x) != len(y) { + panic("stat: slice length mismatch") + } + if weights != nil && len(weights) != len(x) { + panic("stat: slice length mismatch") + } + + w := 1.0 + if origin { + var x2Sum, xySum float64 + for i, xi := range x { + if weights != nil { + w = weights[i] + } + yi := y[i] + xySum += w * xi * yi + x2Sum += w * xi * xi + } + beta = xySum / x2Sum + + return 0, beta + } + + beta = Covariance(x, y, weights) / Variance(x, weights) + alpha = Mean(y, weights) - beta*Mean(x, weights) + return alpha, beta +} + +// RSquared returns the coefficient of determination defined as +// R^2 = 1 - \sum_i w[i]*(y[i] - alpha - beta*x[i])^2 / \sum_i w[i]*(y[i] - mean(y))^2 +// for the line +// y = alpha + beta*x +// and the data in x and y with the given weights. +// +// The lengths of x and y must be equal. If weights is nil then all of the +// weights are 1. If weights is not nil, then len(x) must equal len(weights). +func RSquared(x, y, weights []float64, alpha, beta float64) float64 { + if len(x) != len(y) { + panic("stat: slice length mismatch") + } + if weights != nil && len(weights) != len(x) { + panic("stat: slice length mismatch") + } + + w := 1.0 + yMean := Mean(y, weights) + var res, tot, d float64 + for i, xi := range x { + if weights != nil { + w = weights[i] + } + yi := y[i] + fi := alpha + beta*xi + d = yi - fi + res += w * d * d + d = yi - yMean + tot += w * d * d + } + return 1 - res/tot +} + +// RSquaredFrom returns the coefficient of determination defined as +// R^2 = 1 - \sum_i w[i]*(estimate[i] - value[i])^2 / \sum_i w[i]*(value[i] - mean(values))^2 +// and the data in estimates and values with the given weights. +// +// The lengths of estimates and values must be equal. If weights is nil then +// all of the weights are 1. 
If weights is not nil, then len(values) must +// equal len(weights). +func RSquaredFrom(estimates, values, weights []float64) float64 { + if len(estimates) != len(values) { + panic("stat: slice length mismatch") + } + if weights != nil && len(weights) != len(values) { + panic("stat: slice length mismatch") + } + + w := 1.0 + mean := Mean(values, weights) + var res, tot, d float64 + for i, val := range values { + if weights != nil { + w = weights[i] + } + d = val - estimates[i] + res += w * d * d + d = val - mean + tot += w * d * d + } + return 1 - res/tot +} + +// RNoughtSquared returns the coefficient of determination defined as +// R₀^2 = \sum_i w[i]*(beta*x[i])^2 / \sum_i w[i]*y[i]^2 +// for the line +// y = beta*x +// and the data in x and y with the given weights. RNoughtSquared should +// only be used for best-fit lines regressed through the origin. +// +// The lengths of x and y must be equal. If weights is nil then all of the +// weights are 1. If weights is not nil, then len(x) must equal len(weights). +func RNoughtSquared(x, y, weights []float64, beta float64) float64 { + if len(x) != len(y) { + panic("stat: slice length mismatch") + } + if weights != nil && len(weights) != len(x) { + panic("stat: slice length mismatch") + } + + w := 1.0 + var ssr, tot float64 + for i, xi := range x { + if weights != nil { + w = weights[i] + } + fi := beta * xi + ssr += w * fi * fi + yi := y[i] + tot += w * yi * yi + } + return ssr / tot +} + +// Mean computes the weighted mean of the data set. +// sum_i {w_i * x_i} / sum_i {w_i} +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). +func Mean(x, weights []float64) float64 { + if weights == nil { + return floats.Sum(x) / float64(len(x)) + } + if len(x) != len(weights) { + panic("stat: slice length mismatch") + } + var ( + sumValues float64 + sumWeights float64 + ) + for i, w := range weights { + sumValues += w * x[i] + sumWeights += w + } + return sumValues / sumWeights +} + +// Mode returns the most common value in the dataset specified by x and the +// given weights. Strict float64 equality is used when comparing values, so users +// should take caution. If several values are the mode, any of them may be returned. +func Mode(x, weights []float64) (val float64, count float64) { + if weights != nil && len(x) != len(weights) { + panic("stat: slice length mismatch") + } + if len(x) == 0 { + return 0, 0 + } + m := make(map[float64]float64) + if weights == nil { + for _, v := range x { + m[v]++ + } + } else { + for i, v := range x { + m[v] += weights[i] + } + } + var ( + maxCount float64 + max float64 + ) + for val, count := range m { + if count > maxCount { + maxCount = count + max = val + } + } + return max, maxCount +} + +// Moment computes the weighted n^th moment of the samples, +// E[(x - μ)^N] +// No degrees of freedom correction is done. +// If weights is nil then all of the weights are 1. If weights is not nil, then +// len(x) must equal len(weights). 
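+//
+// For example, the 5th central moment of {6, 2, 4, 8, 10} is 0: the data
+// are symmetric about their mean of 6, so the odd powers cancel.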
+func Moment(moment float64, x, weights []float64) float64 {
+	mean := Mean(x, weights)
+	if weights == nil {
+		var m float64
+		for _, v := range x {
+			m += math.Pow(v-mean, moment)
+		}
+		return m / float64(len(x))
+	}
+	var (
+		m          float64
+		sumWeights float64
+	)
+	for i, v := range x {
+		m += weights[i] * math.Pow(v-mean, moment)
+		sumWeights += weights[i]
+	}
+	return m / sumWeights
+}
+
+// MomentAbout computes the weighted n^th moment of the samples about
+// the given mean \mu,
+//  E[(x - μ)^N]
+// No degrees of freedom correction is done.
+// If weights is nil then all of the weights are 1. If weights is not nil, then
+// len(x) must equal len(weights).
+func MomentAbout(moment float64, x []float64, mean float64, weights []float64) float64 {
+	if weights == nil {
+		var m float64
+		for _, v := range x {
+			m += math.Pow(v-mean, moment)
+		}
+		m /= float64(len(x))
+		return m
+	}
+	if len(weights) != len(x) {
+		panic("stat: slice length mismatch")
+	}
+	var (
+		m          float64
+		sumWeights float64
+	)
+	for i, v := range x {
+		m += weights[i] * math.Pow(v-mean, moment)
+		sumWeights += weights[i]
+	}
+	return m / sumWeights
+}
+
+// Quantile returns the sample of x such that at least the fraction p of the
+// samples is less than or equal to it. The exact behavior is determined by the
+// CumulantKind, and p should be a number between 0 and 1. Quantile is theoretically
+// the inverse of the CDF function, though it may not be the actual inverse
+// for all values p and CumulantKinds.
+//
+// The x data must be sorted in increasing order. If weights is nil then all
+// of the weights are 1. If weights is not nil, then len(x) must equal len(weights).
+//
+// CumulantKind behaviors:
+//  - Empirical: Returns the lowest value q for which q is greater than or equal
+//  to the fraction p of samples
+func Quantile(p float64, c CumulantKind, x, weights []float64) float64 {
+	if !(p >= 0 && p <= 1) {
+		panic("stat: percentile out of bounds")
+	}
+
+	if weights != nil && len(x) != len(weights) {
+		panic("stat: slice length mismatch")
+	}
+	if floats.HasNaN(x) {
+		return math.NaN() // This is needed because the algorithm breaks otherwise.
+	}
+	if !sort.Float64sAreSorted(x) {
+		panic("x data are not sorted")
+	}
+
+	var sumWeights float64
+	if weights == nil {
+		sumWeights = float64(len(x))
+	} else {
+		sumWeights = floats.Sum(weights)
+	}
+	switch c {
+	case Empirical:
+		var cumsum float64
+		fidx := p * sumWeights
+		for i := range x {
+			if weights == nil {
+				cumsum++
+			} else {
+				cumsum += weights[i]
+			}
+			if cumsum >= fidx {
+				return x[i]
+			}
+		}
+		panic("impossible")
+	default:
+		panic("stat: bad cumulant kind")
+	}
+}
+
+// Skew computes the skewness of the sample data.
+// If weights is nil then all of the weights are 1. If weights is not nil, then
+// len(x) must equal len(weights).
+func Skew(x, weights []float64) float64 {
+	mean, std := MeanStdDev(x, weights)
+	if weights == nil {
+		var s float64
+		for _, v := range x {
+			z := (v - mean) / std
+			s += z * z * z
+		}
+		return s * skewCorrection(float64(len(x)))
+	}
+	var (
+		s          float64
+		sumWeights float64
+	)
+	for i, v := range x {
+		z := (v - mean) / std
+		s += weights[i] * z * z * z
+		sumWeights += weights[i]
+	}
+	return s * skewCorrection(sumWeights)
+}
+
+// From: http://www.amstat.org/publications/jse/v19n2/doane.pdf page 7
+func skewCorrection(n float64) float64 {
+	return (n / (n - 1)) * (1 / (n - 2))
+}
+
+// SortWeighted rearranges the data in x along with their corresponding
+// weights so that the x data are sorted. The data is sorted in place.
+// Weights may be nil, but if weights is non-nil then it must have the same
+// length as x.
+func SortWeighted(x, weights []float64) {
+	if weights == nil {
+		sort.Float64s(x)
+		return
+	}
+	if len(x) != len(weights) {
+		panic("stat: slice length mismatch")
+	}
+	sort.Sort(weightSorter{
+		x: x,
+		w: weights,
+	})
+}
+
+type weightSorter struct {
+	x []float64
+	w []float64
+}
+
+func (w weightSorter) Len() int           { return len(w.x) }
+func (w weightSorter) Less(i, j int) bool { return w.x[i] < w.x[j] }
+func (w weightSorter) Swap(i, j int) {
+	w.x[i], w.x[j] = w.x[j], w.x[i]
+	w.w[i], w.w[j] = w.w[j], w.w[i]
+}
+
+// SortWeightedLabeled rearranges the data in x along with their
+// corresponding weights and boolean labels so that the x data are sorted.
+// The data is sorted in place. Weights and labels may be nil; if either
+// is non-nil it must have the same length as x.
+func SortWeightedLabeled(x []float64, labels []bool, weights []float64) {
+	if labels == nil {
+		SortWeighted(x, weights)
+		return
+	}
+	if weights == nil {
+		if len(x) != len(labels) {
+			panic("stat: slice length mismatch")
+		}
+		sort.Sort(labelSorter{
+			x: x,
+			l: labels,
+		})
+		return
+	}
+	if len(x) != len(labels) || len(x) != len(weights) {
+		panic("stat: slice length mismatch")
+	}
+	sort.Sort(weightLabelSorter{
+		x: x,
+		l: labels,
+		w: weights,
+	})
+}
+
+type labelSorter struct {
+	x []float64
+	l []bool
+}
+
+func (a labelSorter) Len() int           { return len(a.x) }
+func (a labelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] }
+func (a labelSorter) Swap(i, j int) {
+	a.x[i], a.x[j] = a.x[j], a.x[i]
+	a.l[i], a.l[j] = a.l[j], a.l[i]
+}
+
+type weightLabelSorter struct {
+	x []float64
+	l []bool
+	w []float64
+}
+
+func (a weightLabelSorter) Len() int           { return len(a.x) }
+func (a weightLabelSorter) Less(i, j int) bool { return a.x[i] < a.x[j] }
+func (a weightLabelSorter) Swap(i, j int) {
+	a.x[i], a.x[j] = a.x[j], a.x[i]
+	a.l[i], a.l[j] = a.l[j], a.l[i]
+	a.w[i], a.w[j] = a.w[j], a.w[i]
+}
+
+// StdDev returns the sample standard deviation.
+func StdDev(x, weights []float64) float64 {
+	_, std := MeanStdDev(x, weights)
+	return std
+}
+
+// MeanStdDev returns the sample mean and standard deviation.
+func MeanStdDev(x, weights []float64) (mean, std float64) {
+	mean, variance := MeanVariance(x, weights)
+	return mean, math.Sqrt(variance)
+}
+
+// StdErr returns the standard error in the mean for the given standard
+// deviation and sample size.
+func StdErr(std, sampleSize float64) float64 {
+	return std / math.Sqrt(sampleSize)
+}
+
+// StdScore returns the standard score (a.k.a. z-score, z-value) for the value x
+// with the given mean and standard deviation, i.e.
+//  (x - mean) / std
+func StdScore(x, mean, std float64) float64 {
+	return (x - mean) / std
+}
+
+// Variance computes the weighted sample variance:
+//  \sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
+// If weights is nil then all of the weights are 1. If weights is not nil, then
+// len(x) must equal len(weights).
+func Variance(x, weights []float64) float64 {
+	_, variance := MeanVariance(x, weights)
+	return variance
+}
+
+// MeanVariance computes the sample mean and variance, where the mean and variance are
+//  \sum_i w_i * x_i / (sum_i w_i)
+//  \sum_i w_i (x_i - mean)^2 / (sum_i w_i - 1)
+// respectively.
+// If weights is nil then all of the weights are 1. If weights is not nil, then
+// len(x) must equal len(weights).
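+//
+// For example,
+//  mean, variance := MeanVariance([]float64{8, -3, 7, 8, -4}, nil)
+// gives mean == 3.2 and variance == 37.7.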
+func MeanVariance(x, weights []float64) (mean, variance float64) { + + // This uses the corrected two-pass algorithm (1.7), from "Algorithms for computing + // the sample variance: Analysis and recommendations" by Chan, Tony F., Gene H. Golub, + // and Randall J. LeVeque. + + // note that this will panic if the slice lengths do not match + mean = Mean(x, weights) + var ( + ss float64 + compensation float64 + ) + if weights == nil { + for _, v := range x { + d := v - mean + ss += d * d + compensation += d + } + variance = (ss - compensation*compensation/float64(len(x))) / float64(len(x)-1) + return + } + + var sumWeights float64 + for i, v := range x { + w := weights[i] + d := v - mean + wd := w * d + ss += wd * d + compensation += wd + sumWeights += w + } + variance = (ss - compensation*compensation/sumWeights) / (sumWeights - 1) + return +} diff --git a/stat/stat_test.go b/stat/stat_test.go new file mode 100644 index 00000000..6ed9350a --- /dev/null +++ b/stat/stat_test.go @@ -0,0 +1,1569 @@ +// Copyright ©2014 The gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package stat + +import ( + "fmt" + "math" + "reflect" + "testing" + + "github.com/gonum/floats" +) + +func ExampleCircularMean() { + x := []float64{0, 0.25 * math.Pi, 0.75 * math.Pi} + weights := []float64{1, 2, 2.5} + cmean := CircularMean(x, weights) + + fmt.Printf("The circular mean is %.5f.\n", cmean) + // Output: + // The circular mean is 1.37037. +} + +func TestCircularMean(t *testing.T) { + for i, test := range []struct { + x []float64 + wts []float64 + ans float64 + }{ + // Values compared against scipy. + { + x: []float64{0, 2 * math.Pi}, + ans: 0, + }, + { + x: []float64{0, 0.5 * math.Pi}, + ans: 0.78539816339744, + }, + { + x: []float64{-1.5 * math.Pi, 0.5 * math.Pi, 2.5 * math.Pi}, + wts: []float64{1, 2, 3}, + ans: 0.5 * math.Pi, + }, + { + x: []float64{0, 0.5 * math.Pi}, + wts: []float64{1, 2}, + ans: 1.10714871779409, + }, + } { + c := CircularMean(test.x, test.wts) + if math.Abs(c-test.ans) > 1e-14 { + t.Errorf("Circular mean mismatch case %d: Expected %v, Found %v", i, test.ans, c) + } + } + if !Panics(func() { CircularMean(make([]float64, 3), make([]float64, 2)) }) { + t.Errorf("CircularMean did not panic with x, wts length mismatch") + } +} + +func ExampleCorrelation() { + x := []float64{8, -3, 7, 8, -4} + y := []float64{10, 5, 6, 3, -1} + w := []float64{2, 1.5, 3, 3, 2} + + fmt.Println("Correlation computes the degree to which two datasets move together") + fmt.Println("about their mean. For example, x and y above move similarly.") + + c := Correlation(x, y, w) + fmt.Printf("Correlation is %.5f\n", c) + + // Output: + // Correlation computes the degree to which two datasets move together + // about their mean. For example, x and y above move similarly. 
+ // Correlation is 0.59915 +} + +func TestCorrelation(t *testing.T) { + for i, test := range []struct { + x []float64 + y []float64 + w []float64 + ans float64 + }{ + { + x: []float64{8, -3, 7, 8, -4}, + y: []float64{8, -3, 7, 8, -4}, + w: nil, + ans: 1, + }, + { + x: []float64{8, -3, 7, 8, -4}, + y: []float64{8, -3, 7, 8, -4}, + w: []float64{1, 1, 1, 1, 1}, + ans: 1, + }, + { + x: []float64{8, -3, 7, 8, -4}, + y: []float64{8, -3, 7, 8, -4}, + w: []float64{1, 6, 7, 0.8, 2.1}, + ans: 1, + }, + { + x: []float64{8, -3, 7, 8, -4}, + y: []float64{10, 15, 4, 5, -1}, + w: nil, + ans: 0.0093334660769059, + }, + { + x: []float64{8, -3, 7, 8, -4}, + y: []float64{10, 15, 4, 5, -1}, + w: nil, + ans: 0.0093334660769059, + }, + { + x: []float64{8, -3, 7, 8, -4}, + y: []float64{10, 15, 4, 5, -1}, + w: []float64{1, 3, 1, 2, 2}, + ans: -0.13966633352689, + }, + } { + c := Correlation(test.x, test.y, test.w) + if math.Abs(test.ans-c) > 1e-14 { + t.Errorf("Correlation mismatch case %d. Expected %v, Found %v", i, test.ans, c) + } + } + if !Panics(func() { Correlation(make([]float64, 2), make([]float64, 3), make([]float64, 3)) }) { + t.Errorf("Correlation did not panic with length mismatch") + } + if !Panics(func() { Correlation(make([]float64, 2), make([]float64, 3), nil) }) { + t.Errorf("Correlation did not panic with length mismatch") + } + if !Panics(func() { Correlation(make([]float64, 3), make([]float64, 3), make([]float64, 2)) }) { + t.Errorf("Correlation did not panic with weights length mismatch") + } +} + +func ExampleCovariance() { + fmt.Println("Covariance computes the degree to which datasets move together") + fmt.Println("about their mean.") + x := []float64{8, -3, 7, 8, -4} + y := []float64{10, 2, 2, 4, 1} + cov := Covariance(x, y, nil) + fmt.Printf("Cov = %.4f\n", cov) + fmt.Println("If datasets move perfectly together, the variance equals the covariance") + y2 := []float64{12, 1, 11, 12, 0} + cov2 := Covariance(x, y2, nil) + varX := Variance(x, nil) + fmt.Printf("Cov2 is %.4f, VarX is %.4f", cov2, varX) + // Output: + // Covariance computes the degree to which datasets move together + // about their mean. 
+	// Cov = 13.8000
+	// If datasets move perfectly together, the variance equals the covariance
+	// Cov2 is 37.7000, VarX is 37.7000
+}
+
+func TestCovariance(t *testing.T) {
+	for i, test := range []struct {
+		p       []float64
+		q       []float64
+		weights []float64
+		ans     float64
+	}{
+		{
+			p:   []float64{0.75, 0.1, 0.05},
+			q:   []float64{0.5, 0.25, 0.25},
+			ans: 0.05625,
+		},
+		{
+			p:   []float64{1, 2, 3},
+			q:   []float64{2, 4, 6},
+			ans: 2,
+		},
+		{
+			p:   []float64{1, 2, 3},
+			q:   []float64{1, 4, 9},
+			ans: 4,
+		},
+		{
+			p:       []float64{1, 2, 3},
+			q:       []float64{1, 4, 9},
+			weights: []float64{1, 1.5, 1},
+			ans:     3.2,
+		},
+		{
+			p:       []float64{1, 4, 9},
+			q:       []float64{1, 4, 9},
+			weights: []float64{1, 1.5, 1},
+			ans:     13.142857142857146,
+		},
+	} {
+		c := Covariance(test.p, test.q, test.weights)
+		if math.Abs(c-test.ans) > 1e-14 {
+			t.Errorf("Covariance mismatch case %d: Expected %v, Found %v", i, test.ans, c)
+		}
+	}
+
+	// Test the panic states.
+	if !Panics(func() { Covariance(make([]float64, 2), make([]float64, 3), nil) }) {
+		t.Errorf("Covariance did not panic with x, y length mismatch")
+	}
+	if !Panics(func() { Covariance(make([]float64, 3), make([]float64, 3), make([]float64, 2)) }) {
+		t.Errorf("Covariance did not panic with x, weights length mismatch")
+	}
+}
+
+func TestCrossEntropy(t *testing.T) {
+	for i, test := range []struct {
+		p   []float64
+		q   []float64
+		ans float64
+	}{
+		{
+			p:   []float64{0.75, 0.1, 0.05},
+			q:   []float64{0.5, 0.25, 0.25},
+			ans: 0.7278045395879426,
+		},
+		{
+			p:   []float64{0.75, 0.1, 0.05, 0, 0, 0},
+			q:   []float64{0.5, 0.25, 0.25, 0, 0, 0},
+			ans: 0.7278045395879426,
+		},
+		{
+			p:   []float64{0.75, 0.1, 0.05, 0, 0, 0.1},
+			q:   []float64{0.5, 0.25, 0.25, 0, 0, 0},
+			ans: math.Inf(1),
+		},
+		{
+			p:   nil,
+			q:   nil,
+			ans: 0,
+		},
+	} {
+		c := CrossEntropy(test.p, test.q)
+		if math.Abs(c-test.ans) > 1e-14 {
+			t.Errorf("Cross entropy mismatch case %d: Expected %v, Found %v", i, test.ans, c)
+		}
+	}
+	if !Panics(func() { CrossEntropy(make([]float64, 3), make([]float64, 2)) }) {
+		t.Errorf("CrossEntropy did not panic with p, q length mismatch")
+	}
+}
+
+func ExampleEntropy() {
+	p := []float64{0.05, 0.1, 0.9, 0.05}
+	entP := Entropy(p)
+
+	q := []float64{0.2, 0.4, 0.25, 0.15}
+	entQ := Entropy(q)
+
+	r := []float64{0.2, 0, 0, 0.5, 0, 0.2, 0.1, 0, 0, 0}
+	entR := Entropy(r)
+
+	s := []float64{0, 0, 1, 0}
+	entS := Entropy(s)
+
+	fmt.Println("Entropy is a measure of the amount of uncertainty in a distribution")
+	fmt.Printf("The third bin of p is very likely to occur. Its entropy is %.4f\n", entP)
+	fmt.Printf("The distribution of q is more spread out. Its entropy is %.4f\n", entQ)
+	fmt.Println("Adding buckets with zero probability does not change the entropy.")
+	fmt.Printf("The entropy of r is: %.4f\n", entR)
+	fmt.Printf("A distribution with no uncertainty has entropy %.4f\n", entS)
+	// Output:
+	// Entropy is a measure of the amount of uncertainty in a distribution
+	// The third bin of p is very likely to occur. Its entropy is 0.6247
+	// The distribution of q is more spread out. Its entropy is 1.3195
+	// Adding buckets with zero probability does not change the entropy.
+ // The entropy of r is: 1.2206 + // A distribution with no uncertainty has entropy 0.0000 +} + +func ExampleExKurtosis() { + fmt.Println(`Kurtosis is a measure of the 'peakedness' of a distribution, and the +excess kurtosis is the kurtosis above or below that of the standard normal +distribution`) + x := []float64{5, 4, -3, -2} + kurt := ExKurtosis(x, nil) + fmt.Printf("ExKurtosis = %.5f\n", kurt) + weights := []float64{1, 2, 3, 5} + wKurt := ExKurtosis(x, weights) + fmt.Printf("Weighted ExKurtosis is %.4f", wKurt) + // Output: + // Kurtosis is a measure of the 'peakedness' of a distribution, and the + // excess kurtosis is the kurtosis above or below that of the standard normal + // distribution + // ExKurtosis = -5.41200 + // Weighted ExKurtosis is -0.6779 +} + +func TestExKurtosis(t *testing.T) { + // the example does a good job, this just has to cover the panic + if !Panics(func() { ExKurtosis(make([]float64, 3), make([]float64, 2)) }) { + t.Errorf("ExKurtosis did not panic with x, weights length mismatch") + } +} + +func ExampleGeometricMean() { + x := []float64{8, 2, 9, 15, 4} + weights := []float64{2, 2, 6, 7, 1} + mean := Mean(x, weights) + gmean := GeometricMean(x, weights) + + logx := make([]float64, len(x)) + for i, v := range x { + logx[i] = math.Log(v) + } + expMeanLog := math.Exp(Mean(logx, weights)) + fmt.Printf("The arithmetic mean is %.4f, but the geometric mean is %.4f.\n", mean, gmean) + fmt.Printf("The exponential of the mean of the logs is %.4f\n", expMeanLog) + // Output: + // The arithmetic mean is 10.1667, but the geometric mean is 8.7637. + // The exponential of the mean of the logs is 8.7637 +} + +func TestGeometricMean(t *testing.T) { + for i, test := range []struct { + x []float64 + wts []float64 + ans float64 + }{ + { + x: []float64{2, 8}, + ans: 4, + }, + { + x: []float64{3, 81}, + wts: []float64{2, 1}, + ans: 9, + }, + } { + c := GeometricMean(test.x, test.wts) + if math.Abs(c-test.ans) > 1e-14 { + t.Errorf("Geometric mean mismatch case %d: Expected %v, Found %v", i, test.ans, c) + } + } + if !Panics(func() { GeometricMean(make([]float64, 3), make([]float64, 2)) }) { + t.Errorf("GeometricMean did not panic with x, wts length mismatch") + } +} + +func ExampleHarmonicMean() { + x := []float64{8, 2, 9, 15, 4} + weights := []float64{2, 2, 6, 7, 1} + mean := Mean(x, weights) + hmean := HarmonicMean(x, weights) + + fmt.Printf("The arithmetic mean is %.5f, but the harmonic mean is %.4f.\n", mean, hmean) + // Output: + // The arithmetic mean is 10.16667, but the harmonic mean is 6.8354. 
+} + +func TestHarmonicMean(t *testing.T) { + for i, test := range []struct { + x []float64 + wts []float64 + ans float64 + }{ + { + x: []float64{.5, .125}, + ans: .2, + }, + { + x: []float64{.5, .125}, + wts: []float64{2, 1}, + ans: .25, + }, + } { + c := HarmonicMean(test.x, test.wts) + if math.Abs(c-test.ans) > 1e-14 { + t.Errorf("Harmonic mean mismatch case %d: Expected %v, Found %v", i, test.ans, c) + } + } + if !Panics(func() { HarmonicMean(make([]float64, 3), make([]float64, 2)) }) { + t.Errorf("HarmonicMean did not panic with x, wts length mismatch") + } +} + +func TestHistogram(t *testing.T) { + for i, test := range []struct { + x []float64 + weights []float64 + dividers []float64 + ans []float64 + }{ + { + x: []float64{1, 3, 5, 6, 7, 8}, + dividers: []float64{0, 2, 4, 6, 7, 9}, + ans: []float64{1, 1, 1, 1, 2}, + }, + { + x: []float64{1, 3, 5, 6, 7, 8}, + dividers: []float64{1, 2, 4, 6, 7, 9}, + weights: []float64{1, 2, 1, 1, 1, 2}, + ans: []float64{1, 2, 1, 1, 3}, + }, + { + x: []float64{1, 8}, + dividers: []float64{0, 2, 4, 6, 7, 9}, + weights: []float64{1, 2}, + ans: []float64{1, 0, 0, 0, 2}, + }, + { + x: []float64{1, 8}, + dividers: []float64{0, 2, 4, 6, 7, 9}, + ans: []float64{1, 0, 0, 0, 1}, + }, + { + x: []float64{}, + dividers: []float64{1, 3}, + ans: []float64{0}, + }, + } { + hist := Histogram(nil, test.dividers, test.x, test.weights) + if !floats.Equal(hist, test.ans) { + t.Errorf("Hist mismatch case %d. Expected %v, Found %v", i, test.ans, hist) + } + // Test with non-zero values + Histogram(hist, test.dividers, test.x, test.weights) + if !floats.Equal(hist, test.ans) { + t.Errorf("Hist mismatch case %d. Expected %v, Found %v", i, test.ans, hist) + } + } + // panic cases + for _, test := range []struct { + name string + x []float64 + weights []float64 + dividers []float64 + count []float64 + }{ + { + name: "len(x) != len(weights)", + x: []float64{1, 3, 5, 6, 7, 8}, + weights: []float64{1, 1, 1, 1}, + }, + { + name: "len(count) != len(dividers) - 1", + x: []float64{1, 3, 5, 6, 7, 8}, + dividers: []float64{1, 4, 9}, + count: make([]float64, 6), + }, + { + name: "dividers not sorted", + x: []float64{1, 3, 5, 6, 7, 8}, + dividers: []float64{0, -1, 0}, + }, + { + name: "x not sorted", + x: []float64{1, 5, 2, 9, 7, 8}, + dividers: []float64{1, 4, 9}, + }, + { + name: "fewer than 2 dividers", + x: []float64{1, 2, 3}, + dividers: []float64{5}, + }, + { + name: "x too large", + x: []float64{1, 2, 3}, + dividers: []float64{1, 3}, + }, + { + name: "x too small", + x: []float64{1, 2, 3}, + dividers: []float64{2, 3}, + }, + } { + if !Panics(func() { Histogram(test.count, test.dividers, test.x, test.weights) }) { + t.Errorf("Histogram did not panic when %s", test.name) + } + } +} + +func ExampleHistogram() { + x := make([]float64, 101) + for i := range x { + x[i] = 1.1 * float64(i) // x data ranges from 0 to 110 + } + dividers := []float64{0, 7, 20, 100, 1000} + fmt.Println(`Histogram counts the amount of data in the bins specified by +the dividers. In this data set, there are 7 data points less than 7 (between dividers[0] +and dividers[1]), 12 data points between 7 and 20 (dividers[1] and dividers[2]), +and 0 data points above 1000. 
Since dividers has length 5, there will be 4 bins.`) + hist := Histogram(nil, dividers, x, nil) + fmt.Printf("Hist = %v\n", hist) + + fmt.Println() + fmt.Println("For ease, the floats Span function can be used to set the dividers") + nBins := 10 + dividers = make([]float64, nBins+1) + min := floats.Min(x) + max := floats.Max(x) + // Increase the maximum divider so that the maximum value of x is contained + // within the last bucket. + max += 1 + floats.Span(dividers, min, max) + // Span includes the min and the max. Trim the dividers to create 10 buckets + hist = Histogram(nil, dividers, x, nil) + fmt.Printf("Hist = %v\n", hist) + fmt.Println() + fmt.Println(`Histogram also works with weighted data, and allows reusing of +the count field in order to avoid extra garbage`) + weights := make([]float64, len(x)) + for i := range weights { + weights[i] = float64(i + 1) + } + Histogram(hist, dividers, x, weights) + fmt.Printf("Weighted Hist = %v\n", hist) + + // Output: + // Histogram counts the amount of data in the bins specified by + // the dividers. In this data set, there are 7 data points less than 7 (between dividers[0] + // and dividers[1]), 12 data points between 7 and 20 (dividers[1] and dividers[2]), + // and 0 data points above 1000. Since dividers has length 5, there will be 4 bins. + // Hist = [7 12 72 10] + // + // For ease, the floats Span function can be used to set the dividers + // Hist = [11 10 10 10 10 10 10 10 10 10] + // + // Histogram also works with weighted data, and allows reusing of + // the count field in order to avoid extra garbage + // Weighted Hist = [66 165 265 365 465 565 665 765 865 965] +} + +func TestJensenShannon(t *testing.T) { + for i, test := range []struct { + p []float64 + q []float64 + }{ + { + p: []float64{0.5, 0.1, 0.3, 0.1}, + q: []float64{0.1, 0.4, 0.25, 0.25}, + }, + { + p: []float64{0.4, 0.6, 0.0}, + q: []float64{0.2, 0.2, 0.6}, + }, + { + p: []float64{0.1, 0.1, 0.0, 0.8}, + q: []float64{0.6, 0.3, 0.0, 0.1}, + }, + { + p: []float64{0.5, 0.1, 0.3, 0.1}, + q: []float64{0.5, 0, 0.25, 0.25}, + }, + { + p: []float64{0.5, 0.1, 0, 0.4}, + q: []float64{0.1, 0.4, 0.25, 0.25}, + }, + } { + + m := make([]float64, len(test.p)) + p := test.p + q := test.q + floats.Add(m, p) + floats.Add(m, q) + floats.Scale(0.5, m) + + js1 := 0.5*KullbackLeibler(p, m) + 0.5*KullbackLeibler(q, m) + js2 := JensenShannon(p, q) + + if math.IsNaN(js2) { + t.Errorf("In case %v, JS distance is NaN", i) + } + + if math.Abs(js1-js2) > 1e-14 { + t.Errorf("JS mismatch case %v. 
Expected %v, found %v.", i, js1, js2) + } + } + if !Panics(func() { JensenShannon(make([]float64, 3), make([]float64, 2)) }) { + t.Errorf("JensenShannon did not panic with p, q length mismatch") + } +} + +func TestKolmogorovSmirnov(t *testing.T) { + for i, test := range []struct { + x []float64 + xWeights []float64 + y []float64 + yWeights []float64 + dist float64 + }{ + + { + dist: 0, + }, + { + x: []float64{1}, + dist: 1, + }, + { + y: []float64{1}, + dist: 1, + }, + { + x: []float64{1}, + xWeights: []float64{8}, + dist: 1, + }, + { + y: []float64{1}, + yWeights: []float64{8}, + dist: 1, + }, + { + x: []float64{1}, + xWeights: []float64{8}, + y: []float64{1}, + yWeights: []float64{8}, + dist: 0, + }, + { + x: []float64{1, 1, 1}, + xWeights: []float64{2, 3, 7}, + y: []float64{1}, + yWeights: []float64{8}, + dist: 0, + }, + { + x: []float64{1, 1, 1, 1, 1}, + y: []float64{1, 1, 1}, + yWeights: []float64{2, 5, 2}, + dist: 0, + }, + + { + x: []float64{1, 2, 3}, + y: []float64{1, 2, 3}, + dist: 0, + }, + { + x: []float64{1, 2, 3}, + y: []float64{1, 2, 3}, + yWeights: []float64{1, 1, 1}, + dist: 0, + }, + + { + x: []float64{1, 2, 3}, + xWeights: []float64{1, 1, 1}, + y: []float64{1, 2, 3}, + yWeights: []float64{1, 1, 1}, + dist: 0, + }, + { + x: []float64{1, 2}, + xWeights: []float64{2, 5}, + y: []float64{1, 1, 2, 2, 2, 2, 2}, + dist: 0, + }, + { + x: []float64{1, 1, 2, 2, 2, 2, 2}, + y: []float64{1, 2}, + yWeights: []float64{2, 5}, + dist: 0, + }, + { + x: []float64{1, 1, 2, 2, 2}, + xWeights: []float64{0.5, 1.5, 1, 2, 2}, + y: []float64{1, 2}, + yWeights: []float64{2, 5}, + dist: 0, + }, + { + x: []float64{1, 2, 3, 4}, + y: []float64{5, 6}, + dist: 1, + }, + { + x: []float64{5, 6}, + y: []float64{1, 2, 3, 4}, + dist: 1, + }, + { + x: []float64{5, 6}, + xWeights: []float64{8, 7}, + y: []float64{1, 2, 3, 4}, + dist: 1, + }, + { + x: []float64{5, 6}, + xWeights: []float64{8, 7}, + y: []float64{1, 2, 3, 4}, + yWeights: []float64{9, 2, 1, 6}, + dist: 1, + }, + { + x: []float64{-4, 5, 6}, + xWeights: []float64{0, 8, 7}, + y: []float64{1, 2, 3, 4}, + yWeights: []float64{9, 2, 1, 6}, + dist: 1, + }, + { + x: []float64{-4, -2, -2, 5, 6}, + xWeights: []float64{0, 0, 0, 8, 7}, + y: []float64{1, 2, 3, 4}, + yWeights: []float64{9, 2, 1, 6}, + dist: 1, + }, + { + x: []float64{1, 2, 3}, + y: []float64{1, 1, 3}, + dist: 1.0 / 3.0, + }, + { + x: []float64{1, 2, 3}, + y: []float64{1, 3}, + yWeights: []float64{2, 1}, + dist: 1.0 / 3.0, + }, + { + x: []float64{1, 2, 3}, + xWeights: []float64{2, 2, 2}, + y: []float64{1, 3}, + yWeights: []float64{2, 1}, + dist: 1.0 / 3.0, + }, + { + x: []float64{2, 3, 4}, + y: []float64{1, 5}, + dist: 1.0 / 2.0, + }, + { + x: []float64{1, 2, math.NaN()}, + y: []float64{1, 1, 3}, + dist: math.NaN(), + }, + { + x: []float64{1, 2, 3}, + y: []float64{1, 1, math.NaN()}, + dist: math.NaN(), + }, + } { + dist := KolmogorovSmirnov(test.x, test.xWeights, test.y, test.yWeights) + if math.Abs(dist-test.dist) > 1e-14 && !(math.IsNaN(test.dist) && math.IsNaN(dist)) { + t.Errorf("Distance mismatch case %v: Expected: %v, Found: %v", i, test.dist, dist) + } + } + // panic cases + for _, test := range []struct { + name string + x []float64 + xWeights []float64 + y []float64 + yWeights []float64 + }{ + { + name: "len(x) != len(xWeights)", + x: []float64{1, 3, 5, 6, 7, 8}, + xWeights: []float64{1, 1, 1, 1}, + }, + { + name: "len(y) != len(yWeights)", + x: []float64{1, 3, 5, 6, 7, 8}, + y: []float64{1, 3, 5, 6, 7, 8}, + yWeights: []float64{1, 1, 1, 1}, + }, + { + name: "x not sorted", + x: 
[]float64{10, 3, 5, 6, 7, 8}, + y: []float64{1, 3, 5, 6, 7, 8}, + }, + { + name: "y not sorted", + x: []float64{1, 3, 5, 6, 7, 8}, + y: []float64{10, 3, 5, 6, 7, 8}, + }, + } { + if !Panics(func() { KolmogorovSmirnov(test.x, test.xWeights, test.y, test.yWeights) }) { + t.Errorf("KolmogorovSmirnov did not panic when %s", test.name) + } + } +} + +func ExampleKullbackLeibler() { + + p := []float64{0.05, 0.1, 0.9, 0.05} + q := []float64{0.2, 0.4, 0.25, 0.15} + s := []float64{0, 0, 1, 0} + + klPQ := KullbackLeibler(p, q) + klPS := KullbackLeibler(p, s) + klPP := KullbackLeibler(p, p) + + fmt.Println("Kullback-Leibler is one measure of the difference between two distributions") + fmt.Printf("The K-L distance between p and q is %.4f\n", klPQ) + fmt.Println("It is impossible for s and p to be the same distribution, because") + fmt.Println("the first bucket has zero probability in s and non-zero in p. Thus,") + fmt.Printf("the K-L distance between them is %.4f\n", klPS) + fmt.Printf("The K-L distance between identical distributions is %.4f\n", klPP) + + // Kullback-Leibler is one measure of the difference between two distributions + // The K-L distance between p and q is 0.8900 + // It is impossible for s and p to be the same distribution, because + // the first bucket has zero probability in s and non-zero in p. Thus, + // the K-L distance between them is +Inf + // The K-L distance between identical distributions is 0.0000 +} + +func TestKullbackLeibler(t *testing.T) { + if !Panics(func() { KullbackLeibler(make([]float64, 3), make([]float64, 2)) }) { + t.Errorf("KullbackLeibler did not panic with p, q length mismatch") + } +} + +var linearRegressionTests = []struct { + name string + + x, y []float64 + weights []float64 + origin bool + + alpha float64 + beta float64 + r float64 + + tol float64 +}{ + { + name: "faithful", + + x: faithful.waiting, + y: faithful.eruptions, + + // Values calculated by R using lm(eruptions ~ waiting, data=faithful). + alpha: -1.87402, + beta: 0.07563, + r: 0.8114608, + + tol: 1e-5, + }, + { + name: "faithful through origin", + + x: faithful.waiting, + y: faithful.eruptions, + origin: true, + + // Values calculated by R using lm(eruptions ~ waiting - 1, data=faithful). + alpha: 0, + beta: 0.05013, + r: 0.9726036, + + tol: 1e-5, + }, + { + name: "faithful explicit weights", + + x: faithful.waiting, + y: faithful.eruptions, + weights: func() []float64 { + w := make([]float64, len(faithful.eruptions)) + for i := range w { + w[i] = 1 + } + return w + }(), + + // Values calculated by R using lm(eruptions ~ waiting, data=faithful). + alpha: -1.87402, + beta: 0.07563, + r: 0.8114608, + + tol: 1e-5, + }, + { + name: "faithful non-uniform weights", + + x: faithful.waiting, + y: faithful.eruptions, + weights: faithful.waiting, // Just an arbitrary set of non-uniform weights. + + // Values calculated by R using lm(eruptions ~ waiting, data=faithful, weights=faithful$waiting). 
+ alpha: -1.79268, + beta: 0.07452, + r: 0.7840372, + + tol: 1e-5, + }, +} + +func TestLinearRegression(t *testing.T) { + for _, test := range linearRegressionTests { + alpha, beta := LinearRegression(test.x, test.y, test.weights, test.origin) + var r float64 + if test.origin { + r = RNoughtSquared(test.x, test.y, test.weights, beta) + } else { + r = RSquared(test.x, test.y, test.weights, alpha, beta) + ests := make([]float64, len(test.y)) + for i, x := range test.x { + ests[i] = alpha + beta*x + } + rvals := RSquaredFrom(ests, test.y, test.weights) + if r != rvals { + t.Errorf("%s: RSquared and RSquaredFrom mismatch: %v != %v", test.name, r, rvals) + } + } + if !floats.EqualWithinAbsOrRel(alpha, test.alpha, test.tol, test.tol) { + t.Errorf("%s: unexpected alpha estimate: want:%v got:%v", test.name, test.alpha, alpha) + } + if !floats.EqualWithinAbsOrRel(beta, test.beta, test.tol, test.tol) { + t.Errorf("%s: unexpected beta estimate: want:%v got:%v", test.name, test.beta, beta) + } + if !floats.EqualWithinAbsOrRel(r, test.r, test.tol, test.tol) { + t.Errorf("%s: unexpected r estimate: want:%v got:%v", test.name, test.r, r) + } + } +} + +func TestChiSquare(t *testing.T) { + for i, test := range []struct { + p []float64 + q []float64 + res float64 + }{ + { + p: []float64{16, 18, 16, 14, 12, 12}, + q: []float64{16, 16, 16, 16, 16, 8}, + res: 3.5, + }, + { + p: []float64{16, 18, 16, 14, 12, 12}, + q: []float64{8, 20, 20, 16, 12, 12}, + res: 9.25, + }, + { + p: []float64{40, 60, 30, 45}, + q: []float64{50, 50, 50, 50}, + res: 12.5, + }, + { + p: []float64{40, 60, 30, 45, 0, 0}, + q: []float64{50, 50, 50, 50, 0, 0}, + res: 12.5, + }, + } { + resultpq := ChiSquare(test.p, test.q) + + if math.Abs(resultpq-test.res) > 1e-10 { + t.Errorf("ChiSquare distance mismatch in case %d. Expected %v, Found %v", i, test.res, resultpq) + } + } + if !Panics(func() { ChiSquare(make([]float64, 2), make([]float64, 3)) }) { + t.Errorf("ChiSquare did not panic with length mismatch") + } +} + +// Panics returns true if the called function panics during evaluation. +func Panics(fun func()) (b bool) { + defer func() { + err := recover() + if err != nil { + b = true + } + }() + fun() + return +} + +func TestBhattacharyya(t *testing.T) { + for i, test := range []struct { + p []float64 + q []float64 + res float64 + }{ + { + p: []float64{0.5, 0.1, 0.3, 0.1}, + q: []float64{0.1, 0.4, 0.25, 0.25}, + res: 0.15597338718671386, + }, + { + p: []float64{0.4, 0.6, 0.0}, + q: []float64{0.2, 0.2, 0.6}, + res: 0.46322207765351153, + }, + { + p: []float64{0.1, 0.1, 0.0, 0.8}, + q: []float64{0.6, 0.3, 0.0, 0.1}, + res: 0.3552520032137785, + }, + } { + resultpq := Bhattacharyya(test.p, test.q) + resultqp := Bhattacharyya(test.q, test.p) + + if math.Abs(resultpq-test.res) > 1e-10 { + t.Errorf("Bhattacharyya distance mismatch in case %d. 
Expected %v, Found %v", i, test.res, resultpq)
+		}
+		if math.Abs(resultpq-resultqp) > 1e-10 {
+			t.Errorf("Bhattacharyya distance is asymmetric in case %d.", i)
+		}
+	}
+	// Bhattacharyya should panic if the inputs have different length.
+	if !Panics(func() { Bhattacharyya(make([]float64, 2), make([]float64, 3)) }) {
+		t.Errorf("Bhattacharyya did not panic with length mismatch")
+	}
+}
+
+func TestHellinger(t *testing.T) {
+	for i, test := range []struct {
+		p   []float64
+		q   []float64
+		res float64
+	}{
+		{
+			p:   []float64{0.5, 0.1, 0.3, 0.1},
+			q:   []float64{0.1, 0.4, 0.25, 0.25},
+			res: 0.3800237367441919,
+		},
+		{
+			p:   []float64{0.4, 0.6, 0.0},
+			q:   []float64{0.2, 0.2, 0.6},
+			res: 0.6088900771170487,
+		},
+		{
+			p:   []float64{0.1, 0.1, 0.0, 0.8},
+			q:   []float64{0.6, 0.3, 0.0, 0.1},
+			res: 0.5468118803484205,
+		},
+	} {
+		resultpq := Hellinger(test.p, test.q)
+		resultqp := Hellinger(test.q, test.p)
+
+		if math.Abs(resultpq-test.res) > 1e-10 {
+			t.Errorf("Hellinger distance mismatch in case %d. Expected %v, Found %v", i, test.res, resultpq)
+		}
+		if math.Abs(resultpq-resultqp) > 1e-10 {
+			t.Errorf("Hellinger distance is asymmetric in case %d.", i)
+		}
+	}
+	if !Panics(func() { Hellinger(make([]float64, 2), make([]float64, 3)) }) {
+		t.Errorf("Hellinger did not panic with length mismatch")
+	}
+}
+
+func ExampleMean() {
+	x := []float64{8.2, -6, 5, 7}
+	mean := Mean(x, nil)
+	fmt.Printf("The mean of the samples is %.4f\n", mean)
+	w := []float64{2, 6, 3, 5}
+	weightedMean := Mean(x, w)
+	fmt.Printf("The weighted mean of the samples is %.4f\n", weightedMean)
+	x2 := []float64{8.2, 8.2, -6, -6, -6, -6, -6, -6, 5, 5, 5, 7, 7, 7, 7, 7}
+	mean2 := Mean(x2, nil)
+	fmt.Printf("The mean of x2 is %.4f\n", mean2)
+	fmt.Println("The weights act as if there were more samples of that number")
+	// Output:
+	// The mean of the samples is 3.5500
+	// The weighted mean of the samples is 1.9000
+	// The mean of x2 is 1.9000
+	// The weights act as if there were more samples of that number
+}
+
+func TestMean(t *testing.T) {
+	if !Panics(func() { Mean(make([]float64, 3), make([]float64, 2)) }) {
+		t.Errorf("Mean did not panic with x, weights length mismatch")
+	}
+}
+
+func TestMode(t *testing.T) {
+	for i, test := range []struct {
+		x       []float64
+		weights []float64
+		ans     float64
+		count   float64
+	}{
+		{},
+		{
+			x:     []float64{1, 6, 1, 9, -2},
+			ans:   1,
+			count: 2,
+		},
+		{
+			x:       []float64{1, 6, 1, 9, -2},
+			weights: []float64{1, 7, 3, 5, 0},
+			ans:     6,
+			count:   7,
+		},
+	} {
+		m, count := Mode(test.x, test.weights)
+		if test.ans != m {
+			t.Errorf("Mode mismatch case %d. Expected %v, found %v", i, test.ans, m)
+		}
+		if test.count != count {
+			t.Errorf("Mode count mismatch case %d. Expected %v, found %v", i, test.count, count)
+		}
+	}
+	if !Panics(func() { Mode(make([]float64, 3), make([]float64, 2)) }) {
+		t.Errorf("Mode did not panic with x, weights length mismatch")
+	}
+}
+
+func TestMoment(t *testing.T) {
+	for i, test := range []struct {
+		x       []float64
+		weights []float64
+		moment  float64
+		ans     float64
+	}{
+		{
+			x:      []float64{6, 2, 4, 8, 10},
+			moment: 5,
+			ans:    0,
+		},
+		{
+			x:       []float64{6, 2, 4, 8, 10},
+			weights: []float64{1, 2, 2, 2, 1},
+			moment:  5,
+			ans:     121.875,
+		},
+	} {
+		m := Moment(test.moment, test.x, test.weights)
+		if math.Abs(test.ans-m) > 1e-14 {
+			t.Errorf("Moment mismatch case %d. 
Expected %v, found %v", i, test.ans, m) + } + } + if !Panics(func() { Moment(1, make([]float64, 3), make([]float64, 2)) }) { + t.Errorf("Moment did not panic with x, weights length mismatch") + } +} + +func TestMomentAbout(t *testing.T) { + for i, test := range []struct { + x []float64 + weights []float64 + moment float64 + mean float64 + ans float64 + }{ + { + x: []float64{6, 2, 4, 8, 9}, + mean: 3, + moment: 5, + ans: 2.2288e3, + }, + { + x: []float64{6, 2, 4, 8, 9}, + weights: []float64{1, 2, 2, 2, 1}, + mean: 3, + moment: 5, + ans: 1.783625e3, + }, + } { + m := MomentAbout(test.moment, test.x, test.mean, test.weights) + if math.Abs(test.ans-m) > 1e-14 { + t.Errorf("MomentAbout mismatch case %d. Expected %v, found %v", i, test.ans, m) + } + } + if !Panics(func() { MomentAbout(1, make([]float64, 3), 0, make([]float64, 2)) }) { + t.Errorf("MomentAbout did not panic with x, weights length mismatch") + } +} + +func TestCDF(t *testing.T) { + cumulantKinds := []CumulantKind{Empirical} + for i, test := range []struct { + q []float64 + x []float64 + weights []float64 + ans [][]float64 + }{ + {}, + { + q: []float64{0, 0.9, 1, 1.1, 2.9, 3, 3.1, 4.9, 5, 5.1}, + x: []float64{1, 2, 3, 4, 5}, + ans: [][]float64{{0, 0, 0.2, 0.2, 0.4, 0.6, 0.6, 0.8, 1, 1}}, + }, + { + q: []float64{0, 0.9, 1, 1.1, 2.9, 3, 3.1, 4.9, 5, 5.1}, + x: []float64{1, 2, 3, 4, 5}, + weights: []float64{1, 1, 1, 1, 1}, + ans: [][]float64{{0, 0, 0.2, 0.2, 0.4, 0.6, 0.6, 0.8, 1, 1}}, + }, + { + q: []float64{0, 0.9, 1}, + x: []float64{math.NaN()}, + ans: [][]float64{{math.NaN(), math.NaN(), math.NaN()}}, + }, + } { + copyX := make([]float64, len(test.x)) + copy(copyX, test.x) + var copyW []float64 + if test.weights != nil { + copyW = make([]float64, len(test.weights)) + copy(copyW, test.weights) + } + for j, q := range test.q { + for k, kind := range cumulantKinds { + v := CDF(q, kind, test.x, test.weights) + if !floats.Equal(copyX, test.x) && !math.IsNaN(v) { + t.Errorf("x changed for case %d kind %d percentile %v", i, k, q) + } + if !floats.Equal(copyW, test.weights) { + t.Errorf("x changed for case %d kind %d percentile %v", i, k, q) + } + if v != test.ans[k][j] && !(math.IsNaN(v) && math.IsNaN(test.ans[k][j])) { + t.Errorf("mismatch case %d kind %d percentile %v. 
Expected: %v, found: %v", i, k, q, test.ans[k][j], v) + } + } + } + } + + // these test cases should all result in a panic + for i, test := range []struct { + name string + q float64 + kind CumulantKind + x []float64 + weights []float64 + }{ + { + name: "len(x) != len(weights)", + q: 1.5, + kind: Empirical, + x: []float64{1, 2, 3, 4, 5}, + weights: []float64{1, 2, 3}, + }, + { + name: "unsorted x", + q: 1.5, + kind: Empirical, + x: []float64{3, 2, 1}, + }, + { + name: "unknown CumulantKind", + q: 1.5, + kind: CumulantKind(1000), // bogus + x: []float64{1, 2, 3}, + }, + } { + if !Panics(func() { CDF(test.q, test.kind, test.x, test.weights) }) { + t.Errorf("did not panic as expected with %s for case %d kind %d percentile %v x %v weights %v", test.name, i, test.kind, test.q, test.x, test.weights) + } + } + +} + +func TestQuantile(t *testing.T) { + cumulantKinds := []CumulantKind{Empirical} + for i, test := range []struct { + p []float64 + x []float64 + w []float64 + ans [][]float64 + }{ + { + p: []float64{0, 0.05, 0.1, 0.15, 0.45, 0.5, 0.55, 0.85, 0.9, 0.95, 1}, + x: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + w: nil, + ans: [][]float64{{1, 1, 1, 2, 5, 5, 6, 9, 9, 10, 10}}, + }, + { + p: []float64{0, 0.05, 0.1, 0.15, 0.45, 0.5, 0.55, 0.85, 0.9, 0.95, 1}, + x: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + w: []float64{3, 3, 3, 3, 3, 3, 3, 3, 3, 3}, + ans: [][]float64{{1, 1, 1, 2, 5, 5, 6, 9, 9, 10, 10}}, + }, + { + p: []float64{0.5}, + x: []float64{1, 2, 3, 4, 5, 6, 7, 8, math.NaN(), 10}, + ans: [][]float64{{math.NaN()}}, + }, + } { + copyX := make([]float64, len(test.x)) + copy(copyX, test.x) + var copyW []float64 + if test.w != nil { + copyW = make([]float64, len(test.w)) + copy(copyW, test.w) + } + for j, p := range test.p { + for k, kind := range cumulantKinds { + v := Quantile(p, kind, test.x, test.w) + if !floats.Same(copyX, test.x) { + t.Errorf("x changed for case %d kind %d percentile %v", i, k, p) + } + if !floats.Same(copyW, test.w) { + t.Errorf("x changed for case %d kind %d percentile %v", i, k, p) + } + if v != test.ans[k][j] && !(math.IsNaN(v) && math.IsNaN(test.ans[k][j])) { + t.Errorf("mismatch case %d kind %d percentile %v. 
Expected: %v, found: %v", i, k, p, test.ans[k][j], v) + } + } + } + } + // panic cases + for _, test := range []struct { + name string + p float64 + c CumulantKind + x []float64 + w []float64 + }{ + { + name: "p < 0", + c: Empirical, + p: -1, + }, + { + name: "p > 1", + c: Empirical, + p: 2, + }, + { + name: "p is NaN", + c: Empirical, + p: math.NaN(), + }, + { + name: "len(x) != len(weights)", + c: Empirical, + p: .5, + x: make([]float64, 4), + w: make([]float64, 2), + }, + { + name: "x not sorted", + c: Empirical, + p: .5, + x: []float64{3, 2, 1}, + }, + { + name: "CumulantKind is unknown", + c: CumulantKind(1000), + p: .5, + x: []float64{1, 2, 3}, + }, + } { + if !Panics(func() { Quantile(test.p, test.c, test.x, test.w) }) { + t.Errorf("Quantile did not panic when %s", test.name) + } + } +} + +func ExampleStdDev() { + x := []float64{8, 2, -9, 15, 4} + stdev := StdDev(x, nil) + fmt.Printf("The standard deviation of the samples is %.4f\n", stdev) + + weights := []float64{2, 2, 6, 7, 1} + weightedStdev := StdDev(x, weights) + fmt.Printf("The weighted standard deviation of the samples is %.4f\n", weightedStdev) + // Output: + // The standard deviation of the samples is 8.8034 + // The weighted standard deviation of the samples is 10.5733 +} + +func ExampleStdErr() { + x := []float64{8, 2, -9, 15, 4} + weights := []float64{2, 2, 6, 7, 1} + mean := Mean(x, weights) + stdev := StdDev(x, weights) + nSamples := floats.Sum(weights) + stdErr := StdErr(stdev, nSamples) + fmt.Printf("The standard deviation is %.4f and there are %g samples, so the mean\nis likely %.4f ± %.4f.", stdev, nSamples, mean, stdErr) + // Output: + // The standard deviation is 10.5733 and there are 18 samples, so the mean + // is likely 4.1667 ± 2.4921. +} + +func TestSkew(t *testing.T) { + for i, test := range []struct { + x []float64 + weights []float64 + ans float64 + }{ + { + x: []float64{8, 3, 7, 8, 4}, + weights: nil, + ans: -0.581456499151665, + }, + { + x: []float64{8, 3, 7, 8, 4}, + weights: []float64{1, 1, 1, 1, 1}, + ans: -0.581456499151665, + }, + { + x: []float64{8, 3, 7, 8, 4}, + weights: []float64{2, 1, 2, 1, 1}, + ans: -1.12066646837198, + }, + } { + skew := Skew(test.x, test.weights) + if math.Abs(skew-test.ans) > 1e-14 { + t.Errorf("Skew mismatch case %d. Expected %v, Found %v", i, test.ans, skew) + } + } + if !Panics(func() { Skew(make([]float64, 3), make([]float64, 2)) }) { + t.Errorf("Skew did not panic with x, weights length mismatch") + } +} + +func TestSortWeighted(t *testing.T) { + for i, test := range []struct { + x []float64 + w []float64 + ansx []float64 + answ []float64 + }{ + { + x: []float64{8, 3, 7, 8, 4}, + ansx: []float64{3, 4, 7, 8, 8}, + }, + { + x: []float64{8, 3, 7, 8, 4}, + w: []float64{.5, 1, 1, .5, 1}, + ansx: []float64{3, 4, 7, 8, 8}, + answ: []float64{1, 1, 1, .5, .5}, + }, + } { + SortWeighted(test.x, test.w) + if !floats.Same(test.x, test.ansx) { + t.Errorf("SortWeighted mismatch case %d. Expected x %v, Found x %v", i, test.ansx, test.x) + } + if !(test.w == nil) && !floats.Same(test.w, test.answ) { + t.Errorf("SortWeighted mismatch case %d. 
+
+func TestSortWeightedLabeled(t *testing.T) {
+	for i, test := range []struct {
+		x    []float64
+		l    []bool
+		w    []float64
+		ansx []float64
+		ansl []bool
+		answ []float64
+	}{
+		{
+			x:    []float64{8, 3, 7, 8, 4},
+			ansx: []float64{3, 4, 7, 8, 8},
+		},
+		{
+			x:    []float64{8, 3, 7, 8, 4},
+			w:    []float64{.5, 1, 1, .5, 1},
+			ansx: []float64{3, 4, 7, 8, 8},
+			answ: []float64{1, 1, 1, .5, .5},
+		},
+		{
+			x:    []float64{8, 3, 7, 8, 4},
+			l:    []bool{false, false, true, false, true},
+			ansx: []float64{3, 4, 7, 8, 8},
+			ansl: []bool{false, true, true, false, false},
+		},
+		{
+			x:    []float64{8, 3, 7, 8, 4},
+			l:    []bool{false, false, true, false, true},
+			w:    []float64{.5, 1, 1, .5, 1},
+			ansx: []float64{3, 4, 7, 8, 8},
+			ansl: []bool{false, true, true, false, false},
+			answ: []float64{1, 1, 1, .5, .5},
+		},
+	} {
+		SortWeightedLabeled(test.x, test.l, test.w)
+		if !floats.Same(test.x, test.ansx) {
+			t.Errorf("SortWeightedLabeled mismatch case %d. Expected x %v, Found x %v", i, test.ansx, test.x)
+		}
+		if test.l != nil && !reflect.DeepEqual(test.l, test.ansl) {
+			t.Errorf("SortWeightedLabeled mismatch case %d. Expected l %v, Found l %v", i, test.ansl, test.l)
+		}
+		if test.w != nil && !floats.Same(test.w, test.answ) {
+			t.Errorf("SortWeightedLabeled mismatch case %d. Expected w %v, Found w %v", i, test.answ, test.w)
+		}
+	}
+	if !Panics(func() { SortWeightedLabeled(make([]float64, 3), make([]bool, 2), make([]float64, 3)) }) {
+		t.Errorf("SortWeightedLabeled did not panic with x, labels length mismatch")
+	}
+	if !Panics(func() { SortWeightedLabeled(make([]float64, 3), make([]bool, 2), nil) }) {
+		t.Errorf("SortWeightedLabeled did not panic with x, labels length mismatch")
+	}
+	if !Panics(func() { SortWeightedLabeled(make([]float64, 3), make([]bool, 3), make([]float64, 2)) }) {
+		t.Errorf("SortWeightedLabeled did not panic with x, weights length mismatch")
+	}
+	if !Panics(func() { SortWeightedLabeled(make([]float64, 3), nil, make([]float64, 2)) }) {
+		t.Errorf("SortWeightedLabeled did not panic with x, weights length mismatch")
+	}
+}
+
+func TestVariance(t *testing.T) {
+	for i, test := range []struct {
+		x       []float64
+		weights []float64
+		ans     float64
+	}{
+		{
+			x:       []float64{8, -3, 7, 8, -4},
+			weights: nil,
+			ans:     37.7,
+		},
+		{
+			x:       []float64{8, -3, 7, 8, -4},
+			weights: []float64{1, 1, 1, 1, 1},
+			ans:     37.7,
+		},
+		{
+			x:       []float64{8, 3, 7, 8, 4},
+			weights: []float64{2, 1, 2, 1, 1},
+			ans:     4.2857142857142865,
+		},
+		{
+			x:       []float64{1, 4, 9},
+			weights: []float64{1, 1.5, 1},
+			ans:     13.142857142857146,
+		},
+		{
+			x:       []float64{1, 2, 3},
+			weights: []float64{1, 1.5, 1},
+			ans:     .8,
+		},
+	} {
+		variance := Variance(test.x, test.weights)
+		if math.Abs(variance-test.ans) > 1e-14 {
+			t.Errorf("Variance mismatch case %d. Expected %v, Found %v", i, test.ans, variance)
+		}
+	}
+	if !Panics(func() { Variance(make([]float64, 3), make([]float64, 2)) }) {
+		t.Errorf("Variance did not panic with x, weights length mismatch")
+	}
+}
+
+func ExampleVariance() {
+	x := []float64{8, 2, -9, 15, 4}
+	variance := Variance(x, nil)
+	fmt.Printf("The variance of the samples is %.4f\n", variance)
+
+	weights := []float64{2, 2, 6, 7, 1}
+	weightedVariance := Variance(x, weights)
+	fmt.Printf("The weighted variance of the samples is %.4f\n", weightedVariance)
+	// Output:
+	// The variance of the samples is 77.5000
+	// The weighted variance of the samples is 111.7941
+}
+
+func TestStdScore(t *testing.T) {
+	for i, test := range []struct {
+		x float64
+		u float64
+		s float64
+		z float64
+	}{
+		{
+			x: 4,
+			u: -6,
+			s: 5,
+			z: 2,
+		},
+		{
+			x: 1,
+			u: 0,
+			s: 1,
+			z: 1,
+		},
+	} {
+		z := StdScore(test.x, test.u, test.s)
+		if math.Abs(z-test.z) > 1e-14 {
+			t.Errorf("StdScore mismatch case %d. Expected %v, Found %v", i, test.z, z)
+		}
+	}
+}
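+
+// ExampleStdScore is a minimal sketch of the standard score
+// z = (x - μ) / σ, reusing the first case from TestStdScore above:
+// (4 - (-6)) / 5 = 2.
+func ExampleStdScore() {
+	z := StdScore(4, -6, 5)
+	fmt.Println(z)
+	// Output:
+	// 2
+}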
diff --git a/stat/statmat.go b/stat/statmat.go
new file mode 100644
index 00000000..d464506c
--- /dev/null
+++ b/stat/statmat.go
@@ -0,0 +1,147 @@
+// Copyright ©2014 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stat
+
+import (
+	"math"
+
+	"github.com/gonum/floats"
+	"github.com/gonum/matrix"
+	"github.com/gonum/matrix/mat64"
+)
+
+// CovarianceMatrix returns the covariance matrix (also known as the
+// variance-covariance matrix) calculated from a matrix of data, x, using
+// a two-pass algorithm.
+//
+// If weights is not nil the weighted covariance of x is calculated. weights
+// must have length equal to the number of rows in the input data matrix and
+// must not contain negative elements.
+// If cov is not nil it must either be zero-sized or have the same number of
+// columns as the input data matrix. cov will be used as the destination for
+// the covariance data. If cov is nil, a new mat64.SymDense is allocated for
+// the destination.
+func CovarianceMatrix(cov *mat64.SymDense, x mat64.Matrix, weights []float64) *mat64.SymDense {
+	// This is the matrix version of the two-pass algorithm. It doesn't use
+	// the additional floating point error correction that the Covariance
+	// function uses to reduce the impact of rounding during centering.
+
+	r, c := x.Dims()
+
+	if cov == nil {
+		cov = mat64.NewSymDense(c, nil)
+	} else if n := cov.Symmetric(); n != c && n != 0 {
+		panic(matrix.ErrShape)
+	}
+
+	var xt mat64.Dense
+	xt.Clone(x.T())
+	// Subtract the mean of each of the columns.
+	for i := 0; i < c; i++ {
+		v := xt.RawRowView(i)
+		// This will panic with ErrShape if len(weights) != len(v), so
+		// we don't have to check the size later.
+		mean := Mean(v, weights)
+		floats.AddConst(-mean, v)
+	}
+
+	if weights == nil {
+		// Calculate the normalization factor
+		// scaled by the sample size.
+		cov.SymOuterK(1/(float64(r)-1), &xt)
+		return cov
+	}
+
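+	// With centered data Xc and W = diag(weights), the weighted covariance
+	// is Xcᵀ·W·Xc / (sum(weights) - 1); folding sqrt(w) into each
+	// observation lets a single symmetric rank-k update compute it below.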
+	// Multiply by the sqrt of the weights, so that multiplication is symmetric.
+	sqrtwts := make([]float64, r)
+	for i, w := range weights {
+		if w < 0 {
+			panic("stat: negative covariance matrix weights")
+		}
+		sqrtwts[i] = math.Sqrt(w)
+	}
+	// Weight the rows.
+	for i := 0; i < c; i++ {
+		v := xt.RawRowView(i)
+		floats.Mul(v, sqrtwts)
+	}
+
+	// Calculate the normalization factor
+	// scaled by the weighted sample size.
+	cov.SymOuterK(1/(floats.Sum(weights)-1), &xt)
+	return cov
+}
+
+// CorrelationMatrix returns the correlation matrix calculated from a matrix
+// of data, x, using a two-pass algorithm.
+//
+// If weights is not nil the weighted correlation of x is calculated. weights
+// must have length equal to the number of rows in the input data matrix and
+// must not contain negative elements.
+// If corr is not nil it must either be zero-sized or have the same number of
+// columns as the input data matrix. corr will be used as the destination for
+// the correlation data. If corr is nil, a new mat64.SymDense is allocated for
+// the destination.
+func CorrelationMatrix(corr *mat64.SymDense, x mat64.Matrix, weights []float64) *mat64.SymDense {
+	// This will panic if the sizes don't match, or if weights is the wrong size.
+	corr = CovarianceMatrix(corr, x, weights)
+	covToCorr(corr)
+	return corr
+}
+
+// covToCorr converts a covariance matrix to a correlation matrix.
+func covToCorr(c *mat64.SymDense) {
+	r := c.Symmetric()
+
+	s := make([]float64, r)
+	for i := 0; i < r; i++ {
+		s[i] = 1 / math.Sqrt(c.At(i, i))
+	}
+	for i, sx := range s {
+		// Ensure that the diagonal has exactly ones.
+		c.SetSym(i, i, 1)
+		for j := i + 1; j < r; j++ {
+			v := c.At(i, j)
+			c.SetSym(i, j, v*sx*s[j])
+		}
+	}
+}
+
+// corrToCov converts a correlation matrix to a covariance matrix.
+// The input sigma should be a vector of standard deviations corresponding
+// to the covariance. It will panic if len(sigma) is not equal to the
+// number of rows in the correlation matrix.
+func corrToCov(c *mat64.SymDense, sigma []float64) {
+	r, _ := c.Dims()
+
+	if r != len(sigma) {
+		panic(matrix.ErrShape)
+	}
+	for i, sx := range sigma {
+		// Ensure that the diagonal has exactly sigma squared.
+		c.SetSym(i, i, sx*sx)
+		for j := i + 1; j < r; j++ {
+			v := c.At(i, j)
+			c.SetSym(i, j, v*sx*sigma[j])
+		}
+	}
+}
+
+// Mahalanobis computes the Mahalanobis distance
+//  D = sqrt((x-y)^T * Σ^-1 * (x-y))
+// between the vectors x and y given the Cholesky decomposition of Σ.
+// Mahalanobis returns NaN if the linear solve fails.
+//
+// See https://en.wikipedia.org/wiki/Mahalanobis_distance for more information.
+func Mahalanobis(x, y *mat64.Vector, chol *mat64.Cholesky) float64 {
+	var diff mat64.Vector
+	diff.SubVec(x, y)
+	var tmp mat64.Vector
+	err := tmp.SolveCholeskyVec(chol, &diff)
+	if err != nil {
+		return math.NaN()
+	}
+	return math.Sqrt(mat64.Dot(&tmp, &diff))
+}
diff --git a/stat/statmat_test.go b/stat/statmat_test.go
new file mode 100644
index 00000000..fd352c6e
--- /dev/null
+++ b/stat/statmat_test.go
@@ -0,0 +1,463 @@
+// Copyright ©2014 The gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package stat
+
+import (
+	"math"
+	"math/rand"
+	"testing"
+
+	"github.com/gonum/floats"
+	"github.com/gonum/matrix/mat64"
+)
+
+func TestCovarianceMatrix(t *testing.T) {
+	// An alternative way to test this is to call the Variance and
+	// Covariance functions and ensure that the results are identical.
+ for i, test := range []struct { + data *mat64.Dense + weights []float64 + ans *mat64.Dense + }{ + { + data: mat64.NewDense(5, 2, []float64{ + -2, -4, + -1, 2, + 0, 0, + 1, -2, + 2, 4, + }), + weights: nil, + ans: mat64.NewDense(2, 2, []float64{ + 2.5, 3, + 3, 10, + }), + }, { + data: mat64.NewDense(3, 2, []float64{ + 1, 1, + 2, 4, + 3, 9, + }), + weights: []float64{ + 1, + 1.5, + 1, + }, + ans: mat64.NewDense(2, 2, []float64{ + .8, 3.2, + 3.2, 13.142857142857146, + }), + }, + } { + // Make a copy of the data to check that it isn't changing. + r := test.data.RawMatrix() + d := make([]float64, len(r.Data)) + copy(d, r.Data) + + w := make([]float64, len(test.weights)) + if test.weights != nil { + copy(w, test.weights) + } + for _, cov := range []*mat64.SymDense{nil, &mat64.SymDense{}} { + c := CovarianceMatrix(cov, test.data, test.weights) + if !mat64.Equal(c, test.ans) { + t.Errorf("%d: expected cov %v, found %v", i, test.ans, c) + } + if !floats.Equal(d, r.Data) { + t.Errorf("%d: data was modified during execution", i) + } + if !floats.Equal(w, test.weights) { + t.Errorf("%d: weights was modified during execution", i) + } + + // compare with call to Covariance + _, cols := c.Dims() + for ci := 0; ci < cols; ci++ { + for cj := 0; cj < cols; cj++ { + x := mat64.Col(nil, ci, test.data) + y := mat64.Col(nil, cj, test.data) + cov := Covariance(x, y, test.weights) + if math.Abs(cov-c.At(ci, cj)) > 1e-14 { + t.Errorf("CovMat does not match at (%v, %v). Want %v, got %v.", ci, cj, cov, c.At(ci, cj)) + } + } + } + } + + } + if !Panics(func() { CovarianceMatrix(nil, mat64.NewDense(5, 2, nil), []float64{}) }) { + t.Errorf("CovarianceMatrix did not panic with weight size mismatch") + } + if !Panics(func() { CovarianceMatrix(mat64.NewSymDense(1, nil), mat64.NewDense(5, 2, nil), nil) }) { + t.Errorf("CovarianceMatrix did not panic with preallocation size mismatch") + } + if !Panics(func() { CovarianceMatrix(nil, mat64.NewDense(2, 2, []float64{1, 2, 3, 4}), []float64{1, -1}) }) { + t.Errorf("CovarianceMatrix did not panic with negative weights") + } +} + +func TestCorrelationMatrix(t *testing.T) { + for i, test := range []struct { + data *mat64.Dense + weights []float64 + ans *mat64.Dense + }{ + { + data: mat64.NewDense(3, 3, []float64{ + 1, 2, 3, + 3, 4, 5, + 5, 6, 7, + }), + weights: nil, + ans: mat64.NewDense(3, 3, []float64{ + 1, 1, 1, + 1, 1, 1, + 1, 1, 1, + }), + }, + { + data: mat64.NewDense(5, 2, []float64{ + -2, -4, + -1, 2, + 0, 0, + 1, -2, + 2, 4, + }), + weights: nil, + ans: mat64.NewDense(2, 2, []float64{ + 1, 0.6, + 0.6, 1, + }), + }, { + data: mat64.NewDense(3, 2, []float64{ + 1, 1, + 2, 4, + 3, 9, + }), + weights: []float64{ + 1, + 1.5, + 1, + }, + ans: mat64.NewDense(2, 2, []float64{ + 1, 0.9868703275903379, + 0.9868703275903379, 1, + }), + }, + } { + // Make a copy of the data to check that it isn't changing. 
r := test.data.RawMatrix()
+		d := make([]float64, len(r.Data))
+		copy(d, r.Data)
+
+		w := make([]float64, len(test.weights))
+		if test.weights != nil {
+			copy(w, test.weights)
+		}
+		for _, corr := range []*mat64.SymDense{nil, &mat64.SymDense{}} {
+			c := CorrelationMatrix(corr, test.data, test.weights)
+			if !mat64.Equal(c, test.ans) {
+				t.Errorf("%d: expected corr %v, found %v", i, test.ans, c)
+			}
+			if !floats.Equal(d, r.Data) {
+				t.Errorf("%d: data was modified during execution", i)
+			}
+			if !floats.Equal(w, test.weights) {
+				t.Errorf("%d: weights was modified during execution", i)
+			}
+
+			// Compare with a call to Correlation.
+			_, cols := c.Dims()
+			for ci := 0; ci < cols; ci++ {
+				for cj := 0; cj < cols; cj++ {
+					x := mat64.Col(nil, ci, test.data)
+					y := mat64.Col(nil, cj, test.data)
+					corr := Correlation(x, y, test.weights)
+					if math.Abs(corr-c.At(ci, cj)) > 1e-14 {
+						t.Errorf("CorrMat does not match at (%v, %v). Want %v, got %v.", ci, cj, corr, c.At(ci, cj))
+					}
+				}
+			}
+		}
+
+	}
+	if !Panics(func() { CorrelationMatrix(nil, mat64.NewDense(5, 2, nil), []float64{}) }) {
+		t.Errorf("CorrelationMatrix did not panic with weight size mismatch")
+	}
+	if !Panics(func() { CorrelationMatrix(mat64.NewSymDense(1, nil), mat64.NewDense(5, 2, nil), nil) }) {
+		t.Errorf("CorrelationMatrix did not panic with preallocation size mismatch")
+	}
+	if !Panics(func() { CorrelationMatrix(nil, mat64.NewDense(2, 2, []float64{1, 2, 3, 4}), []float64{1, -1}) }) {
+		t.Errorf("CorrelationMatrix did not panic with negative weights")
+	}
+}
+
+func TestCorrCov(t *testing.T) {
+	// Test both covToCorr and corrToCov.
+	for i, test := range []struct {
+		data    *mat64.Dense
+		weights []float64
+	}{
+		{
+			data: mat64.NewDense(3, 3, []float64{
+				1, 2, 3,
+				3, 4, 5,
+				5, 6, 7,
+			}),
+			weights: nil,
+		},
+		{
+			data: mat64.NewDense(5, 2, []float64{
+				-2, -4,
+				-1, 2,
+				0, 0,
+				1, -2,
+				2, 4,
+			}),
+			weights: nil,
+		},
+		{
+			data: mat64.NewDense(3, 2, []float64{
+				1, 1,
+				2, 4,
+				3, 9,
+			}),
+			weights: []float64{
+				1,
+				1.5,
+				1,
+			},
+		},
+	} {
+		corr := CorrelationMatrix(nil, test.data, test.weights)
+		cov := CovarianceMatrix(nil, test.data, test.weights)
+
+		r := cov.Symmetric()
+
+		// Get the diagonal elements from cov to determine the sigmas.
+		sigmas := make([]float64, r)
+		for i := range sigmas {
+			sigmas[i] = math.Sqrt(cov.At(i, i))
+		}
+
+		covFromCorr := mat64.NewSymDense(corr.Symmetric(), nil)
+		covFromCorr.CopySym(corr)
+		corrToCov(covFromCorr, sigmas)
+
+		corrFromCov := mat64.NewSymDense(cov.Symmetric(), nil)
+		corrFromCov.CopySym(cov)
+		covToCorr(corrFromCov)
+
+		if !mat64.EqualApprox(corr, corrFromCov, 1e-14) {
+			t.Errorf("%d: covToCorr did not match direct Correlation calculation. Want: %v, got: %v.", i, corr, corrFromCov)
+		}
+		if !mat64.EqualApprox(cov, covFromCorr, 1e-14) {
+			t.Errorf("%d: corrToCov did not match direct Covariance calculation. Want: %v, got: %v.", i, cov, covFromCorr)
+		}
+	}
+	if !Panics(func() { corrToCov(mat64.NewSymDense(2, nil), []float64{}) }) {
+		t.Errorf("corrToCov did not panic with sigma size mismatch")
+	}
+}
+
+func TestMahalanobis(t *testing.T) {
+	// Comparison with scipy.
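+	// (e.g. scipy.spatial.distance.mahalanobis(x, y, VI), where VI is the
+	// inverse of Sigma).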
+ for cas, test := range []struct { + x, y *mat64.Vector + Sigma *mat64.SymDense + ans float64 + }{ + { + x: mat64.NewVector(3, []float64{1, 2, 3}), + y: mat64.NewVector(3, []float64{0.8, 1.1, -1}), + Sigma: mat64.NewSymDense(3, + []float64{ + 0.8, 0.3, 0.1, + 0.3, 0.7, -0.1, + 0.1, -0.1, 7}), + ans: 1.9251757377680914, + }, + } { + var chol mat64.Cholesky + ok := chol.Factorize(test.Sigma) + if !ok { + panic("bad test") + } + ans := Mahalanobis(test.x, test.y, &chol) + if math.Abs(ans-test.ans) > 1e-14 { + t.Errorf("Cas %d: got %v, want %v", cas, ans, test.ans) + } + } +} + +// benchmarks + +func randMat(r, c int) mat64.Matrix { + x := make([]float64, r*c) + for i := range x { + x[i] = rand.Float64() + } + return mat64.NewDense(r, c, x) +} + +func benchmarkCovarianceMatrix(b *testing.B, m mat64.Matrix) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + CovarianceMatrix(nil, m, nil) + } +} +func benchmarkCovarianceMatrixWeighted(b *testing.B, m mat64.Matrix) { + r, _ := m.Dims() + wts := make([]float64, r) + for i := range wts { + wts[i] = 0.5 + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + CovarianceMatrix(nil, m, wts) + } +} +func benchmarkCovarianceMatrixInPlace(b *testing.B, m mat64.Matrix) { + _, c := m.Dims() + res := mat64.NewSymDense(c, nil) + b.ResetTimer() + for i := 0; i < b.N; i++ { + CovarianceMatrix(res, m, nil) + } +} + +func BenchmarkCovarianceMatrixSmallxSmall(b *testing.B) { + // 10 * 10 elements + x := randMat(small, small) + benchmarkCovarianceMatrix(b, x) +} +func BenchmarkCovarianceMatrixSmallxMedium(b *testing.B) { + // 10 * 1000 elements + x := randMat(small, medium) + benchmarkCovarianceMatrix(b, x) +} + +func BenchmarkCovarianceMatrixMediumxSmall(b *testing.B) { + // 1000 * 10 elements + x := randMat(medium, small) + benchmarkCovarianceMatrix(b, x) +} +func BenchmarkCovarianceMatrixMediumxMedium(b *testing.B) { + // 1000 * 1000 elements + x := randMat(medium, medium) + benchmarkCovarianceMatrix(b, x) +} + +func BenchmarkCovarianceMatrixLargexSmall(b *testing.B) { + // 1e5 * 10 elements + x := randMat(large, small) + benchmarkCovarianceMatrix(b, x) +} + +func BenchmarkCovarianceMatrixHugexSmall(b *testing.B) { + // 1e7 * 10 elements + x := randMat(huge, small) + benchmarkCovarianceMatrix(b, x) +} + +func BenchmarkCovarianceMatrixSmallxSmallWeighted(b *testing.B) { + // 10 * 10 elements + x := randMat(small, small) + benchmarkCovarianceMatrixWeighted(b, x) +} +func BenchmarkCovarianceMatrixSmallxMediumWeighted(b *testing.B) { + // 10 * 1000 elements + x := randMat(small, medium) + benchmarkCovarianceMatrixWeighted(b, x) +} + +func BenchmarkCovarianceMatrixMediumxSmallWeighted(b *testing.B) { + // 1000 * 10 elements + x := randMat(medium, small) + benchmarkCovarianceMatrixWeighted(b, x) +} +func BenchmarkCovarianceMatrixMediumxMediumWeighted(b *testing.B) { + // 1000 * 1000 elements + x := randMat(medium, medium) + benchmarkCovarianceMatrixWeighted(b, x) +} + +func BenchmarkCovarianceMatrixLargexSmallWeighted(b *testing.B) { + // 1e5 * 10 elements + x := randMat(large, small) + benchmarkCovarianceMatrixWeighted(b, x) +} + +func BenchmarkCovarianceMatrixHugexSmallWeighted(b *testing.B) { + // 1e7 * 10 elements + x := randMat(huge, small) + benchmarkCovarianceMatrixWeighted(b, x) +} + +func BenchmarkCovarianceMatrixSmallxSmallInPlace(b *testing.B) { + // 10 * 10 elements + x := randMat(small, small) + benchmarkCovarianceMatrixInPlace(b, x) +} +func BenchmarkCovarianceMatrixSmallxMediumInPlace(b *testing.B) { + // 10 * 1000 elements + x := randMat(small, medium) + 
benchmarkCovarianceMatrixInPlace(b, x) +} + +func BenchmarkCovarianceMatrixMediumxSmallInPlace(b *testing.B) { + // 1000 * 10 elements + x := randMat(medium, small) + benchmarkCovarianceMatrixInPlace(b, x) +} +func BenchmarkCovarianceMatrixMediumxMediumInPlace(b *testing.B) { + // 1000 * 1000 elements + x := randMat(medium, medium) + benchmarkCovarianceMatrixInPlace(b, x) +} + +func BenchmarkCovarianceMatrixLargexSmallInPlace(b *testing.B) { + // 1e5 * 10 elements + x := randMat(large, small) + benchmarkCovarianceMatrixInPlace(b, x) +} + +func BenchmarkCovarianceMatrixHugexSmallInPlace(b *testing.B) { + // 1e7 * 10 elements + x := randMat(huge, small) + benchmarkCovarianceMatrixInPlace(b, x) +} + +func BenchmarkCovToCorr(b *testing.B) { + // generate a 10x10 covariance matrix + m := randMat(small, small) + c := CovarianceMatrix(nil, m, nil) + cc := mat64.NewSymDense(c.Symmetric(), nil) + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StopTimer() + cc.CopySym(c) + b.StartTimer() + covToCorr(cc) + } +} + +func BenchmarkCorrToCov(b *testing.B) { + // generate a 10x10 correlation matrix + m := randMat(small, small) + c := CorrelationMatrix(nil, m, nil) + cc := mat64.NewSymDense(c.Symmetric(), nil) + sigma := make([]float64, small) + for i := range sigma { + sigma[i] = 2 + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StopTimer() + cc.CopySym(c) + b.StartTimer() + corrToCov(cc, sigma) + } +}
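+
+// TestMahalanobisIdentity is a minimal extra sanity sketch: with Σ equal to
+// the identity matrix, D = sqrt((x-y)^T * Σ^-1 * (x-y)) reduces to the
+// Euclidean norm of x-y, so no external reference values are needed.
+func TestMahalanobisIdentity(t *testing.T) {
+	x := mat64.NewVector(3, []float64{1, 2, 3})
+	y := mat64.NewVector(3, []float64{0, 0, 0})
+	sigma := mat64.NewSymDense(3, []float64{
+		1, 0, 0,
+		0, 1, 0,
+		0, 0, 1,
+	})
+	var chol mat64.Cholesky
+	if !chol.Factorize(sigma) {
+		t.Fatal("bad test: identity matrix should factorize")
+	}
+	got := Mahalanobis(x, y, &chol)
+	want := math.Sqrt(1 + 4 + 9) // ||x-y|| for these vectors.
+	if math.Abs(got-want) > 1e-14 {
+		t.Errorf("identity covariance: got %v, want %v", got, want)
+	}
+}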