Skip to content

Commit 9c2787d

Browse files
Cdf testing with Kolmogorov Smirnov (#1494)
1 parent d1f961c commit 9c2787d

File tree

2 files changed

+198
-0
lines changed

2 files changed

+198
-0
lines changed

rand_distr/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
88
- The `serde1` feature has been renamed `serde` (#1477)
99
- Fix panic in Binomial (#1484)
1010
- Move some of the computations in Binomial from `sample` to `new` (#1484)
11+
- Add Kolmogorov Smirnov test for sampling of `Normal` and `Binomial` (#1494)
1112

1213
### Added
1314
- Add plots for `rand_distr` distributions to documentation (#1434)

rand_distr/tests/cdf.rs

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
// Copyright 2021 Developers of the Rand project.
2+
//
3+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4+
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5+
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
6+
// option. This file may not be copied, modified, or distributed
7+
// except according to those terms.
8+
9+
use core::f64;
10+
11+
use num_traits::AsPrimitive;
12+
use rand::SeedableRng;
13+
use rand_distr::{Distribution, Normal};
14+
use special::Beta;
15+
use special::Primitive;
16+
17+
// [1] Nonparametric Goodness-of-Fit Tests for Discrete Null Distributions
18+
// by Taylor B. Arnold and John W. Emerson
19+
// http://www.stat.yale.edu/~jay/EmersonMaterials/DiscreteGOF.pdf
20+
21+
/// Empirical Cumulative Distribution Function (ECDF)
22+
struct Ecdf {
23+
sorted_samples: Vec<f64>,
24+
}
25+
26+
impl Ecdf {
27+
fn new(mut samples: Vec<f64>) -> Self {
28+
samples.sort_by(|a, b| a.partial_cmp(b).unwrap());
29+
Self {
30+
sorted_samples: samples,
31+
}
32+
}
33+
34+
/// Returns the step points of the ECDF
35+
/// The ECDF is a step function that increases by 1/n at each sample point
36+
/// The function is continuous from the right, so we give the bigger value at the step points
37+
/// First point is (-inf, 0.0), last point is (max(samples), 1.0)
38+
fn step_points(&self) -> Vec<(f64, f64)> {
39+
let mut points = Vec::with_capacity(self.sorted_samples.len() + 1);
40+
let mut last = f64::NEG_INFINITY;
41+
let mut count = 0;
42+
let n = self.sorted_samples.len() as f64;
43+
for &x in &self.sorted_samples {
44+
if x != last {
45+
points.push((last, count as f64 / n));
46+
last = x;
47+
}
48+
count += 1;
49+
}
50+
points.push((last, count as f64 / n));
51+
points
52+
}
53+
}
54+
55+
fn kolmogorov_smirnov_statistic_continuous(ecdf: Ecdf, cdf: impl Fn(f64) -> f64) -> f64 {
56+
// We implement equation (3) from [1]
57+
58+
let mut max_diff: f64 = 0.;
59+
60+
let step_points = ecdf.step_points(); // x_i in the paper
61+
for i in 1..step_points.len() {
62+
let (x_i, f_i) = step_points[i];
63+
let (_, f_i_1) = step_points[i - 1];
64+
let cdf_i = cdf(x_i);
65+
let max_1 = (cdf_i - f_i).abs();
66+
let max_2 = (cdf_i - f_i_1).abs();
67+
68+
max_diff = max_diff.max(max_1).max(max_2);
69+
}
70+
max_diff
71+
}
72+
73+
fn kolmogorov_smirnov_statistic_discrete(ecdf: Ecdf, cdf: impl Fn(i64) -> f64) -> f64 {
74+
// We implement equation (4) from [1]
75+
76+
let mut max_diff: f64 = 0.;
77+
78+
let step_points = ecdf.step_points(); // x_i in the paper
79+
for i in 1..step_points.len() {
80+
let (x_i, f_i) = step_points[i];
81+
let (_, f_i_1) = step_points[i - 1];
82+
let max_1 = (cdf(x_i as i64) - f_i).abs();
83+
let max_2 = (cdf(x_i as i64 - 1) - f_i_1).abs(); // -1 is the same as -epsilon, because we have integer support
84+
85+
max_diff = max_diff.max(max_1).max(max_2);
86+
}
87+
max_diff
88+
}
89+
90+
const SAMPLE_SIZE: u64 = 1_000_000;
91+
92+
fn critical_value() -> f64 {
93+
// If the sampler is correct, we expect less than 0.001 false positives (alpha = 0.001).
94+
// Passing this does not prove that the sampler is correct but is a good indication.
95+
1.95 / (SAMPLE_SIZE as f64).sqrt()
96+
}
97+
98+
fn sample_ecdf<T>(seed: u64, dist: impl Distribution<T>) -> Ecdf
99+
where
100+
T: AsPrimitive<f64>,
101+
{
102+
let mut rng = rand::rngs::SmallRng::seed_from_u64(seed);
103+
let samples = (0..SAMPLE_SIZE)
104+
.map(|_| dist.sample(&mut rng).as_())
105+
.collect();
106+
Ecdf::new(samples)
107+
}
108+
109+
/// Tests a distribution against an analytical CDF.
110+
/// The CDF has to be continuous.
111+
pub fn test_continuous(seed: u64, dist: impl Distribution<f64>, cdf: impl Fn(f64) -> f64) {
112+
let ecdf = sample_ecdf(seed, dist);
113+
let ks_statistic = kolmogorov_smirnov_statistic_continuous(ecdf, cdf);
114+
115+
let critical_value = critical_value();
116+
117+
println!("KS statistic: {}", ks_statistic);
118+
println!("Critical value: {}", critical_value);
119+
assert!(ks_statistic < critical_value);
120+
}
121+
122+
/// Tests a distribution over integers against an analytical CDF.
123+
/// The analytical CDF must not have jump points which are not integers.
124+
pub fn test_discrete<I: AsPrimitive<f64>>(
125+
seed: u64,
126+
dist: impl Distribution<I>,
127+
cdf: impl Fn(i64) -> f64,
128+
) {
129+
let ecdf = sample_ecdf(seed, dist);
130+
let ks_statistic = kolmogorov_smirnov_statistic_discrete(ecdf, cdf);
131+
132+
// This critical value is bigger than it could be for discrete distributions, but because of large sample sizes this should not matter too much
133+
let critical_value = critical_value();
134+
135+
println!("KS statistic: {}", ks_statistic);
136+
println!("Critical value: {}", critical_value);
137+
assert!(ks_statistic < critical_value);
138+
}
139+
140+
fn normal_cdf(x: f64, mean: f64, std_dev: f64) -> f64 {
141+
0.5 * ((mean - x) / (std_dev * f64::consts::SQRT_2)).erfc()
142+
}
143+
144+
#[test]
145+
fn normal() {
146+
let parameters = [
147+
(0.0, 1.0),
148+
(0.0, 0.1),
149+
(1.0, 10.0),
150+
(1.0, 100.0),
151+
(-1.0, 0.00001),
152+
(-1.0, 0.0000001),
153+
];
154+
155+
for (seed, (mean, std_dev)) in parameters.into_iter().enumerate() {
156+
test_continuous(seed as u64, Normal::new(mean, std_dev).unwrap(), |x| {
157+
normal_cdf(x, mean, std_dev)
158+
});
159+
}
160+
}
161+
162+
fn binomial_cdf(k: i64, p: f64, n: u64) -> f64 {
163+
if k < 0 {
164+
return 0.0;
165+
}
166+
let k = k as u64;
167+
if k >= n {
168+
return 1.0;
169+
}
170+
171+
let a = (n - k) as f64;
172+
let b = k as f64 + 1.0;
173+
174+
let q = 1.0 - p;
175+
176+
let ln_beta_ab = a.ln_beta(b);
177+
178+
q.inc_beta(a, b, ln_beta_ab)
179+
}
180+
181+
#[test]
182+
fn binomial() {
183+
let parameters = [
184+
(0.5, 10),
185+
(0.5, 100),
186+
(0.1, 10),
187+
(0.0000001, 1000000),
188+
(0.0000001, 10),
189+
(0.9999, 2),
190+
];
191+
192+
for (seed, (p, n)) in parameters.into_iter().enumerate() {
193+
test_discrete(seed as u64, rand_distr::Binomial::new(n, p).unwrap(), |k| {
194+
binomial_cdf(k, p, n)
195+
});
196+
}
197+
}

0 commit comments

Comments
 (0)