From b8e980d6ce432ed8813465201a18a31f1f2a60eb Mon Sep 17 00:00:00 2001 From: Vinzent Steinberg Date: Sun, 28 May 2017 23:33:16 +0200 Subject: [PATCH] Implement kurtosis Also improve skewness tests. --- src/lib.rs | 2 +- src/moments/kurtosis.rs | 145 ++++++++++++++++++++++++++++++++++++++++ src/moments/mod.rs | 1 + src/moments/skewness.rs | 8 ++- tests/kurtosis.rs | 76 +++++++++++++++++++++ tests/skewness.rs | 8 +-- 6 files changed, 232 insertions(+), 8 deletions(-) create mode 100644 src/moments/kurtosis.rs create mode 100644 tests/kurtosis.rs diff --git a/src/lib.rs b/src/lib.rs index 20fb090..a77ea08 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -45,7 +45,7 @@ mod minmax; mod reduce; mod quantile; -pub use moments::{Mean, Variance, Skewness, MeanWithError}; +pub use moments::{Mean, Variance, Skewness, Kurtosis, MeanWithError}; pub use weighted_mean::{WeightedMean, WeightedMeanWithError}; pub use minmax::{Min, Max}; pub use quantile::Quantile; diff --git a/src/moments/kurtosis.rs b/src/moments/kurtosis.rs new file mode 100644 index 0000000..2106c5d --- /dev/null +++ b/src/moments/kurtosis.rs @@ -0,0 +1,145 @@ +/// Estimate the arithmetic mean, the variance, the skewness and the kurtosis of +/// a sequence of numbers ("population"). +/// +/// This can be used to estimate the standard error of the mean. +#[derive(Debug, Clone)] +pub struct Kurtosis { + /// Estimator of mean, variance and skewness. + avg: Skewness, + /// Intermediate sum of terms to the fourth for calculating the skewness. + sum_4: f64, +} + +impl Kurtosis { + /// Create a new skewness estimator. + #[inline] + pub fn new() -> Kurtosis { + Kurtosis { + avg: Skewness::new(), + sum_4: 0., + } + } + + /// Add an observation sampled from the population. + #[inline] + pub fn add(&mut self, x: f64) { + let delta = x - self.mean(); + self.increment(); + let n = f64::approx_from(self.len()).unwrap(); + self.add_inner(delta, delta/n); + } + + /// Increment the sample size. + /// + /// This does not update anything else. + #[inline] + fn increment(&mut self) { + self.avg.increment(); + } + + /// Add an observation given an already calculated difference from the mean + /// divided by the number of samples, assuming the inner count of the sample + /// size was already updated. + /// + /// This is useful for avoiding unnecessary divisions in the inner loop. + #[inline] + fn add_inner(&mut self, delta: f64, delta_n: f64) { + // This algorithm was suggested by Terriberry. + // + // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance. + let n = f64::approx_from(self.len()).unwrap(); + let term = delta * delta_n * (n - 1.); + let delta_n_sq = delta_n*delta_n; + self.sum_4 += term * delta_n_sq * (n*n - 3.*n + 3.) + + 6. * delta_n_sq * self.avg.avg.sum_2 + - 4. * delta_n * self.avg.sum_3; + self.avg.add_inner(delta, delta_n); + } + + /// Determine whether the sample is empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.avg.is_empty() + } + + /// Estimate the mean of the population. + /// + /// Returns 0 for an empty sample. + #[inline] + pub fn mean(&self) -> f64 { + self.avg.mean() + } + + /// Return the sample size. + #[inline] + pub fn len(&self) -> u64 { + self.avg.len() + } + + /// Calculate the sample variance. + /// + /// This is an unbiased estimator of the variance of the population. + #[inline] + pub fn sample_variance(&self) -> f64 { + self.avg.sample_variance() + } + + /// Calculate the population variance of the sample. + /// + /// This is a biased estimator of the variance of the population. + #[inline] + pub fn population_variance(&self) -> f64 { + self.avg.population_variance() + } + + /// Estimate the standard error of the mean of the population. + #[inline] + pub fn error_mean(&self) -> f64 { + self.avg.error_mean() + } + + /// Estimate the skewness of the population. + #[inline] + pub fn skewness(&self) -> f64 { + self.avg.skewness() + } + + /// Estimate the kurtosis of the population. + #[inline] + pub fn kurtosis(&self) -> f64 { + if self.sum_4 == 0. { + return 0.; + } + let n = f64::approx_from(self.len()).unwrap(); + n * self.sum_4 / (self.avg.avg.sum_2 * self.avg.avg.sum_2) - 3. + } + + /// Merge another sample into this one. + #[inline] + pub fn merge(&mut self, other: &Kurtosis) { + let len_self = f64::approx_from(self.len()).unwrap(); + let len_other = f64::approx_from(other.len()).unwrap(); + let len_total = len_self + len_other; + let delta = other.mean() - self.mean(); + let delta_n = delta / len_total; + let delta_n_sq = delta_n * delta_n; + self.sum_4 += other.sum_4 + + delta * delta_n*delta_n_sq * len_self*len_other + * (len_self*len_self - len_self*len_other + len_other*len_other) + + 6.*delta_n_sq * (len_self*len_self * other.avg.avg.sum_2 + len_other*len_other * self.avg.avg.sum_2) + + 4.*delta_n * (len_self * other.avg.sum_3 - len_other * self.avg.sum_3); + self.avg.merge(&other.avg); + } +} + +impl core::iter::FromIterator for Kurtosis { + fn from_iter(iter: T) -> Kurtosis + where T: IntoIterator + { + let mut a = Kurtosis::new(); + for i in iter { + a.add(i); + } + a + } +} diff --git a/src/moments/mod.rs b/src/moments/mod.rs index 5e7aa87..d96921d 100644 --- a/src/moments/mod.rs +++ b/src/moments/mod.rs @@ -1,5 +1,6 @@ include!("mean.rs"); include!("variance.rs"); include!("skewness.rs"); +include!("kurtosis.rs"); pub type MeanWithError = Variance; diff --git a/src/moments/skewness.rs b/src/moments/skewness.rs index 3245231..63ac47d 100644 --- a/src/moments/skewness.rs +++ b/src/moments/skewness.rs @@ -48,7 +48,8 @@ impl Skewness { // // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance. let n = f64::approx_from(self.len()).unwrap(); - self.sum_3 += delta*delta_n*delta_n * (n - 1.)*(n - 2.) + let term = delta * delta_n * (n - 1.); + self.sum_3 += term * delta_n * (n - 2.) - 3.*delta_n * self.avg.sum_2; self.avg.add_inner(delta_n); } @@ -114,9 +115,10 @@ impl Skewness { let len_other = f64::approx_from(other.len()).unwrap(); let len_total = len_self + len_other; let delta = other.mean() - self.mean(); + let delta_n = delta / len_total; self.sum_3 += other.sum_3 - + delta*delta*delta * len_self*len_other*(len_self - len_other) / (len_total*len_total) - + 3.*delta * (len_self*other.avg.sum_2 - len_other*self.avg.sum_2) / len_total; + + delta*delta_n*delta_n * len_self*len_other*(len_self - len_other) + + 3.*delta_n * (len_self * other.avg.sum_2 - len_other * self.avg.sum_2); self.avg.merge(&other.avg); } } diff --git a/tests/kurtosis.rs b/tests/kurtosis.rs new file mode 100644 index 0000000..0632464 --- /dev/null +++ b/tests/kurtosis.rs @@ -0,0 +1,76 @@ +#[macro_use] extern crate average; + +extern crate core; + +extern crate rand; + +use core::iter::Iterator; + +use average::Kurtosis; + +#[test] +fn trivial() { + let mut a = Kurtosis::new(); + assert_eq!(a.len(), 0); + a.add(1.0); + assert_eq!(a.mean(), 1.0); + assert_eq!(a.len(), 1); + assert_eq!(a.sample_variance(), 0.0); + assert_eq!(a.population_variance(), 0.0); + assert_eq!(a.error_mean(), 0.0); + assert_eq!(a.skewness(), 0.0); + a.add(1.0); + assert_eq!(a.mean(), 1.0); + assert_eq!(a.len(), 2); + assert_eq!(a.sample_variance(), 0.0); + assert_eq!(a.population_variance(), 0.0); + assert_eq!(a.error_mean(), 0.0); + assert_eq!(a.skewness(), 0.0); +} + +#[test] +fn simple() { + let mut a: Kurtosis = (1..6).map(f64::from).collect(); + assert_eq!(a.mean(), 3.0); + assert_eq!(a.len(), 5); + assert_eq!(a.sample_variance(), 2.5); + assert_almost_eq!(a.error_mean(), f64::sqrt(0.5), 1e-16); + assert_eq!(a.skewness(), 0.0); + a.add(1.0); + assert_almost_eq!(a.skewness(), 0.2795084971874741, 1e-15); + assert_almost_eq!(a.kurtosis(), -1.365, 1e-15); +} + +#[test] +fn merge() { + let sequence: &[f64] = &[1., 2., 3., -4., 5.1, 6.3, 7.3, -8., 9., 1.]; + for mid in 0..sequence.len() { + let (left, right) = sequence.split_at(mid); + let avg_total: Kurtosis = sequence.iter().map(|x| *x).collect(); + let mut avg_left: Kurtosis = left.iter().map(|x| *x).collect(); + let avg_right: Kurtosis = right.iter().map(|x| *x).collect(); + avg_left.merge(&avg_right); + assert_eq!(avg_total.len(), avg_left.len()); + assert_almost_eq!(avg_total.mean(), avg_left.mean(), 1e-14); + assert_almost_eq!(avg_total.sample_variance(), avg_left.sample_variance(), 1e-14); + assert_almost_eq!(avg_total.skewness(), avg_left.skewness(), 1e-14); + assert_almost_eq!(avg_total.kurtosis(), avg_left.kurtosis(), 1e-14); + } +} + +#[test] +fn exponential_distribution() { + use rand::distributions::{Exp, IndependentSample}; + let lambda = 2.0; + let normal = Exp::new(lambda); + let mut a = Kurtosis::new(); + for _ in 0..6_000_000 { + a.add(normal.ind_sample(&mut ::rand::thread_rng())); + } + assert_almost_eq!(a.mean(), 1./lambda, 1e-2); + assert_almost_eq!(a.sample_variance().sqrt(), 1./lambda, 1e-2); + assert_almost_eq!(a.population_variance().sqrt(), 1./lambda, 1e-2); + assert_almost_eq!(a.error_mean(), 0.0, 1e-2); + assert_almost_eq!(a.skewness(), 2.0, 1e-2); + assert_almost_eq!(a.kurtosis(), 6.0, 4e-2); +} diff --git a/tests/skewness.rs b/tests/skewness.rs index 80d41b2..a8979d2 100644 --- a/tests/skewness.rs +++ b/tests/skewness.rs @@ -42,7 +42,7 @@ fn simple() { #[test] fn merge() { - let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.]; + let sequence: &[f64] = &[1., 2., 3., -4., 5., 6., 7., 8., 9., 1.]; for mid in 0..sequence.len() { let (left, right) = sequence.split_at(mid); let avg_total: Skewness = sequence.iter().map(|x| *x).collect(); @@ -50,9 +50,9 @@ fn merge() { let avg_right: Skewness = right.iter().map(|x| *x).collect(); avg_left.merge(&avg_right); assert_eq!(avg_total.len(), avg_left.len()); - assert_eq!(avg_total.mean(), avg_left.mean()); - assert_eq!(avg_total.sample_variance(), avg_left.sample_variance()); - assert_eq!(avg_total.skewness(), avg_left.skewness()); + assert_almost_eq!(avg_total.mean(), avg_left.mean(), 1e-14); + assert_almost_eq!(avg_total.sample_variance(), avg_left.sample_variance(), 1e-14); + assert_almost_eq!(avg_total.skewness(), avg_left.skewness(), 1e-14); } }