From 4f5ddbd6e977cd48bc38c40f8fdc1d111c9aad7d Mon Sep 17 00:00:00 2001
From: Vinzent Steinberg
Date: Sun, 28 May 2017 22:37:48 +0200
Subject: [PATCH] Implement skewness and add missing inline annotations

---
 src/lib.rs              |   2 +-
 src/moments/mean.rs     |   1 +
 src/moments/mod.rs      |   1 +
 src/moments/skewness.rs | 134 ++++++++++++++++++++++++++++++++++++++++
 src/moments/variance.rs |   4 +-
 tests/skewness.rs       |  73 ++++++++++++++++++++++
 6 files changed, 213 insertions(+), 2 deletions(-)
 create mode 100644 src/moments/skewness.rs
 create mode 100644 tests/skewness.rs

diff --git a/src/lib.rs b/src/lib.rs
index b276fb3..20fb090 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -45,7 +45,7 @@ mod minmax;
 mod reduce;
 mod quantile;
 
-pub use moments::{Mean, Variance, MeanWithError};
+pub use moments::{Mean, Variance, Skewness, MeanWithError};
 pub use weighted_mean::{WeightedMean, WeightedMeanWithError};
 pub use minmax::{Min, Max};
 pub use quantile::Quantile;
diff --git a/src/moments/mean.rs b/src/moments/mean.rs
index 4213321..355c715 100644
--- a/src/moments/mean.rs
+++ b/src/moments/mean.rs
@@ -51,6 +51,7 @@ impl Mean {
     /// size was already updated.
     ///
     /// This is useful for avoiding unnecessary divisions in the inner loop.
+    #[inline]
     fn add_inner(&mut self, delta_n: f64) {
         // This algorithm introduced by Welford in 1962 trades numerical
         // stability for a division inside the loop.
diff --git a/src/moments/mod.rs b/src/moments/mod.rs
index 32914b3..5e7aa87 100644
--- a/src/moments/mod.rs
+++ b/src/moments/mod.rs
@@ -1,4 +1,5 @@
 include!("mean.rs");
 include!("variance.rs");
+include!("skewness.rs");
 
 pub type MeanWithError = Variance;
diff --git a/src/moments/skewness.rs b/src/moments/skewness.rs
new file mode 100644
index 0000000..3245231
--- /dev/null
+++ b/src/moments/skewness.rs
@@ -0,0 +1,134 @@
+/// Estimate the arithmetic mean, the variance and the skewness of a sequence of
+/// numbers ("population").
+///
+/// This can be used to estimate the standard error of the mean.
+#[derive(Debug, Clone)]
+pub struct Skewness {
+    /// Estimator of mean and variance.
+    avg: MeanWithError,
+    /// Intermediate sum of cubes for calculating the skewness.
+    sum_3: f64,
+}
+
+impl Skewness {
+    /// Create a new skewness estimator.
+    #[inline]
+    pub fn new() -> Skewness {
+        Skewness {
+            avg: MeanWithError::new(),
+            sum_3: 0.,
+        }
+    }
+
+    /// Add an observation sampled from the population.
+    #[inline]
+    pub fn add(&mut self, x: f64) {
+        let delta = x - self.mean();
+        self.increment();
+        let n = f64::approx_from(self.len()).unwrap();
+        self.add_inner(delta, delta/n);
+    }
+
+    /// Increment the sample size.
+    ///
+    /// This does not update anything else.
+    #[inline]
+    fn increment(&mut self) {
+        self.avg.increment();
+    }
+
+    /// Add an observation given an already calculated difference from the mean
+    /// divided by the number of samples, assuming the inner count of the sample
+    /// size was already updated.
+    ///
+    /// This is useful for avoiding unnecessary divisions in the inner loop.
+    #[inline]
+    fn add_inner(&mut self, delta: f64, delta_n: f64) {
+        // This algorithm was suggested by Terriberry.
+        //
+        // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance.
+        let n = f64::approx_from(self.len()).unwrap();
+        self.sum_3 += delta*delta_n*delta_n * (n - 1.)*(n - 2.)
+            - 3.*delta_n * self.avg.sum_2;
+        self.avg.add_inner(delta_n);
+    }
+
+    /// Determine whether the sample is empty.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.avg.is_empty()
+    }
+
+    /// Estimate the mean of the population.
+    ///
+    /// Returns 0 for an empty sample.
+    #[inline]
+    pub fn mean(&self) -> f64 {
+        self.avg.mean()
+    }
+
+    /// Return the sample size.
+    #[inline]
+    pub fn len(&self) -> u64 {
+        self.avg.len()
+    }
+
+    /// Calculate the sample variance.
+    ///
+    /// This is an unbiased estimator of the variance of the population.
+    #[inline]
+    pub fn sample_variance(&self) -> f64 {
+        self.avg.sample_variance()
+    }
+
+    /// Calculate the population variance of the sample.
+    ///
+    /// This is a biased estimator of the variance of the population.
+    #[inline]
+    pub fn population_variance(&self) -> f64 {
+        self.avg.population_variance()
+    }
+
+    /// Estimate the standard error of the mean of the population.
+    #[inline]
+    pub fn error_mean(&self) -> f64 {
+        self.avg.error()
+    }
+
+    /// Estimate the skewness of the population.
+    #[inline]
+    pub fn skewness(&self) -> f64 {
+        if self.sum_3 == 0. {
+            return 0.;
+        }
+        let n = f64::approx_from(self.len()).unwrap();
+        let sum_2 = self.avg.sum_2;
+        debug_assert!(sum_2 != 0.);
+        n.sqrt() * self.sum_3 / (sum_2*sum_2*sum_2).sqrt()
+    }
+
+    /// Merge another sample into this one.
+    #[inline]
+    pub fn merge(&mut self, other: &Skewness) {
+        let len_self = f64::approx_from(self.len()).unwrap();
+        let len_other = f64::approx_from(other.len()).unwrap();
+        let len_total = len_self + len_other;
+        let delta = other.mean() - self.mean();
+        self.sum_3 += other.sum_3
+            + delta*delta*delta * len_self*len_other*(len_self - len_other) / (len_total*len_total)
+            + 3.*delta * (len_self*other.avg.sum_2 - len_other*self.avg.sum_2) / len_total;
+        self.avg.merge(&other.avg);
+    }
+}
+
+impl core::iter::FromIterator<f64> for Skewness {
+    fn from_iter<T>(iter: T) -> Skewness
+        where T: IntoIterator<Item=f64>
+    {
+        let mut a = Skewness::new();
+        for i in iter {
+            a.add(i);
+        }
+        a
+    }
+}
diff --git a/src/moments/variance.rs b/src/moments/variance.rs
index 835f009..a28eee1 100644
--- a/src/moments/variance.rs
+++ b/src/moments/variance.rs
@@ -21,7 +21,8 @@ pub struct Variance {
 }
 
 impl Variance {
-    /// Create a new average estimator.
+    /// Create a new variance estimator.
+    #[inline]
     pub fn new() -> Variance {
         Variance { avg: Mean::new(), sum_2: 0. }
     }
@@ -48,6 +49,7 @@ impl Variance {
     /// size was already updated.
     ///
    /// This is useful for avoiding unnecessary divisions in the inner loop.
+    #[inline]
     fn add_inner(&mut self, delta_n: f64) {
         // This algorithm introduced by Welford in 1962 trades numerical
         // stability for a division inside the loop.
diff --git a/tests/skewness.rs b/tests/skewness.rs
new file mode 100644
index 0000000..80d41b2
--- /dev/null
+++ b/tests/skewness.rs
@@ -0,0 +1,73 @@
+#[macro_use] extern crate average;
+
+extern crate core;
+
+extern crate rand;
+
+use core::iter::Iterator;
+
+use average::Skewness;
+
+#[test]
+fn trivial() {
+    let mut a = Skewness::new();
+    assert_eq!(a.len(), 0);
+    a.add(1.0);
+    assert_eq!(a.mean(), 1.0);
+    assert_eq!(a.len(), 1);
+    assert_eq!(a.sample_variance(), 0.0);
+    assert_eq!(a.population_variance(), 0.0);
+    assert_eq!(a.error_mean(), 0.0);
+    assert_eq!(a.skewness(), 0.0);
+    a.add(1.0);
+    assert_eq!(a.mean(), 1.0);
+    assert_eq!(a.len(), 2);
+    assert_eq!(a.sample_variance(), 0.0);
+    assert_eq!(a.population_variance(), 0.0);
+    assert_eq!(a.error_mean(), 0.0);
+    assert_eq!(a.skewness(), 0.0);
+}
+
+#[test]
+fn simple() {
+    let mut a: Skewness = (1..6).map(f64::from).collect();
+    assert_eq!(a.mean(), 3.0);
+    assert_eq!(a.len(), 5);
+    assert_eq!(a.sample_variance(), 2.5);
+    assert_almost_eq!(a.error_mean(), f64::sqrt(0.5), 1e-16);
+    assert_eq!(a.skewness(), 0.0);
+    a.add(1.0);
+    assert_almost_eq!(a.skewness(), 0.2795084971874741, 1e-15);
+}
+
+#[test]
+fn merge() {
+    let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.];
+    for mid in 0..sequence.len() {
+        let (left, right) = sequence.split_at(mid);
+        let avg_total: Skewness = sequence.iter().map(|x| *x).collect();
+        let mut avg_left: Skewness = left.iter().map(|x| *x).collect();
+        let avg_right: Skewness = right.iter().map(|x| *x).collect();
+        avg_left.merge(&avg_right);
+        assert_eq!(avg_total.len(), avg_left.len());
+        assert_eq!(avg_total.mean(), avg_left.mean());
+        assert_eq!(avg_total.sample_variance(), avg_left.sample_variance());
+        assert_eq!(avg_total.skewness(), avg_left.skewness());
+    }
+}
+
+#[test]
+fn exponential_distribution() {
+    use rand::distributions::{Exp, IndependentSample};
+    let lambda = 2.0;
+    let normal = Exp::new(lambda);
+    let mut a = Skewness::new();
+    for _ in 0..2_000_000 {
+        a.add(normal.ind_sample(&mut ::rand::thread_rng()));
+    }
+    assert_almost_eq!(a.mean(), 1./lambda, 1e-2);
+    assert_almost_eq!(a.sample_variance().sqrt(), 1./lambda, 1e-2);
+    assert_almost_eq!(a.population_variance().sqrt(), 1./lambda, 1e-2);
+    assert_almost_eq!(a.error_mean(), 0.0, 1e-2);
+    assert_almost_eq!(a.skewness(), 2.0, 1e-2);
+}
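
Note on the update formula: `Skewness::add_inner` above keeps the running centered sums sum_2 = Σ(x_i - mean)^2 (stored inside `MeanWithError`) and sum_3 = Σ(x_i - mean)^3, and `skewness()` reports sqrt(n) * sum_3 / sum_2^(3/2). The following standalone sketch is not part of the patch (all names are local to the example); it mirrors the one-pass recurrence and checks it against a naive two-pass computation on the data used in the `simple` test:

// Sanity check for the one-pass third-moment update used in `add_inner`.
// `sum_2` follows Welford's update; `sum_3` follows the Terriberry-style
// recurrence from the patch (applied with the *old* `sum_2`, as `add_inner` does).
fn main() {
    let data = [1.0_f64, 2.0, 3.0, 4.0, 5.0, 1.0];

    let (mut n, mut mean, mut sum_2, mut sum_3) = (0.0_f64, 0.0_f64, 0.0_f64, 0.0_f64);
    for &x in &data {
        let delta = x - mean;
        n += 1.;
        let delta_n = delta / n;
        // Same expression as in `Skewness::add_inner`, using the old `sum_2`.
        sum_3 += delta * delta_n * delta_n * (n - 1.) * (n - 2.) - 3. * delta_n * sum_2;
        // Welford's update of the centered sum of squares.
        sum_2 += delta * delta_n * (n - 1.);
        mean += delta_n;
    }
    let one_pass = n.sqrt() * sum_3 / (sum_2 * sum_2 * sum_2).sqrt();

    // Two-pass reference: g1 = sqrt(n) * m3 / m2^(3/2) with explicitly centered sums.
    let m = data.iter().sum::<f64>() / n;
    let m2: f64 = data.iter().map(|&x| (x - m).powi(2)).sum();
    let m3: f64 = data.iter().map(|&x| (x - m).powi(3)).sum();
    let two_pass = n.sqrt() * m3 / (m2 * m2 * m2).sqrt();

    assert!((one_pass - two_pass).abs() < 1e-12);
    println!("skewness = {}", one_pass); // ~0.2795, the value asserted in the `simple` test
}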
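
And a minimal usage sketch of the API this patch adds (assuming the patch is applied and the crate is used as `average`; the input values are arbitrary):

extern crate average;

use average::Skewness;

fn main() {
    // Build an estimator from an iterator of f64, then keep adding observations.
    let mut a: Skewness = (1..6).map(f64::from).collect();
    a.add(1.0);
    println!("mean = {}", a.mean());
    println!("sample variance = {}", a.sample_variance());
    println!("skewness = {}", a.skewness());

    // Estimators built over disjoint chunks of the data can be combined.
    let b: Skewness = [7., 8., 9.].iter().cloned().collect();
    a.merge(&b);
    println!("skewness after merge = {}", a.skewness());
}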