Implement skewness and add missing inline annotations

This commit is contained in:
Vinzent Steinberg 2017-05-28 22:37:48 +02:00
parent 8c6c55190e
commit 4f5ddbd6e9
6 changed files with 213 additions and 2 deletions

View File

@ -45,7 +45,7 @@ mod minmax;
mod reduce;
mod quantile;
pub use moments::{Mean, Variance, MeanWithError};
pub use moments::{Mean, Variance, Skewness, MeanWithError};
pub use weighted_mean::{WeightedMean, WeightedMeanWithError};
pub use minmax::{Min, Max};
pub use quantile::Quantile;

View File

@ -51,6 +51,7 @@ impl Mean {
/// size was already updated.
///
/// This is useful for avoiding unnecessary divisions in the inner loop.
#[inline]
fn add_inner(&mut self, delta_n: f64) {
// This algorithm introduced by Welford in 1962 trades numerical
// stability for a division inside the loop.

View File

@ -1,4 +1,5 @@
// The moment estimators are split across one file per statistic; `include!`
// splices them into this module so they can share private items.
include!("mean.rs");
include!("variance.rs");
include!("skewness.rs");

// `Variance` also tracks the mean internally, so it doubles as a mean
// estimator that can report the standard error of the mean.
pub type MeanWithError = Variance;

134
src/moments/skewness.rs Normal file
View File

@ -0,0 +1,134 @@
/// Estimate the arithmetic mean, the variance and the skewness of a sequence of
/// numbers ("population").
///
/// This can be used to estimate the standard error of the mean.
#[derive(Debug, Clone)]
pub struct Skewness {
    /// Estimator of mean and variance.
    avg: MeanWithError,
    /// Intermediate sum of cubes for calculating the skewness.
    ///
    /// This is the sum of cubed deviations from the current mean, maintained
    /// incrementally (Terriberry's extension of Welford's algorithm).
    sum_3: f64,
}
impl Skewness {
    /// Create a new skewness estimator.
    #[inline]
    pub fn new() -> Skewness {
        Skewness {
            avg: MeanWithError::new(),
            sum_3: 0.,
        }
    }

    /// Add an observation sampled from the population.
    #[inline]
    pub fn add(&mut self, x: f64) {
        // The delta must be taken with respect to the *old* mean, before the
        // sample count is incremented.
        let delta = x - self.mean();
        self.increment();
        let n = f64::approx_from(self.len()).unwrap();
        self.add_inner(delta, delta/n);
    }

    /// Increment the sample size.
    ///
    /// This does not update anything else.
    #[inline]
    fn increment(&mut self) {
        self.avg.increment();
    }

    /// Add an observation given an already calculated difference from the mean
    /// divided by the number of samples, assuming the inner count of the sample
    /// size was already updated.
    ///
    /// This is useful for avoiding unnecessary divisions in the inner loop.
    #[inline]
    fn add_inner(&mut self, delta: f64, delta_n: f64) {
        // This algorithm was suggested by Terriberry.
        //
        // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance.
        let n = f64::approx_from(self.len()).unwrap();
        self.sum_3 += delta*delta_n*delta_n * (n - 1.)*(n - 2.)
            - 3.*delta_n * self.avg.sum_2;
        self.avg.add_inner(delta_n);
    }

    /// Determine whether the sample is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.avg.is_empty()
    }

    /// Estimate the mean of the population.
    ///
    /// Returns 0 for an empty sample.
    #[inline]
    pub fn mean(&self) -> f64 {
        self.avg.mean()
    }

    /// Return the sample size.
    #[inline]
    pub fn len(&self) -> u64 {
        self.avg.len()
    }

    /// Calculate the sample variance.
    ///
    /// This is an unbiased estimator of the variance of the population.
    #[inline]
    pub fn sample_variance(&self) -> f64 {
        self.avg.sample_variance()
    }

    /// Calculate the population variance of the sample.
    ///
    /// This is a biased estimator of the variance of the population.
    #[inline]
    pub fn population_variance(&self) -> f64 {
        self.avg.population_variance()
    }

    /// Estimate the standard error of the mean of the population.
    #[inline]
    pub fn error_mean(&self) -> f64 {
        self.avg.error()
    }

    /// Estimate the skewness of the population.
    ///
    /// Returns 0 for an empty sample, or when all observations are equal.
    #[inline]
    pub fn skewness(&self) -> f64 {
        if self.sum_3 == 0. {
            return 0.;
        }
        let n = f64::approx_from(self.len()).unwrap();
        let sum_2 = self.avg.sum_2;
        // `sum_3` can only be nonzero if the deviations from the mean are not
        // all zero, i.e. `sum_2 > 0`.
        debug_assert!(sum_2 != 0.);
        n.sqrt() * self.sum_3 / (sum_2*sum_2*sum_2).sqrt()
    }

    /// Merge another sample into this one.
    #[inline]
    pub fn merge(&mut self, other: &Skewness) {
        // Nothing to do for an empty `other`. Without this check, merging two
        // empty estimators would evaluate 0./0. in the formula below
        // (`len_total == 0`), poisoning `sum_3` with NaN.
        if other.is_empty() {
            return;
        }
        if self.is_empty() {
            // Adopt the other sample wholesale; the general formula would
            // yield the same result, but this avoids the extra arithmetic.
            self.sum_3 = other.sum_3;
            self.avg.merge(&other.avg);
            return;
        }
        // Pairwise combination of third moments, see
        // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance.
        let len_self = f64::approx_from(self.len()).unwrap();
        let len_other = f64::approx_from(other.len()).unwrap();
        let len_total = len_self + len_other;
        let delta = other.mean() - self.mean();
        self.sum_3 += other.sum_3
            + delta*delta*delta * len_self*len_other*(len_self - len_other) / (len_total*len_total)
            + 3.*delta * (len_self*other.avg.sum_2 - len_other*self.avg.sum_2) / len_total;
        self.avg.merge(&other.avg);
    }
}
impl core::iter::FromIterator<f64> for Skewness {
    /// Build a skewness estimator by adding every observation in turn.
    fn from_iter<T>(iter: T) -> Skewness
        where T: IntoIterator<Item=f64>
    {
        iter.into_iter().fold(Skewness::new(), |mut acc, x| {
            acc.add(x);
            acc
        })
    }
}

View File

@ -21,7 +21,8 @@ pub struct Variance {
}
impl Variance {
/// Create a new average estimator.
/// Create a new variance estimator.
#[inline]
pub fn new() -> Variance {
Variance { avg: Mean::new(), sum_2: 0. }
}
@ -48,6 +49,7 @@ impl Variance {
/// size was already updated.
///
/// This is useful for avoiding unnecessary divisions in the inner loop.
#[inline]
fn add_inner(&mut self, delta_n: f64) {
// This algorithm introduced by Welford in 1962 trades numerical
// stability for a division inside the loop.

73
tests/skewness.rs Normal file
View File

@ -0,0 +1,73 @@
#[macro_use] extern crate average;
extern crate core;
extern crate rand;
use core::iter::Iterator;
use average::Skewness;
#[test]
fn trivial() {
    // Feeding the same value twice: all moments beyond the mean stay zero.
    let mut a = Skewness::new();
    assert_eq!(a.len(), 0);
    for count in 1..3 {
        a.add(1.0);
        assert_eq!(a.mean(), 1.0);
        assert_eq!(a.len(), count);
        assert_eq!(a.sample_variance(), 0.0);
        assert_eq!(a.population_variance(), 0.0);
        assert_eq!(a.error_mean(), 0.0);
        assert_eq!(a.skewness(), 0.0);
    }
}
#[test]
fn simple() {
    // The symmetric sample 1..=5 has zero skewness; adding another 1.0
    // makes it right-skewed.
    let mut a = Skewness::new();
    for i in 1..6 {
        a.add(f64::from(i));
    }
    assert_eq!(a.mean(), 3.0);
    assert_eq!(a.len(), 5);
    assert_eq!(a.sample_variance(), 2.5);
    assert_almost_eq!(a.error_mean(), f64::sqrt(0.5), 1e-16);
    assert_eq!(a.skewness(), 0.0);
    a.add(1.0);
    assert_almost_eq!(a.skewness(), 0.2795084971874741, 1e-15);
}
#[test]
fn merge() {
    // Merging the estimators of any two-way split of the sequence must agree
    // with estimating the whole sequence at once.
    let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.];
    for split_point in 0..sequence.len() {
        let (left, right) = sequence.split_at(split_point);
        let expected: Skewness = sequence.iter().cloned().collect();
        let mut combined: Skewness = left.iter().cloned().collect();
        let from_right: Skewness = right.iter().cloned().collect();
        combined.merge(&from_right);
        assert_eq!(expected.len(), combined.len());
        assert_eq!(expected.mean(), combined.mean());
        assert_eq!(expected.sample_variance(), combined.sample_variance());
        assert_eq!(expected.skewness(), combined.skewness());
    }
}
#[test]
fn exponential_distribution() {
    use rand::distributions::{Exp, IndependentSample};
    // Exp(lambda) has mean 1/lambda, standard deviation 1/lambda and
    // skewness 2, independent of lambda.
    let lambda = 2.0;
    let dist = Exp::new(lambda);
    let mut rng = ::rand::thread_rng();
    let mut a = Skewness::new();
    let mut remaining = 2_000_000;
    while remaining > 0 {
        a.add(dist.ind_sample(&mut rng));
        remaining -= 1;
    }
    assert_almost_eq!(a.mean(), 1./lambda, 1e-2);
    assert_almost_eq!(a.sample_variance().sqrt(), 1./lambda, 1e-2);
    assert_almost_eq!(a.population_variance().sqrt(), 1./lambda, 1e-2);
    assert_almost_eq!(a.error_mean(), 0.0, 1e-2);
    assert_almost_eq!(a.skewness(), 2.0, 1e-2);
}