From 4f5ddbd6e977cd48bc38c40f8fdc1d111c9aad7d Mon Sep 17 00:00:00 2001
From: Vinzent Steinberg
Date: Sun, 28 May 2017 22:37:48 +0200
Subject: [PATCH] Implement skewness and add missing inline annotations

---
 src/lib.rs              |   2 +-
 src/moments/mean.rs     |   1 +
 src/moments/mod.rs      |   1 +
 src/moments/skewness.rs | 134 ++++++++++++++++++++++++++++++++++++++++
 src/moments/variance.rs |   4 +-
 tests/skewness.rs       |  73 ++++++++++++++++++++++
 6 files changed, 213 insertions(+), 2 deletions(-)
 create mode 100644 src/moments/skewness.rs
 create mode 100644 tests/skewness.rs

diff --git a/src/lib.rs b/src/lib.rs
index b276fb3..20fb090 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -45,7 +45,7 @@ mod minmax;
 mod reduce;
 mod quantile;
 
-pub use moments::{Mean, Variance, MeanWithError};
+pub use moments::{Mean, Variance, Skewness, MeanWithError};
 pub use weighted_mean::{WeightedMean, WeightedMeanWithError};
 pub use minmax::{Min, Max};
 pub use quantile::Quantile;
diff --git a/src/moments/mean.rs b/src/moments/mean.rs
index 4213321..355c715 100644
--- a/src/moments/mean.rs
+++ b/src/moments/mean.rs
@@ -51,6 +51,7 @@ impl Mean {
     /// size was already updated.
     ///
     /// This is useful for avoiding unnecessary divisions in the inner loop.
+    #[inline]
     fn add_inner(&mut self, delta_n: f64) {
         // This algorithm introduced by Welford in 1962 trades numerical
         // stability for a division inside the loop.
diff --git a/src/moments/mod.rs b/src/moments/mod.rs
index 32914b3..5e7aa87 100644
--- a/src/moments/mod.rs
+++ b/src/moments/mod.rs
@@ -1,4 +1,5 @@
 include!("mean.rs");
 include!("variance.rs");
+include!("skewness.rs");
 
 pub type MeanWithError = Variance;
diff --git a/src/moments/skewness.rs b/src/moments/skewness.rs
new file mode 100644
index 0000000..3245231
--- /dev/null
+++ b/src/moments/skewness.rs
@@ -0,0 +1,134 @@
+/// Estimate the arithmetic mean, the variance and the skewness of a sequence of
+/// numbers ("population").
+///
+/// This can be used to estimate the standard error of the mean.
+#[derive(Debug, Clone)]
+pub struct Skewness {
+    /// Estimator of mean and variance.
+    avg: MeanWithError,
+    /// Intermediate sum of cubes for calculating the skewness.
+    sum_3: f64,
+}
+
+impl Skewness {
+    /// Create a new skewness estimator.
+    #[inline]
+    pub fn new() -> Skewness {
+        Skewness {
+            avg: MeanWithError::new(),
+            sum_3: 0.,
+        }
+    }
+
+    /// Add an observation sampled from the population.
+    #[inline]
+    pub fn add(&mut self, x: f64) {
+        let delta = x - self.mean();
+        self.increment();
+        let n = f64::approx_from(self.len()).unwrap();
+        self.add_inner(delta, delta/n);
+    }
+
+    /// Increment the sample size.
+    ///
+    /// This does not update anything else.
+    #[inline]
+    fn increment(&mut self) {
+        self.avg.increment();
+    }
+
+    /// Add an observation given an already calculated difference from the mean
+    /// divided by the number of samples, assuming the inner count of the sample
+    /// size was already updated.
+    ///
+    /// This is useful for avoiding unnecessary divisions in the inner loop.
+    #[inline]
+    fn add_inner(&mut self, delta: f64, delta_n: f64) {
+        // This algorithm was suggested by Terriberry.
+        //
+        // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance.
+        let n = f64::approx_from(self.len()).unwrap();
+        self.sum_3 += delta*delta_n*delta_n * (n - 1.)*(n - 2.)
+            - 3.*delta_n * self.avg.sum_2;
+        self.avg.add_inner(delta_n);
+    }
+
+    /// Determine whether the sample is empty.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.avg.is_empty()
+    }
+
+    /// Estimate the mean of the population.
+    ///
+    /// Returns 0 for an empty sample.
+    #[inline]
+    pub fn mean(&self) -> f64 {
+        self.avg.mean()
+    }
+
+    /// Return the sample size.
+    #[inline]
+    pub fn len(&self) -> u64 {
+        self.avg.len()
+    }
+
+    /// Calculate the sample variance.
+    ///
+    /// This is an unbiased estimator of the variance of the population.
+    #[inline]
+    pub fn sample_variance(&self) -> f64 {
+        self.avg.sample_variance()
+    }
+
+    /// Calculate the population variance of the sample.
+    ///
+    /// This is a biased estimator of the variance of the population.
+    #[inline]
+    pub fn population_variance(&self) -> f64 {
+        self.avg.population_variance()
+    }
+
+    /// Estimate the standard error of the mean of the population.
+    #[inline]
+    pub fn error_mean(&self) -> f64 {
+        self.avg.error()
+    }
+
+    /// Estimate the skewness of the population.
+    #[inline]
+    pub fn skewness(&self) -> f64 {
+        if self.sum_3 == 0. {
+            return 0.;
+        }
+        let n = f64::approx_from(self.len()).unwrap();
+        let sum_2 = self.avg.sum_2;
+        debug_assert!(sum_2 != 0.);
+        n.sqrt() * self.sum_3 / (sum_2*sum_2*sum_2).sqrt()
+    }
+
+    /// Merge another sample into this one.
+    #[inline]
+    pub fn merge(&mut self, other: &Skewness) {
+        let len_self = f64::approx_from(self.len()).unwrap();
+        let len_other = f64::approx_from(other.len()).unwrap();
+        let len_total = len_self + len_other;
+        let delta = other.mean() - self.mean();
+        self.sum_3 += other.sum_3
+            + delta*delta*delta * len_self*len_other*(len_self - len_other) / (len_total*len_total)
+            + 3.*delta * (len_self*other.avg.sum_2 - len_other*self.avg.sum_2) / len_total;
+        self.avg.merge(&other.avg);
+    }
+}
+
+impl core::iter::FromIterator<f64> for Skewness {
+    fn from_iter<T>(iter: T) -> Skewness
+        where T: IntoIterator<Item=f64>
+    {
+        let mut a = Skewness::new();
+        for i in iter {
+            a.add(i);
+        }
+        a
+    }
+}
diff --git a/src/moments/variance.rs b/src/moments/variance.rs
index 835f009..a28eee1 100644
--- a/src/moments/variance.rs
+++ b/src/moments/variance.rs
@@ -21,7 +21,8 @@ pub struct Variance {
 }
 
 impl Variance {
-    /// Create a new average estimator.
+    /// Create a new variance estimator.
+    #[inline]
     pub fn new() -> Variance {
         Variance { avg: Mean::new(), sum_2: 0. }
     }
@@ -48,6 +49,7 @@ impl Variance {
     /// size was already updated.
     ///
    /// This is useful for avoiding unnecessary divisions in the inner loop.
+    #[inline]
     fn add_inner(&mut self, delta_n: f64) {
         // This algorithm introduced by Welford in 1962 trades numerical
         // stability for a division inside the loop.
diff --git a/tests/skewness.rs b/tests/skewness.rs
new file mode 100644
index 0000000..80d41b2
--- /dev/null
+++ b/tests/skewness.rs
@@ -0,0 +1,73 @@
+#[macro_use] extern crate average;
+
+extern crate core;
+
+extern crate rand;
+
+use core::iter::Iterator;
+
+use average::Skewness;
+
+#[test]
+fn trivial() {
+    let mut a = Skewness::new();
+    assert_eq!(a.len(), 0);
+    a.add(1.0);
+    assert_eq!(a.mean(), 1.0);
+    assert_eq!(a.len(), 1);
+    assert_eq!(a.sample_variance(), 0.0);
+    assert_eq!(a.population_variance(), 0.0);
+    assert_eq!(a.error_mean(), 0.0);
+    assert_eq!(a.skewness(), 0.0);
+    a.add(1.0);
+    assert_eq!(a.mean(), 1.0);
+    assert_eq!(a.len(), 2);
+    assert_eq!(a.sample_variance(), 0.0);
+    assert_eq!(a.population_variance(), 0.0);
+    assert_eq!(a.error_mean(), 0.0);
+    assert_eq!(a.skewness(), 0.0);
+}
+
+#[test]
+fn simple() {
+    let mut a: Skewness = (1..6).map(f64::from).collect();
+    assert_eq!(a.mean(), 3.0);
+    assert_eq!(a.len(), 5);
+    assert_eq!(a.sample_variance(), 2.5);
+    assert_almost_eq!(a.error_mean(), f64::sqrt(0.5), 1e-16);
+    assert_eq!(a.skewness(), 0.0);
+    a.add(1.0);
+    assert_almost_eq!(a.skewness(), 0.2795084971874741, 1e-15);
+}
+
+#[test]
+fn merge() {
+    let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.];
+    for mid in 0..sequence.len() {
+        let (left, right) = sequence.split_at(mid);
+        let avg_total: Skewness = sequence.iter().map(|x| *x).collect();
+        let mut avg_left: Skewness = left.iter().map(|x| *x).collect();
+        let avg_right: Skewness = right.iter().map(|x| *x).collect();
+        avg_left.merge(&avg_right);
+        assert_eq!(avg_total.len(), avg_left.len());
+        assert_eq!(avg_total.mean(), avg_left.mean());
+        assert_eq!(avg_total.sample_variance(), avg_left.sample_variance());
+        assert_eq!(avg_total.skewness(), avg_left.skewness());
+    }
+}
+
+#[test]
+fn exponential_distribution() {
+    use rand::distributions::{Exp, IndependentSample};
+    let lambda = 2.0;
+    let normal = Exp::new(lambda);
+    let mut a = Skewness::new();
+    for _ in 0..2_000_000 {
+        a.add(normal.ind_sample(&mut ::rand::thread_rng()));
+    }
+    assert_almost_eq!(a.mean(), 1./lambda, 1e-2);
+    assert_almost_eq!(a.sample_variance().sqrt(), 1./lambda, 1e-2);
+    assert_almost_eq!(a.population_variance().sqrt(), 1./lambda, 1e-2);
+    assert_almost_eq!(a.error_mean(), 0.0, 1e-2);
+    assert_almost_eq!(a.skewness(), 2.0, 1e-2);
+}
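
Note on the update formula: `Skewness::add_inner` above keeps the running centered sums sum_2 = Σ(x_i - mean)^2 (stored inside `MeanWithError`) and sum_3 = Σ(x_i - mean)^3, and `skewness()` reports sqrt(n) * sum_3 / sum_2^(3/2). The following standalone sketch is not part of the patch (all names are local to the example); it mirrors the one-pass recurrence and checks it against a naive two-pass computation on the data used in the `simple` test:

// Sanity check for the one-pass third-moment update used in `add_inner`.
// `sum_2` follows Welford's update; `sum_3` follows the Terriberry-style
// recurrence from the patch (applied with the *old* `sum_2`, as `add_inner` does).
fn main() {
    let data = [1.0_f64, 2.0, 3.0, 4.0, 5.0, 1.0];

    let (mut n, mut mean, mut sum_2, mut sum_3) = (0.0_f64, 0.0_f64, 0.0_f64, 0.0_f64);
    for &x in &data {
        let delta = x - mean;
        n += 1.;
        let delta_n = delta / n;
        // Same expression as in `Skewness::add_inner`, using the old `sum_2`.
        sum_3 += delta * delta_n * delta_n * (n - 1.) * (n - 2.) - 3. * delta_n * sum_2;
        // Welford's update of the centered sum of squares.
        sum_2 += delta * delta_n * (n - 1.);
        mean += delta_n;
    }
    let one_pass = n.sqrt() * sum_3 / (sum_2 * sum_2 * sum_2).sqrt();

    // Two-pass reference: g1 = sqrt(n) * m3 / m2^(3/2) with explicitly centered sums.
    let m = data.iter().sum::<f64>() / n;
    let m2: f64 = data.iter().map(|&x| (x - m).powi(2)).sum();
    let m3: f64 = data.iter().map(|&x| (x - m).powi(3)).sum();
    let two_pass = n.sqrt() * m3 / (m2 * m2 * m2).sqrt();

    assert!((one_pass - two_pass).abs() < 1e-12);
    println!("skewness = {}", one_pass); // ~0.2795, the value asserted in the `simple` test
}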
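
And a minimal usage sketch of the API this patch adds (assuming the patch is applied and the crate is used as `average`; the input values are arbitrary):

extern crate average;

use average::Skewness;

fn main() {
    // Build an estimator from an iterator of f64, then keep adding observations.
    let mut a: Skewness = (1..6).map(f64::from).collect();
    a.add(1.0);
    println!("mean = {}", a.mean());
    println!("sample variance = {}", a.sample_variance());
    println!("skewness = {}", a.skewness());

    // Estimators built over disjoint chunks of the data can be combined.
    let b: Skewness = [7., 8., 9.].iter().cloned().collect();
    a.merge(&b);
    println!("skewness after merge = {}", a.skewness());
}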