Factor out calculation of average

Now it is possible to calculate the average without calculating the error.
2017-05-24 10:48:27 +02:00 · 2017-05-24 10:48:27 +02:00 · a95ab05c10
commit a95ab05c10
parent 962adb91d7
5 changed files with 335 additions and 126 deletions
--- a/src/average.rs
+++ b/src/average.rs
@ -2,6 +2,118 @@ use core;
 use conv::ApproxFrom;
 /// Estimate the arithmetic mean of a sequence of numbers ("population").
 ///
 /// Everything is calculated iteratively using constant memory, so the sequence
 /// of numbers can be an iterator. The used algorithms try to avoid numerical
 /// instabilities.
 ///
 ///
 /// ## Example
 ///
 /// ```
 /// use average::Average;
 ///
 /// let a: Average = (1..6).map(Into::into).collect();
 /// println!("The average is {}.", a.mean());
 /// ```
 #[derive(Debug, Clone)]
 pub struct Average {
    /// Average value.
    avg: f64,
    /// Sample size.
    n: u64,
 }
 impl Average {
    /// Create a new average estimator.
    pub fn new() -> Average {
        Average { avg: 0., n: 0 }
    }
    /// Add an element sampled from the population.
    #[inline]
    pub fn add(&mut self, sample: f64) {
        // This algorithm introduced by Welford in 1962 trades numerical
        // stability for a division inside the loop.
        //
        // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance.
        self.n += 1;
        let delta = sample - self.avg;
        self.avg += delta / f64::approx_from(self.n).unwrap();
    }
    /// Determine whether the samples are empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.n == 0
    }
    /// Estimate the mean of the population.
    #[inline]
    pub fn mean(&self) -> f64 {
        self.avg
    }
    /// Return the number of samples.
    #[inline]
    pub fn len(&self) -> u64 {
        self.n
    }
    /// Merge another sample into this one.
    ///
    ///
    /// ## Example
    ///
    /// ```
    /// use average::Average;
    ///
    /// let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.];
    /// let (left, right) = sequence.split_at(3);
    /// let avg_total: Average = sequence.iter().map(|x| *x).collect();
    /// let mut avg_left: Average = left.iter().map(|x| *x).collect();
    /// let avg_right: Average = right.iter().map(|x| *x).collect();
    /// avg_left.merge(&avg_right);
    /// assert_eq!(avg_total.mean(), avg_left.mean());
    /// ```
    #[inline]
    pub fn merge(&mut self, other: &Average) {
        // This algorithm was proposed by Chan et al. in 1979.
        //
        // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance.
        let len_self = f64::approx_from(self.n).unwrap();
        let len_other = f64::approx_from(other.n).unwrap();
        let len_total = len_self + len_other;
        self.n += other.n;
        self.avg = (len_self * self.avg + len_other * other.avg) / len_total;
        // Chan et al. use
        //
        //     self.avg += delta * len_other / len_total;
        //
        // instead but this results in cancelation if the number of samples are similar.
    }
 }
 impl core::default::Default for Average {
    fn default() -> Average {
        Average::new()
    }
 }
 impl core::iter::FromIterator<f64> for Average {
    fn from_iter<T>(iter: T) -> Average
        where T: IntoIterator<Item=f64>
    {
        let mut a = Average::new();
        for i in iter {
            a.add(i);
        }
        a
    }
 }
 /// Estimate the arithmetic mean and the variance of a sequence of numbers
 /// ("population").
 ///
@ -22,10 +134,8 @@ use conv::ApproxFrom;
 /// ```
 #[derive(Debug, Clone)]
 pub struct AverageWithError {
-    /// Average value.
+    /// Estimator of average.
-    avg: f64,
+    avg: Average,
    /// Number of samples.
    n: u64,
    /// Intermediate sum of squares for calculating the variance.
    v: f64,
 }
@ -33,7 +143,7 @@ pub struct AverageWithError {
 impl AverageWithError {
    /// Create a new average estimator.
    pub fn new() -> AverageWithError {
-        AverageWithError { avg: 0., n: 0, v: 0. }
+        AverageWithError { avg: Average::new(), v: 0. }
    }
    /// Add an element sampled from the population.
@ -43,53 +153,60 @@ impl AverageWithError {
        // stability for a division inside the loop.
        //
        // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance.
-        self.n += 1;
+        let delta = sample - self.avg.mean();
-        let delta = sample - self.avg;
+        self.avg.add(sample);
-        self.avg += delta / f64::approx_from(self.n).unwrap();
+        self.v += delta * (sample - self.avg.mean());
        self.v += delta * (sample - self.avg);
    }
    /// Determine whether the samples are empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
-        self.n == 0
+        self.avg.is_empty()
    }
    /// Estimate the mean of the population.
    #[inline]
    pub fn mean(&self) -> f64 {
-        self.avg
+        self.avg.mean()
    }
    /// Return the number of samples.
    #[inline]
    pub fn len(&self) -> u64 {
-        self.n
+        self.avg.len()
    }
    /// Calculate the sample variance.
    ///
    /// This is an unbiased estimator of the variance of the population.
    #[inline]
    pub fn sample_variance(&self) -> f64 {
-        if self.n < 2 {
+        if self.avg.len() < 2 {
            return 0.;
        }
-        self.v / f64::approx_from(self.n - 1).unwrap()
+        self.v / f64::approx_from(self.avg.len() - 1).unwrap()
    }
    /// Calculate the population variance of the sample.
    ///
    /// This is a biased estimator of the variance of the population.
    #[inline]
    pub fn population_variance(&self) -> f64 {
-        if self.n < 2 {
+        let n = self.avg.len();
        if n < 2 {
            return 0.;
        }
-        self.v / f64::approx_from(self.n).unwrap()
+        self.v / f64::approx_from(n).unwrap()
    }
    /// Estimate the standard error of the mean of the population.
    #[inline]
    pub fn error(&self) -> f64 {
-        if self.n == 0 {
+        let n = self.avg.len();
        if n == 0 {
            return 0.;
        }
-        (self.sample_variance() / f64::approx_from(self.n).unwrap()).sqrt()
+        (self.sample_variance() / f64::approx_from(n).unwrap()).sqrt()
    }
    /// Merge another sample into this one.
@ -109,21 +226,16 @@ impl AverageWithError {
    /// assert_eq!(avg_total.mean(), avg_left.mean());
    /// assert_eq!(avg_total.sample_variance(), avg_left.sample_variance());
    /// ```
    #[inline]
    pub fn merge(&mut self, other: &AverageWithError) {
        // This algorithm was proposed by Chan et al. in 1979.
        //
        // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance.
-        let delta = other.avg - self.avg;
+        let len_self = f64::approx_from(self.len()).unwrap();
-        let len_self = f64::approx_from(self.n).unwrap();
+        let len_other = f64::approx_from(other.len()).unwrap();
        let len_other = f64::approx_from(other.n).unwrap();
        let len_total = len_self + len_other;
-        self.n += other.n;
+        let delta = other.mean() - self.mean();
-        self.avg = (len_self * self.avg + len_other * other.avg) / len_total;
+        self.avg.merge(&other.avg);
        // Chan et al. use
        //
        //     self.avg += delta * len_other / len_total;
        //
        // instead but this results in cancelation if the number of samples are similar.
        self.v += other.v + delta*delta * len_self * len_other / len_total;
    }
 }
@ -145,23 +257,3 @@ impl core::iter::FromIterator<f64> for AverageWithError {
        a
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    #[test]
    fn merge() {
        let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.];
        for mid in 0..sequence.len() {
            let (left, right) = sequence.split_at(mid);
            let avg_total: AverageWithError = sequence.iter().map(|x| *x).collect();
            let mut avg_left: AverageWithError = left.iter().map(|x| *x).collect();
            let avg_right: AverageWithError = right.iter().map(|x| *x).collect();
            avg_left.merge(&avg_right);
            assert_eq!(avg_total.n, avg_left.n);
            assert_eq!(avg_total.avg, avg_left.avg);
            assert_eq!(avg_total.v, avg_left.v);
        }
    }
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@ -2,8 +2,8 @@
 //! sequence of numbers, and for their standard errors. The typical workflow
 //! looks like this:
 //!
-//! 1. Initialize your estimator of choice ([`AverageWithError`] or
+//! 1. Initialize your estimator of choice ([`Average`], [`AverageWithError`],
-//!    [`WeightedAverageWithError`]) with `new()`.
+//!    [`WeightedAverage`] or [`WeightedAverageWithError`]) with `new()`.
 //! 2. Add some subset (called "samples") of the sequence of numbers (called
 //!    "population") for which you want to estimate the average, using `add()`
 //!    or `collect()`.
@ -13,8 +13,11 @@
 //! You can run several estimators in parallel and merge them into one with
 //! `merge()`.
 //!
-//! [`AverageWithError`]: ./average/struct.Average.html
+//! [`Average`]: ./average/struct.Average.html
-//! [`WeightedAverageWithError`]: ./weighted_average/struct.WeightedAverage.html
+//! [`AverageWithError`]: ./average/struct.AverageWithError.html
 //! [`WeightedAverage`]: ./weighted_average/struct.WeightedAverage.html
 //! [`WeightedAverageWithError`]: ./weighted_average/struct.WeightedAverageWithError.html
 //!
 //!
 //! ## Example
 //!
@ -29,12 +32,10 @@
 #![no_std]
 extern crate conv;
 #[cfg(test)] extern crate rand;
 #[cfg(test)] #[macro_use] extern crate std;
 #[macro_use] mod macros;
 mod average;
 mod weighted_average;
-pub use average::AverageWithError;
+pub use average::{Average, AverageWithError};
-pub use weighted_average::WeightedAverageWithError;
+pub use weighted_average::{WeightedAverage, WeightedAverageWithError};
--- a/src/weighted_average.rs
+++ b/src/weighted_average.rs
@ -2,6 +2,117 @@ use core;
 use super::AverageWithError;
 /// Estimate the weighted and unweighted arithmetic mean of a sequence of
 /// numbers ("population").
 ///
 ///
 /// ## Example
 ///
 /// ```
 /// use average::WeightedAverage;
 ///
 /// let a: WeightedAverage = (1..6).zip(1..6)
 ///     .map(|(x, w)| (f64::from(x), f64::from(w))).collect();
 /// println!("The weighted average is {}.", a.mean());
 /// ```
 #[derive(Debug, Clone)]
 pub struct WeightedAverage  {
    /// Sum of the weights.
    weight_sum: f64,
    /// Weighted average value.
    weighted_avg: f64,
 }
 impl WeightedAverage {
    /// Create a new weighted and unweighted average estimator.
    pub fn new() -> WeightedAverage {
        WeightedAverage {
            weight_sum: 0., weighted_avg: 0.,
        }
    }
    /// Add a weighted element sampled from the population.
    #[inline]
    pub fn add(&mut self, sample: f64, weight: f64) {
        // The algorithm for the unweighted average was suggested by Welford in 1962.
        //
        // See
        // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
        // and
        // http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf.
        self.weight_sum += weight;
        let prev_avg = self.weighted_avg;
        self.weighted_avg = prev_avg + (weight / self.weight_sum) * (sample - prev_avg);
    }
    /// Determine whether the sample is empty.
    ///
    /// Might be a false positive if the sum of weights is zero.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.weight_sum == 0.
    }
    /// Return the sum of the weights.
    #[inline]
    pub fn sum_weights(&self) -> f64 {
        self.weight_sum
    }
    /// Estimate the weighted mean of the sequence.
    #[inline]
    pub fn mean(&self) -> f64 {
        self.weighted_avg
    }
    /// Merge another sample into this one.
    ///
    ///
    /// ## Example
    ///
    /// ```
    /// use average::WeightedAverage;
    ///
    /// let weighted_sequence: &[(f64, f64)] = &[
    ///     (1., 0.1), (2., 0.2), (3., 0.3), (4., 0.4), (5., 0.5),
    ///     (6., 0.6), (7., 0.7), (8., 0.8), (9., 0.9)];
    /// let (left, right) = weighted_sequence.split_at(3);
    /// let avg_total: WeightedAverage = weighted_sequence.iter().map(|&x| x).collect();
    /// let mut avg_left: WeightedAverage = left.iter().map(|&x| x).collect();
    /// let avg_right: WeightedAverage = right.iter().map(|&x| x).collect();
    /// avg_left.merge(&avg_right);
    /// assert!((avg_total.mean() - avg_left.mean()).abs() < 1e-15);
    /// ```
    #[inline]
    pub fn merge(&mut self, other: &WeightedAverage) {
        let total_weight_sum = self.weight_sum + other.weight_sum;
        self.weighted_avg = (self.weight_sum * self.weighted_avg
                             + other.weight_sum * other.weighted_avg)
                            / total_weight_sum;
        self.weight_sum = total_weight_sum;
    }
 }
 impl core::default::Default for WeightedAverage {
    fn default() -> WeightedAverage {
        WeightedAverage::new()
    }
 }
 impl core::iter::FromIterator<(f64, f64)> for WeightedAverage {
    fn from_iter<T>(iter: T) -> WeightedAverage
        where T: IntoIterator<Item=(f64, f64)>
    {
        let mut a = WeightedAverage::new();
        for (i, w) in iter {
            a.add(i, w);
        }
        a
    }
 }
 /// Estimate the weighted and unweighted arithmetic mean and the unweighted
 /// variance of a sequence of numbers ("population").
 ///
@ -19,13 +130,10 @@ use super::AverageWithError;
 /// ```
 #[derive(Debug, Clone)]
 pub struct WeightedAverageWithError {
    /// Sum of the weights.
    weight_sum: f64,
    /// Sum of the squares of the weights.
    weight_sum_sq: f64,
-    /// Weighted average value.
+    /// Estimator of the weighted average.
-    weighted_avg: f64,
+    weighted_avg: WeightedAverage,
    /// Estimator of unweighted average and its variance.
    unweighted_avg: AverageWithError,
 }
@ -34,7 +142,8 @@ impl WeightedAverageWithError {
    /// Create a new weighted and unweighted average estimator.
    pub fn new() -> WeightedAverageWithError {
        WeightedAverageWithError {
-            weight_sum: 0., weight_sum_sq: 0., weighted_avg: 0.,
+            weight_sum_sq: 0.,
            weighted_avg: WeightedAverage::new(),
            unweighted_avg: AverageWithError::new(),
        }
    }
@ -49,51 +158,55 @@ impl WeightedAverageWithError {
        // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
        // and
        // http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf.
        self.weight_sum += weight;
        self.weight_sum_sq += weight*weight;
-
+        self.weighted_avg.add(sample, weight);
        let prev_avg = self.weighted_avg;
        self.weighted_avg = prev_avg + (weight / self.weight_sum) * (sample - prev_avg);
        self.unweighted_avg.add(sample);
    }
    /// Determine whether the sample is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.unweighted_avg.is_empty()
    }
    /// Return the sum of the weights.
    #[inline]
    pub fn sum_weights(&self) -> f64 {
-        self.weight_sum
+        self.weighted_avg.sum_weights()
    }
    /// Return the sum of the squared weights.
    #[inline]
    pub fn sum_weights_sq(&self) -> f64 {
        self.weight_sum_sq
    }
    /// Estimate the weighted mean of the sequence.
    #[inline]
    pub fn weighted_mean(&self) -> f64 {
-        self.weighted_avg
+        self.weighted_avg.mean()
    }
    /// Estimate the unweighted mean of the sequence.
    #[inline]
    pub fn unweighted_mean(&self) -> f64 {
        self.unweighted_avg.mean()
    }
    /// Return sample size.
    #[inline]
    pub fn len(&self) -> u64 {
        self.unweighted_avg.len()
    }
    /// Calculate the effective sample size.
    #[inline]
    pub fn effective_len(&self) -> f64 {
        if self.is_empty() {
            return 0.
        }
-        self.weight_sum * self.weight_sum / self.weight_sum_sq
+        let weight_sum = self.weighted_avg.sum_weights();
        weight_sum * weight_sum / self.weight_sum_sq
    }
    /// Calculate the *unweighted* population variance of the sample.
@ -121,10 +234,11 @@ impl WeightedAverageWithError {
        // results than the ones used by SPSS or Mentor.
        //
        // See http://www.analyticalgroup.com/download/WEIGHTED_VARIANCE.pdf.
-        if self.weight_sum == 0. {
+        let weight_sum = self.weighted_avg.sum_weights();
        if weight_sum == 0. {
            return 0.;
        }
-        let inv_effective_len = self.weight_sum_sq / (self.weight_sum * self.weight_sum);
+        let inv_effective_len = self.weight_sum_sq / (weight_sum * weight_sum);
        (self.sample_variance() * inv_effective_len).sqrt()
    }
@ -148,13 +262,8 @@ impl WeightedAverageWithError {
    /// assert!((avg_total.error() - avg_left.error()).abs() < 1e-15);
    /// ```
    pub fn merge(&mut self, other: &WeightedAverageWithError) {
        let total_weight_sum = self.weight_sum + other.weight_sum;
        self.weighted_avg = (self.weight_sum * self.weighted_avg
                             + other.weight_sum * other.weighted_avg)
                            / total_weight_sum;
        self.weight_sum = total_weight_sum;
        self.weight_sum_sq += other.weight_sum_sq;
-
+        self.weighted_avg.merge(&other.weighted_avg);
        self.unweighted_avg.merge(&other.unweighted_avg);
    }
 }
@ -176,51 +285,3 @@ impl core::iter::FromIterator<(f64, f64)> for WeightedAverageWithError {
        a
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    #[test]
    fn merge_unweighted() {
        let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.];
        for mid in 0..sequence.len() {
            let (left, right) = sequence.split_at(mid);
            let avg_total: WeightedAverageWithError = sequence.iter().map(|x| (*x, 1.)).collect();
            let mut avg_left: WeightedAverageWithError = left.iter().map(|x| (*x, 1.)).collect();
            let avg_right: WeightedAverageWithError = right.iter().map(|x| (*x, 1.)).collect();
            avg_left.merge(&avg_right);
            assert_eq!(avg_total.weight_sum, avg_left.weight_sum);
            assert_eq!(avg_total.weight_sum_sq, avg_left.weight_sum_sq);
            assert_eq!(avg_total.weighted_avg, avg_left.weighted_avg);
            assert_eq!(avg_total.unweighted_avg.len(), avg_left.unweighted_avg.len());
            assert_eq!(avg_total.unweighted_avg.mean(), avg_left.unweighted_avg.mean());
            assert_eq!(avg_total.unweighted_avg.sample_variance(),
                       avg_left.unweighted_avg.sample_variance());
        }
    }
    #[test]
    fn merge_weighted() {
        let sequence: &[(f64, f64)] = &[
            (1., 0.1), (2., 0.2), (3., 0.3), (4., 0.4), (5., 0.5),
            (6., 0.6), (7., 0.7), (8., 0.8), (9., 0.)];
        for mid in 0..sequence.len() {
            let (left, right) = sequence.split_at(mid);
            let avg_total: WeightedAverageWithError = sequence.iter().map(|&(x, w)| (x, w)).collect();
            let mut avg_left: WeightedAverageWithError = left.iter().map(|&(x, w)| (x, w)).collect();
            let avg_right: WeightedAverageWithError = right.iter().map(|&(x, w)| (x, w)).collect();
            avg_left.merge(&avg_right);
            assert_eq!(avg_total.unweighted_avg.len(), avg_left.unweighted_avg.len());
            assert_almost_eq!(avg_total.weight_sum, avg_left.weight_sum, 1e-15);
            assert_eq!(avg_total.weight_sum_sq, avg_left.weight_sum_sq);
            assert_almost_eq!(avg_total.weighted_avg, avg_left.weighted_avg, 1e-15);
            assert_almost_eq!(avg_total.unweighted_avg.mean(),
                              avg_left.unweighted_avg.mean(), 1e-15);
            assert_almost_eq!(avg_total.unweighted_avg.sample_variance(),
                              avg_left.unweighted_avg.sample_variance(), 1e-14);
        }
    }
 }
--- a/tests/average.rs
+++ b/tests/average.rs
@ -44,6 +44,21 @@ fn numerically_unstable() {
    assert_eq!(a.sample_variance(), 30.);
 }
 #[test]
 fn merge() {
    let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.];
    for mid in 0..sequence.len() {
        let (left, right) = sequence.split_at(mid);
        let avg_total: AverageWithError = sequence.iter().map(|x| *x).collect();
        let mut avg_left: AverageWithError = left.iter().map(|x| *x).collect();
        let avg_right: AverageWithError = right.iter().map(|x| *x).collect();
        avg_left.merge(&avg_right);
        assert_eq!(avg_total.len(), avg_left.len());
        assert_eq!(avg_total.mean(), avg_left.mean());
        assert_eq!(avg_total.sample_variance(), avg_left.sample_variance());
    }
 }
 #[test]
 fn normal_distribution() {
    use rand::distributions::{Normal, IndependentSample};
--- a/tests/weighted_average.rs
+++ b/tests/weighted_average.rs
@ -64,3 +64,43 @@ fn error_corner_case() {
        .map(|(x, w)| (*x, *w)).collect();
    assert_eq!(a.error(), 0.5);
 }
 #[test]
 fn merge_unweighted() {
    let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.];
    for mid in 0..sequence.len() {
        let (left, right) = sequence.split_at(mid);
        let avg_total: WeightedAverageWithError = sequence.iter().map(|x| (*x, 1.)).collect();
        let mut avg_left: WeightedAverageWithError = left.iter().map(|x| (*x, 1.)).collect();
        let avg_right: WeightedAverageWithError = right.iter().map(|x| (*x, 1.)).collect();
        avg_left.merge(&avg_right);
        assert_eq!(avg_total.sum_weights(), avg_left.sum_weights());
        assert_eq!(avg_total.sum_weights_sq(), avg_left.sum_weights_sq());
        assert_eq!(avg_total.len(), avg_left.len());
        assert_eq!(avg_total.unweighted_mean(), avg_left.unweighted_mean());
        assert_eq!(avg_total.weighted_mean(), avg_left.weighted_mean());
        assert_eq!(avg_total.sample_variance(), avg_left.sample_variance());
    }
 }
 #[test]
 fn merge_weighted() {
    let sequence: &[(f64, f64)] = &[
        (1., 0.1), (2., 0.2), (3., 0.3), (4., 0.4), (5., 0.5),
        (6., 0.6), (7., 0.7), (8., 0.8), (9., 0.)];
    for mid in 0..sequence.len() {
        let (left, right) = sequence.split_at(mid);
        let avg_total: WeightedAverageWithError = sequence.iter().map(|&(x, w)| (x, w)).collect();
        let mut avg_left: WeightedAverageWithError = left.iter().map(|&(x, w)| (x, w)).collect();
        let avg_right: WeightedAverageWithError = right.iter().map(|&(x, w)| (x, w)).collect();
        avg_left.merge(&avg_right);
        assert_eq!(avg_total.len(), avg_left.len());
        assert_almost_eq!(avg_total.sum_weights(), avg_left.sum_weights(), 1e-15);
        assert_eq!(avg_total.sum_weights_sq(), avg_left.sum_weights_sq());
        assert_almost_eq!(avg_total.weighted_mean(), avg_left.weighted_mean(), 1e-15);
        assert_almost_eq!(avg_total.unweighted_mean(), avg_left.unweighted_mean(), 1e-15);
        assert_almost_eq!(avg_total.sample_variance(), avg_left.sample_variance(), 1e-14);
    }
 }