From a95ab05c10c729a8a0d607cebe3860222f996158 Mon Sep 17 00:00:00 2001 From: Vinzent Steinberg Date: Wed, 24 May 2017 10:48:27 +0200 Subject: [PATCH] Factor out calculation of average Now it is possible to calculate the average without calculating the error. --- src/average.rs | 188 ++++++++++++++++++++++++++--------- src/lib.rs | 17 ++-- src/weighted_average.rs | 201 +++++++++++++++++++++++++------------- tests/average.rs | 15 +++ tests/weighted_average.rs | 40 ++++++++ 5 files changed, 335 insertions(+), 126 deletions(-) diff --git a/src/average.rs b/src/average.rs index e00dda9..f83acfe 100644 --- a/src/average.rs +++ b/src/average.rs @@ -2,6 +2,118 @@ use core; use conv::ApproxFrom; + +/// Estimate the arithmetic mean of a sequence of numbers ("population"). +/// +/// Everything is calculated iteratively using constant memory, so the sequence +/// of numbers can be an iterator. The used algorithms try to avoid numerical +/// instabilities. +/// +/// +/// ## Example +/// +/// ``` +/// use average::Average; +/// +/// let a: Average = (1..6).map(Into::into).collect(); +/// println!("The average is {}.", a.mean()); +/// ``` +#[derive(Debug, Clone)] +pub struct Average { + /// Average value. + avg: f64, + /// Sample size. + n: u64, +} + +impl Average { + /// Create a new average estimator. + pub fn new() -> Average { + Average { avg: 0., n: 0 } + } + + /// Add an element sampled from the population. + #[inline] + pub fn add(&mut self, sample: f64) { + // This algorithm introduced by Welford in 1962 trades numerical + // stability for a division inside the loop. + // + // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance. + self.n += 1; + let delta = sample - self.avg; + self.avg += delta / f64::approx_from(self.n).unwrap(); + } + + /// Determine whether the samples are empty. + #[inline] + pub fn is_empty(&self) -> bool { + self.n == 0 + } + + /// Estimate the mean of the population. + #[inline] + pub fn mean(&self) -> f64 { + self.avg + } + + /// Return the number of samples. + #[inline] + pub fn len(&self) -> u64 { + self.n + } + + /// Merge another sample into this one. + /// + /// + /// ## Example + /// + /// ``` + /// use average::Average; + /// + /// let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.]; + /// let (left, right) = sequence.split_at(3); + /// let avg_total: Average = sequence.iter().map(|x| *x).collect(); + /// let mut avg_left: Average = left.iter().map(|x| *x).collect(); + /// let avg_right: Average = right.iter().map(|x| *x).collect(); + /// avg_left.merge(&avg_right); + /// assert_eq!(avg_total.mean(), avg_left.mean()); + /// ``` + #[inline] + pub fn merge(&mut self, other: &Average) { + // This algorithm was proposed by Chan et al. in 1979. + // + // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance. + let len_self = f64::approx_from(self.n).unwrap(); + let len_other = f64::approx_from(other.n).unwrap(); + let len_total = len_self + len_other; + self.n += other.n; + self.avg = (len_self * self.avg + len_other * other.avg) / len_total; + // Chan et al. use + // + // self.avg += delta * len_other / len_total; + // + // instead but this results in cancelation if the number of samples are similar. + } +} + +impl core::default::Default for Average { + fn default() -> Average { + Average::new() + } +} + +impl core::iter::FromIterator for Average { + fn from_iter(iter: T) -> Average + where T: IntoIterator + { + let mut a = Average::new(); + for i in iter { + a.add(i); + } + a + } +} + /// Estimate the arithmetic mean and the variance of a sequence of numbers /// ("population"). /// @@ -22,10 +134,8 @@ use conv::ApproxFrom; /// ``` #[derive(Debug, Clone)] pub struct AverageWithError { - /// Average value. - avg: f64, - /// Number of samples. - n: u64, + /// Estimator of average. + avg: Average, /// Intermediate sum of squares for calculating the variance. v: f64, } @@ -33,7 +143,7 @@ pub struct AverageWithError { impl AverageWithError { /// Create a new average estimator. pub fn new() -> AverageWithError { - AverageWithError { avg: 0., n: 0, v: 0. } + AverageWithError { avg: Average::new(), v: 0. } } /// Add an element sampled from the population. @@ -43,53 +153,60 @@ impl AverageWithError { // stability for a division inside the loop. // // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance. - self.n += 1; - let delta = sample - self.avg; - self.avg += delta / f64::approx_from(self.n).unwrap(); - self.v += delta * (sample - self.avg); + let delta = sample - self.avg.mean(); + self.avg.add(sample); + self.v += delta * (sample - self.avg.mean()); } /// Determine whether the samples are empty. + #[inline] pub fn is_empty(&self) -> bool { - self.n == 0 + self.avg.is_empty() } /// Estimate the mean of the population. + #[inline] pub fn mean(&self) -> f64 { - self.avg + self.avg.mean() } /// Return the number of samples. + #[inline] pub fn len(&self) -> u64 { - self.n + self.avg.len() } /// Calculate the sample variance. /// /// This is an unbiased estimator of the variance of the population. + #[inline] pub fn sample_variance(&self) -> f64 { - if self.n < 2 { + if self.avg.len() < 2 { return 0.; } - self.v / f64::approx_from(self.n - 1).unwrap() + self.v / f64::approx_from(self.avg.len() - 1).unwrap() } /// Calculate the population variance of the sample. /// /// This is a biased estimator of the variance of the population. + #[inline] pub fn population_variance(&self) -> f64 { - if self.n < 2 { + let n = self.avg.len(); + if n < 2 { return 0.; } - self.v / f64::approx_from(self.n).unwrap() + self.v / f64::approx_from(n).unwrap() } /// Estimate the standard error of the mean of the population. + #[inline] pub fn error(&self) -> f64 { - if self.n == 0 { + let n = self.avg.len(); + if n == 0 { return 0.; } - (self.sample_variance() / f64::approx_from(self.n).unwrap()).sqrt() + (self.sample_variance() / f64::approx_from(n).unwrap()).sqrt() } /// Merge another sample into this one. @@ -109,21 +226,16 @@ impl AverageWithError { /// assert_eq!(avg_total.mean(), avg_left.mean()); /// assert_eq!(avg_total.sample_variance(), avg_left.sample_variance()); /// ``` + #[inline] pub fn merge(&mut self, other: &AverageWithError) { // This algorithm was proposed by Chan et al. in 1979. // // See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance. - let delta = other.avg - self.avg; - let len_self = f64::approx_from(self.n).unwrap(); - let len_other = f64::approx_from(other.n).unwrap(); + let len_self = f64::approx_from(self.len()).unwrap(); + let len_other = f64::approx_from(other.len()).unwrap(); let len_total = len_self + len_other; - self.n += other.n; - self.avg = (len_self * self.avg + len_other * other.avg) / len_total; - // Chan et al. use - // - // self.avg += delta * len_other / len_total; - // - // instead but this results in cancelation if the number of samples are similar. + let delta = other.mean() - self.mean(); + self.avg.merge(&other.avg); self.v += other.v + delta*delta * len_self * len_other / len_total; } } @@ -145,23 +257,3 @@ impl core::iter::FromIterator for AverageWithError { a } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn merge() { - let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.]; - for mid in 0..sequence.len() { - let (left, right) = sequence.split_at(mid); - let avg_total: AverageWithError = sequence.iter().map(|x| *x).collect(); - let mut avg_left: AverageWithError = left.iter().map(|x| *x).collect(); - let avg_right: AverageWithError = right.iter().map(|x| *x).collect(); - avg_left.merge(&avg_right); - assert_eq!(avg_total.n, avg_left.n); - assert_eq!(avg_total.avg, avg_left.avg); - assert_eq!(avg_total.v, avg_left.v); - } - } -} diff --git a/src/lib.rs b/src/lib.rs index 9e6a51d..b0f6fbc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,8 +2,8 @@ //! sequence of numbers, and for their standard errors. The typical workflow //! looks like this: //! -//! 1. Initialize your estimator of choice ([`AverageWithError`] or -//! [`WeightedAverageWithError`]) with `new()`. +//! 1. Initialize your estimator of choice ([`Average`], [`AverageWithError`], +//! [`WeightedAverage`] or [`WeightedAverageWithError`]) with `new()`. //! 2. Add some subset (called "samples") of the sequence of numbers (called //! "population") for which you want to estimate the average, using `add()` //! or `collect()`. @@ -13,8 +13,11 @@ //! You can run several estimators in parallel and merge them into one with //! `merge()`. //! -//! [`AverageWithError`]: ./average/struct.Average.html -//! [`WeightedAverageWithError`]: ./weighted_average/struct.WeightedAverage.html +//! [`Average`]: ./average/struct.Average.html +//! [`AverageWithError`]: ./average/struct.AverageWithError.html +//! [`WeightedAverage`]: ./weighted_average/struct.WeightedAverage.html +//! [`WeightedAverageWithError`]: ./weighted_average/struct.WeightedAverageWithError.html +//! //! //! ## Example //! @@ -29,12 +32,10 @@ #![no_std] extern crate conv; -#[cfg(test)] extern crate rand; -#[cfg(test)] #[macro_use] extern crate std; #[macro_use] mod macros; mod average; mod weighted_average; -pub use average::AverageWithError; -pub use weighted_average::WeightedAverageWithError; +pub use average::{Average, AverageWithError}; +pub use weighted_average::{WeightedAverage, WeightedAverageWithError}; diff --git a/src/weighted_average.rs b/src/weighted_average.rs index f2e7bca..56a2ecb 100644 --- a/src/weighted_average.rs +++ b/src/weighted_average.rs @@ -2,6 +2,117 @@ use core; use super::AverageWithError; + +/// Estimate the weighted and unweighted arithmetic mean of a sequence of +/// numbers ("population"). +/// +/// +/// ## Example +/// +/// ``` +/// use average::WeightedAverage; +/// +/// let a: WeightedAverage = (1..6).zip(1..6) +/// .map(|(x, w)| (f64::from(x), f64::from(w))).collect(); +/// println!("The weighted average is {}.", a.mean()); +/// ``` +#[derive(Debug, Clone)] +pub struct WeightedAverage { + /// Sum of the weights. + weight_sum: f64, + /// Weighted average value. + weighted_avg: f64, +} + +impl WeightedAverage { + /// Create a new weighted and unweighted average estimator. + pub fn new() -> WeightedAverage { + WeightedAverage { + weight_sum: 0., weighted_avg: 0., + } + } + + /// Add a weighted element sampled from the population. + #[inline] + pub fn add(&mut self, sample: f64, weight: f64) { + // The algorithm for the unweighted average was suggested by Welford in 1962. + // + // See + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + // and + // http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf. + self.weight_sum += weight; + + let prev_avg = self.weighted_avg; + self.weighted_avg = prev_avg + (weight / self.weight_sum) * (sample - prev_avg); + } + + /// Determine whether the sample is empty. + /// + /// Might be a false positive if the sum of weights is zero. + #[inline] + pub fn is_empty(&self) -> bool { + self.weight_sum == 0. + } + + /// Return the sum of the weights. + #[inline] + pub fn sum_weights(&self) -> f64 { + self.weight_sum + } + + /// Estimate the weighted mean of the sequence. + #[inline] + pub fn mean(&self) -> f64 { + self.weighted_avg + } + + /// Merge another sample into this one. + /// + /// + /// ## Example + /// + /// ``` + /// use average::WeightedAverage; + /// + /// let weighted_sequence: &[(f64, f64)] = &[ + /// (1., 0.1), (2., 0.2), (3., 0.3), (4., 0.4), (5., 0.5), + /// (6., 0.6), (7., 0.7), (8., 0.8), (9., 0.9)]; + /// let (left, right) = weighted_sequence.split_at(3); + /// let avg_total: WeightedAverage = weighted_sequence.iter().map(|&x| x).collect(); + /// let mut avg_left: WeightedAverage = left.iter().map(|&x| x).collect(); + /// let avg_right: WeightedAverage = right.iter().map(|&x| x).collect(); + /// avg_left.merge(&avg_right); + /// assert!((avg_total.mean() - avg_left.mean()).abs() < 1e-15); + /// ``` + #[inline] + pub fn merge(&mut self, other: &WeightedAverage) { + let total_weight_sum = self.weight_sum + other.weight_sum; + self.weighted_avg = (self.weight_sum * self.weighted_avg + + other.weight_sum * other.weighted_avg) + / total_weight_sum; + self.weight_sum = total_weight_sum; + } +} + +impl core::default::Default for WeightedAverage { + fn default() -> WeightedAverage { + WeightedAverage::new() + } +} + +impl core::iter::FromIterator<(f64, f64)> for WeightedAverage { + fn from_iter(iter: T) -> WeightedAverage + where T: IntoIterator + { + let mut a = WeightedAverage::new(); + for (i, w) in iter { + a.add(i, w); + } + a + } +} + /// Estimate the weighted and unweighted arithmetic mean and the unweighted /// variance of a sequence of numbers ("population"). /// @@ -19,13 +130,10 @@ use super::AverageWithError; /// ``` #[derive(Debug, Clone)] pub struct WeightedAverageWithError { - /// Sum of the weights. - weight_sum: f64, /// Sum of the squares of the weights. weight_sum_sq: f64, - /// Weighted average value. - weighted_avg: f64, - + /// Estimator of the weighted average. + weighted_avg: WeightedAverage, /// Estimator of unweighted average and its variance. unweighted_avg: AverageWithError, } @@ -34,7 +142,8 @@ impl WeightedAverageWithError { /// Create a new weighted and unweighted average estimator. pub fn new() -> WeightedAverageWithError { WeightedAverageWithError { - weight_sum: 0., weight_sum_sq: 0., weighted_avg: 0., + weight_sum_sq: 0., + weighted_avg: WeightedAverage::new(), unweighted_avg: AverageWithError::new(), } } @@ -49,51 +158,55 @@ impl WeightedAverageWithError { // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance // and // http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf. - self.weight_sum += weight; self.weight_sum_sq += weight*weight; - - let prev_avg = self.weighted_avg; - self.weighted_avg = prev_avg + (weight / self.weight_sum) * (sample - prev_avg); - + self.weighted_avg.add(sample, weight); self.unweighted_avg.add(sample); } /// Determine whether the sample is empty. + #[inline] pub fn is_empty(&self) -> bool { self.unweighted_avg.is_empty() } /// Return the sum of the weights. + #[inline] pub fn sum_weights(&self) -> f64 { - self.weight_sum + self.weighted_avg.sum_weights() } /// Return the sum of the squared weights. + #[inline] pub fn sum_weights_sq(&self) -> f64 { self.weight_sum_sq } /// Estimate the weighted mean of the sequence. + #[inline] pub fn weighted_mean(&self) -> f64 { - self.weighted_avg + self.weighted_avg.mean() } /// Estimate the unweighted mean of the sequence. + #[inline] pub fn unweighted_mean(&self) -> f64 { self.unweighted_avg.mean() } /// Return sample size. + #[inline] pub fn len(&self) -> u64 { self.unweighted_avg.len() } /// Calculate the effective sample size. + #[inline] pub fn effective_len(&self) -> f64 { if self.is_empty() { return 0. } - self.weight_sum * self.weight_sum / self.weight_sum_sq + let weight_sum = self.weighted_avg.sum_weights(); + weight_sum * weight_sum / self.weight_sum_sq } /// Calculate the *unweighted* population variance of the sample. @@ -121,10 +234,11 @@ impl WeightedAverageWithError { // results than the ones used by SPSS or Mentor. // // See http://www.analyticalgroup.com/download/WEIGHTED_VARIANCE.pdf. - if self.weight_sum == 0. { + let weight_sum = self.weighted_avg.sum_weights(); + if weight_sum == 0. { return 0.; } - let inv_effective_len = self.weight_sum_sq / (self.weight_sum * self.weight_sum); + let inv_effective_len = self.weight_sum_sq / (weight_sum * weight_sum); (self.sample_variance() * inv_effective_len).sqrt() } @@ -148,13 +262,8 @@ impl WeightedAverageWithError { /// assert!((avg_total.error() - avg_left.error()).abs() < 1e-15); /// ``` pub fn merge(&mut self, other: &WeightedAverageWithError) { - let total_weight_sum = self.weight_sum + other.weight_sum; - self.weighted_avg = (self.weight_sum * self.weighted_avg - + other.weight_sum * other.weighted_avg) - / total_weight_sum; - self.weight_sum = total_weight_sum; self.weight_sum_sq += other.weight_sum_sq; - + self.weighted_avg.merge(&other.weighted_avg); self.unweighted_avg.merge(&other.unweighted_avg); } } @@ -176,51 +285,3 @@ impl core::iter::FromIterator<(f64, f64)> for WeightedAverageWithError { a } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn merge_unweighted() { - let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.]; - for mid in 0..sequence.len() { - let (left, right) = sequence.split_at(mid); - let avg_total: WeightedAverageWithError = sequence.iter().map(|x| (*x, 1.)).collect(); - let mut avg_left: WeightedAverageWithError = left.iter().map(|x| (*x, 1.)).collect(); - let avg_right: WeightedAverageWithError = right.iter().map(|x| (*x, 1.)).collect(); - avg_left.merge(&avg_right); - - assert_eq!(avg_total.weight_sum, avg_left.weight_sum); - assert_eq!(avg_total.weight_sum_sq, avg_left.weight_sum_sq); - assert_eq!(avg_total.weighted_avg, avg_left.weighted_avg); - - assert_eq!(avg_total.unweighted_avg.len(), avg_left.unweighted_avg.len()); - assert_eq!(avg_total.unweighted_avg.mean(), avg_left.unweighted_avg.mean()); - assert_eq!(avg_total.unweighted_avg.sample_variance(), - avg_left.unweighted_avg.sample_variance()); - } - } - - #[test] - fn merge_weighted() { - let sequence: &[(f64, f64)] = &[ - (1., 0.1), (2., 0.2), (3., 0.3), (4., 0.4), (5., 0.5), - (6., 0.6), (7., 0.7), (8., 0.8), (9., 0.)]; - for mid in 0..sequence.len() { - let (left, right) = sequence.split_at(mid); - let avg_total: WeightedAverageWithError = sequence.iter().map(|&(x, w)| (x, w)).collect(); - let mut avg_left: WeightedAverageWithError = left.iter().map(|&(x, w)| (x, w)).collect(); - let avg_right: WeightedAverageWithError = right.iter().map(|&(x, w)| (x, w)).collect(); - avg_left.merge(&avg_right); - assert_eq!(avg_total.unweighted_avg.len(), avg_left.unweighted_avg.len()); - assert_almost_eq!(avg_total.weight_sum, avg_left.weight_sum, 1e-15); - assert_eq!(avg_total.weight_sum_sq, avg_left.weight_sum_sq); - assert_almost_eq!(avg_total.weighted_avg, avg_left.weighted_avg, 1e-15); - assert_almost_eq!(avg_total.unweighted_avg.mean(), - avg_left.unweighted_avg.mean(), 1e-15); - assert_almost_eq!(avg_total.unweighted_avg.sample_variance(), - avg_left.unweighted_avg.sample_variance(), 1e-14); - } - } -} diff --git a/tests/average.rs b/tests/average.rs index 32be2d1..85cbfd9 100644 --- a/tests/average.rs +++ b/tests/average.rs @@ -44,6 +44,21 @@ fn numerically_unstable() { assert_eq!(a.sample_variance(), 30.); } +#[test] +fn merge() { + let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.]; + for mid in 0..sequence.len() { + let (left, right) = sequence.split_at(mid); + let avg_total: AverageWithError = sequence.iter().map(|x| *x).collect(); + let mut avg_left: AverageWithError = left.iter().map(|x| *x).collect(); + let avg_right: AverageWithError = right.iter().map(|x| *x).collect(); + avg_left.merge(&avg_right); + assert_eq!(avg_total.len(), avg_left.len()); + assert_eq!(avg_total.mean(), avg_left.mean()); + assert_eq!(avg_total.sample_variance(), avg_left.sample_variance()); + } +} + #[test] fn normal_distribution() { use rand::distributions::{Normal, IndependentSample}; diff --git a/tests/weighted_average.rs b/tests/weighted_average.rs index dfba85f..f9b32bf 100644 --- a/tests/weighted_average.rs +++ b/tests/weighted_average.rs @@ -64,3 +64,43 @@ fn error_corner_case() { .map(|(x, w)| (*x, *w)).collect(); assert_eq!(a.error(), 0.5); } + +#[test] +fn merge_unweighted() { + let sequence: &[f64] = &[1., 2., 3., 4., 5., 6., 7., 8., 9.]; + for mid in 0..sequence.len() { + let (left, right) = sequence.split_at(mid); + let avg_total: WeightedAverageWithError = sequence.iter().map(|x| (*x, 1.)).collect(); + let mut avg_left: WeightedAverageWithError = left.iter().map(|x| (*x, 1.)).collect(); + let avg_right: WeightedAverageWithError = right.iter().map(|x| (*x, 1.)).collect(); + avg_left.merge(&avg_right); + + assert_eq!(avg_total.sum_weights(), avg_left.sum_weights()); + assert_eq!(avg_total.sum_weights_sq(), avg_left.sum_weights_sq()); + + assert_eq!(avg_total.len(), avg_left.len()); + assert_eq!(avg_total.unweighted_mean(), avg_left.unweighted_mean()); + assert_eq!(avg_total.weighted_mean(), avg_left.weighted_mean()); + assert_eq!(avg_total.sample_variance(), avg_left.sample_variance()); + } +} + +#[test] +fn merge_weighted() { + let sequence: &[(f64, f64)] = &[ + (1., 0.1), (2., 0.2), (3., 0.3), (4., 0.4), (5., 0.5), + (6., 0.6), (7., 0.7), (8., 0.8), (9., 0.)]; + for mid in 0..sequence.len() { + let (left, right) = sequence.split_at(mid); + let avg_total: WeightedAverageWithError = sequence.iter().map(|&(x, w)| (x, w)).collect(); + let mut avg_left: WeightedAverageWithError = left.iter().map(|&(x, w)| (x, w)).collect(); + let avg_right: WeightedAverageWithError = right.iter().map(|&(x, w)| (x, w)).collect(); + avg_left.merge(&avg_right); + assert_eq!(avg_total.len(), avg_left.len()); + assert_almost_eq!(avg_total.sum_weights(), avg_left.sum_weights(), 1e-15); + assert_eq!(avg_total.sum_weights_sq(), avg_left.sum_weights_sq()); + assert_almost_eq!(avg_total.weighted_mean(), avg_left.weighted_mean(), 1e-15); + assert_almost_eq!(avg_total.unweighted_mean(), avg_left.unweighted_mean(), 1e-15); + assert_almost_eq!(avg_total.sample_variance(), avg_left.sample_variance(), 1e-14); + } +}