Importing the EstimatedHistogram from origin

2015-06-30 18:22:45 +03:00 · 2015-06-30 18:22:45 +03:00 · 919785bc04
commit 919785bc04
parent baf7d101d1
1 changed files with 306 additions and 0 deletions
--- a/src/main/java/com/cloudius/urchin/utils/EstimatedHistogram.java
+++ b/src/main/java/com/cloudius/urchin/utils/EstimatedHistogram.java
@@ -0,0 +1,306 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright 2015 Cloudius Systems
+ *
+ * Modified by Cloudius Systems
+ */
+
+package com.cloudius.urchin.utils;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicLongArray;
+
+import com.google.common.base.Objects;
+
+import org.slf4j.Logger;
+
+/**
+ * Histogram of long values over fixed, roughly geometric bucket offsets.
+ * Counts live in an {@link AtomicLongArray}: a single increment is atomic, but
+ * multi-bucket reads go one element at a time, not from a snapshot.
+ */
+public class EstimatedHistogram {
+    /**
+     * The series of values to which the counts in `buckets` correspond: 1, 2,
+     * 3, 4, 5, 6, 7, 8, 10, 12, 14, 17, 20, etc. Thus, a `buckets` of [0, 0, 1,
+     * 10] would mean we had seen one value of 3 and 10 values of 4.
+     *
+     * The series starts at 1 and grows by 1.2 each time (rounding and removing
+     * duplicates). It goes from 1 to around 36M by default (creating 90+1
+     * buckets), which will give us timing resolution from microseconds to 36
+     * seconds, with less precision as the numbers get larger.
+     *
+     * Each bucket represents values from (previous bucket offset, current
+     * offset].
+     */
+    private final long[] bucketOffsets;
+
+    // buckets is one element longer than bucketOffsets -- the last element is
+    // values greater than the last offset
+    final AtomicLongArray buckets;
+
+    /** Creates an empty histogram with the default 90 bucket offsets. */
+    public EstimatedHistogram() {
+        this(90);
+    }
+
+    /** Creates an empty histogram with bucketCount generated offsets. */
+    public EstimatedHistogram(int bucketCount) {
+        bucketOffsets = newOffsets(bucketCount);
+        buckets = new AtomicLongArray(bucketOffsets.length + 1);
+    }
+
+    /**
+     * Wraps existing data. {@code offsets} is kept by reference (not copied);
+     * {@code bucketData} is copied and must be one element longer.
+     */
+    public EstimatedHistogram(long[] offsets, long[] bucketData) {
+        assert bucketData.length == offsets.length + 1;
+        bucketOffsets = offsets;
+        buckets = new AtomicLongArray(bucketData);
+    }
+
+    /** Builds size strictly increasing offsets: 1, then ~1.2x steps. */
+    private static long[] newOffsets(int size) {
+        long[] result = new long[size];
+        long last = 1;
+        result[0] = last;
+        for (int i = 1; i < size; i++) {
+            long next = Math.round(last * 1.2);
+            // rounding can stall for small values; force strict growth
+            if (next == last)
+                next++;
+            result[i] = next;
+            last = next;
+        }
+        return result;
+    }
+
+    /**
+     * @return the histogram values corresponding to each bucket index (the
+     *         internal array itself, not a copy)
+     */
+    public long[] getBucketOffsets() {
+        return bucketOffsets;
+    }
+
+    /**
+     * Increments the count of the bucket closest to n, rounding UP. Values
+     * above the last offset are counted in the overflow bucket.
+     */
+    public void add(long n) {
+        int index = Arrays.binarySearch(bucketOffsets, n);
+        // inexact match: the insertion point is the first bucket higher than n
+        if (index < 0)
+            index = -index - 1;
+        buckets.incrementAndGet(index);
+    }
+
+    /** @return the count in the given bucket */
+    long get(int bucket) {
+        return buckets.get(bucket);
+    }
+
+    /**
+     * @param reset zero out buckets afterwards if true
+     * @return a long[] containing the current histogram buckets
+     */
+    public long[] getBuckets(boolean reset) {
+        final int len = buckets.length();
+        long[] rv = new long[len];
+        for (int i = 0; i < len; i++)
+            rv[i] = reset ? buckets.getAndSet(i, 0L) : buckets.get(i);
+        return rv;
+    }
+
+    /**
+     * @return the smallest value that could have been added to this histogram,
+     *         or 0 if it is empty
+     */
+    public long min() {
+        for (int i = 0; i < buckets.length(); i++) {
+            if (buckets.get(i) > 0)
+                return i == 0 ? 0 : 1 + bucketOffsets[i - 1];
+        }
+        return 0;
+    }
+
+    /**
+     * @return the largest value that could have been added to this histogram
+     *         (0 if empty). If the histogram overflowed, returns Long.MAX_VALUE.
+     */
+    public long max() {
+        int lastBucket = buckets.length() - 1;
+        if (buckets.get(lastBucket) > 0)
+            return Long.MAX_VALUE;
+
+        for (int i = lastBucket - 1; i >= 0; i--) {
+            if (buckets.get(i) > 0)
+                return bucketOffsets[i];
+        }
+        return 0;
+    }
+
+    /**
+     * @param percentile fraction in [0, 1] (checked by assertion only)
+     * @return estimated value; 0 if floor(count * percentile) is 0
+     * @throws IllegalStateException if the histogram overflowed
+     */
+    public long percentile(double percentile) {
+        assert percentile >= 0 && percentile <= 1.0;
+        int lastBucket = buckets.length() - 1;
+        if (buckets.get(lastBucket) > 0)
+            throw new IllegalStateException(
+                    "Unable to compute when histogram overflowed");
+
+        long pcount = (long) Math.floor(count() * percentile);
+        if (pcount == 0)
+            return 0;
+
+        long elements = 0;
+        for (int i = 0; i < lastBucket; i++) {
+            elements += buckets.get(i);
+            if (elements >= pcount)
+                return bucketOffsets[i];
+        }
+        return 0;
+    }
+
+    /**
+     * @return the mean histogram value (average of bucket offsets, weighted by
+     *         count), rounded up; 0 if the histogram is empty
+     * @throws IllegalStateException
+     *             if any values were greater than the largest bucket threshold
+     */
+    public long mean() {
+        int lastBucket = buckets.length() - 1;
+        if (buckets.get(lastBucket) > 0)
+            throw new IllegalStateException(
+                    "Unable to compute ceiling for max when histogram overflowed");
+
+        long elements = 0;
+        long sum = 0;
+        for (int i = 0; i < lastBucket; i++) {
+            long bCount = buckets.get(i);
+            elements += bCount;
+            sum += bCount * bucketOffsets[i];
+        }
+        // an empty histogram gives 0 / 0.0 = NaN, which the cast turns into 0
+        return (long) Math.ceil((double) sum / elements);
+    }
+
+    /**
+     * @return the total number of recorded values, overflow bucket included
+     */
+    public long count() {
+        long sum = 0L;
+        for (int i = 0; i < buckets.length(); i++)
+            sum += buckets.get(i);
+        return sum;
+    }
+
+    /**
+     * @return true if this histogram has overflowed -- that is, a value larger
+     *         than our largest bucket could bound was added
+     */
+    public boolean isOverflowed() {
+        return buckets.get(buckets.length() - 1) > 0;
+    }
+
+    /**
+     * log.debug() every record in the histogram
+     *
+     * @param log logger that receives one debug record per bucket range
+     */
+    public void log(Logger log) {
+        // only print overflow if there is any
+        int nameCount = isOverflowed() ? buckets.length() : buckets.length() - 1;
+        String[] names = new String[nameCount];
+
+        int maxNameLength = 0;
+        for (int i = 0; i < nameCount; i++) {
+            names[i] = nameOfRange(bucketOffsets, i);
+            maxNameLength = Math.max(maxNameLength, names[i].length());
+        }
+
+        // emit log records
+        String formatstr = "%" + maxNameLength + "s: %d";
+        for (int i = 0; i < nameCount; i++) {
+            long count = buckets.get(i);
+            // sort-of-hack: skip an empty first range, which only demarcates the
+            // first populated one. It still counts toward maxNameLength, so the
+            // output may occasionally carry unnecessary whitespace padding.
+            if (i == 0 && count == 0)
+                continue;
+            log.debug(String.format(formatstr, names[i], count));
+        }
+    }
+
+    /** Formats the value range of bucket index as "[low..high]". */
+    private static String nameOfRange(long[] bucketOffsets, int index) {
+        StringBuilder sb = new StringBuilder();
+        appendRange(sb, bucketOffsets, index);
+        return sb.toString();
+    }
+
+    /** Appends "[low..high]" for bucket index; the overflow high is "Inf". */
+    private static void appendRange(StringBuilder sb, long[] bucketOffsets,
+            int index) {
+        sb.append("[");
+        if (index == 0)
+            if (bucketOffsets[0] > 0)
+                // by original definition, this histogram is for values greater
+                // than zero only; if values of 0 or less are required, an entry
+                // of lb-1 must be inserted at the start
+                sb.append("1");
+            else
+                sb.append("-Inf");
+        else
+            sb.append(bucketOffsets[index - 1] + 1);
+        sb.append("..");
+        if (index == bucketOffsets.length)
+            sb.append("Inf");
+        else
+            sb.append(bucketOffsets[index]);
+        sb.append("]");
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof EstimatedHistogram))
+            return false;
+
+        EstimatedHistogram that = (EstimatedHistogram) o;
+        return Arrays.equals(getBucketOffsets(), that.getBucketOffsets())
+                && Arrays.equals(getBuckets(false), that.getBuckets(false));
+    }
+
+    @Override
+    public int hashCode() {
+        // hash array contents: an array's own hashCode() is identity-based and
+        // getBuckets() returns a new array on each call
+        return 31 * Arrays.hashCode(getBucketOffsets())
+                + Arrays.hashCode(getBuckets(false));
+    }
+}