From 8fc58a4a917ef1cae1aa96457c116ed732285a82 Mon Sep 17 00:00:00 2001 From: Osvaldo Doederlein Date: Sat, 19 Jul 2014 18:16:50 -0400 Subject: [PATCH] Fixes and improvements to IntObjectHashMap. Related to [#2659] - Rewrite with linear probing, no state array, compaction at cleanup - Optimize keys() and values() to not use reflection - Optimize hashCode() and equals() for efficient iteration - Fixed equals() to not return true for equals(null) - Optimize iterator to not allocate new Entry at each next() - Added toString() - Added some new unit tests --- .../util/collection/IntObjectHashMap.java | 470 +++++++++--------- .../util/collection/IntObjectHashMapTest.java | 128 ++++- 2 files changed, 348 insertions(+), 250 deletions(-) diff --git a/common/src/main/java/io/netty/util/collection/IntObjectHashMap.java b/common/src/main/java/io/netty/util/collection/IntObjectHashMap.java index da5dc9e834..34480188d6 100644 --- a/common/src/main/java/io/netty/util/collection/IntObjectHashMap.java +++ b/common/src/main/java/io/netty/util/collection/IntObjectHashMap.java @@ -21,40 +21,36 @@ import java.util.Iterator; import java.util.NoSuchElementException; /** - * A hash map implementation of {@link IntObjectMap} that uses open addressing for keys. To minimize - * the memory footprint, this class uses open addressing rather than chaining. Collisions are - * resolved using double hashing. + * A hash map implementation of {@link IntObjectMap} that uses open addressing for keys. + * To minimize the memory footprint, this class uses open addressing rather than chaining. + * Collisions are resolved using linear probing. Deletions implement compaction, so cost of + * remove can approach O(N) for full maps, which makes a small loadFactor recommended. * * @param The value type stored in the map. */ public class IntObjectHashMap implements IntObjectMap, Iterable> { - /** State indicating that a slot is available.*/ - private static final byte AVAILABLE = 0; - - /** State indicating that a slot is occupied. */ - private static final byte OCCUPIED = 1; - - /** State indicating that a slot was removed. */ - private static final byte REMOVED = 2; - /** Default initial capacity. Used if not specified in the constructor */ private static final int DEFAULT_CAPACITY = 11; /** Default load factor. Used if not specified in the constructor */ private static final float DEFAULT_LOAD_FACTOR = 0.5f; + /** + * Placeholder for null values, so we can use the actual null to mean available. + * (Better than using a placeholder for available: less references for GC processing.) + */ + private static final Object NULL_VALUE = new Object(); + /** The maximum number of elements allowed without allocating more space. */ private int maxSize; /** The load factor for the map. Used to calculate {@link #maxSize}. */ private final float loadFactor; - private byte[] states; private int[] keys; private V[] values; private int size; - private int available; public IntObjectHashMap() { this(DEFAULT_CAPACITY, DEFAULT_LOAD_FACTOR); @@ -68,93 +64,71 @@ public class IntObjectHashMap implements IntObjectMap, Iterable= 1"); } - if (loadFactor <= 0.0f) { - throw new IllegalArgumentException("loadFactor must be > 0"); + if (loadFactor <= 0.0f || loadFactor > 1.0f) { + // Cannot exceed 1 because we can never store more than capacity elements; + // using a bigger loadFactor would trigger rehashing before the desired load is reached. + throw new IllegalArgumentException("loadFactor must be > 0 and <= 1"); } this.loadFactor = loadFactor; // Adjust the initial capacity if necessary. - initialCapacity = adjustCapacity(initialCapacity); + int capacity = adjustCapacity(initialCapacity); // Allocate the arrays. - states = new byte[initialCapacity]; - keys = new int[initialCapacity]; - @SuppressWarnings({ "unchecked", "SuspiciousArrayCast" }) - V[] temp = (V[]) new Object[initialCapacity]; + keys = new int[capacity]; + @SuppressWarnings({ "unchecked", }) + V[] temp = (V[]) new Object[capacity]; values = temp; // Initialize the maximum size value. - maxSize = calcMaxSize(initialCapacity); + maxSize = calcMaxSize(capacity); + } - // Initialize the available element count - available = initialCapacity - size; + private static T toExternal(T value) { + return value == NULL_VALUE ? null : value; + } + + @SuppressWarnings("unchecked") + private static T toInternal(T value) { + return value == null ? (T) NULL_VALUE : value; } @Override public V get(int key) { int index = indexOf(key); - return index < 0 ? null : values[index]; + return index == -1 ? null : toExternal(values[index]); } @Override public V put(int key, V value) { - int hash = hash(key); - int capacity = capacity(); - int index = hash % capacity; - int increment = 1 + hash % (capacity - 2); - final int startIndex = index; - int firstRemovedIndex = -1; - do { - switch (states[index]) { - case AVAILABLE: - // We only stop probing at a AVAILABLE node, since the value may still exist - // beyond - // a REMOVED node. - if (firstRemovedIndex != -1) { - // We encountered a REMOVED node prior. Store the entry there so that - // retrieval - // will be faster. - insertAt(firstRemovedIndex, key, value); - return null; - } + int startIndex = hashIndex(key); + int index = startIndex; - // No REMOVED node, just store the entry here. - insertAt(index, key, value); - return null; - case OCCUPIED: - if (keys[index] == key) { - V previousValue = values[index]; - insertAt(index, key, value); - return previousValue; - } - break; - case REMOVED: - // Check for first removed index. - if (firstRemovedIndex == -1) { - firstRemovedIndex = index; - } - break; - default: - throw new AssertionError("Invalid state: " + states[index]); + for (;;) { + if (values[index] == null) { + // Found empty slot, use it. + keys[index] = key; + values[index] = toInternal(value); + growSize(); + return null; + } else if (keys[index] == key) { + // Found existing entry with this key, just replace the value. + V previousValue = values[index]; + values[index] = toInternal(value); + return toExternal(previousValue); } - // REMOVED or OCCUPIED but wrong key, keep probing ... - index += increment; - if (index >= capacity) { - // Handle wrap-around by decrement rather than mod. - index -= capacity; + // Conflict, keep probing ... + if ((index = probeNext(index)) == startIndex) { + // Can only happen if the map was full at MAX_ARRAY_SIZE and couldn't grow. + throw new IllegalStateException("Unable to insert"); } - } while (index != startIndex); - - if (firstRemovedIndex == -1) { - // Should never happen. - throw new AssertionError("Unable to insert"); } + } - // Never found a AVAILABLE slot, just use the first REMOVED. - insertAt(firstRemovedIndex, key, value); - return null; + private int probeNext(int index) { + return index == values.length - 1 ? 0 : index + 1; } @Override @@ -162,9 +136,11 @@ public class IntObjectHashMap implements IntObjectMap, Iterable source = (IntObjectHashMap) sourceMap; - int i = -1; - while ((i = source.nextEntryIndex(i + 1)) >= 0) { - put(source.keys[i], source.values[i]); + for (int i = 0; i < source.values.length; ++i) { + V sourceValue = source.values[i]; + if (sourceValue != null) { + put(source.keys[i], sourceValue); + } } return; } @@ -178,13 +154,13 @@ public class IntObjectHashMap implements IntObjectMap, Iterable implements IntObjectMap, Iterable implements IntObjectMap, Iterable= 0) { - V next = values[i]; - if (value == next || value != null && value.equals(next)) { + V v = toInternal(value); + for (int i = 0; i < values.length; ++i) { + // The map supports null values; this will be matched as NULL_VALUE.equals(NULL_VALUE). + if (values[i] != null && values[i].equals(v)) { return true; } } @@ -235,7 +210,12 @@ public class IntObjectHashMap implements IntObjectMap, Iterable implements IntObjectMap, Iterable clazz) { @SuppressWarnings("unchecked") V[] outValues = (V[]) Array.newInstance(clazz, size()); - copyEntries(values, outValues); + int targetIx = 0; + for (int i = 0; i < values.length; ++i) { + if (values[i] != null) { + outValues[targetIx++] = values[i]; + } + } return outValues; } @Override public int hashCode() { - final int prime = 31; - int result = 1; - for (Entry entry : entries()) { - V value = entry.value(); - int hash = value == null ? 0 : value.hashCode(); - result = prime * result + hash; + // Hashcode is based on all non-zero, valid keys. We have to scan the whole keys + // array, which may have different lengths for two maps of same size(), so the + // capacity cannot be used as input for hashing but the size can. + int hash = size; + for (int i = 0; i < keys.length; ++i) { + // 0 can be a valid key or unused slot, but won't impact the hashcode in either case. + // This way we can use a cheap loop without conditionals, or hard-to-unroll operations, + // or the devastatingly bad memory locality of visiting value objects. + // Also, it's important to use a hash function that does not depend on the ordering + // of terms, only their values; since the map is an unordered collection and + // entries can end up in different positions in different maps that have the same + // elements, but with different history of puts/removes, due to conflicts. + hash = hash ^ keys[i]; } - return result; + return hash; } @Override public boolean equals(Object obj) { - if (this == obj || obj == null || getClass() != obj.getClass()) { + if (this == obj) { return true; - } - @SuppressWarnings("rawtypes") - IntObjectHashMap other = (IntObjectHashMap) obj; - if (size != other.size) { + } else if (!(obj instanceof IntObjectMap)) { return false; } - for (Entry entry : entries()) { - V value = entry.value(); - Object otherValue = other.get(entry.key()); - if (value == null) { - if (otherValue != null || !other.containsKey(entry.key())) { + @SuppressWarnings("rawtypes") + IntObjectMap other = (IntObjectMap) obj; + if (size != other.size()) { + return false; + } + for (int i = 0; i < values.length; ++i) { + V value = values[i]; + if (value != null) { + int key = keys[i]; + Object otherValue = other.get(key); + if (value == NULL_VALUE) { + if (otherValue != null) { + return false; + } + } else if (!value.equals(otherValue)) { return false; } - } else if (!value.equals(otherValue)) { - return false; } } return true; } - /** - * Copies the occupied entries from the source to the target array. - */ - private void copyEntries(Object sourceArray, Object targetArray) { - int sourceIx = -1; - int targetIx = 0; - while ((sourceIx = nextEntryIndex(sourceIx + 1)) >= 0) { - Object obj = Array.get(sourceArray, sourceIx); - Array.set(targetArray, targetIx++, obj); - } - } - /** * Locates the index for the given key. This method probes using double hashing. * @@ -302,117 +287,94 @@ public class IntObjectHashMap implements IntObjectMap, Iterable= capacity) { - // Handle wrap-around by decrement rather than mod. - index -= capacity; - } - } while (index != startIndex); - - // Got back to the beginning. Not found. - return -1; - } - - /** - * Determines the current capacity (i.e. size of the arrays). - */ - private int capacity() { - return keys.length; - } - - /** - * Creates a hash value for the given key. - */ - private static int hash(int key) { - // Just make sure the integer is positive. - return key & Integer.MAX_VALUE; - } - - /** - * Performs an insert of the key/value at the given index position. If necessary, performs a - * rehash of the map. - * - * @param index the index at which to insert the key/value - * @param key the entry key - * @param value the entry value - */ - private void insertAt(int index, int key, V value) { - byte state = states[index]; - if (state != OCCUPIED) { - // Added a new mapping, increment the size. - size++; - - if (state == AVAILABLE) { - // Consumed a OCCUPIED slot, decrement the number of available slots. - available--; + // Conflict, keep probing ... + if ((index = probeNext(index)) == startIndex) { + return -1; } } + } - keys[index] = key; - values[index] = value; - states[index] = OCCUPIED; + /** + * Returns the hashed index for the given key. + */ + private int hashIndex(int key) { + return key % keys.length; + } + + /** + * Grows the map size after an insertion. If necessary, performs a rehash of the map. + */ + private void growSize() { + size++; if (size > maxSize) { - // Need to grow the arrays. - rehash(adjustCapacity(capacity() * 2)); - } else if (available == 0) { + // Need to grow the arrays. We take care to detect integer overflow, + // also limit array size to ArrayList.MAX_ARRAY_SIZE. + rehash(adjustCapacity((int) Math.min(keys.length * 2.0, Integer.MAX_VALUE - 8))); + } else if (size == keys.length) { // Open addressing requires that we have at least 1 slot available. Need to refresh // the arrays to clear any removed elements. - rehash(capacity()); + rehash(keys.length); } } /** * Adjusts the given capacity value to ensure that it's odd. Even capacities can break probing. - * TODO: would be better to ensure it's prime as well. */ private static int adjustCapacity(int capacity) { return capacity | 1; } /** - * Marks the entry at the given index position as {@link #REMOVED} and sets the value to - * {@code null}. - *

- * TODO: consider performing re-compaction. + * Removes entry at the given index position. Also performs opportunistic, incremental rehashing + * if necessary to not break conflict chains. * * @param index the index position of the element to remove. */ private void removeAt(int index) { - if (states[index] == OCCUPIED) { - size--; - } - states[index] = REMOVED; + --size; + // Clearing the key is not strictly necessary (for GC like in a regular collection), + // but recommended for security. The memory location is still fresh in the cache anyway. + keys[index] = 0; values[index] = null; + + // In the interval from index to the next available entry, the arrays may have entries + // that are displaced from their base position due to prior conflicts. Iterate these + // entries and move them back if possible, optimizing future lookups. + // Knuth Section 6.4 Algorithm R, also used by the JDK's IdentityHashMap. + + int nextFree = index; + for (int i = probeNext(index); values[i] != null; i = probeNext(i)) { + int bucket = hashIndex(keys[i]); + if ((i < bucket && (bucket <= nextFree || nextFree <= i)) + || (bucket <= nextFree && nextFree <= i)) { + // Move the displaced entry "back" to the first available position. + keys[nextFree] = keys[i]; + values[nextFree] = values[i]; + // Put the first entry after the displaced entry + keys[i] = 0; + values[i] = null; + nextFree = i; + } + } } /** * Calculates the maximum size allowed before rehashing. */ private int calcMaxSize(int capacity) { - // Clip the upper bound so that there will always be at least one - // available slot. + // Clip the upper bound so that there will always be at least one available slot. int upperBound = capacity - 1; return Math.min(upperBound, (int) (capacity * loadFactor)); } @@ -423,61 +385,62 @@ public class IntObjectHashMap implements IntObjectMap, Iterable> { - int prevIndex; - int nextIndex; + private final class IteratorImpl implements Iterator>, Entry { + private int prevIndex = -1; + private int nextIndex = -1; + private int entryIndex = -1; - IteratorImpl() { - prevIndex = -1; - nextIndex = nextEntryIndex(0); + private void scanNext() { + for (;;) { + if (++nextIndex == values.length || values[nextIndex] != null) { + break; + } + } } @Override public boolean hasNext() { - return nextIndex >= 0; + if (nextIndex == -1) { + scanNext(); + } + return nextIndex < keys.length; } @Override @@ -487,43 +450,54 @@ public class IntObjectHashMap implements IntObjectMap, Iterable { - final int index; - - EntryImpl(int index) { - this.index = index; - } + // Entry implementation. Since this implementation uses a single Entry, we coalesce that + // into the Iterator object (potentially making loop optimization much easier). @Override public int key() { - return keys[index]; + return keys[entryIndex]; } @Override public V value() { - return values[index]; + return toExternal(values[entryIndex]); } @Override public void setValue(V value) { - values[index] = value; + values[entryIndex] = toInternal(value); } } + + @Override + public String toString() { + if (size == 0) { + return "{}"; + } + StringBuilder sb = new StringBuilder(4 * size); + for (int i = 0; i < values.length; ++i) { + V value = values[i]; + if (value != null) { + sb.append(sb.length() == 0 ? "{" : ", "); + sb.append(keys[i]).append('=').append(value == this ? "(this Map)" : value); + } + } + return sb.append('}').toString(); + } } diff --git a/common/src/test/java/io/netty/util/collection/IntObjectHashMapTest.java b/common/src/test/java/io/netty/util/collection/IntObjectHashMapTest.java index 7f50134ef8..ec291df874 100644 --- a/common/src/test/java/io/netty/util/collection/IntObjectHashMapTest.java +++ b/common/src/test/java/io/netty/util/collection/IntObjectHashMapTest.java @@ -14,14 +14,21 @@ */ package io.netty.util.collection; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; + import org.junit.Before; import org.junit.Test; +import java.util.Arrays; +import java.util.HashMap; import java.util.HashSet; +import java.util.Random; import java.util.Set; -import static org.junit.Assert.*; - /** * Tests for {@link IntObjectHashMap}. */ @@ -279,4 +286,121 @@ public class IntObjectHashMapTest { } assertEquals(expected, found); } + + @Test + public void mapShouldSupportHashingConflicts() { + for (int mod = 0; mod < 10; ++mod) { + for (int sz = 1; sz <= 101; sz += 2) { + IntObjectHashMap map = new IntObjectHashMap(sz); + for (int i = 0; i < 100; ++i) { + map.put(i * mod, ""); + } + } + } + } + + @Test + public void hashcodeEqualsTest() { + IntObjectHashMap map1 = new IntObjectHashMap(); + IntObjectHashMap map2 = new IntObjectHashMap(); + Random rnd = new Random(0); + while (map1.size() < 100) { + int key = rnd.nextInt(100); + map1.put(key, key); + map2.put(key, key); + } + assertEquals(map1.hashCode(), map2.hashCode()); + assertTrue(map1.equals(map2)); + // Remove one "middle" element, maps should now be non-equals. + int[] keys = map1.keys(); + map2.remove(keys[50]); + assertFalse(map1.equals(map2)); + // Put it back; will likely be in a different position, but maps will be equal again. + map2.put(keys[50], map1.keys()[50]); + assertTrue(map1.equals(map2)); + assertEquals(map1.hashCode(), map2.hashCode()); + // Make map2 have one extra element, will be non-equal. + map2.put(1000, 1000); + assertFalse(map1.equals(map2)); + // Rebuild map2 with elements in a different order, again the maps should be equal. + // (These tests with same elements in different order also show that the hashCode + // function does not depend on the internal ordering of entries.) + map2.clear(); + Arrays.sort(keys); + for (int key : keys) { + map2.put(key, key); + } + assertEquals(map1.hashCode(), map2.hashCode()); + assertTrue(map1.equals(map2)); + } + + @Test + public void fuzzTest() { + // This test is so extremely internals-dependent that I'm not even trying to + // minimize that. Any internal changes will not fail the test (so it's not flaky per se) + // but will possibly make it less effective (not test interesting scenarios anymore). + + // The RNG algorithm is specified and stable, so this will cause the same exact dataset + // to be used in every run and every JVM implementation. + Random rnd = new Random(0); + + int baseSize = 1000; + // Empirically-determined size to expand the capacity exactly once, and before + // the step that creates the long conflict chain. We need to test rehash(), + // but also control when rehash happens because it cleans up the REMOVED entries. + // This size is also chosen so after the single rehash, the map will be densely + // populated, getting close to a second rehash but not triggering it. + int startTableSize = 1105; + IntObjectHashMap map = new IntObjectHashMap(startTableSize); + // Reference map which implementation we trust to be correct, will mirror all operations. + HashMap goodMap = new HashMap(); + + // Add initial population. + for (int i = 0; i < baseSize / 4; ++i) { + int key = rnd.nextInt(baseSize); + assertEquals(goodMap.put(key, key), map.put(key, key)); + // 50% elements are multiple of a divisor of startTableSize => more conflicts. + key = rnd.nextInt(baseSize) * 17; + assertEquals(goodMap.put(key, key), map.put(key, key)); + } + + // Now do some mixed adds and removes for further fuzzing + // Rehash will happen here, but only once, and the final size will be closer to max. + for (int i = 0; i < baseSize * 1000; ++i) { + int key = rnd.nextInt(baseSize); + if (rnd.nextDouble() >= 0.2) { + assertEquals(goodMap.put(key, key), map.put(key, key)); + } else { + assertEquals(goodMap.remove(key), map.remove(key)); + } + } + + // Final batch of fuzzing, only searches and removes. + int removeSize = map.size() / 2; + while (removeSize > 0) { + int key = rnd.nextInt(baseSize); + boolean found = goodMap.containsKey(key); + assertEquals(found, map.containsKey(key)); + assertEquals(goodMap.remove(key), map.remove(key)); + if (found) { + --removeSize; + } + } + + // Now gotta write some code to compare the final maps, as equals() won't work. + assertEquals(goodMap.size(), map.size()); + Integer[] goodKeys = goodMap.keySet().toArray(new Integer[goodMap.size()]); + Arrays.sort(goodKeys); + int [] keys = map.keys(); + Arrays.sort(keys); + for (int i = 0; i < goodKeys.length; ++i) { + assertEquals((int) goodKeys[i], keys[i]); + } + + // Finally drain the map. + for (int key : map.keys()) { + assertEquals(goodMap.remove(key), map.remove(key)); + } + assertTrue(map.isEmpty()); + } }