blob: 0a062f163749956c6fe39a3b6332bb152bd87aca [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.internal.util;
import java.util.Arrays;
import java.util.NoSuchElementException;
import org.apache.uima.util.impl.Constants;
/**
* An set of non-zero integers, ability to iterate over them (possibly in a sorted way),
* with O(1) operations for adding, removing, and testing for contains.
*
* Optimized for adding mostly increasing integers (with some space for less-than-first-one-added).
* - major uses: indexes of Feature Structures.
* Optimize for small sets
*
* Graceful degradation for completely random integer operations on large sets;
* keep O(1) operations, at the expense of extra space (< 3 x size).
*
* Uses several different representations depending on the range of ints stored and the total number of ints stored.
*
* Sizes: Tiny, medium, large
* Ranges: semi-knowable, unknowable;
* if semi-knowable, dense, small-range (< 65K), large-range
*
* For all sizes,
* if dense, use IntBitSet (with offset)
*
* else
* For large, (implies large-range, too) use IntHashSet
*
* For medium,
* if small-range < 65K, use IntHashSet with offset
* else use IntHashSet
*
* Arrange switching between representations
* to occur infrequently, especially as cost of switching (size) grows
* Arrange checking for switching to occur infrequently, taking into account how underlying data structures grow
* (which is often by doubling)
* Switch when adding new member(s) if alternative representation is sufficiently smaller
*
*/
public class PositiveIntSet_impl implements PositiveIntSet {
// number of words a inthashset has to be better than intbitset to cause switch
private static final int HYSTERESIS = 16;
private static final int INT_SET_MAX_SIZE = 16;
private static final int HASH_SET_SHORT_MAX_SIZE = (1 << 16) - 1; // 65535
// Extra space in bitset for expansion - can add an int (- offset) up to that size
// Initial space = 2 words = 1 long.
// 64 *4 is 8 words, = 256.
private static final int BIT_SET_OVERALLOCATE = 64 * 4; // 8 words minimum
// Extra space in hashset for initial capacity - when creating from bitmap set - multiplier of existing size
private static final int HASH_SET_OVERALLOCATE_DIVIDER_SHIFT = 1; // bit shift right = divide by 2 = 50% of existing capacity
private PositiveIntSet intSet; // one of 3 representations: IntBitSet, IntHashSet, IntSet (based on IntVector)
//package private for test case
boolean isBitSet = false;
boolean isIntSet = false;
boolean isHashSet = false;
boolean secondTimeShrinkable = false;
/**
* Set false once we find we have to reduce the bit offset
*/
//package private for test case
boolean useOffset = true;
public PositiveIntSet_impl() {
this(0, 0, 0);
}
/**
* Set up a Positive Bit Set
* @param initialSize - if 0, don't allocate yet, wait for first add.
* If isBitSetDense, then this number is interpreted as
* the first int to be added, typically with an offset.
* The next two params are used only if initialSize is not 0.
* @param estMin - the estimated minimum int value to be added
* @param estMax - the estimated max int value to be added
*/
public PositiveIntSet_impl(int initialSize, int estMin, int estMax) {
if (initialSize == 0) {
isBitSet = true; // try first as bit set with offset perhaps
return; // delay allocation until we know what the 1st int will be
}
if ((initialSize < 0) ||
(estMin < 0) ||
(estMax < estMin)) {
throw new IllegalArgumentException();
}
estMax = Math.max(initialSize, estMax); // because its a positive int set
// offsets are different for bit set and hash set
// for bit set, it's set to the estMin - capped 12.5% from the estMin (cap is 16 words)
// for hash set, if the size > 65K, it's ignored
// it's set to 3/4 up the "range" of est max - est min
int offsetBitSet = estimatedBitSetOffset(estMin, -1);
int offsetHashSet = getHashSetOffset(estMax, estMin);
int bitSetWordsNeeded = getBitSetSpaceFromRange(estMax - offsetBitSet, 0);
int hashSetWordsNeeded = getHashSetSpace(initialSize, estMax, estMin);
// no HYSTERESIS in test below - pick absolute smaller for initial alloc
if (bitSetWordsNeeded < hashSetWordsNeeded) {
allocateIntBitSet(estMax, estMin, offsetBitSet);
isBitSet = true;
return;
}
// choose between IntHashSet and IntSet based on size
if (initialSize <= INT_SET_MAX_SIZE) {
intSet = new IntSet(initialSize);
isIntSet = true;
return;
}
intSet = new IntHashSet(initialSize, offsetHashSet);
isHashSet = true;
}
@Override
public IntListIterator iterator() {
return getUnorderedIterator();
}
public IntListIterator getUnorderedIterator() {
if (null == intSet) {
intSet = new IntSet(0);
}
return intSet.iterator();
}
public IntListIterator getOrderedIterator() {
if (isBitSet) {
if (null == intSet) {
return new IntListIteratorOverArray(Constants.EMPTY_INT_ARRAY);
}
return intSet.iterator();
}
int[] allValues = new int[size()];
IntListIterator it = intSet.iterator();
int i = 0;
while (it.hasNext()) {
allValues[i++] = it.nextNvc();
}
Arrays.sort(allValues);
return new IntListIteratorOverArray(allValues);
}
private class IntListIteratorOverArray implements IntListIterator {
private int[] a;
private int pos;
IntListIteratorOverArray(int[] a) {
this.a = a;
pos = 0;
}
@Override
public boolean hasNext() {
return pos >= 0 && pos < a.length;
}
@Override
public int nextNvc() {
return a[pos++];
}
@Override
public boolean hasPrevious() {
return pos > 0 && pos < a.length;
}
@Override
public int previousNvc() {
return a[--pos]; // decrement first!
}
@Override
public void moveToStart() {
pos = 0;
}
@Override
public void moveToEnd() {
pos = a.length - 1;
}
}
/* (non-Javadoc)
* @see org.apache.uima.internal.util.PositiveIntSetI#clear()
*/
@Override
public void clear() {
if (null != intSet) {
if (isBitSet) {
if (secondTimeShrinkable) {
secondTimeShrinkable = false;
IntBitSet ib = (IntBitSet) intSet;
int max = ib.getLargestMenber();
int offset = ib.getOffset();
intSet = new IntBitSet((max - offset) >> 1, offset);
} else {
intSet.clear();
secondTimeShrinkable = true;
}
} else {
intSet.clear();
}
}
}
/* (non-Javadoc)
* @see org.apache.uima.internal.util.PositiveIntSetI#contains(int)
*/
@Override
public boolean contains(int key) {
return (null != intSet) ? intSet.contains(key) : false;
}
@Override
public int find(int element) {
return (null != intSet) ? intSet.find(element) : -1;
}
/* (non-Javadoc)
* @see org.apache.uima.internal.util.PositiveIntSetI#add(int)
*/
@Override
public boolean add(int key) {
maybeSwitchRepresentation(key);
return intSet.add(key);
}
/* (non-Javadoc)
* @see org.apache.uima.internal.util.PositiveIntSetI#remove(int)
*/
@Override
public boolean remove(int key) {
return (null != intSet) ? intSet.remove(key) : false;
}
/* (non-Javadoc)
* @see org.apache.uima.internal.util.PositiveIntSetI#size()
*/
@Override
public int size() {
return (null != intSet) ? intSet.size() : 0;
}
// public void add(int[] vs) {
// for (int i : vs) {
// add(i);
// }
// }
/**
* When a key is added which is lower than the offset, we need to adjust the whole int set table.
* Although an array copy could do this, we don't have access to the bitset impl.
*
* So we just set the offset to 0, and either convert the whole thing to a inthashset or copy all the bits to a new
* bit set.
*
* @param key
*/
private void adjustBitSetForLowerOffset(int key) {
final IntBitSet intBitSet = (IntBitSet) intSet;
final int size = intBitSet.size() + 1;
final int largestInt = intBitSet.getLargestMenber(); // not called if key > largest member
final int bitSetSpaceNeeded = getBitSetSpaceFromRange(largestInt, 0); // doesn't use offset
final int hashSetSpaceNeeded = getHashSetSpace(size, largestInt, key); // computes using load factor,
useOffset = false; // stop using because we have evidence it isn't appropriate
if (hashSetSpaceNeeded < (bitSetSpaceNeeded - HYSTERESIS)) {
int offset = getHashSetOffset(largestInt, key);
switchFromBitSet(size, offset);
// keep as bit set - it's smaller, but drop the offset
} else {
IntListIterator it = intSet.iterator();
allocateIntBitSet(largestInt, key, 0);
while (it.hasNext()) {
intSet.add(it.nextNvc());
}
}
}
private static int getHashSetOffset(int estMax, int estMin) {
int range = estMax - estMin;
assert (range >= 0);
return (range > HASH_SET_SHORT_MAX_SIZE) ?
Integer.MIN_VALUE : // signal to force use of 4 byte ints
// make the offset such that the 0 point is 3/4 of the way
// toward the top of the range, because there's more likelyhood
// of expansion in that direction
estMax - (range >> 2);
}
public int[] toUnorderedIntArray() {
final int[] a = new int[size()];
IntListIterator it = getUnorderedIterator();
int i = 0;
while (it.hasNext()) {
a[i++] = it.nextNvc();
}
return a;
}
public int[] toOrderedIntArray() {
int [] a = toUnorderedIntArray();
if (isBitSet) {
return a;
}
Arrays.sort(a);
return a;
}
/**
* Only called if key won't fit in existing IntHashSet, IntSet or IntBitSet allocation
*
* If IntBitSet (spaceUsed != 0) compute what its size() (capacity) would be if the
* key were added (causing doubling); include the overhead of the intbitset class.
*
* long words required = larger of double the current size, or the number of long words required
* @param adjKey - the range of bits needed
* @param spaceUsed - the number of bits currently allocated (if currently a bit set), else 0
* @return number of words needed to store the range,
*/
private static int getBitSetSpaceFromRange(int adjKey, int spaceUsed) {
int w64 = 1 + (adjKey >> 6); // 64 bits per long, add one because 0 to 63 takes one word)
spaceUsed = spaceUsed >> (6 - 1); // in # of long words, * 2 because the alloc doubles the space
int newSpace = Math.max(spaceUsed, w64) << 1; // <<1 to convert to # of words from # of longs
// e.g., 31 key => 1 word, 32 key => 2 words
return newSpace + 2; // 2 more for IntBitSet object overhead
}
/**
*
* @param size including new element to be added
* @return number of words including overhead to store that many elements
* unless is bigger than INT_SET_MAX_SIZE, then return MAX_VALUE
*/
private static int getIntSetSpace(int size) {
return (size < INT_SET_MAX_SIZE) ? size + 4 : Integer.MAX_VALUE;
}
/**
* Only called if key won't fit in existing allocation
*
* returns new hash table size ( usually double) + overhead
*
* @param numberOfEntries
* @return new hash table size ( usually double) + overhead
*/
private int getHashSetSpace() {
return ((IntHashSet)intSet).getSpaceUsedInWords() * 2 + IntHashSet.getSpaceOverheadInWords();
}
/**
* For the case where the intHashSet doesn't yet exist
* @param numberOfElements -
* @param estMax - the largest int to be stored
* @param estMin - the smallest int to be stored
* @return the size in words
*/
private static int getHashSetSpace(int numberOfElements, int estMax, int estMin) {
boolean isShort;
if (numberOfElements > HASH_SET_SHORT_MAX_SIZE) {
isShort = false;
} else {
isShort = (estMax - estMin) < HASH_SET_SHORT_MAX_SIZE;
}
int numberOfTableElements = IntHashSet.tableSpace(numberOfElements);
return (numberOfTableElements >> ((isShort) ? 1 : 0))
+ IntHashSet.getSpaceOverheadInWords();
}
/**
* When converting from bitset to hash set, the initial hash set should be
* big enough so that it takes a while before it is doubled-expanded.
*
* Reduce overallocation for small sizes because the cost to switch is low, and the % overallocate
* could be a large.
*
* @param existingSize
* @return overallocate size
*/
private static int getHashSetOverAllocateSize(int existingSize) {
return existingSize + (existingSize >> HASH_SET_OVERALLOCATE_DIVIDER_SHIFT);
}
/**
* logic for switching representation
*
* BitSet preferred - is faster, can be more compact, and permits ordered iteration
* HashSet used when BitSet is too space inefficient.
*
* Avoid switching if the new key being added won't increase the representation size
* - for hash: if number of elements +1 won't cause doubling of the hash table
* - for bitset: if key will fit in existing "capacity"
*
* Compute space needed after adding
* - bitset: array doubles
* - hashset: array doubles
*
* Preventing jitters back and forth:
* - removes don't cause switching
* - compare against other size including its non-linear space jumps.
*/
private void maybeSwitchRepresentation(int newKey) {
if (isBitSet) {
handleBitSet(newKey);
return;
}
if (isIntSet) {
handleIntSet(newKey);
return;
}
handleHashSet(newKey);
}
/********************
* Bit Set
********************/
private void handleBitSet(int newKey) {
IntBitSet intBitSet = (IntBitSet) intSet;
if (intBitSet == null) {
/*********
* Size is 0,
* use this opportunity to set the offset
* unless that has been a bad strategy for this
* instance already
********/
boolean guaranteedFits = allocateIntBitSet(newKey, newKey, -1);
intBitSet = (IntBitSet) intSet;
if (guaranteedFits) {
return;
}
}
final int offset = intBitSet.getOffset();
if (newKey < offset) {
// may change representation; always abandons this bit set
adjustBitSetForLowerOffset(newKey);
return;
}
// return if newKey fits in existing allocation
final int spaceUsed = intBitSet.getSpaceUsed_in_bits_no_overhead();
final int adjKey = newKey - offset;
if (adjKey < spaceUsed) {
return;
}
// this newKey doesn't fit in existing bit set allocation; it's too large
// figure out if we should expand this bit set or switch representations
final int bitSetSpaceNeeded = getBitSetSpaceFromRange(adjKey, spaceUsed); // in 32 bit words
final int sizeNeeded = intBitSet.size() + 1;
if (getIntSetSpace(sizeNeeded) < bitSetSpaceNeeded) {
switchToIntSet(sizeNeeded);
return;
}
// when computing hashset size, overallocate by 50% to avoid immediately doubling
final int hashSetOverAllocateSize = getHashSetOverAllocateSize(sizeNeeded);
final int hashSetOverAllocateSpace = getHashSetSpace(hashSetOverAllocateSize, newKey, offset);
// switch if hashmap space plus hysteresis would be < bitmap space
if (hashSetOverAllocateSpace < (bitSetSpaceNeeded - HYSTERESIS)) {
switchToHashSet(hashSetOverAllocateSize, getHashSetOffset(newKey, offset));
}
return;
}
/********************
* IntSet *
********************/
private void handleIntSet(int newKey) {
IntSet is = (IntSet) intSet;
final int size = is.size() + 1;
if (size <= INT_SET_MAX_SIZE) {
return;
}
// switch to bit or IntHash
// first, compute max and min values
int mostPos = Integer.MIN_VALUE;
int mostNeg = Integer.MAX_VALUE;
for (int i = 0; i < size - 1; i++) {
final int v = is.get(i);
if (v > mostPos) { mostPos = v;}
if (v < mostNeg) { mostNeg = v;}
}
// include the newKey in the computation of these
if (newKey > mostPos) { mostPos = newKey;}
if (newKey < mostNeg) { mostNeg = newKey;}
// make a bit set or hash set.
// have 16 members, so the hash set is approx 32 + 11 words = 43.
// bit set is approx 2 + range / 32
final int bitSetSpace = getBitSetSpaceFromRange(mostPos - mostNeg, 0);
final int hashSetOverAllocSize = getHashSetOverAllocateSize(size);
final int hashSetSpace = getHashSetSpace(hashSetOverAllocSize, mostPos, mostNeg);
if (bitSetSpace < hashSetSpace) {
switchToBitSet(mostPos, mostNeg, estimatedBitSetOffset(mostNeg, -1));
return;
}
switchToHashSet(hashSetOverAllocSize, getHashSetOffset(mostPos, mostNeg));
}
/********************
* Hash Set
********************/
private void handleHashSet(int newKey) {
// maybe switch to IntBitSet
// space used in words (32 bit words)
// don't bother to include the new element - it might not be "added"
// if it was already there, and only serves to decrease hysteresis a bit
final IntHashSet intHashSet = (IntHashSet) intSet;
if (intHashSet.wontExpand()) {
return;
}
final int hashSetSpaceNeeded = getHashSetSpace(); //computes the doubling of space
final int maxInt = intHashSet.getMostPositive();
// https://issues.apache.org/jira/browse/UIMA-4159
final int offset = useOffset ? Math.min(intHashSet.getMostNegative(), newKey) : 0;
final int bitSetSpaceNeeded = getBitSetSpaceFromRange(BIT_SET_OVERALLOCATE + maxInt - offset, 0);
if (bitSetSpaceNeeded < (hashSetSpaceNeeded - HYSTERESIS)) {
switchToBitSet(maxInt, intHashSet.getMostNegative(), offset);
}
}
/**
* Allocate a new IntBitSet
* @param estMax the largest int to store in the bit set - determines the initial size
* @param estMin this is a lower bound on the value that can be in the bit set
* @param offsetSpec this is the offset to use, or -1, to calculate the offset from
* the initial_size (assuming a small amount of adds will be for
* ints somewhat less than the initial_size entry (being treated
* as the first int to be added)
* @return true if allocated with offset, implying guarantee the estMax key fits.
*/
private boolean allocateIntBitSet(int estMax, int estMin, int offsetSpec) {
isBitSet = true;
isHashSet = isIntSet = false;
if (useOffset) {
// initialize bitSetOffset to the key, minus some amount to allow some storage of previous values
// a minimum of 63, a maximum of 512 == 16 words, when key is > 4K (128 words)
final int offset = estimatedBitSetOffset(estMin, offsetSpec);
intSet = new IntBitSet(BIT_SET_OVERALLOCATE + estMax - offset, offset);
// because of offset, won't have to switch to hash for this key
return true;
} else {
intSet = new IntBitSet(BIT_SET_OVERALLOCATE + estMax);
return false;
}
}
private static int estimatedBitSetOffset(int estMin, int offsetSpec) {
return // initialize bitSetOffset to the key, minus some amount to allow some storage of previous values
// a minimum of 63, a maximum of 512 == 16 words, when key is > 4K (128 words)
(offsetSpec == -1) ?
Math.max(0, estMin - Math.min(1023, Math.max(63, estMin >> 3))) :
offsetSpec;
}
/**
* switching from bit set to either IntSet or IntHashSet
* @param size - space needed including new item
* @param offset - for IntHashSet values kept as short, the offset subtracted from values before storing them
* offset == Integer.MIN_VALUE means use 4 byte ints
*/
private void switchFromBitSet(int size, int offset) {
if (size < INT_SET_MAX_SIZE) {
switchToIntSet(size);
return;
}
switchToHashSet(size, offset);
}
/**
* switch to IntSet
* @param size number of elements to be able to store without expanding
*/
private void switchToIntSet(int size) {
IntListIterator it = intSet.iterator();
intSet = new IntSet(size);
isIntSet = true;
isBitSet = isHashSet = false;
while (it.hasNext()) {
intSet.add(it.nextNvc());
}
return;
}
/**
* Switch from any intSet impl to Hash Set
* @param size - the capacity - this many elements could be stored before swtiching
* @param offset - used only when the size < 65K, and is a value subtracted from elements before
* storing them, in an attempt to fit the results into just "short"s instead of "int"s
* if == MIN_VALUE, then force 4 byte ints
*/
private void switchToHashSet(int size, int offset) {
IntListIterator it = intSet.iterator();
intSet = new IntHashSet(size, offset);
isIntSet = isBitSet = false;
isHashSet = true;
while (it.hasNext()) {
intSet.add(it.nextNvc());
}
}
private void switchToBitSet(int estMax, int estMin, int offset) {
final IntListIterator it = intSet.iterator();
allocateIntBitSet(estMax, estMin, offset);
while (it.hasNext()) {
intSet.add(it.nextNvc());
}
}
@Override
public int get(int position) {
if (null == intSet) {
throw new NoSuchElementException();
}
return intSet.get(position);
}
@Override
public int moveToFirst() {
return (null == intSet) ? -1 : intSet.moveToFirst();
}
@Override
public int moveToLast() {
return (null == intSet) ? -1 : intSet.moveToLast();
}
@Override
public int moveToNext(int position) {
return (null == intSet) ? -1 : intSet.moveToNext(position);
}
@Override
public int moveToPrevious(int position) {
return (null == intSet) ? -1 : intSet.moveToPrevious(position);
}
@Override
public boolean isValid(int position) {
return (null == intSet) ? false : intSet.isValid(position);
}
@Override
public void bulkAddTo(IntVector v) {
if (null != intSet) {
intSet.bulkAddTo(v);
}
}
@Override
public int[] toIntArray() {
if (null != intSet) {
return intSet.toIntArray();
}
return Constants.EMPTY_INT_ARRAY;
}
/* (non-Javadoc)
* @see java.lang.Object#toString()
*/
@Override
public String toString() {
return String.format("PositiveIntSet_impl [%n intSet=%s%n secondTimeShrinkable=%s, useOffset=%s]",
intSet, secondTimeShrinkable, useOffset);
}
// for testing
boolean isOffsetBitSet() {
return (isBitSet && ((IntBitSet)intSet).getOffset() != 0);
}
// for testing
boolean isShortHashSet() {
return (isHashSet && ((IntHashSet)intSet).isShortHashSet());
}
}