Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt (revision 1488278)
+++ lucene/CHANGES.txt (working copy)
@@ -169,6 +169,10 @@
* LUCENE-5022: Added FacetResult.mergeHierarchies to merge multiple
FacetResult of the same dimension into a single one with the reconstructed
hierarchy. (Shai Erera)
+
+* LUCENE-5026: Added PagedGrowableWriter, a new internal packed-ints structure
+ that grows on demand, can store more than 2B values and supports random write
+ and read access. (Adrien Grand)
Build
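
For reference, a minimal usage sketch of the structure this entry describes, using only the constructor and methods added by PagedGrowableWriter.java further down in this patch (the demo class name is made up):

    import org.apache.lucene.util.packed.PackedInts;
    import org.apache.lucene.util.packed.PagedGrowableWriter;

    public class PagedGrowableWriterUsage {
      public static void main(String[] args) {
        // 10M values, 4096 values per page (must be >= 64 and a power of two),
        // 2 bits per value to start with, COMPACT = grow with no wasted space.
        PagedGrowableWriter writer = new PagedGrowableWriter(10000000L, 4096, 2, PackedInts.COMPACT);
        writer.set(9999999L, 1234);               // random write; the page grows its bits per value on demand
        System.out.println(writer.get(9999999L)); // 1234
        System.out.println(writer.size());        // 10000000
      }
    }

COMPACT trades a little CPU for minimal memory; any of the PackedInts overhead-ratio constants works as the last argument.
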
Index: lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java (revision 1488278)
+++ lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java (working copy)
@@ -659,6 +659,61 @@
assertEquals(1 << 10, wrt.get(valueCount - 1));
}
+ public void testPagedGrowableWriter() {
+ int pageSize = 1 << (_TestUtil.nextInt(random(), 6, 22));
+ // supports 0 values?
+ PagedGrowableWriter writer = new PagedGrowableWriter(0, pageSize, _TestUtil.nextInt(random(), 1, 64), random().nextFloat());
+ assertEquals(0, writer.size());
+
+ // compare against AppendingLongBuffer
+ AppendingLongBuffer buf = new AppendingLongBuffer();
+ int size = random().nextInt(1000000);
+ long max = 5;
+ for (int i = 0; i < size; ++i) {
+ buf.add(_TestUtil.nextLong(random(), 0, max));
+ if (rarely()) {
+ max = PackedInts.maxValue(rarely() ? _TestUtil.nextInt(random(), 0, 63) : _TestUtil.nextInt(random(), 0, 31));
+ }
+ }
+ writer = new PagedGrowableWriter(size, pageSize, _TestUtil.nextInt(random(), 1, 64), random().nextFloat());
+ assertEquals(size, writer.size());
+ for (int i = size - 1; i >= 0; --i) {
+ writer.set(i, buf.get(i));
+ }
+ for (int i = 0; i < size; ++i) {
+ assertEquals(buf.get(i), writer.get(i));
+ }
+
+ // test copy
+ PagedGrowableWriter copy = writer.resize(_TestUtil.nextLong(random(), writer.size(), 2 * writer.size()));
+ for (long i = 0; i < copy.size(); ++i) {
+ if (i < writer.size()) {
+ assertEquals(writer.get(i), copy.get(i));
+ } else {
+ assertEquals(0, copy.get(i));
+ }
+ }
+ }
+
+ // memory hole
+ @Ignore
+ public void testPagedGrowableWriterOverflow() {
+ final long size = _TestUtil.nextLong(random(), 2 * (long) Integer.MAX_VALUE, 3 * (long) Integer.MAX_VALUE);
+ final int pageSize = 1 << (_TestUtil.nextInt(random(), 16, 24));
+ final PagedGrowableWriter writer = new PagedGrowableWriter(size, pageSize, 1, random().nextFloat());
+ final long index = _TestUtil.nextLong(random(), (long) Integer.MAX_VALUE, size);
+ writer.set(index, 2);
+ assertEquals(2, writer.get(index));
+ for (int i = 0; i < 1000000; ++i) {
+ final long idx = _TestUtil.nextLong(random(), 0, size);
+ if (idx == index) {
+ assertEquals(2, writer.get(idx));
+ } else {
+ assertEquals(0, writer.get(idx));
+ }
+ }
+ }
+
public void testSave() throws IOException {
final int valueCount = _TestUtil.nextInt(random(), 1, 2048);
for (int bpv = 1; bpv <= 64; ++bpv) {
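
The test above bounds its random values with PackedInts.maxValue, feeding it bit counts between 0 and 63. As a reminder of what that helper returns, and assuming the usual definition (maxValue(bpv) == 2^bpv - 1 for bpv < 64, Long.MAX_VALUE for 64), its relationship with bitsRequired can be checked with a small standalone program:

    import org.apache.lucene.util.packed.PackedInts;

    public class MaxValueCheck {
      public static void main(String[] args) {
        // For 1..63 bits, maxValue(bpv) is 2^bpv - 1 and bitsRequired is its inverse;
        // maxValue(64) is Long.MAX_VALUE because values are signed longs.
        for (int bpv = 1; bpv < 64; ++bpv) {
          long max = PackedInts.maxValue(bpv);
          if (max != (1L << bpv) - 1 || PackedInts.bitsRequired(max) != bpv) {
            throw new AssertionError("bpv=" + bpv + " max=" + max);
          }
        }
        System.out.println("maxValue(10) = " + PackedInts.maxValue(10)); // 1023
      }
    }
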
Index: lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java (revision 1488278)
+++ lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java (working copy)
@@ -34,7 +34,8 @@
import org.junit.Ignore;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
-@Ignore("Requires tons of heap to run (10G works)")
+// nocommit
+//@Ignore("Requires tons of heap to run (420G works)")
@TimeoutSuite(millis = 100 * TimeUnits.HOUR)
public class Test2BFST extends LuceneTestCase {
@@ -50,12 +51,12 @@
for(int doPackIter=0;doPackIter<2;doPackIter++) {
boolean doPack = doPackIter == 1;
- // Build FST w/ NoOutputs and stop when nodeCount > 3B
+ // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
if (!doPack) {
System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
Outputs<Object> outputs = NoOutputs.getSingleton();
Object NO_OUTPUT = outputs.getNoOutput();
- final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, false, false, Integer.MAX_VALUE, outputs,
+ final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
null, doPack, PackedInts.COMPACT, true, 15);
int count = 0;
@@ -72,7 +73,7 @@
if (count % 100000 == 0) {
System.out.println(count + ": " + b.fstSizeInBytes() + " bytes; " + b.getTotStateCount() + " nodes");
}
- if (b.getTotStateCount() > LIMIT) {
+ if (b.getTotStateCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
break;
}
nextInput(r, ints2);
Index: lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (revision 1488278)
+++ lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (working copy)
@@ -21,19 +21,20 @@
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PagedGrowableWriter;
// Used to dedup states (lookup already-frozen states)
final class NodeHash<T> {
- private GrowableWriter table;
- private int count;
- private int mask;
+ private PagedGrowableWriter table;
+ private long count;
+ private long mask;
private final FST<T> fst;
private final FST.Arc<T> scratchArc = new FST.Arc<T>();
private final FST.BytesReader in;
public NodeHash(FST<T> fst, FST.BytesReader in) {
- table = new GrowableWriter(8, 16, PackedInts.COMPACT);
+ table = new PagedGrowableWriter(16, 1<<30, 8, PackedInts.COMPACT);
mask = 15;
this.fst = fst;
this.in = in;
@@ -69,10 +70,10 @@
// hash code for an unfrozen node. This must be identical
// to the un-frozen case (below)!!
- private int hash(Builder.UnCompiledNode<T> node) {
+ private long hash(Builder.UnCompiledNode<T> node) {
final int PRIME = 31;
//System.out.println("hash unfrozen");
- int h = 0;
+ long h = 0;
// TODO: maybe if number of arcs is high we can safely subsample?
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
final Builder.Arc<T> arc = node.arcs[arcIdx];
@@ -87,14 +88,14 @@
}
}
//System.out.println(" ret " + (h&Integer.MAX_VALUE));
- return h & Integer.MAX_VALUE;
+ return h & Long.MAX_VALUE;
}
// hash code for a frozen node
- private int hash(long node) throws IOException {
+ private long hash(long node) throws IOException {
final int PRIME = 31;
//System.out.println("hash frozen node=" + node);
- int h = 0;
+ long h = 0;
fst.readFirstRealTargetArc(node, scratchArc, in);
while(true) {
//System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal() + " pos=" + in.getPosition());
@@ -111,13 +112,13 @@
fst.readNextRealArc(scratchArc, in);
}
//System.out.println(" ret " + (h&Integer.MAX_VALUE));
- return h & Integer.MAX_VALUE;
+ return h & Long.MAX_VALUE;
}
public long add(Builder.UnCompiledNode<T> nodeIn) throws IOException {
- // System.out.println("hash: add count=" + count + " vs " + table.size());
- final int h = hash(nodeIn);
- int pos = h & mask;
+ //System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask);
+ final long h = hash(nodeIn);
+ long pos = h & mask;
int c = 0;
while(true) {
final long v = table.get(pos);
@@ -128,7 +129,8 @@
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
count++;
table.set(pos, node);
- if (table.size() < 2*count) {
+ // Rehash at 2/3 occupancy:
+ if (count > 2*table.size()/3) {
rehash();
}
return node;
@@ -144,7 +146,7 @@
// called only by rehash
private void addNew(long address) throws IOException {
- int pos = hash(address) & mask;
+ long pos = hash(address) & mask;
int c = 0;
while(true) {
if (table.get(pos) == 0) {
@@ -158,15 +160,12 @@
}
private void rehash() throws IOException {
- final GrowableWriter oldTable = table;
+ final PagedGrowableWriter oldTable = table;
- if (oldTable.size() >= Integer.MAX_VALUE/2) {
- throw new IllegalStateException("FST too large (> 2.1 GB)");
- }
-
- table = new GrowableWriter(oldTable.getBitsPerValue(), 2*oldTable.size(), PackedInts.COMPACT);
+ table = new PagedGrowableWriter(2*oldTable.size(), 1<<30, PackedInts.bitsRequired(count), PackedInts.COMPACT);
+ System.out.println("rehash to " + table.size());
mask = table.size()-1;
- for(int idx=0;idx<oldTable.size();idx++) {
+ for(long idx=0;idx<oldTable.size();idx++) {
final long address = oldTable.get(idx);
if (address != 0) {
addNew(address);
@@ -174,7 +173,10 @@
}
}
- public int count() {
+ // nocommit unused?
+ /*
+ public long count() {
return count;
}
+ */
}
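
The NodeHash changes above keep the existing open-addressing scheme but widen hash, mask and count to long and back the table with a PagedGrowableWriter, so the table can hold more than 2^31 entries and is rehashed at 2/3 occupancy. A standalone sketch of that pattern follows; the class and helper names are made up, the hash mixer is arbitrary, and linear probing stands in for whatever probe sequence NodeHash actually uses, so treat it as an illustration rather than the real code:

    import org.apache.lucene.util.packed.PackedInts;
    import org.apache.lucene.util.packed.PagedGrowableWriter;

    // Hypothetical set of non-zero long addresses over a power-of-two table.
    final class LongAddressSet {
      private PagedGrowableWriter table =
          new PagedGrowableWriter(16, 1 << 30, 8, PackedInts.COMPACT);
      private long mask = 15;                     // table.size() - 1, table size is a power of two
      private long count;

      private static long hash(long address) {
        long h = address * 0x9E3779B97F4A7C15L;   // any mixing function; NodeHash hashes the node's arcs instead
        return h & Long.MAX_VALUE;                // keep it non-negative, as in "h & Long.MAX_VALUE" above
      }

      void add(long address) {                    // 0 is reserved to mean "empty slot"
        if (insert(address)) {
          if (++count > 2 * table.size() / 3) {   // rehash at 2/3 occupancy, as in the patch
            rehash();
          }
        }
      }

      private boolean insert(long address) {
        long pos = hash(address) & mask;
        while (table.get(pos) != 0) {
          if (table.get(pos) == address) {
            return false;                         // already present
          }
          pos = (pos + 1) & mask;                 // linear probing; NodeHash uses a different probe step
        }
        table.set(pos, address);
        return true;
      }

      private void rehash() {
        final PagedGrowableWriter old = table;
        table = new PagedGrowableWriter(2 * old.size(), 1 << 30,
            PackedInts.bitsRequired(count), PackedInts.COMPACT);
        mask = table.size() - 1;
        for (long i = 0; i < old.size(); ++i) {
          final long address = old.get(i);
          if (address != 0) {
            insert(address);                      // re-slot without re-counting
          }
        }
      }
    }
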
Index: lucene/core/src/java/org/apache/lucene/util/packed/package.html
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/packed/package.html (revision 1488278)
+++ lucene/core/src/java/org/apache/lucene/util/packed/package.html (working copy)
@@ -47,6 +47,11 @@
<li>Same as PackedInts.Mutable but grows the number of bits per values when needed.</li>
<li>Useful to build a PackedInts.Mutable from a read-once stream of longs.</li>
</ul></li>
+ <li><b>{@link org.apache.lucene.util.packed.PagedGrowableWriter}</b><ul>
+ <li>Slices data into fixed-size blocks stored in GrowableWriters.</li>
+ <li>Supports more than 2B values.</li>
+ <li>You should use AppendingLongBuffer instead if you don't need random write access.</li>
+ </ul></li>
<li><b>{@link org.apache.lucene.util.packed.AppendingLongBuffer}</b><ul>
<li>Can store any sequence of longs.</li>
<li>Compression is good when values are close to each other.</li>
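
The package docs above boil the choice down to whether random write access is needed. A short sketch contrasting the two access patterns, using the APIs as exercised by the test earlier in this patch (the demo class name is made up):

    import org.apache.lucene.util.packed.AppendingLongBuffer;
    import org.apache.lucene.util.packed.PackedInts;
    import org.apache.lucene.util.packed.PagedGrowableWriter;

    public class PackedChoiceDemo {
      public static void main(String[] args) {
        // Write-once, sequential: AppendingLongBuffer only supports appends.
        AppendingLongBuffer appendOnly = new AppendingLongBuffer();
        for (long value : new long[] {3, 1, 4, 1, 5}) {
          appendOnly.add(value);
        }
        System.out.println(appendOnly.get(2)); // 4

        // Random writes (e.g. hash-table slots): PagedGrowableWriter.
        PagedGrowableWriter slots = new PagedGrowableWriter(5, 64, 1, PackedInts.DEFAULT);
        slots.set(2, 4);
        slots.set(0, 3);
        System.out.println(slots.get(2));      // 4
      }
    }
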
Index: lucene/core/src/java/org/apache/lucene/util/packed/PagedGrowableWriter.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/packed/PagedGrowableWriter.java (revision 0)
+++ lucene/core/src/java/org/apache/lucene/util/packed/PagedGrowableWriter.java (working copy)
@@ -0,0 +1,139 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link PagedGrowableWriter}. This class slices data into fixed-size blocks
+ * which have independent numbers of bits per value and grow on-demand.
+ * <p>You should use this class instead of {@link AppendingLongBuffer} only when
+ * you need random write-access. Otherwise this class will likely be slower and
+ * less memory-efficient.
+ * @lucene.internal
+ */
+public final class PagedGrowableWriter {
+
+ private final long size;
+ private final int pageShift;
+ private final int pageMask;
+ private final GrowableWriter[] subWriters;
+ private final int startBitsPerValue;
+ private final float acceptableOverheadRatio;
+
+ /**
+ * Create a new {@link PagedGrowableWriter} instance.
+ *
+ * @param size the number of values to store.
+ * @param pageSize the number of values per page
+ * @param startBitsPerValue the initial number of bits per value
+ * @param acceptableOverheadRatio an acceptable overhead ratio
+ */
+ public PagedGrowableWriter(long size, int pageSize,
+ int startBitsPerValue, float acceptableOverheadRatio) {
+ this(size, pageSize, startBitsPerValue, acceptableOverheadRatio, true);
+ }
+
+ PagedGrowableWriter(long size, int pageSize, int startBitsPerValue, float acceptableOverheadRatio, boolean fillPages) {
+ super();
+ this.size = size;
+ this.startBitsPerValue = startBitsPerValue;
+ this.acceptableOverheadRatio = acceptableOverheadRatio;
+ if (pageSize < 64 || ((pageSize & (pageSize - 1)) != 0)) {
+ throw new IllegalArgumentException("pageSize must be >= 64 and a power of 2, got " + pageSize);
+ }
+ pageShift = 31 - Integer.numberOfLeadingZeros(pageSize);
+ assert (1 << pageShift) == pageSize;
+ pageMask = pageSize - 1;
+ final int numPages = (int) ((size + pageSize - 1) / pageSize);
+ if ((long) numPages * pageSize < size || (long) (numPages - 1) * pageSize > size) {
+ throw new IllegalArgumentException("pageSize must be chosen so that there are at most Integer.MAX_VALUE pages, got size=" + size + ", pageSize=" + pageSize);
+ }
+ subWriters = new GrowableWriter[numPages];
+ if (fillPages) {
+ for (int i = 0; i < numPages; ++i) {
+ int thisPageSize;
+ if (i == numPages-1) {
+ thisPageSize = (int) (size % pageSize);
+ if (thisPageSize == 0) {
+ thisPageSize = pageSize;
+ }
+ } else {
+ thisPageSize = pageSize;
+ }
+ subWriters[i] = new GrowableWriter(startBitsPerValue, thisPageSize, acceptableOverheadRatio);
+ }
+ }
+ }
+
+ private int pageSize() {
+ return pageMask + 1;
+ }
+
+ /** The number of values. */
+ public long size() {
+ return size;
+ }
+
+ int pageIndex(long index) {
+ return (int) (index >>> pageShift);
+ }
+
+ int indexInPage(long index) {
+ return (int) index & pageMask;
+ }
+
+ /** Get value at <code>index</code>. */
+ public long get(long index) {
+ assert index >= 0 && index < size: "index=" + index + " size=" + size;
+ final int pageIndex = pageIndex(index);
+ final int indexInPage = indexInPage(index);
+ return subWriters[pageIndex].get(indexInPage);
+ }
+
+ /** Set value at <code>index</code>. */
+ public void set(long index, long value) {
+ assert index >= 0 && index < size;
+ final int pageIndex = pageIndex(index);
+ final int indexInPage = indexInPage(index);
+ subWriters[pageIndex].set(indexInPage, value);
+ }
+
+ /** Create a new {@link PagedGrowableWriter} of size <code>newSize</code>
+ * based on the content of this buffer. This method is much more efficient
+ * than creating a new {@link PagedGrowableWriter} and copying values one by
+ * one. */
+ public PagedGrowableWriter resize(long newSize) {
+ PagedGrowableWriter newWriter = new PagedGrowableWriter(newSize, pageSize(), startBitsPerValue, acceptableOverheadRatio, false);
+ final int numCommonPages = Math.min(newWriter.subWriters.length, subWriters.length);
+ final long[] copyBuffer = new long[1024];
+ for (int i = 0; i < numCommonPages; ++i) {
+ final int bpv = subWriters[i].getBitsPerValue();
+ newWriter.subWriters[i] = new GrowableWriter(bpv, pageSize(), acceptableOverheadRatio);
+ PackedInts.copy(subWriters[i], 0, newWriter.subWriters[i], 0, pageSize(), copyBuffer);
+ }
+ for (int i = numCommonPages; i < newWriter.subWriters.length; ++i) {
+ newWriter.subWriters[i] = new GrowableWriter(startBitsPerValue, pageSize(), acceptableOverheadRatio);
+ }
+ return newWriter;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName() + "(size=" + size() + ",pageSize=" + (pageMask+1) + ")";
+ }
+
+}
Property changes on: lucene/core/src/java/org/apache/lucene/util/packed/PagedGrowableWriter.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
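
The pageIndex/indexInPage arithmetic in the new class relies on pageSize being a power of two, so a shift and a mask replace division and modulo. A small worked example for pageSize = 1 << 10, written outside the class since those helpers are package-private (the demo class name is made up):

    public class PagingMathDemo {
      public static void main(String[] args) {
        // Same arithmetic as pageIndex()/indexInPage() above, for pageSize = 1 << 10:
        final int pageShift = 10;                 // pageSize = 1024
        final int pageMask = (1 << pageShift) - 1;
        final long index = 3000000000L;           // > Integer.MAX_VALUE, still addressable
        final int pageIndex = (int) (index >>> pageShift);
        final int indexInPage = (int) index & pageMask;
        System.out.println("page " + pageIndex + ", slot " + indexInPage); // page 2929687, slot 512
        // Equivalent to the slower divide/modulo form:
        System.out.println(index / 1024 + " " + index % 1024);             // 2929687 512
      }
    }
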
Index: lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java (revision 1488278)
+++ lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java (working copy)
@@ -1200,31 +1200,37 @@
}
} else {
// use bulk operations
- long[] buf = new long[Math.min(capacity, len)];
- int remaining = 0;
- while (len > 0) {
- final int read = src.get(srcPos, buf, remaining, Math.min(len, buf.length - remaining));
- assert read > 0;
- srcPos += read;
- len -= read;
- remaining += read;
- final int written = dest.set(destPos, buf, 0, remaining);
- assert written > 0;
- destPos += written;
- if (written < remaining) {
- System.arraycopy(buf, written, buf, 0, remaining - written);
- }
- remaining -= written;
+ final long[] buf = new long[Math.min(capacity, len)];
+ copy(src, srcPos, dest, destPos, len, buf);
+ }
+ }
+
+ /** Same as {@link #copy(Reader, int, Mutable, int, int, int)} but using a pre-allocated buffer. */
+ static void copy(Reader src, int srcPos, Mutable dest, int destPos, int len, long[] buf) {
+ assert buf.length > 0;
+ int remaining = 0;
+ while (len > 0) {
+ final int read = src.get(srcPos, buf, remaining, Math.min(len, buf.length - remaining));
+ assert read > 0;
+ srcPos += read;
+ len -= read;
+ remaining += read;
+ final int written = dest.set(destPos, buf, 0, remaining);
+ assert written > 0;
+ destPos += written;
+ if (written < remaining) {
+ System.arraycopy(buf, written, buf, 0, remaining - written);
}
- while (remaining > 0) {
- final int written = dest.set(destPos, buf, 0, remaining);
- destPos += written;
- remaining -= written;
- System.arraycopy(buf, written, buf, 0, remaining);
- }
+ remaining -= written;
}
+ while (remaining > 0) {
+ final int written = dest.set(destPos, buf, 0, remaining);
+ destPos += written;
+ remaining -= written;
+ System.arraycopy(buf, written, buf, 0, remaining);
+ }
}
-
+
/**
* Expert: reads only the metadata from a stream. This is useful to later
* restore a stream or open a direct reader via
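
The refactoring above only extracts the buffered copy loop into a package-private overload that takes a pre-allocated long[], so PagedGrowableWriter.resize can reuse one scratch buffer across pages. Callers outside the package keep using the public overload, roughly as in this sketch (the mem argument is the scratch-buffer budget in bytes; the demo class name is made up):

    import org.apache.lucene.util.packed.PackedInts;

    public class PackedCopyDemo {
      public static void main(String[] args) {
        PackedInts.Mutable src = PackedInts.getMutable(1000, 7, PackedInts.COMPACT);
        for (int i = 0; i < src.size(); ++i) {
          src.set(i, i % 100);
        }
        // Copy into a wider destination; the last argument is the scratch-buffer
        // budget in bytes (the new package-private overload takes the long[] directly).
        PackedInts.Mutable dest = PackedInts.getMutable(1000, 16, PackedInts.DEFAULT);
        PackedInts.copy(src, 0, dest, 0, src.size(), 1024);
        System.out.println(dest.get(123)); // 23
      }
    }
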