Index: lucene/CHANGES.txt
===================================================================
--- lucene/CHANGES.txt (revision 1488278)
+++ lucene/CHANGES.txt (working copy)
@@ -169,6 +169,10 @@
* LUCENE-5022: Added FacetResult.mergeHierarchies to merge multiple
FacetResult of the same dimension into a single one with the reconstructed
hierarchy. (Shai Erera)
+
+* LUCENE-5026: Added PagedGrowableWriter, a new internal packed-ints structure
+ that grows on demand, can store more than 2B values and supports random write
+ and read access. (Adrien Grand)
Build
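
For reference, a minimal usage sketch of the structure this entry describes, using only the constructor and methods added by PagedGrowableWriter.java further down in this patch (the demo class name is made up):

    import org.apache.lucene.util.packed.PackedInts;
    import org.apache.lucene.util.packed.PagedGrowableWriter;

    public class PagedGrowableWriterUsage {
      public static void main(String[] args) {
        // 10M values, 4096 values per page (must be >= 64 and a power of two),
        // 2 bits per value to start with, COMPACT = grow with no wasted space.
        PagedGrowableWriter writer = new PagedGrowableWriter(10000000L, 4096, 2, PackedInts.COMPACT);
        writer.set(9999999L, 1234);               // random write; the page grows its bits per value on demand
        System.out.println(writer.get(9999999L)); // 1234
        System.out.println(writer.size());        // 10000000
      }
    }

COMPACT trades a little CPU for minimal memory; any of the PackedInts overhead-ratio constants works as the last argument.
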
Index: lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java (revision 1488278)
+++ lucene/core/src/test/org/apache/lucene/util/packed/TestPackedInts.java (working copy)
@@ -659,6 +659,61 @@
assertEquals(1 << 10, wrt.get(valueCount - 1));
}
+ public void testPagedGrowableWriter() {
+ int pageSize = 1 << (_TestUtil.nextInt(random(), 6, 22));
+ // supports 0 values?
+ PagedGrowableWriter writer = new PagedGrowableWriter(0, pageSize, _TestUtil.nextInt(random(), 1, 64), random().nextFloat());
+ assertEquals(0, writer.size());
+
+ // compare against AppendingLongBuffer
+ AppendingLongBuffer buf = new AppendingLongBuffer();
+ int size = random().nextInt(1000000);
+ long max = 5;
+ for (int i = 0; i < size; ++i) {
+ buf.add(_TestUtil.nextLong(random(), 0, max));
+ if (rarely()) {
+ max = PackedInts.maxValue(rarely() ? _TestUtil.nextInt(random(), 0, 63) : _TestUtil.nextInt(random(), 0, 31));
+ }
+ }
+ writer = new PagedGrowableWriter(size, pageSize, _TestUtil.nextInt(random(), 1, 64), random().nextFloat());
+ assertEquals(size, writer.size());
+ for (int i = size - 1; i >= 0; --i) {
+ writer.set(i, buf.get(i));
+ }
+ for (int i = 0; i < size; ++i) {
+ assertEquals(buf.get(i), writer.get(i));
+ }
+
+ // test copy
+ PagedGrowableWriter copy = writer.resize(_TestUtil.nextLong(random(), writer.size(), 2 * writer.size()));
+ for (long i = 0; i < copy.size(); ++i) {
+ if (i < writer.size()) {
+ assertEquals(writer.get(i), copy.get(i));
+ } else {
+ assertEquals(0, copy.get(i));
+ }
+ }
+ }
+
+ // memory hole
+ @Ignore
+ public void testPagedGrowableWriterOverflow() {
+ final long size = _TestUtil.nextLong(random(), 2 * (long) Integer.MAX_VALUE, 3 * (long) Integer.MAX_VALUE);
+ final int pageSize = 1 << (_TestUtil.nextInt(random(), 16, 24));
+ final PagedGrowableWriter writer = new PagedGrowableWriter(size, pageSize, 1, random().nextFloat());
+ final long index = _TestUtil.nextLong(random(), (long) Integer.MAX_VALUE, size);
+ writer.set(index, 2);
+ assertEquals(2, writer.get(index));
+ for (int i = 0; i < 1000000; ++i) {
+ final long idx = _TestUtil.nextLong(random(), 0, size);
+ if (idx == index) {
+ assertEquals(2, writer.get(idx));
+ } else {
+ assertEquals(0, writer.get(idx));
+ }
+ }
+ }
+
public void testSave() throws IOException {
final int valueCount = _TestUtil.nextInt(random(), 1, 2048);
for (int bpv = 1; bpv <= 64; ++bpv) {
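
The test above bounds its random values with PackedInts.maxValue, feeding it bit counts between 0 and 63. As a reminder of what that helper returns, and assuming the usual definition (maxValue(bpv) == 2^bpv - 1 for bpv < 64, Long.MAX_VALUE for 64), its relationship with bitsRequired can be checked with a small standalone program:

    import org.apache.lucene.util.packed.PackedInts;

    public class MaxValueCheck {
      public static void main(String[] args) {
        // For 1..63 bits, maxValue(bpv) is 2^bpv - 1 and bitsRequired is its inverse;
        // maxValue(64) is Long.MAX_VALUE because values are signed longs.
        for (int bpv = 1; bpv < 64; ++bpv) {
          long max = PackedInts.maxValue(bpv);
          if (max != (1L << bpv) - 1 || PackedInts.bitsRequired(max) != bpv) {
            throw new AssertionError("bpv=" + bpv + " max=" + max);
          }
        }
        System.out.println("maxValue(10) = " + PackedInts.maxValue(10)); // 1023
      }
    }
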
Index: lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
===================================================================
--- lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java (revision 1488278)
+++ lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java (working copy)
@@ -34,7 +34,8 @@
import org.junit.Ignore;
import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
-@Ignore("Requires tons of heap to run (10G works)")
+// nocommit
+//@Ignore("Requires tons of heap to run (420G works)")
@TimeoutSuite(millis = 100 * TimeUnits.HOUR)
public class Test2BFST extends LuceneTestCase {
@@ -50,12 +51,12 @@
for(int doPackIter=0;doPackIter<2;doPackIter++) {
boolean doPack = doPackIter == 1;
- // Build FST w/ NoOutputs and stop when nodeCount > 3B
+ // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
if (!doPack) {
System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
Outputs<Object> outputs = NoOutputs.getSingleton();
Object NO_OUTPUT = outputs.getNoOutput();
- final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, false, false, Integer.MAX_VALUE, outputs,
+ final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs,
null, doPack, PackedInts.COMPACT, true, 15);
int count = 0;
@@ -72,7 +73,7 @@
if (count % 100000 == 0) {
System.out.println(count + ": " + b.fstSizeInBytes() + " bytes; " + b.getTotStateCount() + " nodes");
}
- if (b.getTotStateCount() > LIMIT) {
+ if (b.getTotStateCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
break;
}
nextInput(r, ints2);
Index: lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (revision 1488278)
+++ lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java (working copy)
@@ -21,19 +21,20 @@
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PagedGrowableWriter;
// Used to dedup states (lookup already-frozen states)
final class NodeHash<T> {
- private GrowableWriter table;
- private int count;
- private int mask;
+ private PagedGrowableWriter table;
+ private long count;
+ private long mask;
private final FST<T> fst;
private final FST.Arc<T> scratchArc = new FST.Arc<T>();
private final FST.BytesReader in;
public NodeHash(FST<T> fst, FST.BytesReader in) {
- table = new GrowableWriter(8, 16, PackedInts.COMPACT);
+ table = new PagedGrowableWriter(16, 1<<30, 8, PackedInts.COMPACT);
mask = 15;
this.fst = fst;
this.in = in;
@@ -69,10 +70,10 @@
// hash code for an unfrozen node. This must be identical
// to the un-frozen case (below)!!
- private int hash(Builder.UnCompiledNode<T> node) {
+ private long hash(Builder.UnCompiledNode<T> node) {
final int PRIME = 31;
//System.out.println("hash unfrozen");
- int h = 0;
+ long h = 0;
// TODO: maybe if number of arcs is high we can safely subsample?
for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
final Builder.Arc<T> arc = node.arcs[arcIdx];
@@ -87,14 +88,14 @@
}
}
//System.out.println(" ret " + (h&Integer.MAX_VALUE));
- return h & Integer.MAX_VALUE;
+ return h & Long.MAX_VALUE;
}
// hash code for a frozen node
- private int hash(long node) throws IOException {
+ private long hash(long node) throws IOException {
final int PRIME = 31;
//System.out.println("hash frozen node=" + node);
- int h = 0;
+ long h = 0;
fst.readFirstRealTargetArc(node, scratchArc, in);
while(true) {
//System.out.println(" label=" + scratchArc.label + " target=" + scratchArc.target + " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4) + " final?=" + scratchArc.isFinal() + " pos=" + in.getPosition());
@@ -111,13 +112,13 @@
fst.readNextRealArc(scratchArc, in);
}
//System.out.println(" ret " + (h&Integer.MAX_VALUE));
- return h & Integer.MAX_VALUE;
+ return h & Long.MAX_VALUE;
}
public long add(Builder.UnCompiledNode<T> nodeIn) throws IOException {
- // System.out.println("hash: add count=" + count + " vs " + table.size());
- final int h = hash(nodeIn);
- int pos = h & mask;
+ //System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" + mask);
+ final long h = hash(nodeIn);
+ long pos = h & mask;
int c = 0;
while(true) {
final long v = table.get(pos);
@@ -128,7 +129,8 @@
assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
count++;
table.set(pos, node);
- if (table.size() < 2*count) {
+ // Rehash at 2/3 occupancy:
+ if (count > 2*table.size()/3) {
rehash();
}
return node;
@@ -144,7 +146,7 @@
// called only by rehash
private void addNew(long address) throws IOException {
- int pos = hash(address) & mask;
+ long pos = hash(address) & mask;
int c = 0;
while(true) {
if (table.get(pos) == 0) {
@@ -158,15 +160,12 @@
}
private void rehash() throws IOException {
- final GrowableWriter oldTable = table;
+ final PagedGrowableWriter oldTable = table;
- if (oldTable.size() >= Integer.MAX_VALUE/2) {
- throw new IllegalStateException("FST too large (> 2.1 GB)");
- }
-
- table = new GrowableWriter(oldTable.getBitsPerValue(), 2*oldTable.size(), PackedInts.COMPACT);
+ table = new PagedGrowableWriter(2*oldTable.size(), 1<<30, PackedInts.bitsRequired(count), PackedInts.COMPACT);
+ System.out.println("rehash to " + table.size());
mask = table.size()-1;
- for(int idx=0;idx<oldTable.size();idx++) {
+ for(long idx=0;idx<oldTable.size();idx++) {
final long address = oldTable.get(idx);
if (address != 0) {
addNew(address);
@@ -174,7 +173,10 @@
}
}
- public int count() {
+ // nocommit unused?
+ /*
+ public long count() {
return count;
}
+ */
}
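
The NodeHash changes above keep the existing open-addressing scheme but widen hash, mask and count to long and back the table with a PagedGrowableWriter, so the table can hold more than 2^31 entries and is rehashed at 2/3 occupancy. A standalone sketch of that pattern follows; the class and helper names are made up, the hash mixer is arbitrary, and linear probing stands in for whatever probe sequence NodeHash actually uses, so treat it as an illustration rather than the real code:

    import org.apache.lucene.util.packed.PackedInts;
    import org.apache.lucene.util.packed.PagedGrowableWriter;

    // Hypothetical set of non-zero long addresses over a power-of-two table.
    final class LongAddressSet {
      private PagedGrowableWriter table =
          new PagedGrowableWriter(16, 1 << 30, 8, PackedInts.COMPACT);
      private long mask = 15;                     // table.size() - 1, table size is a power of two
      private long count;

      private static long hash(long address) {
        long h = address * 0x9E3779B97F4A7C15L;   // any mixing function; NodeHash hashes the node's arcs instead
        return h & Long.MAX_VALUE;                // keep it non-negative, as in "h & Long.MAX_VALUE" above
      }

      void add(long address) {                    // 0 is reserved to mean "empty slot"
        if (insert(address)) {
          if (++count > 2 * table.size() / 3) {   // rehash at 2/3 occupancy, as in the patch
            rehash();
          }
        }
      }

      private boolean insert(long address) {
        long pos = hash(address) & mask;
        while (table.get(pos) != 0) {
          if (table.get(pos) == address) {
            return false;                         // already present
          }
          pos = (pos + 1) & mask;                 // linear probing; NodeHash uses a different probe step
        }
        table.set(pos, address);
        return true;
      }

      private void rehash() {
        final PagedGrowableWriter old = table;
        table = new PagedGrowableWriter(2 * old.size(), 1 << 30,
            PackedInts.bitsRequired(count), PackedInts.COMPACT);
        mask = table.size() - 1;
        for (long i = 0; i < old.size(); ++i) {
          final long address = old.get(i);
          if (address != 0) {
            insert(address);                      // re-slot without re-counting
          }
        }
      }
    }
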
Index: lucene/core/src/java/org/apache/lucene/util/packed/package.html
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/packed/package.html (revision 1488278)
+++ lucene/core/src/java/org/apache/lucene/util/packed/package.html (working copy)
@@ -47,6 +47,11 @@
<li>Same as PackedInts.Mutable but grows the number of bits per values when needed.</li>
<li>Useful to build a PackedInts.Mutable from a read-once stream of longs.</li>
</ul></li>
+ <li><b>{@link org.apache.lucene.util.packed.PagedGrowableWriter}</b><ul>
+ <li>Slices data into fixed-size blocks stored in GrowableWriters.</li>
+ <li>Supports more than 2B values.</li>
+ <li>You should use AppendingLongBuffer instead if you don't need random write access.</li>
+ </ul></li>
<li><b>{@link org.apache.lucene.util.packed.AppendingLongBuffer}</b><ul>
<li>Can store any sequence of longs.</li>
<li>Compression is good when values are close to each other.</li>
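
The package docs above boil the choice down to whether random write access is needed. A short sketch contrasting the two access patterns, using the APIs as exercised by the test earlier in this patch (the demo class name is made up):

    import org.apache.lucene.util.packed.AppendingLongBuffer;
    import org.apache.lucene.util.packed.PackedInts;
    import org.apache.lucene.util.packed.PagedGrowableWriter;

    public class PackedChoiceDemo {
      public static void main(String[] args) {
        // Write-once, sequential: AppendingLongBuffer only supports appends.
        AppendingLongBuffer appendOnly = new AppendingLongBuffer();
        for (long value : new long[] {3, 1, 4, 1, 5}) {
          appendOnly.add(value);
        }
        System.out.println(appendOnly.get(2)); // 4

        // Random writes (e.g. hash-table slots): PagedGrowableWriter.
        PagedGrowableWriter slots = new PagedGrowableWriter(5, 64, 1, PackedInts.DEFAULT);
        slots.set(2, 4);
        slots.set(0, 3);
        System.out.println(slots.get(2));      // 4
      }
    }
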
Index: lucene/core/src/java/org/apache/lucene/util/packed/PagedGrowableWriter.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/packed/PagedGrowableWriter.java (revision 0)
+++ lucene/core/src/java/org/apache/lucene/util/packed/PagedGrowableWriter.java (working copy)
@@ -0,0 +1,139 @@
+package org.apache.lucene.util.packed;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link PagedGrowableWriter}. This class slices data into fixed-size blocks
+ * which have independent numbers of bits per value and grow on-demand.
+ * <p>You should use this class instead of {@link AppendingLongBuffer} only when
+ * you need random write-access. Otherwise this class will likely be slower and
+ * less memory-efficient.
+ * @lucene.internal
+ */
+public final class PagedGrowableWriter {
+
+ private final long size;
+ private final int pageShift;
+ private final int pageMask;
+ private final GrowableWriter[] subWriters;
+ private final int startBitsPerValue;
+ private final float acceptableOverheadRatio;
+
+ /**
+ * Create a new {@link PagedGrowableWriter} instance.
+ *
+ * @param size the number of values to store.
+ * @param pageSize the number of values per page
+ * @param startBitsPerValue the initial number of bits per value
+ * @param acceptableOverheadRatio an acceptable overhead ratio
+ */
+ public PagedGrowableWriter(long size, int pageSize,
+ int startBitsPerValue, float acceptableOverheadRatio) {
+ this(size, pageSize, startBitsPerValue, acceptableOverheadRatio, true);
+ }
+
+ PagedGrowableWriter(long size, int pageSize, int startBitsPerValue, float acceptableOverheadRatio, boolean fillPages) {
+ super();
+ this.size = size;
+ this.startBitsPerValue = startBitsPerValue;
+ this.acceptableOverheadRatio = acceptableOverheadRatio;
+ if (pageSize < 64 || ((pageSize & (pageSize - 1)) != 0)) {
+ throw new IllegalArgumentException("pageSize must be >= 64 and a power of 2, got " + pageSize);
+ }
+ pageShift = 31 - Integer.numberOfLeadingZeros(pageSize);
+ assert (1 << pageShift) == pageSize;
+ pageMask = pageSize - 1;
+ final int numPages = (int) ((size + pageSize - 1) / pageSize);
+ if ((long) numPages * pageSize < size || (long) (numPages - 1) * pageSize > size) {
+ throw new IllegalArgumentException("pageSize must be chosen so that there are at most Integer.MAX_VALUE pages, got size=" + size + ", pageSize=" + pageSize);
+ }
+ subWriters = new GrowableWriter[numPages];
+ if (fillPages) {
+ for (int i = 0; i < numPages; ++i) {
+ int thisPageSize;
+ if (i == numPages-1) {
+ thisPageSize = (int) (size % pageSize);
+ if (thisPageSize == 0) {
+ thisPageSize = pageSize;
+ }
+ } else {
+ thisPageSize = pageSize;
+ }
+ subWriters[i] = new GrowableWriter(startBitsPerValue, thisPageSize, acceptableOverheadRatio);
+ }
+ }
+ }
+
+ private int pageSize() {
+ return pageMask + 1;
+ }
+
+ /** The number of values. */
+ public long size() {
+ return size;
+ }
+
+ int pageIndex(long index) {
+ return (int) (index >>> pageShift);
+ }
+
+ int indexInPage(long index) {
+ return (int) index & pageMask;
+ }
+
+ /** Get value at <code>index</code>. */
+ public long get(long index) {
+ assert index >= 0 && index < size: "index=" + index + " size=" + size;
+ final int pageIndex = pageIndex(index);
+ final int indexInPage = indexInPage(index);
+ return subWriters[pageIndex].get(indexInPage);
+ }
+
+ /** Set value at <code>index</code>. */
+ public void set(long index, long value) {
+ assert index >= 0 && index < size;
+ final int pageIndex = pageIndex(index);
+ final int indexInPage = indexInPage(index);
+ subWriters[pageIndex].set(indexInPage, value);
+ }
+
+ /** Create a new {@link PagedGrowableWriter} of size <code>newSize</code>
+ * based on the content of this buffer. This method is much more efficient
+ * than creating a new {@link PagedGrowableWriter} and copying values one by
+ * one. */
+ public PagedGrowableWriter resize(long newSize) {
+ PagedGrowableWriter newWriter = new PagedGrowableWriter(newSize, pageSize(), startBitsPerValue, acceptableOverheadRatio, false);
+ final int numCommonPages = Math.min(newWriter.subWriters.length, subWriters.length);
+ final long[] copyBuffer = new long[1024];
+ for (int i = 0; i < numCommonPages; ++i) {
+ final int bpv = subWriters[i].getBitsPerValue();
+ newWriter.subWriters[i] = new GrowableWriter(bpv, pageSize(), acceptableOverheadRatio);
+ PackedInts.copy(subWriters[i], 0, newWriter.subWriters[i], 0, pageSize(), copyBuffer);
+ }
+ for (int i = numCommonPages; i < newWriter.subWriters.length; ++i) {
+ newWriter.subWriters[i] = new GrowableWriter(startBitsPerValue, pageSize(), acceptableOverheadRatio);
+ }
+ return newWriter;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName() + "(size=" + size() + ",pageSize=" + (pageMask+1) + ")";
+ }
+
+}
Property changes on: lucene/core/src/java/org/apache/lucene/util/packed/PagedGrowableWriter.java
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
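
The pageIndex/indexInPage arithmetic in the new class relies on pageSize being a power of two, so a shift and a mask replace division and modulo. A small worked example for pageSize = 1 << 10, written outside the class since those helpers are package-private (the demo class name is made up):

    public class PagingMathDemo {
      public static void main(String[] args) {
        // Same arithmetic as pageIndex()/indexInPage() above, for pageSize = 1 << 10:
        final int pageShift = 10;                 // pageSize = 1024
        final int pageMask = (1 << pageShift) - 1;
        final long index = 3000000000L;           // > Integer.MAX_VALUE, still addressable
        final int pageIndex = (int) (index >>> pageShift);
        final int indexInPage = (int) index & pageMask;
        System.out.println("page " + pageIndex + ", slot " + indexInPage); // page 2929687, slot 512
        // Equivalent to the slower divide/modulo form:
        System.out.println(index / 1024 + " " + index % 1024);             // 2929687 512
      }
    }
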
Index: lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java
===================================================================
--- lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java (revision 1488278)
+++ lucene/core/src/java/org/apache/lucene/util/packed/PackedInts.java (working copy)
@@ -1200,31 +1200,37 @@
}
} else {
// use bulk operations
- long[] buf = new long[Math.min(capacity, len)];
- int remaining = 0;
- while (len > 0) {
- final int read = src.get(srcPos, buf, remaining, Math.min(len, buf.length - remaining));
- assert read > 0;
- srcPos += read;
- len -= read;
- remaining += read;
- final int written = dest.set(destPos, buf, 0, remaining);
- assert written > 0;
- destPos += written;
- if (written < remaining) {
- System.arraycopy(buf, written, buf, 0, remaining - written);
- }
- remaining -= written;
+ final long[] buf = new long[Math.min(capacity, len)];
+ copy(src, srcPos, dest, destPos, len, buf);
+ }
+ }
+
+ /** Same as {@link #copy(Reader, int, Mutable, int, int, int)} but using a pre-allocated buffer. */
+ static void copy(Reader src, int srcPos, Mutable dest, int destPos, int len, long[] buf) {
+ assert buf.length > 0;
+ int remaining = 0;
+ while (len > 0) {
+ final int read = src.get(srcPos, buf, remaining, Math.min(len, buf.length - remaining));
+ assert read > 0;
+ srcPos += read;
+ len -= read;
+ remaining += read;
+ final int written = dest.set(destPos, buf, 0, remaining);
+ assert written > 0;
+ destPos += written;
+ if (written < remaining) {
+ System.arraycopy(buf, written, buf, 0, remaining - written);
}
- while (remaining > 0) {
- final int written = dest.set(destPos, buf, 0, remaining);
- destPos += written;
- remaining -= written;
- System.arraycopy(buf, written, buf, 0, remaining);
- }
+ remaining -= written;
}
+ while (remaining > 0) {
+ final int written = dest.set(destPos, buf, 0, remaining);
+ destPos += written;
+ remaining -= written;
+ System.arraycopy(buf, written, buf, 0, remaining);
+ }
}
-
+
/**
* Expert: reads only the metadata from a stream. This is useful to later
* restore a stream or open a direct reader via
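
The refactoring above only extracts the buffered copy loop into a package-private overload that takes a pre-allocated long[], so PagedGrowableWriter.resize can reuse one scratch buffer across pages. Callers outside the package keep using the public overload, roughly as in this sketch (the mem argument is the scratch-buffer budget in bytes; the demo class name is made up):

    import org.apache.lucene.util.packed.PackedInts;

    public class PackedCopyDemo {
      public static void main(String[] args) {
        PackedInts.Mutable src = PackedInts.getMutable(1000, 7, PackedInts.COMPACT);
        for (int i = 0; i < src.size(); ++i) {
          src.set(i, i % 100);
        }
        // Copy into a wider destination; the last argument is the scratch-buffer
        // budget in bytes (the new package-private overload takes the long[] directly).
        PackedInts.Mutable dest = PackedInts.getMutable(1000, 16, PackedInts.DEFAULT);
        PackedInts.copy(src, 0, dest, 0, src.size(), 1024);
        System.out.println(dest.get(123)); // 23
      }
    }
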