Index: src/test/org/apache/lucene/index/TestStressIndexing2.java
===================================================================
--- src/test/org/apache/lucene/index/TestStressIndexing2.java (revision 926738)
+++ src/test/org/apache/lucene/index/TestStressIndexing2.java (working copy)
@@ -286,14 +286,20 @@
// create mapping from r1's id space to r2's id space, based on idField
idField = StringHelper.intern(idField);
- final TermsEnum termsEnum = MultiFields.getFields(r1).terms(idField).iterator();
+ final Fields f1 = MultiFields.getFields(r1);
+ if (f1 == null) {
+ // make sure r2 is empty
+ assertNull(MultiFields.getFields(r2));
+ return;
+ }
+ final TermsEnum termsEnum = f1.terms(idField).iterator();
final Bits delDocs1 = MultiFields.getDeletedDocs(r1);
final Bits delDocs2 = MultiFields.getDeletedDocs(r2);
Fields fields = MultiFields.getFields(r2);
if (fields == null) {
- // make sure r1 is in fract empty (eg has only all
+ // make sure r1 is in fact empty (eg has only all
// deleted docs):
DocsEnum docs = null;
while(termsEnum.next() != null) {
Index: src/test/org/apache/lucene/index/FlexTestUtil.java
===================================================================
--- src/test/org/apache/lucene/index/FlexTestUtil.java (revision 926738)
+++ src/test/org/apache/lucene/index/FlexTestUtil.java (working copy)
@@ -110,6 +110,9 @@
private static void testBogusFieldTerms(Random rand, IndexReader r) throws Exception {
final Fields fields = MultiFields.getFields(r);
+ if (fields == null) {
+ return;
+ }
for(int i=0;i<10;i++) {
final String f = "bogus" + rand.nextInt() + "reallybogus";
Terms terms = fields.terms(f);
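
Both test fixes guard against the same flex-API behavior: MultiFields.getFields returns null when the reader has no postings at all (for example, when every doc is deleted), and Fields.terms returns null for an absent field. A minimal sketch of the pattern a consumer needs (the reader and the field name "body" are illustrative):

    Fields fields = MultiFields.getFields(reader);
    if (fields != null) {
      Terms terms = fields.terms("body");
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator();
        // iterate the terms...
      }
    }
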
Index: src/test/org/apache/lucene/util/packed/TestPackedInts.java
===================================================================
--- src/test/org/apache/lucene/util/packed/TestPackedInts.java (revision 926738)
+++ src/test/org/apache/lucene/util/packed/TestPackedInts.java (working copy)
@@ -71,16 +71,28 @@
w.add(values[i]);
}
w.finish();
+ final long fp = out.getFilePointer();
out.close();
IndexInput in = d.openInput("out.bin");
PackedInts.Reader r = PackedInts.getReader(in);
+ assertEquals(fp, in.getFilePointer());
for(int i=0;i<valueCount;i++) {
assertEquals("index=" + i + " ceil=" + ceil + " valueCount="
+ valueCount + " nbits=" + nbits + " for "
+ r.getClass().getSimpleName(), values[i], r.get(i));
}
in.close();
+
+ in = d.openInput("out.bin");
+ PackedInts.ReaderIterator r2 = PackedInts.getReaderIterator(in);
+ for(int i=0;i<valueCount;i++) {
+ assertEquals("index=" + i + " ceil=" + ceil + " valueCount="
+ + valueCount + " nbits=" + nbits + " for "
+ + r2.getClass().getSimpleName(), values[i], r2.next());
+ }
+ assertEquals(fp, in.getFilePointer());
+ in.close();
ceil *= 2;
}
}
@@ -193,4 +205,21 @@
}
}
}
+
+ public void testSingleValue() throws Exception {
+ Directory dir = new MockRAMDirectory();
+ IndexOutput out = dir.createOutput("out");
+ PackedInts.Writer w = PackedInts.getWriter(out, 1, 8);
+ w.add(17);
+ w.finish();
+ final long end = out.getFilePointer();
+ out.close();
+
+ IndexInput in = dir.openInput("out");
+ PackedInts.Reader r = PackedInts.getReader(in);
+ assertEquals(end, in.getFilePointer());
+ in.close();
+
+ dir.close();
+ }
}
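
For context, a minimal round-trip through the two read paths this test now exercises; the file name and values are illustrative, not from the patch:

    Directory dir = new MockRAMDirectory();
    IndexOutput out = dir.createOutput("packed.bin");
    PackedInts.Writer w = PackedInts.getWriter(out, 3, PackedInts.bitsRequired(250));
    w.add(17); w.add(250); w.add(0);
    w.finish();
    out.close();

    IndexInput in = dir.openInput("packed.bin");
    PackedInts.Reader r = PackedInts.getReader(in);     // random access, fully in RAM
    assert r.get(1) == 250;
    in.close();

    in = dir.openInput("packed.bin");
    PackedInts.ReaderIterator it = PackedInts.getReaderIterator(in);
    for(int i=0;i<it.size();i++) {
      long v = it.next();                               // streams values in order
    }
    it.close();                                         // closes the underlying IndexInput
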
Index: src/java/org/apache/lucene/index/codecs/TermsConsumer.java
===================================================================
--- src/java/org/apache/lucene/index/codecs/TermsConsumer.java (revision 926738)
+++ src/java/org/apache/lucene/index/codecs/TermsConsumer.java (working copy)
@@ -93,5 +93,7 @@
}
}
}
+
+ finish();
}
}
Index: src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
===================================================================
--- src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (revision 926738)
+++ src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (working copy)
@@ -28,6 +28,7 @@
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.packed.PackedInts;
import java.util.HashMap;
import java.util.Iterator;
@@ -63,7 +64,7 @@
/** @lucene.experimental */
public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
- final private int totalIndexInterval;
+ private int totalIndexInterval;
private int indexDivisor;
final private int indexInterval;
@@ -72,7 +73,13 @@
private volatile boolean indexLoaded;
private final Comparator<BytesRef> termComp;
+ private final String segment;
+ private final static int PAGED_BYTES_BITS = 15;
+
+ // all fields share this single logical byte[]
+ private final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);
+
final HashMap<FieldInfo,FieldIndexReader> fields = new HashMap<FieldInfo,FieldIndexReader>();
public SimpleStandardTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator<BytesRef> termComp)
@@ -80,6 +87,8 @@
this.termComp = termComp;
+ this.segment = segment;
+
IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, StandardCodec.TERMS_INDEX_EXTENSION));
boolean success = false;
@@ -118,10 +127,14 @@
System.out.println(" read field number=" + field);
}
final int numIndexTerms = in.readInt();
+ final long termsStart = in.readLong();
final long indexStart = in.readLong();
+ final long packedIndexStart = in.readLong();
+ final long packedOffsetsStart = in.readLong();
+ assert packedIndexStart >= indexStart: "packedStart=" + packedIndexStart + " indexStart=" + indexStart + " numIndexTerms=" + numIndexTerms + " seg=" + segment;
if (numIndexTerms > 0) {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
- fields.put(fieldInfo, new FieldIndexReader(in, fieldInfo, numIndexTerms, indexStart));
+ fields.put(fieldInfo, new FieldIndexReader(in, fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
}
}
success = true;
@@ -130,57 +143,15 @@
in.close();
this.in = null;
if (success) {
- trimByteBlock();
indexLoaded = true;
}
+ termBytes.finish();
} else {
this.in = in;
}
}
}
- /* Called when index is fully loaded. We know we will use
- * no more bytes in the final byte[], so trim it down to
- * its actual usagee. This substantially reduces memory
- * usage of SegmentReader searching a tiny segment. */
- private final void trimByteBlock() {
- if (blockOffset == 0) {
- // There were no fields in this segment:
- if (blocks != null) {
- blocks[blockUpto] = null;
- }
- } else {
- byte[] last = new byte[blockOffset];
- System.arraycopy(blocks[blockUpto], 0, last, 0, blockOffset);
- blocks[blockUpto] = last;
- }
- }
-
- // TODO: we can record precisely how many bytes are
- // required during indexing, save that into file, and be
- // precise when we allocate the blocks; we even don't need
- // to use blocks anymore (though my still want to, to
- // prevent allocation failure due to mem fragmentation on
- // 32bit)
-
- // Fixed size byte blocks, to hold all term bytes; these
- // blocks are shared across fields
- private byte[][] blocks;
- int blockUpto;
- int blockOffset;
-
- private static final int BYTE_BLOCK_SHIFT = 15;
- private static final int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT;
- private static final int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1;
-
- static {
- // Make sure DW can't ever write a term whose length
- // cannot be encoded with short (because we use short[]
- // to hold the length of each term).
- assert IndexWriter.MAX_TERM_LENGTH < Short.MAX_VALUE;
- assert BYTE_BLOCK_SIZE >= IndexWriter.MAX_TERM_LENGTH;
- }
-
private final class FieldIndexReader extends FieldReader {
final private FieldInfo fieldInfo;
@@ -190,14 +161,21 @@
private final IndexInput in;
private final long indexStart;
+ private final long termsStart;
+ private final long packedIndexStart;
+ private final long packedOffsetsStart;
private final int numIndexTerms;
- public FieldIndexReader(IndexInput in, FieldInfo fieldInfo, int numIndexTerms, long indexStart) throws IOException {
+ public FieldIndexReader(IndexInput in, FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
+ long packedOffsetsStart) throws IOException {
this.fieldInfo = fieldInfo;
this.in = in;
+ this.termsStart = termsStart;
this.indexStart = indexStart;
+ this.packedIndexStart = packedIndexStart;
+ this.packedOffsetsStart = packedOffsetsStart;
this.numIndexTerms = numIndexTerms;
// We still create the indexReader when indexDivisor
@@ -210,6 +188,9 @@
}
coreIndex = new CoreFieldIndex(indexStart,
+ termsStart,
+ packedIndexStart,
+ packedOffsetsStart,
numIndexTerms);
} else {
@@ -221,7 +202,7 @@
public void loadTermsIndex() throws IOException {
if (coreIndex == null) {
- coreIndex = new CoreFieldIndex(indexStart, numIndexTerms);
+ coreIndex = new CoreFieldIndex(indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms);
}
}
@@ -263,150 +244,115 @@
private final class CoreFieldIndex {
- // TODO: used packed ints here
- // Pointer into terms dict file that we are indexing
- final long[] fileOffset;
+ final private long termBytesStart;
- // TODO: used packed ints here
- // For each term, points to start of term's bytes within
- // block.
- // TODO: wasteful that this is always long; many terms
- // dict indexes obviously don't require so much address
- // space; since we know up front during indexing how
- // much space is needed we could pack this to the
- // precise # bits
- final long[] blockPointer;
-
- // TODO: used packed ints here: we know max term
- // length; often its small
+ // offset into index termBytes
+ final PackedInts.Reader termOffsets;
- // TODO: can we inline this w/ the bytes? like
- // DW. vast majority of terms only need 1 byte, not 2
+ // index pointers into main terms dict
+ final PackedInts.Reader termsDictOffsets;
- // Length of each term
- final short[] termLength;
-
final int numIndexTerms;
- CoreFieldIndex(long indexStart, int numIndexTerms) throws IOException {
+ final long termsStart;
+ public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException {
+
+ this.termsStart = termsStart;
+ termBytesStart = termBytes.getPointer();
+
IndexInput clone = (IndexInput) in.clone();
clone.seek(indexStart);
- if (indexDivisor == -1) {
- // Special case: we are being loaded inside
- // IndexWriter because a SegmentReader that at
- // first was opened for merging, is now being
- // opened to perform deletes or for an NRT reader
- this.numIndexTerms = numIndexTerms;
- } else {
- this.numIndexTerms = 1+(numIndexTerms-1) / indexDivisor;
- }
+ // -1 is passed to mean "don't load term index", but
+ // if the index is later loaded this is overwritten
+ // with a real value
+ assert indexDivisor > 0;
+ this.numIndexTerms = 1+(numIndexTerms-1) / indexDivisor;
+
assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms + " indexDivisor=" + indexDivisor;
- if (blocks == null) {
- blocks = new byte[1][];
- blocks[0] = new byte[BYTE_BLOCK_SIZE];
- }
+ if (indexDivisor == 1) {
+ // Default (load all index terms) is fast -- slurp in the images from disk:
+
+ try {
+ final long numTermBytes = packedIndexStart - indexStart;
+ termBytes.copy(clone, numTermBytes);
- byte[] lastBlock = blocks[blockUpto];
- int lastBlockOffset = blockOffset;
+ // records offsets into main terms dict file
+ termsDictOffsets = PackedInts.getReader(clone);
+ assert termsDictOffsets.size() == numIndexTerms;
- fileOffset = new long[this.numIndexTerms];
- blockPointer = new long[this.numIndexTerms];
- termLength = new short[this.numIndexTerms];
-
- final byte[] skipBytes;
- if (indexDivisor != 1) {
- // only need skipBytes (below) if we are not
- // loading all index terms
- skipBytes = new byte[128];
+ // records offsets into byte[] term data
+ termOffsets = PackedInts.getReader(clone);
+ assert termOffsets.size() == 1+numIndexTerms;
+ } finally {
+ clone.close();
+ }
} else {
- skipBytes = null;
- }
+ // Get packed iterators
+ final IndexInput clone1 = (IndexInput) in.clone();
+ final IndexInput clone2 = (IndexInput) in.clone();
- int upto = 0;
- long pointer = 0;
-
- for(int i=0;i<numIndexTerms;i++) {
- final int start = clone.readVInt();
- final int suffix = clone.readVInt();
- final int thisTermLength = start + suffix;
+ try {
+ // Subsample the index terms
+ clone1.seek(packedIndexStart);
+ final PackedInts.ReaderIterator termsDictOffsetsIter = PackedInts.getReaderIterator(clone1);
- assert thisTermLength <= BYTE_BLOCK_SIZE;
+ clone2.seek(packedOffsetsStart);
+ final PackedInts.ReaderIterator termOffsetsIter = PackedInts.getReaderIterator(clone2);
- if (i%indexDivisor == 0) {
- // Keeper
- if (blockOffset + thisTermLength > BYTE_BLOCK_SIZE) {
- // New block
- final byte[] newBlock = new byte[BYTE_BLOCK_SIZE];
- if (blocks.length == blockUpto+1) {
- final int newSize = ArrayUtil.oversize(blockUpto+2, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
- final byte[][] newBlocks = new byte[newSize][];
- System.arraycopy(blocks, 0, newBlocks, 0, blocks.length);
- blocks = newBlocks;
- }
- blockUpto++;
- blocks[blockUpto] = newBlock;
- blockOffset = 0;
- }
+ // TODO: often we can get by w/ fewer bits per
+ // value, below... but this'd be more complex:
+ // we'd have to try @ fewer bits and then grow
+ // if we overflowed it.
- final byte[] block = blocks[blockUpto];
+ PackedInts.Mutable termsDictOffsetsM = PackedInts.getMutable(this.numIndexTerms, termsDictOffsetsIter.getBitsPerValue());
+ PackedInts.Mutable termOffsetsM = PackedInts.getMutable(this.numIndexTerms+1, termOffsetsIter.getBitsPerValue());
- // Copy old prefix
- assert lastBlock != null || start == 0;
- assert block != null;
- System.arraycopy(lastBlock, lastBlockOffset, block, blockOffset, start);
+ termsDictOffsets = termsDictOffsetsM;
+ termOffsets = termOffsetsM;
- // Read new suffix
- clone.readBytes(block, blockOffset+start, suffix);
+ int upto = 0;
- // Advance file offset
- pointer += clone.readVLong();
+ long lastTermOffset = 0;
+ long termOffsetUpto = 0;
- assert thisTermLength < Short.MAX_VALUE;
+ while(upto < this.numIndexTerms) {
+ // main file offset copies straight over
+ termsDictOffsetsM.set(upto, termsDictOffsetsIter.next());
- termLength[upto] = (short) thisTermLength;
- fileOffset[upto] = pointer;
- blockPointer[upto] = blockUpto * BYTE_BLOCK_SIZE + blockOffset;
+ termOffsetsM.set(upto, termOffsetUpto);
+ upto++;
- /*
- BytesRef tr = new BytesRef();
- tr.bytes = blocks[blockUpto];
- tr.offset = blockOffset;
- tr.length = thisTermLength;
+ long termOffset = termOffsetsIter.next();
+ long nextTermOffset = termOffsetsIter.next();
+ final int numTermBytes = (int) (nextTermOffset - termOffset);
- //System.out.println(" read index term=" + new String(blocks[blockUpto], blockOffset, thisTermLength, "UTF-8") + " this=" + this + " bytes=" + block + " (vs=" + blocks[blockUpto] + ") offset=" + blockOffset);
- //System.out.println(" read index term=" + tr.toBytesString() + " this=" + this + " bytes=" + block + " (vs=" + blocks[blockUpto] + ") offset=" + blockOffset);
- */
+ clone.seek(indexStart + termOffset);
+ assert indexStart + termOffset < clone.length() : "indexStart=" + indexStart + " termOffset=" + termOffset + " len=" + clone.length();
+ assert indexStart + termOffset + numTermBytes < clone.length();
- lastBlock = block;
- lastBlockOffset = blockOffset;
- blockOffset += thisTermLength;
- upto++;
- } else {
- // Skip bytes
- int toSkip = suffix;
- while(true) {
- if (toSkip > skipBytes.length) {
- clone.readBytes(skipBytes, 0, skipBytes.length);
- toSkip -= skipBytes.length;
- } else {
- clone.readBytes(skipBytes, 0, toSkip);
- break;
+ termBytes.copy(clone, numTermBytes);
+ termOffsetUpto += numTermBytes;
+
+ // skip terms:
+ termsDictOffsetsIter.next();
+ for(int i=0;i<indexDivisor-2;i++) {
+ termOffsetsIter.next();
+ termsDictOffsetsIter.next();
}
}
+ termOffsetsM.set(upto, termOffsetUpto);
- // Advance file offset
- pointer += clone.readVLong();
+ } finally {
+ clone1.close();
+ clone2.close();
+ clone.close();
}
}
- clone.close();
-
- assert upto == this.numIndexTerms;
-
if (Codec.DEBUG) {
System.out.println(" done read");
}
@@ -423,30 +369,28 @@
}
private final void fillResult(int idx, TermsIndexResult result) {
- final long loc = blockPointer[idx];
- result.term.bytes = blocks[(int) (loc >> BYTE_BLOCK_SHIFT)];
- result.term.offset = (int) (loc & BYTE_BLOCK_MASK);
- result.term.length = termLength[idx];
+ final long offset = termOffsets.get(idx);
+ final int length = (int) (termOffsets.get(1+idx) - offset);
+ termBytes.fill(result.term, termBytesStart + offset, length);
result.position = idx * totalIndexInterval;
- result.offset = fileOffset[idx];
+ result.offset = termsStart + termsDictOffsets.get(idx);
}
public final void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException {
if (Codec.DEBUG) {
- System.out.println("getIndexOffset field=" + fieldInfo.name + " term=" + term + " indexLen = " + blockPointer.length + " numIndexTerms=" + fileOffset.length + " numIndexedTerms=" + fileOffset.length);
+ System.out.println("getIndexOffset field=" + fieldInfo.name + " term=" + term.utf8ToString());
}
int lo = 0; // binary search
- int hi = fileOffset.length - 1;
+ int hi = numIndexTerms - 1;
while (hi >= lo) {
int mid = (lo + hi) >>> 1;
- final long loc = blockPointer[mid];
- result.term.bytes = blocks[(int) (loc >> BYTE_BLOCK_SHIFT)];
- result.term.offset = (int) (loc & BYTE_BLOCK_MASK);
- result.term.length = termLength[mid];
+ final long offset = termOffsets.get(mid);
+ final int length = (int) (termOffsets.get(1+mid) - offset);
+ termBytes.fill(result.term, termBytesStart + offset, length);
int delta = termComp.compare(term, result.term);
if (delta < 0) {
@@ -456,7 +400,7 @@
} else {
assert mid >= 0;
result.position = mid*totalIndexInterval;
- result.offset = fileOffset[mid];
+ result.offset = termsStart + termsDictOffsets.get(mid);
return;
}
}
@@ -465,13 +409,12 @@
hi = 0;
}
- final long loc = blockPointer[hi];
- result.term.bytes = blocks[(int) (loc >> BYTE_BLOCK_SHIFT)];
- result.term.offset = (int) (loc & BYTE_BLOCK_MASK);
- result.term.length = termLength[hi];
+ final long offset = termOffsets.get(hi);
+ final int length = (int) (termOffsets.get(1+hi) - offset);
+ termBytes.fill(result.term, termBytesStart + offset, length);
result.position = hi*totalIndexInterval;
- result.offset = fileOffset[hi];
+ result.offset = termsStart + termsDictOffsets.get(hi);
}
public final void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
@@ -488,6 +431,7 @@
if (!indexLoaded) {
this.indexDivisor = indexDivisor;
+ this.totalIndexInterval = indexInterval * indexDivisor;
// mxx
if (Codec.DEBUG) {
@@ -498,10 +442,10 @@
while(it.hasNext()) {
it.next().loadTermsIndex();
}
- trimByteBlock();
indexLoaded = true;
in.close();
+ termBytes.finish();
}
}
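
The net effect of this refactor: each field's data in the terms-index file is now three consecutive regions -- the raw index term bytes (slurped into the shared PagedBytes), a packed array of terms-dict file offsets (relative to termsStart), and a packed array of numIndexTerms+1 running offsets into the term bytes. A sketch of how one index term is reconstituted, mirroring fillResult above:

    long offset = termOffsets.get(idx);
    int length = (int) (termOffsets.get(idx + 1) - offset);  // N+1 offsets => length by subtraction
    termBytes.fill(result.term, termBytesStart + offset, length);
    result.position = idx * totalIndexInterval;              // ord of this term in the dict
    result.offset = termsStart + termsDictOffsets.get(idx);  // absolute terms-dict file pointer
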
Index: src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java (revision 926738)
+++ src/java/org/apache/lucene/index/codecs/standard/StandardTermsIndexWriter.java (working copy)
@@ -29,6 +29,7 @@
public abstract class FieldWriter {
public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException;
+ public abstract void finish() throws IOException;
}
public abstract FieldWriter addField(FieldInfo fieldInfo);
Index: src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java
===================================================================
--- src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java (revision 926738)
+++ src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictReader.java (working copy)
@@ -372,7 +372,7 @@
indexReader.getIndexOffset(term, indexResult);
if (Codec.DEBUG) {
- Codec.debug(" index pos=" + indexResult.position + " termFP=" + indexResult.offset + " term=" + indexResult.term + " this=" + this);
+ Codec.debug(" index pos=" + indexResult.position + " termFP=" + indexResult.offset + " term=" + indexResult.term.utf8ToString() + " this=" + this);
}
in.seek(indexResult.offset);
@@ -507,6 +507,9 @@
}
if (state.ord >= numTerms-1) {
+ if (Codec.DEBUG) {
+ Codec.debug(" return null ord=" + state.ord + " vs numTerms-1=" + (numTerms-1));
+ }
return null;
}
@@ -514,7 +517,7 @@
state.docFreq = in.readVInt();
if (Codec.DEBUG) {
- Codec.debug(" text=" + bytesReader.term + " freq=" + state.docFreq + " tis=" + in);
+ Codec.debug(" text=" + bytesReader.term.utf8ToString() + " freq=" + state.docFreq + " tis=" + in);
}
// TODO: would be cleaner, but space-wasting, to
Index: src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java
===================================================================
--- src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java (revision 926738)
+++ src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java (working copy)
@@ -25,6 +25,8 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.packed.PackedInts;
import java.util.List;
import java.util.ArrayList;
@@ -58,7 +60,6 @@
// Placeholder for dir offset
out.writeLong(0);
out.writeInt(termIndexInterval);
- termWriter = new DeltaBytesWriter(out);
}
@Override
@@ -66,8 +67,6 @@
this.termsOut = termsOut;
}
- final private DeltaBytesWriter termWriter;
-
@Override
public FieldWriter addField(FieldInfo field) {
SimpleFieldWriter writer = new SimpleFieldWriter(field);
@@ -78,33 +77,99 @@
private class SimpleFieldWriter extends FieldWriter {
final FieldInfo fieldInfo;
int numIndexTerms;
- private long lastTermsPointer;
final long indexStart;
+ final long termsStart;
+ long packedIndexStart;
+ long packedOffsetsStart;
private int numTerms;
+ // TODO: we could conceivably make a PackedInts wrapper
+ // that auto-grows... then we wouldn't force 6 bytes RAM
+ // per index term:
+ private short[] termLengths;
+ private int[] termsPointerDeltas;
+ private long lastTermsPointer;
+ private long totTermLength;
+
SimpleFieldWriter(FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
indexStart = out.getFilePointer();
- termWriter.reset();
+ termsStart = lastTermsPointer = termsOut.getFilePointer();
+ termLengths = new short[0];
+ termsPointerDeltas = new int[0];
}
@Override
public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
// First term is first indexed term:
if (0 == (numTerms++ % termIndexInterval)) {
- final long termsPointer = termsOut.getFilePointer();
+
if (Codec.DEBUG) {
- Codec.debug("sstiw.checkIndexTerm write index field=" + fieldInfo.name + " term=" + text + " termsFP=" + termsPointer + " numIndexTerms=" + numIndexTerms + " outFP=" + out.getFilePointer());
+ Codec.debug("sstiw.checkIndexTerm write index field=" + fieldInfo.name + " term=" + text.utf8ToString() + " numIndexTerms=" + numIndexTerms + " outFP=" + out.getFilePointer());
}
- termWriter.write(text);
- out.writeVLong(termsPointer - lastTermsPointer);
- lastTermsPointer = termsPointer;
+
+ // write full bytes
+ out.writeBytes(text.bytes, text.offset, text.length);
+
+ if (termLengths.length == numIndexTerms) {
+ termLengths = ArrayUtil.grow(termLengths);
+ }
+ if (termsPointerDeltas.length == numIndexTerms) {
+ termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
+ }
+
+ // save delta terms pointer
+ final long fp = termsOut.getFilePointer();
+ termsPointerDeltas[numIndexTerms] = (int) (fp - lastTermsPointer);
+ lastTermsPointer = fp;
+
+ // save term length (in bytes)
+ assert text.length <= Short.MAX_VALUE;
+ termLengths[numIndexTerms] = (short) text.length;
+
+ totTermLength += text.length;
+
numIndexTerms++;
return true;
} else {
return false;
}
}
+
+ @Override
+ public void finish() throws IOException {
+
+ // write primary terms dict offsets
+ packedIndexStart = out.getFilePointer();
+
+ final long maxValue = termsOut.getFilePointer();
+ PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(maxValue));
+
+ // offsets written are relative to termsStart
+ long upto = 0;
+ for(int i=0;i<numIndexTerms;i++) {
+ upto += termsPointerDeltas[i];
+ w.add(upto);
+ }
+ w.finish();
+
+ packedOffsetsStart = out.getFilePointer();
+
+ // write offsets into the byte[] terms
+ w = PackedInts.getWriter(out, 1+numIndexTerms, PackedInts.bitsRequired(totTermLength));
+ upto = 0;
+ for(int i=0;i<numIndexTerms;i++) {
+ w.add(upto);
+ upto += termLengths[i];
+ }
+ w.add(upto);
+ w.finish();
+
+ // our referrer holds onto us, while other fields are
+ // being written, so don't tie up this RAM:
+ termLengths = null;
+ termsPointerDeltas = null;
+ }
}
@Override
@@ -123,7 +188,10 @@
}
out.writeInt(field.fieldInfo.number);
out.writeInt(field.numIndexTerms);
+ out.writeLong(field.termsStart);
out.writeLong(field.indexStart);
+ out.writeLong(field.packedIndexStart);
+ out.writeLong(field.packedOffsetsStart);
}
out.seek(CodecUtil.headerLength(CODEC_NAME));
out.writeLong(dirStart);
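
A worked example of what finish() emits (illustrative numbers): with termsStart = 1000 and three index terms landing at terms-dict file pointers 1000, 1120 and 1235, checkIndexTerm records the deltas {0, 120, 115}; finish() re-accumulates them and writes the absolute offsets {0, 120, 235} as packed ints, which the reader turns back into file pointers as termsStart + offset. The term lengths {3, 5, 4} are written as the numIndexTerms+1 running offsets {0, 3, 8, 12}, so the reader recovers the length of term i as offsets[i+1] - offsets[i].
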
Index: src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java
===================================================================
--- src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java (revision 926738)
+++ src/java/org/apache/lucene/index/codecs/standard/StandardTermsDictWriter.java (working copy)
@@ -205,7 +205,8 @@
// Finishes all terms in this field
@Override
- public void finish() {
+ public void finish() throws IOException {
+ fieldIndexWriter.finish();
}
}
-}
\ No newline at end of file
+}
Index: src/java/org/apache/lucene/index/codecs/standard/PagedBytes.java
===================================================================
--- src/java/org/apache/lucene/index/codecs/standard/PagedBytes.java (revision 0)
+++ src/java/org/apache/lucene/index/codecs/standard/PagedBytes.java (revision 0)
@@ -0,0 +1,129 @@
+package org.apache.lucene.index.codecs.standard;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.CloseableThreadLocal;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.store.IndexInput;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.io.Closeable;
+import java.io.IOException;
+
+/** Represents a logical byte[] as a series of pages. You
+ * can write-once into the logical byte[], using copy, and
+ * then retrieve slices (BytesRef) into it using fill. */
+class PagedBytes implements Closeable {
+ private final List<byte[]> blocks = new ArrayList<byte[]>();
+ private final int blockSize;
+ private final int blockBits;
+ private final int blockMask;
+ private int upto;
+ private byte[] currentBlock;
+ private final CloseableThreadLocal<byte[]> threadBuffers = new CloseableThreadLocal<byte[]>();
+
+ private static final byte[] EMPTY_BYTES = new byte[0];
+
+ /** 1<<blockBits must be bigger than the biggest single
+ * BytesRef slice that will be pulled */
+ public PagedBytes(int blockBits) {
+ this.blockSize = 1 << blockBits;
+ this.blockBits = blockBits;
+ blockMask = blockSize-1;
+ upto = blockSize;
+ }
+
+ /** Read this many bytes from in */
+ public void copy(IndexInput in, long byteCount) throws IOException {
+ while (byteCount > 0) {
+ int left = blockSize - upto;
+ if (left == 0) {
+ if (currentBlock != null) {
+ blocks.add(currentBlock);
+ }
+ currentBlock = new byte[blockSize];
+ upto = 0;
+ left = blockSize;
+ }
+ if (left < byteCount) {
+ in.readBytes(currentBlock, upto, left, false);
+ upto = blockSize;
+ byteCount -= left;
+ } else {
+ in.readBytes(currentBlock, upto, (int) byteCount, false);
+ upto += byteCount;
+ byteCount = 0;
+ }
+ }
+ }
+
+ /** Commits final byte[], trimming it if necessary. */
+ public void finish() {
+ if (upto < blockSize) {
+ final byte[] newBlock = new byte[upto];
+ System.arraycopy(currentBlock, 0, newBlock, 0, upto);
+ currentBlock = newBlock;
+ }
+ if (currentBlock == null) {
+ currentBlock = EMPTY_BYTES;
+ }
+ blocks.add(currentBlock);
+ currentBlock = null;
+ }
+
+ public long getPointer() {
+ if (currentBlock == null) {
+ return 0;
+ } else {
+ return (blocks.size() * ((long) blockSize)) + upto;
+ }
+ }
+
+ /** Get a slice out of the byte array. */
+ public void fill(BytesRef b, long start, int length) {
+ assert length >= 0: "length=" + length;
+ final int index = (int) (start >> blockBits);
+ final int offset = (int) (start & blockMask);
+ b.length = length;
+ if (blockSize - offset >= length) {
+ // Within block
+ b.bytes = blocks.get(index);
+ b.offset = offset;
+ } else {
+ // Split
+ byte[] buffer = threadBuffers.get();
+ if (buffer == null) {
+ buffer = new byte[length];
+ threadBuffers.set(buffer);
+ } else if (buffer.length < length) {
+ buffer = ArrayUtil.grow(buffer, length);
+ threadBuffers.set(buffer);
+ }
+ b.bytes = buffer;
+ b.offset = 0;
+ System.arraycopy(blocks.get(index), offset, buffer, 0, blockSize-offset);
+ System.arraycopy(blocks.get(1+index), 0, buffer, blockSize-offset, length-(blockSize-offset));
+ }
+ }
+
+ public void close() {
+ threadBuffers.close();
+ }
+}
Property changes on: src/java/org/apache/lucene/index/codecs/standard/PagedBytes.java
___________________________________________________________________
Added: svn:eol-style
+ native
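
A minimal usage sketch for the new PagedBytes class (the IndexInput in and byte count numBytes are illustrative):

    PagedBytes bytes = new PagedBytes(15);  // 32 KB pages; no slice may exceed one page
    bytes.copy(in, numBytes);               // write-once: slurp numBytes from an IndexInput
    bytes.finish();                         // trims and commits the final page
    BytesRef ref = new BytesRef();
    bytes.fill(ref, 0, 5);                  // ref points directly into a page, unless the
                                            // slice straddles a boundary, in which case it
                                            // is copied into a per-thread buffer
    bytes.close();                          // releases the thread-local buffers
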
Index: src/java/org/apache/lucene/util/packed/Packed64.java
===================================================================
--- src/java/org/apache/lucene/util/packed/Packed64.java (revision 926738)
+++ src/java/org/apache/lucene/util/packed/Packed64.java (working copy)
@@ -148,6 +148,7 @@
super(valueCount, bitsPerValue);
int size = size(valueCount, bitsPerValue);
blocks = new long[size+1]; // +1 due to non-conditional tricks
+ // TODO: find a faster way to bulk-read longs...
for(int i=0;i<size;i++) {
blocks[i] = in.readLong();
}
Index: src/java/org/apache/lucene/util/packed/PackedWriter.java
===================================================================
--- src/java/org/apache/lucene/util/packed/PackedWriter.java (revision 926738)
+++ src/java/org/apache/lucene/util/packed/PackedWriter.java (working copy)
@@ -45,7 +45,7 @@
pendingBitPos = 64;
masks = new long[bitsPerValue - 1];
- int v = 1;
+ long v = 1;
for (int i = 0; i < bitsPerValue - 1; i++) {
v *= 2;
masks[i] = v - 1;
@@ -104,7 +104,6 @@
if (pendingBitPos != 64) {
out.writeLong(pending);
}
- out.writeLong(0L); // Dummy to compensate for not using conditionals
}
public String toString() {
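
The int-to-long change above matters once bitsPerValue exceeds 32: with int v the running power of two overflows, so after v reaches 2^31 the next doubling yields 0 and masks[31] becomes -1L (all 64 bits set) instead of the intended bottom-32-bits mask 0xFFFFFFFFL, as does every later mask. With long v, v - 1 stays exact up to masks[61], and masks[62] still comes out right because Long.MIN_VALUE - 1 wraps to Long.MAX_VALUE.
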
Index: src/java/org/apache/lucene/util/packed/PackedReaderIterator.java
===================================================================
--- src/java/org/apache/lucene/util/packed/PackedReaderIterator.java (revision 0)
+++ src/java/org/apache/lucene/util/packed/PackedReaderIterator.java (revision 0)
@@ -0,0 +1,84 @@
+package org.apache.lucene.util.packed;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.IndexInput;
+
+import java.io.IOException;
+
+class PackedReaderIterator implements PackedInts.ReaderIterator {
+ private long pending;
+ private int pendingBitsLeft;
+ private final IndexInput in;
+ private final int bitsPerValue;
+ private final int valueCount;
+
+ // masks[n-1] masks for bottom n bits
+ private final long[] masks;
+
+ public PackedReaderIterator(int bitsPerValue, int valueCount, IndexInput in)
+ throws IOException {
+
+ this.valueCount = valueCount;
+ this.bitsPerValue = bitsPerValue;
+
+ this.in = in;
+
+ masks = new long[bitsPerValue];
+
+ long v = 1;
+ for (int i = 0; i < bitsPerValue; i++) {
+ v *= 2;
+ masks[i] = v - 1;
+ }
+ }
+
+ public int getBitsPerValue() {
+ return bitsPerValue;
+ }
+
+ public int size() {
+ return valueCount;
+ }
+
+ public long next() throws IOException {
+ if (pendingBitsLeft == 0) {
+ pending = in.readLong();
+ pendingBitsLeft = 64;
+ }
+
+ if (pendingBitsLeft >= bitsPerValue) {
+ // not split
+ final long result = (pending >> (pendingBitsLeft - bitsPerValue)) & masks[bitsPerValue-1];
+ pendingBitsLeft -= bitsPerValue;
+ return result;
+ } else {
+ // split
+ final int bits1 = bitsPerValue - pendingBitsLeft;
+ final long result1 = (pending & masks[pendingBitsLeft-1]) << bits1;
+ pending = in.readLong();
+ final long result2 = (pending >> (64 - bits1)) & masks[bits1-1];
+ pendingBitsLeft = 64 + pendingBitsLeft - bitsPerValue;
+ return result1 | result2;
+ }
+ }
+
+ public void close() throws IOException {
+ in.close();
+ }
+}
Property changes on: src/java/org/apache/lucene/util/packed/PackedReaderIterator.java
___________________________________________________________________
Added: svn:eol-style
+ native
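
A worked pass through next() with bitsPerValue = 40, where values straddle long boundaries:

    // first call: pendingBitsLeft == 0, so one long is read (64 bits pending)
    // value 0 (not split): (pending >> 24) & masks[39]   -> 24 bits remain pending
    // value 1 (split):     bits1 = 40 - 24 = 16
    //   result1 = (pending & masks[23]) << 16            // low 24 pending bits, shifted up
    //   result2 = (nextLong >> (64 - 16)) & masks[15]    // top 16 bits of the next long
    //   pendingBitsLeft = 64 + 24 - 40 = 48
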
Index: src/java/org/apache/lucene/util/packed/PackedInts.java
===================================================================
--- src/java/org/apache/lucene/util/packed/PackedInts.java (revision 926738)
+++ src/java/org/apache/lucene/util/packed/PackedInts.java (working copy)
@@ -17,6 +17,8 @@
* limitations under the License.
*/
+import java.io.Closeable;
+
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.CodecUtil;
@@ -65,6 +67,18 @@
}
/**
+ * Run-once iterator interface, to decode previously saved PackedInts.
+ */
+ public static interface ReaderIterator extends Closeable {
+ /** Returns next value */
+ long next() throws IOException;
+ /** Returns number of bits per value */
+ int getBitsPerValue();
+ /** Returns number of values */
+ int size();
+ }
+
+ /**
* A packed integer array that can be modified.
* @lucene.internal
*/
@@ -167,6 +181,22 @@
}
/**
+ * Retrieve PackedInts as a {@link ReaderIterator}
+ * @param in positioned at the beginning of a stored packed int structure.
+ * @return an iterator to access the values
+ * @throws IOException if the structure could not be retrieved.
+ * @lucene.internal
+ */
+ public static ReaderIterator getReaderIterator(IndexInput in) throws IOException {
+ CodecUtil.checkHeader(in, CODEC_NAME, VERSION_START);
+ final int bitsPerValue = in.readVInt();
+ assert bitsPerValue > 0 && bitsPerValue <= 64: "bitsPerValue=" + bitsPerValue;
+ final int valueCount = in.readVInt();
+
+ return new PackedReaderIterator(bitsPerValue, valueCount, in);
+ }
+
+ /**
* Create a packed integer array with the given amount of values initialized
* to 0. The valueCount and the bitsPerValue cannot be changed after creation.
* All Mutables known by this factory are kept fully in RAM.
@@ -228,7 +258,7 @@
} if (maxValue > 0x1FFFFFFFFFFFFFFFL) {
return 62;
}
- return (int) Math.ceil(Math.log(1+maxValue)/Math.log(2.0));
+ return Math.max(1, (int) Math.ceil(Math.log(1+maxValue)/Math.log(2.0)));
}
/**
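
As a worked check on the bitsRequired change (values illustrative):

    assert PackedInts.bitsRequired(0) == 1;    // was ceil(log2(1)) == 0; a 0-bit value is
                                               // unusable, and getReaderIterator above
                                               // asserts bitsPerValue > 0
    assert PackedInts.bitsRequired(1) == 1;    // unchanged
    assert PackedInts.bitsRequired(17) == 5;   // 17 == 10001b
    assert PackedInts.bitsRequired(255) == 8;
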
Index: src/java/org/apache/lucene/util/packed/Packed32.java
===================================================================
--- src/java/org/apache/lucene/util/packed/Packed32.java (revision 926738)
+++ src/java/org/apache/lucene/util/packed/Packed32.java (working copy)
@@ -129,6 +129,7 @@
super(valueCount, bitsPerValue);
int size = size(bitsPerValue, valueCount);
blocks = new int[size + 1]; // +1 due to non-conditional tricks
+ // TODO: find a faster way to bulk-read ints...
for(int i = 0 ; i < size ; i++) {
blocks[i] = in.readInt();
}
Index: src/java/org/apache/lucene/util/ArrayUtil.java
===================================================================
--- src/java/org/apache/lucene/util/ArrayUtil.java (revision 926738)
+++ src/java/org/apache/lucene/util/ArrayUtil.java (working copy)
@@ -232,6 +232,29 @@
return currentSize;
}
+ public static short[] grow(short[] array, int minSize) {
+ if (array.length < minSize) {
+ short[] newArray = new short[oversize(minSize, RamUsageEstimator.NUM_BYTES_SHORT)];
+ System.arraycopy(array, 0, newArray, 0, array.length);
+ return newArray;
+ } else
+ return array;
+ }
+
+ public static short[] grow(short[] array) {
+ return grow(array, 1 + array.length);
+ }
+
+ public static short[] shrink(short[] array, int targetSize) {
+ final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_SHORT);
+ if (newSize != array.length) {
+ short[] newArray = new short[newSize];
+ System.arraycopy(array, 0, newArray, 0, newSize);
+ return newArray;
+ } else
+ return array;
+ }
+
public static int[] grow(int[] array, int minSize) {
if (array.length < minSize) {
int[] newArray = new int[oversize(minSize, RamUsageEstimator.NUM_BYTES_INT)];