Index: CHANGES.txt
===================================================================
--- CHANGES.txt (revision 609389)
+++ CHANGES.txt (working copy)
@@ -342,6 +342,10 @@
14. LUCENE-1098: Make inner class StandardAnalyzer.SavedStreams static
and final. (Nathan Beyer via Michael Busch)
+15. LUCENE-1120: Speed up merging of term vectors by bulk-copying the
+ raw bytes for each contiguous range of non-deleted documents.
+ (Mike McCandless)
+
Documentation
1. LUCENE-1051: Generate separate javadocs for core, demo and contrib
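
The entry above summarizes the heart of this patch: rather than decoding and re-encoding term vectors one document at a time, SegmentMerger now walks each segment in contiguous runs of non-deleted documents and bulk-copies the raw bytes for each run. A minimal sketch of that run detection (illustrative only; reader, MAX_RAW_MERGE_DOCS and copyRawRange stand in for the corresponding pieces of the SegmentMerger change further down):

    // Illustrative sketch: walk maxDoc in runs of live (non-deleted) documents,
    // issuing one bulk copy per run instead of handling documents one at a time.
    static void copyLiveRuns(IndexReader reader) throws IOException {
      final int maxDoc = reader.maxDoc();
      int docNum = 0;
      while (docNum < maxDoc) {
        if (reader.isDeleted(docNum)) {      // skip deleted docs singly
          docNum++;
          continue;
        }
        final int start = docNum;
        int numDocs = 0;
        do {                                 // grow the run while docs stay non-deleted
          docNum++;
          numDocs++;
        } while (docNum < maxDoc && !reader.isDeleted(docNum) && numDocs < MAX_RAW_MERGE_DOCS);
        copyRawRange(start, numDocs);        // hypothetical helper: one raw byte copy per run
      }
    }
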
Index: src/test/org/apache/lucene/index/index.presharedstores.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/test/org/apache/lucene/index/index.presharedstores.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/test/org/apache/lucene/index/index.20.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: src/test/org/apache/lucene/index/index.20.nocfs.zip
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Index: src/test/org/apache/lucene/index/index.21.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: src/test/org/apache/lucene/index/index.21.nocfs.zip
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Index: src/test/org/apache/lucene/index/index.22.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: src/test/org/apache/lucene/index/index.22.nocfs.zip
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Index: src/test/org/apache/lucene/index/index.20.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: src/test/org/apache/lucene/index/index.20.cfs.zip
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Index: src/test/org/apache/lucene/index/index.21.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: src/test/org/apache/lucene/index/index.21.cfs.zip
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Index: src/test/org/apache/lucene/index/index.prelockless.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/test/org/apache/lucene/index/index.22.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: src/test/org/apache/lucene/index/index.22.cfs.zip
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Index: src/test/org/apache/lucene/index/index.prelockless.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java (revision 609389)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java (working copy)
@@ -1932,9 +1932,10 @@
for(int j=0;j<reader.maxDoc();j++) {
if (reader.isDeleted(j))
numDel++;
- else
+ else {
reader.document(j);
- reader.getTermFreqVectors(j);
+ reader.getTermFreqVectors(j);
+ }
}
reader.close();
@@ -1958,9 +1959,10 @@
for(int j=0;j<reader.maxDoc();j++) {
if (reader.isDeleted(j))
numDel++;
- else
+ else {
reader.document(j);
- reader.getTermFreqVectors(j);
+ reader.getTermFreqVectors(j);
+ }
}
reader.close();
assertEquals(0, numDel);
Index: src/test/org/apache/lucene/index/index.19.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: src/test/org/apache/lucene/index/index.19.nocfs.zip
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Index: src/test/org/apache/lucene/index/index.19.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: src/test/org/apache/lucene/index/index.19.cfs.zip
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Index: src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
===================================================================
--- src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (revision 609389)
+++ src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (working copy)
@@ -50,11 +50,11 @@
/*
public void testCreatePreLocklessCFS() throws IOException {
- createIndex("src/test/org/apache/lucene/index/index.prelockless.cfs", true);
+ createIndex("src/test/org/apache/lucene/index/index.cfs", true);
}
public void testCreatePreLocklessNoCFS() throws IOException {
- createIndex("src/test/org/apache/lucene/index/index.prelockless.nocfs", false);
+ createIndex("src/test/org/apache/lucene/index/index.nocfs", false);
}
*/
@@ -106,10 +106,14 @@
rmDir(dirName);
}
- final String[] oldNames = {"prelockless.cfs",
- "prelockless.nocfs",
- "presharedstores.cfs",
- "presharedstores.nocfs"};
+ final String[] oldNames = {"19.cfs",
+ "19.nocfs",
+ "20.cfs",
+ "20.nocfs",
+ "21.cfs",
+ "21.nocfs",
+ "22.cfs",
+ "22.nocfs"};
public void testSearchOldIndex() throws IOException {
for(int i=0;i<oldNames.length;i++) {
@@ -146,6 +150,15 @@
}
}
+ private void testHits(Hits hits, int expectedCount, IndexReader reader) throws IOException {
+ final int hitCount = hits.length();
+ assertEquals("wrong number of hits", expectedCount, hitCount);
+ for(int i=0;i<hitCount;i++) {
+ hits.doc(i);
+ reader.getTermFreqVectors(hits.id(i));
+ }
+ }
+
public void searchIndex(String dirName) throws IOException {
//QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer());
//Query query = parser.parse("handle:1");
@@ -156,12 +169,14 @@
IndexSearcher searcher = new IndexSearcher(dir);
Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
- assertEquals(34, hits.length());
+
+ // First document should be #21 since its norm was
+ // increased:
Document d = hits.doc(0);
-
- // First document should be #21 since it's norm was increased:
assertEquals("didn't get the right document first", "21", d.get("id"));
+ testHits(hits, 34, searcher.getIndexReader());
+
searcher.close();
dir.close();
}
@@ -189,9 +204,9 @@
// make sure searching sees right # hits
IndexSearcher searcher = new IndexSearcher(dir);
Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
- assertEquals("wrong number of hits", 44, hits.length());
Document d = hits.doc(0);
assertEquals("wrong first document", "21", d.get("id"));
+ testHits(hits, 44, searcher.getIndexReader());
searcher.close();
// make sure we can do delete & setNorm against this
@@ -209,6 +224,7 @@
assertEquals("wrong number of hits", 43, hits.length());
d = hits.doc(0);
assertEquals("wrong first document", "22", d.get("id"));
+ testHits(hits, 43, searcher.getIndexReader());
searcher.close();
// optimize
@@ -220,6 +236,7 @@
hits = searcher.search(new TermQuery(new Term("content", "aaa")));
assertEquals("wrong number of hits", 43, hits.length());
d = hits.doc(0);
+ testHits(hits, 43, searcher.getIndexReader());
assertEquals("wrong first document", "22", d.get("id"));
searcher.close();
@@ -257,6 +274,7 @@
assertEquals("wrong number of hits", 33, hits.length());
d = hits.doc(0);
assertEquals("wrong first document", "22", d.get("id"));
+ testHits(hits, 33, searcher.getIndexReader());
searcher.close();
// optimize
@@ -269,6 +287,7 @@
assertEquals("wrong number of hits", 33, hits.length());
d = hits.doc(0);
assertEquals("wrong first document", "22", d.get("id"));
+ testHits(hits, 33, searcher.getIndexReader());
searcher.close();
dir.close();
@@ -283,6 +302,7 @@
Directory dir = FSDirectory.getDirectory(dirName);
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.setUseCompoundFile(doCFS);
+ writer.setMaxBufferedDocs(10);
for(int i=0;i<35;i++) {
addDoc(writer, i);
@@ -393,6 +413,7 @@
Document doc = new Document();
doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
doc.add(new Field("id", Integer.toString(id), Field.Store.YES, Field.Index.UN_TOKENIZED));
+ doc.add(new Field("content2", "here is more content with aaa aaa aaa", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.addDocument(doc);
}
Index: src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java
===================================================================
--- src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java (revision 609389)
+++ src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java (working copy)
@@ -251,7 +251,7 @@
message(" merge thread: done");
- } catch (IOException exc) {
+ } catch (Throwable exc) {
if (merge != null) {
merge.setException(exc);
Index: src/java/org/apache/lucene/index/SegmentMerger.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentMerger.java (revision 609389)
+++ src/java/org/apache/lucene/index/SegmentMerger.java (working copy)
@@ -205,6 +205,38 @@
}
}
+ private SegmentReader[] matchingSegmentReaders;
+ private int[] rawDocLengths;
+ private int[] rawDocLengths2;
+
+ private void setMatchingSegmentReaders() {
+ // If the i'th reader is a SegmentReader and has
+ // identical fieldName -> number mapping, then this
+ // array will be non-null at position i:
+ matchingSegmentReaders = new SegmentReader[readers.size()];
+
+ // If this reader is a SegmentReader, and all of its
+ // field name -> number mappings match the "merged"
+ // FieldInfos, then we can do a bulk copy of the
+ // stored fields:
+ for (int i = 0; i < readers.size(); i++) {
+ IndexReader reader = (IndexReader) readers.elementAt(i);
+ if (reader instanceof SegmentReader) {
+ SegmentReader segmentReader = (SegmentReader) reader;
+ boolean same = true;
+ FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
+ for (int j = 0; same && j < segmentFieldInfos.size(); j++)
+ same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
+ if (same)
+ matchingSegmentReaders[i] = segmentReader;
+ }
+ }
+
+ // Used for bulk-reading raw bytes for stored fields
+ rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
+ rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
+ }
+
/**
*
* @return The number of documents in all of the readers
@@ -248,34 +280,10 @@
int docCount = 0;
+ setMatchingSegmentReaders();
+
if (mergeDocStores) {
- // If the i'th reader is a SegmentReader and has
- // identical fieldName -> number mapping, then this
- // array will be non-null at position i:
- SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.size()];
-
- // If this reader is a SegmentReader, and all of its
- // field name -> number mappings match the "merged"
- // FieldInfos, then we can do a bulk copy of the
- // stored fields:
- for (int i = 0; i < readers.size(); i++) {
- IndexReader reader = (IndexReader) readers.elementAt(i);
- if (reader instanceof SegmentReader) {
- SegmentReader segmentReader = (SegmentReader) reader;
- boolean same = true;
- FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
- for (int j = 0; same && j < segmentFieldInfos.size(); j++)
- same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
- if (same) {
- matchingSegmentReaders[i] = segmentReader;
- }
- }
- }
-
- // Used for bulk-reading raw bytes for stored fields
- final int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
-
// for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
// in merge mode, we use this FieldSelector
FieldSelector fieldSelectorMerge = new FieldSelector() {
@@ -350,15 +358,45 @@
try {
for (int r = 0; r < readers.size(); r++) {
+ final SegmentReader matchingSegmentReader = matchingSegmentReaders[r];
+ TermVectorsReader matchingVectorsReader;
+ if (matchingSegmentReader != null) {
+ matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig;
+
+ // If the TV* files are an older format then they
+ // cannot read raw docs:
+ if (matchingVectorsReader != null && !matchingVectorsReader.canReadRawDocs())
+ matchingVectorsReader = null;
+ } else
+ matchingVectorsReader = null;
IndexReader reader = (IndexReader) readers.elementAt(r);
int maxDoc = reader.maxDoc();
- for (int docNum = 0; docNum < maxDoc; docNum++) {
+ for (int docNum = 0; docNum < maxDoc;) {
// skip deleted docs
- if (reader.isDeleted(docNum))
- continue;
- termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
- if (checkAbort != null)
- checkAbort.work(300);
+ if (!reader.isDeleted(docNum)) {
+ if (matchingVectorsReader != null) {
+ // We can optimize this case (doing a bulk
+ // byte copy) since the field numbers are
+ // identical
+ int start = docNum;
+ int numDocs = 0;
+ do {
+ docNum++;
+ numDocs++;
+ } while(docNum < maxDoc && !matchingSegmentReader.isDeleted(docNum) && numDocs < MAX_RAW_MERGE_DOCS);
+
+ matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
+ termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
+ if (checkAbort != null)
+ checkAbort.work(300*numDocs);
+ } else {
+ termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
+ docNum++;
+ if (checkAbort != null)
+ checkAbort.work(300);
+ }
+ } else
+ docNum++;
}
}
} finally {
Index: src/java/org/apache/lucene/index/DocumentsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentsWriter.java (revision 609389)
+++ src/java/org/apache/lucene/index/DocumentsWriter.java (working copy)
@@ -605,12 +605,12 @@
// Append term vectors to the real outputs:
if (tvx != null) {
tvx.writeLong(tvd.getFilePointer());
+ tvx.writeLong(tvf.getFilePointer());
tvd.writeVInt(numVectorFields);
if (numVectorFields > 0) {
for(int i=0;i<numVectorFields;i++)
tvd.writeVInt(vectorFieldNumbers[i]);
assert 0 == vectorFieldPointers[0];
- tvd.writeVLong(tvf.getFilePointer());
long lastPos = vectorFieldPointers[0];
for(int i=1;i<numVectorFields;i++) {
long pos = vectorFieldPointers[i];
@@ -788,17 +788,19 @@
if (tvx == null) {
assert docStoreSegment != null;
tvx = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
- tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
+ tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
tvd = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
- tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
+ tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
tvf = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
- tvf.writeInt(TermVectorsReader.FORMAT_VERSION);
+ tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
files = null;
// We must "catch up" for all docIDs that had no
// vectors before this one
- for(int i=0;i<docID;i++)
+ for(int i=0;i<docID;i++) {
tvx.writeLong(0);
+ tvx.writeLong(0);
+ }
}
numVectorFields = 0;
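
With FORMAT_VERSION2 the tvx index file stores, after its 4-byte format header, two longs per document: the file pointer into the .tvd file followed by the file pointer into the .tvf file, i.e. 16 bytes per document instead of 8. A sketch of the resulting offset arithmetic, mirroring seekTvx() in the TermVectorsReader change below (illustrative helper, not part of the patch):

    // Illustrative sketch of the tvx record layout: FORMAT_SIZE (4 bytes) of header,
    // then one fixed-size record per document (8 bytes before FORMAT_VERSION2,
    // 16 bytes -- tvd pointer followed by tvf pointer -- from FORMAT_VERSION2 on).
    static long tvxOffset(int docNum, int docStoreOffset, int format) {
      final int recordSize = format >= TermVectorsReader.FORMAT_VERSION2 ? 16 : 8;
      return TermVectorsReader.FORMAT_SIZE + (long) (docNum + docStoreOffset) * recordSize;
    }

This is also why the "catch up" loop above now writes a pair of zero longs for every earlier document that had no vectors.
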
Index: src/java/org/apache/lucene/index/TermVectorsReader.java
===================================================================
--- src/java/org/apache/lucene/index/TermVectorsReader.java (revision 609389)
+++ src/java/org/apache/lucene/index/TermVectorsReader.java (working copy)
@@ -22,13 +22,18 @@
import org.apache.lucene.store.IndexInput;
import java.io.IOException;
+import java.util.Arrays;
/**
* @version $Id$
*/
class TermVectorsReader implements Cloneable {
+ // NOTE: if you make a new format, it must be larger than
+ // the current format
static final int FORMAT_VERSION = 2;
+ static final int FORMAT_VERSION2 = 3;
+
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
static final int FORMAT_SIZE = 4;
@@ -41,13 +46,13 @@
private IndexInput tvd;
private IndexInput tvf;
private int size;
+ private int numTotalDocs;
// The docID offset where our docs begin in the index
// file. This will be 0 if we have our own private file.
private int docStoreOffset;
- private int tvdFormat;
- private int tvfFormat;
+ private final int format;
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
throws CorruptIndexException, IOException {
@@ -56,7 +61,7 @@
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
throws CorruptIndexException, IOException {
- this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE, -1, 0);
+ this(d, segment, fieldInfos, readBufferSize, -1, 0);
}
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
@@ -66,22 +71,35 @@
try {
if (d.fileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION)) {
tvx = d.openInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
- checkValidFormat(tvx);
+ format = checkValidFormat(tvx);
tvd = d.openInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
- tvdFormat = checkValidFormat(tvd);
+ final int tvdFormat = checkValidFormat(tvd);
tvf = d.openInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
- tvfFormat = checkValidFormat(tvf);
+ final int tvfFormat = checkValidFormat(tvf);
+
+ assert format == tvdFormat;
+ assert format == tvfFormat;
+
+ if (format >= FORMAT_VERSION2) {
+ assert (tvx.length()-FORMAT_SIZE) % 16 == 0;
+ numTotalDocs = (int) (tvx.length() >> 4);
+ } else {
+ assert (tvx.length()-FORMAT_SIZE) % 8 == 0;
+ numTotalDocs = (int) (tvx.length() >> 3);
+ }
+
if (-1 == docStoreOffset) {
this.docStoreOffset = 0;
- this.size = (int) (tvx.length() >> 3);
+ this.size = numTotalDocs;
} else {
this.docStoreOffset = docStoreOffset;
this.size = size;
// Verify the file is long enough to hold all of our
// docs
- assert ((int) (tvx.length() / 8)) >= size + docStoreOffset;
+ assert numTotalDocs >= size + docStoreOffset;
}
- }
+ } else
+ format = 0;
this.fieldInfos = fieldInfos;
success = true;
@@ -96,26 +114,94 @@
}
}
}
-
+
+ // Used for bulk copy when merging
+ IndexInput getTvdStream() {
+ return tvd;
+ }
+
+ // Used for bulk copy when merging
+ IndexInput getTvfStream() {
+ return tvf;
+ }
+
+ final private void seekTvx(final int docNum) throws IOException {
+ if (format < FORMAT_VERSION2)
+ tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
+ else
+ tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
+ }
+
+ boolean canReadRawDocs() {
+ return format >= FORMAT_VERSION2;
+ }
+
+ /** Retrieve the length (in bytes) of the tvd and tvf
+ * entries for the next numDocs starting with
+ * startDocID. This is used for bulk copying when
+ * merging segments, if the field numbers are
+ * congruent. Once this returns, the tvf and tvd streams
+ * are positioned at the startDocID. */
+ final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {
+
+ if (tvx == null) {
+ Arrays.fill(tvdLengths, 0);
+ Arrays.fill(tvfLengths, 0);
+ return;
+ }
+
+ // SegmentMerger calls canReadRawDocs() first and should
+ // not call us if that returns false.
+ if (format < FORMAT_VERSION2)
+ throw new IllegalStateException("cannot read raw docs with older term vector formats");
+
+ seekTvx(startDocID);
+
+ long tvdPosition = tvx.readLong();
+ tvd.seek(tvdPosition);
+
+ long tvfPosition = tvx.readLong();
+ tvf.seek(tvfPosition);
+
+ long lastTvdPosition = tvdPosition;
+ long lastTvfPosition = tvfPosition;
+
+ int count = 0;
+ while (count < numDocs) {
+ final int docID = startDocID + count + 1;
+ if (docID < numTotalDocs) {
+ tvdPosition = tvx.readLong();
+ tvfPosition = tvx.readLong();
+ } else {
+ tvdPosition = tvd.length();
+ tvfPosition = tvf.length();
+ }
+ tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
+ tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
+ count++;
+ lastTvdPosition = tvdPosition;
+ lastTvfPosition = tvfPosition;
+ }
+ }
+
private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
{
int format = in.readInt();
- if (format > FORMAT_VERSION)
- {
+ if (format > FORMAT_VERSION2) {
throw new CorruptIndexException("Incompatible format version: " + format + " expected "
- + FORMAT_VERSION + " or less");
+ + FORMAT_VERSION2 + " or less");
}
return format;
}
void close() throws IOException {
- // make all effort to close up. Keep the first exception
- // and throw it as a new one.
- IOException keep = null;
- if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
- if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
- if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
- if (keep != null) throw (IOException) keep.fillInStackTrace();
+ // make all effort to close up. Keep the first exception
+ // and throw it as a new one.
+ IOException keep = null;
+ if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
+ if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
+ if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
+ if (keep != null) throw (IOException) keep.fillInStackTrace();
}
/**
@@ -133,11 +219,11 @@
//We don't need to do this in other seeks because we already have the
// file pointer
//that was written in another file
- tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
+ seekTvx(docNum);
//System.out.println("TVX Pointer: " + tvx.getFilePointer());
- long position = tvx.readLong();
+ long tvdPosition = tvx.readLong();
- tvd.seek(position);
+ tvd.seek(tvdPosition);
int fieldCount = tvd.readVInt();
//System.out.println("Num Fields: " + fieldCount);
// There are only a few fields per document. We opt for a full scan
@@ -146,7 +232,7 @@
int number = 0;
int found = -1;
for (int i = 0; i < fieldCount; i++) {
- if(tvdFormat == FORMAT_VERSION)
+ if (format >= FORMAT_VERSION)
number = tvd.readVInt();
else
number += tvd.readVInt();
@@ -159,8 +245,12 @@
// document
if (found != -1) {
// Compute position in the tvf file
- position = 0;
- for (int i = 0; i <= found; i++)
+ long position;
+ if (format >= FORMAT_VERSION2)
+ position = tvx.readLong();
+ else
+ position = tvd.readVLong();
+ for (int i = 1; i <= found; i++)
position += tvd.readVLong();
mapper.setDocumentNumber(docNum);
@@ -201,10 +291,10 @@
TermFreqVector[] result = null;
if (tvx != null) {
//We need to offset by
- tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
- long position = tvx.readLong();
+ seekTvx(docNum);
+ long tvdPosition = tvx.readLong();
- tvd.seek(position);
+ tvd.seek(tvdPosition);
int fieldCount = tvd.readVInt();
// No fields are vectorized for this document
@@ -213,7 +303,7 @@
String[] fields = new String[fieldCount];
for (int i = 0; i < fieldCount; i++) {
- if(tvdFormat == FORMAT_VERSION)
+ if (format >= FORMAT_VERSION)
number = tvd.readVInt();
else
number += tvd.readVInt();
@@ -222,9 +312,16 @@
}
// Compute position in the tvf file
- position = 0;
+ long position;
+ if (format >= FORMAT_VERSION2)
+ position = tvx.readLong();
+ else
+ position = tvd.readVLong();
+
long[] tvfPointers = new long[fieldCount];
- for (int i = 0; i < fieldCount; i++) {
+ tvfPointers[0] = position;
+
+ for (int i = 1; i < fieldCount; i++) {
position += tvd.readVLong();
tvfPointers[i] = position;
}
@@ -241,10 +338,11 @@
// Check if no term vectors are available for this segment at all
if (tvx != null) {
//We need to offset by
- tvx.seek((docNumber * 8L) + FORMAT_SIZE);
- long position = tvx.readLong();
- tvd.seek(position);
+ seekTvx(docNumber);
+ long tvdPosition = tvx.readLong();
+
+ tvd.seek(tvdPosition);
int fieldCount = tvd.readVInt();
// No fields are vectorized for this document
@@ -253,7 +351,7 @@
String[] fields = new String[fieldCount];
for (int i = 0; i < fieldCount; i++) {
- if(tvdFormat == FORMAT_VERSION)
+ if (format >= FORMAT_VERSION)
number = tvd.readVInt();
else
number += tvd.readVInt();
@@ -262,9 +360,14 @@
}
// Compute position in the tvf file
- position = 0;
+ long position;
+ if (format >= FORMAT_VERSION2)
+ position = tvx.readLong();
+ else
+ position = tvd.readVLong();
long[] tvfPointers = new long[fieldCount];
- for (int i = 0; i < fieldCount; i++) {
+ tvfPointers[0] = position;
+ for (int i = 1; i < fieldCount; i++) {
position += tvd.readVLong();
tvfPointers[i] = position;
}
@@ -293,9 +396,8 @@
private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
throws IOException {
for (int i = 0; i < fields.length; i++) {
- readTermVector(fields[i], tvfPointers[i], mapper);
+ readTermVector(fields[i], tvfPointers[i], mapper);
}
-
}
@@ -324,7 +426,7 @@
boolean storePositions;
boolean storeOffsets;
- if(tvfFormat == FORMAT_VERSION){
+ if (format >= FORMAT_VERSION){
byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
@@ -400,8 +502,6 @@
}
}
-
-
protected Object clone() {
if (tvx == null || tvd == null || tvf == null)
@@ -418,11 +518,9 @@
return clone;
}
+}
-
-}
-
/**
* Models the existing parallel array structure
*/
Index: src/java/org/apache/lucene/index/TermVectorsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/TermVectorsWriter.java (revision 609389)
+++ src/java/org/apache/lucene/index/TermVectorsWriter.java (working copy)
@@ -33,11 +33,11 @@
throws IOException {
// Open files for TermVector storage
tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
- tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
+ tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
- tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
+ tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
tvf = directory.createOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
- tvf.writeInt(TermVectorsReader.FORMAT_VERSION);
+ tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
this.fieldInfos = fieldInfos;
}
@@ -53,6 +53,7 @@
throws IOException {
tvx.writeLong(tvd.getFilePointer());
+ tvx.writeLong(tvf.getFilePointer());
if (vectors != null) {
final int numFields = vectors.length;
@@ -145,8 +146,8 @@
}
// 2nd pass: write field pointers to tvd
- long lastFieldPointer = 0;
- for (int i=0; i<numFields; i++) {
+ long lastFieldPointer = fieldPointers[0];
+ for (int i=1; i<numFields; i++) {
final long fieldPointer = fieldPointers[i];
tvd.writeVLong(fieldPointer-lastFieldPointer);
lastFieldPointer = fieldPointer;
@@ -154,6 +155,28 @@
} else
tvd.writeVInt(0);
}
+
+ /**
+ * Do a bulk copy of numDocs documents from reader to our
+ * streams. This is used to expedite merging, if the
+ * field numbers are congruent.
+ */
+ final void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException {
+ long tvdPosition = tvd.getFilePointer();
+ long tvfPosition = tvf.getFilePointer();
+ long tvdStart = tvdPosition;
+ long tvfStart = tvfPosition;
+ for(int i=0;i<numDocs;i++) {
+ tvx.writeLong(tvdPosition);
+ tvdPosition += tvdLengths[i];
+ tvx.writeLong(tvfPosition);
+ tvfPosition += tvfLengths[i];
+ }
+ tvd.copyBytes(reader.getTvdStream(), tvdPosition-tvdStart);
+ tvf.copyBytes(reader.getTvfStream(), tvfPosition-tvfStart);
+ assert tvd.getFilePointer() == tvdPosition;
+ assert tvf.getFilePointer() == tvfPosition;
+ }
/** Close all streams. */
final void close() throws IOException {
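
Taken together, the reader and writer changes give the merge path a small contract: check canReadRawDocs(), ask rawDocs() for the per-document byte lengths of one run, then hand those lengths to addRawDocuments() for a raw byte copy. A minimal usage sketch, assuming start/numDocs describe one contiguous run of non-deleted documents (as in the SegmentMerger change above) and that matchingVectorsReader comes from a segment whose field numbers match the merged FieldInfos:

    // Illustrative sketch of the merge-time fast path; names follow the patch,
    // but start/numDocs/matchingVectorsReader are assumed to be set up by the caller.
    if (matchingVectorsReader != null && matchingVectorsReader.canReadRawDocs()) {
      final int[] tvdLengths = new int[numDocs];
      final int[] tvfLengths = new int[numDocs];
      // Fills the length arrays and leaves the reader's tvd/tvf streams
      // positioned at the first document of the run:
      matchingVectorsReader.rawDocs(tvdLengths, tvfLengths, start, numDocs);
      // Writes one tvx entry per document, then copies the raw tvd/tvf bytes:
      termVectorsWriter.addRawDocuments(matchingVectorsReader, tvdLengths, tvfLengths, numDocs);
    } else {
      for (int d = start; d < start + numDocs; d++)   // slow path: decode and re-add vectors
        termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(d));
    }

rawDocs() derives each length as the difference between consecutive tvx pointers, falling back to the tvd/tvf file lengths for the last document, so the writer never has to parse the vector data it copies.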