Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 609389)
+++ CHANGES.txt	(working copy)
@@ -342,6 +342,10 @@
 14. LUCENE-1098: Make inner class StandardAnalyzer.SavedStreams static
     and final. (Nathan Beyer via Michael Busch)
 
+15. LUCENE-1120: Speed up merging of term vectors by bulk-copying the
+    raw bytes for each contiguous range of non-deleted documents.
+    (Mike McCandless)
+
 Documentation
 
  1. LUCENE-1051: Generate separate javadocs for core, demo and contrib
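The entry above is the heart of this patch. A minimal sketch of the idea, assuming a hypothetical copyRawVectorBytes helper in place of the actual reader/writer plumbing added further down (rawDocs/addRawDocuments): scan for contiguous runs of non-deleted documents and copy each run's raw term-vector bytes in one call, instead of decoding and re-encoding every document's vectors during a merge.

    static void copyLiveRuns(IndexReader reader) throws IOException {
      final int maxDoc = reader.maxDoc();
      int docNum = 0;
      while (docNum < maxDoc) {
        if (reader.isDeleted(docNum)) {
          docNum++;                      // deleted doc: nothing to copy
          continue;
        }
        final int start = docNum;        // first doc of a live run
        while (docNum < maxDoc && !reader.isDeleted(docNum))
          docNum++;
        // hypothetical stand-in for the rawDocs()/addRawDocuments() pair:
        copyRawVectorBytes(start, docNum - start);
      }
    }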
Index: src/test/org/apache/lucene/index/index.presharedstores.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/test/org/apache/lucene/index/index.presharedstores.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/test/org/apache/lucene/index/index.20.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.20.nocfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.21.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.21.nocfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.22.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.22.nocfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.20.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.20.cfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.21.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.21.cfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.prelockless.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/test/org/apache/lucene/index/index.22.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.22.cfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.prelockless.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java	(revision 609389)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java	(working copy)
@@ -1932,9 +1932,10 @@
       for(int j=0;j<reader.maxDoc();j++) {
         if (reader.isDeleted(j))
           numDel++;
-        else
+        else {
           reader.document(j);
-        reader.getTermFreqVectors(j);
+          reader.getTermFreqVectors(j);
+        }
       }
       reader.close();
 
@@ -1958,9 +1959,10 @@
       for(int j=0;j<reader.maxDoc();j++) {
         if (reader.isDeleted(j))
           numDel++;
-        else
+        else {
           reader.document(j);
-        reader.getTermFreqVectors(j);
+          reader.getTermFreqVectors(j);
+        }
       }
       reader.close();
       assertEquals(0, numDel);
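The added braces above fix a dangling-statement bug in the test loop: without them only reader.document(j) was governed by the else, so reader.getTermFreqVectors(j) also ran for deleted documents. A minimal illustration of the two shapes:

    // before: the vector load is unconditional
    if (reader.isDeleted(j))
      numDel++;
    else
      reader.document(j);
    reader.getTermFreqVectors(j);   // runs even when doc j is deleted

    // after: both loads happen only for live docs
    if (reader.isDeleted(j))
      numDel++;
    else {
      reader.document(j);
      reader.getTermFreqVectors(j);
    }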
Index: src/test/org/apache/lucene/index/index.19.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.19.nocfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.19.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.19.cfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
===================================================================
--- src/test/org/apache/lucene/index/TestBackwardsCompatibility.java	(revision 609389)
+++ src/test/org/apache/lucene/index/TestBackwardsCompatibility.java	(working copy)
@@ -50,11 +50,11 @@
 
   /*
   public void testCreatePreLocklessCFS() throws IOException {
-    createIndex("src/test/org/apache/lucene/index/index.prelockless.cfs", true);
+    createIndex("src/test/org/apache/lucene/index/index.cfs", true);
   }
 
   public void testCreatePreLocklessNoCFS() throws IOException {
-    createIndex("src/test/org/apache/lucene/index/index.prelockless.nocfs", false);
+    createIndex("src/test/org/apache/lucene/index/index.nocfs", false);
   }
   */
 
@@ -106,10 +106,14 @@
     rmDir(dirName);
   }
 
-  final String[] oldNames = {"prelockless.cfs",
-                             "prelockless.nocfs",
-                             "presharedstores.cfs",
-                             "presharedstores.nocfs"};
+  final String[] oldNames = {"19.cfs",
+                             "19.nocfs",
+                             "20.cfs",
+                             "20.nocfs",
+                             "21.cfs",
+                             "21.nocfs",
+                             "22.cfs",
+                             "22.nocfs"};
 
   public void testSearchOldIndex() throws IOException {
     for(int i=0;i<oldNames.length;i++) {
@@ -146,6 +150,15 @@
     }
   }
 
+  private void testHits(Hits hits, int expectedCount, IndexReader reader) throws IOException {
+    final int hitCount = hits.length();
+    assertEquals("wrong number of hits", expectedCount, hitCount);
+    for(int i=0;i<hitCount;i++) {
+      hits.doc(i);
+      reader.getTermFreqVectors(hits.id(i));
+    }
+  }
+
   public void searchIndex(String dirName) throws IOException {
     //QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer());
     //Query query = parser.parse("handle:1");
@@ -156,12 +169,14 @@
     IndexSearcher searcher = new IndexSearcher(dir);
 
     Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
-    assertEquals(34, hits.length());
+
+    // First document should be #21 since its norm was
+    // increased:
     Document d = hits.doc(0);
-
-    // First document should be #21 since it's norm was increased:
     assertEquals("didn't get the right document first", "21", d.get("id"));
 
+    testHits(hits, 34, searcher.getIndexReader());
+
     searcher.close();
     dir.close();
   }
@@ -189,9 +204,9 @@
     // make sure searching sees right # hits
     IndexSearcher searcher = new IndexSearcher(dir);
     Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
-    assertEquals("wrong number of hits", 44, hits.length());
     Document d = hits.doc(0);
     assertEquals("wrong first document", "21", d.get("id"));
+    testHits(hits, 44, searcher.getIndexReader());
     searcher.close();
 
     // make sure we can do delete & setNorm against this
@@ -209,6 +224,7 @@
     assertEquals("wrong number of hits", 43, hits.length());
     d = hits.doc(0);
     assertEquals("wrong first document", "22", d.get("id"));
+    testHits(hits, 43, searcher.getIndexReader());
     searcher.close();
 
     // optimize
@@ -220,6 +236,7 @@
     hits = searcher.search(new TermQuery(new Term("content", "aaa")));
     assertEquals("wrong number of hits", 43, hits.length());
     d = hits.doc(0);
+    testHits(hits, 43, searcher.getIndexReader());
     assertEquals("wrong first document", "22", d.get("id"));
     searcher.close();
 
@@ -257,6 +274,7 @@
     assertEquals("wrong number of hits", 33, hits.length());
     d = hits.doc(0);
     assertEquals("wrong first document", "22", d.get("id"));
+    testHits(hits, 33, searcher.getIndexReader());
     searcher.close();
 
     // optimize
@@ -269,6 +287,7 @@
     assertEquals("wrong number of hits", 33, hits.length());
     d = hits.doc(0);
     assertEquals("wrong first document", "22", d.get("id"));
+    testHits(hits, 33, searcher.getIndexReader());
     searcher.close();
 
     dir.close();
@@ -283,6 +302,7 @@
     Directory dir = FSDirectory.getDirectory(dirName);
     IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
     writer.setUseCompoundFile(doCFS);
+    writer.setMaxBufferedDocs(10);
 
     for(int i=0;i<35;i++) {
       addDoc(writer, i);
@@ -393,6 +413,7 @@
     Document doc = new Document();
     doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
     doc.add(new Field("id", Integer.toString(id), Field.Store.YES, Field.Index.UN_TOKENIZED));
+    doc.add(new Field("content2", "here is more content with aaa aaa aaa", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
     writer.addDocument(doc);
   }
 
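Two details of the test changes above matter for coverage: the new content2 field is stored with term vectors (positions and offsets), so every generated back-compat index now contains vectors for the merge path to copy, and testHits loads each hit's stored document and term vectors instead of only counting hits. A typical call, as the patch itself uses it:

    Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
    testHits(hits, 34, searcher.getIndexReader());  // count + stored fields + vectors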
Index: src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java
===================================================================
--- src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java	(revision 609389)
+++ src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java	(working copy)
@@ -251,7 +251,7 @@
 
         message("  merge thread: done");
 
-      } catch (IOException exc) {
+      } catch (Throwable exc) {
 
         if (merge != null) {
           merge.setException(exc);
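Widening the catch from IOException to Throwable matters because this code runs on a background merge thread: a RuntimeException or Error thrown during a merge would previously escape the catch and unwind the thread without the failure ever being attached to the merge. The shape of the pattern, reduced to its essentials (a sketch, not the scheduler's full logic):

    try {
      doMerge(merge);                // the actual merge work
    } catch (Throwable exc) {        // not just IOException
      if (merge != null)
        merge.setException(exc);     // record it so the failure can be surfaced later
    }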
Index: src/java/org/apache/lucene/index/SegmentMerger.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentMerger.java	(revision 609389)
+++ src/java/org/apache/lucene/index/SegmentMerger.java	(working copy)
@@ -205,6 +205,38 @@
     }
   }
 
+  private SegmentReader[] matchingSegmentReaders;
+  private int[] rawDocLengths;
+  private int[] rawDocLengths2;
+
+  private void setMatchingSegmentReaders() {
+    // If the i'th reader is a SegmentReader and has
+    // identical fieldName -> number mapping, then this
+    // array will be non-null at position i:
+    matchingSegmentReaders = new SegmentReader[readers.size()];
+
+    // If this reader is a SegmentReader, and all of its
+    // field name -> number mappings match the "merged"
+    // FieldInfos, then we can do a bulk copy of the
+    // stored fields:
+    for (int i = 0; i < readers.size(); i++) {
+      IndexReader reader = (IndexReader) readers.elementAt(i);
+      if (reader instanceof SegmentReader) {
+        SegmentReader segmentReader = (SegmentReader) reader;
+        boolean same = true;
+        FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
+        for (int j = 0; same && j < segmentFieldInfos.size(); j++)
+          same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
+        if (same)
+          matchingSegmentReaders[i] = segmentReader;
+      }
+    }
+
+    // Used for bulk-reading raw bytes for stored fields
+    rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
+    rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
+  }
+
   /**
    *
    * @return The number of documents in all of the readers
@@ -248,34 +280,10 @@
 
     int docCount = 0;
 
+    setMatchingSegmentReaders();
+
     if (mergeDocStores) {
 
-      // If the i'th reader is a SegmentReader and has
-      // identical fieldName -> number mapping, then this
-      // array will be non-null at position i:
-      SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.size()];
-
-      // If this reader is a SegmentReader, and all of its
-      // field name -> number mappings match the "merged"
-      // FieldInfos, then we can do a bulk copy of the
-      // stored fields:
-      for (int i = 0; i < readers.size(); i++) {
-        IndexReader reader = (IndexReader) readers.elementAt(i);
-        if (reader instanceof SegmentReader) {
-          SegmentReader segmentReader = (SegmentReader) reader;
-          boolean same = true;
-          FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
-          for (int j = 0; same && j < segmentFieldInfos.size(); j++)
-            same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
-          if (same) {
-            matchingSegmentReaders[i] = segmentReader;
-          }
-        }
-      }
-
-      // Used for bulk-reading raw bytes for stored fields
-      final int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
-
       // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
       // in merge mode, we use this FieldSelector
       FieldSelector fieldSelectorMerge = new FieldSelector() {
@@ -350,15 +358,45 @@
 
     try {
       for (int r = 0; r < readers.size(); r++) {
+        final SegmentReader matchingSegmentReader = matchingSegmentReaders[r];
+        TermVectorsReader matchingVectorsReader;
+        if (matchingSegmentReader != null) {
+          matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig;
+
+          // If the TV* files are an older format then they
+          // cannot read raw docs:
+          if (matchingVectorsReader != null && !matchingVectorsReader.canReadRawDocs())
+            matchingVectorsReader = null;
+        } else
+          matchingVectorsReader = null;
         IndexReader reader = (IndexReader) readers.elementAt(r);
         int maxDoc = reader.maxDoc();
-        for (int docNum = 0; docNum < maxDoc; docNum++) {
+        for (int docNum = 0; docNum < maxDoc;) {
           // skip deleted docs
-          if (reader.isDeleted(docNum))
-            continue;
-          termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
-          if (checkAbort != null)
-            checkAbort.work(300);
+          if (!reader.isDeleted(docNum)) {
+            if (matchingVectorsReader != null) {
+              // We can optimize this case (doing a bulk
+              // byte copy) since the field numbers are
+              // identical
+              int start = docNum;
+              int numDocs = 0;
+              do {
+                docNum++;
+                numDocs++;
+              } while(docNum < maxDoc && !matchingSegmentReader.isDeleted(docNum) && numDocs < MAX_RAW_MERGE_DOCS);
+
+              matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
+              termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
+              if (checkAbort != null)
+                checkAbort.work(300*numDocs);
+            } else {
+              termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
+              docNum++;
+              if (checkAbort != null)
+                checkAbort.work(300);
+            }
+          } else
+            docNum++;
         }
       }
     } finally {
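The rewritten loop above pairs two new calls: TermVectorsReader.rawDocs() fills the two length arrays with the byte sizes of the tvd and tvf entries for up to MAX_RAW_MERGE_DOCS consecutive live documents, leaving its streams positioned at the first of them, and TermVectorsWriter.addRawDocuments() then copies exactly that many bytes while writing matching tvx index entries. For one run of live docs beginning at start:

    // fill rawDocLengths (tvd sizes) and rawDocLengths2 (tvf sizes) ...
    matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
    // ... then bulk-copy those bytes and index them in the new segment
    termVectorsWriter.addRawDocuments(matchingVectorsReader,
                                      rawDocLengths, rawDocLengths2, numDocs);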
Index: src/java/org/apache/lucene/index/DocumentsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentsWriter.java	(revision 609389)
+++ src/java/org/apache/lucene/index/DocumentsWriter.java	(working copy)
@@ -605,12 +605,12 @@
       // Append term vectors to the real outputs:
       if (tvx != null) {
         tvx.writeLong(tvd.getFilePointer());
+        tvx.writeLong(tvf.getFilePointer());
         tvd.writeVInt(numVectorFields);
         if (numVectorFields > 0) {
           for(int i=0;i<numVectorFields;i++)
             tvd.writeVInt(vectorFieldNumbers[i]);
           assert 0 == vectorFieldPointers[0];
-          tvd.writeVLong(tvf.getFilePointer());
           long lastPos = vectorFieldPointers[0];
           for(int i=1;i<numVectorFields;i++) {
             long pos = vectorFieldPointers[i];
@@ -788,17 +788,19 @@
     if (tvx == null) {
       assert docStoreSegment != null;
       tvx = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
-      tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
+      tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
      tvd = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
-      tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
+      tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
      tvf = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
-      tvf.writeInt(TermVectorsReader.FORMAT_VERSION);
+      tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
       files = null;
 
       // We must "catch up" for all docIDs that had no
       // vectors before this one
-      for(int i=0;i<docID;i++)
+      for(int i=0;i<docID;i++) {
         tvx.writeLong(0);
+        tvx.writeLong(0);
+      }
     }
 
     numVectorFields = 0;
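Both DocumentsWriter changes above follow from the new tvx record layout: each document's index entry is now two longs, the absolute tvd position followed by the absolute tvf position of the document's first vectorized field, and tvd itself keeps only the VLong deltas between consecutive field pointers. A worked example, assuming a document with three vectorized fields whose data starts at tvf positions 100, 250 and 400:

    // tvx entry for the doc:  tvdPointer, 100      (two longs)
    // tvd entry:              numFields=3, field numbers, then deltas 150, 150
    // a reader reconstructs:  100, 100 + 150 = 250, 250 + 150 = 400

The "catch up" loop likewise now writes two longs, not one, per vector-less document, so that every tvx entry keeps the same 16-byte width.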
Index: src/java/org/apache/lucene/index/TermVectorsReader.java
===================================================================
--- src/java/org/apache/lucene/index/TermVectorsReader.java	(revision 609389)
+++ src/java/org/apache/lucene/index/TermVectorsReader.java	(working copy)
@@ -22,13 +22,18 @@
 import org.apache.lucene.store.IndexInput;
 
 import java.io.IOException;
+import java.util.Arrays;
 
 /**
  * @version $Id$
 */
 class TermVectorsReader implements Cloneable {
 
+  // NOTE: if you make a new format, it must be larger than
+  // the current format
   static final int FORMAT_VERSION = 2;
+  static final int FORMAT_VERSION2 = 3;
+
   //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
   static final int FORMAT_SIZE = 4;
 
@@ -41,13 +46,13 @@
   private IndexInput tvd;
   private IndexInput tvf;
   private int size;
+  private int numTotalDocs;
 
   // The docID offset where our docs begin in the index
   // file.  This will be 0 if we have our own private file.
   private int docStoreOffset;
 
-  private int tvdFormat;
-  private int tvfFormat;
+  private final int format;
 
   TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
     throws CorruptIndexException, IOException {
@@ -56,7 +61,7 @@
 
   TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
     throws CorruptIndexException, IOException {
-    this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE, -1, 0);
+    this(d, segment, fieldInfos, readBufferSize, -1, 0);
   }
 
   TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
@@ -66,22 +71,35 @@
     try {
       if (d.fileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION)) {
         tvx = d.openInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
-        checkValidFormat(tvx);
+        format = checkValidFormat(tvx);
         tvd = d.openInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
-        tvdFormat = checkValidFormat(tvd);
+        final int tvdFormat = checkValidFormat(tvd);
         tvf = d.openInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
-        tvfFormat = checkValidFormat(tvf);
+        final int tvfFormat = checkValidFormat(tvf);
+
+        assert format == tvdFormat;
+        assert format == tvfFormat;
+
+        if (format >= FORMAT_VERSION2) {
+          assert (tvx.length()-FORMAT_SIZE) % 16 == 0;
+          numTotalDocs = (int) (tvx.length() >> 4);
+        } else {
+          assert (tvx.length()-FORMAT_SIZE) % 8 == 0;
+          numTotalDocs = (int) (tvx.length() >> 3);
+        }
+
         if (-1 == docStoreOffset) {
           this.docStoreOffset = 0;
-          this.size = (int) (tvx.length() >> 3);
+          this.size = numTotalDocs;
         } else {
           this.docStoreOffset = docStoreOffset;
           this.size = size;
           // Verify the file is long enough to hold all of our
           // docs
-          assert ((int) (tvx.length() / 8)) >= size + docStoreOffset;
+          assert numTotalDocs >= size + docStoreOffset;
         }
-      }
+      } else
+        format = 0;
 
       this.fieldInfos = fieldInfos;
       success = true;
@@ -96,26 +114,94 @@
       }
     }
   }
-
+
+  // Used for bulk copy when merging
+  IndexInput getTvdStream() {
+    return tvd;
+  }
+
+  // Used for bulk copy when merging
+  IndexInput getTvfStream() {
+    return tvf;
+  }
+
+  final private void seekTvx(final int docNum) throws IOException {
+    if (format < FORMAT_VERSION2)
+      tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
+    else
+      tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
+  }
+
+  boolean canReadRawDocs() {
+    return format >= FORMAT_VERSION2;
+  }
+
+  /** Retrieve the length (in bytes) of the tvd and tvf
+   *  entries for the next numDocs starting with
+   *  startDocID.  This is used for bulk copying when
+   *  merging segments, if the field numbers are
+   *  congruent.  Once this returns, the tvd and tvf streams
+   *  are positioned at the startDocID. */
+  final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {
+
+    if (tvx == null) {
+      Arrays.fill(tvdLengths, 0);
+      Arrays.fill(tvfLengths, 0);
+      return;
+    }
+
+    // SegmentMerger calls canReadRawDocs() first and should
+    // not call us if that returns false.
+    if (format < FORMAT_VERSION2)
+      throw new IllegalStateException("cannot read raw docs with older term vector formats");
+
+    seekTvx(startDocID);
+
+    long tvdPosition = tvx.readLong();
+    tvd.seek(tvdPosition);
+
+    long tvfPosition = tvx.readLong();
+    tvf.seek(tvfPosition);
+
+    long lastTvdPosition = tvdPosition;
+    long lastTvfPosition = tvfPosition;
+
+    int count = 0;
+    while (count < numDocs) {
+      final int docID = startDocID + count + 1;
+      if (docID < numTotalDocs) {
+        tvdPosition = tvx.readLong();
+        tvfPosition = tvx.readLong();
+      } else {
+        tvdPosition = tvd.length();
+        tvfPosition = tvf.length();
+      }
+      tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
+      tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
+      count++;
+      lastTvdPosition = tvdPosition;
+      lastTvfPosition = tvfPosition;
+    }
+  }
+
   private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
   {
     int format = in.readInt();
-    if (format > FORMAT_VERSION)
-    {
+    if (format > FORMAT_VERSION2) {
       throw new CorruptIndexException("Incompatible format version: " + format + " expected "
-                                      + FORMAT_VERSION + " or less");
+                                      + FORMAT_VERSION2 + " or less");
     }
     return format;
   }
 
   void close() throws IOException {
-      // make all effort to close up. Keep the first exception
-      // and throw it as a new one.
-      IOException keep = null;
-      if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
-      if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
-      if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
-      if (keep != null) throw (IOException) keep.fillInStackTrace();
+    // make all effort to close up. Keep the first exception
+    // and throw it as a new one.
+    IOException keep = null;
+    if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
+    if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
+    if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
+    if (keep != null) throw (IOException) keep.fillInStackTrace();
   }
 
   /**
@@ -133,11 +219,11 @@
     //We don't need to do this in other seeks because we already have the
     // file pointer
     //that was written in another file
-    tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
+    seekTvx(docNum);
     //System.out.println("TVX Pointer: " + tvx.getFilePointer());
-    long position = tvx.readLong();
+    long tvdPosition = tvx.readLong();
 
-    tvd.seek(position);
+    tvd.seek(tvdPosition);
     int fieldCount = tvd.readVInt();
     //System.out.println("Num Fields: " + fieldCount);
     // There are only a few fields per document. We opt for a full scan
@@ -146,7 +232,7 @@
     int number = 0;
     int found = -1;
     for (int i = 0; i < fieldCount; i++) {
-      if(tvdFormat == FORMAT_VERSION)
+      if (format >= FORMAT_VERSION)
         number = tvd.readVInt();
       else
         number += tvd.readVInt();
@@ -159,8 +245,12 @@
     // document
     if (found != -1) {
       // Compute position in the tvf file
-      position = 0;
-      for (int i = 0; i <= found; i++)
+      long position;
+      if (format >= FORMAT_VERSION2)
+        position = tvx.readLong();
+      else
+        position = tvd.readVLong();
+      for (int i = 1; i <= found; i++)
         position += tvd.readVLong();
 
       mapper.setDocumentNumber(docNum);
@@ -201,10 +291,10 @@
     TermFreqVector[] result = null;
     if (tvx != null) {
       //We need to offset by
-      tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
-      long position = tvx.readLong();
+      seekTvx(docNum);
+      long tvdPosition = tvx.readLong();
 
-      tvd.seek(position);
+      tvd.seek(tvdPosition);
       int fieldCount = tvd.readVInt();
 
       // No fields are vectorized for this document
@@ -213,7 +303,7 @@
         String[] fields = new String[fieldCount];
 
         for (int i = 0; i < fieldCount; i++) {
-          if(tvdFormat == FORMAT_VERSION)
+          if (format >= FORMAT_VERSION)
             number = tvd.readVInt();
           else
             number += tvd.readVInt();
@@ -222,9 +312,16 @@
         }
 
         // Compute position in the tvf file
-        position = 0;
+        long position;
+        if (format >= FORMAT_VERSION2)
+          position = tvx.readLong();
+        else
+          position = tvd.readVLong();
+
         long[] tvfPointers = new long[fieldCount];
-        for (int i = 0; i < fieldCount; i++) {
+        tvfPointers[0] = position;
+
+        for (int i = 1; i < fieldCount; i++) {
           position += tvd.readVLong();
           tvfPointers[i] = position;
         }
@@ -241,10 +338,11 @@
     // Check if no term vectors are available for this segment at all
     if (tvx != null) {
       //We need to offset by
-      tvx.seek((docNumber * 8L) + FORMAT_SIZE);
-      long position = tvx.readLong();
 
-      tvd.seek(position);
+      seekTvx(docNumber);
+      long tvdPosition = tvx.readLong();
+
+      tvd.seek(tvdPosition);
       int fieldCount = tvd.readVInt();
 
       // No fields are vectorized for this document
@@ -253,7 +351,7 @@
         String[] fields = new String[fieldCount];
 
         for (int i = 0; i < fieldCount; i++) {
-          if(tvdFormat == FORMAT_VERSION)
+          if (format >= FORMAT_VERSION)
             number = tvd.readVInt();
           else
             number += tvd.readVInt();
@@ -262,9 +360,14 @@
         }
 
         // Compute position in the tvf file
-        position = 0;
+        long position;
+        if (format >= FORMAT_VERSION2)
+          position = tvx.readLong();
+        else
+          position = tvd.readVLong();
         long[] tvfPointers = new long[fieldCount];
-        for (int i = 0; i < fieldCount; i++) {
+        tvfPointers[0] = position;
+        for (int i = 1; i < fieldCount; i++) {
           position += tvd.readVLong();
           tvfPointers[i] = position;
         }
@@ -293,9 +396,8 @@
   private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
           throws IOException {
     for (int i = 0; i < fields.length; i++) {
-       readTermVector(fields[i], tvfPointers[i], mapper);
+      readTermVector(fields[i], tvfPointers[i], mapper);
     }
-
   }
 
 
@@ -324,7 +426,7 @@
     boolean storePositions;
     boolean storeOffsets;
 
-    if(tvfFormat == FORMAT_VERSION){
+    if (format >= FORMAT_VERSION){
       byte bits = tvf.readByte();
       storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
       storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
@@ -400,8 +502,6 @@
     }
   }
 
-
-
   protected Object clone() {
 
     if (tvx == null || tvd == null || tvf == null)
@@ -418,11 +518,9 @@
 
     return clone;
   }
+}
 
 
-
-}
-
 /**
 * Models the existing parallel array structure
 */
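Because FORMAT_VERSION2 stores those two longs per document, a tvx file holding n documents is FORMAT_SIZE + 16*n bytes, which is exactly what the numTotalDocs computation and seekTvx above rely on, and rawDocs can derive each document's tvd/tvf byte lengths by subtracting adjacent pointers. A small worked example with made-up positions for a three-document segment:

    // tvx after the 4-byte header (tvd and tvf also start with 4-byte headers):
    //   doc 0: tvd=4,  tvf=4      doc 1: tvd=90, tvf=210    doc 2: tvd=150, tvf=480
    // rawDocs(tvdLengths, tvfLengths, 0, 2) yields:
    //   tvdLengths = { 90 - 4, 150 - 90 }   = { 86, 60 }
    //   tvfLengths = { 210 - 4, 480 - 210 } = { 206, 270 }
    // for the final document the tvd/tvf file lengths stand in for the
    // "next" pointers, as in the docID < numTotalDocs check above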
Index: src/java/org/apache/lucene/index/TermVectorsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/TermVectorsWriter.java	(revision 609389)
+++ src/java/org/apache/lucene/index/TermVectorsWriter.java	(working copy)
@@ -33,11 +33,11 @@
       throws IOException {
     // Open files for TermVector storage
     tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
-    tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
+    tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
     tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
-    tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
+    tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
     tvf = directory.createOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
-    tvf.writeInt(TermVectorsReader.FORMAT_VERSION);
+    tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
 
     this.fieldInfos = fieldInfos;
   }
@@ -53,6 +53,7 @@
       throws IOException {
 
     tvx.writeLong(tvd.getFilePointer());
+    tvx.writeLong(tvf.getFilePointer());
 
     if (vectors != null) {
       final int numFields = vectors.length;
@@ -145,8 +146,8 @@
       }
 
       // 2nd pass: write field pointers to tvd
-      long lastFieldPointer = 0;
-      for (int i=0; i<numFields; i++) {
+      long lastFieldPointer = fieldPointers[0];
+      for (int i=1; i<numFields; i++) {
        final long fieldPointer = fieldPointers[i];
        tvd.writeVLong(fieldPointer-lastFieldPointer);
        lastFieldPointer = fieldPointer;
@@ -154,6 +155,28 @@
     } else
       tvd.writeVInt(0);
   }
+
+  /**
+   * Do a bulk copy of numDocs documents from reader to our
+   * streams.  This is used to expedite merging, if the
+   * field numbers are congruent.
+   */
+  final void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException {
+    long tvdPosition = tvd.getFilePointer();
+    long tvfPosition = tvf.getFilePointer();
+    long tvdStart = tvdPosition;
+    long tvfStart = tvfPosition;
+    for(int i=0;i<numDocs;i++) {
+      tvx.writeLong(tvdPosition);
+      tvdPosition += tvdLengths[i];
+      tvx.writeLong(tvfPosition);
+      tvfPosition += tvfLengths[i];
+    }
+    tvd.copyBytes(reader.getTvdStream(), tvdPosition-tvdStart);
+    tvf.copyBytes(reader.getTvfStream(), tvfPosition-tvfStart);
+    assert tvd.getFilePointer() == tvdPosition;
+    assert tvf.getFilePointer() == tvfPosition;
+  }
 
   /** Close all streams. */
   final void close() throws IOException {