Index: CHANGES.txt
===================================================================
--- CHANGES.txt	(revision 609389)
+++ CHANGES.txt	(working copy)
@@ -342,6 +342,10 @@
 14. LUCENE-1098: Make inner class StandardAnalyzer.SavedStreams static
     and final. (Nathan Beyer via Michael Busch)
 
+15. LUCENE-1120: Speed up merging of term vectors by bulk-copying the
+    raw bytes for each contiguous range of non-deleted documents.
+    (Mike McCandless)
+
 Documentation
 
  1. LUCENE-1051: Generate separate javadocs for core, demo and contrib
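The entry above is the heart of this patch. A minimal sketch of the idea, assuming a hypothetical copyRawVectorBytes helper in place of the actual reader/writer plumbing added further down (rawDocs/addRawDocuments): scan for contiguous runs of non-deleted documents and copy each run's raw term-vector bytes in one call, instead of decoding and re-encoding every document's vectors during a merge.

    static void copyLiveRuns(IndexReader reader) throws IOException {
      final int maxDoc = reader.maxDoc();
      int docNum = 0;
      while (docNum < maxDoc) {
        if (reader.isDeleted(docNum)) {
          docNum++;                      // deleted doc: nothing to copy
          continue;
        }
        final int start = docNum;        // first doc of a live run
        while (docNum < maxDoc && !reader.isDeleted(docNum))
          docNum++;
        // hypothetical stand-in for the rawDocs()/addRawDocuments() pair:
        copyRawVectorBytes(start, docNum - start);
      }
    }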
Index: src/test/org/apache/lucene/index/index.presharedstores.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/test/org/apache/lucene/index/index.presharedstores.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/test/org/apache/lucene/index/index.20.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.20.nocfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.21.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.21.nocfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.22.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.22.nocfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.20.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.20.cfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.21.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.21.cfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.prelockless.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/test/org/apache/lucene/index/index.22.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.22.cfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.prelockless.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestIndexWriter.java	(revision 609389)
+++ src/test/org/apache/lucene/index/TestIndexWriter.java	(working copy)
@@ -1932,9 +1932,10 @@
       for(int j=0;j<reader.maxDoc();j++) {
         if (reader.isDeleted(j))
           numDel++;
-        else
+        else {
           reader.document(j);
-        reader.getTermFreqVectors(j);
+          reader.getTermFreqVectors(j);
+        }
       }
       reader.close();
 
@@ -1958,9 +1959,10 @@
       for(int j=0;j<reader.maxDoc();j++) {
         if (reader.isDeleted(j))
           numDel++;
-        else
+        else {
           reader.document(j);
-        reader.getTermFreqVectors(j);
+          reader.getTermFreqVectors(j);
+        }
       }
       reader.close();
       assertEquals(0, numDel);
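The added braces above fix a dangling-statement bug in the test loop: without them only reader.document(j) was governed by the else, so reader.getTermFreqVectors(j) also ran for deleted documents. A minimal illustration of the two shapes:

    // before: the vector load is unconditional
    if (reader.isDeleted(j))
      numDel++;
    else
      reader.document(j);
    reader.getTermFreqVectors(j);   // runs even when doc j is deleted

    // after: both loads happen only for live docs
    if (reader.isDeleted(j))
      numDel++;
    else {
      reader.document(j);
      reader.getTermFreqVectors(j);
    }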
Index: src/test/org/apache/lucene/index/index.19.nocfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.19.nocfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/index.19.cfs.zip
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream

Property changes on: src/test/org/apache/lucene/index/index.19.cfs.zip
___________________________________________________________________
Name: svn:mime-type
   + application/octet-stream

Index: src/test/org/apache/lucene/index/TestBackwardsCompatibility.java
===================================================================
--- src/test/org/apache/lucene/index/TestBackwardsCompatibility.java	(revision 609389)
+++ src/test/org/apache/lucene/index/TestBackwardsCompatibility.java	(working copy)
@@ -50,11 +50,11 @@
 
   /*
   public void testCreatePreLocklessCFS() throws IOException {
-    createIndex("src/test/org/apache/lucene/index/index.prelockless.cfs", true);
+    createIndex("src/test/org/apache/lucene/index/index.cfs", true);
   }
 
   public void testCreatePreLocklessNoCFS() throws IOException {
-    createIndex("src/test/org/apache/lucene/index/index.prelockless.nocfs", false);
+    createIndex("src/test/org/apache/lucene/index/index.nocfs", false);
   }
   */
 
@@ -106,10 +106,14 @@
     rmDir(dirName);
   }
 
-  final String[] oldNames = {"prelockless.cfs",
-                             "prelockless.nocfs",
-                             "presharedstores.cfs",
-                             "presharedstores.nocfs"};
+  final String[] oldNames = {"19.cfs",
+                             "19.nocfs",
+                             "20.cfs",
+                             "20.nocfs",
+                             "21.cfs",
+                             "21.nocfs",
+                             "22.cfs",
+                             "22.nocfs"};
 
   public void testSearchOldIndex() throws IOException {
     for(int i=0;i<oldNames.length;i++) {
@@ -146,6 +150,15 @@
     }
   }
 
+  private void testHits(Hits hits, int expectedCount, IndexReader reader) throws IOException {
+    final int hitCount = hits.length();
+    assertEquals("wrong number of hits", expectedCount, hitCount);
+    for(int i=0;i<hitCount;i++) {
+      hits.doc(i);
+      reader.getTermFreqVectors(hits.id(i));
+    }
+  }
+
   public void searchIndex(String dirName) throws IOException {
     //QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer());
     //Query query = parser.parse("handle:1");
@@ -156,12 +169,14 @@
     IndexSearcher searcher = new IndexSearcher(dir);
 
     Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
-    assertEquals(34, hits.length());
+
+    // First document should be #21 since its norm was
+    // increased:
     Document d = hits.doc(0);
-
-    // First document should be #21 since it's norm was increased:
     assertEquals("didn't get the right document first", "21", d.get("id"));
 
+    testHits(hits, 34, searcher.getIndexReader());
+
     searcher.close();
     dir.close();
   }
@@ -189,9 +204,9 @@
     // make sure searching sees right # hits
     IndexSearcher searcher = new IndexSearcher(dir);
     Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
-    assertEquals("wrong number of hits", 44, hits.length());
     Document d = hits.doc(0);
     assertEquals("wrong first document", "21", d.get("id"));
+    testHits(hits, 44, searcher.getIndexReader());
     searcher.close();
 
     // make sure we can do delete & setNorm against this
@@ -209,6 +224,7 @@
     assertEquals("wrong number of hits", 43, hits.length());
     d = hits.doc(0);
     assertEquals("wrong first document", "22", d.get("id"));
+    testHits(hits, 43, searcher.getIndexReader());
     searcher.close();
 
     // optimize
@@ -220,6 +236,7 @@
     hits = searcher.search(new TermQuery(new Term("content", "aaa")));
     assertEquals("wrong number of hits", 43, hits.length());
     d = hits.doc(0);
+    testHits(hits, 43, searcher.getIndexReader());
     assertEquals("wrong first document", "22", d.get("id"));
     searcher.close();
 
@@ -257,6 +274,7 @@
     assertEquals("wrong number of hits", 33, hits.length());
     d = hits.doc(0);
     assertEquals("wrong first document", "22", d.get("id"));
+    testHits(hits, 33, searcher.getIndexReader());
     searcher.close();
 
     // optimize
@@ -269,6 +287,7 @@
     assertEquals("wrong number of hits", 33, hits.length());
     d = hits.doc(0);
     assertEquals("wrong first document", "22", d.get("id"));
+    testHits(hits, 33, searcher.getIndexReader());
     searcher.close();
 
     dir.close();
@@ -283,6 +302,7 @@
     Directory dir = FSDirectory.getDirectory(dirName);
     IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
     writer.setUseCompoundFile(doCFS);
+    writer.setMaxBufferedDocs(10);
 
     for(int i=0;i<35;i++) {
       addDoc(writer, i);
@@ -393,6 +413,7 @@
     Document doc = new Document();
     doc.add(new Field("content", "aaa", Field.Store.NO, Field.Index.TOKENIZED));
     doc.add(new Field("id", Integer.toString(id), Field.Store.YES, Field.Index.UN_TOKENIZED));
+    doc.add(new Field("content2", "here is more content with aaa aaa aaa", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
     writer.addDocument(doc);
   }
 
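Two details of the test changes above matter for coverage: the new content2 field is stored with term vectors (positions and offsets), so every generated back-compat index now contains vectors for the merge path to copy, and testHits loads each hit's stored document and term vectors instead of only counting hits. A typical call, as the patch itself uses it:

    Hits hits = searcher.search(new TermQuery(new Term("content", "aaa")));
    testHits(hits, 34, searcher.getIndexReader());  // count + stored fields + vectors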
Index: src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java
===================================================================
--- src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java	(revision 609389)
+++ src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java	(working copy)
@@ -251,7 +251,7 @@
 
         message("  merge thread: done");
 
-      } catch (IOException exc) {
+      } catch (Throwable exc) {
 
         if (merge != null) {
           merge.setException(exc);
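Widening the catch from IOException to Throwable matters because this code runs on a background merge thread: a RuntimeException or Error thrown during a merge would previously escape the catch and unwind the thread without the failure ever being attached to the merge. The shape of the pattern, reduced to its essentials (a sketch, not the scheduler's full logic):

    try {
      doMerge(merge);                // the actual merge work
    } catch (Throwable exc) {        // not just IOException
      if (merge != null)
        merge.setException(exc);     // record it so the failure can be surfaced later
    }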
Index: src/java/org/apache/lucene/index/SegmentMerger.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentMerger.java	(revision 609389)
+++ src/java/org/apache/lucene/index/SegmentMerger.java	(working copy)
@@ -205,6 +205,38 @@
     }
   }
 
+  private SegmentReader[] matchingSegmentReaders;
+  private int[] rawDocLengths;
+  private int[] rawDocLengths2;
+
+  private void setMatchingSegmentReaders() {
+    // If the i'th reader is a SegmentReader and has
+    // identical fieldName -> number mapping, then this
+    // array will be non-null at position i:
+    matchingSegmentReaders = new SegmentReader[readers.size()];
+
+    // If this reader is a SegmentReader, and all of its
+    // field name -> number mappings match the "merged"
+    // FieldInfos, then we can do a bulk copy of the
+    // stored fields:
+    for (int i = 0; i < readers.size(); i++) {
+      IndexReader reader = (IndexReader) readers.elementAt(i);
+      if (reader instanceof SegmentReader) {
+        SegmentReader segmentReader = (SegmentReader) reader;
+        boolean same = true;
+        FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
+        for (int j = 0; same && j < segmentFieldInfos.size(); j++)
+          same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
+        if (same)
+          matchingSegmentReaders[i] = segmentReader;
+      }
+    }
+
+    // Used for bulk-reading raw bytes for stored fields
+    rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
+    rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
+  }
+
   /**
    *
    * @return The number of documents in all of the readers
@@ -248,34 +280,10 @@
 
     int docCount = 0;
 
+    setMatchingSegmentReaders();
+
     if (mergeDocStores) {
 
-      // If the i'th reader is a SegmentReader and has
-      // identical fieldName -> number mapping, then this
-      // array will be non-null at position i:
-      SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.size()];
-
-      // If this reader is a SegmentReader, and all of its
-      // field name -> number mappings match the "merged"
-      // FieldInfos, then we can do a bulk copy of the
-      // stored fields:
-      for (int i = 0; i < readers.size(); i++) {
-        IndexReader reader = (IndexReader) readers.elementAt(i);
-        if (reader instanceof SegmentReader) {
-          SegmentReader segmentReader = (SegmentReader) reader;
-          boolean same = true;
-          FieldInfos segmentFieldInfos = segmentReader.getFieldInfos();
-          for (int j = 0; same && j < segmentFieldInfos.size(); j++)
-            same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
-          if (same) {
-            matchingSegmentReaders[i] = segmentReader;
-          }
-        }
-      }
-
-      // Used for bulk-reading raw bytes for stored fields
-      final int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
-
       // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
       // in merge mode, we use this FieldSelector
       FieldSelector fieldSelectorMerge = new FieldSelector() {
@@ -350,15 +358,45 @@
 
     try {
       for (int r = 0; r < readers.size(); r++) {
+        final SegmentReader matchingSegmentReader = matchingSegmentReaders[r];
+        TermVectorsReader matchingVectorsReader;
+        if (matchingSegmentReader != null) {
+          matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig;
+
+          // If the TV* files are an older format then they
+          // cannot read raw docs:
+          if (matchingVectorsReader != null && !matchingVectorsReader.canReadRawDocs())
+            matchingVectorsReader = null;
+        } else
+          matchingVectorsReader = null;
         IndexReader reader = (IndexReader) readers.elementAt(r);
         int maxDoc = reader.maxDoc();
-        for (int docNum = 0; docNum < maxDoc; docNum++) {
+        for (int docNum = 0; docNum < maxDoc;) {
           // skip deleted docs
-          if (reader.isDeleted(docNum))
-            continue;
-          termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
-          if (checkAbort != null)
-            checkAbort.work(300);
+          if (!reader.isDeleted(docNum)) {
+            if (matchingVectorsReader != null) {
+              // We can optimize this case (doing a bulk
+              // byte copy) since the field numbers are
+              // identical
+              int start = docNum;
+              int numDocs = 0;
+              do {
+                docNum++;
+                numDocs++;
+              } while(docNum < maxDoc && !matchingSegmentReader.isDeleted(docNum) && numDocs < MAX_RAW_MERGE_DOCS);
+
+              matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
+              termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
+              if (checkAbort != null)
+                checkAbort.work(300*numDocs);
+            } else {
+              termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
+              docNum++;
+              if (checkAbort != null)
+                checkAbort.work(300);
+            }
+          } else
+            docNum++;
         }
       }
     } finally {
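The rewritten loop above pairs two new calls: TermVectorsReader.rawDocs() fills the two length arrays with the byte sizes of the tvd and tvf entries for up to MAX_RAW_MERGE_DOCS consecutive live documents, leaving its streams positioned at the first of them, and TermVectorsWriter.addRawDocuments() then copies exactly that many bytes while writing matching tvx index entries. For one run of live docs beginning at start:

    // fill rawDocLengths (tvd sizes) and rawDocLengths2 (tvf sizes) ...
    matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
    // ... then bulk-copy those bytes and index them in the new segment
    termVectorsWriter.addRawDocuments(matchingVectorsReader,
                                      rawDocLengths, rawDocLengths2, numDocs);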
Index: src/java/org/apache/lucene/index/DocumentsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentsWriter.java	(revision 609389)
+++ src/java/org/apache/lucene/index/DocumentsWriter.java	(working copy)
@@ -605,12 +605,12 @@
       // Append term vectors to the real outputs:
       if (tvx != null) {
         tvx.writeLong(tvd.getFilePointer());
+        tvx.writeLong(tvf.getFilePointer());
         tvd.writeVInt(numVectorFields);
         if (numVectorFields > 0) {
           for(int i=0;i<numVectorFields;i++)
             tvd.writeVInt(vectorFieldNumbers[i]);
           assert 0 == vectorFieldPointers[0];
-          tvd.writeVLong(tvf.getFilePointer());
           long lastPos = vectorFieldPointers[0];
           for(int i=1;i<numVectorFields;i++) {
             long pos = vectorFieldPointers[i];
@@ -788,17 +788,19 @@
     if (tvx == null) {
       assert docStoreSegment != null;
       tvx = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
-      tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
+      tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
      tvd = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
-      tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
+      tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
      tvf = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
-      tvf.writeInt(TermVectorsReader.FORMAT_VERSION);
+      tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
       files = null;
 
       // We must "catch up" for all docIDs that had no
       // vectors before this one
-      for(int i=0;i<docID;i++)
+      for(int i=0;i<docID;i++) {
         tvx.writeLong(0);
+        tvx.writeLong(0);
+      }
     }
 
     numVectorFields = 0;
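Both DocumentsWriter changes above follow from the new tvx record layout: each document's index entry is now two longs, the absolute tvd position followed by the absolute tvf position of the document's first vectorized field, and tvd itself keeps only the VLong deltas between consecutive field pointers. A worked example, assuming a document with three vectorized fields whose data starts at tvf positions 100, 250 and 400:

    // tvx entry for the doc:  tvdPointer, 100      (two longs)
    // tvd entry:              numFields=3, field numbers, then deltas 150, 150
    // a reader reconstructs:  100, 100 + 150 = 250, 250 + 150 = 400

The "catch up" loop likewise now writes two longs, not one, per vector-less document, so that every tvx entry keeps the same 16-byte width.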
Index: src/java/org/apache/lucene/index/TermVectorsReader.java
===================================================================
--- src/java/org/apache/lucene/index/TermVectorsReader.java	(revision 609389)
+++ src/java/org/apache/lucene/index/TermVectorsReader.java	(working copy)
@@ -22,13 +22,18 @@
 import org.apache.lucene.store.IndexInput;
 
 import java.io.IOException;
+import java.util.Arrays;
 
 /**
  * @version $Id$
 */
 class TermVectorsReader implements Cloneable {
 
+  // NOTE: if you make a new format, it must be larger than
+  // the current format
   static final int FORMAT_VERSION = 2;
+  static final int FORMAT_VERSION2 = 3;
+
   //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
   static final int FORMAT_SIZE = 4;
 
@@ -41,13 +46,13 @@
   private IndexInput tvd;
   private IndexInput tvf;
   private int size;
+  private int numTotalDocs;
 
   // The docID offset where our docs begin in the index
   // file.  This will be 0 if we have our own private file.
   private int docStoreOffset;
 
-  private int tvdFormat;
-  private int tvfFormat;
+  private final int format;
 
   TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
     throws CorruptIndexException, IOException {
@@ -56,7 +61,7 @@
 
   TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)
     throws CorruptIndexException, IOException {
-    this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE, -1, 0);
+    this(d, segment, fieldInfos, readBufferSize, -1, 0);
   }
 
   TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
@@ -66,22 +71,35 @@
     try {
       if (d.fileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION)) {
         tvx = d.openInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
-        checkValidFormat(tvx);
+        format = checkValidFormat(tvx);
         tvd = d.openInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
-        tvdFormat = checkValidFormat(tvd);
+        final int tvdFormat = checkValidFormat(tvd);
         tvf = d.openInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
-        tvfFormat = checkValidFormat(tvf);
+        final int tvfFormat = checkValidFormat(tvf);
+
+        assert format == tvdFormat;
+        assert format == tvfFormat;
+
+        if (format >= FORMAT_VERSION2) {
+          assert (tvx.length()-FORMAT_SIZE) % 16 == 0;
+          numTotalDocs = (int) (tvx.length() >> 4);
+        } else {
+          assert (tvx.length()-FORMAT_SIZE) % 8 == 0;
+          numTotalDocs = (int) (tvx.length() >> 3);
+        }
+
         if (-1 == docStoreOffset) {
           this.docStoreOffset = 0;
-          this.size = (int) (tvx.length() >> 3);
+          this.size = numTotalDocs;
         } else {
           this.docStoreOffset = docStoreOffset;
           this.size = size;
           // Verify the file is long enough to hold all of our
           // docs
-          assert ((int) (tvx.length() / 8)) >= size + docStoreOffset;
+          assert numTotalDocs >= size + docStoreOffset;
         }
-      }
+      } else
+        format = 0;
 
       this.fieldInfos = fieldInfos;
       success = true;
@@ -96,26 +114,94 @@
       }
     }
   }
-
+
+  // Used for bulk copy when merging
+  IndexInput getTvdStream() {
+    return tvd;
+  }
+
+  // Used for bulk copy when merging
+  IndexInput getTvfStream() {
+    return tvf;
+  }
+
+  final private void seekTvx(final int docNum) throws IOException {
+    if (format < FORMAT_VERSION2)
+      tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE);
+    else
+      tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
+  }
+
+  boolean canReadRawDocs() {
+    return format >= FORMAT_VERSION2;
+  }
+
+  /** Retrieve the length (in bytes) of the tvd and tvf
+   *  entries for the next numDocs starting with
+   *  startDocID.  This is used for bulk copying when
+   *  merging segments, if the field numbers are
+   *  congruent.  Once this returns, the tvd and tvf streams
+   *  are positioned at the startDocID. */
+  final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException {
+
+    if (tvx == null) {
+      Arrays.fill(tvdLengths, 0);
+      Arrays.fill(tvfLengths, 0);
+      return;
+    }
+
+    // SegmentMerger calls canReadRawDocs() first and should
+    // not call us if that returns false.
+    if (format < FORMAT_VERSION2)
+      throw new IllegalStateException("cannot read raw docs with older term vector formats");
+
+    seekTvx(startDocID);
+
+    long tvdPosition = tvx.readLong();
+    tvd.seek(tvdPosition);
+
+    long tvfPosition = tvx.readLong();
+    tvf.seek(tvfPosition);
+
+    long lastTvdPosition = tvdPosition;
+    long lastTvfPosition = tvfPosition;
+
+    int count = 0;
+    while (count < numDocs) {
+      final int docID = startDocID + count + 1;
+      if (docID < numTotalDocs) {
+        tvdPosition = tvx.readLong();
+        tvfPosition = tvx.readLong();
+      } else {
+        tvdPosition = tvd.length();
+        tvfPosition = tvf.length();
+      }
+      tvdLengths[count] = (int) (tvdPosition-lastTvdPosition);
+      tvfLengths[count] = (int) (tvfPosition-lastTvfPosition);
+      count++;
+      lastTvdPosition = tvdPosition;
+      lastTvfPosition = tvfPosition;
+    }
+  }
+
   private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
   {
     int format = in.readInt();
-    if (format > FORMAT_VERSION)
-    {
+    if (format > FORMAT_VERSION2) {
       throw new CorruptIndexException("Incompatible format version: " + format + " expected "
-                                      + FORMAT_VERSION + " or less");
+                                      + FORMAT_VERSION2 + " or less");
     }
     return format;
   }
 
   void close() throws IOException {
-      // make all effort to close up. Keep the first exception
-      // and throw it as a new one.
-      IOException keep = null;
-      if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
-      if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
-      if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
-      if (keep != null) throw (IOException) keep.fillInStackTrace();
+    // make all effort to close up. Keep the first exception
+    // and throw it as a new one.
+    IOException keep = null;
+    if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
+    if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
+    if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
+    if (keep != null) throw (IOException) keep.fillInStackTrace();
   }
 
   /**
@@ -133,11 +219,11 @@
     //We don't need to do this in other seeks because we already have the
     // file pointer
     //that was written in another file
-    tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
+    seekTvx(docNum);
     //System.out.println("TVX Pointer: " + tvx.getFilePointer());
-    long position = tvx.readLong();
+    long tvdPosition = tvx.readLong();
 
-    tvd.seek(position);
+    tvd.seek(tvdPosition);
     int fieldCount = tvd.readVInt();
     //System.out.println("Num Fields: " + fieldCount);
     // There are only a few fields per document. We opt for a full scan
@@ -146,7 +232,7 @@
     int number = 0;
     int found = -1;
     for (int i = 0; i < fieldCount; i++) {
-      if(tvdFormat == FORMAT_VERSION)
+      if (format >= FORMAT_VERSION)
         number = tvd.readVInt();
       else
         number += tvd.readVInt();
@@ -159,8 +245,12 @@
     // document
     if (found != -1) {
       // Compute position in the tvf file
-      position = 0;
-      for (int i = 0; i <= found; i++)
+      long position;
+      if (format >= FORMAT_VERSION2)
+        position = tvx.readLong();
+      else
+        position = tvd.readVLong();
+      for (int i = 1; i <= found; i++)
         position += tvd.readVLong();
 
       mapper.setDocumentNumber(docNum);
@@ -201,10 +291,10 @@
     TermFreqVector[] result = null;
     if (tvx != null) {
       //We need to offset by
-      tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
-      long position = tvx.readLong();
+      seekTvx(docNum);
+      long tvdPosition = tvx.readLong();
 
-      tvd.seek(position);
+      tvd.seek(tvdPosition);
       int fieldCount = tvd.readVInt();
 
       // No fields are vectorized for this document
@@ -213,7 +303,7 @@
         String[] fields = new String[fieldCount];
 
         for (int i = 0; i < fieldCount; i++) {
-          if(tvdFormat == FORMAT_VERSION)
+          if (format >= FORMAT_VERSION)
             number = tvd.readVInt();
           else
             number += tvd.readVInt();
@@ -222,9 +312,16 @@
         }
 
         // Compute position in the tvf file
-        position = 0;
+        long position;
+        if (format >= FORMAT_VERSION2)
+          position = tvx.readLong();
+        else
+          position = tvd.readVLong();
+
         long[] tvfPointers = new long[fieldCount];
-        for (int i = 0; i < fieldCount; i++) {
+        tvfPointers[0] = position;
+
+        for (int i = 1; i < fieldCount; i++) {
           position += tvd.readVLong();
           tvfPointers[i] = position;
         }
@@ -241,10 +338,11 @@
     // Check if no term vectors are available for this segment at all
     if (tvx != null) {
       //We need to offset by
-      tvx.seek((docNumber * 8L) + FORMAT_SIZE);
-      long position = tvx.readLong();
 
-      tvd.seek(position);
+      seekTvx(docNumber);
+      long tvdPosition = tvx.readLong();
+
+      tvd.seek(tvdPosition);
       int fieldCount = tvd.readVInt();
 
       // No fields are vectorized for this document
@@ -253,7 +351,7 @@
         String[] fields = new String[fieldCount];
 
         for (int i = 0; i < fieldCount; i++) {
-          if(tvdFormat == FORMAT_VERSION)
+          if (format >= FORMAT_VERSION)
             number = tvd.readVInt();
           else
             number += tvd.readVInt();
@@ -262,9 +360,14 @@
         }
 
         // Compute position in the tvf file
-        position = 0;
+        long position;
+        if (format >= FORMAT_VERSION2)
+          position = tvx.readLong();
+        else
+          position = tvd.readVLong();
         long[] tvfPointers = new long[fieldCount];
-        for (int i = 0; i < fieldCount; i++) {
+        tvfPointers[0] = position;
+        for (int i = 1; i < fieldCount; i++) {
           position += tvd.readVLong();
           tvfPointers[i] = position;
         }
@@ -293,9 +396,8 @@
   private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
           throws IOException {
     for (int i = 0; i < fields.length; i++) {
-       readTermVector(fields[i], tvfPointers[i], mapper);
+      readTermVector(fields[i], tvfPointers[i], mapper);
     }
-
   }
 
 
@@ -324,7 +426,7 @@
     boolean storePositions;
     boolean storeOffsets;
 
-    if(tvfFormat == FORMAT_VERSION){
+    if (format >= FORMAT_VERSION){
       byte bits = tvf.readByte();
       storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
       storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
@@ -400,8 +502,6 @@
     }
   }
 
-
-
   protected Object clone() {
 
     if (tvx == null || tvd == null || tvf == null)
@@ -418,11 +518,9 @@
 
     return clone;
   }
+}
 
 
-
-}
-
 /**
 * Models the existing parallel array structure
 */
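Because FORMAT_VERSION2 stores those two longs per document, a tvx file holding n documents is FORMAT_SIZE + 16*n bytes, which is exactly what the numTotalDocs computation and seekTvx above rely on, and rawDocs can derive each document's tvd/tvf byte lengths by subtracting adjacent pointers. A small worked example with made-up positions for a three-document segment:

    // tvx after the 4-byte header (tvd and tvf also start with 4-byte headers):
    //   doc 0: tvd=4,  tvf=4      doc 1: tvd=90, tvf=210    doc 2: tvd=150, tvf=480
    // rawDocs(tvdLengths, tvfLengths, 0, 2) yields:
    //   tvdLengths = { 90 - 4, 150 - 90 }   = { 86, 60 }
    //   tvfLengths = { 210 - 4, 480 - 210 } = { 206, 270 }
    // for the final document the tvd/tvf file lengths stand in for the
    // "next" pointers, as in the docID < numTotalDocs check above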
Index: src/java/org/apache/lucene/index/TermVectorsWriter.java
===================================================================
--- src/java/org/apache/lucene/index/TermVectorsWriter.java	(revision 609389)
+++ src/java/org/apache/lucene/index/TermVectorsWriter.java	(working copy)
@@ -33,11 +33,11 @@
       throws IOException {
     // Open files for TermVector storage
     tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
-    tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
+    tvx.writeInt(TermVectorsReader.FORMAT_VERSION2);
     tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
-    tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
+    tvd.writeInt(TermVectorsReader.FORMAT_VERSION2);
     tvf = directory.createOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
-    tvf.writeInt(TermVectorsReader.FORMAT_VERSION);
+    tvf.writeInt(TermVectorsReader.FORMAT_VERSION2);
 
     this.fieldInfos = fieldInfos;
   }
@@ -53,6 +53,7 @@
       throws IOException {
 
     tvx.writeLong(tvd.getFilePointer());
+    tvx.writeLong(tvf.getFilePointer());
 
     if (vectors != null) {
       final int numFields = vectors.length;
@@ -145,8 +146,8 @@
       }
 
       // 2nd pass: write field pointers to tvd
-      long lastFieldPointer = 0;
-      for (int i=0; i<numFields; i++) {
+      long lastFieldPointer = fieldPointers[0];
+      for (int i=1; i<numFields; i++) {
        final long fieldPointer = fieldPointers[i];
        tvd.writeVLong(fieldPointer-lastFieldPointer);
        lastFieldPointer = fieldPointer;
@@ -154,6 +155,28 @@
     } else
       tvd.writeVInt(0);
   }
+
+  /**
+   * Do a bulk copy of numDocs documents from reader to our
+   * streams.  This is used to expedite merging, if the
+   * field numbers are congruent.
+   */
+  final void addRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs) throws IOException {
+    long tvdPosition = tvd.getFilePointer();
+    long tvfPosition = tvf.getFilePointer();
+    long tvdStart = tvdPosition;
+    long tvfStart = tvfPosition;
+    for(int i=0;i<numDocs;i++) {
+      tvx.writeLong(tvdPosition);
+      tvdPosition += tvdLengths[i];
+      tvx.writeLong(tvfPosition);
+      tvfPosition += tvfLengths[i];
+    }
+    tvd.copyBytes(reader.getTvdStream(), tvdPosition-tvdStart);
+    tvf.copyBytes(reader.getTvfStream(), tvfPosition-tvfStart);
+    assert tvd.getFilePointer() == tvdPosition;
+    assert tvf.getFilePointer() == tvfPosition;
+  }
 
   /** Close all streams. */
   final void close() throws IOException {