| Index: src/test/org/apache/lucene/store/MockRAMOutputStream.java |
| =================================================================== |
| --- src/test/org/apache/lucene/store/MockRAMOutputStream.java (révision 493447) |
| +++ src/test/org/apache/lucene/store/MockRAMOutputStream.java (copie de travail) |
| @@ -48,7 +48,7 @@ |
| } |
| } |
| |
| - public void flushBuffer(byte[] src, int len) throws IOException { |
| + public void flushBuffer(byte[] src, int offset, int len) throws IOException { |
| long freeSpace = dir.maxSize - dir.sizeInBytes(); |
| long realUsage = 0; |
| |
| @@ -63,14 +63,14 @@ |
| if (dir.maxSize != 0 && freeSpace <= len) { |
| if (freeSpace > 0 && freeSpace < len) { |
| realUsage += freeSpace; |
| - super.flushBuffer(src, (int) freeSpace); |
| + super.flushBuffer(src, offset, (int) freeSpace); |
| } |
| if (realUsage > dir.maxUsedSize) { |
| dir.maxUsedSize = realUsage; |
| } |
| throw new IOException("fake disk full at " + dir.sizeInBytes() + " bytes"); |
| } else { |
| - super.flushBuffer(src, len); |
| + super.flushBuffer(src, offset, len); |
| } |
| |
| if (first) { |
| Index: src/test/org/apache/lucene/index/TestPayloads.java |
| =================================================================== |
| --- src/test/org/apache/lucene/index/TestPayloads.java (révision 0) |
| +++ src/test/org/apache/lucene/index/TestPayloads.java (révision 0) |
| @@ -0,0 +1,416 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.File; |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.HashMap; |
| +import java.util.Map; |
| +import java.util.Random; |
| + |
| +import junit.framework.TestCase; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenFilter; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.WhitespaceAnalyzer; |
| +import org.apache.lucene.analysis.WhitespaceTokenizer; |
| +import org.apache.lucene.document.Document; |
| +import org.apache.lucene.document.Field; |
| +import org.apache.lucene.document.Field.Index; |
| +import org.apache.lucene.document.Field.Store; |
| +import org.apache.lucene.store.Directory; |
| +import org.apache.lucene.store.FSDirectory; |
| +import org.apache.lucene.store.RAMDirectory; |
| + |
| + |
| +public class TestPayloads extends TestCase { |
| + |
| + // Simple tests to test the Payload class |
| + public void testPayload() throws Exception { |
| + byte[] testData = "This is a test!".getBytes(); |
| + BytePayload payload = new BytePayload(testData); |
| + assertEquals("Wrong payload length.", testData.length, payload.getLength()); |
| + |
| + // test copyTo() |
| + byte[] target = new byte[testData.length - 1]; |
| + try { |
| + payload.copyTo(target, 0); |
| + fail("Expected exception not thrown"); |
| + } catch (Exception expected) { |
| + // expected exception |
| + } |
| + |
| + target = new byte[testData.length + 3]; |
| + payload.copyTo(target, 3); |
| + |
| + for (int i = 0; i < testData.length; i++) { |
| + assertEquals(testData[i], target[i + 3]); |
| + } |
| + |
| + |
| + // test toByteArray() |
| + target = payload.toByteArray(); |
| + assertByteArrayEquals(testData, target); |
| + |
| + // test byteAt() |
| + for (int i = 0; i < testData.length; i++) { |
| + assertEquals(payload.byteAt(i), testData[i]); |
| + } |
| + |
| + try { |
| + payload.byteAt(testData.length + 1); |
| + fail("Expected exception not thrown"); |
| + } catch (Exception expected) { |
| + // expected exception |
| + } |
| + } |
| + |
| + // Tests whether the DocumentWriter and SegmentMerger correctly enable the |
| + // payload bit in the FieldInfo |
| + public void testPayloadFieldBit() throws Exception { |
| + Directory ram = new RAMDirectory(); |
| + PayloadAnalyzer analyzer = new PayloadAnalyzer(); |
| + IndexWriter writer = new IndexWriter(ram, analyzer, true); |
| + Document d = new Document(); |
| + // this field won't have any payloads |
| + d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED)); |
| + // this field will have payloads in all docs, however not for all term positions, |
| + // so this field is used to check if the DocumentWriter correctly enables the payloads bit |
| + // even if only some term positions have payloads |
| + d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); |
| + d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); |
| + // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads |
| + // enabled in only some documents |
| + d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED)); |
| + // only add payload data for field f2 |
| + analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1); |
| + writer.addDocument(d); |
| + // flush |
| + writer.close(); |
| + |
| + // only one segment in the index, so we can cast to SegmentReader |
| + SegmentReader reader = (SegmentReader) IndexReader.open(ram); |
| + FieldInfos fi = reader.fieldInfos(); |
| + assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads); |
| + assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads); |
| + assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads); |
| + reader.close(); |
| + |
| + // now we add another document which has payloads for field f3 and verify if the SegmentMerger |
| + // enabled payloads for that field |
| + writer = new IndexWriter(ram, analyzer, true); |
| + d = new Document(); |
| + d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED)); |
| + d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); |
| + d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); |
| + d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED)); |
| + // add payload data for field f2 and f3 |
| + analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1); |
| + analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3); |
| + writer.addDocument(d); |
| + // force merge |
| + writer.optimize(); |
| + // flush |
| + writer.close(); |
| + |
| + // only one segment in the index, so we can cast to SegmentReader |
| + reader = (SegmentReader) IndexReader.open(ram); |
| + fi = reader.fieldInfos(); |
| + assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads); |
| + assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads); |
| + assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads); |
| + reader.close(); |
| + } |
| + |
| + // Tests if payloads are correctly stored and loaded using both RAMDirectory and FSDirectory |
| + public void testPayloadsEncoding() throws Exception { |
| + // first perform the test using a RAMDirectory |
| + Directory dir = new RAMDirectory(); |
| + performTest(dir); |
| + |
| + // now use a FSDirectory and repeat same test |
| + String dirName = "test_payloads"; |
| + dir = FSDirectory.getDirectory(dirName, true); |
| + performTest(dir); |
| + rmDir(dirName); |
| + } |
| + |
| + // builds an index with payloads in the given Directory and performs |
| + // different tests to verify the payload encoding |
| + private void performTest(Directory dir) throws Exception { |
| + PayloadAnalyzer analyzer = new PayloadAnalyzer(); |
| + IndexWriter writer = new IndexWriter(dir, analyzer, true); |
| + |
| + // should be in sync with value in TermInfosWriter |
| + final int skipInterval = 16; |
| + |
| + final int numTerms = 5; |
| + final String fieldName = "f1"; |
| + |
| + int numDocs = skipInterval + 1; |
| + // create content for the test documents with just a few terms |
| + Term[] terms = generateTerms(fieldName, numTerms); |
| + StringBuffer sb = new StringBuffer(); |
| + for (int i = 0; i < terms.length; i++) { |
| + sb.append(terms[i].text); |
| + sb.append(" "); |
| + } |
| + String content = sb.toString(); |
| + |
| + |
| + int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2; |
| + byte[] payloadData = generateRandomData(payloadDataLength); |
| + |
| + Document d = new Document(); |
| + d.add(new Field(fieldName, content, Field.Store.NO, Field.Index.TOKENIZED)); |
| + // add the same document multiple times to have the same payload lengths for all |
| + // occurrences within two consecutive skip intervals |
| + int offset = 0; |
| + for (int i = 0; i < 2 * numDocs; i++) { |
| + analyzer.setPayloadData("f1", payloadData, offset, 1); |
| + offset += numTerms; |
| + writer.addDocument(d); |
| + } |
| + |
| + // now we make sure to have different payload lengths at the next skip point |
| + for (int i = 0; i < numDocs; i++) { |
| + analyzer.setPayloadData(fieldName, payloadData, offset, i); |
| + offset += i * numTerms; |
| + writer.addDocument(d); |
| + } |
| + |
| + writer.optimize(); |
| + // flush |
| + writer.close(); |
| + |
| + |
| + /* |
| + * Verify the index |
| + * first we test if all payloads are stored correctly |
| + */ |
| + IndexReader reader = IndexReader.open(dir); |
| + |
| + byte[] verifyPayloadData = new byte[payloadDataLength]; |
| + offset = 0; |
| + TermPositions[] tps = new TermPositions[numTerms]; |
| + for (int i = 0; i < numTerms; i++) { |
| + tps[i] = reader.termPositions(terms[i]); |
| + } |
| + |
| + while (tps[0].next()) { |
| + for (int i = 1; i < numTerms; i++) { |
| + tps[i].next(); |
| + } |
| + int freq = tps[0].freq(); |
| + |
| + for (int i = 0; i < freq; i++) { |
| + for (int j = 0; j < numTerms; j++) { |
| + tps[j].nextPosition(); |
| + BytePayload payload = (BytePayload) tps[j].getPayload(); |
| + payload.copyTo(verifyPayloadData, offset); |
| + offset += tps[j].getPayloadLength(); |
| + } |
| + } |
| + } |
| + |
| + for (int i = 0; i < numTerms; i++) { |
| + tps[i].close(); |
| + } |
| + |
| + assertByteArrayEquals(payloadData, verifyPayloadData); |
| + |
| + /* |
| + * test lazy skipping |
| + */ |
| + TermPositions tp = reader.termPositions(terms[0]); |
| + tp.next(); |
| + tp.nextPosition(); |
| + // now we don't read this payload |
| + tp.nextPosition(); |
| + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); |
| + BytePayload payload = (BytePayload) tp.getPayload(); |
| + assertEquals(payload.byteAt(0), payloadData[numTerms]); |
| + tp.nextPosition(); |
| + |
| + // we don't read this payload and skip to a different document |
| + tp.skipTo(5); |
| + tp.nextPosition(); |
| + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); |
| + payload = (BytePayload) tp.getPayload(); |
| + assertEquals(payload.byteAt(0), payloadData[5 * numTerms]); |
| + |
| + |
| + /* |
| + * Test different lengths at skip points |
| + */ |
| + tp.seek(terms[1]); |
| + tp.next(); |
| + tp.nextPosition(); |
| + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); |
| + tp.skipTo(skipInterval - 1); |
| + tp.nextPosition(); |
| + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); |
| + tp.skipTo(2 * skipInterval - 1); |
| + tp.nextPosition(); |
| + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); |
| + tp.skipTo(3 * skipInterval - 1); |
| + tp.nextPosition(); |
| + assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength()); |
| + |
| + /* |
| + * Test multiple call of getPayload() |
| + */ |
| + tp.getPayload(); |
| + try { |
| + // it is forbidden to call getPayload() more than once |
| + // without calling nextPosition() |
| + tp.getPayload(); |
| + fail("Expected exception not thrown"); |
| + } catch (Exception expected) { |
| + // expected exception |
| + } |
| + |
| + reader.close(); |
| + } |
| + |
| + private byte[] generateRandomData(int n) { |
| + Random rnd = new Random(); |
| + byte[] data = new byte[n]; |
| + rnd.nextBytes(data); |
| + return data; |
| + } |
| + |
| + private Term[] generateTerms(String fieldName, int n) { |
| + int maxDigits = (int) (Math.log(n) / Math.log(10)); |
| + Term[] terms = new Term[n]; |
| + StringBuffer sb = new StringBuffer(); |
| + for (int i = 0; i < n; i++) { |
| + sb.setLength(0); |
| + sb.append("t"); |
| + int zeros = maxDigits - (int) (Math.log(i) / Math.log(10)); |
| + for (int j = 0; j < zeros; j++) { |
| + sb.append("0"); |
| + } |
| + sb.append(i); |
| + terms[i] = new Term(fieldName, sb.toString()); |
| + } |
| + return terms; |
| + } |
| + |
| + |
| + private void rmDir(String dir) { |
| + File fileDir = new File(dir); |
| + if (fileDir.exists()) { |
| + File[] files = fileDir.listFiles(); |
| + if (files != null) { |
| + for (int i = 0; i < files.length; i++) { |
| + files[i].delete(); |
| + } |
| + } |
| + fileDir.delete(); |
| + } |
| + } |
| + |
| + |
| + |
| + void assertByteArrayEquals(byte[] b1, byte[] b2) { |
| + if (b1.length != b2.length) { |
| + fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length); |
| + } |
| + |
| + for (int i = 0; i < b1.length; i++) { |
| + if (b1[i] != b2[i]) { |
| + fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]); |
| + } |
| + } |
| + } |
| + |
| + |
| + /** |
| + * This Analyzer uses a WhitespaceTokenizer and PayloadFilter. |
| + */ |
| + private static class PayloadAnalyzer extends Analyzer { |
| + Map fieldToData = new HashMap(); |
| + |
| + void setPayloadData(String field, byte[] data, int offset, int length) { |
| + fieldToData.put(field, new PayloadData(0, data, offset, length)); |
| + } |
| + |
| + void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) { |
| + fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length)); |
| + } |
| + |
| + public TokenStream tokenStream(String fieldName, Reader reader) { |
| + PayloadData payload = (PayloadData) fieldToData.get(fieldName); |
| + TokenStream ts = new WhitespaceTokenizer(reader); |
| + if (payload != null) { |
| + if (payload.numFieldInstancesToSkip == 0) { |
| + ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length); |
| + } else { |
| + payload.numFieldInstancesToSkip--; |
| + } |
| + } |
| + return ts; |
| + } |
| + |
| + private static class PayloadData { |
| + byte[] data; |
| + int offset; |
| + int length; |
| + int numFieldInstancesToSkip; |
| + |
| + PayloadData(int skip, byte[] data, int offset, int length) { |
| + numFieldInstancesToSkip = skip; |
| + this.data = data; |
| + this.offset = offset; |
| + this.length = length; |
| + } |
| + } |
| + } |
| + |
| + |
| + /** |
| + * This Filter adds payloads to the tokens. |
| + */ |
| + private static class PayloadFilter extends TokenFilter { |
| + private byte[] data; |
| + private int length; |
| + private int offset; |
| + |
| + public PayloadFilter(TokenStream in, byte[] data, int offset, int length) { |
| + super(in); |
| + this.data = data; |
| + this.length = length; |
| + this.offset = offset; |
| + } |
| + |
| + public Token next() throws IOException { |
| + Token nextToken = input.next(); |
| + if (nextToken != null && offset + length <= data.length) { |
| + nextToken.setPayload(new BytePayload(data, offset, length)); |
| + offset += length; |
| + } |
| + |
| + return nextToken; |
| + } |
| + } |
| + |
| +} |
| Index: src/java/org/apache/lucene/analysis/Token.java |
| =================================================================== |
| --- src/java/org/apache/lucene/analysis/Token.java (révision 493447) |
| +++ src/java/org/apache/lucene/analysis/Token.java (copie de travail) |
| @@ -1,5 +1,8 @@ |
| package org.apache.lucene.analysis; |
| |
| +import org.apache.lucene.index.Payload; |
| +import org.apache.lucene.index.TermPositions; |
| + |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| @@ -20,23 +23,32 @@ |
| /** A Token is an occurence of a term from the text of a field. It consists of |
| a term's text, the start and end offset of the term in the text of the field, |
| and a type string. |
| - |
| + <p> |
| The start and end offsets permit applications to re-associate a token with |
| its source text, e.g., to display highlighted query terms in a document |
| browser, or to show matching text fragments in a KWIC (KeyWord In Context) |
| display, etc. |
| - |
| + <p> |
| The type is an interned string, assigned by a lexical analyzer |
| (a.k.a. tokenizer), naming the lexical or syntactic class that the token |
| belongs to. For example an end of sentence marker token might be implemented |
| - with type "eos". The default token type is "word". */ |
| + with type "eos". The default token type is "word". |
| + <p> |
| + A Token can optionally have metadata (a.k.a. Payload) in the form of a variable |
| + length byte array. Use {@link TermPositions#getPayloadLength()} and |
| + {@link TermPositions#getPayload()} to retrieve the payloads from the index. |
| |
| + @see org.apache.lucene.index.Payload |
| + */ |
| + |
| public class Token implements Cloneable { |
| String termText; // the text of the term |
| int startOffset; // start in source text |
| int endOffset; // end in source text |
| String type = "word"; // lexical type |
| - |
| + |
| + Payload payload; |
| + |
| private int positionIncrement = 1; |
| |
| /** Constructs a Token with the given term text, and start & end offsets. |
| @@ -115,6 +127,16 @@ |
| /** Returns this Token's lexical type. Defaults to "word". */ |
| public final String type() { return type; } |
| |
| + /** Sets this Token's payload. */ |
| + public void setPayload(Payload payload) { |
| + this.payload = payload; |
| + } |
| + |
| + /** Returns this Token's payload. */ |
| + public Payload getPayload() { |
| + return this.payload; |
| + } |
| + |
| public String toString() { |
| StringBuffer sb = new StringBuffer(); |
| sb.append("(" + termText + "," + startOffset + "," + endOffset); |
| Index: src/java/org/apache/lucene/index/FieldInfo.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/FieldInfo.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/FieldInfo.java (copie de travail) |
| @@ -28,9 +28,12 @@ |
| boolean storePositionWithTermVector; |
| |
| boolean omitNorms; // omit norms associated with indexed fields |
| + |
| + boolean storePayloads; // whether this field stores payloads together with term positions |
| |
| FieldInfo(String na, boolean tk, int nu, boolean storeTermVector, |
| - boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) { |
| + boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, |
| + boolean omitNorms, boolean storePayloads) { |
| name = na; |
| isIndexed = tk; |
| number = nu; |
| @@ -38,5 +41,6 @@ |
| this.storeOffsetWithTermVector = storeOffsetWithTermVector; |
| this.storePositionWithTermVector = storePositionWithTermVector; |
| this.omitNorms = omitNorms; |
| + this.storePayloads = storePayloads; |
| } |
| } |
| Index: src/java/org/apache/lucene/index/PayloadReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/PayloadReader.java (révision 0) |
| +++ src/java/org/apache/lucene/index/PayloadReader.java (révision 0) |
| @@ -0,0 +1,11 @@ |
| +package org.apache.lucene.index; |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.store.IndexInput; |
| + |
| +public interface PayloadReader { |
| + |
| + public Payload read(int length, IndexInput in) throws IOException; |
| + |
| +} |
| Index: src/java/org/apache/lucene/index/MultiReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/MultiReader.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/MultiReader.java (copie de travail) |
| @@ -450,5 +450,12 @@ |
| public int nextPosition() throws IOException { |
| return ((TermPositions)current).nextPosition(); |
| } |
| + |
| + public int getPayloadLength() { |
| + return ((TermPositions)current).getPayloadLength(); |
| + } |
| |
| + public Payload getPayload() throws IOException { |
| + return ((TermPositions)current).getPayload(); |
| + } |
| } |
| Index: src/java/org/apache/lucene/index/TermPositions.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/TermPositions.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/TermPositions.java (copie de travail) |
| @@ -32,10 +32,36 @@ |
| extends TermDocs |
| { |
| /** Returns next position in the current document. It is an error to call |
| - this more than {@link #freq()} times |
| - without calling {@link #next()}<p> This is |
| - invalid until {@link #next()} is called for |
| - the first time. |
| + this more than {@link #freq()} times |
| + without calling {@link #next()}<p> This is |
| + invalid until {@link #next()} is called for |
| + the first time. |
| */ |
| int nextPosition() throws IOException; |
| + |
| + /** Returns the length of the payload at the current term position. |
| + * This is invalid until {@link #nextPosition()} is called for |
| + * the first time. |
| + * |
| + * @return length of the current payload in number of bytes |
| + */ |
| + int getPayloadLength(); |
| + |
| + /** Returns the payload data at the current term position. |
| + * This is invalid until {@link #nextPosition()} is called for |
| + * the first time. |
| + * This method must not be called more than once after each call |
| + * of {@link #nextPosition()}. However, payloads are loaded lazily, |
| + * so if the payload data for the current position is not needed, |
| + * this method may not be called at all for performance reasons. |
| + * |
| + * Note that payload data is loaded lazily: the bytes for the |
| + * current position are only read from the index when this |
| + * method is actually called, so positions whose payloads are |
| + * not needed cause no extra I/O. |
| + * |
| + * @return the payload stored at the current term position |
| + * @throws IOException |
| + */ |
| + Payload getPayload() throws IOException; |
| } |
| Index: src/java/org/apache/lucene/index/IndexFormat.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/IndexFormat.java (révision 0) |
| +++ src/java/org/apache/lucene/index/IndexFormat.java (révision 0) |
| @@ -0,0 +1,36 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.document.Fieldable; |
| + |
| +/** |
| + * Specify the format of index. |
| + * |
| + * The implementation of the {@link FieldsReader} and {@link FieldsWriter} returned by |
| + * the function getFieldsReader and getFieldsWriter will specify how the data of fields are |
| + * serialized, and also the kind of {@link Fieldable} used. |
| + * |
| + * $Id$ |
| + */ |
| +public interface IndexFormat { |
| + |
| + PayloadReader getPayloadReader(); |
| + |
| + PayloadWriter getPayloadWriter(); |
| +} |
| \ Pas de fin de ligne à la fin du fichier |
| Index: src/java/org/apache/lucene/index/DefaultPayloadReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/DefaultPayloadReader.java (révision 0) |
| +++ src/java/org/apache/lucene/index/DefaultPayloadReader.java (révision 0) |
| @@ -0,0 +1,36 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.store.IndexInput; |
| + |
| +/** |
| + * This payload reader only support {@link BytePayload}. |
| + * |
| + * $Id$ |
| + */ |
| +public class DefaultPayloadReader implements PayloadReader { |
| + |
| + public Payload read(int length, IndexInput in) throws IOException { |
| + byte[] data = new byte[length]; |
| + in.readBytes(data, 0, length); |
| + return new BytePayload(data); |
| + } |
| +} |
| Index: src/java/org/apache/lucene/index/FieldInfos.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/FieldInfos.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/FieldInfos.java (copie de travail) |
| @@ -39,6 +39,7 @@ |
| static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4; |
| static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8; |
| static final byte OMIT_NORMS = 0x10; |
| + static final byte STORE_PAYLOADS = 0x20; |
| |
| private ArrayList byNumber = new ArrayList(); |
| private HashMap byName = new HashMap(); |
| @@ -156,9 +157,29 @@ |
| */ |
| public void add(String name, boolean isIndexed, boolean storeTermVector, |
| boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) { |
| + add(name, isIndexed, storeTermVector, storePositionWithTermVector, |
| + storeOffsetWithTermVector, omitNorms, false); |
| + } |
| + |
| + /** If the field is not yet known, adds it. If it is known, checks to make |
| + * sure that the isIndexed flag is the same as was given previously for this |
| + * field. If not - marks it as being indexed. Same goes for the TermVector |
| + * parameters. |
| + * |
| + * @param name The name of the field |
| + * @param isIndexed true if the field is indexed |
| + * @param storeTermVector true if the term vector should be stored |
| + * @param storePositionWithTermVector true if the term vector with positions should be stored |
| + * @param storeOffsetWithTermVector true if the term vector with offsets should be stored |
| + * @param omitNorms true if the norms for the indexed field should be omitted |
| + * @param storePayloads true if payloads should be stored for this field |
| + */ |
| + public void add(String name, boolean isIndexed, boolean storeTermVector, |
| + boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, |
| + boolean omitNorms, boolean storePayloads) { |
| FieldInfo fi = fieldInfo(name); |
| if (fi == null) { |
| - addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms); |
| + addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads); |
| } else { |
| if (fi.isIndexed != isIndexed) { |
| fi.isIndexed = true; // once indexed, always index |
| @@ -175,6 +196,9 @@ |
| if (fi.omitNorms != omitNorms) { |
| fi.omitNorms = false; // once norms are stored, always store |
| } |
| + if (fi.storePayloads != storePayloads) { |
| + fi.storePayloads = true; |
| + } |
| |
| } |
| } |
| @@ -182,10 +206,10 @@ |
| |
| private void addInternal(String name, boolean isIndexed, |
| boolean storeTermVector, boolean storePositionWithTermVector, |
| - boolean storeOffsetWithTermVector, boolean omitNorms) { |
| + boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads) { |
| FieldInfo fi = |
| new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector, |
| - storeOffsetWithTermVector, omitNorms); |
| + storeOffsetWithTermVector, omitNorms, storePayloads); |
| byNumber.add(fi); |
| byName.put(name, fi); |
| } |
| @@ -271,6 +295,7 @@ |
| if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR; |
| if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR; |
| if (fi.omitNorms) bits |= OMIT_NORMS; |
| + if (fi.storePayloads) bits |= STORE_PAYLOADS; |
| output.writeString(fi.name); |
| output.writeByte(bits); |
| } |
| @@ -286,8 +311,9 @@ |
| boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; |
| boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; |
| boolean omitNorms = (bits & OMIT_NORMS) != 0; |
| - |
| - addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms); |
| + boolean storePayloads = (bits & STORE_PAYLOADS) != 0; |
| + |
| + addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads); |
| } |
| } |
| |
| Index: src/java/org/apache/lucene/index/DefaultIndexFormat.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/DefaultIndexFormat.java (révision 0) |
| +++ src/java/org/apache/lucene/index/DefaultIndexFormat.java (révision 0) |
| @@ -0,0 +1,39 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| + |
| +/** |
| + * The default implementation of the index format |
| + * |
| + * $Id$ |
| + */ |
| +public class DefaultIndexFormat implements IndexFormat { |
| + |
| + private PayloadReader reader = new DefaultPayloadReader(); |
| + |
| + private PayloadWriter writer = new DefaultPayloadWriter(); |
| + |
| + public PayloadReader getPayloadReader() { |
| + return reader; |
| + } |
| + |
| + public PayloadWriter getPayloadWriter() { |
| + return writer; |
| + } |
| +} |
| Index: src/java/org/apache/lucene/index/PayloadWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/PayloadWriter.java (révision 0) |
| +++ src/java/org/apache/lucene/index/PayloadWriter.java (révision 0) |
| @@ -0,0 +1,11 @@ |
| +package org.apache.lucene.index; |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.store.IndexOutput; |
| + |
| +public interface PayloadWriter { |
| + |
| + public void write(Payload payload, IndexOutput output) throws IOException; |
| + |
| +} |
| Index: src/java/org/apache/lucene/index/Payload.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/Payload.java (révision 0) |
| +++ src/java/org/apache/lucene/index/Payload.java (révision 0) |
| @@ -0,0 +1,38 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| + |
| +/** |
| + * A Payload is metadata that can be stored together with each occurrence |
| + * of a term. This metadata is stored inline in the posting list of the |
| + * specific term. |
| + * <p> |
| + * To store payloads in the index a {@link TokenStream} has to be used that |
| + * produces {@link Token}s containing payload data. |
| + * <p> |
| + * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload()} |
| + * to retrieve the payloads from the index. |
| + */ |
| +public interface Payload { |
| + |
| + public int getLength(); |
| + |
| +} |
| Index: src/java/org/apache/lucene/index/IndexReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/IndexReader.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/IndexReader.java (copie de travail) |
| @@ -65,6 +65,8 @@ |
| public static final FieldOption ALL = new FieldOption ("ALL"); |
| // all indexed fields |
| public static final FieldOption INDEXED = new FieldOption ("INDEXED"); |
| + // all fields that store payloads |
| + public static final FieldOption STORES_PAYLOADS = new FieldOption ("STORES_PAYLOADS"); |
| // all fields which are not indexed |
| public static final FieldOption UNINDEXED = new FieldOption ("UNINDEXED"); |
| // all fields which are indexed with termvectors enables |
| Index: src/java/org/apache/lucene/index/DefaultPayloadWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/DefaultPayloadWriter.java (révision 0) |
| +++ src/java/org/apache/lucene/index/DefaultPayloadWriter.java (révision 0) |
| @@ -0,0 +1,37 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.store.IndexOutput; |
| + |
| +/** |
| + * This payload writer only supports {@link BytePayload}. |
| + * |
| + * $Id$ |
| + */ |
| +public class DefaultPayloadWriter implements PayloadWriter { |
| + |
| + public void write(Payload payload, IndexOutput output) throws IOException { |
| + if (!(payload instanceof BytePayload)) { |
| + throw new RuntimeException("Payload of type '" + payload.getClass() + "' is not supported"); |
| + } |
| + output.writeBytes(((BytePayload) payload).toByteArray(), payload.getLength()); |
| + } |
| +} |
| Index: src/java/org/apache/lucene/index/MultipleTermPositions.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/MultipleTermPositions.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/MultipleTermPositions.java (copie de travail) |
| @@ -191,5 +191,23 @@ |
| public int read(int[] arg0, int[] arg1) throws IOException { |
| throw new UnsupportedOperationException(); |
| } |
| + |
| + |
| + /** |
| + * Not implemented. |
| + * @throws UnsupportedOperationException |
| + */ |
| + public int getPayloadLength() { |
| + throw new UnsupportedOperationException(); |
| + } |
| + |
| + /** |
| + * Not implemented. |
| + * @throws UnsupportedOperationException |
| + */ |
| + public Payload getPayload() throws IOException { |
| + throw new UnsupportedOperationException(); |
| + } |
| |
| + |
| } |
| Index: src/java/org/apache/lucene/index/FilterIndexReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/FilterIndexReader.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/FilterIndexReader.java (copie de travail) |
| @@ -62,6 +62,14 @@ |
| public int nextPosition() throws IOException { |
| return ((TermPositions) this.in).nextPosition(); |
| } |
| + |
| + public int getPayloadLength() { |
| + return ((TermPositions) this.in).getPayloadLength(); |
| + } |
| + |
| + public Payload getPayload() throws IOException { |
| + return ((TermPositions) this.in).getPayload(); |
| + } |
| } |
| |
| /** Base class for filtering {@link TermEnum} implementations. */ |
| Index: src/java/org/apache/lucene/index/SegmentTermPositions.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/SegmentTermPositions.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/SegmentTermPositions.java (copie de travail) |
| @@ -27,23 +27,33 @@ |
| private int proxCount; |
| private int position; |
| |
| + // the current payload length |
| + private int payloadLength; |
| + // indicates whether the payload of the current position has |
| + // been read from the proxStream yet |
| + private boolean needToLoadPayload; |
| + |
| // these variables are being used to remember information |
| // for a lazy skip |
| private long lazySkipPointer = 0; |
| private int lazySkipDocCount = 0; |
| + private PayloadReader payloadReader; |
| |
| SegmentTermPositions(SegmentReader p) { |
| super(p); |
| this.proxStream = (IndexInput)parent.proxStream.clone(); |
| + payloadReader = parent.directory().getIndexFormat().getPayloadReader(); |
| } |
| |
| - final void seek(TermInfo ti) throws IOException { |
| - super.seek(ti); |
| + final void seek(TermInfo ti, Term term) throws IOException { |
| + super.seek(ti, term); |
| if (ti != null) |
| lazySkipPointer = ti.proxPointer; |
| |
| lazySkipDocCount = 0; |
| proxCount = 0; |
| + payloadLength = 0; |
| + needToLoadPayload = false; |
| } |
| |
| public final void close() throws IOException { |
| @@ -55,8 +65,27 @@ |
| // perform lazy skips if neccessary |
| lazySkip(); |
| proxCount--; |
| - return position += proxStream.readVInt(); |
| + return position += readDeltaPosition(); |
| } |
| + |
| + private final int readDeltaPosition() throws IOException { |
| + int delta = proxStream.readVInt(); |
| + if (currentFieldStoresPayloads) { |
| + // if the current field stores payloads then |
| + // the position delta is shifted one bit to the left. |
| + // if the LSB is set, then we have to read the current |
| + // payload length |
| + if ((delta & 1) != 0) { |
| + payloadLength = proxStream.readVInt(); |
| + } |
| + delta >>>= 1; |
| + needToLoadPayload = true; |
| + } else { |
| + payloadLength = 0; |
| + needToLoadPayload = false; |
| + } |
| + return delta; |
| + } |
| |
| protected final void skippingDoc() throws IOException { |
| // we remember to skip the remaining positions of the current |
| @@ -82,17 +111,28 @@ |
| |
| |
| /** Called by super.skipTo(). */ |
| - protected void skipProx(long proxPointer) throws IOException { |
| + protected void skipProx(long proxPointer, int payloadLength) throws IOException { |
| // we save the pointer, we might have to skip there lazily |
| lazySkipPointer = proxPointer; |
| lazySkipDocCount = 0; |
| proxCount = 0; |
| + this.payloadLength = payloadLength; |
| + needToLoadPayload = false; |
| } |
| |
| private void skipPositions(int n) throws IOException { |
| - for (int f = n; f > 0; f--) // skip unread positions |
| - proxStream.readVInt(); |
| + for (int f = n; f > 0; f--) { // skip unread positions |
| + readDeltaPosition(); |
| + skipPayload(); |
| + } |
| } |
| + |
| + private void skipPayload() throws IOException { |
| + if (needToLoadPayload && payloadLength > 0) { |
| + proxStream.seek(proxStream.getFilePointer() + payloadLength); |
| + } |
| + needToLoadPayload = false; |
| + } |
| |
| // It is not always neccessary to move the prox pointer |
| // to a new document after the freq pointer has been moved. |
| @@ -105,6 +145,10 @@ |
| // So we move the prox pointer lazily to the document |
| // as soon as positions are requested. |
| private void lazySkip() throws IOException { |
| + // we might have to skip the current payload |
| + // if it was not read yet |
| + skipPayload(); |
| + |
| if (lazySkipPointer != 0) { |
| proxStream.seek(lazySkipPointer); |
| lazySkipPointer = 0; |
| @@ -115,5 +159,32 @@ |
| lazySkipDocCount = 0; |
| } |
| } |
| + |
| + public int getPayloadLength() { |
| + return payloadLength; |
| + } |
| |
| + public Payload getPayload() throws IOException { |
| + if (!needToLoadPayload) { |
| + throw new IOException("Payload cannot be loaded more than once for the same term position."); |
| + } |
| + Payload payload = payloadReader.read(payloadLength, proxStream); |
| + needToLoadPayload = false; |
| + return payload; |
| + // read payloads lazily |
| +// byte[] retArray; |
| +// int retOffset; |
| +// if (data == null || data.length - offset < payloadLength) { |
| +// // the array is too small to store the payload data, |
| +// // so we allocate a new one |
| +// retArray = new byte[payloadLength]; |
| +// retOffset = 0; |
| +// } else { |
| +// retArray = data; |
| +// retOffset = offset; |
| +// } |
| +// proxStream.readBytes(retArray, retOffset, payloadLength); |
| +// needToLoadPayload = false; |
| +// return retArray; |
| + } |
| } |
| Index: src/java/org/apache/lucene/index/SegmentTermDocs.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/SegmentTermDocs.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/SegmentTermDocs.java (copie de travail) |
| @@ -39,6 +39,9 @@ |
| private long proxPointer; |
| private long skipPointer; |
| private boolean haveSkipped; |
| + |
| + private int payloadLengthAtLastSkip; |
| + protected boolean currentFieldStoresPayloads; |
| |
| protected SegmentTermDocs(SegmentReader parent) { |
| this.parent = parent; |
| @@ -49,23 +52,31 @@ |
| |
| public void seek(Term term) throws IOException { |
| TermInfo ti = parent.tis.get(term); |
| - seek(ti); |
| + seek(ti, term); |
| } |
| |
| public void seek(TermEnum termEnum) throws IOException { |
| TermInfo ti; |
| + Term term; |
| |
| // use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs |
| - if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos) // optimized case |
| - ti = ((SegmentTermEnum) termEnum).termInfo(); |
| - else // punt case |
| - ti = parent.tis.get(termEnum.term()); |
| - |
| - seek(ti); |
| + if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos) { // optimized case |
| + SegmentTermEnum segmentTermEnum = ((SegmentTermEnum) termEnum); |
| + term = segmentTermEnum.term(); |
| + ti = segmentTermEnum.termInfo(); |
| + } else { // punt case |
| + term = termEnum.term(); |
| + ti = parent.tis.get(term); |
| + } |
| + |
| + seek(ti, term); |
| } |
| |
| - void seek(TermInfo ti) throws IOException { |
| + void seek(TermInfo ti, Term term) throws IOException { |
| count = 0; |
| + payloadLengthAtLastSkip = 0; |
| + FieldInfo fi = parent.fieldInfos.fieldInfo(term.field); |
| + currentFieldStoresPayloads = (fi != null) ? fi.storePayloads : false; |
| if (ti == null) { |
| df = 0; |
| } else { |
| @@ -141,7 +152,7 @@ |
| } |
| |
| /** Overridden by SegmentTermPositions to skip in prox stream. */ |
| - protected void skipProx(long proxPointer) throws IOException {} |
| + protected void skipProx(long proxPointer, int payloadLength) throws IOException {} |
| |
| /** Optimized implementation. */ |
| public boolean skipTo(int target) throws IOException { |
| @@ -157,6 +168,7 @@ |
| |
| // scan skip data |
| int lastSkipDoc = skipDoc; |
| + int lastPayloadLength = 0; |
| long lastFreqPointer = freqStream.getFilePointer(); |
| long lastProxPointer = -1; |
| int numSkipped = -1 - (count % skipInterval); |
| @@ -165,6 +177,7 @@ |
| lastSkipDoc = skipDoc; |
| lastFreqPointer = freqPointer; |
| lastProxPointer = proxPointer; |
| + lastPayloadLength = payloadLengthAtLastSkip; |
| |
| if (skipDoc != 0 && skipDoc >= doc) |
| numSkipped += skipInterval; |
| @@ -172,7 +185,21 @@ |
| if(skipCount >= numSkips) |
| break; |
| |
| - skipDoc += skipStream.readVInt(); |
| + if (currentFieldStoresPayloads) { |
| + // the current field stores payloads. |
| + // if the doc delta is odd then we have |
| + // to read the current payload length |
| + // because it differs from the length of the |
| + // previous payload |
| + int delta = skipStream.readVInt(); |
| + if ((delta & 1) != 0) { |
| + payloadLengthAtLastSkip = skipStream.readVInt(); |
| + } |
| + delta >>>= 1; |
| + skipDoc += delta; |
| + } else { |
| + skipDoc += skipStream.readVInt(); |
| + } |
| freqPointer += skipStream.readVInt(); |
| proxPointer += skipStream.readVInt(); |
| |
| @@ -182,7 +209,7 @@ |
| // if we found something to skip, then skip it |
| if (lastFreqPointer > freqStream.getFilePointer()) { |
| freqStream.seek(lastFreqPointer); |
| - skipProx(lastProxPointer); |
| + skipProx(lastProxPointer, lastPayloadLength); |
| |
| doc = lastSkipDoc; |
| count += numSkipped; |
| Index: src/java/org/apache/lucene/index/SegmentMerger.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/SegmentMerger.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/SegmentMerger.java (copie de travail) |
| @@ -151,11 +151,11 @@ |
| } |
| |
| private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector, |
| - boolean storeOffsetWithTermVector) throws IOException { |
| + boolean storeOffsetWithTermVector, boolean storePayloads) throws IOException { |
| Iterator i = names.iterator(); |
| while (i.hasNext()) { |
| String field = (String)i.next(); |
| - fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field)); |
| + fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads); |
| } |
| } |
| |
| @@ -165,15 +165,16 @@ |
| * @throws IOException |
| */ |
| private final int mergeFields() throws IOException { |
| - fieldInfos = new FieldInfos(); // merge field names |
| + fieldInfos = new FieldInfos(); // merge field names |
| int docCount = 0; |
| for (int i = 0; i < readers.size(); i++) { |
| IndexReader reader = (IndexReader) readers.elementAt(i); |
| - addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true); |
| - addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false); |
| - addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true); |
| - addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false); |
| - addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false); |
| + addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false); |
| + addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false); |
| + addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false); |
| + addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false); |
| + addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true); |
| + addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false); |
| fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false); |
| } |
| fieldInfos.write(directory, segment + ".fnm"); |
| @@ -263,7 +264,7 @@ |
| SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader); |
| base += reader.numDocs(); |
| if (smi.next()) |
| - queue.put(smi); // initialize queue |
| + queue.put(smi); // initialize queue |
| else |
| smi.close(); |
| } |
| @@ -271,7 +272,7 @@ |
| SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()]; |
| |
| while (queue.size() > 0) { |
| - int matchSize = 0; // pop matching terms |
| + int matchSize = 0; // pop matching terms |
| match[matchSize++] = (SegmentMergeInfo) queue.pop(); |
| Term term = match[0].term; |
| SegmentMergeInfo top = (SegmentMergeInfo) queue.top(); |
| @@ -281,14 +282,14 @@ |
| top = (SegmentMergeInfo) queue.top(); |
| } |
| |
| - mergeTermInfo(match, matchSize); // add new TermInfo |
| + mergeTermInfo(match, matchSize); // add new TermInfo |
| |
| while (matchSize > 0) { |
| SegmentMergeInfo smi = match[--matchSize]; |
| if (smi.next()) |
| - queue.put(smi); // restore queue |
| + queue.put(smi); // restore queue |
| else |
| - smi.close(); // done with a segment |
| + smi.close(); // done with a segment |
| } |
| } |
| } |
| @@ -307,7 +308,7 @@ |
| long freqPointer = freqOutput.getFilePointer(); |
| long proxPointer = proxOutput.getFilePointer(); |
| |
| - int df = appendPostings(smis, n); // append posting data |
| + int df = appendPostings(smis, n); // append posting data |
| |
| long skipPointer = writeSkip(); |
| |
| @@ -317,6 +318,8 @@ |
| termInfosWriter.add(smis[0].term, termInfo); |
| } |
| } |
| + |
| + private byte[] payloadBuffer = null; |
| |
| /** Process postings from multiple segments all positioned on the |
| * same term. Writes out merged entries into freqOutput and |
| @@ -328,9 +331,12 @@ |
| */ |
| private final int appendPostings(SegmentMergeInfo[] smis, int n) |
| throws IOException { |
| + PayloadWriter payloadWriter = directory.getIndexFormat().getPayloadWriter(); |
| int lastDoc = 0; |
| - int df = 0; // number of docs w/ term |
| + int df = 0; // number of docs w/ term |
| resetSkip(); |
| + boolean storePayloads = fieldInfos.fieldInfo(smis[0].term.field).storePayloads; |
| + int lastPayloadLength = -1; // ensures that we write the first length |
| for (int i = 0; i < n; i++) { |
| SegmentMergeInfo smi = smis[i]; |
| TermPositions postings = smi.getPositions(); |
| @@ -350,24 +356,43 @@ |
| df++; |
| |
| if ((df % skipInterval) == 0) { |
| - bufferSkip(lastDoc); |
| + bufferSkip(lastDoc, storePayloads, lastPayloadLength); |
| } |
| |
| - int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1 |
| + int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1 |
| lastDoc = doc; |
| |
| int freq = postings.freq(); |
| if (freq == 1) { |
| - freqOutput.writeVInt(docCode | 1); // write doc & freq=1 |
| + freqOutput.writeVInt(docCode | 1); // write doc & freq=1 |
| } else { |
| - freqOutput.writeVInt(docCode); // write doc |
| - freqOutput.writeVInt(freq); // write frequency in doc |
| + freqOutput.writeVInt(docCode); // write doc |
| + freqOutput.writeVInt(freq); // write frequency in doc |
| } |
| |
| - int lastPosition = 0; // write position deltas |
| + /** See {@link DocumentWriter#writePostings(Posting[], String) for |
| + * documentation about the encoding of positions and payloads |
| + */ |
| + int lastPosition = 0; // write position deltas |
| for (int j = 0; j < freq; j++) { |
| int position = postings.nextPosition(); |
| - proxOutput.writeVInt(position - lastPosition); |
| + int delta = position - lastPosition; |
| + if (storePayloads) { |
| + int payloadLength = postings.getPayloadLength(); |
| + if (payloadLength == lastPayloadLength) { |
| + proxOutput.writeVInt(delta * 2); |
| + } else { |
| + proxOutput.writeVInt(delta * 2 + 1); |
| + proxOutput.writeVInt(payloadLength); |
| + lastPayloadLength = payloadLength; |
| + } |
| + if (payloadLength > 0) { |
| + Payload payload = postings.getPayload(); |
| + payloadWriter.write(payload, proxOutput); |
| + } |
| + } else { |
| + proxOutput.writeVInt(delta); |
| + } |
| lastPosition = position; |
| } |
| } |
| @@ -377,21 +402,59 @@ |
| |
| private RAMOutputStream skipBuffer = new RAMOutputStream(); |
| private int lastSkipDoc; |
| + private int lastSkipPayloadLength; |
| private long lastSkipFreqPointer; |
| private long lastSkipProxPointer; |
| |
| private void resetSkip() { |
| skipBuffer.reset(); |
| lastSkipDoc = 0; |
| + lastSkipPayloadLength = -1; // we don't have to write the first length in the skip list |
| lastSkipFreqPointer = freqOutput.getFilePointer(); |
| lastSkipProxPointer = proxOutput.getFilePointer(); |
| } |
| |
| - private void bufferSkip(int doc) throws IOException { |
| + private void bufferSkip(int doc, boolean storePayloads, int payloadLength) throws IOException { |
| long freqPointer = freqOutput.getFilePointer(); |
| long proxPointer = proxOutput.getFilePointer(); |
| |
| - skipBuffer.writeVInt(doc - lastSkipDoc); |
| + // To efficiently store payloads in the posting lists we do not store the length of |
| + // every payload. Instead we omit the length for a payload if the previous payload had |
| + // the same length. |
| + // However, in order to support skipping the payload length at every skip point must be known. |
| + // So we use the same length encoding that we use for the posting lists for the skip data as well: |
| + // Case 1: current field does not store payloads |
| + // SkipDatum --> DocSkip, FreqSkip, ProxSkip |
| + // DocSkip,FreqSkip,ProxSkip --> VInt |
| + // DocSkip records the document number before every SkipInterval th document in TermFreqs. |
| + // Document numbers are represented as differences from the previous value in the sequence. |
| + // Case 2: current field stores payloads |
| + // SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip |
| + // DocSkip,FreqSkip,ProxSkip --> VInt |
| + // PayloadLength --> VInt |
| + // In this case DocSkip/2 is the difference between |
| + // the current and the previous value. If DocSkip |
| + // is odd, then a PayloadLength encoded as VInt follows, |
| + // if DocSkip is even, then it is assumed that the |
| + // current payload length equals the length at the previous |
| + // skip point |
| + if (storePayloads) { |
| + int delta = doc - lastSkipDoc; |
| + if (payloadLength == lastSkipPayloadLength) { |
| + // the current payload length equals the length at the previous skip point, |
| + // so we don't store the length again |
| + skipBuffer.writeVInt(delta * 2); |
| + } else { |
| + // the payload length is different from the previous one. We shift the DocSkip, |
| + // set the lowest bit and store the current payload length as VInt. |
| + skipBuffer.writeVInt(delta * 2 + 1); |
| + skipBuffer.writeVInt(payloadLength); |
| + lastSkipPayloadLength = payloadLength; |
| + } |
| + } else { |
| + // current field does not store payloads |
| + skipBuffer.writeVInt(doc - lastSkipDoc); |
| + } |
| skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer)); |
| skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer)); |
| |
| Index: src/java/org/apache/lucene/index/DocumentWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/DocumentWriter.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/DocumentWriter.java (copie de travail) |
| @@ -31,6 +31,7 @@ |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.util.Arrays; |
| +import java.util.BitSet; |
| import java.util.Enumeration; |
| import java.util.Hashtable; |
| import java.util.Iterator; |
| @@ -69,34 +70,42 @@ |
| |
| final void addDocument(String segment, Document doc) |
| throws IOException { |
| - // write field names |
| + // create field infos |
| fieldInfos = new FieldInfos(); |
| fieldInfos.add(doc); |
| - fieldInfos.write(directory, segment + ".fnm"); |
| |
| - // write field values |
| - FieldsWriter fieldsWriter = |
| - new FieldsWriter(directory, segment, fieldInfos); |
| - try { |
| - fieldsWriter.addDocument(doc); |
| - } finally { |
| - fieldsWriter.close(); |
| - } |
| - |
| // invert doc into postingTable |
| postingTable.clear(); // clear postingTable |
| fieldLengths = new int[fieldInfos.size()]; // init fieldLengths |
| fieldPositions = new int[fieldInfos.size()]; // init fieldPositions |
| fieldOffsets = new int[fieldInfos.size()]; // init fieldOffsets |
| + fieldStoresPayloads = new BitSet(fieldInfos.size()); |
| |
| fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts |
| Arrays.fill(fieldBoosts, doc.getBoost()); |
| |
| + // Before we write the FieldInfos we invert the Document. The reason is that |
| + // during invertion the TokenStreams of tokenized fields are being processed |
| + // and we might encounter tokens that have payloads associated with them. In |
| + // this case we have to update the FieldInfo of the particular field. |
| invertDocument(doc); |
| |
| // sort postingTable into an array |
| Posting[] postings = sortPostingTable(); |
| |
| + // write field infos |
| + fieldInfos.write(directory, segment + ".fnm"); |
| + |
| + // write field values |
| + FieldsWriter fieldsWriter = |
| + new FieldsWriter(directory, segment, fieldInfos); |
| + try { |
| + fieldsWriter.addDocument(doc); |
| + } finally { |
| + fieldsWriter.close(); |
| + } |
| + |
| + |
| /* |
| for (int i = 0; i < postings.length; i++) { |
| Posting posting = postings[i]; |
| @@ -125,6 +134,10 @@ |
| private int[] fieldPositions; |
| private int[] fieldOffsets; |
| private float[] fieldBoosts; |
| + |
| + // If any of the tokens of a particular field carry a payload |
| + // then we enable payloads for that field. |
| + private BitSet fieldStoresPayloads; |
| |
| // Tokenizes the fields of a document into Postings. |
| private final void invertDocument(Document doc) |
| @@ -144,9 +157,9 @@ |
| if (!field.isTokenized()) { // un-tokenized field |
| String stringValue = field.stringValue(); |
| if(field.isStoreOffsetWithTermVector()) |
| - addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length())); |
| + addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length())); |
| else |
| - addPosition(fieldName, stringValue, position++, null); |
| + addPosition(fieldName, stringValue, position++, null, null); |
| offset += stringValue.length(); |
| length++; |
| } else |
| @@ -167,11 +180,20 @@ |
| for (Token t = stream.next(); t != null; t = stream.next()) { |
| position += (t.getPositionIncrement() - 1); |
| |
| - if(field.isStoreOffsetWithTermVector()) |
| - addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset())); |
| - else |
| - addPosition(fieldName, t.termText(), position++, null); |
| + Payload payload = t.getPayload(); |
| + if (payload != null) { |
| + // enable payloads for this field |
| + fieldStoresPayloads.set(fieldNumber); |
| + } |
| |
| + TermVectorOffsetInfo termVectorOffsetInfo; |
| + if (field.isStoreOffsetWithTermVector()) { |
| + termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()); |
| + } else { |
| + termVectorOffsetInfo = null; |
| + } |
| + addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo); |
| + |
| lastToken = t; |
| if (++length >= maxFieldLength) { |
| if (infoStream != null) |
| @@ -194,11 +216,16 @@ |
| fieldOffsets[fieldNumber] = offset; |
| } |
| } |
| + |
| + // update fieldInfos for all fields that have one or more tokens with payloads |
| + for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i+1)) { |
| + fieldInfos.fieldInfo(i).storePayloads = true; |
| + } |
| } |
| |
| private final Term termBuffer = new Term("", ""); // avoid consing |
| |
| - private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) { |
| + private final void addPosition(String field, String text, int position, Payload payload, TermVectorOffsetInfo offset) { |
| termBuffer.set(field, text); |
| //System.out.println("Offset: " + offset); |
| Posting ti = (Posting) postingTable.get(termBuffer); |
| @@ -209,9 +236,25 @@ |
| int[] positions = ti.positions; |
| System.arraycopy(positions, 0, newPositions, 0, freq); |
| ti.positions = newPositions; |
| + |
| + if (ti.payloads != null) { |
| + // the current field stores payloads |
| + Payload[] newPayloads = new Payload[freq * 2]; // grow payloads array |
| + Payload[] payloads = ti.payloads; |
| + System.arraycopy(payloads, 0, newPayloads, 0, payloads.length); |
| + ti.payloads = newPayloads; |
| + } |
| } |
| ti.positions[freq] = position; // add new position |
| |
| + if (payload != null) { |
| + if (ti.payloads == null) { |
| + // lazily allocate payload array |
| + ti.payloads = new Payload[ti.positions.length]; |
| + } |
| + ti.payloads[freq] = payload; |
| + } |
| + |
| if (offset != null) { |
| if (ti.offsets.length == freq){ |
| TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2]; |
| @@ -224,7 +267,7 @@ |
| ti.freq = freq + 1; // update frequency |
| } else { // word not seen before |
| Term term = new Term(field, text, false); |
| - postingTable.put(term, new Posting(term, position, offset)); |
| + postingTable.put(term, new Posting(term, position, payload, offset)); |
| } |
| } |
| |
| @@ -299,6 +342,7 @@ |
| IndexOutput freq = null, prox = null; |
| TermInfosWriter tis = null; |
| TermVectorsWriter termVectorWriter = null; |
| + PayloadWriter payloadWriter = directory.getIndexFormat().getPayloadWriter(); |
| try { |
| //open files for inverse index storage |
| freq = directory.createOutput(segment + ".frq"); |
| @@ -307,10 +351,31 @@ |
| termIndexInterval); |
| TermInfo ti = new TermInfo(); |
| String currentField = null; |
| - |
| + boolean currentFieldHasPayloads = false; |
| + |
| for (int i = 0; i < postings.length; i++) { |
| Posting posting = postings[i]; |
| |
| + // check to see if we switched to a new field |
| + String termField = posting.term.field(); |
| + if (currentField != termField) { |
| + // changing field - see if there is something to save |
| + currentField = termField; |
| + FieldInfo fi = fieldInfos.fieldInfo(currentField); |
| + currentFieldHasPayloads = fi.storePayloads; |
| + if (fi.storeTermVector) { |
| + if (termVectorWriter == null) { |
| + termVectorWriter = |
| + new TermVectorsWriter(directory, segment, fieldInfos); |
| + termVectorWriter.openDocument(); |
| + } |
| + termVectorWriter.openField(currentField); |
| + |
| + } else if (termVectorWriter != null) { |
| + termVectorWriter.closeField(); |
| + } |
| + } |
| + |
| // add an entry to the dictionary with pointers to prox and freq files |
| ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1); |
| tis.add(posting.term, ti); |
| @@ -326,28 +391,62 @@ |
| |
| int lastPosition = 0; // write positions |
| int[] positions = posting.positions; |
| + Payload[] payloads = posting.payloads; |
| + int lastPayloadLength = -1; |
| + |
| + |
| + // The following encoding is being used for positions and payloads: |
| + // Case 1: current field does not store payloads |
| + // Positions -> <PositionDelta>^freq |
| + // PositionDelta -> VInt |
| + // The PositionDelta is the difference between the current |
| + // and the previous position |
| + // Case 2: current field stores payloads |
| + // Positions -> <PositionDelta, Payload>^freq |
| + // Payload -> <PayloadLength?, PayloadData> |
| + // PositionDelta -> VInt |
| + // PayloadLength -> VInt |
| + // PayloadData -> byte^PayloadLength |
| + // In this case PositionDelta/2 is the difference between |
| + // the current and the previous position. If PositionDelta |
| + // is odd, then a PayloadLength encoded as VInt follows, |
| + // if PositionDelta is even, then it is assumed that the |
| + // length of the current Payload equals the length of the |
| + // previous Payload. |
| for (int j = 0; j < postingFreq; j++) { // use delta-encoding |
| int position = positions[j]; |
| - prox.writeVInt(position - lastPosition); |
| - lastPosition = position; |
| - } |
| - // check to see if we switched to a new field |
| - String termField = posting.term.field(); |
| - if (currentField != termField) { |
| - // changing field - see if there is something to save |
| - currentField = termField; |
| - FieldInfo fi = fieldInfos.fieldInfo(currentField); |
| - if (fi.storeTermVector) { |
| - if (termVectorWriter == null) { |
| - termVectorWriter = |
| - new TermVectorsWriter(directory, segment, fieldInfos); |
| - termVectorWriter.openDocument(); |
| + int delta = position - lastPosition; |
| + if (currentFieldHasPayloads) { |
| + int payloadLength = 0; |
| + Payload payload = null; |
| + if (payloads != null) { |
| + payload = payloads[j]; |
| + if (payload != null) { |
| + payloadLength = payload.getLength(); |
| + } |
| } |
| - termVectorWriter.openField(currentField); |
| - |
| - } else if (termVectorWriter != null) { |
| - termVectorWriter.closeField(); |
| + if (payloadLength == lastPayloadLength) { |
| + // the length of the current payload equals the length |
| + // of the previous one. So we do not have to store the length |
| + // again and we only shift the position delta by one bit |
| + prox.writeVInt(delta * 2); |
| + } else { |
| + // the length of the current payload is different from the |
| + // previous one. We shift the position delta, set the lowest |
| + // bit and store the current payload length as VInt. |
| + prox.writeVInt(delta * 2 + 1); |
| + prox.writeVInt(payloadLength); |
| + lastPayloadLength = payloadLength; |
| + } |
| + if (payloadLength > 0) { |
| + // write current payload |
| + payloadWriter.write(payload, prox); |
| + } |
| + } else { |
| + // field does not store payloads, just write position delta as VInt |
| + prox.writeVInt(delta); |
| } |
| + lastPosition = position; |
| } |
| if (termVectorWriter != null && termVectorWriter.isFieldOpen()) { |
| termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets); |
| @@ -394,18 +493,27 @@ |
| Term term; // the Term |
| int freq; // its frequency in doc |
| int[] positions; // positions it occurs at |
| + Payload[] payloads; // the payloads of the terms |
| TermVectorOffsetInfo [] offsets; |
| |
| - Posting(Term t, int position, TermVectorOffsetInfo offset) { |
| + Posting(Term t, int position, Payload payload, TermVectorOffsetInfo offset) { |
| term = t; |
| freq = 1; |
| positions = new int[1]; |
| positions[0] = position; |
| + |
| + if (payload != null) { |
| + payloads = new Payload[1]; |
| + payloads[0] = payload; |
| + } else |
| + payloads = null; |
| + |
| + |
| if(offset != null){ |
| - offsets = new TermVectorOffsetInfo[1]; |
| - offsets[0] = offset; |
| - } |
| - else |
| + offsets = new TermVectorOffsetInfo[1]; |
| + offsets[0] = offset; |
| + } else |
| offsets = null; |
| + |
| } |
| } |
| Index: src/java/org/apache/lucene/index/BytePayload.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/BytePayload.java (révision 0) |
| +++ src/java/org/apache/lucene/index/BytePayload.java (révision 0) |
| @@ -0,0 +1,101 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| + |
| +/** |
| + * A Payload is metadata that can be stored together with each occurrence |
| + * of a term. This metadata is stored inline in the posting list of the |
| + * specific term. |
| + * <p> |
| + * To store payloads in the index a {@link TokenStream} has to be used that |
| + * produces {@link Token}s containing payload data. |
| + * <p> |
| + * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)} |
| + * to retrieve the payloads from the index. |
| + */ |
| +public class BytePayload implements Payload { |
| + private byte[] data; |
| + |
| + private int offset; |
| + |
| + private int length; |
| + |
| + /** |
| + * Creates a new payload with the given array as data. |
| + * |
| + * @param data the data of this payload |
| + */ |
| + public BytePayload(byte[] data) { |
| + this(data, 0, data.length); |
| + } |
| + |
| + /** |
| + * Creates a new payload with the given array as data. |
| + * |
| + * @param data the data of this payload |
| + * @param offset the offset in the data byte array |
| + * @param length the length of the data |
| + */ |
| + public BytePayload(byte[] data, int offset, int length) { |
| + if (offset < 0 || offset + length > data.length) { |
| + throw new IllegalArgumentException(); |
| + } |
| + this.data = data; |
| + this.offset = offset; |
| + this.length = length; |
| + } |
| + |
| + public int getLength() { |
| + return this.length; |
| + } |
| + |
| + /** |
| + * Returns the byte at the given index. |
| + */ |
| + public byte byteAt(int index) { |
| + if (0 <= index && index < this.length) { |
| + return this.data[this.offset + index]; |
| + } |
| + throw new ArrayIndexOutOfBoundsException(index); |
| + } |
| + |
| + /** |
| + * Allocates a new byte array, copies the payload data into it and returns it. |
| + */ |
| + public byte[] toByteArray() { |
| + byte[] retArray = new byte[this.length]; |
| + System.arraycopy(this.data, this.offset, retArray, 0, this.length); |
| + return retArray; |
| + } |
| + |
| + /** |
| + * Copies the payload data to a byte array. |
| + * |
| + * @param target the target byte array |
| + * @param targetOffset the offset in the target byte array |
| + */ |
| + public void copyTo(byte[] target, int targetOffset) { |
| + if (this.length + targetOffset > target.length) { |
| + throw new ArrayIndexOutOfBoundsException(); |
| + } |
| + System.arraycopy(this.data, this.offset, target, targetOffset, this.length); |
| + } |
| +} |
| Index: src/java/org/apache/lucene/index/ParallelReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/ParallelReader.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/ParallelReader.java (copie de travail) |
| @@ -389,7 +389,15 @@ |
| // It is an error to call this if there is no next position, e.g. if termDocs==null |
| return ((TermPositions)termDocs).nextPosition(); |
| } |
| + |
| + public int getPayloadLength() { |
| + return ((TermPositions)termDocs).getPayloadLength(); |
| + } |
| |
| + public Payload getPayload() throws IOException { |
| + return ((TermPositions)termDocs).getPayload(); |
| + } |
| + |
| } |
| |
| } |
| Index: src/java/org/apache/lucene/index/SegmentReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/SegmentReader.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/SegmentReader.java (copie de travail) |
| @@ -369,6 +369,9 @@ |
| else if (!fi.isIndexed && fieldOption == IndexReader.FieldOption.UNINDEXED) { |
| fieldSet.add(fi.name); |
| } |
| + else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) { |
| + fieldSet.add(fi.name); |
| + } |
| else if (fi.isIndexed && fieldOption == IndexReader.FieldOption.INDEXED) { |
| fieldSet.add(fi.name); |
| } |
| @@ -545,6 +548,10 @@ |
| return termVectorsReader.get(docNumber); |
| } |
| |
| + FieldInfos fieldInfos() { |
| + return fieldInfos; |
| + } |
| + |
| /** |
| * Return the name of the segment this reader is reading. |
| */ |
| Index: src/java/org/apache/lucene/store/RAMDirectory.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/RAMDirectory.java (révision 493447) |
| +++ src/java/org/apache/lucene/store/RAMDirectory.java (copie de travail) |
| @@ -17,16 +17,18 @@ |
| * limitations under the License. |
| */ |
| |
| +import java.io.File; |
| +import java.io.FileNotFoundException; |
| import java.io.IOException; |
| -import java.io.FileNotFoundException; |
| -import java.io.File; |
| import java.io.Serializable; |
| import java.util.Collection; |
| -import java.util.Enumeration; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.Set; |
| |
| +import org.apache.lucene.index.DefaultIndexFormat; |
| +import org.apache.lucene.index.IndexFormat; |
| + |
| /** |
| * A memory-resident {@link Directory} implementation. Locking |
| * implementation is by default the {@link SingleInstanceLockFactory} |
| @@ -47,8 +49,22 @@ |
| // Lock acquisition sequence: RAMDirectory, then RAMFile |
| // ***** |
| |
| - /** Constructs an empty {@link Directory}. */ |
| + |
| + /** |
| + * Constructs an empty {@link Directory}. |
| + * The index format used is the default one. |
| + */ |
| public RAMDirectory() { |
| + this(new DefaultIndexFormat()); |
| + } |
| + |
| + /** |
| + * Constructor. |
| + * |
| + * @param indexFormat the format of the index |
| + */ |
| + public RAMDirectory(IndexFormat indexFormat) { |
| + this.indexFormat = indexFormat; |
| setLockFactory(new SingleInstanceLockFactory()); |
| } |
| |
| @@ -69,11 +85,11 @@ |
| * @exception IOException if an error occurs |
| */ |
| public RAMDirectory(Directory dir) throws IOException { |
| - this(dir, false); |
| + this(dir, false, new DefaultIndexFormat()); |
| } |
| - |
| - private RAMDirectory(Directory dir, boolean closeDir) throws IOException { |
| - this(); |
| + |
| + private RAMDirectory(Directory dir, boolean closeDir, IndexFormat indexFormat) throws IOException { |
| + this(indexFormat); |
| final String[] files = dir.list(); |
| byte[] buf = new byte[BufferedIndexOutput.BUFFER_SIZE]; |
| for (int i = 0; i < files.length; i++) { |
| @@ -107,7 +123,7 @@ |
| * @see #RAMDirectory(Directory) |
| */ |
| public RAMDirectory(File dir) throws IOException { |
| - this(FSDirectory.getDirectory(dir, false), true); |
| + this(FSDirectory.getDirectory(dir, false), true, new DefaultIndexFormat()); |
| } |
| |
| /** |
| @@ -118,7 +134,7 @@ |
| * @see #RAMDirectory(Directory) |
| */ |
| public RAMDirectory(String dir) throws IOException { |
| - this(FSDirectory.getDirectory(dir, false), true); |
| + this(FSDirectory.getDirectory(dir, false), true, new DefaultIndexFormat()); |
| } |
| |
| /** Returns an array of strings, one for each file in the directory. */ |
| Index: src/java/org/apache/lucene/store/Directory.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/Directory.java (révision 493447) |
| +++ src/java/org/apache/lucene/store/Directory.java (copie de travail) |
| @@ -19,6 +19,9 @@ |
| |
| import java.io.IOException; |
| |
| +import org.apache.lucene.index.DefaultIndexFormat; |
| +import org.apache.lucene.index.IndexFormat; |
| + |
| /** A Directory is a flat list of files. Files may be written once, when they |
| * are created. Once a file is created it may only be opened for read, or |
| * deleted. Random access is permitted both when reading and writing. |
| @@ -42,6 +45,12 @@ |
| * this Directory instance). */ |
| protected LockFactory lockFactory; |
| |
| + protected IndexFormat indexFormat = new DefaultIndexFormat(); |
| + |
| + public IndexFormat getIndexFormat() { |
| + return indexFormat; |
| + } |
| + |
| /** Returns an array of strings, one for each file in the directory. */ |
| public abstract String[] list() |
| throws IOException; |
| Index: src/java/org/apache/lucene/store/RAMOutputStream.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/RAMOutputStream.java (révision 493447) |
| +++ src/java/org/apache/lucene/store/RAMOutputStream.java (copie de travail) |
| @@ -66,7 +66,7 @@ |
| file.setLength(0); |
| } |
| |
| - public void flushBuffer(byte[] src, int len) throws IOException { |
| + public void flushBuffer(byte[] src, int offset, int len) throws IOException { |
| byte[] buffer; |
| int bufferPos = 0; |
| while (bufferPos != len) { |
| @@ -81,7 +81,7 @@ |
| else |
| buffer = (byte[]) file.buffers.get(bufferNumber); |
| |
| - System.arraycopy(src, bufferPos, buffer, bufferOffset, bytesToCopy); |
| + System.arraycopy(src, offset + bufferPos, buffer, bufferOffset, bytesToCopy); |
| bufferPos += bytesToCopy; |
| pointer += bytesToCopy; |
| } |
| Index: src/java/org/apache/lucene/store/IndexOutput.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/IndexOutput.java (révision 493447) |
| +++ src/java/org/apache/lucene/store/IndexOutput.java (copie de travail) |
| @@ -36,8 +36,18 @@ |
| * @param length the number of bytes to write |
| * @see IndexInput#readBytes(byte[],int,int) |
| */ |
| - public abstract void writeBytes(byte[] b, int length) throws IOException; |
| + public void writeBytes(byte[] b, int length) throws IOException { |
| + writeBytes(b, 0, length); |
| + } |
| |
| + /** Writes an array of bytes. |
| + * @param b the bytes to write |
| + * @param offset the offset in the byte array |
| + * @param length the number of bytes to write |
| + * @see IndexInput#readBytes(byte[],int,int) |
| + */ |
| + public abstract void writeBytes(byte[] b, int offset, int length) throws IOException; |
| + |
| /** Writes an int as four bytes. |
| * @see IndexInput#readInt() |
| */ |
| Index: src/java/org/apache/lucene/store/BufferedIndexOutput.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/BufferedIndexOutput.java (révision 493447) |
| +++ src/java/org/apache/lucene/store/BufferedIndexOutput.java (copie de travail) |
| @@ -24,8 +24,8 @@ |
| static final int BUFFER_SIZE = 1024; |
| |
| private final byte[] buffer = new byte[BUFFER_SIZE]; |
| - private long bufferStart = 0; // position in file of buffer |
| - private int bufferPosition = 0; // position in buffer |
| + private long bufferStart = 0; // position in file of buffer |
| + private int bufferPosition = 0; // position in buffer |
| |
| /** Writes a single byte. |
| * @see IndexInput#readByte() |
| @@ -41,12 +41,12 @@ |
| * @param length the number of bytes to write |
| * @see IndexInput#readBytes(byte[],int,int) |
| */ |
| - public void writeBytes(byte[] b, int length) throws IOException { |
| + public void writeBytes(byte[] b, int offset, int length) throws IOException { |
| int bytesLeft = BUFFER_SIZE - bufferPosition; |
| // is there enough space in the buffer? |
| if (bytesLeft >= length) { |
| // we add the data to the end of the buffer |
| - System.arraycopy(b, 0, buffer, bufferPosition, length); |
| + System.arraycopy(b, offset, buffer, bufferPosition, length); |
| bufferPosition += length; |
| // if the buffer is full, flush it |
| if (BUFFER_SIZE - bufferPosition == 0) |
| @@ -66,7 +66,7 @@ |
| int pieceLength; |
| while (pos < length) { |
| pieceLength = (length - pos < bytesLeft) ? length - pos : bytesLeft; |
| - System.arraycopy(b, pos, buffer, bufferPosition, pieceLength); |
| + System.arraycopy(b, pos + offset, buffer, bufferPosition, pieceLength); |
| pos += pieceLength; |
| bufferPosition += pieceLength; |
| // if the buffer is full, flush it |
| @@ -92,8 +92,18 @@ |
| * @param b the bytes to write |
| * @param len the number of bytes to write |
| */ |
| - protected abstract void flushBuffer(byte[] b, int len) throws IOException; |
| + private void flushBuffer(byte[] b, int len) throws IOException { |
| + flushBuffer(b, 0, len); |
| + } |
| |
| + /** Expert: implements buffer write. Writes bytes at the current position in |
| + * the output. |
| + * @param b the bytes to write |
| + * @param offset the offset in the byte array |
| + * @param len the number of bytes to write |
| + */ |
| + protected abstract void flushBuffer(byte[] b, int offset, int len) throws IOException; |
| + |
| /** Closes this stream to further operations. */ |
| public void close() throws IOException { |
| flush(); |
| Index: src/java/org/apache/lucene/store/FSDirectory.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/FSDirectory.java (révision 493447) |
| +++ src/java/org/apache/lucene/store/FSDirectory.java (copie de travail) |
| @@ -26,7 +26,9 @@ |
| import java.security.NoSuchAlgorithmException; |
| import java.util.Hashtable; |
| |
| +import org.apache.lucene.index.DefaultIndexFormat; |
| import org.apache.lucene.index.IndexFileNameFilter; |
| +import org.apache.lucene.index.IndexFormat; |
| |
| /** |
| * Straightforward implementation of {@link Directory} as a directory of files. |
| @@ -180,9 +182,28 @@ |
| * @param create if true, create, or erase any existing contents. |
| * @param lockFactory instance of {@link LockFactory} providing the |
| * locking implementation. |
| - * @return the FSDirectory for the named file. */ |
| + * @return the FSDirectory for the named file. |
| + * @throws IOException */ |
| public static FSDirectory getDirectory(File file, boolean create, |
| LockFactory lockFactory, boolean doRemoveOldFiles) |
| + throws IOException { |
| + return getDirectory(file, create, lockFactory, doRemoveOldFiles, new DefaultIndexFormat()); |
| + } |
| + |
| + /** Returns the directory instance for the named location, using the |
| + * provided LockFactory implementation. |
| + * |
| + * <p>Directories are cached, so that, for a given canonical path, the same |
| + * FSDirectory instance will always be returned. This permits |
| + * synchronization on directories. |
| + * |
| + * @param file the path to the directory. |
| + * @param create if true, create, or erase any existing contents. |
| + * @param lockFactory instance of {@link LockFactory} providing the |
| + * locking implementation. |
| + * @return the FSDirectory for the named file. */ |
| + public static FSDirectory getDirectory(File file, boolean create, LockFactory lockFactory, |
| + boolean doRemoveOldFiles, IndexFormat indexFormat) |
| throws IOException { |
| file = new File(file.getCanonicalPath()); |
| FSDirectory dir; |
| @@ -194,7 +215,7 @@ |
| } catch (Exception e) { |
| throw new RuntimeException("cannot load FSDirectory class: " + e.toString(), e); |
| } |
| - dir.init(file, create, lockFactory, doRemoveOldFiles); |
| + dir.init(file, create, lockFactory, doRemoveOldFiles, indexFormat); |
| DIRECTORIES.put(file, dir); |
| } else { |
| |
| @@ -243,8 +264,11 @@ |
| throw new IOException(path + " not a directory"); |
| } |
| |
| - private void init(File path, boolean create, LockFactory lockFactory, boolean doRemoveOldFiles) throws IOException { |
| + private void init(File path, boolean create, LockFactory lockFactory, boolean doRemoveOldFiles, |
| + IndexFormat indexFormat) throws IOException { |
| |
| + this.indexFormat = indexFormat; |
| + |
| // Set up lockFactory with cascaded defaults: if an instance was passed in, |
| // use that; else if locks are disabled, use NoLockFactory; else if the |
| // system property org.apache.lucene.store.FSDirectoryLockFactoryClass is set, |
| @@ -592,8 +616,8 @@ |
| } |
| |
| /** output methods: */ |
| - public void flushBuffer(byte[] b, int size) throws IOException { |
| - file.write(b, 0, size); |
| + public void flushBuffer(byte[] b, int offset, int size) throws IOException { |
| + file.write(b, offset, size); |
| } |
| public void close() throws IOException { |
| // only close the file if it has not been closed yet |
| Index: src/site/src/documentation/content/xdocs/fileformats.xml |
| =================================================================== |
| --- src/site/src/documentation/content/xdocs/fileformats.xml (révision 493447) |
| +++ src/site/src/documentation/content/xdocs/fileformats.xml (copie de travail) |
| @@ -798,16 +798,20 @@ |
| |
| <p> |
| <b>Pre-2.1:</b> |
| + <code> |
| Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize> |
| <sup>SegCount</sup> |
| + </code> |
| </p> |
| <p> |
| <b>2.1 and above:</b> |
| + <code> |
| Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize, DelGen, NumField, NormGen |
| <sup>NumField</sup> |
| > |
| <sup>SegCount</sup> |
| , IsCompoundFile |
| + </code> |
| </p> |
| |
| <p> |
| @@ -1002,6 +1006,7 @@ |
| <li>If the third lowest-order bit is set (0x04), term positions are stored with the term vectors.</li> |
| <li>If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors.</li> |
| <li>If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field.</li> |
| + <li>If the sixth lowest-order bit is set (0x20), payloads are being stored for the indexed field.</li> |
| </ul> |
| </p> |
| |
| @@ -1287,9 +1292,9 @@ |
| <sup>DocFreq/SkipInterval</sup> |
| </p> |
| <p>SkipDatum --> |
| - DocSkip,FreqSkip,ProxSkip |
| + DocSkip,PayloadLength?,FreqSkip,ProxSkip |
| </p> |
| - <p>DocDelta,Freq,DocSkip,FreqSkip,ProxSkip --> |
| + <p>DocDelta,Freq,DocSkip,PayloadLength,FreqSkip,ProxSkip --> |
| VInt |
| </p> |
| <p>TermFreqs |
| @@ -1317,9 +1322,17 @@ |
| SkipInterval |
| <sup>th</sup> |
| document in TermFreqs. |
| - Document numbers are represented as differences |
| - from the previous value in the sequence. FreqSkip |
| - and ProxSkip record the position of every |
| + If payloads are disabled for the term's field, |
| + then DocSkip represents the difference from the |
| + previous value in the sequence. |
| + If payloads are enabled for the term's field, |
| + then DocSkip/2 represents the difference from the |
| + previous value in the sequence. If payloads are enabled |
| + and DocSkip is odd, |
| + then PayloadLength is stored indicating the length |
| + of the last payload before the SkipInterval<sup>th</sup> |
| + document in TermPositions. |
| + FreqSkip and ProxSkip record the position of every |
| SkipInterval |
| <sup>th</sup> |
| entry in FreqFile and |
| @@ -1368,12 +1381,22 @@ |
| <sup>DocFreq</sup> |
| </p> |
| <p>Positions --> |
| - <PositionDelta> |
| + <PositionDelta,Payload?> |
| <sup>Freq</sup> |
| </p> |
| + <p>Payload --> |
| + <PayloadLength?,PayloadData> |
| + </p> |
| <p>PositionDelta --> |
| VInt |
| </p> |
| + <p>PayloadLength --> |
| + VInt |
| + </p> |
| + <p>PayloadData --> |
| + byte<sup>PayloadLength</sup> |
| + </p> |
| + |
| <p>TermPositions |
| are ordered by term (the term is implicit, from the .tis file). |
| </p> |
| @@ -1382,19 +1405,30 @@ |
| number is implicit from the .frq file). |
| </p> |
| <p>PositionDelta |
| - is the difference between the position of the current occurrence in |
| + is, if payloads are disabled for the term's field, the difference |
| + between the position of the current occurrence in |
| the document and the previous occurrence (or zero, if this is the |
| first occurrence in this document). |
| + If payloads are enabled for the term's field, then PositionDelta/2 |
| + is the difference between the current and the previous position. If |
| + payloads are enabled and PositionDelta is odd, then PayloadLength is |
| + stored, indicating the length of the payload at the current term position. |
| </p> |
| <p> |
| For example, the TermPositions for a |
| term which occurs as the fourth term in one document, and as the |
| fifth and ninth term in a subsequent document, would be the following |
| - sequence of VInts: |
| + sequence of VInts (payloads disabled): |
| </p> |
| <p>4, |
| 5, 4 |
| </p> |
| + <p>PayloadData |
| + is metadata associated with the current term position. If PayloadLength |
| + is stored at the current position, then it indicates the length of this |
| + Payload. If PayloadLength is not stored, then this Payload has the same |
| + length as the Payload at the previous position. |
| + </p> |
| </section> |
| <section id="Normalization Factors"><title>Normalization Factors</title> |
| <p>There's a norm file for each indexed field with a byte for |