| Index: src/test/org/apache/lucene/store/MockRAMOutputStream.java |
| =================================================================== |
| --- src/test/org/apache/lucene/store/MockRAMOutputStream.java (révision 493447) |
| +++ src/test/org/apache/lucene/store/MockRAMOutputStream.java (copie de travail) |
| @@ -48,7 +48,7 @@ |
| } |
| } |
| |
| - public void flushBuffer(byte[] src, int len) throws IOException { |
| + public void flushBuffer(byte[] src, int offset, int len) throws IOException { |
| long freeSpace = dir.maxSize - dir.sizeInBytes(); |
| long realUsage = 0; |
| |
| @@ -63,14 +63,14 @@ |
| if (dir.maxSize != 0 && freeSpace <= len) { |
| if (freeSpace > 0 && freeSpace < len) { |
| realUsage += freeSpace; |
| - super.flushBuffer(src, (int) freeSpace); |
| + super.flushBuffer(src, offset, (int) freeSpace); |
| } |
| if (realUsage > dir.maxUsedSize) { |
| dir.maxUsedSize = realUsage; |
| } |
| throw new IOException("fake disk full at " + dir.sizeInBytes() + " bytes"); |
| } else { |
| - super.flushBuffer(src, len); |
| + super.flushBuffer(src, offset, len); |
| } |
| |
| if (first) { |
| Index: src/test/org/apache/lucene/index/TestPayloads.java |
| =================================================================== |
| --- src/test/org/apache/lucene/index/TestPayloads.java (révision 0) |
| +++ src/test/org/apache/lucene/index/TestPayloads.java (révision 0) |
| @@ -0,0 +1,416 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.File; |
| +import java.io.IOException; |
| +import java.io.Reader; |
| +import java.util.HashMap; |
| +import java.util.Map; |
| +import java.util.Random; |
| + |
| +import junit.framework.TestCase; |
| + |
| +import org.apache.lucene.analysis.Analyzer; |
| +import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenFilter; |
| +import org.apache.lucene.analysis.TokenStream; |
| +import org.apache.lucene.analysis.WhitespaceAnalyzer; |
| +import org.apache.lucene.analysis.WhitespaceTokenizer; |
| +import org.apache.lucene.document.Document; |
| +import org.apache.lucene.document.Field; |
| +import org.apache.lucene.document.Field.Index; |
| +import org.apache.lucene.document.Field.Store; |
| +import org.apache.lucene.store.Directory; |
| +import org.apache.lucene.store.FSDirectory; |
| +import org.apache.lucene.store.RAMDirectory; |
| + |
| + |
| +public class TestPayloads extends TestCase { |
| + |
| + // Simple tests to test the Payload class |
| + public void testPayload() throws Exception { |
| + byte[] testData = "This is a test!".getBytes(); |
| + BytePayload payload = new BytePayload(testData); |
| + assertEquals("Wrong payload length.", testData.length, payload.getLength()); |
| + |
| + // test copyTo() |
| + byte[] target = new byte[testData.length - 1]; |
| + try { |
| + payload.copyTo(target, 0); |
| + fail("Expected exception not thrown"); |
| + } catch (Exception expected) { |
| + // expected exception |
| + } |
| + |
| + target = new byte[testData.length + 3]; |
| + payload.copyTo(target, 3); |
| + |
| + for (int i = 0; i < testData.length; i++) { |
| + assertEquals(testData[i], target[i + 3]); |
| + } |
| + |
| + |
| + // test toByteArray() |
| + target = payload.toByteArray(); |
| + assertByteArrayEquals(testData, target); |
| + |
| + // test byteAt() |
| + for (int i = 0; i < testData.length; i++) { |
| + assertEquals(payload.byteAt(i), testData[i]); |
| + } |
| + |
| + try { |
| + payload.byteAt(testData.length + 1); |
| + fail("Expected exception not thrown"); |
| + } catch (Exception expected) { |
| + // expected exception |
| + } |
| + } |
| + |
| + // Tests whether the DocumentWriter and SegmentMerger correctly enable the |
| + // payload bit in the FieldInfo |
| + public void testPayloadFieldBit() throws Exception { |
| + Directory ram = new RAMDirectory(); |
| + PayloadAnalyzer analyzer = new PayloadAnalyzer(); |
| + IndexWriter writer = new IndexWriter(ram, analyzer, true); |
| + Document d = new Document(); |
| + // this field won't have any payloads |
| + d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED)); |
| + // this field will have payloads in all docs, however not for all term positions, |
| + // so this field is used to check if the DocumentWriter correctly enables the payloads bit |
| + // even if only some term positions have payloads |
| + d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); |
| + d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); |
| + // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads |
| + // enabled in only some documents |
| + d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED)); |
| + // only add payload data for field f2 |
| + analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1); |
| + writer.addDocument(d); |
| + // flush |
| + writer.close(); |
| + |
| + // only one segment in the index, so we can cast to SegmentReader |
| + SegmentReader reader = (SegmentReader) IndexReader.open(ram); |
| + FieldInfos fi = reader.fieldInfos(); |
| + assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads); |
| + assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads); |
| + assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads); |
| + reader.close(); |
| + |
| + // now we add another document which has payloads for field f3 and verify if the SegmentMerger |
| + // enabled payloads for that field |
| + writer = new IndexWriter(ram, analyzer, true); |
| + d = new Document(); |
| + d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED)); |
| + d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); |
| + d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED)); |
| + d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED)); |
| + // add payload data for field f2 and f3 |
| + analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1); |
| + analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3); |
| + writer.addDocument(d); |
| + // force merge |
| + writer.optimize(); |
| + // flush |
| + writer.close(); |
| + |
| + // only one segment in the index, so we can cast to SegmentReader |
| + reader = (SegmentReader) IndexReader.open(ram); |
| + fi = reader.fieldInfos(); |
| + assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads); |
| + assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads); |
| + assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads); |
| + reader.close(); |
| + } |
| + |
| + // Tests if payloads are correctly stored and loaded using both RAMDirectory and FSDirectory |
| + public void testPayloadsEncoding() throws Exception { |
| + // first perform the test using a RAMDirectory |
| + Directory dir = new RAMDirectory(); |
| + performTest(dir); |
| + |
| + // now use a FSDirectory and repeat same test |
| + String dirName = "test_payloads"; |
| + dir = FSDirectory.getDirectory(dirName, true); |
| + performTest(dir); |
| + rmDir(dirName); |
| + } |
| + |
| + // builds an index with payloads in the given Directory and performs |
| + // different tests to verify the payload encoding |
| + private void performTest(Directory dir) throws Exception { |
| + PayloadAnalyzer analyzer = new PayloadAnalyzer(); |
| + IndexWriter writer = new IndexWriter(dir, analyzer, true); |
| + |
| + // should be in sync with value in TermInfosWriter |
| + final int skipInterval = 16; |
| + |
| + final int numTerms = 5; |
| + final String fieldName = "f1"; |
| + |
| + int numDocs = skipInterval + 1; |
| + // create content for the test documents with just a few terms |
| + Term[] terms = generateTerms(fieldName, numTerms); |
| + StringBuffer sb = new StringBuffer(); |
| + for (int i = 0; i < terms.length; i++) { |
| + sb.append(terms[i].text); |
| + sb.append(" "); |
| + } |
| + String content = sb.toString(); |
| + |
| + |
| + int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2; |
| + byte[] payloadData = generateRandomData(payloadDataLength); |
| + |
| + Document d = new Document(); |
| + d.add(new Field(fieldName, content, Field.Store.NO, Field.Index.TOKENIZED)); |
| + // add the same document multiple times to have the same payload lengths for all |
| + // occurrences within two consecutive skip intervals |
| + int offset = 0; |
| + for (int i = 0; i < 2 * numDocs; i++) { |
| + analyzer.setPayloadData("f1", payloadData, offset, 1); |
| + offset += numTerms; |
| + writer.addDocument(d); |
| + } |
| + |
| + // now we make sure to have different payload lengths at the next skip point |
| + for (int i = 0; i < numDocs; i++) { |
| + analyzer.setPayloadData(fieldName, payloadData, offset, i); |
| + offset += i * numTerms; |
| + writer.addDocument(d); |
| + } |
| + |
| + writer.optimize(); |
| + // flush |
| + writer.close(); |
| + |
| + |
| + /* |
| + * Verify the index |
| + * first we test if all payloads are stored correctly |
| + */ |
| + IndexReader reader = IndexReader.open(dir); |
| + |
| + byte[] verifyPayloadData = new byte[payloadDataLength]; |
| + offset = 0; |
| + TermPositions[] tps = new TermPositions[numTerms]; |
| + for (int i = 0; i < numTerms; i++) { |
| + tps[i] = reader.termPositions(terms[i]); |
| + } |
| + |
| + while (tps[0].next()) { |
| + for (int i = 1; i < numTerms; i++) { |
| + tps[i].next(); |
| + } |
| + int freq = tps[0].freq(); |
| + |
| + for (int i = 0; i < freq; i++) { |
| + for (int j = 0; j < numTerms; j++) { |
| + tps[j].nextPosition(); |
| + BytePayload payload = (BytePayload) tps[j].getPayload(); |
| + payload.copyTo(verifyPayloadData, offset); |
| + offset += tps[j].getPayloadLength(); |
| + } |
| + } |
| + } |
| + |
| + for (int i = 0; i < numTerms; i++) { |
| + tps[i].close(); |
| + } |
| + |
| + assertByteArrayEquals(payloadData, verifyPayloadData); |
| + |
| + /* |
| + * test lazy skipping |
| + */ |
| + TermPositions tp = reader.termPositions(terms[0]); |
| + tp.next(); |
| + tp.nextPosition(); |
| + // now we don't read this payload |
| + tp.nextPosition(); |
| + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); |
| + BytePayload payload = (BytePayload) tp.getPayload(); |
| + assertEquals(payload.byteAt(0), payloadData[numTerms]); |
| + tp.nextPosition(); |
| + |
| + // we don't read this payload and skip to a different document |
| + tp.skipTo(5); |
| + tp.nextPosition(); |
| + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); |
| + payload = (BytePayload) tp.getPayload(); |
| + assertEquals(payload.byteAt(0), payloadData[5 * numTerms]); |
| + |
| + |
| + /* |
| + * Test different lengths at skip points |
| + */ |
| + tp.seek(terms[1]); |
| + tp.next(); |
| + tp.nextPosition(); |
| + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); |
| + tp.skipTo(skipInterval - 1); |
| + tp.nextPosition(); |
| + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); |
| + tp.skipTo(2 * skipInterval - 1); |
| + tp.nextPosition(); |
| + assertEquals("Wrong payload length.", 1, tp.getPayloadLength()); |
| + tp.skipTo(3 * skipInterval - 1); |
| + tp.nextPosition(); |
| + assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength()); |
| + |
| + /* |
| + * Test multiple call of getPayload() |
| + */ |
| + tp.getPayload(); |
| + try { |
| + // it is forbidden to call getPayload() more than once |
| + // without calling nextPosition() |
| + tp.getPayload(); |
| + fail("Expected exception not thrown"); |
| + } catch (Exception expected) { |
| + // expected exception |
| + } |
| + |
| + reader.close(); |
| + } |
| + |
| + private byte[] generateRandomData(int n) { |
| + Random rnd = new Random(); |
| + byte[] data = new byte[n]; |
| + rnd.nextBytes(data); |
| + return data; |
| + } |
| + |
| + private Term[] generateTerms(String fieldName, int n) { |
| + int maxDigits = (int) (Math.log(n) / Math.log(10)); |
| + Term[] terms = new Term[n]; |
| + StringBuffer sb = new StringBuffer(); |
| + for (int i = 0; i < n; i++) { |
| + sb.setLength(0); |
| + sb.append("t"); |
| + int zeros = maxDigits - (int) (Math.log(i) / Math.log(10)); |
| + for (int j = 0; j < zeros; j++) { |
| + sb.append("0"); |
| + } |
| + sb.append(i); |
| + terms[i] = new Term(fieldName, sb.toString()); |
| + } |
| + return terms; |
| + } |
| + |
| + |
| + private void rmDir(String dir) { |
| + File fileDir = new File(dir); |
| + if (fileDir.exists()) { |
| + File[] files = fileDir.listFiles(); |
| + if (files != null) { |
| + for (int i = 0; i < files.length; i++) { |
| + files[i].delete(); |
| + } |
| + } |
| + fileDir.delete(); |
| + } |
| + } |
| + |
| + |
| + |
| + void assertByteArrayEquals(byte[] b1, byte[] b2) { |
| + if (b1.length != b2.length) { |
| + fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length); |
| + } |
| + |
| + for (int i = 0; i < b1.length; i++) { |
| + if (b1[i] != b2[i]) { |
| + fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]); |
| + } |
| + } |
| + } |
| + |
| + |
| + /** |
| + * This Analyzer uses a WhitespaceTokenizer and PayloadFilter. |
| + */ |
| + private static class PayloadAnalyzer extends Analyzer { |
| + Map fieldToData = new HashMap(); |
| + |
| + void setPayloadData(String field, byte[] data, int offset, int length) { |
| + fieldToData.put(field, new PayloadData(0, data, offset, length)); |
| + } |
| + |
| + void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) { |
| + fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length)); |
| + } |
| + |
| + public TokenStream tokenStream(String fieldName, Reader reader) { |
| + PayloadData payload = (PayloadData) fieldToData.get(fieldName); |
| + TokenStream ts = new WhitespaceTokenizer(reader); |
| + if (payload != null) { |
| + if (payload.numFieldInstancesToSkip == 0) { |
| + ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length); |
| + } else { |
| + payload.numFieldInstancesToSkip--; |
| + } |
| + } |
| + return ts; |
| + } |
| + |
| + private static class PayloadData { |
| + byte[] data; |
| + int offset; |
| + int length; |
| + int numFieldInstancesToSkip; |
| + |
| + PayloadData(int skip, byte[] data, int offset, int length) { |
| + numFieldInstancesToSkip = skip; |
| + this.data = data; |
| + this.offset = offset; |
| + this.length = length; |
| + } |
| + } |
| + } |
| + |
| + |
| + /** |
| + * This Filter adds payloads to the tokens. |
| + */ |
| + private static class PayloadFilter extends TokenFilter { |
| + private byte[] data; |
| + private int length; |
| + private int offset; |
| + |
| + public PayloadFilter(TokenStream in, byte[] data, int offset, int length) { |
| + super(in); |
| + this.data = data; |
| + this.length = length; |
| + this.offset = offset; |
| + } |
| + |
| + public Token next() throws IOException { |
| + Token nextToken = input.next(); |
| + if (nextToken != null && offset + length <= data.length) { |
| + nextToken.setPayload(new BytePayload(data, offset, length)); |
| + offset += length; |
| + } |
| + |
| + return nextToken; |
| + } |
| + } |
| + |
| +} |
| Index: src/java/org/apache/lucene/analysis/Token.java |
| =================================================================== |
| --- src/java/org/apache/lucene/analysis/Token.java (révision 493447) |
| +++ src/java/org/apache/lucene/analysis/Token.java (copie de travail) |
| @@ -1,5 +1,8 @@ |
| package org.apache.lucene.analysis; |
| |
| +import org.apache.lucene.index.Payload; |
| +import org.apache.lucene.index.TermPositions; |
| + |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| @@ -20,23 +23,32 @@ |
| /** A Token is an occurence of a term from the text of a field. It consists of |
| a term's text, the start and end offset of the term in the text of the field, |
| and a type string. |
| - |
| + <p> |
| The start and end offsets permit applications to re-associate a token with |
| its source text, e.g., to display highlighted query terms in a document |
| browser, or to show matching text fragments in a KWIC (KeyWord In Context) |
| display, etc. |
| - |
| + <p> |
| The type is an interned string, assigned by a lexical analyzer |
| (a.k.a. tokenizer), naming the lexical or syntactic class that the token |
| belongs to. For example an end of sentence marker token might be implemented |
| - with type "eos". The default token type is "word". */ |
| + with type "eos". The default token type is "word". |
| + <p> |
| + A Token can optionally have metadata (a.k.a. Payload) in the form of a variable |
| + length byte array. Use {@link TermPositions#getPayloadLength()} and |
| + {@link TermPositions#getPayload()} to retrieve the payloads from the index. |
| |
| + @see org.apache.lucene.index.Payload |
| + */ |
| + |
| public class Token implements Cloneable { |
| String termText; // the text of the term |
| int startOffset; // start in source text |
| int endOffset; // end in source text |
| String type = "word"; // lexical type |
| - |
| + |
| + Payload payload; |
| + |
| private int positionIncrement = 1; |
| |
| /** Constructs a Token with the given term text, and start & end offsets. |
| @@ -115,6 +127,16 @@ |
| /** Returns this Token's lexical type. Defaults to "word". */ |
| public final String type() { return type; } |
| |
| + /** Sets this Token's payload. */ |
| + public void setPayload(Payload payload) { |
| + this.payload = payload; |
| + } |
| + |
| + /** Returns this Token's payload. */ |
| + public Payload getPayload() { |
| + return this.payload; |
| + } |
| + |
| public String toString() { |
| StringBuffer sb = new StringBuffer(); |
| sb.append("(" + termText + "," + startOffset + "," + endOffset); |
| Index: src/java/org/apache/lucene/index/FieldInfo.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/FieldInfo.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/FieldInfo.java (copie de travail) |
| @@ -28,9 +28,12 @@ |
| boolean storePositionWithTermVector; |
| |
| boolean omitNorms; // omit norms associated with indexed fields |
| + |
| + boolean storePayloads; // whether this field stores payloads together with term positions |
| |
| FieldInfo(String na, boolean tk, int nu, boolean storeTermVector, |
| - boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) { |
| + boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, |
| + boolean omitNorms, boolean storePayloads) { |
| name = na; |
| isIndexed = tk; |
| number = nu; |
| @@ -38,5 +41,6 @@ |
| this.storeOffsetWithTermVector = storeOffsetWithTermVector; |
| this.storePositionWithTermVector = storePositionWithTermVector; |
| this.omitNorms = omitNorms; |
| + this.storePayloads = storePayloads; |
| } |
| } |
| Index: src/java/org/apache/lucene/index/PayloadReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/PayloadReader.java (révision 0) |
| +++ src/java/org/apache/lucene/index/PayloadReader.java (révision 0) |
| @@ -0,0 +1,11 @@ |
| +package org.apache.lucene.index; |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.store.IndexInput; |
| + |
| +public interface PayloadReader { |
| + |
| + public Payload read(int length, IndexInput in) throws IOException; |
| + |
| +} |
| Index: src/java/org/apache/lucene/index/MultiReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/MultiReader.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/MultiReader.java (copie de travail) |
| @@ -450,5 +450,12 @@ |
| public int nextPosition() throws IOException { |
| return ((TermPositions)current).nextPosition(); |
| } |
| + |
| + public int getPayloadLength() { |
| + return ((TermPositions)current).getPayloadLength(); |
| + } |
| |
| + public Payload getPayload() throws IOException { |
| + return ((TermPositions)current).getPayload(); |
| + } |
| } |
| Index: src/java/org/apache/lucene/index/TermPositions.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/TermPositions.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/TermPositions.java (copie de travail) |
| @@ -32,10 +32,36 @@ |
| extends TermDocs |
| { |
| /** Returns next position in the current document. It is an error to call |
| - this more than {@link #freq()} times |
| - without calling {@link #next()}<p> This is |
| - invalid until {@link #next()} is called for |
| - the first time. |
| + this more than {@link #freq()} times |
| + without calling {@link #next()}<p> This is |
| + invalid until {@link #next()} is called for |
| + the first time. |
| */ |
| int nextPosition() throws IOException; |
| + |
| + /** Returns the length of the payload at the current term position. |
| + * This is invalid until {@link #nextPosition()} is called for |
| + * the first time. |
| + * |
| + * @return length of the current payload in number of bytes |
| + */ |
| + int getPayloadLength(); |
| + |
| + /** Returns the payload data at the current term position. |
| + * This is invalid until {@link #nextPosition()} is called for |
| + * the first time. |
| + * This method must not be called more than once after each call |
| + * of {@link #nextPosition()}. However, payloads are loaded lazily, |
| + * so if the payload data for the current position is not needed, |
| + * this method may not be called at all for performance reasons. |
| + * |
| + * Note that payload data is loaded lazily: the bytes for the |
| + * current position are only read from the index when this |
| + * method is actually called, so positions whose payloads are |
| + * not needed cause no extra I/O. |
| + * |
| + * @return the payload stored at the current term position |
| + * @throws IOException |
| + */ |
| + Payload getPayload() throws IOException; |
| } |
| Index: src/java/org/apache/lucene/index/IndexFormat.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/IndexFormat.java (révision 0) |
| +++ src/java/org/apache/lucene/index/IndexFormat.java (révision 0) |
| @@ -0,0 +1,36 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.document.Fieldable; |
| + |
| +/** |
| + * Specify the format of index. |
| + * |
| + * The implementation of the {@link FieldsReader} and {@link FieldsWriter} returned by |
| + * the function getFieldsReader and getFieldsWriter will specify how the data of fields are |
| + * serialized, and also the kind of {@link Fieldable} used. |
| + * |
| + * $Id$ |
| + */ |
| +public interface IndexFormat { |
| + |
| + PayloadReader getPayloadReader(); |
| + |
| + PayloadWriter getPayloadWriter(); |
| +} |
| \ Pas de fin de ligne à la fin du fichier |
| Index: src/java/org/apache/lucene/index/DefaultPayloadReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/DefaultPayloadReader.java (révision 0) |
| +++ src/java/org/apache/lucene/index/DefaultPayloadReader.java (révision 0) |
| @@ -0,0 +1,36 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.store.IndexInput; |
| + |
| +/** |
| + * This payload reader only support {@link BytePayload}. |
| + * |
| + * $Id$ |
| + */ |
| +public class DefaultPayloadReader implements PayloadReader { |
| + |
| + public Payload read(int length, IndexInput in) throws IOException { |
| + byte[] data = new byte[length]; |
| + in.readBytes(data, 0, length); |
| + return new BytePayload(data); |
| + } |
| +} |
| Index: src/java/org/apache/lucene/index/FieldInfos.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/FieldInfos.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/FieldInfos.java (copie de travail) |
| @@ -39,6 +39,7 @@ |
| static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4; |
| static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8; |
| static final byte OMIT_NORMS = 0x10; |
| + static final byte STORE_PAYLOADS = 0x20; |
| |
| private ArrayList byNumber = new ArrayList(); |
| private HashMap byName = new HashMap(); |
| @@ -156,9 +157,29 @@ |
| */ |
| public void add(String name, boolean isIndexed, boolean storeTermVector, |
| boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) { |
| + add(name, isIndexed, storeTermVector, storePositionWithTermVector, |
| + storeOffsetWithTermVector, omitNorms, false); |
| + } |
| + |
| + /** If the field is not yet known, adds it. If it is known, checks to make |
| + * sure that the isIndexed flag is the same as was given previously for this |
| + * field. If not - marks it as being indexed. Same goes for the TermVector |
| + * parameters. |
| + * |
| + * @param name The name of the field |
| + * @param isIndexed true if the field is indexed |
| + * @param storeTermVector true if the term vector should be stored |
| + * @param storePositionWithTermVector true if the term vector with positions should be stored |
| + * @param storeOffsetWithTermVector true if the term vector with offsets should be stored |
| + * @param omitNorms true if the norms for the indexed field should be omitted |
| + * @param storePayloads true if payloads should be stored for this field |
| + */ |
| + public void add(String name, boolean isIndexed, boolean storeTermVector, |
| + boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, |
| + boolean omitNorms, boolean storePayloads) { |
| FieldInfo fi = fieldInfo(name); |
| if (fi == null) { |
| - addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms); |
| + addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads); |
| } else { |
| if (fi.isIndexed != isIndexed) { |
| fi.isIndexed = true; // once indexed, always index |
| @@ -175,6 +196,9 @@ |
| if (fi.omitNorms != omitNorms) { |
| fi.omitNorms = false; // once norms are stored, always store |
| } |
| + if (fi.storePayloads != storePayloads) { |
| + fi.storePayloads = true; |
| + } |
| |
| } |
| } |
| @@ -182,10 +206,10 @@ |
| |
| private void addInternal(String name, boolean isIndexed, |
| boolean storeTermVector, boolean storePositionWithTermVector, |
| - boolean storeOffsetWithTermVector, boolean omitNorms) { |
| + boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads) { |
| FieldInfo fi = |
| new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector, |
| - storeOffsetWithTermVector, omitNorms); |
| + storeOffsetWithTermVector, omitNorms, storePayloads); |
| byNumber.add(fi); |
| byName.put(name, fi); |
| } |
| @@ -271,6 +295,7 @@ |
| if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR; |
| if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR; |
| if (fi.omitNorms) bits |= OMIT_NORMS; |
| + if (fi.storePayloads) bits |= STORE_PAYLOADS; |
| output.writeString(fi.name); |
| output.writeByte(bits); |
| } |
| @@ -286,8 +311,9 @@ |
| boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; |
| boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; |
| boolean omitNorms = (bits & OMIT_NORMS) != 0; |
| - |
| - addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms); |
| + boolean storePayloads = (bits & STORE_PAYLOADS) != 0; |
| + |
| + addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads); |
| } |
| } |
| |
| Index: src/java/org/apache/lucene/index/DefaultIndexFormat.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/DefaultIndexFormat.java (révision 0) |
| +++ src/java/org/apache/lucene/index/DefaultIndexFormat.java (révision 0) |
| @@ -0,0 +1,39 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| + |
| +/** |
| + * The default implementation of the index format |
| + * |
| + * $Id$ |
| + */ |
| +public class DefaultIndexFormat implements IndexFormat { |
| + |
| + private PayloadReader reader = new DefaultPayloadReader(); |
| + |
| + private PayloadWriter writer = new DefaultPayloadWriter(); |
| + |
| + public PayloadReader getPayloadReader() { |
| + return reader; |
| + } |
| + |
| + public PayloadWriter getPayloadWriter() { |
| + return writer; |
| + } |
| +} |
| Index: src/java/org/apache/lucene/index/PayloadWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/PayloadWriter.java (révision 0) |
| +++ src/java/org/apache/lucene/index/PayloadWriter.java (révision 0) |
| @@ -0,0 +1,11 @@ |
| +package org.apache.lucene.index; |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.store.IndexOutput; |
| + |
| +public interface PayloadWriter { |
| + |
| + public void write(Payload payload, IndexOutput output) throws IOException; |
| + |
| +} |
| Index: src/java/org/apache/lucene/index/Payload.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/Payload.java (révision 0) |
| +++ src/java/org/apache/lucene/index/Payload.java (révision 0) |
| @@ -0,0 +1,38 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| + |
| +/** |
| + * A Payload is metadata that can be stored together with each occurrence |
| + * of a term. This metadata is stored inline in the posting list of the |
| + * specific term. |
| + * <p> |
| + * To store payloads in the index a {@link TokenStream} has to be used that |
| + * produces {@link Token}s containing payload data. |
| + * <p> |
| + * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload()} |
| + * to retrieve the payloads from the index. |
| + */ |
| +public interface Payload { |
| + |
| + public int getLength(); |
| + |
| +} |
| Index: src/java/org/apache/lucene/index/IndexReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/IndexReader.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/IndexReader.java (copie de travail) |
| @@ -65,6 +65,8 @@ |
| public static final FieldOption ALL = new FieldOption ("ALL"); |
| // all indexed fields |
| public static final FieldOption INDEXED = new FieldOption ("INDEXED"); |
| + // all fields that store payloads |
| + public static final FieldOption STORES_PAYLOADS = new FieldOption ("STORES_PAYLOADS"); |
| // all fields which are not indexed |
| public static final FieldOption UNINDEXED = new FieldOption ("UNINDEXED"); |
| // all fields which are indexed with termvectors enables |
| Index: src/java/org/apache/lucene/index/DefaultPayloadWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/DefaultPayloadWriter.java (révision 0) |
| +++ src/java/org/apache/lucene/index/DefaultPayloadWriter.java (révision 0) |
| @@ -0,0 +1,37 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import java.io.IOException; |
| + |
| +import org.apache.lucene.store.IndexOutput; |
| + |
| +/** |
| + * This payload writer only supports {@link BytePayload}. |
| + * |
| + * $Id$ |
| + */ |
| +public class DefaultPayloadWriter implements PayloadWriter { |
| + |
| + public void write(Payload payload, IndexOutput output) throws IOException { |
| + if (!(payload instanceof BytePayload)) { |
| + throw new RuntimeException("Payload of type '" + payload.getClass() + "' is not supported"); |
| + } |
| + output.writeBytes(((BytePayload) payload).toByteArray(), payload.getLength()); |
| + } |
| +} |
| Index: src/java/org/apache/lucene/index/MultipleTermPositions.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/MultipleTermPositions.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/MultipleTermPositions.java (copie de travail) |
| @@ -191,5 +191,23 @@ |
| public int read(int[] arg0, int[] arg1) throws IOException { |
| throw new UnsupportedOperationException(); |
| } |
| + |
| + |
| + /** |
| + * Not implemented. |
| + * @throws UnsupportedOperationException |
| + */ |
| + public int getPayloadLength() { |
| + throw new UnsupportedOperationException(); |
| + } |
| + |
| + /** |
| + * Not implemented. |
| + * @throws UnsupportedOperationException |
| + */ |
| + public Payload getPayload() throws IOException { |
| + throw new UnsupportedOperationException(); |
| + } |
| |
| + |
| } |
| Index: src/java/org/apache/lucene/index/FilterIndexReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/FilterIndexReader.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/FilterIndexReader.java (copie de travail) |
| @@ -62,6 +62,14 @@ |
| public int nextPosition() throws IOException { |
| return ((TermPositions) this.in).nextPosition(); |
| } |
| + |
| + public int getPayloadLength() { |
| + return ((TermPositions) this.in).getPayloadLength(); |
| + } |
| + |
| + public Payload getPayload() throws IOException { |
| + return ((TermPositions) this.in).getPayload(); |
| + } |
| } |
| |
| /** Base class for filtering {@link TermEnum} implementations. */ |
| Index: src/java/org/apache/lucene/index/SegmentTermPositions.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/SegmentTermPositions.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/SegmentTermPositions.java (copie de travail) |
| @@ -27,23 +27,33 @@ |
| private int proxCount; |
| private int position; |
| |
| + // the current payload length |
| + private int payloadLength; |
| + // indicates whether the payload of the current position has |
| + // been read from the proxStream yet |
| + private boolean needToLoadPayload; |
| + |
| // these variables are being used to remember information |
| // for a lazy skip |
| private long lazySkipPointer = 0; |
| private int lazySkipDocCount = 0; |
| + private PayloadReader payloadReader; |
| |
| SegmentTermPositions(SegmentReader p) { |
| super(p); |
| this.proxStream = (IndexInput)parent.proxStream.clone(); |
| + payloadReader = parent.directory().getIndexFormat().getPayloadReader(); |
| } |
| |
| - final void seek(TermInfo ti) throws IOException { |
| - super.seek(ti); |
| + final void seek(TermInfo ti, Term term) throws IOException { |
| + super.seek(ti, term); |
| if (ti != null) |
| lazySkipPointer = ti.proxPointer; |
| |
| lazySkipDocCount = 0; |
| proxCount = 0; |
| + payloadLength = 0; |
| + needToLoadPayload = false; |
| } |
| |
| public final void close() throws IOException { |
| @@ -55,8 +65,27 @@ |
| // perform lazy skips if neccessary |
| lazySkip(); |
| proxCount--; |
| - return position += proxStream.readVInt(); |
| + return position += readDeltaPosition(); |
| } |
| + |
| + private final int readDeltaPosition() throws IOException { |
| + int delta = proxStream.readVInt(); |
| + if (currentFieldStoresPayloads) { |
| + // if the current field stores payloads then |
| + // the position delta is shifted one bit to the left. |
| + // if the LSB is set, then we have to read the current |
| + // payload length |
| + if ((delta & 1) != 0) { |
| + payloadLength = proxStream.readVInt(); |
| + } |
| + delta >>>= 1; |
| + needToLoadPayload = true; |
| + } else { |
| + payloadLength = 0; |
| + needToLoadPayload = false; |
| + } |
| + return delta; |
| + } |
| |
| protected final void skippingDoc() throws IOException { |
| // we remember to skip the remaining positions of the current |
| @@ -82,17 +111,28 @@ |
| |
| |
| /** Called by super.skipTo(). */ |
| - protected void skipProx(long proxPointer) throws IOException { |
| + protected void skipProx(long proxPointer, int payloadLength) throws IOException { |
| // we save the pointer, we might have to skip there lazily |
| lazySkipPointer = proxPointer; |
| lazySkipDocCount = 0; |
| proxCount = 0; |
| + this.payloadLength = payloadLength; |
| + needToLoadPayload = false; |
| } |
| |
| private void skipPositions(int n) throws IOException { |
| - for (int f = n; f > 0; f--) // skip unread positions |
| - proxStream.readVInt(); |
| + for (int f = n; f > 0; f--) { // skip unread positions |
| + readDeltaPosition(); |
| + skipPayload(); |
| + } |
| } |
| + |
| + private void skipPayload() throws IOException { |
| + if (needToLoadPayload && payloadLength > 0) { |
| + proxStream.seek(proxStream.getFilePointer() + payloadLength); |
| + } |
| + needToLoadPayload = false; |
| + } |
| |
| // It is not always neccessary to move the prox pointer |
| // to a new document after the freq pointer has been moved. |
| @@ -105,6 +145,10 @@ |
| // So we move the prox pointer lazily to the document |
| // as soon as positions are requested. |
| private void lazySkip() throws IOException { |
| + // we might have to skip the current payload |
| + // if it was not read yet |
| + skipPayload(); |
| + |
| if (lazySkipPointer != 0) { |
| proxStream.seek(lazySkipPointer); |
| lazySkipPointer = 0; |
| @@ -115,5 +159,32 @@ |
| lazySkipDocCount = 0; |
| } |
| } |
| + |
| + public int getPayloadLength() { |
| + return payloadLength; |
| + } |
| |
| + public Payload getPayload() throws IOException { |
| + if (!needToLoadPayload) { |
| + throw new IOException("Payload cannot be loaded more than once for the same term position."); |
| + } |
| + Payload payload = payloadReader.read(payloadLength, proxStream); |
| + needToLoadPayload = false; |
| + return payload; |
| + // read payloads lazily |
| +// byte[] retArray; |
| +// int retOffset; |
| +// if (data == null || data.length - offset < payloadLength) { |
| +// // the array is too small to store the payload data, |
| +// // so we allocate a new one |
| +// retArray = new byte[payloadLength]; |
| +// retOffset = 0; |
| +// } else { |
| +// retArray = data; |
| +// retOffset = offset; |
| +// } |
| +// proxStream.readBytes(retArray, retOffset, payloadLength); |
| +// needToLoadPayload = false; |
| +// return retArray; |
| + } |
| } |
| Index: src/java/org/apache/lucene/index/SegmentTermDocs.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/SegmentTermDocs.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/SegmentTermDocs.java (copie de travail) |
| @@ -39,6 +39,9 @@ |
| private long proxPointer; |
| private long skipPointer; |
| private boolean haveSkipped; |
| + |
| + private int payloadLengthAtLastSkip; |
| + protected boolean currentFieldStoresPayloads; |
| |
| protected SegmentTermDocs(SegmentReader parent) { |
| this.parent = parent; |
| @@ -49,23 +52,31 @@ |
| |
| public void seek(Term term) throws IOException { |
| TermInfo ti = parent.tis.get(term); |
| - seek(ti); |
| + seek(ti, term); |
| } |
| |
| public void seek(TermEnum termEnum) throws IOException { |
| TermInfo ti; |
| + Term term; |
| |
| // use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs |
| - if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos) // optimized case |
| - ti = ((SegmentTermEnum) termEnum).termInfo(); |
| - else // punt case |
| - ti = parent.tis.get(termEnum.term()); |
| - |
| - seek(ti); |
| + if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos) { // optimized case |
| + SegmentTermEnum segmentTermEnum = ((SegmentTermEnum) termEnum); |
| + term = segmentTermEnum.term(); |
| + ti = segmentTermEnum.termInfo(); |
| + } else { // punt case |
| + term = termEnum.term(); |
| + ti = parent.tis.get(term); |
| + } |
| + |
| + seek(ti, term); |
| } |
| |
| - void seek(TermInfo ti) throws IOException { |
| + void seek(TermInfo ti, Term term) throws IOException { |
| count = 0; |
| + payloadLengthAtLastSkip = 0; |
| + FieldInfo fi = parent.fieldInfos.fieldInfo(term.field); |
| + currentFieldStoresPayloads = (fi != null) ? fi.storePayloads : false; |
| if (ti == null) { |
| df = 0; |
| } else { |
| @@ -141,7 +152,7 @@ |
| } |
| |
| /** Overridden by SegmentTermPositions to skip in prox stream. */ |
| - protected void skipProx(long proxPointer) throws IOException {} |
| + protected void skipProx(long proxPointer, int payloadLength) throws IOException {} |
| |
| /** Optimized implementation. */ |
| public boolean skipTo(int target) throws IOException { |
| @@ -157,6 +168,7 @@ |
| |
| // scan skip data |
| int lastSkipDoc = skipDoc; |
| + int lastPayloadLength = 0; |
| long lastFreqPointer = freqStream.getFilePointer(); |
| long lastProxPointer = -1; |
| int numSkipped = -1 - (count % skipInterval); |
| @@ -165,6 +177,7 @@ |
| lastSkipDoc = skipDoc; |
| lastFreqPointer = freqPointer; |
| lastProxPointer = proxPointer; |
| + lastPayloadLength = payloadLengthAtLastSkip; |
| |
| if (skipDoc != 0 && skipDoc >= doc) |
| numSkipped += skipInterval; |
| @@ -172,7 +185,21 @@ |
| if(skipCount >= numSkips) |
| break; |
| |
| - skipDoc += skipStream.readVInt(); |
| + if (currentFieldStoresPayloads) { |
| + // the current field stores payloads. |
| + // if the doc delta is odd then we have |
| + // to read the current payload length |
| + // because it differs from the length of the |
| + // previous payload |
| + int delta = skipStream.readVInt(); |
| + if ((delta & 1) != 0) { |
| + payloadLengthAtLastSkip = skipStream.readVInt(); |
| + } |
| + delta >>>= 1; |
| + skipDoc += delta; |
| + } else { |
| + skipDoc += skipStream.readVInt(); |
| + } |
| freqPointer += skipStream.readVInt(); |
| proxPointer += skipStream.readVInt(); |
| |
| @@ -182,7 +209,7 @@ |
| // if we found something to skip, then skip it |
| if (lastFreqPointer > freqStream.getFilePointer()) { |
| freqStream.seek(lastFreqPointer); |
| - skipProx(lastProxPointer); |
| + skipProx(lastProxPointer, lastPayloadLength); |
| |
| doc = lastSkipDoc; |
| count += numSkipped; |
| Index: src/java/org/apache/lucene/index/SegmentMerger.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/SegmentMerger.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/SegmentMerger.java (copie de travail) |
| @@ -151,11 +151,11 @@ |
| } |
| |
| private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector, |
| - boolean storeOffsetWithTermVector) throws IOException { |
| + boolean storeOffsetWithTermVector, boolean storePayloads) throws IOException { |
| Iterator i = names.iterator(); |
| while (i.hasNext()) { |
| String field = (String)i.next(); |
| - fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field)); |
| + fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads); |
| } |
| } |
| |
| @@ -165,15 +165,16 @@ |
| * @throws IOException |
| */ |
| private final int mergeFields() throws IOException { |
| - fieldInfos = new FieldInfos(); // merge field names |
| + fieldInfos = new FieldInfos(); // merge field names |
| int docCount = 0; |
| for (int i = 0; i < readers.size(); i++) { |
| IndexReader reader = (IndexReader) readers.elementAt(i); |
| - addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true); |
| - addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false); |
| - addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true); |
| - addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false); |
| - addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false); |
| + addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false); |
| + addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false); |
| + addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false); |
| + addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false); |
| + addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true); |
| + addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false); |
| fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false); |
| } |
| fieldInfos.write(directory, segment + ".fnm"); |
| @@ -263,7 +264,7 @@ |
| SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader); |
| base += reader.numDocs(); |
| if (smi.next()) |
| - queue.put(smi); // initialize queue |
| + queue.put(smi); // initialize queue |
| else |
| smi.close(); |
| } |
| @@ -271,7 +272,7 @@ |
| SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()]; |
| |
| while (queue.size() > 0) { |
| - int matchSize = 0; // pop matching terms |
| + int matchSize = 0; // pop matching terms |
| match[matchSize++] = (SegmentMergeInfo) queue.pop(); |
| Term term = match[0].term; |
| SegmentMergeInfo top = (SegmentMergeInfo) queue.top(); |
| @@ -281,14 +282,14 @@ |
| top = (SegmentMergeInfo) queue.top(); |
| } |
| |
| - mergeTermInfo(match, matchSize); // add new TermInfo |
| + mergeTermInfo(match, matchSize); // add new TermInfo |
| |
| while (matchSize > 0) { |
| SegmentMergeInfo smi = match[--matchSize]; |
| if (smi.next()) |
| - queue.put(smi); // restore queue |
| + queue.put(smi); // restore queue |
| else |
| - smi.close(); // done with a segment |
| + smi.close(); // done with a segment |
| } |
| } |
| } |
| @@ -307,7 +308,7 @@ |
| long freqPointer = freqOutput.getFilePointer(); |
| long proxPointer = proxOutput.getFilePointer(); |
| |
| - int df = appendPostings(smis, n); // append posting data |
| + int df = appendPostings(smis, n); // append posting data |
| |
| long skipPointer = writeSkip(); |
| |
| @@ -317,6 +318,8 @@ |
| termInfosWriter.add(smis[0].term, termInfo); |
| } |
| } |
| + |
| + private byte[] payloadBuffer = null; |
| |
| /** Process postings from multiple segments all positioned on the |
| * same term. Writes out merged entries into freqOutput and |
| @@ -328,9 +331,12 @@ |
| */ |
| private final int appendPostings(SegmentMergeInfo[] smis, int n) |
| throws IOException { |
| + PayloadWriter payloadWriter = directory.getIndexFormat().getPayloadWriter(); |
| int lastDoc = 0; |
| - int df = 0; // number of docs w/ term |
| + int df = 0; // number of docs w/ term |
| resetSkip(); |
| + boolean storePayloads = fieldInfos.fieldInfo(smis[0].term.field).storePayloads; |
| + int lastPayloadLength = -1; // ensures that we write the first length |
| for (int i = 0; i < n; i++) { |
| SegmentMergeInfo smi = smis[i]; |
| TermPositions postings = smi.getPositions(); |
| @@ -350,24 +356,43 @@ |
| df++; |
| |
| if ((df % skipInterval) == 0) { |
| - bufferSkip(lastDoc); |
| + bufferSkip(lastDoc, storePayloads, lastPayloadLength); |
| } |
| |
| - int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1 |
| + int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1 |
| lastDoc = doc; |
| |
| int freq = postings.freq(); |
| if (freq == 1) { |
| - freqOutput.writeVInt(docCode | 1); // write doc & freq=1 |
| + freqOutput.writeVInt(docCode | 1); // write doc & freq=1 |
| } else { |
| - freqOutput.writeVInt(docCode); // write doc |
| - freqOutput.writeVInt(freq); // write frequency in doc |
| + freqOutput.writeVInt(docCode); // write doc |
| + freqOutput.writeVInt(freq); // write frequency in doc |
| } |
| |
| - int lastPosition = 0; // write position deltas |
| + /** See {@link DocumentWriter#writePostings(Posting[], String) for |
| + * documentation about the encoding of positions and payloads |
| + */ |
| + int lastPosition = 0; // write position deltas |
| for (int j = 0; j < freq; j++) { |
| int position = postings.nextPosition(); |
| - proxOutput.writeVInt(position - lastPosition); |
| + int delta = position - lastPosition; |
| + if (storePayloads) { |
| + int payloadLength = postings.getPayloadLength(); |
| + if (payloadLength == lastPayloadLength) { |
| + proxOutput.writeVInt(delta * 2); |
| + } else { |
| + proxOutput.writeVInt(delta * 2 + 1); |
| + proxOutput.writeVInt(payloadLength); |
| + lastPayloadLength = payloadLength; |
| + } |
| + if (payloadLength > 0) { |
| + Payload payload = postings.getPayload(); |
| + payloadWriter.write(payload, proxOutput); |
| + } |
| + } else { |
| + proxOutput.writeVInt(delta); |
| + } |
| lastPosition = position; |
| } |
| } |
| @@ -377,21 +402,59 @@ |
| |
| private RAMOutputStream skipBuffer = new RAMOutputStream(); |
| private int lastSkipDoc; |
| + private int lastSkipPayloadLength; |
| private long lastSkipFreqPointer; |
| private long lastSkipProxPointer; |
| |
| private void resetSkip() { |
| skipBuffer.reset(); |
| lastSkipDoc = 0; |
| + lastSkipPayloadLength = -1; // we don't have to write the first length in the skip list |
| lastSkipFreqPointer = freqOutput.getFilePointer(); |
| lastSkipProxPointer = proxOutput.getFilePointer(); |
| } |
| |
| - private void bufferSkip(int doc) throws IOException { |
| + private void bufferSkip(int doc, boolean storePayloads, int payloadLength) throws IOException { |
| long freqPointer = freqOutput.getFilePointer(); |
| long proxPointer = proxOutput.getFilePointer(); |
| |
| - skipBuffer.writeVInt(doc - lastSkipDoc); |
| + // To efficiently store payloads in the posting lists we do not store the length of |
| + // every payload. Instead we omit the length for a payload if the previous payload had |
| + // the same length. |
| + // However, in order to support skipping the payload length at every skip point must be known. |
| + // So we use the same length encoding that we use for the posting lists for the skip data as well: |
| + // Case 1: current field does not store payloads |
| + // SkipDatum --> DocSkip, FreqSkip, ProxSkip |
| + // DocSkip,FreqSkip,ProxSkip --> VInt |
| + // DocSkip records the document number before every SkipInterval th document in TermFreqs. |
| + // Document numbers are represented as differences from the previous value in the sequence. |
| + // Case 2: current field stores payloads |
| + // SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip |
| + // DocSkip,FreqSkip,ProxSkip --> VInt |
| + // PayloadLength --> VInt |
| + // In this case DocSkip/2 is the difference between |
| + // the current and the previous value. If DocSkip |
| + // is odd, then a PayloadLength encoded as VInt follows, |
| + // if DocSkip is even, then it is assumed that the |
| + // current payload length equals the length at the previous |
| + // skip point |
| + if (storePayloads) { |
| + int delta = doc - lastSkipDoc; |
| + if (payloadLength == lastSkipPayloadLength) { |
| + // the current payload length equals the length at the previous skip point, |
| + // so we don't store the length again |
| + skipBuffer.writeVInt(delta * 2); |
| + } else { |
| + // the payload length is different from the previous one. We shift the DocSkip, |
| + // set the lowest bit and store the current payload length as VInt. |
| + skipBuffer.writeVInt(delta * 2 + 1); |
| + skipBuffer.writeVInt(payloadLength); |
| + lastSkipPayloadLength = payloadLength; |
| + } |
| + } else { |
| + // current field does not store payloads |
| + skipBuffer.writeVInt(doc - lastSkipDoc); |
| + } |
| skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer)); |
| skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer)); |
| |
| Index: src/java/org/apache/lucene/index/DocumentWriter.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/DocumentWriter.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/DocumentWriter.java (copie de travail) |
| @@ -31,6 +31,7 @@ |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.util.Arrays; |
| +import java.util.BitSet; |
| import java.util.Enumeration; |
| import java.util.Hashtable; |
| import java.util.Iterator; |
| @@ -69,34 +70,42 @@ |
| |
| final void addDocument(String segment, Document doc) |
| throws IOException { |
| - // write field names |
| + // create field infos |
| fieldInfos = new FieldInfos(); |
| fieldInfos.add(doc); |
| - fieldInfos.write(directory, segment + ".fnm"); |
| |
| - // write field values |
| - FieldsWriter fieldsWriter = |
| - new FieldsWriter(directory, segment, fieldInfos); |
| - try { |
| - fieldsWriter.addDocument(doc); |
| - } finally { |
| - fieldsWriter.close(); |
| - } |
| - |
| // invert doc into postingTable |
| postingTable.clear(); // clear postingTable |
| fieldLengths = new int[fieldInfos.size()]; // init fieldLengths |
| fieldPositions = new int[fieldInfos.size()]; // init fieldPositions |
| fieldOffsets = new int[fieldInfos.size()]; // init fieldOffsets |
| + fieldStoresPayloads = new BitSet(fieldInfos.size()); |
| |
| fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts |
| Arrays.fill(fieldBoosts, doc.getBoost()); |
| |
| + // Before we write the FieldInfos we invert the Document. The reason is that |
| + // during invertion the TokenStreams of tokenized fields are being processed |
| + // and we might encounter tokens that have payloads associated with them. In |
| + // this case we have to update the FieldInfo of the particular field. |
| invertDocument(doc); |
| |
| // sort postingTable into an array |
| Posting[] postings = sortPostingTable(); |
| |
| + // write field infos |
| + fieldInfos.write(directory, segment + ".fnm"); |
| + |
| + // write field values |
| + FieldsWriter fieldsWriter = |
| + new FieldsWriter(directory, segment, fieldInfos); |
| + try { |
| + fieldsWriter.addDocument(doc); |
| + } finally { |
| + fieldsWriter.close(); |
| + } |
| + |
| + |
| /* |
| for (int i = 0; i < postings.length; i++) { |
| Posting posting = postings[i]; |
| @@ -125,6 +134,10 @@ |
| private int[] fieldPositions; |
| private int[] fieldOffsets; |
| private float[] fieldBoosts; |
| + |
| + // If any of the tokens of a particular field carry a payload |
| + // then we enable payloads for that field. |
| + private BitSet fieldStoresPayloads; |
| |
| // Tokenizes the fields of a document into Postings. |
| private final void invertDocument(Document doc) |
| @@ -144,9 +157,9 @@ |
| if (!field.isTokenized()) { // un-tokenized field |
| String stringValue = field.stringValue(); |
| if(field.isStoreOffsetWithTermVector()) |
| - addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length())); |
| + addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length())); |
| else |
| - addPosition(fieldName, stringValue, position++, null); |
| + addPosition(fieldName, stringValue, position++, null, null); |
| offset += stringValue.length(); |
| length++; |
| } else |
| @@ -167,11 +180,20 @@ |
| for (Token t = stream.next(); t != null; t = stream.next()) { |
| position += (t.getPositionIncrement() - 1); |
| |
| - if(field.isStoreOffsetWithTermVector()) |
| - addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset())); |
| - else |
| - addPosition(fieldName, t.termText(), position++, null); |
| + Payload payload = t.getPayload(); |
| + if (payload != null) { |
| + // enable payloads for this field |
| + fieldStoresPayloads.set(fieldNumber); |
| + } |
| |
| + TermVectorOffsetInfo termVectorOffsetInfo; |
| + if (field.isStoreOffsetWithTermVector()) { |
| + termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()); |
| + } else { |
| + termVectorOffsetInfo = null; |
| + } |
| + addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo); |
| + |
| lastToken = t; |
| if (++length >= maxFieldLength) { |
| if (infoStream != null) |
| @@ -194,11 +216,16 @@ |
| fieldOffsets[fieldNumber] = offset; |
| } |
| } |
| + |
| + // update fieldInfos for all fields that have one or more tokens with payloads |
| + for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i+1)) { |
| + fieldInfos.fieldInfo(i).storePayloads = true; |
| + } |
| } |
| |
| private final Term termBuffer = new Term("", ""); // avoid consing |
| |
| - private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) { |
| + private final void addPosition(String field, String text, int position, Payload payload, TermVectorOffsetInfo offset) { |
| termBuffer.set(field, text); |
| //System.out.println("Offset: " + offset); |
| Posting ti = (Posting) postingTable.get(termBuffer); |
| @@ -209,9 +236,25 @@ |
| int[] positions = ti.positions; |
| System.arraycopy(positions, 0, newPositions, 0, freq); |
| ti.positions = newPositions; |
| + |
| + if (ti.payloads != null) { |
| + // the current field stores payloads |
| + Payload[] newPayloads = new Payload[freq * 2]; // grow payloads array |
| + Payload[] payloads = ti.payloads; |
| + System.arraycopy(payloads, 0, newPayloads, 0, payloads.length); |
| + ti.payloads = newPayloads; |
| + } |
| } |
| ti.positions[freq] = position; // add new position |
| |
| + if (payload != null) { |
| + if (ti.payloads == null) { |
| + // lazily allocate payload array |
| + ti.payloads = new Payload[ti.positions.length]; |
| + } |
| + ti.payloads[freq] = payload; |
| + } |
| + |
| if (offset != null) { |
| if (ti.offsets.length == freq){ |
| TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2]; |
| @@ -224,7 +267,7 @@ |
| ti.freq = freq + 1; // update frequency |
| } else { // word not seen before |
| Term term = new Term(field, text, false); |
| - postingTable.put(term, new Posting(term, position, offset)); |
| + postingTable.put(term, new Posting(term, position, payload, offset)); |
| } |
| } |
| |
| @@ -299,6 +342,7 @@ |
| IndexOutput freq = null, prox = null; |
| TermInfosWriter tis = null; |
| TermVectorsWriter termVectorWriter = null; |
| + PayloadWriter payloadWriter = directory.getIndexFormat().getPayloadWriter(); |
| try { |
| //open files for inverse index storage |
| freq = directory.createOutput(segment + ".frq"); |
| @@ -307,10 +351,31 @@ |
| termIndexInterval); |
| TermInfo ti = new TermInfo(); |
| String currentField = null; |
| - |
| + boolean currentFieldHasPayloads = false; |
| + |
| for (int i = 0; i < postings.length; i++) { |
| Posting posting = postings[i]; |
| |
| + // check to see if we switched to a new field |
| + String termField = posting.term.field(); |
| + if (currentField != termField) { |
| + // changing field - see if there is something to save |
| + currentField = termField; |
| + FieldInfo fi = fieldInfos.fieldInfo(currentField); |
| + currentFieldHasPayloads = fi.storePayloads; |
| + if (fi.storeTermVector) { |
| + if (termVectorWriter == null) { |
| + termVectorWriter = |
| + new TermVectorsWriter(directory, segment, fieldInfos); |
| + termVectorWriter.openDocument(); |
| + } |
| + termVectorWriter.openField(currentField); |
| + |
| + } else if (termVectorWriter != null) { |
| + termVectorWriter.closeField(); |
| + } |
| + } |
| + |
| // add an entry to the dictionary with pointers to prox and freq files |
| ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1); |
| tis.add(posting.term, ti); |
| @@ -326,28 +391,62 @@ |
| |
| int lastPosition = 0; // write positions |
| int[] positions = posting.positions; |
| + Payload[] payloads = posting.payloads; |
| + int lastPayloadLength = -1; |
| + |
| + |
| + // The following encoding is being used for positions and payloads: |
| + // Case 1: current field does not store payloads |
| + // Positions -> <PositionDelta>^freq |
| + // PositionDelta -> VInt |
| + // The PositionDelta is the difference between the current |
| + // and the previous position |
| + // Case 2: current field stores payloads |
| + // Positions -> <PositionDelta, Payload>^freq |
| + // Payload -> <PayloadLength?, PayloadData> |
| + // PositionDelta -> VInt |
| + // PayloadLength -> VInt |
| + // PayloadData -> byte^PayloadLength |
| + // In this case PositionDelta/2 is the difference between |
| + // the current and the previous position. If PositionDelta |
| + // is odd, then a PayloadLength encoded as VInt follows, |
| + // if PositionDelta is even, then it is assumed that the |
| + // length of the current Payload equals the length of the |
| + // previous Payload. |
| for (int j = 0; j < postingFreq; j++) { // use delta-encoding |
| int position = positions[j]; |
| - prox.writeVInt(position - lastPosition); |
| - lastPosition = position; |
| - } |
| - // check to see if we switched to a new field |
| - String termField = posting.term.field(); |
| - if (currentField != termField) { |
| - // changing field - see if there is something to save |
| - currentField = termField; |
| - FieldInfo fi = fieldInfos.fieldInfo(currentField); |
| - if (fi.storeTermVector) { |
| - if (termVectorWriter == null) { |
| - termVectorWriter = |
| - new TermVectorsWriter(directory, segment, fieldInfos); |
| - termVectorWriter.openDocument(); |
| + int delta = position - lastPosition; |
| + if (currentFieldHasPayloads) { |
| + int payloadLength = 0; |
| + Payload payload = null; |
| + if (payloads != null) { |
| + payload = payloads[j]; |
| + if (payload != null) { |
| + payloadLength = payload.getLength(); |
| + } |
| } |
| - termVectorWriter.openField(currentField); |
| - |
| - } else if (termVectorWriter != null) { |
| - termVectorWriter.closeField(); |
| + if (payloadLength == lastPayloadLength) { |
| + // the length of the current payload equals the length |
| + // of the previous one. So we do not have to store the length |
| + // again and we only shift the position delta by one bit |
| + prox.writeVInt(delta * 2); |
| + } else { |
| + // the length of the current payload is different from the |
| + // previous one. We shift the position delta, set the lowest |
| + // bit and store the current payload length as VInt. |
| + prox.writeVInt(delta * 2 + 1); |
| + prox.writeVInt(payloadLength); |
| + lastPayloadLength = payloadLength; |
| + } |
| + if (payloadLength > 0) { |
| + // write current payload |
| + payloadWriter.write(payload, prox); |
| + } |
| + } else { |
| + // field does not store payloads, just write position delta as VInt |
| + prox.writeVInt(delta); |
| } |
| + lastPosition = position; |
| } |
| if (termVectorWriter != null && termVectorWriter.isFieldOpen()) { |
| termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets); |
| @@ -394,18 +493,27 @@ |
| Term term; // the Term |
| int freq; // its frequency in doc |
| int[] positions; // positions it occurs at |
| + Payload[] payloads; // the payloads of the terms |
| TermVectorOffsetInfo [] offsets; |
| |
| - Posting(Term t, int position, TermVectorOffsetInfo offset) { |
| + Posting(Term t, int position, Payload payload, TermVectorOffsetInfo offset) { |
| term = t; |
| freq = 1; |
| positions = new int[1]; |
| positions[0] = position; |
| + |
| + if (payload != null) { |
| + payloads = new Payload[1]; |
| + payloads[0] = payload; |
| + } else |
| + payloads = null; |
| + |
| + |
| if(offset != null){ |
| - offsets = new TermVectorOffsetInfo[1]; |
| - offsets[0] = offset; |
| - } |
| - else |
| + offsets = new TermVectorOffsetInfo[1]; |
| + offsets[0] = offset; |
| + } else |
| offsets = null; |
| + |
| } |
| } |
| Index: src/java/org/apache/lucene/index/BytePayload.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/BytePayload.java (révision 0) |
| +++ src/java/org/apache/lucene/index/BytePayload.java (révision 0) |
| @@ -0,0 +1,101 @@ |
| +package org.apache.lucene.index; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.analysis.Token; |
| +import org.apache.lucene.analysis.TokenStream; |
| + |
| +/** |
| + * A Payload is metadata that can be stored together with each occurrence |
| + * of a term. This metadata is stored inline in the posting list of the |
| + * specific term. |
| + * <p> |
| + * To store payloads in the index a {@link TokenStream} has to be used that |
| + * produces {@link Token}s containing payload data. |
| + * <p> |
| + * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)} |
| + * to retrieve the payloads from the index. |
| + */ |
| +public class BytePayload implements Payload { |
| + private byte[] data; |
| + |
| + private int offset; |
| + |
| + private int length; |
| + |
| + /** |
| + * Creates a new payload with the given array as data. |
| + * |
| + * @param data the data of this payload |
| + */ |
| + public BytePayload(byte[] data) { |
| + this(data, 0, data.length); |
| + } |
| + |
| + /** |
| + * Creates a new payload with the given array as data. |
| + * |
| + * @param data the data of this payload |
| + * @param offset the offset in the data byte array |
| + * @param length the length of the data |
| + */ |
| + public BytePayload(byte[] data, int offset, int length) { |
| + if (offset < 0 || offset + length > data.length) { |
| + throw new IllegalArgumentException(); |
| + } |
| + this.data = data; |
| + this.offset = offset; |
| + this.length = length; |
| + } |
| + |
| + public int getLength() { |
| + return this.length; |
| + } |
| + |
| + /** |
| + * Returns the byte at the given index. |
| + */ |
| + public byte byteAt(int index) { |
| + if (0 <= index && index < this.length) { |
| + return this.data[this.offset + index]; |
| + } |
| + throw new ArrayIndexOutOfBoundsException(index); |
| + } |
| + |
| + /** |
| + * Allocates a new byte array, copies the payload data into it and returns it. |
| + */ |
| + public byte[] toByteArray() { |
| + byte[] retArray = new byte[this.length]; |
| + System.arraycopy(this.data, this.offset, retArray, 0, this.length); |
| + return retArray; |
| + } |
| + |
| + /** |
| + * Copies the payload data to a byte array. |
| + * |
| + * @param target the target byte array |
| + * @param targetOffset the offset in the target byte array |
| + */ |
| + public void copyTo(byte[] target, int targetOffset) { |
| + if (this.length + targetOffset > target.length) { |
| + throw new ArrayIndexOutOfBoundsException(); |
| + } |
| + System.arraycopy(this.data, this.offset, target, targetOffset, this.length); |
| + } |
| +} |
| Index: src/java/org/apache/lucene/index/ParallelReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/ParallelReader.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/ParallelReader.java (copie de travail) |
| @@ -389,7 +389,15 @@ |
| // It is an error to call this if there is no next position, e.g. if termDocs==null |
| return ((TermPositions)termDocs).nextPosition(); |
| } |
| + |
| + public int getPayloadLength() { |
| + return ((TermPositions)termDocs).getPayloadLength(); |
| + } |
| |
| + public Payload getPayload() throws IOException { |
| + return ((TermPositions)termDocs).getPayload(); |
| + } |
| + |
| } |
| |
| } |
| Index: src/java/org/apache/lucene/index/SegmentReader.java |
| =================================================================== |
| --- src/java/org/apache/lucene/index/SegmentReader.java (révision 493447) |
| +++ src/java/org/apache/lucene/index/SegmentReader.java (copie de travail) |
| @@ -369,6 +369,9 @@ |
| else if (!fi.isIndexed && fieldOption == IndexReader.FieldOption.UNINDEXED) { |
| fieldSet.add(fi.name); |
| } |
| + else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) { |
| + fieldSet.add(fi.name); |
| + } |
| else if (fi.isIndexed && fieldOption == IndexReader.FieldOption.INDEXED) { |
| fieldSet.add(fi.name); |
| } |
| @@ -545,6 +548,10 @@ |
| return termVectorsReader.get(docNumber); |
| } |
| |
| + FieldInfos fieldInfos() { |
| + return fieldInfos; |
| + } |
| + |
| /** |
| * Return the name of the segment this reader is reading. |
| */ |
| Index: src/java/org/apache/lucene/store/RAMDirectory.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/RAMDirectory.java (révision 493447) |
| +++ src/java/org/apache/lucene/store/RAMDirectory.java (copie de travail) |
| @@ -17,16 +17,18 @@ |
| * limitations under the License. |
| */ |
| |
| +import java.io.File; |
| +import java.io.FileNotFoundException; |
| import java.io.IOException; |
| -import java.io.FileNotFoundException; |
| -import java.io.File; |
| import java.io.Serializable; |
| import java.util.Collection; |
| -import java.util.Enumeration; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.Set; |
| |
| +import org.apache.lucene.index.DefaultIndexFormat; |
| +import org.apache.lucene.index.IndexFormat; |
| + |
| /** |
| * A memory-resident {@link Directory} implementation. Locking |
| * implementation is by default the {@link SingleInstanceLockFactory} |
| @@ -47,8 +49,22 @@ |
| // Lock acquisition sequence: RAMDirectory, then RAMFile |
| // ***** |
| |
| - /** Constructs an empty {@link Directory}. */ |
| + |
| + /** |
| + * Constructs an empty {@link Directory}. |
| + * The index format used is the default one. |
| + */ |
| public RAMDirectory() { |
| + this(new DefaultIndexFormat()); |
| + } |
| + |
| + /** |
| + * Constructor. |
| + * |
| + * @param indexFormat the format of the index |
| + */ |
| + public RAMDirectory(IndexFormat indexFormat) { |
| + this.indexFormat = indexFormat; |
| setLockFactory(new SingleInstanceLockFactory()); |
| } |
| |
| @@ -69,11 +85,11 @@ |
| * @exception IOException if an error occurs |
| */ |
| public RAMDirectory(Directory dir) throws IOException { |
| - this(dir, false); |
| + this(dir, false, new DefaultIndexFormat()); |
| } |
| - |
| - private RAMDirectory(Directory dir, boolean closeDir) throws IOException { |
| - this(); |
| + |
| + private RAMDirectory(Directory dir, boolean closeDir, IndexFormat indexFormat) throws IOException { |
| + this(indexFormat); |
| final String[] files = dir.list(); |
| byte[] buf = new byte[BufferedIndexOutput.BUFFER_SIZE]; |
| for (int i = 0; i < files.length; i++) { |
| @@ -107,7 +123,7 @@ |
| * @see #RAMDirectory(Directory) |
| */ |
| public RAMDirectory(File dir) throws IOException { |
| - this(FSDirectory.getDirectory(dir, false), true); |
| + this(FSDirectory.getDirectory(dir, false), true, new DefaultIndexFormat()); |
| } |
| |
| /** |
| @@ -118,7 +134,7 @@ |
| * @see #RAMDirectory(Directory) |
| */ |
| public RAMDirectory(String dir) throws IOException { |
| - this(FSDirectory.getDirectory(dir, false), true); |
| + this(FSDirectory.getDirectory(dir, false), true, new DefaultIndexFormat()); |
| } |
| |
| /** Returns an array of strings, one for each file in the directory. */ |
| Index: src/java/org/apache/lucene/store/Directory.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/Directory.java (révision 493447) |
| +++ src/java/org/apache/lucene/store/Directory.java (copie de travail) |
| @@ -19,6 +19,9 @@ |
| |
| import java.io.IOException; |
| |
| +import org.apache.lucene.index.DefaultIndexFormat; |
| +import org.apache.lucene.index.IndexFormat; |
| + |
| /** A Directory is a flat list of files. Files may be written once, when they |
| * are created. Once a file is created it may only be opened for read, or |
| * deleted. Random access is permitted both when reading and writing. |
| @@ -42,6 +45,12 @@ |
| * this Directory instance). */ |
| protected LockFactory lockFactory; |
| |
| + protected IndexFormat indexFormat = new DefaultIndexFormat(); |
| + |
| + public IndexFormat getIndexFormat() { |
| + return indexFormat; |
| + } |
| + |
| /** Returns an array of strings, one for each file in the directory. */ |
| public abstract String[] list() |
| throws IOException; |
| Index: src/java/org/apache/lucene/store/RAMOutputStream.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/RAMOutputStream.java (révision 493447) |
| +++ src/java/org/apache/lucene/store/RAMOutputStream.java (copie de travail) |
| @@ -66,7 +66,7 @@ |
| file.setLength(0); |
| } |
| |
| - public void flushBuffer(byte[] src, int len) throws IOException { |
| + public void flushBuffer(byte[] src, int offset, int len) throws IOException { |
| byte[] buffer; |
| int bufferPos = 0; |
| while (bufferPos != len) { |
| @@ -81,7 +81,7 @@ |
| else |
| buffer = (byte[]) file.buffers.get(bufferNumber); |
| |
| - System.arraycopy(src, bufferPos, buffer, bufferOffset, bytesToCopy); |
| + System.arraycopy(src, offset + bufferPos, buffer, bufferOffset, bytesToCopy); |
| bufferPos += bytesToCopy; |
| pointer += bytesToCopy; |
| } |
| Index: src/java/org/apache/lucene/store/IndexOutput.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/IndexOutput.java (révision 493447) |
| +++ src/java/org/apache/lucene/store/IndexOutput.java (copie de travail) |
| @@ -36,8 +36,18 @@ |
| * @param length the number of bytes to write |
| * @see IndexInput#readBytes(byte[],int,int) |
| */ |
| - public abstract void writeBytes(byte[] b, int length) throws IOException; |
| + public void writeBytes(byte[] b, int length) throws IOException { |
| + writeBytes(b, 0, length); |
| + } |
| |
| + /** Writes an array of bytes. |
| + * @param b the bytes to write |
| + * @param offset the offset in the byte array |
| + * @param length the number of bytes to write |
| + * @see IndexInput#readBytes(byte[],int,int) |
| + */ |
| + public abstract void writeBytes(byte[] b, int offset, int length) throws IOException; |
| + |
| /** Writes an int as four bytes. |
| * @see IndexInput#readInt() |
| */ |
| Index: src/java/org/apache/lucene/store/BufferedIndexOutput.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/BufferedIndexOutput.java (révision 493447) |
| +++ src/java/org/apache/lucene/store/BufferedIndexOutput.java (copie de travail) |
| @@ -24,8 +24,8 @@ |
| static final int BUFFER_SIZE = 1024; |
| |
| private final byte[] buffer = new byte[BUFFER_SIZE]; |
| - private long bufferStart = 0; // position in file of buffer |
| - private int bufferPosition = 0; // position in buffer |
| + private long bufferStart = 0; // position in file of buffer |
| + private int bufferPosition = 0; // position in buffer |
| |
| /** Writes a single byte. |
| * @see IndexInput#readByte() |
| @@ -41,12 +41,12 @@ |
| * @param length the number of bytes to write |
| * @see IndexInput#readBytes(byte[],int,int) |
| */ |
| - public void writeBytes(byte[] b, int length) throws IOException { |
| + public void writeBytes(byte[] b, int offset, int length) throws IOException { |
| int bytesLeft = BUFFER_SIZE - bufferPosition; |
| // is there enough space in the buffer? |
| if (bytesLeft >= length) { |
| // we add the data to the end of the buffer |
| - System.arraycopy(b, 0, buffer, bufferPosition, length); |
| + System.arraycopy(b, offset, buffer, bufferPosition, length); |
| bufferPosition += length; |
| // if the buffer is full, flush it |
| if (BUFFER_SIZE - bufferPosition == 0) |
| @@ -66,7 +66,7 @@ |
| int pieceLength; |
| while (pos < length) { |
| pieceLength = (length - pos < bytesLeft) ? length - pos : bytesLeft; |
| - System.arraycopy(b, pos, buffer, bufferPosition, pieceLength); |
| + System.arraycopy(b, pos + offset, buffer, bufferPosition, pieceLength); |
| pos += pieceLength; |
| bufferPosition += pieceLength; |
| // if the buffer is full, flush it |
| @@ -92,8 +92,18 @@ |
| * @param b the bytes to write |
| * @param len the number of bytes to write |
| */ |
| - protected abstract void flushBuffer(byte[] b, int len) throws IOException; |
| + private void flushBuffer(byte[] b, int len) throws IOException { |
| + flushBuffer(b, 0, len); |
| + } |
| |
| + /** Expert: implements buffer write. Writes bytes at the current position in |
| + * the output. |
| + * @param b the bytes to write |
| + * @param offset the offset in the byte array |
| + * @param len the number of bytes to write |
| + */ |
| + protected abstract void flushBuffer(byte[] b, int offset, int len) throws IOException; |
| + |
| /** Closes this stream to further operations. */ |
| public void close() throws IOException { |
| flush(); |
| Index: src/java/org/apache/lucene/store/FSDirectory.java |
| =================================================================== |
| --- src/java/org/apache/lucene/store/FSDirectory.java (révision 493447) |
| +++ src/java/org/apache/lucene/store/FSDirectory.java (copie de travail) |
| @@ -26,7 +26,9 @@ |
| import java.security.NoSuchAlgorithmException; |
| import java.util.Hashtable; |
| |
| +import org.apache.lucene.index.DefaultIndexFormat; |
| import org.apache.lucene.index.IndexFileNameFilter; |
| +import org.apache.lucene.index.IndexFormat; |
| |
| /** |
| * Straightforward implementation of {@link Directory} as a directory of files. |
| @@ -180,9 +182,28 @@ |
| * @param create if true, create, or erase any existing contents. |
| * @param lockFactory instance of {@link LockFactory} providing the |
| * locking implementation. |
| - * @return the FSDirectory for the named file. */ |
| + * @return the FSDirectory for the named file. |
| + * @throws IOException */ |
| public static FSDirectory getDirectory(File file, boolean create, |
| LockFactory lockFactory, boolean doRemoveOldFiles) |
| + throws IOException { |
| + return getDirectory(file, create, lockFactory, doRemoveOldFiles, new DefaultIndexFormat()); |
| + } |
| + |
| + /** Returns the directory instance for the named location, using the |
| + * provided LockFactory implementation. |
| + * |
| + * <p>Directories are cached, so that, for a given canonical path, the same |
| + * FSDirectory instance will always be returned. This permits |
| + * synchronization on directories. |
| + * |
| + * @param file the path to the directory. |
| + * @param create if true, create, or erase any existing contents. |
| + * @param lockFactory instance of {@link LockFactory} providing the |
| + * locking implementation. |
| + * @return the FSDirectory for the named file. */ |
| + public static FSDirectory getDirectory(File file, boolean create, LockFactory lockFactory, |
| + boolean doRemoveOldFiles, IndexFormat indexFormat) |
| throws IOException { |
| file = new File(file.getCanonicalPath()); |
| FSDirectory dir; |
| @@ -194,7 +215,7 @@ |
| } catch (Exception e) { |
| throw new RuntimeException("cannot load FSDirectory class: " + e.toString(), e); |
| } |
| - dir.init(file, create, lockFactory, doRemoveOldFiles); |
| + dir.init(file, create, lockFactory, doRemoveOldFiles, indexFormat); |
| DIRECTORIES.put(file, dir); |
| } else { |
| |
| @@ -243,8 +264,11 @@ |
| throw new IOException(path + " not a directory"); |
| } |
| |
| - private void init(File path, boolean create, LockFactory lockFactory, boolean doRemoveOldFiles) throws IOException { |
| + private void init(File path, boolean create, LockFactory lockFactory, boolean doRemoveOldFiles, |
| + IndexFormat indexFormat) throws IOException { |
| |
| + this.indexFormat = indexFormat; |
| + |
| // Set up lockFactory with cascaded defaults: if an instance was passed in, |
| // use that; else if locks are disabled, use NoLockFactory; else if the |
| // system property org.apache.lucene.store.FSDirectoryLockFactoryClass is set, |
| @@ -592,8 +616,8 @@ |
| } |
| |
| /** output methods: */ |
| - public void flushBuffer(byte[] b, int size) throws IOException { |
| - file.write(b, 0, size); |
| + public void flushBuffer(byte[] b, int offset, int size) throws IOException { |
| + file.write(b, offset, size); |
| } |
| public void close() throws IOException { |
| // only close the file if it has not been closed yet |
| Index: src/site/src/documentation/content/xdocs/fileformats.xml |
| =================================================================== |
| --- src/site/src/documentation/content/xdocs/fileformats.xml (révision 493447) |
| +++ src/site/src/documentation/content/xdocs/fileformats.xml (copie de travail) |
| @@ -798,16 +798,20 @@ |
| |
| <p> |
| <b>Pre-2.1:</b> |
| + <code> |
| Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize> |
| <sup>SegCount</sup> |
| + </code> |
| </p> |
| <p> |
| <b>2.1 and above:</b> |
| + <code> |
| Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize, DelGen, NumField, NormGen |
| <sup>NumField</sup> |
| > |
| <sup>SegCount</sup> |
| , IsCompoundFile |
| + </code> |
| </p> |
| |
| <p> |
| @@ -1002,6 +1006,7 @@ |
| <li>If the third lowest-order bit is set (0x04), term positions are stored with the term vectors.</li> |
| <li>If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors.</li> |
| <li>If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field.</li> |
| + <li>If the sixth lowest-order bit is set (0x20), payloads are being stored for the indexed field.</li> |
| </ul> |
| </p> |
| |
| @@ -1287,9 +1292,9 @@ |
| <sup>DocFreq/SkipInterval</sup> |
| </p> |
| <p>SkipDatum --> |
| - DocSkip,FreqSkip,ProxSkip |
| + DocSkip,PayloadLength?,FreqSkip,ProxSkip |
| </p> |
| - <p>DocDelta,Freq,DocSkip,FreqSkip,ProxSkip --> |
| + <p>DocDelta,Freq,DocSkip,PayloadLength,FreqSkip,ProxSkip --> |
| VInt |
| </p> |
| <p>TermFreqs |
| @@ -1317,9 +1322,17 @@ |
| SkipInterval |
| <sup>th</sup> |
| document in TermFreqs. |
| - Document numbers are represented as differences |
| - from the previous value in the sequence. FreqSkip |
| - and ProxSkip record the position of every |
| + If payloads are disabled for the term's field, |
| + then DocSkip represents the difference from the |
| + previous value in the sequence. |
| + If payloads are enabled for the term's field, |
| + then DocSkip/2 represents the difference from the |
| + previous value in the sequence. If payloads are enabled |
| + and DocSkip is odd, |
| + then PayloadLength is stored indicating the length |
| + of the last payload before the SkipInterval<sup>th</sup> |
| + document in TermPositions. |
| + FreqSkip and ProxSkip record the position of every |
| SkipInterval |
| <sup>th</sup> |
| entry in FreqFile and |
| @@ -1368,12 +1381,22 @@ |
| <sup>DocFreq</sup> |
| </p> |
| <p>Positions --> |
| - <PositionDelta> |
| + <PositionDelta,Payload?> |
| <sup>Freq</sup> |
| </p> |
| + <p>Payload --> |
| + <PayloadLength?,PayloadData> |
| + </p> |
| <p>PositionDelta --> |
| VInt |
| </p> |
| + <p>PayloadLength --> |
| + VInt |
| + </p> |
| + <p>PayloadData --> |
| + byte<sup>PayloadLength</sup> |
| + </p> |
| + |
| <p>TermPositions |
| are ordered by term (the term is implicit, from the .tis file). |
| </p> |
| @@ -1382,19 +1405,30 @@ |
| number is implicit from the .frq file). |
| </p> |
| <p>PositionDelta |
| - is the difference between the position of the current occurrence in |
| + is, if payloads are disabled for the term's field, the difference |
| + between the position of the current occurrence in |
| the document and the previous occurrence (or zero, if this is the |
| first occurrence in this document). |
| + If payloads are enabled for the term's field, then PositionDelta/2 |
| + is the difference between the current and the previous position. If |
| + payloads are enabled and PositionDelta is odd, then PayloadLength is |
| + stored, indicating the length of the payload at the current term position. |
| </p> |
| <p> |
| For example, the TermPositions for a |
| term which occurs as the fourth term in one document, and as the |
| fifth and ninth term in a subsequent document, would be the following |
| - sequence of VInts: |
| + sequence of VInts (payloads disabled): |
| </p> |
| <p>4, |
| 5, 4 |
| </p> |
| + <p>PayloadData |
| + is metadata associated with the current term position. If PayloadLength |
| + is stored at the current position, then it indicates the length of this |
| + Payload. If PayloadLength is not stored, then this Payload has the same |
| + length as the Payload at the previous position. |
| + </p> |
| </section> |
| <section id="Normalization Factors"><title>Normalization Factors</title> |
| <p>There's a norm file for each indexed field with a byte for |