Index: src/test/org/apache/lucene/store/MockRAMOutputStream.java
===================================================================
--- src/test/org/apache/lucene/store/MockRAMOutputStream.java (revision 493447)
+++ src/test/org/apache/lucene/store/MockRAMOutputStream.java (working copy)
@@ -48,7 +48,7 @@
}
}
- public void flushBuffer(byte[] src, int len) throws IOException {
+ public void flushBuffer(byte[] src, int offset, int len) throws IOException {
long freeSpace = dir.maxSize - dir.sizeInBytes();
long realUsage = 0;
@@ -63,14 +63,14 @@
if (dir.maxSize != 0 && freeSpace <= len) {
if (freeSpace > 0 && freeSpace < len) {
realUsage += freeSpace;
- super.flushBuffer(src, (int) freeSpace);
+ super.flushBuffer(src, offset, (int) freeSpace);
}
if (realUsage > dir.maxUsedSize) {
dir.maxUsedSize = realUsage;
}
throw new IOException("fake disk full at " + dir.sizeInBytes() + " bytes");
} else {
- super.flushBuffer(src, len);
+ super.flushBuffer(src, offset, len);
}
if (first) {
Index: src/test/org/apache/lucene/index/TestPayloads.java
===================================================================
--- src/test/org/apache/lucene/index/TestPayloads.java (revision 0)
+++ src/test/org/apache/lucene/index/TestPayloads.java (revision 0)
@@ -0,0 +1,416 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.RAMDirectory;
+
+
+public class TestPayloads extends TestCase {
+
+ // Simple tests to test the Payload class
+ public void testPayload() throws Exception {
+ byte[] testData = "This is a test!".getBytes();
+ BytePayload payload = new BytePayload(testData);
+ assertEquals("Wrong payload length.", testData.length, payload.getLength());
+
+ // test copyTo()
+ byte[] target = new byte[testData.length - 1];
+ try {
+ payload.copyTo(target, 0);
+ fail("Expected exception not thrown");
+ } catch (Exception expected) {
+ // expected exception
+ }
+
+ target = new byte[testData.length + 3];
+ payload.copyTo(target, 3);
+
+ for (int i = 0; i < testData.length; i++) {
+ assertEquals(testData[i], target[i + 3]);
+ }
+
+
+ // test toByteArray()
+ target = payload.toByteArray();
+ assertByteArrayEquals(testData, target);
+
+ // test byteAt()
+ for (int i = 0; i < testData.length; i++) {
+ assertEquals(payload.byteAt(i), testData[i]);
+ }
+
+ try {
+ payload.byteAt(testData.length + 1);
+ fail("Expected exception not thrown");
+ } catch (Exception expected) {
+ // expected exception
+ }
+ }
+
+ // Tests whether the DocumentWriter and SegmentMerger correctly enable the
+ // payload bit in the FieldInfo
+ public void testPayloadFieldBit() throws Exception {
+ Directory ram = new RAMDirectory();
+ PayloadAnalyzer analyzer = new PayloadAnalyzer();
+ IndexWriter writer = new IndexWriter(ram, analyzer, true);
+ Document d = new Document();
+ // this field won't have any payloads
+ d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED));
+ // this field will have payloads in all docs, however not for all term positions,
+ // so this field is used to check if the DocumentWriter correctly enables the payloads bit
+ // even if only some term positions have payloads
+ d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+ d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+ // this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
+ // enabled in only some documents
+ d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED));
+ // only add payload data for field f2
+ analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
+ writer.addDocument(d);
+ // flush
+ writer.close();
+
+ // only one segment in the index, so we can cast to SegmentReader
+ SegmentReader reader = (SegmentReader) IndexReader.open(ram);
+ FieldInfos fi = reader.fieldInfos();
+ assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
+ assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
+ assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
+ reader.close();
+
+ // now we add another document which has payloads for field f3 and verify if the SegmentMerger
+ // enabled payloads for that field
+ writer = new IndexWriter(ram, analyzer, true);
+ d = new Document();
+ d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.TOKENIZED));
+ d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+ d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.TOKENIZED));
+ d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.TOKENIZED));
+ // add payload data for field f2 and f3
+ analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
+ analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
+ writer.addDocument(d);
+ // force merge
+ writer.optimize();
+ // flush
+ writer.close();
+
+ // only one segment in the index, so we can cast to SegmentReader
+ reader = (SegmentReader) IndexReader.open(ram);
+ fi = reader.fieldInfos();
+ assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
+ assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
+ assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
+ reader.close();
+ }
+
+ // Tests whether payloads are correctly stored and loaded using both RAMDirectory and FSDirectory
+ public void testPayloadsEncoding() throws Exception {
+ // first perform the test using a RAMDirectory
+ Directory dir = new RAMDirectory();
+ performTest(dir);
+
+ // now use a FSDirectory and repeat same test
+ String dirName = "test_payloads";
+ dir = FSDirectory.getDirectory(dirName, true);
+ performTest(dir);
+ rmDir(dirName);
+ }
+
+ // builds an index with payloads in the given Directory and performs
+ // different tests to verify the payload encoding
+ private void performTest(Directory dir) throws Exception {
+ PayloadAnalyzer analyzer = new PayloadAnalyzer();
+ IndexWriter writer = new IndexWriter(dir, analyzer, true);
+
+ // should be in sync with value in TermInfosWriter
+ final int skipInterval = 16;
+
+ final int numTerms = 5;
+ final String fieldName = "f1";
+
+ int numDocs = skipInterval + 1;
+ // create content for the test documents with just a few terms
+ Term[] terms = generateTerms(fieldName, numTerms);
+ StringBuffer sb = new StringBuffer();
+ for (int i = 0; i < terms.length; i++) {
+ sb.append(terms[i].text);
+ sb.append(" ");
+ }
+ String content = sb.toString();
+
+
+ int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
+ byte[] payloadData = generateRandomData(payloadDataLength);
+
+ Document d = new Document();
+ d.add(new Field(fieldName, content, Field.Store.NO, Field.Index.TOKENIZED));
+ // add the same document multiple times to have the same payload lengths for all
+ // occurrences within two consecutive skip intervals
+ int offset = 0;
+ for (int i = 0; i < 2 * numDocs; i++) {
+ analyzer.setPayloadData("f1", payloadData, offset, 1);
+ offset += numTerms;
+ writer.addDocument(d);
+ }
+
+ // now make sure we have different payload lengths at the next skip point
+ for (int i = 0; i < numDocs; i++) {
+ analyzer.setPayloadData(fieldName, payloadData, offset, i);
+ offset += i * numTerms;
+ writer.addDocument(d);
+ }
+
+ writer.optimize();
+ // flush
+ writer.close();
+
+
+ /*
+ * Verify the index
+ * first we test if all payloads are stored correctly
+ */
+ IndexReader reader = IndexReader.open(dir);
+
+ byte[] verifyPayloadData = new byte[payloadDataLength];
+ offset = 0;
+ TermPositions[] tps = new TermPositions[numTerms];
+ for (int i = 0; i < numTerms; i++) {
+ tps[i] = reader.termPositions(terms[i]);
+ }
+
+ while (tps[0].next()) {
+ for (int i = 1; i < numTerms; i++) {
+ tps[i].next();
+ }
+ int freq = tps[0].freq();
+
+ for (int i = 0; i < freq; i++) {
+ for (int j = 0; j < numTerms; j++) {
+ tps[j].nextPosition();
+ BytePayload payload = (BytePayload) tps[j].getPayload();
+ payload.copyTo(verifyPayloadData, offset);
+ offset += tps[j].getPayloadLength();
+ }
+ }
+ }
+
+ for (int i = 0; i < numTerms; i++) {
+ tps[i].close();
+ }
+
+ assertByteArrayEquals(payloadData, verifyPayloadData);
+
+ /*
+ * test lazy skipping
+ */
+ TermPositions tp = reader.termPositions(terms[0]);
+ tp.next();
+ tp.nextPosition();
+ // now we don't read this payload
+ tp.nextPosition();
+ assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+ BytePayload payload = (BytePayload) tp.getPayload();
+ assertEquals(payload.byteAt(0), payloadData[numTerms]);
+ tp.nextPosition();
+
+ // we don't read this payload and skip to a different document
+ tp.skipTo(5);
+ tp.nextPosition();
+ assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+ payload = (BytePayload) tp.getPayload();
+ assertEquals(payload.byteAt(0), payloadData[5 * numTerms]);
+
+
+ /*
+ * Test different lengths at skip points
+ */
+ tp.seek(terms[1]);
+ tp.next();
+ tp.nextPosition();
+ assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+ tp.skipTo(skipInterval - 1);
+ tp.nextPosition();
+ assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+ tp.skipTo(2 * skipInterval - 1);
+ tp.nextPosition();
+ assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
+ tp.skipTo(3 * skipInterval - 1);
+ tp.nextPosition();
+ assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());
+
+ /*
+ * Test multiple call of getPayload()
+ */
+ tp.getPayload();
+ try {
+ // it is forbidden to call getPayload() more than once
+ // without calling nextPosition()
+ tp.getPayload();
+ fail("Expected exception not thrown");
+ } catch (Exception expected) {
+ // expected exception
+ }
+
+ reader.close();
+ }
+
+ private byte[] generateRandomData(int n) {
+ Random rnd = new Random();
+ byte[] data = new byte[n];
+ rnd.nextBytes(data);
+ return data;
+ }
+
+ private Term[] generateTerms(String fieldName, int n) {
+ int maxDigits = (int) (Math.log(n) / Math.log(10));
+ Term[] terms = new Term[n];
+ StringBuffer sb = new StringBuffer();
+ for (int i = 0; i < n; i++) {
+ sb.setLength(0);
+ sb.append("t");
+ int zeros;
+ if (i == 0) {
+   zeros = maxDigits; // Math.log(0) is -Infinity, so handle i == 0 explicitly
+ } else {
+   zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
+ }
+ for (int j = 0; j < zeros; j++) {
+ sb.append("0");
+ }
+ sb.append(i);
+ terms[i] = new Term(fieldName, sb.toString());
+ }
+ return terms;
+ }
+
+
+ private void rmDir(String dir) {
+ File fileDir = new File(dir);
+ if (fileDir.exists()) {
+ File[] files = fileDir.listFiles();
+ if (files != null) {
+ for (int i = 0; i < files.length; i++) {
+ files[i].delete();
+ }
+ }
+ fileDir.delete();
+ }
+ }
+
+
+
+ void assertByteArrayEquals(byte[] b1, byte[] b2) {
+ if (b1.length != b2.length) {
+ fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
+ }
+
+ for (int i = 0; i < b1.length; i++) {
+ if (b1[i] != b2[i]) {
+ fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
+ }
+ }
+ }
+
+
+ /**
+ * This Analyzer uses an WhitespaceTokenizer and PayloadFilter.
+ */
+ private static class PayloadAnalyzer extends Analyzer {
+ Map fieldToData = new HashMap();
+
+ void setPayloadData(String field, byte[] data, int offset, int length) {
+ fieldToData.put(field, new PayloadData(0, data, offset, length));
+ }
+
+ void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
+ fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
+ }
+
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ PayloadData payload = (PayloadData) fieldToData.get(fieldName);
+ TokenStream ts = new WhitespaceTokenizer(reader);
+ if (payload != null) {
+ if (payload.numFieldInstancesToSkip == 0) {
+ ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
+ } else {
+ payload.numFieldInstancesToSkip--;
+ }
+ }
+ return ts;
+ }
+
+ private static class PayloadData {
+ byte[] data;
+ int offset;
+ int length;
+ int numFieldInstancesToSkip;
+
+ PayloadData(int skip, byte[] data, int offset, int length) {
+ numFieldInstancesToSkip = skip;
+ this.data = data;
+ this.offset = offset;
+ this.length = length;
+ }
+ }
+ }
+
+
+ /**
+ * This Filter adds payloads to the tokens.
+ */
+ private static class PayloadFilter extends TokenFilter {
+ private byte[] data;
+ private int length;
+ private int offset;
+
+ public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
+ super(in);
+ this.data = data;
+ this.length = length;
+ this.offset = offset;
+ }
+
+ public Token next() throws IOException {
+ Token nextToken = input.next();
+ if (nextToken != null && offset + length <= data.length) {
+ nextToken.setPayload(new BytePayload(data, offset, length));
+ offset += length;
+ }
+
+ return nextToken;
+ }
+ }
+
+}
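
For illustration only (not part of the patch): a minimal sketch of the read side of the new API, using only calls introduced or exercised above. The class and method names are invented for this example, and dir is assumed to hold an index whose postings carry payloads.

    import org.apache.lucene.index.*;
    import org.apache.lucene.store.Directory;

    public class PayloadReadExample {
        // Walks all postings of a term and prints the first payload byte
        // at every position that carries a payload.
        public static void dumpPayloads(Directory dir, Term term) throws Exception {
            IndexReader reader = IndexReader.open(dir);
            TermPositions tp = reader.termPositions(term);
            while (tp.next()) {
                int freq = tp.freq();
                for (int i = 0; i < freq; i++) {
                    tp.nextPosition();
                    if (tp.getPayloadLength() > 0) {
                        // getPayload() may be called at most once per position
                        BytePayload payload = (BytePayload) tp.getPayload();
                        System.out.println(term + " doc=" + tp.doc()
                                + " payload[0]=" + payload.byteAt(0));
                    }
                }
            }
            tp.close();
            reader.close();
        }
    }
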
Index: src/java/org/apache/lucene/analysis/Token.java
===================================================================
--- src/java/org/apache/lucene/analysis/Token.java (revision 493447)
+++ src/java/org/apache/lucene/analysis/Token.java (working copy)
@@ -1,5 +1,8 @@
package org.apache.lucene.analysis;
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.index.TermPositions;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -20,23 +23,32 @@
/** A Token is an occurence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
-
+ <p>
The start and end offsets permit applications to re-associate a token with
its source text, e.g., to display highlighted query terms in a document
browser, or to show matching text fragments in a KWIC (KeyWord In Context)
display, etc.
-
+ <p>
The type is an interned string, assigned by a lexical analyzer
(a.k.a. tokenizer), naming the lexical or syntactic class that the token
belongs to. For example an end of sentence marker token might be implemented
- with type "eos". The default token type is "word". */
+ with type "eos". The default token type is "word".
+ <p>
+ A Token can optionally have metadata (a.k.a. Payload) in the form of a variable
+ length byte array. Use {@link TermPositions#getPayloadLength()} and
+ {@link TermPositions#getPayload()} to retrieve the payloads from the index.
+ @see org.apache.lucene.index.Payload
+ */
+
public class Token implements Cloneable {
String termText; // the text of the term
int startOffset; // start in source text
int endOffset; // end in source text
String type = "word"; // lexical type
-
+
+ Payload payload;
+
private int positionIncrement = 1;
/** Constructs a Token with the given term text, and start & end offsets.
@@ -115,6 +127,16 @@
/** Returns this Token's lexical type. Defaults to "word". */
public final String type() { return type; }
+ /** Sets this Token's payload. */
+ public void setPayload(Payload payload) {
+ this.payload = payload;
+ }
+
+ /** Returns this Token's payload. */
+ public Payload getPayload() {
+ return this.payload;
+ }
+
public String toString() {
StringBuffer sb = new StringBuffer();
sb.append("(" + termText + "," + startOffset + "," + endOffset);
Index: src/java/org/apache/lucene/index/FieldInfo.java
===================================================================
--- src/java/org/apache/lucene/index/FieldInfo.java (revision 493447)
+++ src/java/org/apache/lucene/index/FieldInfo.java (working copy)
@@ -28,9 +28,12 @@
boolean storePositionWithTermVector;
boolean omitNorms; // omit norms associated with indexed fields
+
+ boolean storePayloads; // whether this field stores payloads together with term positions
FieldInfo(String na, boolean tk, int nu, boolean storeTermVector,
- boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) {
+ boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
+ boolean omitNorms, boolean storePayloads) {
name = na;
isIndexed = tk;
number = nu;
@@ -38,5 +41,6 @@
this.storeOffsetWithTermVector = storeOffsetWithTermVector;
this.storePositionWithTermVector = storePositionWithTermVector;
this.omitNorms = omitNorms;
+ this.storePayloads = storePayloads;
}
}
Index: src/java/org/apache/lucene/index/PayloadReader.java
===================================================================
--- src/java/org/apache/lucene/index/PayloadReader.java (revision 0)
+++ src/java/org/apache/lucene/index/PayloadReader.java (revision 0)
@@ -0,0 +1,11 @@
+package org.apache.lucene.index;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexInput;
+
+public interface PayloadReader {
+
+ public Payload read(int length, IndexInput in) throws IOException;
+
+}
Index: src/java/org/apache/lucene/index/MultiReader.java
===================================================================
--- src/java/org/apache/lucene/index/MultiReader.java (revision 493447)
+++ src/java/org/apache/lucene/index/MultiReader.java (working copy)
@@ -450,5 +450,12 @@
public int nextPosition() throws IOException {
return ((TermPositions)current).nextPosition();
}
+
+ public int getPayloadLength() {
+ return ((TermPositions)current).getPayloadLength();
+ }
+
+ public Payload getPayload() throws IOException {
+ return ((TermPositions)current).getPayload();
+ }
}
Index: src/java/org/apache/lucene/index/TermPositions.java
===================================================================
--- src/java/org/apache/lucene/index/TermPositions.java (revision 493447)
+++ src/java/org/apache/lucene/index/TermPositions.java (working copy)
@@ -32,10 +32,36 @@
extends TermDocs
{
/** Returns next position in the current document. It is an error to call
- this more than {@link #freq()} times
- without calling {@link #next()}<p> This is
- invalid until {@link #next()} is called for
- the first time.
+ this more than {@link #freq()} times
+ without calling {@link #next()}<p> This is
+ invalid until {@link #next()} is called for
+ the first time.
*/
int nextPosition() throws IOException;
+
+ /** Returns the length of the payload at the current term position.
+ * This is invalid until {@link #nextPosition()} is called for
+ * the first time.
+ *
+ * @return length of the current payload in number of bytes
+ */
+ int getPayloadLength();
+
+ /** Returns the payload data at the current term position.
+ * This is invalid until {@link #nextPosition()} is called for
+ * the first time.
+ * This method must not be called more than once after each call
+ * of {@link #nextPosition()}. However, payloads are loaded lazily,
+ * so if the payload data for the current position is not needed,
+ * this method may not be called at all for performance reasons.
+ *
+ * @return the payload at the current term position
+ * @throws IOException
+ */
+ Payload getPayload() throws IOException;
}
Index: src/java/org/apache/lucene/index/IndexFormat.java
===================================================================
--- src/java/org/apache/lucene/index/IndexFormat.java (revision 0)
+++ src/java/org/apache/lucene/index/IndexFormat.java (revision 0)
@@ -0,0 +1,36 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Specifies the format of the index.
+ *
+ * The {@link PayloadReader} and {@link PayloadWriter} returned by
+ * getPayloadReader and getPayloadWriter determine how payload data is
+ * serialized to and deserialized from the index.
+ *
+ * $Id$
+ */
+public interface IndexFormat {
+
+ PayloadReader getPayloadReader();
+
+ PayloadWriter getPayloadWriter();
+}
\ No newline at end of file
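
For illustration only (not part of the patch): a sketch of a custom format for this extension point, using a fixed-width 4-byte payload. All class names here are invented. The patch shows Directory#getIndexFormat() being consumed (see SegmentTermPositions and SegmentMerger below) but not how a custom IndexFormat is registered on a Directory, so that wiring is omitted.

    import java.io.IOException;

    import org.apache.lucene.index.IndexFormat;
    import org.apache.lucene.index.Payload;
    import org.apache.lucene.index.PayloadReader;
    import org.apache.lucene.index.PayloadWriter;
    import org.apache.lucene.store.IndexInput;
    import org.apache.lucene.store.IndexOutput;

    // A payload that always holds a single 4-byte int.
    class IntPayload implements Payload {
        final int value;
        IntPayload(int value) { this.value = value; }
        public int getLength() { return 4; }
    }

    class IntPayloadReader implements PayloadReader {
        public Payload read(int length, IndexInput in) throws IOException {
            if (length != 4) {
                throw new IOException("IntPayload must be 4 bytes, got " + length);
            }
            return new IntPayload(in.readInt());
        }
    }

    class IntPayloadWriter implements PayloadWriter {
        public void write(Payload payload, IndexOutput output) throws IOException {
            output.writeInt(((IntPayload) payload).value);
        }
    }

    public class IntPayloadIndexFormat implements IndexFormat {
        private final PayloadReader reader = new IntPayloadReader();
        private final PayloadWriter writer = new IntPayloadWriter();

        public PayloadReader getPayloadReader() { return reader; }
        public PayloadWriter getPayloadWriter() { return writer; }
    }
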
Index: src/java/org/apache/lucene/index/DefaultPayloadReader.java
===================================================================
--- src/java/org/apache/lucene/index/DefaultPayloadReader.java (revision 0)
+++ src/java/org/apache/lucene/index/DefaultPayloadReader.java (revision 0)
@@ -0,0 +1,36 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexInput;
+
+/**
+ * This payload reader only supports {@link BytePayload}.
+ *
+ * $Id$
+ */
+public class DefaultPayloadReader implements PayloadReader {
+
+ public Payload read(int length, IndexInput in) throws IOException {
+ byte[] data = new byte[length];
+ in.readBytes(data, 0, length);
+ return new BytePayload(data);
+ }
+}
Index: src/java/org/apache/lucene/index/FieldInfos.java
===================================================================
--- src/java/org/apache/lucene/index/FieldInfos.java (revision 493447)
+++ src/java/org/apache/lucene/index/FieldInfos.java (working copy)
@@ -39,6 +39,7 @@
static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4;
static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8;
static final byte OMIT_NORMS = 0x10;
+ static final byte STORE_PAYLOADS = 0x20;
private ArrayList byNumber = new ArrayList();
private HashMap byName = new HashMap();
@@ -156,9 +157,29 @@
*/
public void add(String name, boolean isIndexed, boolean storeTermVector,
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) {
+ add(name, isIndexed, storeTermVector, storePositionWithTermVector,
+ storeOffsetWithTermVector, omitNorms, false);
+ }
+
+ /** If the field is not yet known, adds it. If it is known, checks to make
+ * sure that the isIndexed flag is the same as was given previously for this
+ * field. If not - marks it as being indexed. Same goes for the TermVector
+ * parameters.
+ *
+ * @param name The name of the field
+ * @param isIndexed true if the field is indexed
+ * @param storeTermVector true if the term vector should be stored
+ * @param storePositionWithTermVector true if the term vector with positions should be stored
+ * @param storeOffsetWithTermVector true if the term vector with offsets should be stored
+ * @param omitNorms true if the norms for the indexed field should be omitted
+ * @param storePayloads true if payloads should be stored for this field
+ */
+ public void add(String name, boolean isIndexed, boolean storeTermVector,
+ boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
+ boolean omitNorms, boolean storePayloads) {
FieldInfo fi = fieldInfo(name);
if (fi == null) {
- addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms);
+ addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads);
} else {
if (fi.isIndexed != isIndexed) {
fi.isIndexed = true; // once indexed, always index
@@ -175,6 +196,9 @@
if (fi.omitNorms != omitNorms) {
fi.omitNorms = false; // once norms are stored, always store
}
+ if (fi.storePayloads != storePayloads) {
+ fi.storePayloads = true;
+ }
}
}
@@ -182,10 +206,10 @@
private void addInternal(String name, boolean isIndexed,
boolean storeTermVector, boolean storePositionWithTermVector,
- boolean storeOffsetWithTermVector, boolean omitNorms) {
+ boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads) {
FieldInfo fi =
new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector,
- storeOffsetWithTermVector, omitNorms);
+ storeOffsetWithTermVector, omitNorms, storePayloads);
byNumber.add(fi);
byName.put(name, fi);
}
@@ -271,6 +295,7 @@
if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;
if (fi.omitNorms) bits |= OMIT_NORMS;
+ if (fi.storePayloads) bits |= STORE_PAYLOADS;
output.writeString(fi.name);
output.writeByte(bits);
}
@@ -286,8 +311,9 @@
boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
boolean omitNorms = (bits & OMIT_NORMS) != 0;
-
- addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms);
+ boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
+
+ addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads);
}
}
Index: src/java/org/apache/lucene/index/DefaultIndexFormat.java
===================================================================
--- src/java/org/apache/lucene/index/DefaultIndexFormat.java (revision 0)
+++ src/java/org/apache/lucene/index/DefaultIndexFormat.java (revision 0)
@@ -0,0 +1,39 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * The default implementation of the index format.
+ *
+ * $Id$
+ */
+public class DefaultIndexFormat implements IndexFormat {
+
+ private PayloadReader reader = new DefaultPayloadReader();
+
+ private PayloadWriter writer = new DefaultPayloadWriter();
+
+ public PayloadReader getPayloadReader() {
+ return reader;
+ }
+
+ public PayloadWriter getPayloadWriter() {
+ return writer;
+ }
+}
Index: src/java/org/apache/lucene/index/PayloadWriter.java
===================================================================
--- src/java/org/apache/lucene/index/PayloadWriter.java (revision 0)
+++ src/java/org/apache/lucene/index/PayloadWriter.java (revision 0)
@@ -0,0 +1,11 @@
+package org.apache.lucene.index;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexOutput;
+
+public interface PayloadWriter {
+
+ public void write(Payload payload, IndexOutput output) throws IOException;
+
+}
Index: src/java/org/apache/lucene/index/Payload.java
===================================================================
--- src/java/org/apache/lucene/index/Payload.java (revision 0)
+++ src/java/org/apache/lucene/index/Payload.java (revision 0)
@@ -0,0 +1,38 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A Payload is metadata that can be stored together with each occurrence
+ * of a term. This metadata is stored inline in the posting list of the
+ * specific term.
+ * <p>
+ * To store payloads in the index a {@link TokenStream} has to be used that
+ * produces {@link Token}s containing payload data.
+ * <p>
+ * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload()}
+ * to retrieve the payloads from the index.
+ */
+public interface Payload {
+
+ public int getLength();
+
+}
Index: src/java/org/apache/lucene/index/IndexReader.java
===================================================================
--- src/java/org/apache/lucene/index/IndexReader.java (revision 493447)
+++ src/java/org/apache/lucene/index/IndexReader.java (working copy)
@@ -65,6 +65,8 @@
public static final FieldOption ALL = new FieldOption ("ALL");
// all indexed fields
public static final FieldOption INDEXED = new FieldOption ("INDEXED");
+ // all fields that store payloads
+ public static final FieldOption STORES_PAYLOADS = new FieldOption ("STORES_PAYLOADS");
// all fields which are not indexed
public static final FieldOption UNINDEXED = new FieldOption ("UNINDEXED");
// all fields which are indexed with termvectors enables
Index: src/java/org/apache/lucene/index/DefaultPayloadWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DefaultPayloadWriter.java (revision 0)
+++ src/java/org/apache/lucene/index/DefaultPayloadWriter.java (revision 0)
@@ -0,0 +1,37 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexOutput;
+
+/**
+ * This payload writer only supports {@link BytePayload}.
+ *
+ * $Id$
+ */
+public class DefaultPayloadWriter implements PayloadWriter {
+
+ public void write(Payload payload, IndexOutput output) throws IOException {
+ if (!(payload instanceof BytePayload)) {
+ throw new RuntimeException("Unsupported payload of type '" + payload.getClass() + "' is not supported");
+ }
+ output.writeBytes(((BytePayload) payload).toByteArray(), payload.getLength());
+ }
+}
Index: src/java/org/apache/lucene/index/MultipleTermPositions.java
===================================================================
--- src/java/org/apache/lucene/index/MultipleTermPositions.java (revision 493447)
+++ src/java/org/apache/lucene/index/MultipleTermPositions.java (working copy)
@@ -191,5 +191,23 @@
public int read(int[] arg0, int[] arg1) throws IOException {
throw new UnsupportedOperationException();
}
+
+
+ /**
+ * Not implemented.
+ * @throws UnsupportedOperationException
+ */
+ public int getPayloadLength() {
+ throw new UnsupportedOperationException();
+ }
+
+ /**
+ * Not implemented.
+ * @throws UnsupportedOperationException
+ */
+ public Payload getPayload() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
}
Index: src/java/org/apache/lucene/index/FilterIndexReader.java
===================================================================
--- src/java/org/apache/lucene/index/FilterIndexReader.java (revision 493447)
+++ src/java/org/apache/lucene/index/FilterIndexReader.java (working copy)
@@ -62,6 +62,14 @@
public int nextPosition() throws IOException {
return ((TermPositions) this.in).nextPosition();
}
+
+ public int getPayloadLength() {
+ return ((TermPositions) this.in).getPayloadLength();
+ }
+
+ public Payload getPayload() throws IOException {
+ return ((TermPositions) this.in).getPayload();
+ }
}
/** Base class for filtering {@link TermEnum} implementations. */
Index: src/java/org/apache/lucene/index/SegmentTermPositions.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentTermPositions.java (revision 493447)
+++ src/java/org/apache/lucene/index/SegmentTermPositions.java (working copy)
@@ -27,23 +27,33 @@
private int proxCount;
private int position;
+ // the current payload length
+ private int payloadLength;
+ // indicates whether the payload of the current position has
+ // been read from the proxStream yet
+ private boolean needToLoadPayload;
+
// these variables are being used to remember information
// for a lazy skip
private long lazySkipPointer = 0;
private int lazySkipDocCount = 0;
+ private PayloadReader payloadReader;
SegmentTermPositions(SegmentReader p) {
super(p);
this.proxStream = (IndexInput)parent.proxStream.clone();
+ payloadReader = parent.directory().getIndexFormat().getPayloadReader();
}
- final void seek(TermInfo ti) throws IOException {
- super.seek(ti);
+ final void seek(TermInfo ti, Term term) throws IOException {
+ super.seek(ti, term);
if (ti != null)
lazySkipPointer = ti.proxPointer;
lazySkipDocCount = 0;
proxCount = 0;
+ payloadLength = 0;
+ needToLoadPayload = false;
}
public final void close() throws IOException {
@@ -55,8 +65,27 @@
// perform lazy skips if neccessary
lazySkip();
proxCount--;
- return position += proxStream.readVInt();
+ return position += readDeltaPosition();
}
+
+ private final int readDeltaPosition() throws IOException {
+ int delta = proxStream.readVInt();
+ if (currentFieldStoresPayloads) {
+ // if the current field stores payloads then
+ // the position delta is shifted one bit to the left.
+ // if the LSB is set, then we have to read the current
+ // payload length
+ if ((delta & 1) != 0) {
+ payloadLength = proxStream.readVInt();
+ }
+ delta >>>= 1;
+ needToLoadPayload = true;
+ } else {
+ payloadLength = 0;
+ needToLoadPayload = false;
+ }
+ return delta;
+ }
protected final void skippingDoc() throws IOException {
// we remember to skip the remaining positions of the current
@@ -82,17 +111,28 @@
/** Called by super.skipTo(). */
- protected void skipProx(long proxPointer) throws IOException {
+ protected void skipProx(long proxPointer, int payloadLength) throws IOException {
// we save the pointer, we might have to skip there lazily
lazySkipPointer = proxPointer;
lazySkipDocCount = 0;
proxCount = 0;
+ this.payloadLength = payloadLength;
+ needToLoadPayload = false;
}
private void skipPositions(int n) throws IOException {
- for (int f = n; f > 0; f--) // skip unread positions
- proxStream.readVInt();
+ for (int f = n; f > 0; f--) { // skip unread positions
+ readDeltaPosition();
+ skipPayload();
+ }
}
+
+ private void skipPayload() throws IOException {
+ if (needToLoadPayload && payloadLength > 0) {
+ proxStream.seek(proxStream.getFilePointer() + payloadLength);
+ }
+ needToLoadPayload = false;
+ }
// It is not always neccessary to move the prox pointer
// to a new document after the freq pointer has been moved.
@@ -105,6 +145,10 @@
// So we move the prox pointer lazily to the document
// as soon as positions are requested.
private void lazySkip() throws IOException {
+ // we might have to skip the current payload
+ // if it was not read yet
+ skipPayload();
+
if (lazySkipPointer != 0) {
proxStream.seek(lazySkipPointer);
lazySkipPointer = 0;
@@ -115,5 +159,32 @@
lazySkipDocCount = 0;
}
}
+
+ public int getPayloadLength() {
+ return payloadLength;
+ }
+
+ public Payload getPayload() throws IOException {
+ if (!needToLoadPayload) {
+ throw new IOException("Payload cannot be loaded more than once for the same term position.");
+ }
+ Payload payload = payloadReader.read(payloadLength, proxStream);
+ needToLoadPayload = false;
+ return payload;
+ }
}
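
For illustration only (not part of the patch): the position encoding that readDeltaPosition() above decodes, as a standalone round-trip. When a field stores payloads, the position delta is shifted left by one bit and the low bit flags that a new payload length follows as a VInt.

    public class PositionDeltaCodec {
        // Value written as a VInt before the (optional) payload length.
        static int encode(int delta, int payloadLength, int lastPayloadLength) {
            if (payloadLength == lastPayloadLength) {
                return delta << 1;       // even: previous payload length still applies
            }
            return (delta << 1) | 1;     // odd: a VInt payload length follows
        }

        public static void main(String[] args) {
            int code = encode(3, 8, 8);  // same length as before
            assert code == 6 && (code & 1) == 0 && (code >>> 1) == 3;

            code = encode(3, 10, 8);     // length changed: low bit set
            assert code == 7 && (code & 1) == 1 && (code >>> 1) == 3;
            System.out.println("round-trip ok");
        }
    }
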
Index: src/java/org/apache/lucene/index/SegmentTermDocs.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentTermDocs.java (revision 493447)
+++ src/java/org/apache/lucene/index/SegmentTermDocs.java (working copy)
@@ -39,6 +39,9 @@
private long proxPointer;
private long skipPointer;
private boolean haveSkipped;
+
+ private int payloadLengthAtLastSkip;
+ protected boolean currentFieldStoresPayloads;
protected SegmentTermDocs(SegmentReader parent) {
this.parent = parent;
@@ -49,23 +52,31 @@
public void seek(Term term) throws IOException {
TermInfo ti = parent.tis.get(term);
- seek(ti);
+ seek(ti, term);
}
public void seek(TermEnum termEnum) throws IOException {
TermInfo ti;
+ Term term;
// use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs
- if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos) // optimized case
- ti = ((SegmentTermEnum) termEnum).termInfo();
- else // punt case
- ti = parent.tis.get(termEnum.term());
-
- seek(ti);
+ if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.fieldInfos) { // optimized case
+ SegmentTermEnum segmentTermEnum = ((SegmentTermEnum) termEnum);
+ term = segmentTermEnum.term();
+ ti = segmentTermEnum.termInfo();
+ } else { // punt case
+ term = termEnum.term();
+ ti = parent.tis.get(term);
+ }
+
+ seek(ti, term);
}
- void seek(TermInfo ti) throws IOException {
+ void seek(TermInfo ti, Term term) throws IOException {
count = 0;
+ payloadLengthAtLastSkip = 0;
+ FieldInfo fi = parent.fieldInfos.fieldInfo(term.field);
+ currentFieldStoresPayloads = (fi != null) ? fi.storePayloads : false;
if (ti == null) {
df = 0;
} else {
@@ -141,7 +152,7 @@
}
/** Overridden by SegmentTermPositions to skip in prox stream. */
- protected void skipProx(long proxPointer) throws IOException {}
+ protected void skipProx(long proxPointer, int payloadLength) throws IOException {}
/** Optimized implementation. */
public boolean skipTo(int target) throws IOException {
@@ -157,6 +168,7 @@
// scan skip data
int lastSkipDoc = skipDoc;
+ int lastPayloadLength = 0;
long lastFreqPointer = freqStream.getFilePointer();
long lastProxPointer = -1;
int numSkipped = -1 - (count % skipInterval);
@@ -165,6 +177,7 @@
lastSkipDoc = skipDoc;
lastFreqPointer = freqPointer;
lastProxPointer = proxPointer;
+ lastPayloadLength = payloadLengthAtLastSkip;
if (skipDoc != 0 && skipDoc >= doc)
numSkipped += skipInterval;
@@ -172,7 +185,21 @@
if(skipCount >= numSkips)
break;
- skipDoc += skipStream.readVInt();
+ if (currentFieldStoresPayloads) {
+ // the current field stores payloads.
+ // if the doc delta is odd then we have
+ // to read the current payload length
+ // because it differs from the length of the
+ // previous payload
+ int delta = skipStream.readVInt();
+ if ((delta & 1) != 0) {
+ payloadLengthAtLastSkip = skipStream.readVInt();
+ }
+ delta >>>= 1;
+ skipDoc += delta;
+ } else {
+ skipDoc += skipStream.readVInt();
+ }
freqPointer += skipStream.readVInt();
proxPointer += skipStream.readVInt();
@@ -182,7 +209,7 @@
// if we found something to skip, then skip it
if (lastFreqPointer > freqStream.getFilePointer()) {
freqStream.seek(lastFreqPointer);
- skipProx(lastProxPointer);
+ skipProx(lastProxPointer, lastPayloadLength);
doc = lastSkipDoc;
count += numSkipped;
Index: src/java/org/apache/lucene/index/SegmentMerger.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentMerger.java (revision 493447)
+++ src/java/org/apache/lucene/index/SegmentMerger.java (working copy)
@@ -151,11 +151,11 @@
}
private void addIndexed(IndexReader reader, FieldInfos fieldInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
- boolean storeOffsetWithTermVector) throws IOException {
+ boolean storeOffsetWithTermVector, boolean storePayloads) throws IOException {
Iterator i = names.iterator();
while (i.hasNext()) {
String field = (String)i.next();
- fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field));
+ fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads);
}
}
@@ -165,15 +165,16 @@
* @throws IOException
*/
private final int mergeFields() throws IOException {
- fieldInfos = new FieldInfos(); // merge field names
+ fieldInfos = new FieldInfos(); // merge field names
int docCount = 0;
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader) readers.elementAt(i);
- addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true);
- addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false);
- addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true);
- addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false);
- addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false);
+ addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false);
+ addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false);
+ addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false);
+ addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false);
+ addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true);
+ addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false);
fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
}
fieldInfos.write(directory, segment + ".fnm");
@@ -263,7 +264,7 @@
SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader);
base += reader.numDocs();
if (smi.next())
- queue.put(smi); // initialize queue
+ queue.put(smi); // initialize queue
else
smi.close();
}
@@ -271,7 +272,7 @@
SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];
while (queue.size() > 0) {
- int matchSize = 0; // pop matching terms
+ int matchSize = 0; // pop matching terms
match[matchSize++] = (SegmentMergeInfo) queue.pop();
Term term = match[0].term;
SegmentMergeInfo top = (SegmentMergeInfo) queue.top();
@@ -281,14 +282,14 @@
top = (SegmentMergeInfo) queue.top();
}
- mergeTermInfo(match, matchSize); // add new TermInfo
+ mergeTermInfo(match, matchSize); // add new TermInfo
while (matchSize > 0) {
SegmentMergeInfo smi = match[--matchSize];
if (smi.next())
- queue.put(smi); // restore queue
+ queue.put(smi); // restore queue
else
- smi.close(); // done with a segment
+ smi.close(); // done with a segment
}
}
}
@@ -307,7 +308,7 @@
long freqPointer = freqOutput.getFilePointer();
long proxPointer = proxOutput.getFilePointer();
- int df = appendPostings(smis, n); // append posting data
+ int df = appendPostings(smis, n); // append posting data
long skipPointer = writeSkip();
@@ -317,6 +318,8 @@
termInfosWriter.add(smis[0].term, termInfo);
}
}
+
+ private byte[] payloadBuffer = null;
/** Process postings from multiple segments all positioned on the
* same term. Writes out merged entries into freqOutput and
@@ -328,9 +331,12 @@
*/
private final int appendPostings(SegmentMergeInfo[] smis, int n)
throws IOException {
+ PayloadWriter payloadWriter = directory.getIndexFormat().getPayloadWriter();
int lastDoc = 0;
- int df = 0; // number of docs w/ term
+ int df = 0; // number of docs w/ term
resetSkip();
+ boolean storePayloads = fieldInfos.fieldInfo(smis[0].term.field).storePayloads;
+ int lastPayloadLength = -1; // ensures that we write the first length
for (int i = 0; i < n; i++) {
SegmentMergeInfo smi = smis[i];
TermPositions postings = smi.getPositions();
@@ -350,24 +356,43 @@
df++;
if ((df % skipInterval) == 0) {
- bufferSkip(lastDoc);
+ bufferSkip(lastDoc, storePayloads, lastPayloadLength);
}
- int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
+ int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
lastDoc = doc;
int freq = postings.freq();
if (freq == 1) {
- freqOutput.writeVInt(docCode | 1); // write doc & freq=1
+ freqOutput.writeVInt(docCode | 1); // write doc & freq=1
} else {
- freqOutput.writeVInt(docCode); // write doc
- freqOutput.writeVInt(freq); // write frequency in doc
+ freqOutput.writeVInt(docCode); // write doc
+ freqOutput.writeVInt(freq); // write frequency in doc
}
- int lastPosition = 0; // write position deltas
+ // See DocumentWriter#writePostings(Posting[], String) for
+ // documentation about the encoding of positions and payloads.
+ int lastPosition = 0; // write position deltas
for (int j = 0; j < freq; j++) {
int position = postings.nextPosition();
- proxOutput.writeVInt(position - lastPosition);
+ int delta = position - lastPosition;
+ if (storePayloads) {
+ int payloadLength = postings.getPayloadLength();
+ if (payloadLength == lastPayloadLength) {
+ proxOutput.writeVInt(delta * 2);
+ } else {
+ proxOutput.writeVInt(delta * 2 + 1);
+ proxOutput.writeVInt(payloadLength);
+ lastPayloadLength = payloadLength;
+ }
+ if (payloadLength > 0) {
+ Payload payload = postings.getPayload();
+ payloadWriter.write(payload, proxOutput);
+ }
+ } else {
+ proxOutput.writeVInt(delta);
+ }
lastPosition = position;
}
}
@@ -377,21 +402,59 @@
private RAMOutputStream skipBuffer = new RAMOutputStream();
private int lastSkipDoc;
+ private int lastSkipPayloadLength;
private long lastSkipFreqPointer;
private long lastSkipProxPointer;
private void resetSkip() {
skipBuffer.reset();
lastSkipDoc = 0;
+ lastSkipPayloadLength = -1; // we don't have to write the first length in the skip list
lastSkipFreqPointer = freqOutput.getFilePointer();
lastSkipProxPointer = proxOutput.getFilePointer();
}
- private void bufferSkip(int doc) throws IOException {
+ private void bufferSkip(int doc, boolean storePayloads, int payloadLength) throws IOException {
long freqPointer = freqOutput.getFilePointer();
long proxPointer = proxOutput.getFilePointer();
- skipBuffer.writeVInt(doc - lastSkipDoc);
+ // To efficiently store payloads in the posting lists we do not store the length of
+ // every payload. Instead we omit the length for a payload if the previous payload had
+ // the same length.
+ // However, in order to support skipping the payload length at every skip point must be known.
+ // So we use the same length encoding that we use for the posting lists for the skip data as well:
+ // Case 1: current field does not store payloads
+ // SkipDatum --> DocSkip, FreqSkip, ProxSkip
+ // DocSkip,FreqSkip,ProxSkip --> VInt
+ // DocSkip records the document number before every SkipInterval th document in TermFreqs.
+ // Document numbers are represented as differences from the previous value in the sequence.
+ // Case 2: current field stores payloads
+ // SkipDatum --> DocSkip, PayloadLength?, FreqSkip, ProxSkip
+ // DocSkip,FreqSkip,ProxSkip --> VInt
+ // PayloadLength --> VInt
+ // In this case DocSkip/2 is the difference between
+ // the current and the previous value. If DocSkip
+ // is odd, then a PayloadLength encoded as VInt follows,
+ // if DocSkip is even, then it is assumed that the
+ // current payload length equals the length at the previous
+ // skip point
+ if (storePayloads) {
+ int delta = doc - lastSkipDoc;
+ if (payloadLength == lastSkipPayloadLength) {
+ // the current payload length equals the length at the previous skip point,
+ // so we don't store the length again
+ skipBuffer.writeVInt(delta * 2);
+ } else {
+ // the payload length is different from the previous one. We shift the DocSkip,
+ // set the lowest bit and store the current payload length as VInt.
+ skipBuffer.writeVInt(delta * 2 + 1);
+ skipBuffer.writeVInt(payloadLength);
+ lastSkipPayloadLength = payloadLength;
+ }
+ } else {
+ // current field does not store payloads
+ skipBuffer.writeVInt(doc - lastSkipDoc);
+ }
skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer));
skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer));
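
A worked example of the SkipDatum encoding described above (illustrative numbers, not from the patch): suppose a skip entry is written for doc 48, the previous skip doc was 32, and the payload length changed from 4 to 7 since the last skip point. Then DocSkip = (48 - 32) * 2 + 1 = 33 (odd, so PayloadLength = 7 follows as a VInt). Had the length still been 4, DocSkip = (48 - 32) * 2 = 32 (even, and the previous length is reused). In both cases the FreqSkip and ProxSkip file-pointer deltas follow as VInts.
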
Index: src/java/org/apache/lucene/index/DocumentWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentWriter.java (revision 493447)
+++ src/java/org/apache/lucene/index/DocumentWriter.java (working copy)
@@ -31,6 +31,7 @@
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
+import java.util.BitSet;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
@@ -69,34 +70,42 @@
final void addDocument(String segment, Document doc)
throws IOException {
- // write field names
+ // create field infos
fieldInfos = new FieldInfos();
fieldInfos.add(doc);
- fieldInfos.write(directory, segment + ".fnm");
- // write field values
- FieldsWriter fieldsWriter =
- new FieldsWriter(directory, segment, fieldInfos);
- try {
- fieldsWriter.addDocument(doc);
- } finally {
- fieldsWriter.close();
- }
-
// invert doc into postingTable
postingTable.clear(); // clear postingTable
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
fieldPositions = new int[fieldInfos.size()]; // init fieldPositions
fieldOffsets = new int[fieldInfos.size()]; // init fieldOffsets
+ fieldStoresPayloads = new BitSet(fieldInfos.size());
fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts
Arrays.fill(fieldBoosts, doc.getBoost());
+ // Before we write the FieldInfos we invert the Document. The reason is that
+ // during inversion the TokenStreams of tokenized fields are processed
+ // and we might encounter tokens that have payloads associated with them. In
+ // this case we have to update the FieldInfo of the particular field.
invertDocument(doc);
// sort postingTable into an array
Posting[] postings = sortPostingTable();
+ // write field infos
+ fieldInfos.write(directory, segment + ".fnm");
+
+ // write field values
+ FieldsWriter fieldsWriter =
+ new FieldsWriter(directory, segment, fieldInfos);
+ try {
+ fieldsWriter.addDocument(doc);
+ } finally {
+ fieldsWriter.close();
+ }
+
+
/*
for (int i = 0; i < postings.length; i++) {
Posting posting = postings[i];
@@ -125,6 +134,10 @@
private int[] fieldPositions;
private int[] fieldOffsets;
private float[] fieldBoosts;
+
+ // If any of the tokens of a particular field carry a payload
+ // then we enable payloads for that field.
+ private BitSet fieldStoresPayloads;
// Tokenizes the fields of a document into Postings.
private final void invertDocument(Document doc)
@@ -144,9 +157,9 @@
if (!field.isTokenized()) { // un-tokenized field
String stringValue = field.stringValue();
if(field.isStoreOffsetWithTermVector())
- addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
+ addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
else
- addPosition(fieldName, stringValue, position++, null);
+ addPosition(fieldName, stringValue, position++, null, null);
offset += stringValue.length();
length++;
} else
@@ -167,11 +180,20 @@
for (Token t = stream.next(); t != null; t = stream.next()) {
position += (t.getPositionIncrement() - 1);
- if(field.isStoreOffsetWithTermVector())
- addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
- else
- addPosition(fieldName, t.termText(), position++, null);
+ Payload payload = t.getPayload();
+ if (payload != null) {
+ // enable payloads for this field
+ fieldStoresPayloads.set(fieldNumber);
+ }
+ TermVectorOffsetInfo termVectorOffsetInfo;
+ if (field.isStoreOffsetWithTermVector()) {
+ termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset());
+ } else {
+ termVectorOffsetInfo = null;
+ }
+ addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo);
+
lastToken = t;
if (++length >= maxFieldLength) {
if (infoStream != null)
@@ -194,11 +216,16 @@
fieldOffsets[fieldNumber] = offset;
}
}
+
+ // update fieldInfos for all fields that have one or more tokens with payloads
+ for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i+1)) {
+ fieldInfos.fieldInfo(i).storePayloads = true;
+ }
}
private final Term termBuffer = new Term("", ""); // avoid consing
- private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
+ private final void addPosition(String field, String text, int position, Payload payload, TermVectorOffsetInfo offset) {
termBuffer.set(field, text);
//System.out.println("Offset: " + offset);
Posting ti = (Posting) postingTable.get(termBuffer);
@@ -209,9 +236,25 @@
int[] positions = ti.positions;
System.arraycopy(positions, 0, newPositions, 0, freq);
ti.positions = newPositions;
+
+ if (ti.payloads != null) {
+ // the current field stores payloads
+ Payload[] newPayloads = new Payload[freq * 2]; // grow payloads array
+ Payload[] payloads = ti.payloads;
+ System.arraycopy(payloads, 0, newPayloads, 0, payloads.length);
+ ti.payloads = newPayloads;
+ }
}
ti.positions[freq] = position; // add new position
+ if (payload != null) {
+ if (ti.payloads == null) {
+ // lazily allocate payload array
+ ti.payloads = new Payload[ti.positions.length];
+ }
+ ti.payloads[freq] = payload;
+ }
+
if (offset != null) {
if (ti.offsets.length == freq){
TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
@@ -224,7 +267,7 @@
ti.freq = freq + 1; // update frequency
} else { // word not seen before
Term term = new Term(field, text, false);
- postingTable.put(term, new Posting(term, position, offset));
+ postingTable.put(term, new Posting(term, position, payload, offset));
}
}
@@ -299,6 +342,7 @@
IndexOutput freq = null, prox = null;
TermInfosWriter tis = null;
TermVectorsWriter termVectorWriter = null;
+ PayloadWriter payloadWriter = directory.getIndexFormat().getPayloadWriter();
try {
//open files for inverse index storage
freq = directory.createOutput(segment + ".frq");
@@ -307,10 +351,31 @@
termIndexInterval);
TermInfo ti = new TermInfo();
String currentField = null;
-
+ boolean currentFieldHasPayloads = false;
+
for (int i = 0; i < postings.length; i++) {
Posting posting = postings[i];
+ // check to see if we switched to a new field
+ String termField = posting.term.field();
+ if (currentField != termField) {
+ // changing field - see if there is something to save
+ currentField = termField;
+ FieldInfo fi = fieldInfos.fieldInfo(currentField);
+ currentFieldHasPayloads = fi.storePayloads;
+ if (fi.storeTermVector) {
+ if (termVectorWriter == null) {
+ termVectorWriter =
+ new TermVectorsWriter(directory, segment, fieldInfos);
+ termVectorWriter.openDocument();
+ }
+ termVectorWriter.openField(currentField);
+
+ } else if (termVectorWriter != null) {
+ termVectorWriter.closeField();
+ }
+ }
+
// add an entry to the dictionary with pointers to prox and freq files
ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
tis.add(posting.term, ti);
@@ -326,28 +391,62 @@
int lastPosition = 0; // write positions
int[] positions = posting.positions;
+ Payload[] payloads = posting.payloads;
+ int lastPayloadLength = -1;
+
+          // The following encoding is used for positions and payloads:
+ // Case 1: current field does not store payloads
+ // Positions -> <PositionDelta>^freq
+ // PositionDelta -> VInt
+ // The PositionDelta is the difference between the current
+ // and the previous position
+ // Case 2: current field stores payloads
+ // Positions -> <PositionDelta, Payload>^freq
+ // Payload -> <PayloadLength?, PayloadData>
+ // PositionDelta -> VInt
+ // PayloadLength -> VInt
+ // PayloadData -> byte^PayloadLength
+ // In this case PositionDelta/2 is the difference between
+ // the current and the previous position. If PositionDelta
+ // is odd, then a PayloadLength encoded as VInt follows,
+ // if PositionDelta is even, then it is assumed that the
+ // length of the current Payload equals the length of the
+ // previous Payload.
for (int j = 0; j < postingFreq; j++) { // use delta-encoding
int position = positions[j];
- prox.writeVInt(position - lastPosition);
- lastPosition = position;
- }
- // check to see if we switched to a new field
- String termField = posting.term.field();
- if (currentField != termField) {
- // changing field - see if there is something to save
- currentField = termField;
- FieldInfo fi = fieldInfos.fieldInfo(currentField);
- if (fi.storeTermVector) {
- if (termVectorWriter == null) {
- termVectorWriter =
- new TermVectorsWriter(directory, segment, fieldInfos);
- termVectorWriter.openDocument();
+ int delta = position - lastPosition;
+ if (currentFieldHasPayloads) {
+ int payloadLength = 0;
+ Payload payload = null;
+ if (payloads != null) {
+ payload = payloads[j];
+ if (payload != null) {
+ payloadLength = payload.getLength();
+ }
}
- termVectorWriter.openField(currentField);
-
- } else if (termVectorWriter != null) {
- termVectorWriter.closeField();
+ if (payloadLength == lastPayloadLength) {
+ // the length of the current payload equals the length
+ // of the previous one. So we do not have to store the length
+ // again and we only shift the position delta by one bit
+ prox.writeVInt(delta * 2);
+ } else {
+ // the length of the current payload is different from the
+ // previous one. We shift the position delta, set the lowest
+ // bit and store the current payload length as VInt.
+ prox.writeVInt(delta * 2 + 1);
+ prox.writeVInt(payloadLength);
+ lastPayloadLength = payloadLength;
+ }
+ if (payloadLength > 0) {
+ // write current payload
+ payloadWriter.write(payload, prox);
+ }
+ } else {
+ // field does not store payloads, just write position delta as VInt
+ prox.writeVInt(delta);
}
+ lastPosition = position;
}
if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
@@ -394,18 +493,27 @@
Term term; // the Term
int freq; // its frequency in doc
int[] positions; // positions it occurs at
+ Payload[] payloads; // the payloads of the terms
TermVectorOffsetInfo [] offsets;
- Posting(Term t, int position, TermVectorOffsetInfo offset) {
+ Posting(Term t, int position, Payload payload, TermVectorOffsetInfo offset) {
term = t;
freq = 1;
positions = new int[1];
positions[0] = position;
+
+ if (payload != null) {
+ payloads = new Payload[1];
+ payloads[0] = payload;
+ } else
+ payloads = null;
+
if(offset != null){
- offsets = new TermVectorOffsetInfo[1];
- offsets[0] = offset;
- }
- else
+ offsets = new TermVectorOffsetInfo[1];
+ offsets[0] = offset;
+ } else
offsets = null;
+
}
}
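
To make the prox-stream encoding above concrete, here is a reader-side sketch of the inverse operation; readPositions is a hypothetical helper, while IndexInput.readVInt() and readBytes(byte[], int, int) are existing APIs:

    // Decodes one posting's positions from the prox stream for a field
    // that stores payloads. Mirrors the writer loop above: the low bit
    // of each code flags a new PayloadLength.
    static int[] readPositions(IndexInput prox, int freq) throws IOException {
      int[] positions = new int[freq];
      int position = 0;
      int payloadLength = -1;   // the first code is always odd, so this is set
      for (int j = 0; j < freq; j++) {
        int code = prox.readVInt();
        position += code >>> 1;             // upper bits hold PositionDelta
        if ((code & 1) != 0) {
          payloadLength = prox.readVInt();  // odd code: new PayloadLength
        }
        positions[j] = position;
        byte[] payloadData = new byte[payloadLength];
        prox.readBytes(payloadData, 0, payloadLength); // consume PayloadData
      }
      return positions;
    }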
Index: src/java/org/apache/lucene/index/BytePayload.java
===================================================================
--- src/java/org/apache/lucene/index/BytePayload.java (revision 0)
+++ src/java/org/apache/lucene/index/BytePayload.java (revision 0)
@@ -0,0 +1,101 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A Payload is metadata that can be stored together with each occurrence
+ * of a term. This metadata is stored inline in the posting list of the
+ * specific term.
+ * <p>
+ * To store payloads in the index a {@link TokenStream} has to be used that
+ * produces {@link Token}s containing payload data.
+ * <p>
+ * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload()}
+ * to retrieve the payloads from the index.
+ */
+public class BytePayload implements Payload {
+ private byte[] data;
+
+ private int offset;
+
+ private int length;
+
+ /**
+   * Creates a new payload with the given array as data.
+ *
+ * @param data the data of this payload
+ */
+ public BytePayload(byte[] data) {
+ this(data, 0, data.length);
+ }
+
+ /**
+   * Creates a new payload with the given array as data.
+ *
+ * @param data the data of this payload
+ * @param offset the offset in the data byte array
+ * @param length the length of the data
+ */
+ public BytePayload(byte[] data, int offset, int length) {
+ if (offset < 0 || offset + length > data.length) {
+ throw new IllegalArgumentException();
+ }
+ this.data = data;
+ this.offset = offset;
+ this.length = length;
+ }
+
+ public int getLength() {
+ return this.length;
+ }
+
+ /**
+ * Returns the byte at the given index.
+ */
+ public byte byteAt(int index) {
+ if (0 <= index && index < this.length) {
+ return this.data[this.offset + index];
+ }
+ throw new ArrayIndexOutOfBoundsException(index);
+ }
+
+ /**
+ * Allocates a new byte array, copies the payload data into it and returns it.
+ */
+ public byte[] toByteArray() {
+ byte[] retArray = new byte[this.length];
+ System.arraycopy(this.data, this.offset, retArray, 0, this.length);
+ return retArray;
+ }
+
+ /**
+ * Copies the payload data to a byte array.
+ *
+ * @param target the target byte array
+ * @param targetOffset the offset in the target byte array
+ */
+ public void copyTo(byte[] target, int targetOffset) {
+    if (this.length + targetOffset > target.length) {
+ throw new ArrayIndexOutOfBoundsException();
+ }
+ System.arraycopy(this.data, this.offset, target, targetOffset, this.length);
+ }
+}
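
A short usage sketch of the class above:

    byte[] data = "meta".getBytes();
    BytePayload payload = new BytePayload(data, 0, data.length);
    int len = payload.getLength();        // 4
    byte first = payload.byteAt(0);       // 'm'
    byte[] copy = payload.toByteArray();  // fresh copy of the 4 bytes
    byte[] target = new byte[8];
    payload.copyTo(target, 2);            // data lands at target[2..5]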
Index: src/java/org/apache/lucene/index/ParallelReader.java
===================================================================
--- src/java/org/apache/lucene/index/ParallelReader.java (revision 493447)
+++ src/java/org/apache/lucene/index/ParallelReader.java (working copy)
@@ -389,7 +389,15 @@
// It is an error to call this if there is no next position, e.g. if termDocs==null
return ((TermPositions)termDocs).nextPosition();
}
+
+ public int getPayloadLength() {
+ return ((TermPositions)termDocs).getPayloadLength();
+ }
+ public Payload getPayload() throws IOException {
+ return ((TermPositions)termDocs).getPayload();
+ }
+
}
}
Index: src/java/org/apache/lucene/index/SegmentReader.java
===================================================================
--- src/java/org/apache/lucene/index/SegmentReader.java (revision 493447)
+++ src/java/org/apache/lucene/index/SegmentReader.java (working copy)
@@ -369,6 +369,9 @@
else if (!fi.isIndexed && fieldOption == IndexReader.FieldOption.UNINDEXED) {
fieldSet.add(fi.name);
}
+ else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) {
+ fieldSet.add(fi.name);
+ }
else if (fi.isIndexed && fieldOption == IndexReader.FieldOption.INDEXED) {
fieldSet.add(fi.name);
}
@@ -545,6 +548,10 @@
return termVectorsReader.get(docNumber);
}
+ FieldInfos fieldInfos() {
+ return fieldInfos;
+ }
+
/**
* Return the name of the segment this reader is reading.
*/
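
With the new field option wired in above, a caller can list the payload-carrying fields of an open IndexReader; a sketch, assuming the STORES_PAYLOADS constant is added to IndexReader.FieldOption elsewhere in this patch:

    // Names of all fields for which at least one token stored a payload.
    Collection payloadFields =
        reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS);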
Index: src/java/org/apache/lucene/store/RAMDirectory.java
===================================================================
--- src/java/org/apache/lucene/store/RAMDirectory.java (revision 493447)
+++ src/java/org/apache/lucene/store/RAMDirectory.java (working copy)
@@ -17,16 +17,18 @@
* limitations under the License.
*/
+import java.io.File;
+import java.io.FileNotFoundException;
import java.io.IOException;
-import java.io.FileNotFoundException;
-import java.io.File;
import java.io.Serializable;
import java.util.Collection;
-import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
+import org.apache.lucene.index.DefaultIndexFormat;
+import org.apache.lucene.index.IndexFormat;
+
/**
* A memory-resident {@link Directory} implementation. Locking
* implementation is by default the {@link SingleInstanceLockFactory}
@@ -47,8 +49,22 @@
// Lock acquisition sequence: RAMDirectory, then RAMFile
// *****
- /** Constructs an empty {@link Directory}. */
+
+ /**
+ * Constructs an empty {@link Directory}.
+   * The index format used is the default one.
+ */
public RAMDirectory() {
+ this(new DefaultIndexFormat());
+ }
+
+ /**
+   * Constructs an empty {@link Directory} that uses the given index format.
+ *
+ * @param indexFormat the format of the index
+ */
+ public RAMDirectory(IndexFormat indexFormat) {
+ this.indexFormat = indexFormat;
setLockFactory(new SingleInstanceLockFactory());
}
@@ -69,11 +85,11 @@
* @exception IOException if an error occurs
*/
public RAMDirectory(Directory dir) throws IOException {
- this(dir, false);
+ this(dir, false, new DefaultIndexFormat());
}
-
- private RAMDirectory(Directory dir, boolean closeDir) throws IOException {
- this();
+
+ private RAMDirectory(Directory dir, boolean closeDir, IndexFormat indexFormat) throws IOException {
+ this(indexFormat);
final String[] files = dir.list();
byte[] buf = new byte[BufferedIndexOutput.BUFFER_SIZE];
for (int i = 0; i < files.length; i++) {
@@ -107,7 +123,7 @@
* @see #RAMDirectory(Directory)
*/
public RAMDirectory(File dir) throws IOException {
- this(FSDirectory.getDirectory(dir, false), true);
+ this(FSDirectory.getDirectory(dir, false), true, new DefaultIndexFormat());
}
/**
@@ -118,7 +134,7 @@
* @see #RAMDirectory(Directory)
*/
public RAMDirectory(String dir) throws IOException {
- this(FSDirectory.getDirectory(dir, false), true);
+ this(FSDirectory.getDirectory(dir, false), true, new DefaultIndexFormat());
}
/** Returns an array of strings, one for each file in the directory. */
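
Construction under the new overloads might look like this (IndexFormat and DefaultIndexFormat are types introduced by this patch):

    // Default format, unchanged behavior.
    RAMDirectory dir = new RAMDirectory();

    // Explicit format, e.g. to supply a custom PayloadWriter.
    RAMDirectory custom = new RAMDirectory(new DefaultIndexFormat());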
Index: src/java/org/apache/lucene/store/Directory.java
===================================================================
--- src/java/org/apache/lucene/store/Directory.java (revision 493447)
+++ src/java/org/apache/lucene/store/Directory.java (working copy)
@@ -19,6 +19,9 @@
import java.io.IOException;
+import org.apache.lucene.index.DefaultIndexFormat;
+import org.apache.lucene.index.IndexFormat;
+
/** A Directory is a flat list of files. Files may be written once, when they
* are created. Once a file is created it may only be opened for read, or
* deleted. Random access is permitted both when reading and writing.
@@ -42,6 +45,12 @@
* this Directory instance). */
protected LockFactory lockFactory;
+ protected IndexFormat indexFormat = new DefaultIndexFormat();
+
+ public IndexFormat getIndexFormat() {
+ return indexFormat;
+ }
+
/** Returns an array of strings, one for each file in the directory. */
public abstract String[] list()
throws IOException;
Index: src/java/org/apache/lucene/store/RAMOutputStream.java
===================================================================
--- src/java/org/apache/lucene/store/RAMOutputStream.java (revision 493447)
+++ src/java/org/apache/lucene/store/RAMOutputStream.java (working copy)
@@ -66,7 +66,7 @@
file.setLength(0);
}
- public void flushBuffer(byte[] src, int len) throws IOException {
+ public void flushBuffer(byte[] src, int offset, int len) throws IOException {
byte[] buffer;
int bufferPos = 0;
while (bufferPos != len) {
@@ -81,7 +81,7 @@
else
buffer = (byte[]) file.buffers.get(bufferNumber);
- System.arraycopy(src, bufferPos, buffer, bufferOffset, bytesToCopy);
+ System.arraycopy(src, offset + bufferPos, buffer, bufferOffset, bytesToCopy);
bufferPos += bytesToCopy;
pointer += bytesToCopy;
}
Index: src/java/org/apache/lucene/store/IndexOutput.java
===================================================================
--- src/java/org/apache/lucene/store/IndexOutput.java (revision 493447)
+++ src/java/org/apache/lucene/store/IndexOutput.java (working copy)
@@ -36,8 +36,18 @@
* @param length the number of bytes to write
* @see IndexInput#readBytes(byte[],int,int)
*/
- public abstract void writeBytes(byte[] b, int length) throws IOException;
+ public void writeBytes(byte[] b, int length) throws IOException {
+ writeBytes(b, 0, length);
+ }
+ /** Writes an array of bytes.
+ * @param b the bytes to write
+ * @param offset the offset in the byte array
+ * @param length the number of bytes to write
+ * @see IndexInput#readBytes(byte[],int,int)
+ */
+ public abstract void writeBytes(byte[] b, int offset, int length) throws IOException;
+
/** Writes an int as four bytes.
* @see IndexInput#readInt()
*/
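
Call-site view of the split: the two-argument form now delegates to the offset-aware one, which is the only method subclasses implement (out and buf are illustrative names):

    out.writeBytes(buf, 20);     // equivalent to writeBytes(buf, 0, 20)
    out.writeBytes(buf, 5, 20);  // writes 20 bytes starting at buf[5]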
Index: src/java/org/apache/lucene/store/BufferedIndexOutput.java
===================================================================
--- src/java/org/apache/lucene/store/BufferedIndexOutput.java (revision 493447)
+++ src/java/org/apache/lucene/store/BufferedIndexOutput.java (working copy)
@@ -24,8 +24,8 @@
static final int BUFFER_SIZE = 1024;
private final byte[] buffer = new byte[BUFFER_SIZE];
- private long bufferStart = 0; // position in file of buffer
- private int bufferPosition = 0; // position in buffer
+ private long bufferStart = 0; // position in file of buffer
+ private int bufferPosition = 0; // position in buffer
/** Writes a single byte.
* @see IndexInput#readByte()
@@ -41,12 +41,12 @@
* @param length the number of bytes to write
* @see IndexInput#readBytes(byte[],int,int)
*/
- public void writeBytes(byte[] b, int length) throws IOException {
+ public void writeBytes(byte[] b, int offset, int length) throws IOException {
int bytesLeft = BUFFER_SIZE - bufferPosition;
// is there enough space in the buffer?
if (bytesLeft >= length) {
// we add the data to the end of the buffer
- System.arraycopy(b, 0, buffer, bufferPosition, length);
+ System.arraycopy(b, offset, buffer, bufferPosition, length);
bufferPosition += length;
// if the buffer is full, flush it
if (BUFFER_SIZE - bufferPosition == 0)
@@ -66,7 +66,7 @@
int pieceLength;
while (pos < length) {
pieceLength = (length - pos < bytesLeft) ? length - pos : bytesLeft;
- System.arraycopy(b, pos, buffer, bufferPosition, pieceLength);
+ System.arraycopy(b, pos + offset, buffer, bufferPosition, pieceLength);
pos += pieceLength;
bufferPosition += pieceLength;
// if the buffer is full, flush it
@@ -92,8 +92,18 @@
* @param b the bytes to write
* @param len the number of bytes to write
*/
- protected abstract void flushBuffer(byte[] b, int len) throws IOException;
+ private void flushBuffer(byte[] b, int len) throws IOException {
+ flushBuffer(b, 0, len);
+ }
+ /** Expert: implements buffer write. Writes bytes at the current position in
+ * the output.
+ * @param b the bytes to write
+ * @param offset the offset in the byte array
+ * @param len the number of bytes to write
+ */
+ protected abstract void flushBuffer(byte[] b, int offset, int len) throws IOException;
+
/** Closes this stream to further operations. */
public void close() throws IOException {
flush();
Index: src/java/org/apache/lucene/store/FSDirectory.java
===================================================================
--- src/java/org/apache/lucene/store/FSDirectory.java (revision 493447)
+++ src/java/org/apache/lucene/store/FSDirectory.java (working copy)
@@ -26,7 +26,9 @@
import java.security.NoSuchAlgorithmException;
import java.util.Hashtable;
+import org.apache.lucene.index.DefaultIndexFormat;
import org.apache.lucene.index.IndexFileNameFilter;
+import org.apache.lucene.index.IndexFormat;
/**
* Straightforward implementation of {@link Directory} as a directory of files.
@@ -180,9 +182,28 @@
* @param create if true, create, or erase any existing contents.
* @param lockFactory instance of {@link LockFactory} providing the
* locking implementation.
- * @return the FSDirectory for the named file. */
+ * @return the FSDirectory for the named file.
+   * @throws IOException if the directory cannot be created or read */
public static FSDirectory getDirectory(File file, boolean create,
LockFactory lockFactory, boolean doRemoveOldFiles)
+ throws IOException {
+ return getDirectory(file, create, lockFactory, doRemoveOldFiles, new DefaultIndexFormat());
+ }
+
+ /** Returns the directory instance for the named location, using the
+ * provided LockFactory implementation.
+ *
+ * <p>Directories are cached, so that, for a given canonical path, the same
+ * FSDirectory instance will always be returned. This permits
+ * synchronization on directories.
+ *
+ * @param file the path to the directory.
+ * @param create if true, create, or erase any existing contents.
+ * @param lockFactory instance of {@link LockFactory} providing the
+   *        locking implementation.
+   * @param indexFormat the {@link IndexFormat} used for the index.
+   * @return the FSDirectory for the named file. */
+ public static FSDirectory getDirectory(File file, boolean create, LockFactory lockFactory,
+ boolean doRemoveOldFiles, IndexFormat indexFormat)
throws IOException {
file = new File(file.getCanonicalPath());
FSDirectory dir;
@@ -194,7 +215,7 @@
} catch (Exception e) {
throw new RuntimeException("cannot load FSDirectory class: " + e.toString(), e);
}
- dir.init(file, create, lockFactory, doRemoveOldFiles);
+ dir.init(file, create, lockFactory, doRemoveOldFiles, indexFormat);
DIRECTORIES.put(file, dir);
} else {
@@ -243,8 +264,11 @@
throw new IOException(path + " not a directory");
}
- private void init(File path, boolean create, LockFactory lockFactory, boolean doRemoveOldFiles) throws IOException {
+ private void init(File path, boolean create, LockFactory lockFactory, boolean doRemoveOldFiles,
+ IndexFormat indexFormat) throws IOException {
+ this.indexFormat = indexFormat;
+
// Set up lockFactory with cascaded defaults: if an instance was passed in,
// use that; else if locks are disabled, use NoLockFactory; else if the
// system property org.apache.lucene.store.FSDirectoryLockFactoryClass is set,
@@ -592,8 +616,8 @@
}
/** output methods: */
- public void flushBuffer(byte[] b, int size) throws IOException {
- file.write(b, 0, size);
+ public void flushBuffer(byte[] b, int offset, int size) throws IOException {
+ file.write(b, offset, size);
}
public void close() throws IOException {
// only close the file if it has not been closed yet
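
A call-site sketch for the new getDirectory overload; the path is illustrative, and passing null for the LockFactory selects the default locking as described in init() above:

    FSDirectory fsDir = FSDirectory.getDirectory(
        new File("/tmp/payload-index"), true, null, false,
        new DefaultIndexFormat());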
Index: src/site/src/documentation/content/xdocs/fileformats.xml
===================================================================
--- src/site/src/documentation/content/xdocs/fileformats.xml (revision 493447)
+++ src/site/src/documentation/content/xdocs/fileformats.xml (working copy)
@@ -798,16 +798,20 @@
<p>
<b>Pre-2.1:</b>
+ <code>
Segments --&gt; Format, Version, NameCounter, SegCount, &lt;SegName, SegSize&gt;
<sup>SegCount</sup>
+ </code>
</p>
<p>
<b>2.1 and above:</b>
+ <code>
Segments --&gt; Format, Version, NameCounter, SegCount, &lt;SegName, SegSize, DelGen, NumField, NormGen
<sup>NumField</sup>
&gt;
<sup>SegCount</sup>
, IsCompoundFile
+ </code>
</p>
<p>
@@ -1002,6 +1006,7 @@
<li>If the third lowest-order bit is set (0x04), term positions are stored with the term vectors.</li>
<li>If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors.</li>
<li>If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field.</li>
+ <li>If the sixth lowest-order bit is set (0x20), payloads are being stored for the indexed field.</li>
</ul>
</p>
@@ -1287,9 +1292,9 @@
<sup>DocFreq/SkipInterval</sup>
</p>
<p>SkipDatum --&gt;
- DocSkip,FreqSkip,ProxSkip
+ DocSkip,PayloadLength?,FreqSkip,ProxSkip
</p>
- <p>DocDelta,Freq,DocSkip,FreqSkip,ProxSkip --&gt;
+ <p>DocDelta,Freq,DocSkip,PayloadLength,FreqSkip,ProxSkip --&gt;
VInt
</p>
<p>TermFreqs
@@ -1317,9 +1322,17 @@
SkipInterval
<sup>th</sup>
document in TermFreqs.
- Document numbers are represented as differences
- from the previous value in the sequence. FreqSkip
- and ProxSkip record the position of every
+ If payloads are disabled for the term's field,
+ then DocSkip represents the difference from the
+ previous value in the sequence.
+ If payloads are enabled for the term's field,
+ then DocSkip/2 represents the difference from the
+ previous value in the sequence. If payloads are enabled
+ and DocSkip is odd,
+ then PayloadLength is stored indicating the length
+ of the last payload before the SkipInterval<sup>th</sup>
+ document in TermPositions.
+ FreqSkip and ProxSkip record the position of every
SkipInterval
<sup>th</sup>
entry in FreqFile and
@@ -1368,12 +1381,22 @@
<sup>DocFreq</sup>
</p>
<p>Positions --&gt;
- &lt;PositionDelta&gt;
+ &lt;PositionDelta,Payload?&gt;
<sup>Freq</sup>
</p>
+ <p>Payload --&gt;
+ &lt;PayloadLength?,PayloadData&gt;
+ </p>
<p>PositionDelta --&gt;
VInt
</p>
+ <p>PayloadLength --&gt;
+ VInt
+ </p>
+ <p>PayloadData --&gt;
+ byte<sup>PayloadLength</sup>
+ </p>
+
<p>TermPositions
are ordered by term (the term is implicit, from the .tis file).
</p>
@@ -1382,19 +1405,30 @@
number is implicit from the .frq file).
</p>
<p>PositionDelta
- is the difference between the position of the current occurrence in
+ is, if payloads are disabled for the term's field, the difference
+ between the position of the current occurrence in
the document and the previous occurrence (or zero, if this is the
first occurrence in this document).
+ If payloads are enabled for the term's field, then PositionDelta/2
+ is the difference between the current and the previous position. If
+ payloads are enabled and PositionDelta is odd, then PayloadLength is
+ stored, indicating the length of the payload at the current term position.
</p>
<p>
For example, the TermPositions for a
term which occurs as the fourth term in one document, and as the
fifth and ninth term in a subsequent document, would be the following
- sequence of VInts:
+ sequence of VInts (payloads disabled):
</p>
<p>4,
5, 4
</p>
+ <p>PayloadData
+ is metadata associated with the current term position. If PayloadLength
+ is stored at the current position, then it indicates the length of this
+ Payload. If PayloadLength is not stored, then this Payload has the same
+ length as the Payload at the previous position.
+ </p>
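
A worked example of this encoding (illustrative, extending the payloads-disabled example above): the same term occurring at position 4, then at positions 5 and 9 of the next document, with payloads of lengths 3, 3 and 1, would be stored as the VInts 9, 3 (delta 4, odd, new length 3), then 10 (delta 5, even, length unchanged), then 9, 1 (delta 4, odd, new length 1), each position followed by PayloadLength bytes of PayloadData.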
</section>
<section id="Normalization Factors"><title>Normalization Factors</title>
<p>There's a norm file for each indexed field with a byte for