blob: 9a844b02e37965c720d3615e71ab709b0d2cbb1a [file] [log] [blame]
package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
public class TestPayloads extends LuceneTestCase {
// Simple tests to test the Payload class
public void testPayload() throws Exception {
rnd = newRandom();
byte[] testData = "This is a test!".getBytes();
Payload payload = new Payload(testData);
assertEquals("Wrong payload length.", testData.length, payload.length());
// test copyTo()
byte[] target = new byte[testData.length - 1];
try {
payload.copyTo(target, 0);
fail("Expected exception not thrown");
} catch (Exception expected) {
// expected exception
}
target = new byte[testData.length + 3];
payload.copyTo(target, 3);
for (int i = 0; i < testData.length; i++) {
assertEquals(testData[i], target[i + 3]);
}
// test toByteArray()
target = payload.toByteArray();
assertByteArrayEquals(testData, target);
// test byteAt()
for (int i = 0; i < testData.length; i++) {
assertEquals(payload.byteAt(i), testData[i]);
}
try {
payload.byteAt(testData.length + 1);
fail("Expected exception not thrown");
} catch (Exception expected) {
// expected exception
}
Payload clone = (Payload) payload.clone();
assertEquals(payload.length(), clone.length());
for (int i = 0; i < payload.length(); i++) {
assertEquals(payload.byteAt(i), clone.byteAt(i));
}
}
// Tests whether the DocumentWriter and SegmentMerger correctly enable the
// payload bit in the FieldInfo
public void testPayloadFieldBit() throws Exception {
rnd = newRandom();
Directory ram = new RAMDirectory();
PayloadAnalyzer analyzer = new PayloadAnalyzer();
IndexWriter writer = new IndexWriter(ram, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
Document d = new Document();
// this field won't have any payloads
d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
// this field will have payloads in all docs, however not for all term positions,
// so this field is used to check if the DocumentWriter correctly enables the payloads bit
// even if only some term positions have payloads
d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
// this field is used to verify if the SegmentMerger enables payloads for a field if it has payloads
// enabled in only some documents
d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
// only add payload data for field f2
analyzer.setPayloadData("f2", 1, "somedata".getBytes(), 0, 1);
writer.addDocument(d);
// flush
writer.close();
SegmentReader reader = SegmentReader.getOnlySegmentReader(ram);
FieldInfos fi = reader.fieldInfos();
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").storePayloads);
reader.close();
// now we add another document which has payloads for field f3 and verify if the SegmentMerger
// enabled payloads for that field
writer = new IndexWriter(ram, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
d = new Document();
d.add(new Field("f1", "This field has no payloads", Field.Store.NO, Field.Index.ANALYZED));
d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
d.add(new Field("f2", "This field has payloads in all docs", Field.Store.NO, Field.Index.ANALYZED));
d.add(new Field("f3", "This field has payloads in some docs", Field.Store.NO, Field.Index.ANALYZED));
// add payload data for field f2 and f3
analyzer.setPayloadData("f2", "somedata".getBytes(), 0, 1);
analyzer.setPayloadData("f3", "somedata".getBytes(), 0, 3);
writer.addDocument(d);
// force merge
writer.optimize();
// flush
writer.close();
reader = SegmentReader.getOnlySegmentReader(ram);
fi = reader.fieldInfos();
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").storePayloads);
assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").storePayloads);
assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").storePayloads);
reader.close();
}
// Tests if payloads are correctly stored and loaded using both RamDirectory and FSDirectory
public void testPayloadsEncoding() throws Exception {
rnd = newRandom();
// first perform the test using a RAMDirectory
Directory dir = new RAMDirectory();
performTest(dir);
// now use a FSDirectory and repeat same test
File dirName = _TestUtil.getTempDir("test_payloads");
dir = FSDirectory.open(dirName);
performTest(dir);
_TestUtil.rmDir(dirName);
}
// builds an index with payloads in the given Directory and performs
// different tests to verify the payload encoding
private void performTest(Directory dir) throws Exception {
PayloadAnalyzer analyzer = new PayloadAnalyzer();
IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
// should be in sync with value in TermInfosWriter
final int skipInterval = 16;
final int numTerms = 5;
final String fieldName = "f1";
int numDocs = skipInterval + 1;
// create content for the test documents with just a few terms
Term[] terms = generateTerms(fieldName, numTerms);
StringBuffer sb = new StringBuffer();
for (int i = 0; i < terms.length; i++) {
sb.append(terms[i].text);
sb.append(" ");
}
String content = sb.toString();
int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
byte[] payloadData = generateRandomData(payloadDataLength);
Document d = new Document();
d.add(new Field(fieldName, content, Field.Store.NO, Field.Index.ANALYZED));
// add the same document multiple times to have the same payload lengths for all
// occurrences within two consecutive skip intervals
int offset = 0;
for (int i = 0; i < 2 * numDocs; i++) {
analyzer.setPayloadData(fieldName, payloadData, offset, 1);
offset += numTerms;
writer.addDocument(d);
}
// make sure we create more than one segment to test merging
writer.flush();
// now we make sure to have different payload lengths next at the next skip point
for (int i = 0; i < numDocs; i++) {
analyzer.setPayloadData(fieldName, payloadData, offset, i);
offset += i * numTerms;
writer.addDocument(d);
}
writer.optimize();
// flush
writer.close();
/*
* Verify the index
* first we test if all payloads are stored correctly
*/
IndexReader reader = IndexReader.open(dir);
byte[] verifyPayloadData = new byte[payloadDataLength];
offset = 0;
TermPositions[] tps = new TermPositions[numTerms];
for (int i = 0; i < numTerms; i++) {
tps[i] = reader.termPositions(terms[i]);
}
while (tps[0].next()) {
for (int i = 1; i < numTerms; i++) {
tps[i].next();
}
int freq = tps[0].freq();
for (int i = 0; i < freq; i++) {
for (int j = 0; j < numTerms; j++) {
tps[j].nextPosition();
tps[j].getPayload(verifyPayloadData, offset);
offset += tps[j].getPayloadLength();
}
}
}
for (int i = 0; i < numTerms; i++) {
tps[i].close();
}
assertByteArrayEquals(payloadData, verifyPayloadData);
/*
* test lazy skipping
*/
TermPositions tp = reader.termPositions(terms[0]);
tp.next();
tp.nextPosition();
// now we don't read this payload
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
byte[] payload = tp.getPayload(null, 0);
assertEquals(payload[0], payloadData[numTerms]);
tp.nextPosition();
// we don't read this payload and skip to a different document
tp.skipTo(5);
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
payload = tp.getPayload(null, 0);
assertEquals(payload[0], payloadData[5 * numTerms]);
/*
* Test different lengths at skip points
*/
tp.seek(terms[1]);
tp.next();
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
tp.skipTo(skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
tp.skipTo(2 * skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayloadLength());
tp.skipTo(3 * skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayloadLength());
/*
* Test multiple call of getPayload()
*/
tp.getPayload(null, 0);
try {
// it is forbidden to call getPayload() more than once
// without calling nextPosition()
tp.getPayload(null, 0);
fail("Expected exception not thrown");
} catch (Exception expected) {
// expected exception
}
reader.close();
// test long payload
analyzer = new PayloadAnalyzer();
writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
String singleTerm = "lucene";
d = new Document();
d.add(new Field(fieldName, singleTerm, Field.Store.NO, Field.Index.ANALYZED));
// add a payload whose length is greater than the buffer size of BufferedIndexOutput
payloadData = generateRandomData(2000);
analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
writer.addDocument(d);
writer.optimize();
// flush
writer.close();
reader = IndexReader.open(dir);
tp = reader.termPositions(new Term(fieldName, singleTerm));
tp.next();
tp.nextPosition();
verifyPayloadData = new byte[tp.getPayloadLength()];
tp.getPayload(verifyPayloadData, 0);
byte[] portion = new byte[1500];
System.arraycopy(payloadData, 100, portion, 0, 1500);
assertByteArrayEquals(portion, verifyPayloadData);
reader.close();
}
private Random rnd;
private void generateRandomData(byte[] data) {
rnd.nextBytes(data);
}
private byte[] generateRandomData(int n) {
byte[] data = new byte[n];
generateRandomData(data);
return data;
}
private Term[] generateTerms(String fieldName, int n) {
int maxDigits = (int) (Math.log(n) / Math.log(10));
Term[] terms = new Term[n];
StringBuffer sb = new StringBuffer();
for (int i = 0; i < n; i++) {
sb.setLength(0);
sb.append("t");
int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
for (int j = 0; j < zeros; j++) {
sb.append("0");
}
sb.append(i);
terms[i] = new Term(fieldName, sb.toString());
}
return terms;
}
void assertByteArrayEquals(byte[] b1, byte[] b2) {
if (b1.length != b2.length) {
fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
}
for (int i = 0; i < b1.length; i++) {
if (b1[i] != b2[i]) {
fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
}
}
}
/**
* This Analyzer uses an WhitespaceTokenizer and PayloadFilter.
*/
private static class PayloadAnalyzer extends Analyzer {
Map fieldToData = new HashMap();
void setPayloadData(String field, byte[] data, int offset, int length) {
fieldToData.put(field, new PayloadData(0, data, offset, length));
}
void setPayloadData(String field, int numFieldInstancesToSkip, byte[] data, int offset, int length) {
fieldToData.put(field, new PayloadData(numFieldInstancesToSkip, data, offset, length));
}
public TokenStream tokenStream(String fieldName, Reader reader) {
PayloadData payload = (PayloadData) fieldToData.get(fieldName);
TokenStream ts = new WhitespaceTokenizer(reader);
if (payload != null) {
if (payload.numFieldInstancesToSkip == 0) {
ts = new PayloadFilter(ts, payload.data, payload.offset, payload.length);
} else {
payload.numFieldInstancesToSkip--;
}
}
return ts;
}
private static class PayloadData {
byte[] data;
int offset;
int length;
int numFieldInstancesToSkip;
PayloadData(int skip, byte[] data, int offset, int length) {
numFieldInstancesToSkip = skip;
this.data = data;
this.offset = offset;
this.length = length;
}
}
}
/**
* This Filter adds payloads to the tokens.
*/
private static class PayloadFilter extends TokenFilter {
private byte[] data;
private int length;
private int offset;
Payload payload = new Payload();
PayloadAttribute payloadAtt;
public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
super(in);
this.data = data;
this.length = length;
this.offset = offset;
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
}
public boolean incrementToken() throws IOException {
boolean hasNext = input.incrementToken();
if (hasNext) {
if (offset + length <= data.length) {
Payload p = null;
if (p == null) {
p = new Payload();
payloadAtt.setPayload(p);
}
p.setData(data, offset, length);
offset += length;
} else {
payloadAtt.setPayload(null);
}
}
return hasNext;
}
}
public void testThreadSafety() throws Exception {
rnd = newRandom();
final int numThreads = 5;
final int numDocs = 50;
final ByteArrayPool pool = new ByteArrayPool(numThreads, 5);
Directory dir = new RAMDirectory();
final IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
final String field = "test";
Thread[] ingesters = new Thread[numThreads];
for (int i = 0; i < numThreads; i++) {
ingesters[i] = new Thread() {
public void run() {
try {
for (int j = 0; j < numDocs; j++) {
Document d = new Document();
d.add(new Field(field, new PoolingPayloadTokenStream(pool)));
writer.addDocument(d);
}
} catch (Exception e) {
e.printStackTrace();
fail(e.toString());
}
}
};
ingesters[i].start();
}
for (int i = 0; i < numThreads; i++) {
ingesters[i].join();
}
writer.close();
IndexReader reader = IndexReader.open(dir);
TermEnum terms = reader.terms();
while (terms.next()) {
TermPositions tp = reader.termPositions(terms.term());
while(tp.next()) {
int freq = tp.freq();
for (int i = 0; i < freq; i++) {
tp.nextPosition();
assertEquals(pool.bytesToString(tp.getPayload(new byte[5], 0)), terms.term().text);
}
}
tp.close();
}
terms.close();
reader.close();
assertEquals(pool.size(), numThreads);
}
private class PoolingPayloadTokenStream extends TokenStream {
private byte[] payload;
private boolean first;
private ByteArrayPool pool;
private String term;
TermAttribute termAtt;
PayloadAttribute payloadAtt;
PoolingPayloadTokenStream(ByteArrayPool pool) {
this.pool = pool;
payload = pool.get();
generateRandomData(payload);
term = pool.bytesToString(payload);
first = true;
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public boolean incrementToken() throws IOException {
if (!first) return false;
first = false;
termAtt.setTermBuffer(term);
payloadAtt.setPayload(new Payload(payload));
return true;
}
public void close() throws IOException {
pool.release(payload);
}
}
private static class ByteArrayPool {
private List pool;
ByteArrayPool(int capacity, int size) {
pool = new ArrayList();
for (int i = 0; i < capacity; i++) {
pool.add(new byte[size]);
}
}
private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result();
synchronized String bytesToString(byte[] bytes) {
String s = new String(bytes);
UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result);
try {
return new String(utf8Result.result, 0, utf8Result.length, "UTF-8");
} catch (UnsupportedEncodingException uee) {
return null;
}
}
synchronized byte[] get() {
return (byte[]) pool.remove(0);
}
synchronized void release(byte[] b) {
pool.add(b);
}
synchronized int size() {
return pool.size();
}
}
}