/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
public class TestPayloads extends LuceneTestCase {
// Simple sanity checks for payload bytes carried in a BytesRef
public void testPayload() throws Exception {
BytesRef payload = new BytesRef("This is a test!");
assertEquals("Wrong payload length.", "This is a test!".length(), payload.length);
BytesRef clone = payload.clone();
assertEquals(payload.length, clone.length);
for (int i = 0; i < payload.length; i++) {
assertEquals(payload.bytes[i + payload.offset], clone.bytes[i + clone.offset]);
}
}
// Tests whether the DocumentWriter and SegmentMerger correctly enable the
// payload bit in the FieldInfo
public void testPayloadFieldBit() throws Exception {
Directory ram = newDirectory();
PayloadAnalyzer analyzer = new PayloadAnalyzer();
IndexWriter writer = new IndexWriter(ram, newIndexWriterConfig(analyzer));
Document d = new Document();
// this field won't have any payloads
d.add(newTextField("f1", "This field has no payloads", Field.Store.NO));
// this field will have payloads in all docs, however not for all term positions,
// so this field is used to check if the DocumentWriter correctly enables the payloads bit
// even if only some term positions have payloads
d.add(newTextField("f2", "This field has payloads in all docs", Field.Store.NO));
d.add(newTextField("f2", "This field has payloads in all docs NO PAYLOAD", Field.Store.NO));
// this field is used to verify that the SegmentMerger enables payloads for a field
// when payloads are enabled in only some documents
d.add(newTextField("f3", "This field has payloads in some docs", Field.Store.NO));
// only add payload data for field f2
analyzer.setPayloadData("f2", "somedata".getBytes(StandardCharsets.UTF_8), 0, 1);
writer.addDocument(d);
// flush
writer.close();
LeafReader reader = getOnlyLeafReader(DirectoryReader.open(ram));
FieldInfos fi = reader.getFieldInfos();
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").hasPayloads());
assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").hasPayloads());
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f3").hasPayloads());
reader.close();
// now add another document which has payloads for field f3 and verify that the
// SegmentMerger enabled payloads for that field
analyzer = new PayloadAnalyzer(); // Clear payload state for each field
writer = new IndexWriter(ram, newIndexWriterConfig(analyzer)
.setOpenMode(OpenMode.CREATE));
d = new Document();
d.add(newTextField("f1", "This field has no payloads", Field.Store.NO));
d.add(newTextField("f2", "This field has payloads in all docs", Field.Store.NO));
d.add(newTextField("f2", "This field has payloads in all docs", Field.Store.NO));
d.add(newTextField("f3", "This field has payloads in some docs", Field.Store.NO));
// add payload data for field f2 and f3
analyzer.setPayloadData("f2", "somedata".getBytes(StandardCharsets.UTF_8), 0, 1);
analyzer.setPayloadData("f3", "somedata".getBytes(StandardCharsets.UTF_8), 0, 3);
writer.addDocument(d);
// force merge
writer.forceMerge(1);
// flush
writer.close();
reader = getOnlyLeafReader(DirectoryReader.open(ram));
fi = reader.getFieldInfos();
assertFalse("Payload field bit should not be set.", fi.fieldInfo("f1").hasPayloads());
assertTrue("Payload field bit should be set.", fi.fieldInfo("f2").hasPayloads());
assertTrue("Payload field bit should be set.", fi.fieldInfo("f3").hasPayloads());
reader.close();
ram.close();
}
// Tests whether payloads are correctly stored in and loaded from the index
public void testPayloadsEncoding() throws Exception {
Directory dir = newDirectory();
performTest(dir);
dir.close();
}
// builds an index with payloads in the given Directory and performs
// different tests to verify the payload encoding
private void performTest(Directory dir) throws Exception {
PayloadAnalyzer analyzer = new PayloadAnalyzer();
IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer)
.setOpenMode(OpenMode.CREATE)
.setMergePolicy(newLogMergePolicy()));
// should be in sync with the skip interval used by the postings format (historically TermInfosWriter)
final int skipInterval = 16;
final int numTerms = 5;
final String fieldName = "f1";
int numDocs = skipInterval + 1;
// create content for the test documents with just a few terms
Term[] terms = generateTerms(fieldName, numTerms);
StringBuilder sb = new StringBuilder();
for (int i = 0; i < terms.length; i++) {
sb.append(terms[i].text());
sb.append(" ");
}
String content = sb.toString();
int payloadDataLength = numTerms * numDocs * 2 + numTerms * numDocs * (numDocs - 1) / 2;
byte[] payloadData = generateRandomData(payloadDataLength);
Document d = new Document();
d.add(newTextField(fieldName, content, Field.Store.NO));
// add the same document multiple times to have the same payload lengths for all
// occurrences within two consecutive skip intervals
int offset = 0;
for (int i = 0; i < 2 * numDocs; i++) {
analyzer.setPayloadData(fieldName, payloadData, offset, 1);
offset += numTerms;
writer.addDocument(d);
}
// make sure we create more than one segment to test merging
writer.commit();
// now make sure we have different payload lengths at the next skip point
for (int i = 0; i < numDocs; i++) {
analyzer.setPayloadData(fieldName, payloadData, offset, i);
offset += i * numTerms;
writer.addDocument(d);
}
writer.forceMerge(1);
// flush
writer.close();
/*
* Verify the index
* first we test if all payloads are stored correctly
*/
IndexReader reader = DirectoryReader.open(dir);
byte[] verifyPayloadData = new byte[payloadDataLength];
offset = 0;
PostingsEnum[] tps = new PostingsEnum[numTerms];
for (int i = 0; i < numTerms; i++) {
tps[i] = MultiTerms.getTermPostingsEnum(reader,
terms[i].field(),
new BytesRef(terms[i].text()));
}
while (tps[0].nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
for (int i = 1; i < numTerms; i++) {
tps[i].nextDoc();
}
int freq = tps[0].freq();
for (int i = 0; i < freq; i++) {
for (int j = 0; j < numTerms; j++) {
tps[j].nextPosition();
BytesRef br = tps[j].getPayload();
if (br != null) {
System.arraycopy(br.bytes, br.offset, verifyPayloadData, offset, br.length);
offset += br.length;
}
}
}
}
assertByteArrayEquals(payloadData, verifyPayloadData);
/*
* test lazy skipping
*/
PostingsEnum tp = MultiTerms.getTermPostingsEnum(reader,
terms[0].field(),
new BytesRef(terms[0].text()));
tp.nextDoc();
tp.nextPosition();
// NOTE: a prior revision of this test failed to first
// call nextDoc() here:
tp.nextDoc();
// now we don't read this payload
tp.nextPosition();
BytesRef payload = tp.getPayload();
assertEquals("Wrong payload length.", 1, payload.length);
assertEquals(payload.bytes[payload.offset], payloadData[numTerms]);
tp.nextDoc();
tp.nextPosition();
// we don't read this payload and skip to a different document
tp.advance(5);
tp.nextPosition();
payload = tp.getPayload();
assertEquals("Wrong payload length.", 1, payload.length);
assertEquals(payload.bytes[payload.offset], payloadData[5 * numTerms]);
/*
* Test different lengths at skip points
*/
tp = MultiTerms.getTermPostingsEnum(reader,
terms[1].field(),
new BytesRef(terms[1].text()));
tp.nextDoc();
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayload().length);
tp.advance(skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayload().length);
tp.advance(2 * skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 1, tp.getPayload().length);
tp.advance(3 * skipInterval - 1);
tp.nextPosition();
assertEquals("Wrong payload length.", 3 * skipInterval - 2 * numDocs - 1, tp.getPayload().length);
reader.close();
// test long payload
analyzer = new PayloadAnalyzer();
writer = new IndexWriter(dir, newIndexWriterConfig(analyzer)
.setOpenMode(OpenMode.CREATE));
String singleTerm = "lucene";
d = new Document();
d.add(newTextField(fieldName, singleTerm, Field.Store.NO));
// add a payload whose length is greater than the buffer size of BufferedIndexOutput
payloadData = generateRandomData(2000);
analyzer.setPayloadData(fieldName, payloadData, 100, 1500);
writer.addDocument(d);
writer.forceMerge(1);
// flush
writer.close();
reader = DirectoryReader.open(dir);
tp = MultiTerms.getTermPostingsEnum(reader,
fieldName,
new BytesRef(singleTerm));
tp.nextDoc();
tp.nextPosition();
BytesRef br = tp.getPayload();
verifyPayloadData = new byte[br.length];
byte[] portion = new byte[1500];
System.arraycopy(payloadData, 100, portion, 0, 1500);
assertByteArrayEquals(portion, br.bytes, br.offset, br.length);
reader.close();
}
static final Charset utf8 = StandardCharsets.UTF_8;
private void generateRandomData(byte[] data) {
// this test needs the random data to be valid unicode
String s = TestUtil.randomFixedByteLengthUnicodeString(random(), data.length);
byte[] b = s.getBytes(utf8);
assert b.length == data.length;
System.arraycopy(b, 0, data, 0, b.length);
}
private byte[] generateRandomData(int n) {
byte[] data = new byte[n];
generateRandomData(data);
return data;
}
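// generates n distinct single-token terms ("t0", "t1", ...) for the given field, padding the numeric suffix with zeros where needed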
private Term[] generateTerms(String fieldName, int n) {
int maxDigits = (int) (Math.log(n) / Math.log(10));
Term[] terms = new Term[n];
StringBuilder sb = new StringBuilder();
for (int i = 0; i < n; i++) {
sb.setLength(0);
sb.append("t");
int zeros = maxDigits - (int) (Math.log(i) / Math.log(10));
for (int j = 0; j < zeros; j++) {
sb.append("0");
}
sb.append(i);
terms[i] = new Term(fieldName, sb.toString());
}
return terms;
}
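// asserts that both arrays contain exactly the same bytes, failing with the index of the first mismatch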
void assertByteArrayEquals(byte[] b1, byte[] b2) {
if (b1.length != b2.length) {
fail("Byte arrays have different lengths: " + b1.length + ", " + b2.length);
}
for (int i = 0; i < b1.length; i++) {
if (b1[i] != b2[i]) {
fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[i]);
}
}
}
void assertByteArrayEquals(byte[] b1, byte[] b2, int b2offset, int b2length) {
if (b1.length != b2length) {
fail("Byte arrays have different lengths: " + b1.length + ", " + b2length);
}
for (int i = 0; i < b1.length; i++) {
if (b1[i] != b2[b2offset+i]) {
fail("Byte arrays different at index " + i + ": " + b1[i] + ", " + b2[b2offset+i]);
}
}
}
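// holds a source byte array together with the starting offset and per-token payload length consumed by PayloadFilter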
static class PayloadData {
byte[] data;
int offset;
int length;
PayloadData(byte[] data, int offset, int length) {
this.data = data;
this.offset = offset;
this.length = length;
}
}
/**
* This Analyzer uses a MockTokenizer and a PayloadFilter.
*/
private static class PayloadAnalyzer extends Analyzer {
Map<String,PayloadData> fieldToData = new HashMap<>();
public PayloadAnalyzer() {
super(PER_FIELD_REUSE_STRATEGY);
}
public PayloadAnalyzer(String field, byte[] data, int offset, int length) {
super(PER_FIELD_REUSE_STRATEGY);
setPayloadData(field, data, offset, length);
}
void setPayloadData(String field, byte[] data, int offset, int length) {
fieldToData.put(field, new PayloadData(data, offset, length));
}
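// wrap the tokenizer in a PayloadFilter only for fields that have payload data registered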
@Override
public TokenStreamComponents createComponents(String fieldName) {
PayloadData payload = fieldToData.get(fieldName);
Tokenizer ts = new MockTokenizer(MockTokenizer.WHITESPACE, false);
TokenStream tokenStream = (payload != null) ?
new PayloadFilter(ts, fieldName, fieldToData) : ts;
return new TokenStreamComponents(ts, tokenStream);
}
}
/**
* This Filter adds payloads to the tokens.
*/
private static class PayloadFilter extends TokenFilter {
PayloadAttribute payloadAtt;
CharTermAttribute termAttribute;
private Map<String,PayloadData> fieldToData;
private String fieldName;
private PayloadData payloadData;
private int offset;
public PayloadFilter(TokenStream in, String fieldName, Map<String,PayloadData> fieldToData) {
super(in);
this.fieldToData = fieldToData;
this.fieldName = fieldName;
payloadAtt = addAttribute(PayloadAttribute.class);
termAttribute = addAttribute(CharTermAttribute.class);
}
@Override
public boolean incrementToken() throws IOException {
boolean hasNext = input.incrementToken();
if (!hasNext) {
return false;
}
// Some term occurrences within the same field get payloads and others (those ending in "NO PAYLOAD") do not
if (offset + payloadData.length <= payloadData.data.length && !termAttribute.toString().endsWith("NO PAYLOAD")) {
BytesRef p = new BytesRef(payloadData.data, offset, payloadData.length);
payloadAtt.setPayload(p);
offset += payloadData.length;
} else {
payloadAtt.setPayload(null);
}
return true;
}
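// re-read the per-field payload data on reset so that values set via setPayloadData take effect for the next use of the stream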
@Override
public void reset() throws IOException {
super.reset();
this.payloadData = fieldToData.get(fieldName);
this.offset = payloadData.offset;
}
}
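// multiple threads index single-token documents whose payload bytes equal the term bytes; after indexing, every payload read back must match its term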
public void testThreadSafety() throws Exception {
final int numThreads = 5;
final int numDocs = atLeast(50);
final ByteArrayPool pool = new ByteArrayPool(numThreads, 5);
Directory dir = newDirectory();
final IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
final String field = "test";
Thread[] ingesters = new Thread[numThreads];
for (int i = 0; i < numThreads; i++) {
ingesters[i] = new Thread() {
@Override
public void run() {
try {
for (int j = 0; j < numDocs; j++) {
Document d = new Document();
d.add(new TextField(field, new PoolingPayloadTokenStream(pool)));
writer.addDocument(d);
}
} catch (Exception e) {
e.printStackTrace();
fail(e.toString());
}
}
};
ingesters[i].start();
}
for (int i = 0; i < numThreads; i++) {
ingesters[i].join();
}
writer.close();
IndexReader reader = DirectoryReader.open(dir);
TermsEnum terms = MultiTerms.getTerms(reader, field).iterator();
PostingsEnum tp = null;
while (terms.next() != null) {
String termText = terms.term().utf8ToString();
tp = terms.postings(tp, PostingsEnum.PAYLOADS);
while (tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int freq = tp.freq();
for (int i = 0; i < freq; i++) {
tp.nextPosition();
final BytesRef payload = tp.getPayload();
assertEquals(termText, payload.utf8ToString());
}
}
}
reader.close();
dir.close();
assertEquals(numThreads, pool.size());
}
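// a single-token stream whose term text and payload are the same random bytes, borrowed from the pool and released again in close()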
private class PoolingPayloadTokenStream extends TokenStream {
private byte[] payload;
private boolean first;
private ByteArrayPool pool;
private String term;
CharTermAttribute termAtt;
PayloadAttribute payloadAtt;
PoolingPayloadTokenStream(ByteArrayPool pool) {
this.pool = pool;
payload = pool.get();
generateRandomData(payload);
term = new String(payload, 0, payload.length, utf8);
first = true;
payloadAtt = addAttribute(PayloadAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
}
@Override
public boolean incrementToken() throws IOException {
if (!first) return false;
first = false;
clearAttributes();
termAtt.append(term);
payloadAtt.setPayload(new BytesRef(payload));
return true;
}
@Override
public void close() throws IOException {
pool.release(payload);
}
}
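// a small synchronized pool of fixed-size byte arrays shared by the ingest threads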
private static class ByteArrayPool {
private List<byte[]> pool;
ByteArrayPool(int capacity, int size) {
pool = new ArrayList<>();
for (int i = 0; i < capacity; i++) {
pool.add(new byte[size]);
}
}
synchronized byte[] get() {
return pool.remove(0);
}
synchronized void release(byte[] b) {
pool.add(b);
}
synchronized int size() {
return pool.size();
}
}
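// indexes fields that may or may not receive payloads (MockAnalyzer decides per field) with two separate writers, then force-merges; this must succeed without errors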
public void testAcrossFields() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir,
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true));
Document doc = new Document();
doc.add(new TextField("hasMaybepayload", "here we go", Field.Store.YES));
writer.addDocument(doc);
writer.close();
writer = new RandomIndexWriter(random(), dir,
new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true));
doc = new Document();
doc.add(new TextField("hasMaybepayload2", "here we go", Field.Store.YES));
writer.addDocument(doc);
writer.addDocument(doc);
writer.forceMerge(1);
writer.close();
dir.close();
}
/** some docs have payload att, some not */
public void testMixupDocs() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(null);
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, iwc);
Document doc = new Document();
Field field = new TextField("field", "", Field.Store.NO);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("here we go"));
field.setTokenStream(ts);
doc.add(field);
writer.addDocument(doc);
Token withPayload = new Token("withPayload", 0, 11);
withPayload.setPayload(new BytesRef("test"));
ts = new CannedTokenStream(withPayload);
assertTrue(ts.hasAttribute(PayloadAttribute.class));
field.setTokenStream(ts);
writer.addDocument(doc);
ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("another"));
field.setTokenStream(ts);
writer.addDocument(doc);
DirectoryReader reader = writer.getReader();
TermsEnum te = MultiTerms.getTerms(reader, "field").iterator();
assertTrue(te.seekExact(new BytesRef("withPayload")));
PostingsEnum de = te.postings(null, PostingsEnum.PAYLOADS);
de.nextDoc();
de.nextPosition();
assertEquals(new BytesRef("test"), de.getPayload());
writer.close();
reader.close();
dir.close();
}
/** some field instances have payload att, some not */
public void testMixupMultiValued() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
Document doc = new Document();
Field field = new TextField("field", "", Field.Store.NO);
TokenStream ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("here we go"));
field.setTokenStream(ts);
doc.add(field);
Field field2 = new TextField("field", "", Field.Store.NO);
Token withPayload = new Token("withPayload", 0, 11);
withPayload.setPayload(new BytesRef("test"));
ts = new CannedTokenStream(withPayload);
assertTrue(ts.hasAttribute(PayloadAttribute.class));
field2.setTokenStream(ts);
doc.add(field2);
Field field3 = new TextField("field", "", Field.Store.NO);
ts = new MockTokenizer(MockTokenizer.WHITESPACE, true);
((Tokenizer)ts).setReader(new StringReader("nopayload"));
field3.setTokenStream(ts);
doc.add(field3);
writer.addDocument(doc);
DirectoryReader reader = writer.getReader();
LeafReader sr = getOnlyLeafReader(reader);
PostingsEnum de = sr.postings(new Term("field", "withPayload"), PostingsEnum.PAYLOADS);
de.nextDoc();
de.nextPosition();
assertEquals(new BytesRef("test"), de.getPayload());
writer.close();
reader.close();
dir.close();
}
}