blob: 97f78f34c5319d62fb218c5d1874a28e591e9cab [file] [log] [blame]
/*
* Copyright 2009-2010 by The Regents of the University of California
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util;
import static org.junit.Assert.fail;
import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.SortedSet;
import java.util.TreeSet;
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.data.std.util.GrowableArray;
import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.ShortSerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
import edu.uci.ics.hyracks.storage.am.btree.OrderedIndexTestUtils;
import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
import edu.uci.ics.hyracks.storage.am.common.CheckTuple;
import edu.uci.ics.hyracks.storage.am.common.api.IIndexBulkLoader;
import edu.uci.ics.hyracks.storage.am.common.api.IIndexCursor;
import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
import edu.uci.ics.hyracks.storage.am.common.datagen.DocumentStringFieldValueGenerator;
import edu.uci.ics.hyracks.storage.am.common.datagen.IFieldValueGenerator;
import edu.uci.ics.hyracks.storage.am.common.datagen.PersonNameFieldValueGenerator;
import edu.uci.ics.hyracks.storage.am.common.datagen.SortedIntegerFieldValueGenerator;
import edu.uci.ics.hyracks.storage.am.common.datagen.TupleGenerator;
import edu.uci.ics.hyracks.storage.am.common.impls.NoOpOperationCallback;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
import edu.uci.ics.hyracks.storage.am.common.tuples.PermutingTupleReference;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndex;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexAccessor;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedListCursor;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.common.LSMInvertedIndexTestHarness;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.exceptions.OccurrenceThresholdPanicException;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.InvertedIndexSearchPredicate;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizerFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8NGramTokenFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8WordTokenFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizerFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8NGramTokenFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8WordTokenFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.LSMInvertedIndexTestContext.InvertedIndexType;
@SuppressWarnings("rawtypes")
public class LSMInvertedIndexTestUtils {
public static final int TEST_GRAM_LENGTH = 3;
public static TupleGenerator createStringDocumentTupleGen(Random rnd) throws IOException {
IFieldValueGenerator[] fieldGens = new IFieldValueGenerator[2];
fieldGens[0] = new DocumentStringFieldValueGenerator(2, 10, 10000, rnd);
fieldGens[1] = new SortedIntegerFieldValueGenerator(0);
ISerializerDeserializer[] fieldSerdes = new ISerializerDeserializer[] {
UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
TupleGenerator tupleGen = new TupleGenerator(fieldGens, fieldSerdes, 0);
return tupleGen;
}
public static TupleGenerator createPersonNamesTupleGen(Random rnd) throws IOException {
IFieldValueGenerator[] fieldGens = new IFieldValueGenerator[2];
fieldGens[0] = new PersonNameFieldValueGenerator(rnd, 0.5f);
fieldGens[1] = new SortedIntegerFieldValueGenerator(0);
ISerializerDeserializer[] fieldSerdes = new ISerializerDeserializer[] {
UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
TupleGenerator tupleGen = new TupleGenerator(fieldGens, fieldSerdes, 0);
return tupleGen;
}
private static ISerializerDeserializer[] getNonHashedIndexFieldSerdes(InvertedIndexType invIndexType)
throws IndexException {
ISerializerDeserializer[] fieldSerdes = null;
switch (invIndexType) {
case INMEMORY:
case ONDISK:
case LSM: {
fieldSerdes = new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE,
IntegerSerializerDeserializer.INSTANCE };
break;
}
case PARTITIONED_INMEMORY:
case PARTITIONED_ONDISK:
case PARTITIONED_LSM: {
// Such indexes also include the set-size for partitioning.
fieldSerdes = new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE,
ShortSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
break;
}
default: {
throw new IndexException("Unhandled inverted index type '" + invIndexType + "'.");
}
}
return fieldSerdes;
}
private static ISerializerDeserializer[] getHashedIndexFieldSerdes(InvertedIndexType invIndexType)
throws IndexException {
ISerializerDeserializer[] fieldSerdes = null;
switch (invIndexType) {
case INMEMORY:
case ONDISK:
case LSM: {
fieldSerdes = new ISerializerDeserializer[] { IntegerSerializerDeserializer.INSTANCE,
IntegerSerializerDeserializer.INSTANCE };
break;
}
case PARTITIONED_INMEMORY:
case PARTITIONED_ONDISK:
case PARTITIONED_LSM: {
// Such indexes also include the set-size for partitioning.
fieldSerdes = new ISerializerDeserializer[] { IntegerSerializerDeserializer.INSTANCE,
ShortSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
break;
}
default: {
throw new IndexException("Unhandled inverted index type '" + invIndexType + "'.");
}
}
return fieldSerdes;
}
public static LSMInvertedIndexTestContext createWordInvIndexTestContext(LSMInvertedIndexTestHarness harness,
InvertedIndexType invIndexType) throws IOException, IndexException {
ISerializerDeserializer[] fieldSerdes = getNonHashedIndexFieldSerdes(invIndexType);
ITokenFactory tokenFactory = new UTF8WordTokenFactory();
IBinaryTokenizerFactory tokenizerFactory = new DelimitedUTF8StringBinaryTokenizerFactory(true, false,
tokenFactory);
LSMInvertedIndexTestContext testCtx = LSMInvertedIndexTestContext.create(harness, fieldSerdes,
fieldSerdes.length - 1, tokenizerFactory, invIndexType);
return testCtx;
}
public static LSMInvertedIndexTestContext createHashedWordInvIndexTestContext(LSMInvertedIndexTestHarness harness,
InvertedIndexType invIndexType) throws IOException, IndexException {
ISerializerDeserializer[] fieldSerdes = getHashedIndexFieldSerdes(invIndexType);
ITokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
IBinaryTokenizerFactory tokenizerFactory = new DelimitedUTF8StringBinaryTokenizerFactory(true, false,
tokenFactory);
LSMInvertedIndexTestContext testCtx = LSMInvertedIndexTestContext.create(harness, fieldSerdes,
fieldSerdes.length - 1, tokenizerFactory, invIndexType);
return testCtx;
}
public static LSMInvertedIndexTestContext createNGramInvIndexTestContext(LSMInvertedIndexTestHarness harness,
InvertedIndexType invIndexType) throws IOException, IndexException {
ISerializerDeserializer[] fieldSerdes = getNonHashedIndexFieldSerdes(invIndexType);
ITokenFactory tokenFactory = new UTF8NGramTokenFactory();
IBinaryTokenizerFactory tokenizerFactory = new NGramUTF8StringBinaryTokenizerFactory(TEST_GRAM_LENGTH, true,
true, false, tokenFactory);
LSMInvertedIndexTestContext testCtx = LSMInvertedIndexTestContext.create(harness, fieldSerdes,
fieldSerdes.length - 1, tokenizerFactory, invIndexType);
return testCtx;
}
public static LSMInvertedIndexTestContext createHashedNGramInvIndexTestContext(LSMInvertedIndexTestHarness harness,
InvertedIndexType invIndexType) throws IOException, IndexException {
ISerializerDeserializer[] fieldSerdes = getHashedIndexFieldSerdes(invIndexType);
ITokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
IBinaryTokenizerFactory tokenizerFactory = new NGramUTF8StringBinaryTokenizerFactory(TEST_GRAM_LENGTH, true,
true, false, tokenFactory);
LSMInvertedIndexTestContext testCtx = LSMInvertedIndexTestContext.create(harness, fieldSerdes,
fieldSerdes.length - 1, tokenizerFactory, invIndexType);
return testCtx;
}
public static void bulkLoadInvIndex(LSMInvertedIndexTestContext testCtx, TupleGenerator tupleGen, int numDocs)
throws IndexException, IOException {
SortedSet<CheckTuple> tmpMemIndex = new TreeSet<CheckTuple>();
// First generate the expected index by inserting the documents one-by-one.
for (int i = 0; i < numDocs; i++) {
ITupleReference tuple = tupleGen.next();
testCtx.insertCheckTuples(tuple, tmpMemIndex);
}
ISerializerDeserializer[] fieldSerdes = testCtx.getFieldSerdes();
// Use the expected index to bulk-load the actual index.
IIndexBulkLoader bulkLoader = testCtx.getIndex().createBulkLoader(1.0f, false, numDocs);
ArrayTupleBuilder tupleBuilder = new ArrayTupleBuilder(testCtx.getFieldSerdes().length);
ArrayTupleReference tuple = new ArrayTupleReference();
Iterator<CheckTuple> checkTupleIter = tmpMemIndex.iterator();
while (checkTupleIter.hasNext()) {
CheckTuple checkTuple = checkTupleIter.next();
OrderedIndexTestUtils.createTupleFromCheckTuple(checkTuple, tupleBuilder, tuple, fieldSerdes);
bulkLoader.add(tuple);
}
bulkLoader.end();
// Add all check tuples from the temp index to the text context.
testCtx.getCheckTuples().addAll(tmpMemIndex);
}
public static void insertIntoInvIndex(LSMInvertedIndexTestContext testCtx, TupleGenerator tupleGen, int numDocs)
throws IOException, IndexException {
// InMemoryInvertedIndex only supports insert.
for (int i = 0; i < numDocs; i++) {
ITupleReference tuple = tupleGen.next();
testCtx.getIndexAccessor().insert(tuple);
testCtx.insertCheckTuples(tuple, testCtx.getCheckTuples());
}
}
public static void deleteFromInvIndex(LSMInvertedIndexTestContext testCtx, Random rnd, int numDocsToDelete)
throws HyracksDataException, IndexException {
List<ITupleReference> documentCorpus = testCtx.getDocumentCorpus();
for (int i = 0; i < numDocsToDelete && !documentCorpus.isEmpty(); i++) {
int size = documentCorpus.size();
int tupleIndex = Math.abs(rnd.nextInt()) % size;
ITupleReference deleteTuple = documentCorpus.get(tupleIndex);
testCtx.getIndexAccessor().delete(deleteTuple);
testCtx.deleteCheckTuples(deleteTuple, testCtx.getCheckTuples());
// Swap tupleIndex with last element.
documentCorpus.set(tupleIndex, documentCorpus.get(size - 1));
documentCorpus.remove(size - 1);
}
}
/**
* Compares actual and expected indexes using the rangeSearch() method of the inverted-index accessor.
*/
public static void compareActualAndExpectedIndexesRangeSearch(LSMInvertedIndexTestContext testCtx)
throws HyracksDataException, IndexException {
IInvertedIndex invIndex = (IInvertedIndex) testCtx.getIndex();
int tokenFieldCount = invIndex.getTokenTypeTraits().length;
int invListFieldCount = invIndex.getInvListTypeTraits().length;
IInvertedIndexAccessor invIndexAccessor = (IInvertedIndexAccessor) invIndex.createAccessor(
NoOpOperationCallback.INSTANCE, NoOpOperationCallback.INSTANCE);
IIndexCursor invIndexCursor = invIndexAccessor.createRangeSearchCursor();
MultiComparator tokenCmp = MultiComparator.create(invIndex.getTokenCmpFactories());
IBinaryComparatorFactory[] tupleCmpFactories = new IBinaryComparatorFactory[tokenFieldCount + invListFieldCount];
for (int i = 0; i < tokenFieldCount; i++) {
tupleCmpFactories[i] = invIndex.getTokenCmpFactories()[i];
}
for (int i = 0; i < invListFieldCount; i++) {
tupleCmpFactories[tokenFieldCount + i] = invIndex.getInvListCmpFactories()[i];
}
MultiComparator tupleCmp = MultiComparator.create(tupleCmpFactories);
RangePredicate nullPred = new RangePredicate(null, null, true, true, tokenCmp, tokenCmp);
invIndexAccessor.rangeSearch(invIndexCursor, nullPred);
// Helpers for generating a serialized inverted-list element from a CheckTuple from the expected index.
ISerializerDeserializer[] fieldSerdes = testCtx.getFieldSerdes();
ArrayTupleBuilder expectedBuilder = new ArrayTupleBuilder(fieldSerdes.length);
ArrayTupleReference expectedTuple = new ArrayTupleReference();
Iterator<CheckTuple> expectedIter = testCtx.getCheckTuples().iterator();
// Compare index elements.
try {
while (invIndexCursor.hasNext() && expectedIter.hasNext()) {
invIndexCursor.next();
ITupleReference actualTuple = invIndexCursor.getTuple();
CheckTuple expected = expectedIter.next();
OrderedIndexTestUtils.createTupleFromCheckTuple(expected, expectedBuilder, expectedTuple, fieldSerdes);
if (tupleCmp.compare(actualTuple, expectedTuple) != 0) {
fail("Index entries differ for token '" + expected.getField(0) + "'.");
}
}
if (expectedIter.hasNext()) {
fail("Indexes do not match. Actual index is missing entries.");
}
if (invIndexCursor.hasNext()) {
fail("Indexes do not match. Actual index contains too many entries.");
}
} finally {
invIndexCursor.close();
}
}
/**
* Compares actual and expected indexes by comparing their inverted-lists one by one. Exercises the openInvertedListCursor() method of the inverted-index accessor.
*/
@SuppressWarnings("unchecked")
public static void compareActualAndExpectedIndexes(LSMInvertedIndexTestContext testCtx)
throws HyracksDataException, IndexException {
IInvertedIndex invIndex = (IInvertedIndex) testCtx.getIndex();
ISerializerDeserializer[] fieldSerdes = testCtx.getFieldSerdes();
MultiComparator invListCmp = MultiComparator.create(invIndex.getInvListCmpFactories());
IInvertedIndexAccessor invIndexAccessor = (IInvertedIndexAccessor) testCtx.getIndexAccessor();
int tokenFieldCount = invIndex.getTokenTypeTraits().length;
int invListFieldCount = invIndex.getInvListTypeTraits().length;
// All tokens that were inserted into the indexes.
Iterator<Comparable> tokensIter = testCtx.getAllTokens().iterator();
// Search key for finding an inverted-list in the actual index.
ArrayTupleBuilder searchKeyBuilder = new ArrayTupleBuilder(tokenFieldCount);
ArrayTupleReference searchKey = new ArrayTupleReference();
// Cursor over inverted list from actual index.
IInvertedListCursor actualInvListCursor = invIndexAccessor.createInvertedListCursor();
// Helpers for generating a serialized inverted-list element from a CheckTuple from the expected index.
ArrayTupleBuilder expectedBuilder = new ArrayTupleBuilder(fieldSerdes.length);
// Includes the token fields.
ArrayTupleReference completeExpectedTuple = new ArrayTupleReference();
// Field permutation and permuting tuple reference to strip away token fields from completeExpectedTuple.
int[] fieldPermutation = new int[invListFieldCount];
for (int i = 0; i < fieldPermutation.length; i++) {
fieldPermutation[i] = tokenFieldCount + i;
}
PermutingTupleReference expectedTuple = new PermutingTupleReference(fieldPermutation);
// Iterate over all tokens. Find the inverted-lists in actual and expected indexes. Compare the inverted lists,
while (tokensIter.hasNext()) {
Comparable token = tokensIter.next();
// Position inverted-list iterator on expected index.
CheckTuple checkLowKey = new CheckTuple(tokenFieldCount, tokenFieldCount);
checkLowKey.appendField(token);
CheckTuple checkHighKey = new CheckTuple(tokenFieldCount, tokenFieldCount);
checkHighKey.appendField(token);
SortedSet<CheckTuple> expectedInvList = OrderedIndexTestUtils.getPrefixExpectedSubset(
testCtx.getCheckTuples(), checkLowKey, checkHighKey);
Iterator<CheckTuple> expectedInvListIter = expectedInvList.iterator();
// Position inverted-list cursor in actual index.
OrderedIndexTestUtils.createTupleFromCheckTuple(checkLowKey, searchKeyBuilder, searchKey, fieldSerdes);
invIndexAccessor.openInvertedListCursor(actualInvListCursor, searchKey);
if (actualInvListCursor.size() != expectedInvList.size()) {
fail("Actual and expected inverted lists for token '" + token.toString()
+ "' have different sizes. Actual size: " + actualInvListCursor.size() + ". Expected size: "
+ expectedInvList.size() + ".");
}
// Compare inverted-list elements.
int count = 0;
actualInvListCursor.pinPages();
try {
while (actualInvListCursor.hasNext() && expectedInvListIter.hasNext()) {
actualInvListCursor.next();
ITupleReference actual = actualInvListCursor.getTuple();
CheckTuple expected = expectedInvListIter.next();
OrderedIndexTestUtils.createTupleFromCheckTuple(expected, expectedBuilder, completeExpectedTuple,
fieldSerdes);
expectedTuple.reset(completeExpectedTuple);
if (invListCmp.compare(actual, expectedTuple) != 0) {
fail("Inverted lists of token '" + token + "' differ at position " + count + ".");
}
count++;
}
} finally {
actualInvListCursor.unpinPages();
}
}
}
/**
* Determine the expected results with the simple ScanCount algorithm.
*/
public static void getExpectedResults(int[] scanCountArray, TreeSet<CheckTuple> checkTuples,
ITupleReference searchDocument, IBinaryTokenizer tokenizer, ISerializerDeserializer tokenSerde,
IInvertedIndexSearchModifier searchModifier, List<Integer> expectedResults, InvertedIndexType invIndexType)
throws IOException {
boolean isPartitioned = false;
switch (invIndexType) {
case INMEMORY:
case ONDISK:
case LSM: {
isPartitioned = false;
break;
}
case PARTITIONED_INMEMORY:
case PARTITIONED_ONDISK:
case PARTITIONED_LSM: {
isPartitioned = true;
break;
}
}
getExpectedResults(scanCountArray, checkTuples, searchDocument, tokenizer, tokenSerde, searchModifier,
expectedResults, isPartitioned);
}
@SuppressWarnings("unchecked")
public static void getExpectedResults(int[] scanCountArray, TreeSet<CheckTuple> checkTuples,
ITupleReference searchDocument, IBinaryTokenizer tokenizer, ISerializerDeserializer tokenSerde,
IInvertedIndexSearchModifier searchModifier, List<Integer> expectedResults, boolean isPartitioned)
throws IOException {
// Reset scan count array.
Arrays.fill(scanCountArray, 0);
expectedResults.clear();
GrowableArray tokenData = new GrowableArray();
tokenizer.reset(searchDocument.getFieldData(0), searchDocument.getFieldStart(0),
searchDocument.getFieldLength(0));
// Run though tokenizer to get number of tokens.
int numQueryTokens = 0;
while (tokenizer.hasNext()) {
tokenizer.next();
numQueryTokens++;
}
short numTokensLowerBound = -1;
short numTokensUpperBound = -1;
int invListElementField = 1;
if (isPartitioned) {
numTokensLowerBound = searchModifier.getNumTokensLowerBound((short) numQueryTokens);
numTokensUpperBound = searchModifier.getNumTokensUpperBound((short) numQueryTokens);
invListElementField = 2;
}
int occurrenceThreshold = searchModifier.getOccurrenceThreshold(numQueryTokens);
tokenizer.reset(searchDocument.getFieldData(0), searchDocument.getFieldStart(0),
searchDocument.getFieldLength(0));
while (tokenizer.hasNext()) {
tokenizer.next();
IToken token = tokenizer.getToken();
tokenData.reset();
token.serializeToken(tokenData);
ByteArrayInputStream inStream = new ByteArrayInputStream(tokenData.getByteArray(), 0, tokenData.getLength());
DataInput dataIn = new DataInputStream(inStream);
Comparable tokenObj = (Comparable) tokenSerde.deserialize(dataIn);
CheckTuple lowKey;
if (numTokensLowerBound < 0) {
// Index is not partitioned, or no length filtering is possible for this search modifier.
lowKey = new CheckTuple(1, 1);
lowKey.appendField(tokenObj);
} else {
// Index is length partitioned, and search modifier supports length filtering.
lowKey = new CheckTuple(2, 2);
lowKey.appendField(tokenObj);
lowKey.appendField(Short.valueOf(numTokensLowerBound));
}
CheckTuple highKey;
if (numTokensUpperBound < 0) {
// Index is not partitioned, or no length filtering is possible for this search modifier.
highKey = new CheckTuple(1, 1);
highKey.appendField(tokenObj);
} else {
// Index is length partitioned, and search modifier supports length filtering.
highKey = new CheckTuple(2, 2);
highKey.appendField(tokenObj);
highKey.appendField(Short.valueOf(numTokensUpperBound));
}
// Get view over check tuples containing inverted-list corresponding to token.
SortedSet<CheckTuple> invList = OrderedIndexTestUtils.getPrefixExpectedSubset(checkTuples, lowKey, highKey);
Iterator<CheckTuple> invListIter = invList.iterator();
// Iterate over inverted list and update scan count array.
while (invListIter.hasNext()) {
CheckTuple checkTuple = invListIter.next();
Integer element = (Integer) checkTuple.getField(invListElementField);
scanCountArray[element]++;
}
}
// Run through scan count array, and see whether elements satisfy the given occurrence threshold.
expectedResults.clear();
for (int i = 0; i < scanCountArray.length; i++) {
if (scanCountArray[i] >= occurrenceThreshold) {
expectedResults.add(i);
}
}
}
public static void testIndexSearch(LSMInvertedIndexTestContext testCtx, TupleGenerator tupleGen, Random rnd,
int numDocQueries, int numRandomQueries, IInvertedIndexSearchModifier searchModifier, int[] scanCountArray)
throws IOException, IndexException {
IInvertedIndex invIndex = testCtx.invIndex;
IInvertedIndexAccessor accessor = (IInvertedIndexAccessor) invIndex.createAccessor(
NoOpOperationCallback.INSTANCE, NoOpOperationCallback.INSTANCE);
IBinaryTokenizer tokenizer = testCtx.getTokenizerFactory().createTokenizer();
InvertedIndexSearchPredicate searchPred = new InvertedIndexSearchPredicate(tokenizer, searchModifier);
List<ITupleReference> documentCorpus = testCtx.getDocumentCorpus();
// Project away the primary-key field.
int[] fieldPermutation = new int[] { 0 };
PermutingTupleReference searchDocument = new PermutingTupleReference(fieldPermutation);
IIndexCursor resultCursor = accessor.createSearchCursor();
int numQueries = numDocQueries + numRandomQueries;
for (int i = 0; i < numQueries; i++) {
// If number of documents in the corpus is less than numDocQueries, then replace the remaining ones with random queries.
if (i >= numDocQueries || i >= documentCorpus.size()) {
// Generate a random query.
ITupleReference randomQuery = tupleGen.next();
searchDocument.reset(randomQuery);
} else {
// Pick a random document from the corpus to use as the search query.
int queryIndex = Math.abs(rnd.nextInt() % documentCorpus.size());
searchDocument.reset(documentCorpus.get(queryIndex));
}
// Set query tuple in search predicate.
searchPred.setQueryTuple(searchDocument);
searchPred.setQueryFieldIndex(0);
resultCursor.reset();
boolean panic = false;
try {
accessor.search(resultCursor, searchPred);
} catch (OccurrenceThresholdPanicException e) {
// ignore panic queries.
panic = true;
}
try {
if (!panic) {
// Consume cursor and deserialize results so we can sort them. Some search cursors may not deliver the result sorted (e.g., LSM search cursor).
ArrayList<Integer> actualResults = new ArrayList<Integer>();
try {
while (resultCursor.hasNext()) {
resultCursor.next();
ITupleReference resultTuple = resultCursor.getTuple();
int actual = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
resultTuple.getFieldStart(0));
actualResults.add(Integer.valueOf(actual));
}
} catch (OccurrenceThresholdPanicException e) {
// Ignore panic queries.
continue;
}
Collections.sort(actualResults);
// Get expected results.
List<Integer> expectedResults = new ArrayList<Integer>();
LSMInvertedIndexTestUtils.getExpectedResults(scanCountArray, testCtx.getCheckTuples(),
searchDocument, tokenizer, testCtx.getFieldSerdes()[0], searchModifier, expectedResults,
testCtx.getInvertedIndexType());
Iterator<Integer> expectedIter = expectedResults.iterator();
Iterator<Integer> actualIter = actualResults.iterator();
while (expectedIter.hasNext() && actualIter.hasNext()) {
int expected = expectedIter.next();
int actual = actualIter.next();
if (actual != expected) {
fail("Query results do not match. Encountered: " + actual + ". Expected: " + expected + "");
}
}
if (expectedIter.hasNext()) {
fail("Query results do not match. Actual results missing.");
}
if (actualIter.hasNext()) {
fail("Query results do not match. Actual contains too many results.");
}
}
} finally {
resultCursor.close();
}
}
}
}