blob: 1299fcb8e819e4892ad3c474e126594ee204c89d [file] [log] [blame]
/*
* Copyright 2009-2010 by The Regents of the University of California
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.hyracks.storage.am.invertedindex;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
import edu.uci.ics.hyracks.storage.am.common.api.PageAllocationException;
import edu.uci.ics.hyracks.storage.am.common.api.TreeIndexException;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.FixedSizeElementInvertedListBuilder;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.OccurrenceThresholdPanicException;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.SearchResultCursor;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.TOccurrenceSearcher;
import edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers.ConjunctiveSearchModifier;
import edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers.JaccardSearchModifier;
import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.UTF8WordTokenFactory;
/**
* The purpose of this test is to evaluate the performance of searches against
* an inverted index. First, we generate random <token, id> pairs sorted on
* token, which are bulk loaded into an inverted index. Next, we build random
* queries from a list of predefined tokens in the index, and measure the
* performance of executing them with different search modifiers. We test the
* ConjunctiveSearchModifier and the JaccardSearchModifier.
*
*/
public class SearchPerfTest extends AbstractInvIndexSearchTest {
protected List<String> tokens = new ArrayList<String>();
@Before
public void start() throws Exception {
super.start();
tokenFactory = new UTF8WordTokenFactory();
tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false,
tokenFactory);
searcher = new TOccurrenceSearcher(taskCtx, invIndex, tokenizer);
resultCursor = new SearchResultCursor(
searcher.createResultFrameTupleAccessor(),
searcher.createResultTupleReference());
loadData();
}
public void loadData() throws HyracksDataException, TreeIndexException, PageAllocationException {
tokens.add("compilers");
tokens.add("computer");
tokens.add("databases");
tokens.add("fast");
tokens.add("hyracks");
tokens.add("major");
tokens.add("science");
tokens.add("systems");
tokens.add("university");
for (int i = 0; i < tokens.size(); i++) {
checkInvLists.add(new ArrayList<Integer>());
}
// for generating length-skewed inverted lists
int addProb = 0;
int addProbStep = 10;
IInvertedListBuilder invListBuilder = new FixedSizeElementInvertedListBuilder(
invListTypeTraits);
InvertedIndex.BulkLoadContext ctx = invIndex.beginBulkLoad(
invListBuilder, HYRACKS_FRAME_SIZE, BTree.DEFAULT_FILL_FACTOR);
int totalElements = 0;
for (int i = 0; i < tokens.size(); i++) {
addProb += addProbStep * (i + 1);
for (int j = 0; j < maxId; j++) {
if ((Math.abs(rnd.nextInt()) % addProb) == 0) {
totalElements++;
tb.reset();
UTF8StringSerializerDeserializer.INSTANCE.serialize(
tokens.get(i), dos);
tb.addFieldEndOffset();
IntegerSerializerDeserializer.INSTANCE.serialize(j, dos);
tb.addFieldEndOffset();
checkInvLists.get(i).add(j);
appender.reset(frame, true);
appender.append(tb.getFieldEndOffsets(), tb.getByteArray(),
0, tb.getSize());
tuple.reset(accessor, 0);
try {
invIndex.bulkLoadAddTuple(ctx, tuple);
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
invIndex.endBulkLoad(ctx);
}
/**
* Determine the expected results with the ScanCount algorithm. The
* ScanCount algorithm is very simple, so we can be confident the results
* are correct.
*
*/
protected void fillExpectedResults(int[] queryTokenIndexes,
int numQueryTokens, int occurrenceThreshold) {
// reset scan count array
for (int i = 0; i < maxId; i++) {
scanCountArray[i] = 0;
}
// count occurrences
for (int i = 0; i < numQueryTokens; i++) {
ArrayList<Integer> list = checkInvLists.get(queryTokenIndexes[i]);
for (int j = 0; j < list.size(); j++) {
scanCountArray[list.get(j)]++;
}
}
// check threshold
expectedResults.clear();
for (int i = 0; i < maxId; i++) {
if (scanCountArray[i] >= occurrenceThreshold) {
expectedResults.add(i);
}
}
}
/**
* Generates a specified number of queries. Each query consists of a set of
* randomly chosen tokens that are picked from the pre-defined set of
* tokens. We run each query, measure it's time, and verify it's results
* against the results produced by ScanCount, implemented in
* fillExpectedResults().
*
*/
private void runQueries(IInvertedIndexSearchModifier searchModifier,
int numQueries) throws Exception {
rnd.setSeed(50);
// generate random queries
int[] queryTokenIndexes = new int[tokens.size()];
for (int i = 0; i < numQueries; i++) {
int numQueryTokens = Math.abs(rnd.nextInt() % tokens.size()) + 1;
for (int j = 0; j < numQueryTokens; j++) {
queryTokenIndexes[j] = Math.abs(rnd.nextInt() % tokens.size());
}
StringBuilder strBuilder = new StringBuilder();
for (int j = 0; j < numQueryTokens; j++) {
strBuilder.append(tokens.get(queryTokenIndexes[j]));
if (j + 1 != numQueryTokens)
strBuilder.append(" ");
}
String queryString = strBuilder.toString();
queryTb.reset();
UTF8StringSerializerDeserializer.INSTANCE.serialize(queryString,
queryDos);
queryTb.addFieldEndOffset();
queryAppender.reset(frame, true);
queryAppender.append(queryTb.getFieldEndOffsets(),
queryTb.getByteArray(), 0, queryTb.getSize());
queryTuple.reset(queryAccessor, 0);
boolean panic = false;
int repeats = 1;
double totalTime = 0;
for (int j = 0; j < repeats; j++) {
long timeStart = System.currentTimeMillis();
try {
searcher.reset();
searcher.search(resultCursor, queryTuple, 0, searchModifier);
} catch (OccurrenceThresholdPanicException e) {
panic = true;
}
long timeEnd = System.currentTimeMillis();
totalTime += timeEnd - timeStart;
}
double avgTime = totalTime / (double) repeats;
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info(i + ": " + "\"" + queryString + "\": " + avgTime
+ "ms");
}
if (!panic) {
fillExpectedResults(queryTokenIndexes, numQueryTokens,
searcher.getOccurrenceThreshold());
// verify results
int checkIndex = 0;
while (resultCursor.hasNext()) {
resultCursor.next();
ITupleReference resultTuple = resultCursor.getTuple();
int id = IntegerSerializerDeserializer.getInt(
resultTuple.getFieldData(0),
resultTuple.getFieldStart(0));
Assert.assertEquals(expectedResults.get(checkIndex)
.intValue(), id);
checkIndex++;
}
if (expectedResults.size() != checkIndex) {
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("CHECKING");
}
StringBuilder expectedStrBuilder = new StringBuilder();
for (Integer x : expectedResults) {
expectedStrBuilder.append(x + " ");
}
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info(expectedStrBuilder.toString());
}
}
Assert.assertEquals(expectedResults.size(), checkIndex);
}
}
}
/**
* Runs 50 random conjunctive search queries to test the
* ConjunctiveSearchModifier.
*
*/
@Test
public void conjunctiveKeywordQueryTest() throws Exception {
IInvertedIndexSearchModifier searchModifier = new ConjunctiveSearchModifier();
runQueries(searchModifier, 50);
}
/**
* Runs 50 random jaccard-based search queries with thresholds 1.0, 0.9,
* 0.8, 0.7, 0.6, 0.5. Tests the JaccardSearchModifier.
*
*/
@Test
public void jaccardKeywordQueryTest() throws Exception {
JaccardSearchModifier searchModifier = new JaccardSearchModifier(1.0f);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("JACCARD: " + 1.0f);
}
searchModifier.setJaccThresh(1.0f);
runQueries(searchModifier, 50);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("JACCARD: " + 0.9f);
}
searchModifier.setJaccThresh(0.9f);
runQueries(searchModifier, 50);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("JACCARD: " + 0.8f);
}
searchModifier.setJaccThresh(0.8f);
runQueries(searchModifier, 50);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("JACCARD: " + 0.7f);
}
searchModifier.setJaccThresh(0.7f);
runQueries(searchModifier, 50);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("JACCARD: " + 0.6f);
}
searchModifier.setJaccThresh(0.6f);
runQueries(searchModifier, 50);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("JACCARD: " + 0.5f);
}
searchModifier.setJaccThresh(0.5f);
runQueries(searchModifier, 50);
}
}