blob: eb2b39c41c43ba86cd2bd8753b2041bef089d098 [file] [log] [blame]
/*
* Copyright 2009-2010 by The Regents of the University of California
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.hyracks.storage.am.invertedindex;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.logging.Level;
import org.junit.Before;
import org.junit.Test;
import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
import edu.uci.ics.hyracks.storage.am.common.api.PageAllocationException;
import edu.uci.ics.hyracks.storage.am.common.api.TreeIndexException;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.FixedSizeElementInvertedListBuilder;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.OccurrenceThresholdPanicException;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.SearchResultCursor;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.TOccurrenceSearcher;
import edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers.ConjunctiveSearchModifier;
import edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers.EditDistanceSearchModifier;
import edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers.JaccardSearchModifier;
import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.UTF8NGramTokenFactory;
public class SearchTest extends AbstractInvIndexSearchTest {
protected List<String> dataStrings = new ArrayList<String>();
protected List<String> firstNames = new ArrayList<String>();
protected List<String> lastNames = new ArrayList<String>();
@Before
public void start() throws Exception {
super.start();
tokenFactory = new UTF8NGramTokenFactory();
tokenizer = new NGramUTF8StringBinaryTokenizer(3, false, true, false,
tokenFactory);
searcher = new TOccurrenceSearcher(taskCtx, invIndex, tokenizer);
resultCursor = new SearchResultCursor(
searcher.createResultFrameTupleAccessor(),
searcher.createResultTupleReference());
generateDataStrings();
loadData();
}
public void generateDataStrings() {
firstNames.add("Kathrin");
firstNames.add("Cathrin");
firstNames.add("Kathryn");
firstNames.add("Cathryn");
firstNames.add("Kathrine");
firstNames.add("Cathrine");
firstNames.add("Kathryne");
firstNames.add("Cathryne");
firstNames.add("Katherin");
firstNames.add("Catherin");
firstNames.add("Katheryn");
firstNames.add("Catheryn");
firstNames.add("Katherine");
firstNames.add("Catherine");
firstNames.add("Katheryne");
firstNames.add("Catheryne");
firstNames.add("John");
firstNames.add("Jack");
firstNames.add("Jonathan");
firstNames.add("Nathan");
lastNames.add("Miller");
lastNames.add("Myller");
lastNames.add("Keller");
lastNames.add("Ketler");
lastNames.add("Muller");
lastNames.add("Fuller");
lastNames.add("Smith");
lastNames.add("Smyth");
lastNames.add("Smithe");
lastNames.add("Smythe");
// Generate all 'firstName lastName' combinations as data strings
for (String f : firstNames) {
for (String l : lastNames) {
dataStrings.add(f + " " + l);
}
}
}
private class TokenIdPair implements Comparable<TokenIdPair> {
public ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
public DataOutputStream dos = new DataOutputStream(baaos);
public int id;
TokenIdPair(IToken token, int id) throws IOException {
token.serializeToken(dos);
this.id = id;
}
@Override
public int compareTo(TokenIdPair o) {
int cmp = btreeBinCmps[0].compare(baaos.getByteArray(), 0,
baaos.getByteArray().length, o.baaos.getByteArray(), 0,
o.baaos.getByteArray().length);
if (cmp == 0) {
return id - o.id;
} else {
return cmp;
}
}
}
public void loadData() throws IOException, TreeIndexException, PageAllocationException {
List<TokenIdPair> pairs = new ArrayList<TokenIdPair>();
// generate pairs for subsequent sorting and bulk-loading
int id = 0;
for (String s : dataStrings) {
ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
DataOutputStream dos = new DataOutputStream(baaos);
UTF8StringSerializerDeserializer.INSTANCE.serialize(s, dos);
tokenizer.reset(baaos.getByteArray(), 0, baaos.size());
int tokenCount = 0;
while (tokenizer.hasNext()) {
tokenizer.next();
IToken token = tokenizer.getToken();
pairs.add(new TokenIdPair(token, id));
++tokenCount;
}
++id;
}
Collections.sort(pairs);
// bulk load index
IInvertedListBuilder invListBuilder = new FixedSizeElementInvertedListBuilder(
invListTypeTraits);
InvertedIndex.BulkLoadContext ctx = invIndex.beginBulkLoad(
invListBuilder, HYRACKS_FRAME_SIZE, BTree.DEFAULT_FILL_FACTOR);
for (TokenIdPair t : pairs) {
tb.reset();
tb.addField(t.baaos.getByteArray(), 0,
t.baaos.getByteArray().length);
IntegerSerializerDeserializer.INSTANCE.serialize(t.id, dos);
tb.addFieldEndOffset();
appender.reset(frame, true);
appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0,
tb.getSize());
tuple.reset(accessor, 0);
try {
invIndex.bulkLoadAddTuple(ctx, tuple);
} catch (Exception e) {
e.printStackTrace();
}
}
invIndex.endBulkLoad(ctx);
}
/**
* Runs a specified number of randomly picked strings from dataStrings as
* queries. We run each query, measure it's time, and print it's results.
*
*/
private void runQueries(IInvertedIndexSearchModifier searchModifier,
int numQueries) throws Exception {
rnd.setSeed(50);
for (int i = 0; i < numQueries; i++) {
int queryIndex = Math.abs(rnd.nextInt() % dataStrings.size());
String queryString = dataStrings.get(queryIndex);
queryTb.reset();
UTF8StringSerializerDeserializer.INSTANCE.serialize(queryString,
queryDos);
queryTb.addFieldEndOffset();
queryAppender.reset(frame, true);
queryAppender.append(queryTb.getFieldEndOffsets(),
queryTb.getByteArray(), 0, queryTb.getSize());
queryTuple.reset(queryAccessor, 0);
int repeats = 1;
double totalTime = 0;
for (int j = 0; j < repeats; j++) {
long timeStart = System.currentTimeMillis();
try {
searcher.reset();
searcher.search(resultCursor, queryTuple, 0, searchModifier);
} catch (OccurrenceThresholdPanicException e) {
// ignore panic queries
}
long timeEnd = System.currentTimeMillis();
totalTime += timeEnd - timeStart;
}
double avgTime = totalTime / (double) repeats;
StringBuilder strBuilder = new StringBuilder();
strBuilder.append(i + ": " + "\"" + queryString + "\": " + avgTime
+ "ms" + "\n");
strBuilder.append("CANDIDATE RESULTS:\n");
while (resultCursor.hasNext()) {
resultCursor.next();
ITupleReference resultTuple = resultCursor.getTuple();
int id = IntegerSerializerDeserializer.getInt(
resultTuple.getFieldData(0),
resultTuple.getFieldStart(0));
strBuilder.append(id + " " + dataStrings.get(id));
strBuilder.append('\n');
}
// remove trailing newline
strBuilder.deleteCharAt(strBuilder.length() - 1);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info(strBuilder.toString());
}
}
}
/**
* Runs 5 random conjunctive search queries to test the
* ConjunctiveSearchModifier.
*
*/
@Test
public void conjunctiveQueryTest() throws Exception {
IInvertedIndexSearchModifier searchModifier = new ConjunctiveSearchModifier();
runQueries(searchModifier, 5);
}
/**
* Runs 5 random jaccard-based search queries with thresholds 0.9, 0.8, 0.7.
* Tests the JaccardSearchModifier.
*
*/
@Test
public void jaccardQueryTest() throws Exception {
JaccardSearchModifier searchModifier = new JaccardSearchModifier(1.0f);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("JACCARD: " + 0.9f);
}
searchModifier.setJaccThresh(0.9f);
runQueries(searchModifier, 5);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("JACCARD: " + 0.8f);
}
searchModifier.setJaccThresh(0.8f);
runQueries(searchModifier, 5);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("JACCARD: " + 0.7f);
}
searchModifier.setJaccThresh(0.7f);
runQueries(searchModifier, 5);
}
/**
* Runs 5 random edit-distance based search queries with thresholds 1, 2, 3.
* Tests the EditDistanceSearchModifier.
*
*/
@Test
public void editDistanceQueryTest() throws Exception {
EditDistanceSearchModifier searchModifier = new EditDistanceSearchModifier(
3, 0);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("EDIT DISTANCE: " + 1);
}
searchModifier.setEdThresh(1);
runQueries(searchModifier, 5);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("EDIT DISTANCE: " + 2);
}
searchModifier.setEdThresh(2);
runQueries(searchModifier, 5);
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.info("EDIT DISTANCE: " + 3);
}
searchModifier.setEdThresh(3);
runQueries(searchModifier, 5);
}
}