blob: bdd8c4a49b9a5857ad0052ef1b1991b0aa0e0f0a [file] [log] [blame]
/*
* Copyright 2009-2010 by The Regents of the University of California
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.hyracks.storage.am.invertedindex;
import java.io.DataOutput;
import java.io.File;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import junit.framework.Assert;
import org.junit.AfterClass;
import org.junit.Test;
import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
import edu.uci.ics.hyracks.api.dataflow.value.TypeTrait;
import edu.uci.ics.hyracks.api.io.FileReference;
import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.comparators.IntegerBinaryComparatorFactory;
import edu.uci.ics.hyracks.dataflow.common.data.comparators.UTF8StringBinaryComparatorFactory;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
import edu.uci.ics.hyracks.storage.am.btree.frames.BTreeNSMInteriorFrameFactory;
import edu.uci.ics.hyracks.storage.am.btree.frames.BTreeNSMLeafFrameFactory;
import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeRangeSearchCursor;
import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
import edu.uci.ics.hyracks.storage.am.common.api.IFreePageManager;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexCursor;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexFrame;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexFrameFactory;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexMetaDataFrame;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexMetaDataFrameFactory;
import edu.uci.ics.hyracks.storage.am.common.frames.LIFOMetaDataFrameFactory;
import edu.uci.ics.hyracks.storage.am.common.freepage.LinkedListFreePageManager;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.IndexOp;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
import edu.uci.ics.hyracks.storage.am.common.tuples.TypeAwareTupleWriterFactory;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.FixedSizeElementInvertedListBuilder;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.FixedSizeElementInvertedListCursor;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
import edu.uci.ics.hyracks.storage.common.file.IFileMapProvider;
import edu.uci.ics.hyracks.test.support.TestStorageManagerComponentHolder;
import edu.uci.ics.hyracks.test.support.TestUtils;
public class BulkLoadTest extends AbstractInvIndexTest {
private static final int PAGE_SIZE = 32768;
private static final int NUM_PAGES = 100;
private static final int MAX_OPEN_FILES = 10;
private static final int HYRACKS_FRAME_SIZE = 32768;
private IHyracksTaskContext stageletCtx = TestUtils.create(HYRACKS_FRAME_SIZE);
/**
* This test generates a list of <word-token, id> pairs which are pre-sorted
* on the token. Those pairs for the input to an inverted-index bulk load.
* The contents of the inverted lists are verified against the generated
* data.
*/
@Test
public void singleFieldPayloadTest() throws Exception {
TestStorageManagerComponentHolder.init(PAGE_SIZE, NUM_PAGES, MAX_OPEN_FILES);
IBufferCache bufferCache = TestStorageManagerComponentHolder.getBufferCache(stageletCtx);
IFileMapProvider fmp = TestStorageManagerComponentHolder.getFileMapProvider(stageletCtx);
// create file refs
FileReference btreeFile = new FileReference(new File(btreeFileName));
bufferCache.createFile(btreeFile);
int btreeFileId = fmp.lookupFileId(btreeFile);
bufferCache.openFile(btreeFileId);
FileReference invListsFile = new FileReference(new File(invListsFileName));
bufferCache.createFile(invListsFile);
int invListsFileId = fmp.lookupFileId(invListsFile);
bufferCache.openFile(invListsFileId);
// declare btree fields
int fieldCount = 5;
ITypeTrait[] typeTraits = new ITypeTrait[fieldCount];
// token (key)
typeTraits[0] = new TypeTrait(ITypeTrait.VARIABLE_LENGTH);
// startPageId
typeTraits[1] = new TypeTrait(4);
// endPageId
typeTraits[2] = new TypeTrait(4);
// startOff
typeTraits[3] = new TypeTrait(4);
// numElements
typeTraits[4] = new TypeTrait(4);
// declare btree keys
int keyFieldCount = 1;
IBinaryComparator[] cmps = new IBinaryComparator[keyFieldCount];
cmps[0] = UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
MultiComparator btreeCmp = new MultiComparator(typeTraits, cmps);
TypeAwareTupleWriterFactory tupleWriterFactory = new TypeAwareTupleWriterFactory(typeTraits);
ITreeIndexFrameFactory leafFrameFactory = new BTreeNSMLeafFrameFactory(tupleWriterFactory);
ITreeIndexFrameFactory interiorFrameFactory = new BTreeNSMInteriorFrameFactory(tupleWriterFactory);
ITreeIndexMetaDataFrameFactory metaFrameFactory = new LIFOMetaDataFrameFactory();
ITreeIndexFrame leafFrame = leafFrameFactory.createFrame();
ITreeIndexFrame interiorFrame = interiorFrameFactory.createFrame();
ITreeIndexMetaDataFrame metaFrame = metaFrameFactory.createFrame();
IFreePageManager freePageManager = new LinkedListFreePageManager(bufferCache, btreeFileId, 0, metaFrameFactory);
BTree btree = new BTree(bufferCache, freePageManager, interiorFrameFactory, leafFrameFactory, btreeCmp);
btree.create(btreeFileId, leafFrame, metaFrame);
btree.open(btreeFileId);
int invListFields = 1;
ITypeTrait[] invListTypeTraits = new ITypeTrait[invListFields];
invListTypeTraits[0] = new TypeTrait(4);
int invListKeys = 1;
IBinaryComparator[] invListBinCmps = new IBinaryComparator[invListKeys];
invListBinCmps[0] = IntegerBinaryComparatorFactory.INSTANCE.createBinaryComparator();
MultiComparator invListCmp = new MultiComparator(invListTypeTraits, invListBinCmps);
InvertedIndex invIndex = new InvertedIndex(bufferCache, btree, invListCmp);
invIndex.open(invListsFileId);
Random rnd = new Random();
rnd.setSeed(50);
ByteBuffer frame = stageletCtx.allocateFrame();
FrameTupleAppender appender = new FrameTupleAppender(stageletCtx.getFrameSize());
ArrayTupleBuilder tb = new ArrayTupleBuilder(2);
DataOutput dos = tb.getDataOutput();
ISerializerDeserializer[] insertSerde = { UTF8StringSerializerDeserializer.INSTANCE,
IntegerSerializerDeserializer.INSTANCE };
RecordDescriptor insertRecDesc = new RecordDescriptor(insertSerde);
IFrameTupleAccessor accessor = new FrameTupleAccessor(stageletCtx.getFrameSize(), insertRecDesc);
accessor.reset(frame);
FrameTupleReference tuple = new FrameTupleReference();
List<String> tokens = new ArrayList<String>();
tokens.add("compilers");
tokens.add("computer");
tokens.add("databases");
tokens.add("fast");
tokens.add("hyracks");
tokens.add("major");
tokens.add("science");
tokens.add("systems");
tokens.add("university");
ArrayList<ArrayList<Integer>> checkListElements = new ArrayList<ArrayList<Integer>>();
for (int i = 0; i < tokens.size(); i++) {
checkListElements.add(new ArrayList<Integer>());
}
int maxId = 1000000;
int addProb = 0;
int addProbStep = 10;
IInvertedListBuilder invListBuilder = new FixedSizeElementInvertedListBuilder(invListTypeTraits);
InvertedIndex.BulkLoadContext ctx = invIndex.beginBulkLoad(invListBuilder, HYRACKS_FRAME_SIZE,
BTree.DEFAULT_FILL_FACTOR);
int totalElements = 0;
for (int i = 0; i < tokens.size(); i++) {
addProb += addProbStep * (i + 1);
for (int j = 0; j < maxId; j++) {
if ((Math.abs(rnd.nextInt()) % addProb) == 0) {
totalElements++;
tb.reset();
UTF8StringSerializerDeserializer.INSTANCE.serialize(tokens.get(i), dos);
tb.addFieldEndOffset();
IntegerSerializerDeserializer.INSTANCE.serialize(j, dos);
tb.addFieldEndOffset();
checkListElements.get(i).add(j);
appender.reset(frame, true);
appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize());
tuple.reset(accessor, 0);
try {
invIndex.bulkLoadAddTuple(ctx, tuple);
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
invIndex.endBulkLoad(ctx);
// ------- START VERIFICATION -----------
ITreeIndexCursor btreeCursor = new BTreeRangeSearchCursor((IBTreeLeafFrame) leafFrame);
FrameTupleReference searchKey = new FrameTupleReference();
RangePredicate btreePred = new RangePredicate(true, searchKey, searchKey, true, true, btreeCmp, btreeCmp);
IInvertedListCursor invListCursor = new FixedSizeElementInvertedListCursor(bufferCache, invListsFileId,
invListTypeTraits);
ISerializerDeserializer[] tokenSerde = { UTF8StringSerializerDeserializer.INSTANCE };
RecordDescriptor tokenRecDesc = new RecordDescriptor(tokenSerde);
FrameTupleAppender tokenAppender = new FrameTupleAppender(stageletCtx.getFrameSize());
ArrayTupleBuilder tokenTupleBuilder = new ArrayTupleBuilder(1);
DataOutput tokenDos = tokenTupleBuilder.getDataOutput();
IFrameTupleAccessor tokenAccessor = new FrameTupleAccessor(stageletCtx.getFrameSize(), tokenRecDesc);
tokenAccessor.reset(frame);
BTreeOpContext btreeOpCtx = invIndex.getBTree().createOpContext(IndexOp.SEARCH, leafFrame, interiorFrame, null);
// verify created inverted lists one-by-one
for (int i = 0; i < tokens.size(); i++) {
tokenTupleBuilder.reset();
UTF8StringSerializerDeserializer.INSTANCE.serialize(tokens.get(i), tokenDos);
tokenTupleBuilder.addFieldEndOffset();
tokenAppender.reset(frame, true);
tokenAppender.append(tokenTupleBuilder.getFieldEndOffsets(), tokenTupleBuilder.getByteArray(), 0,
tokenTupleBuilder.getSize());
searchKey.reset(tokenAccessor, 0);
invIndex.openCursor(btreeCursor, btreePred, btreeOpCtx, invListCursor);
invListCursor.pinPagesSync();
int checkIndex = 0;
while (invListCursor.hasNext()) {
invListCursor.next();
ITupleReference invListTuple = invListCursor.getTuple();
int invListElement = IntegerSerializerDeserializer.getInt(invListTuple.getFieldData(0),
invListTuple.getFieldStart(0));
int checkInvListElement = checkListElements.get(i).get(checkIndex).intValue();
Assert.assertEquals(invListElement, checkInvListElement);
checkIndex++;
}
invListCursor.unpinPages();
Assert.assertEquals(checkIndex, checkListElements.get(i).size());
}
// check that non-existing tokens have an empty inverted list
List<String> nonExistingTokens = new ArrayList<String>();
nonExistingTokens.add("watermelon");
nonExistingTokens.add("avocado");
nonExistingTokens.add("lemon");
for (int i = 0; i < nonExistingTokens.size(); i++) {
tokenTupleBuilder.reset();
UTF8StringSerializerDeserializer.INSTANCE.serialize(nonExistingTokens.get(i), tokenDos);
tokenTupleBuilder.addFieldEndOffset();
tokenAppender.reset(frame, true);
tokenAppender.append(tokenTupleBuilder.getFieldEndOffsets(), tokenTupleBuilder.getByteArray(), 0,
tokenTupleBuilder.getSize());
searchKey.reset(tokenAccessor, 0);
invIndex.openCursor(btreeCursor, btreePred, btreeOpCtx, invListCursor);
invListCursor.pinPagesSync();
Assert.assertEquals(invListCursor.hasNext(), false);
invListCursor.unpinPages();
}
btree.close();
bufferCache.closeFile(btreeFileId);
bufferCache.closeFile(invListsFileId);
bufferCache.close();
}
@AfterClass
public static void deinit() {
AbstractInvIndexTest.tearDown();
}
}