blob: 47c75cfae6a0676455fd91344706f19d020d0275 [file] [log] [blame]
/*
* Copyright 2009-2010 by The Regents of the University of California
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.util.ArrayList;
import java.util.Random;
import org.junit.Assert;
import org.junit.Test;
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunction;
import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
import edu.uci.ics.hyracks.dataflow.common.data.hash.UTF8StringBinaryHashFunctionFactory;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
public class TokenizerTest {
/**
 * Tests DelimitedUTF8StringBinaryTokenizer.
 *
 * Generates random documents made of delimiter-separated words, serializes
 * each document with UTF8StringSerializerDeserializer, tokenizes the
 * serialized bytes with the binary tokenizer, and checks that the tokens
 * match, one by one and in total number, the result of String.split on the
 * original document.
 */
@Test
public void test01() throws Exception {
    // fixed seed keeps the generated documents reproducible
    Random rnd = new Random(50);
    int numDocs = 100;
    int maxWords = 1000;
    int maxWordLength = 50;
    char delimiter = ' ';
    DelimitedUTF8StringBinaryTokenizer tok = new DelimitedUTF8StringBinaryTokenizer(delimiter);
    // create a bunch of documents
    for (int i = 0; i < numDocs; i++) {
        // create a single document with 1..maxWords words; Math.abs is
        // applied after the modulo because Math.abs(Integer.MIN_VALUE) is
        // still negative and would yield a non-positive count
        int words = Math.abs(rnd.nextInt() % maxWords) + 1;
        StringBuilder strBuilder = new StringBuilder();
        for (int j = 0; j < words; j++) {
            // requested word length is in 1..maxWordLength
            int len = Math.abs(rnd.nextInt() % maxWordLength) + 1;
            String s = randomString(len, rnd);
            strBuilder.append(s);
            // no trailing delimiter after the last word
            if (j < words - 1) {
                strBuilder.append(delimiter);
            }
        }
        String doc = strBuilder.toString();
        // serialize document into baaos
        ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
        DataOutputStream dos = new DataOutputStream(baaos);
        UTF8StringSerializerDeserializer.INSTANCE.serialize(doc, dos);
        byte[] data = baaos.toByteArray();
        // use binary tokenizer and compare with Java tokenizer
        String[] cmpTokens = doc.split(String.valueOf(delimiter));
        int cmpCounter = 0;
        tok.reset(data, 0, data.length);
        while (tok.hasNext()) {
            tok.next();
            // report surplus tokens as an assertion failure rather than an
            // ArrayIndexOutOfBoundsException
            Assert.assertTrue("binary tokenizer produced more tokens than String.split",
                    cmpCounter < cmpTokens.length);
            // write token to outputstream
            ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
            DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
            tok.writeToken(dosWrite);
            // deserialize token to get string object
            ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
            DataInput dataIn = new DataInputStream(inStream);
            String s = UTF8StringSerializerDeserializer.INSTANCE.deserialize(dataIn);
            // JUnit order is (expected, actual)
            Assert.assertEquals(cmpTokens[cmpCounter++], s);
        }
        // without this check a tokenizer that stops early (or returns
        // nothing at all) would pass the test
        Assert.assertEquals("binary tokenizer produced fewer tokens than String.split",
                cmpTokens.length, cmpCounter);
    }
}
/**
 * Tests HashedQGramUTF8StringBinaryTokenizer.
 *
 * For random strings, random gram lengths q and a random choice of
 * pre/postfixing, the hashed q-grams produced by the binary tokenizer are
 * compared, one by one and in total number, against hashes computed from
 * q-grams generated with plain Java string operations.
 */
@Test
public void test02() throws Exception {
    // fixed seed keeps the generated strings reproducible
    Random rnd = new Random(50);
    int numStrings = 1000;
    int maxStrLen = 100;
    int minQ = 2;
    int maxQ = 10;
    // we test the correctness of HashedQGramUTF8StringBinaryTokenizer as
    // follows:
    // 1.1. tokenize the string into q-gram strings
    // 1.2. serialize q-gram strings into bytes
    // 1.3. compute hashed gram with UTF8StringBinaryHashFunctionFactory
    // 2.1. serialize string into bytes
    // 2.2. tokenize serialized string into hashed q-grams
    // 2.3. test whether hashed grams from 1.3. and 2.2. are equal
    for (int i = 0; i < numStrings; i++) {
        // Math.abs is applied after the modulo because
        // Math.abs(Integer.MIN_VALUE) is still negative.
        // q is in minQ..maxQ-1, the requested string length in q..maxStrLen-1
        int q = Math.abs(rnd.nextInt() % (maxQ - minQ)) + minQ;
        int strLen = Math.abs(rnd.nextInt() % (maxStrLen - q)) + q;
        String str = randomString(strLen, rnd);
        // randomly choose pre and postfixing
        boolean prePost = Math.abs(rnd.nextInt() % 2) == 0;
        HashedQGramUTF8StringBinaryTokenizer qgramTok = new HashedQGramUTF8StringBinaryTokenizer(q, prePost);
        String extendedString = str;
        if (prePost) {
            // pre and postfix string with q - 1 padding characters each
            StringBuilder strBuilder = new StringBuilder();
            for (int j = 0; j < q - 1; j++) {
                strBuilder.append(qgramTok.getPreChar());
            }
            strBuilder.append(str);
            for (int j = 0; j < q - 1; j++) {
                strBuilder.append(qgramTok.getPostChar());
            }
            extendedString = strBuilder.toString();
        }
        // generate q-grams in deserialized form
        ArrayList<String> javaGrams = new ArrayList<String>();
        for (int j = 0; j < extendedString.length() - q + 1; j++) {
            javaGrams.add(extendedString.substring(j, j + q));
        }
        // serialize string for use in binary gram tokenizer; the
        // tokenizer receives the original string, not the padded one
        ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
        DataOutputStream dos = new DataOutputStream(baaos);
        UTF8StringSerializerDeserializer.INSTANCE.serialize(str, dos);
        byte[] data = baaos.toByteArray();
        qgramTok.reset(data, 0, data.length);
        int counter = 0;
        while (qgramTok.hasNext()) {
            qgramTok.next();
            // report surplus grams as an assertion failure rather than an
            // IndexOutOfBoundsException
            Assert.assertTrue("binary tokenizer produced more q-grams than expected",
                    counter < javaGrams.size());
            // write token to outputstream
            ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
            DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
            qgramTok.writeToken(dosWrite);
            // deserialize token to get hashed gram
            ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
            DataInput dataIn = new DataInputStream(inStream);
            Integer binHashedGram = IntegerSerializerDeserializer.INSTANCE.deserialize(dataIn);
            // create hashed gram to test against
            ByteArrayAccessibleOutputStream baaosCmp = new ByteArrayAccessibleOutputStream();
            DataOutputStream dosCmp = new DataOutputStream(baaosCmp);
            UTF8StringSerializerDeserializer.INSTANCE.serialize(javaGrams.get(counter), dosCmp);
            IBinaryHashFunction strHasher = UTF8StringBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction();
            byte[] cmpData = baaosCmp.toByteArray();
            int cmpHash = strHasher.hash(cmpData, 0, cmpData.length);
            // JUnit order is (expected, actual)
            Assert.assertEquals(cmpHash, binHashedGram.intValue());
            counter++;
        }
        // without this check a tokenizer that stops early (or returns
        // nothing at all) would pass the test
        Assert.assertEquals("binary tokenizer produced fewer q-grams than expected",
                javaGrams.size(), counter);
    }
}
/**
 * Builds a pseudo-random string of exactly {@code length} characters.
 *
 * One double is drawn from {@code random} and its bit pattern is rendered
 * as a hexadecimal string; every character of the result is then picked
 * from a random position of that hexadecimal string, so the result consists
 * of lowercase hexadecimal digits only.
 *
 * For requests no longer than the hexadecimal string the output is the same
 * as before for the same generator state; longer requests are no longer
 * silently cut off at the hexadecimal string's length.
 *
 * @param length
 *            number of characters to generate
 * @param random
 *            source of randomness; its state is advanced by this call
 * @return a string of {@code length} hexadecimal digits, or the fixed
 *         fallback "abc" when {@code length} is not positive
 */
public static String randomString(int length, Random random) {
    // legacy fallback value for requests that cannot produce any character
    if (length <= 0) {
        return "abc";
    }
    // Long.toHexString never returns an empty string, so there is always at
    // least one character to pick from
    String alphabet = Long.toHexString(Double.doubleToLongBits(random.nextDouble()));
    StringBuilder strBuilder = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
        // take the absolute value after the modulo: Math.abs(Integer.MIN_VALUE)
        // is negative and could otherwise produce a negative index
        int pos = Math.abs(random.nextInt() % alphabet.length());
        strBuilder.append(alphabet.charAt(pos));
    }
    return strBuilder.toString();
}
}