blob: 33ea4f59206f02eaf5841938b00b027046f42afe [file] [log] [blame]
/*
* Copyright 2009-2010 by The Regents of the University of California
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import edu.uci.ics.hyracks.data.std.util.GrowableArray;
/**
 * Tests {@code NGramUTF8StringBinaryTokenizer} by comparing the tokens it
 * emits against n-grams computed independently in Java, for three token
 * encodings: counted hashed tokens, hashed tokens, and plain UTF-8 tokens.
 */
public class NGramTokenizerTest {
    // Padding characters used when pre/post grams are enabled.
    private static final char PRECHAR = '#';
    private static final char POSTCHAR = '$';

    // Input deliberately contains a non-ASCII character to exercise UTF-8 handling.
    private final String str = "Jürgen S. Generic's Car";
    private byte[] inputBuffer;
    private final int gramLength = 3;

    /**
     * Computes the expected n-grams of {@code s} (lower-cased) into {@code grams}.
     *
     * @param s          input string
     * @param gramLength length of each gram
     * @param grams      output list the grams are appended to
     * @param prePost    if true, pad with gramLength-1 PRECHARs / POSTCHARs
     */
    private void getExpectedGrams(String s, int gramLength, ArrayList<String> grams, boolean prePost) {
        String tmp = s.toLowerCase();
        if (prePost) {
            StringBuilder preBuilder = new StringBuilder();
            for (int i = 0; i < gramLength - 1; i++) {
                preBuilder.append(PRECHAR);
            }
            StringBuilder postBuilder = new StringBuilder();
            for (int i = 0; i < gramLength - 1; i++) {
                postBuilder.append(POSTCHAR);
            }
            tmp = preBuilder.toString() + s.toLowerCase() + postBuilder.toString();
        }
        for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
            grams.add(tmp.substring(i, i + gramLength));
        }
    }

    @Before
    public void init() throws Exception {
        // Serialize the test string with writeUTF (2-byte length prefix +
        // modified UTF-8), the wire format the tokenizer expects.
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        DataOutput dos = new DataOutputStream(baos);
        dos.writeUTF(str);
        inputBuffer = baos.toByteArray();
    }

    /**
     * Pulls the next token off the tokenizer, serializes it, and returns a
     * DataInput positioned at the start of the serialized bytes.
     */
    private DataInput nextSerializedToken(NGramUTF8StringBinaryTokenizer tokenizer) throws IOException {
        tokenizer.next();
        GrowableArray tokenData = new GrowableArray();
        IToken token = tokenizer.getToken();
        token.serializeToken(tokenData);
        ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
        return new DataInputStream(bais);
    }

    /**
     * Verifies hashed tokens where each repeated gram's hash is salted with
     * its occurrence count (so duplicates hash differently).
     */
    void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
        HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
        NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false,
                false, tokenFactory);
        tokenizer.reset(inputBuffer, 0, inputBuffer.length);

        ArrayList<String> expectedGrams = new ArrayList<String>();
        getExpectedGrams(str, gramLength, expectedGrams, prePost);
        ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
        HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
        for (String s : expectedGrams) {
            Integer count = gramCounts.get(s);
            count = (count == null) ? 1 : count + 1;
            // Persist the incremented count; the original code only stored the
            // first occurrence, so a third occurrence would reuse count 2.
            gramCounts.put(s, count);
            expectedHashedGrams.add(tokenHash(s, count));
        }

        int tokenCount = 0;
        while (tokenizer.hasNext()) {
            DataInput in = nextSerializedToken(tokenizer);
            Integer hashedGram = in.readInt();
            Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
            tokenCount++;
        }
    }

    /**
     * Verifies hashed tokens where every occurrence of a gram hashes
     * identically (occurrence count fixed at 1).
     */
    void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException {
        HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
        NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
                tokenFactory);
        tokenizer.reset(inputBuffer, 0, inputBuffer.length);

        ArrayList<String> expectedGrams = new ArrayList<String>();
        getExpectedGrams(str, gramLength, expectedGrams, prePost);
        ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
        for (String s : expectedGrams) {
            expectedHashedGrams.add(tokenHash(s, 1));
        }

        int tokenCount = 0;
        while (tokenizer.hasNext()) {
            DataInput in = nextSerializedToken(tokenizer);
            Integer hashedGram = in.readInt();
            Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
            tokenCount++;
        }
    }

    /**
     * Verifies plain (non-hashed) UTF-8 string tokens against the expected
     * gram strings.
     */
    void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException {
        UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
        NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
                tokenFactory);
        tokenizer.reset(inputBuffer, 0, inputBuffer.length);

        ArrayList<String> expectedGrams = new ArrayList<String>();
        getExpectedGrams(str, gramLength, expectedGrams, prePost);

        int tokenCount = 0;
        while (tokenizer.hasNext()) {
            DataInput in = nextSerializedToken(tokenizer);
            String strGram = in.readUTF();
            Assert.assertEquals(expectedGrams.get(tokenCount), strGram);
            tokenCount++;
        }
    }

    @Test
    public void testNGramTokenizerWithCountedHashedUTF8Tokens() throws Exception {
        runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
        runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
    }

    @Test
    public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
        runTestNGramTokenizerWithHashedUTF8Tokens(false);
        runTestNGramTokenizerWithHashedUTF8Tokens(true);
    }

    @Test
    public void testNGramTokenizerWithUTF8Tokens() throws IOException {
        runTestNGramTokenizerWithUTF8Tokens(false);
        runTestNGramTokenizerWithUTF8Tokens(true);
    }

    /**
     * Reference hash mirroring the tokenizer's token hash: a multiplicative
     * XOR hash over the characters, with the occurrence count added last so
     * that repeated grams can be distinguished.
     *
     * @param token      the gram to hash
     * @param tokenCount occurrence number of this gram (1-based)
     * @return the expected hash value
     */
    public int tokenHash(String token, int tokenCount) {
        int h = AbstractUTF8Token.GOLDEN_RATIO_32;
        for (int i = 0; i < token.length(); i++) {
            h ^= token.charAt(i);
            h *= AbstractUTF8Token.GOLDEN_RATIO_32;
        }
        return h + tokenCount;
    }
}