| /* |
| * Copyright 2009-2010 by The Regents of the University of California |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * you may obtain a copy of the License from |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers; |
| |
| import java.io.ByteArrayInputStream; |
| import java.io.ByteArrayOutputStream; |
| import java.io.DataInput; |
| import java.io.DataInputStream; |
| import java.io.DataOutput; |
| import java.io.DataOutputStream; |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| |
| import junit.framework.Assert; |
| |
| import org.junit.Before; |
| import org.junit.Test; |
| |
| import edu.uci.ics.hyracks.data.std.util.GrowableArray; |
| |
| public class WordTokenizerTest { |
| |
| private String text = "Hello World, I would like to inform you of the importance of Foo Bar. Yes, Foo Bar. Jürgen."; |
| private byte[] inputBuffer; |
| |
| private ArrayList<String> expectedUTF8Tokens = new ArrayList<String>(); |
| private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>(); |
| private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>(); |
| |
| private boolean isSeparator(char c) { |
| return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER); |
| } |
| |
| private void tokenize(String text, ArrayList<String> tokens) { |
| String lowerCaseText = text.toLowerCase(); |
| int startIx = 0; |
| |
| // Skip separators at beginning of string. |
| while (isSeparator(lowerCaseText.charAt(startIx))) { |
| startIx++; |
| } |
| while (startIx < lowerCaseText.length()) { |
| while (startIx < lowerCaseText.length() && isSeparator(lowerCaseText.charAt(startIx))) { |
| startIx++; |
| } |
| int tokenStart = startIx; |
| |
| while (startIx < lowerCaseText.length() && !isSeparator(lowerCaseText.charAt(startIx))) { |
| startIx++; |
| } |
| int tokenEnd = startIx; |
| |
| // Emit token. |
| String token = lowerCaseText.substring(tokenStart, tokenEnd); |
| |
| tokens.add(token); |
| } |
| } |
| |
| @Before |
| public void init() throws IOException { |
| // serialize text into bytes |
| ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
| DataOutput dos = new DataOutputStream(baos); |
| dos.writeUTF(text); |
| inputBuffer = baos.toByteArray(); |
| |
| // init expected string tokens |
| tokenize(text, expectedUTF8Tokens); |
| |
| // hashed tokens ignoring token count |
| for (int i = 0; i < expectedUTF8Tokens.size(); i++) { |
| int hash = tokenHash(expectedUTF8Tokens.get(i), 1); |
| expectedHashedUTF8Tokens.add(hash); |
| } |
| |
| // hashed tokens using token count |
| HashMap<String, Integer> tokenCounts = new HashMap<String, Integer>(); |
| for (int i = 0; i < expectedUTF8Tokens.size(); i++) { |
| Integer count = tokenCounts.get(expectedUTF8Tokens.get(i)); |
| if (count == null) { |
| count = 1; |
| tokenCounts.put(expectedUTF8Tokens.get(i), count); |
| } else { |
| count++; |
| } |
| |
| int hash = tokenHash(expectedUTF8Tokens.get(i), count); |
| expectedCountedHashedUTF8Tokens.add(hash); |
| } |
| } |
| |
| @Test |
| public void testWordTokenizerWithCountedHashedUTF8Tokens() throws IOException { |
| |
| HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory(); |
| DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(false, false, |
| tokenFactory); |
| |
| tokenizer.reset(inputBuffer, 0, inputBuffer.length); |
| |
| int tokenCount = 0; |
| |
| while (tokenizer.hasNext()) { |
| tokenizer.next(); |
| |
| // serialize hashed token |
| GrowableArray tokenData = new GrowableArray(); |
| |
| IToken token = tokenizer.getToken(); |
| token.serializeToken(tokenData); |
| |
| // deserialize token |
| ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray()); |
| DataInput in = new DataInputStream(bais); |
| |
| Integer hashedToken = in.readInt(); |
| |
| Assert.assertEquals(hashedToken, expectedCountedHashedUTF8Tokens.get(tokenCount)); |
| |
| tokenCount++; |
| } |
| } |
| |
| @Test |
| public void testWordTokenizerWithHashedUTF8Tokens() throws IOException { |
| |
| HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory(); |
| DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory); |
| |
| tokenizer.reset(inputBuffer, 0, inputBuffer.length); |
| |
| int tokenCount = 0; |
| |
| while (tokenizer.hasNext()) { |
| tokenizer.next(); |
| |
| // serialize hashed token |
| GrowableArray tokenData = new GrowableArray(); |
| |
| IToken token = tokenizer.getToken(); |
| token.serializeToken(tokenData); |
| |
| // deserialize token |
| ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray()); |
| DataInput in = new DataInputStream(bais); |
| |
| Integer hashedToken = in.readInt(); |
| |
| Assert.assertEquals(expectedHashedUTF8Tokens.get(tokenCount), hashedToken); |
| |
| tokenCount++; |
| } |
| } |
| |
| @Test |
| public void testWordTokenizerWithUTF8Tokens() throws IOException { |
| |
| UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory(); |
| DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory); |
| |
| tokenizer.reset(inputBuffer, 0, inputBuffer.length); |
| |
| int tokenCount = 0; |
| |
| while (tokenizer.hasNext()) { |
| tokenizer.next(); |
| |
| // serialize hashed token |
| GrowableArray tokenData = new GrowableArray(); |
| |
| IToken token = tokenizer.getToken(); |
| token.serializeToken(tokenData); |
| |
| // deserialize token |
| ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray()); |
| DataInput in = new DataInputStream(bais); |
| |
| String strToken = in.readUTF(); |
| |
| Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken); |
| |
| tokenCount++; |
| } |
| } |
| |
| // JAQL Hash |
| public int tokenHash(String token, int tokenCount) { |
| int h = AbstractUTF8Token.GOLDEN_RATIO_32; |
| for (int i = 0; i < token.length(); i++) { |
| h ^= token.charAt(i); |
| h *= AbstractUTF8Token.GOLDEN_RATIO_32; |
| } |
| return h + tokenCount; |
| } |
| } |