/*
* Copyright 2009-2010 by The Regents of the University of California
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
 * You may obtain a copy of the License from
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import edu.uci.ics.hyracks.data.std.util.GrowableArray;
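
/**
 * Tests the delimited UTF-8 word tokenizers by tokenizing a test string three
 * ways (plain tokens, hashed tokens, and count-aware hashed tokens) and
 * comparing every emitted token against an expected value computed in plain
 * Java.
 */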
public class WordTokenizerTest {
private String text = "Hello World, I would like to inform you of the importance of Foo Bar. Yes, Foo Bar. Jürgen.";
private byte[] inputBuffer;
private ArrayList<String> expectedUTF8Tokens = new ArrayList<String>();
private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>();
private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();

    private boolean isSeparator(char c) {
        return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER
                || Character.getType(c) == Character.OTHER_NUMBER);
    }
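
    /**
     * Reference tokenizer: lower-cases the text and splits it on separator
     * characters. Its output is what the binary tokenizers under test are
     * expected to reproduce.
     */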
    private void tokenize(String text, ArrayList<String> tokens) {
        String lowerCaseText = text.toLowerCase();
        int startIx = 0;
        while (startIx < lowerCaseText.length()) {
            // Skip separators (this also covers separators at the beginning of the string).
            while (startIx < lowerCaseText.length() && isSeparator(lowerCaseText.charAt(startIx))) {
                startIx++;
            }
            int tokenStart = startIx;
            while (startIx < lowerCaseText.length() && !isSeparator(lowerCaseText.charAt(startIx))) {
                startIx++;
            }
            int tokenEnd = startIx;
            // Emit token, unless the string ended in separators (the test text ends with '.',
            // which would otherwise produce a spurious empty token).
            if (tokenStart < tokenEnd) {
                tokens.add(lowerCaseText.substring(tokenStart, tokenEnd));
            }
        }
    }
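
    /**
     * Serializes the test string with DataOutput.writeUTF() (the framing the
     * binary tokenizers consume) and precomputes the expected token lists.
     */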
@Before
public void init() throws IOException {
// serialize text into bytes
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DataOutput dos = new DataOutputStream(baos);
dos.writeUTF(text);
inputBuffer = baos.toByteArray();
// init expected string tokens
tokenize(text, expectedUTF8Tokens);
// hashed tokens ignoring token count
for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
int hash = tokenHash(expectedUTF8Tokens.get(i), 1);
expectedHashedUTF8Tokens.add(hash);
}
// hashed tokens using token count
        HashMap<String, Integer> tokenCounts = new HashMap<String, Integer>();
        for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
            String token = expectedUTF8Tokens.get(i);
            Integer count = tokenCounts.get(token);
            if (count == null) {
                count = 1;
            } else {
                count++;
            }
            // Write the updated count back so later occurrences see the right value.
            tokenCounts.put(token, count);
            int hash = tokenHash(token, count);
            expectedCountedHashedUTF8Tokens.add(hash);
        }
}
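
    /**
     * Count-aware hashing (ignoreTokenCount == false): the hash mixes in the
     * occurrence count, so the second "foo" and "bar" hash differently from
     * the first.
     */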
@Test
public void testWordTokenizerWithCountedHashedUTF8Tokens() throws IOException {
HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(false, false,
tokenFactory);
tokenizer.reset(inputBuffer, 0, inputBuffer.length);
int tokenCount = 0;
while (tokenizer.hasNext()) {
tokenizer.next();
// serialize hashed token
GrowableArray tokenData = new GrowableArray();
IToken token = tokenizer.getToken();
token.serializeToken(tokenData);
// deserialize token
ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
DataInput in = new DataInputStream(bais);
Integer hashedToken = in.readInt();
            Assert.assertEquals(expectedCountedHashedUTF8Tokens.get(tokenCount), hashedToken);
tokenCount++;
}
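        // Sanity check: the tokenizer emitted every expected token.
        Assert.assertEquals(expectedCountedHashedUTF8Tokens.size(), tokenCount);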
}
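
    /**
     * Count-insensitive hashing (ignoreTokenCount == true): every occurrence
     * of the same word hashes to the same value.
     */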
@Test
public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
tokenizer.reset(inputBuffer, 0, inputBuffer.length);
int tokenCount = 0;
while (tokenizer.hasNext()) {
tokenizer.next();
// serialize hashed token
GrowableArray tokenData = new GrowableArray();
IToken token = tokenizer.getToken();
token.serializeToken(tokenData);
// deserialize token
ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
DataInput in = new DataInputStream(bais);
Integer hashedToken = in.readInt();
Assert.assertEquals(expectedHashedUTF8Tokens.get(tokenCount), hashedToken);
tokenCount++;
}
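        // Sanity check: the tokenizer emitted every expected token.
        Assert.assertEquals(expectedHashedUTF8Tokens.size(), tokenCount);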
}
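
    /**
     * Plain UTF-8 tokens: each serialized token is read back with readUTF()
     * and compared against the expected lower-cased word.
     */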
@Test
public void testWordTokenizerWithUTF8Tokens() throws IOException {
UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
tokenizer.reset(inputBuffer, 0, inputBuffer.length);
int tokenCount = 0;
while (tokenizer.hasNext()) {
tokenizer.next();
// serialize string token
GrowableArray tokenData = new GrowableArray();
IToken token = tokenizer.getToken();
token.serializeToken(tokenData);
// deserialize token
ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
DataInput in = new DataInputStream(bais);
String strToken = in.readUTF();
Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
tokenCount++;
}
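        // Sanity check: the tokenizer emitted every expected token.
        Assert.assertEquals(expectedUTF8Tokens.size(), tokenCount);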
}

    // Multiplicative string hash as used by JAQL: XOR each character into the
    // running hash and multiply by the 32-bit golden ratio, then add the
    // occurrence count so repeated tokens get distinct hashes.
    public int tokenHash(String token, int tokenCount) {
        int h = AbstractUTF8Token.GOLDEN_RATIO_32;
        for (int i = 0; i < token.length(); i++) {
            h ^= token.charAt(i);
            h *= AbstractUTF8Token.GOLDEN_RATIO_32;
        }
        return h + tokenCount;
    }
}