hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/NGramTokenizerTest.java - asterixdb - Git at Google

 /**
  * Copyright 2010-2011 The Regents of the University of California
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on
  * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations under
  * the License.
  *
  * Author: Alexander Behm <abehm (at) ics.uci.edu>
  */

 package edu.uci.ics.hyracks.storage.am.invertedindex;

 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.DataInput;
 import java.io.DataInputStream;
 import java.io.DataOutput;
 import java.io.DataOutputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;

 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;

 import edu.uci.ics.hyracks.data.std.util.GrowableArray;
 import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.AbstractUTF8Token;
 import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.HashedUTF8NGramTokenFactory;
 import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
 import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
 import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.UTF8NGramTokenFactory;

 public class NGramTokenizerTest {

     private char PRECHAR = '#';
     private char POSTCHAR = '$';

     private String str = "Jürgen S. Generic's Car";
     private byte[] inputBuffer;

     private int gramLength = 3;

     private void getExpectedGrams(String s, int gramLength, ArrayList<String> grams, boolean prePost) {

         String tmp = s.toLowerCase();
         if (prePost) {
             StringBuilder preBuilder = new StringBuilder();
             for (int i = 0; i < gramLength - 1; i++) {
                 preBuilder.append(PRECHAR);
             }
             String pre = preBuilder.toString();

             StringBuilder postBuilder = new StringBuilder();
             for (int i = 0; i < gramLength - 1; i++) {
                 postBuilder.append(POSTCHAR);
             }
             String post = postBuilder.toString();

             tmp = pre + s.toLowerCase() + post;
         }

         for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
             String gram = tmp.substring(i, i + gramLength);
             grams.add(gram);
         }
     }

     @Before
     public void init() throws Exception {
         // serialize string into bytes
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
         DataOutput dos = new DataOutputStream(baos);
         dos.writeUTF(str);
         inputBuffer = baos.toByteArray();
     }

     void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
         HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
         NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false,
                 false, tokenFactory);
         tokenizer.reset(inputBuffer, 0, inputBuffer.length);

         ArrayList<String> expectedGrams = new ArrayList<String>();
         getExpectedGrams(str, gramLength, expectedGrams, prePost);
         ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
         HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
         for (String s : expectedGrams) {
             Integer count = gramCounts.get(s);
             if (count == null) {
                 count = 1;
                 gramCounts.put(s, count);
             } else {
                 count++;
             }

             int hash = tokenHash(s, count);
             expectedHashedGrams.add(hash);
         }

         int tokenCount = 0;

         while (tokenizer.hasNext()) {
             tokenizer.next();

             // serialize hashed token
             GrowableArray tokenStorage = new GrowableArray();

             IToken token = tokenizer.getToken();
             token.serializeToken(tokenStorage);

             // deserialize token
             ByteArrayInputStream bais = new ByteArrayInputStream(tokenStorage.getByteArray());
             DataInput in = new DataInputStream(bais);

             Integer hashedGram = in.readInt();

             // System.out.println(hashedGram);

             Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);

             tokenCount++;
         }
         // System.out.println("---------");
     }

     void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException {
         HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
         NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
                 tokenFactory);
         tokenizer.reset(inputBuffer, 0, inputBuffer.length);

         ArrayList<String> expectedGrams = new ArrayList<String>();
         getExpectedGrams(str, gramLength, expectedGrams, prePost);
         ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
         for (String s : expectedGrams) {
             int hash = tokenHash(s, 1);
             expectedHashedGrams.add(hash);
         }

         int tokenCount = 0;

         while (tokenizer.hasNext()) {
             tokenizer.next();

             // serialize hashed token
             GrowableArray tokenStorage = new GrowableArray();

             IToken token = tokenizer.getToken();
             token.serializeToken(tokenStorage);

             // deserialize token
             ByteArrayInputStream bais = new ByteArrayInputStream(tokenStorage.getByteArray());
             DataInput in = new DataInputStream(bais);

             Integer hashedGram = in.readInt();

             // System.out.println(hashedGram);

             Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);

             tokenCount++;
         }
         // System.out.println("---------");
     }

     void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException {
         UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
         NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
                 tokenFactory);
         tokenizer.reset(inputBuffer, 0, inputBuffer.length);

         ArrayList<String> expectedGrams = new ArrayList<String>();
         getExpectedGrams(str, gramLength, expectedGrams, prePost);

         int tokenCount = 0;

         while (tokenizer.hasNext()) {
             tokenizer.next();

             // serialize hashed token
             GrowableArray tokenStorage = new GrowableArray();

             IToken token = tokenizer.getToken();
             token.serializeToken(tokenStorage);

             // deserialize token
             ByteArrayInputStream bais = new ByteArrayInputStream(tokenStorage.getByteArray());
             DataInput in = new DataInputStream(bais);

             String strGram = in.readUTF();

             // System.out.println("\"" + strGram + "\"");

             Assert.assertEquals(expectedGrams.get(tokenCount), strGram);

             tokenCount++;
         }
         // System.out.println("---------");
     }

     @Test
     public void testNGramTokenizerWithCountedHashedUTF8Tokens() throws Exception {
         runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
         runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
     }

     @Test
     public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
         runTestNGramTokenizerWithHashedUTF8Tokens(false);
         runTestNGramTokenizerWithHashedUTF8Tokens(true);
     }

     @Test
     public void testNGramTokenizerWithUTF8Tokens() throws IOException {
         runTestNGramTokenizerWithUTF8Tokens(false);
         runTestNGramTokenizerWithUTF8Tokens(true);
     }

     public int tokenHash(String token, int tokenCount) {
         int h = AbstractUTF8Token.GOLDEN_RATIO_32;
         for (int i = 0; i < token.length(); i++) {
             h ^= token.charAt(i);
             h *= AbstractUTF8Token.GOLDEN_RATIO_32;
         }
         return h + tokenCount;
     }
 }
	/**
	* Copyright 2010-2011 The Regents of the University of California
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on
	* an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations under
	* the License.
	*
	* Author: Alexander Behm <abehm (at) ics.uci.edu>
	*/

	package edu.uci.ics.hyracks.storage.am.invertedindex;

	import java.io.ByteArrayInputStream;
	import java.io.ByteArrayOutputStream;
	import java.io.DataInput;
	import java.io.DataInputStream;
	import java.io.DataOutput;
	import java.io.DataOutputStream;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.HashMap;

	import org.junit.Assert;
	import org.junit.Before;
	import org.junit.Test;

	import edu.uci.ics.hyracks.data.std.util.GrowableArray;
	import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.AbstractUTF8Token;
	import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.HashedUTF8NGramTokenFactory;
	import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
	import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
	import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.UTF8NGramTokenFactory;

	public class NGramTokenizerTest {

	private char PRECHAR = '#';
	private char POSTCHAR = '$';

	private String str = "Jürgen S. Generic's Car";
	private byte[] inputBuffer;

	private int gramLength = 3;

	private void getExpectedGrams(String s, int gramLength, ArrayList<String> grams, boolean prePost) {

	String tmp = s.toLowerCase();
	if (prePost) {
	StringBuilder preBuilder = new StringBuilder();
	for (int i = 0; i < gramLength - 1; i++) {
	preBuilder.append(PRECHAR);
	}
	String pre = preBuilder.toString();

	StringBuilder postBuilder = new StringBuilder();
	for (int i = 0; i < gramLength - 1; i++) {
	postBuilder.append(POSTCHAR);
	}
	String post = postBuilder.toString();

	tmp = pre + s.toLowerCase() + post;
	}

	for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
	String gram = tmp.substring(i, i + gramLength);
	grams.add(gram);
	}
	}

	@Before
	public void init() throws Exception {
	// serialize string into bytes
	ByteArrayOutputStream baos = new ByteArrayOutputStream();
	DataOutput dos = new DataOutputStream(baos);
	dos.writeUTF(str);
	inputBuffer = baos.toByteArray();
	}

	void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
	HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
	NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false,
	false, tokenFactory);
	tokenizer.reset(inputBuffer, 0, inputBuffer.length);

	ArrayList<String> expectedGrams = new ArrayList<String>();
	getExpectedGrams(str, gramLength, expectedGrams, prePost);
	ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
	HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
	for (String s : expectedGrams) {
	Integer count = gramCounts.get(s);
	if (count == null) {
	count = 1;
	gramCounts.put(s, count);
	} else {
	count++;
	}

	int hash = tokenHash(s, count);
	expectedHashedGrams.add(hash);
	}

	int tokenCount = 0;

	while (tokenizer.hasNext()) {
	tokenizer.next();

	// serialize hashed token
	GrowableArray tokenStorage = new GrowableArray();

	IToken token = tokenizer.getToken();
	token.serializeToken(tokenStorage);

	// deserialize token
	ByteArrayInputStream bais = new ByteArrayInputStream(tokenStorage.getByteArray());
	DataInput in = new DataInputStream(bais);

	Integer hashedGram = in.readInt();

	// System.out.println(hashedGram);

	Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);

	tokenCount++;
	}
	// System.out.println("---------");
	}

	void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException {
	HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
	NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
	tokenFactory);
	tokenizer.reset(inputBuffer, 0, inputBuffer.length);

	ArrayList<String> expectedGrams = new ArrayList<String>();
	getExpectedGrams(str, gramLength, expectedGrams, prePost);
	ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
	for (String s : expectedGrams) {
	int hash = tokenHash(s, 1);
	expectedHashedGrams.add(hash);
	}

	int tokenCount = 0;

	while (tokenizer.hasNext()) {
	tokenizer.next();

	// serialize hashed token
	GrowableArray tokenStorage = new GrowableArray();

	IToken token = tokenizer.getToken();
	token.serializeToken(tokenStorage);

	// deserialize token
	ByteArrayInputStream bais = new ByteArrayInputStream(tokenStorage.getByteArray());
	DataInput in = new DataInputStream(bais);

	Integer hashedGram = in.readInt();

	// System.out.println(hashedGram);

	Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);

	tokenCount++;
	}
	// System.out.println("---------");
	}

	void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException {
	UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
	NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
	tokenFactory);
	tokenizer.reset(inputBuffer, 0, inputBuffer.length);

	ArrayList<String> expectedGrams = new ArrayList<String>();
	getExpectedGrams(str, gramLength, expectedGrams, prePost);

	int tokenCount = 0;

	while (tokenizer.hasNext()) {
	tokenizer.next();

	// serialize hashed token
	GrowableArray tokenStorage = new GrowableArray();

	IToken token = tokenizer.getToken();
	token.serializeToken(tokenStorage);

	// deserialize token
	ByteArrayInputStream bais = new ByteArrayInputStream(tokenStorage.getByteArray());
	DataInput in = new DataInputStream(bais);

	String strGram = in.readUTF();

	// System.out.println("\"" + strGram + "\"");

	Assert.assertEquals(expectedGrams.get(tokenCount), strGram);

	tokenCount++;
	}
	// System.out.println("---------");
	}

	@Test
	public void testNGramTokenizerWithCountedHashedUTF8Tokens() throws Exception {
	runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
	runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
	}

	@Test
	public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
	runTestNGramTokenizerWithHashedUTF8Tokens(false);
	runTestNGramTokenizerWithHashedUTF8Tokens(true);
	}

	@Test
	public void testNGramTokenizerWithUTF8Tokens() throws IOException {
	runTestNGramTokenizerWithUTF8Tokens(false);
	runTestNGramTokenizerWithUTF8Tokens(true);
	}

	public int tokenHash(String token, int tokenCount) {
	int h = AbstractUTF8Token.GOLDEN_RATIO_32;
	for (int i = 0; i < token.length(); i++) {
	h ^= token.charAt(i);
	h *= AbstractUTF8Token.GOLDEN_RATIO_32;
	}
	return h + tokenCount;
	}
	}