hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java - asterixdb - Git at Google

 /**
  * Copyright 2010-2011 The Regents of the University of California
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on
  * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations under
  * the License.
  *
  * Author: Alexander Behm <abehm (at) ics.uci.edu>
  */

 package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;

 import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;

 public class NGramUTF8StringBinaryTokenizer extends
 		AbstractUTF8StringBinaryTokenizer {

 	private int gramLength;
 	private boolean usePrePost;

 	private int gramNum;
 	private int totalGrams;

 	private final INGramToken concreteToken;

 	public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost,
 			boolean ignoreTokenCount, boolean sourceHasTypeTag,
 			ITokenFactory tokenFactory) {
 		super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
 		this.gramLength = gramLength;
 		this.usePrePost = usePrePost;
 		concreteToken = (INGramToken) token;
 	}

 	@Override
 	public boolean hasNext() {
 		if (gramNum < totalGrams) {
 			return true;
 		} else {
 			return false;
 		}
 	}

 	@Override
 	public void next() {
 		int currentTokenStart = index;
 		int tokenCount = 1;
 		int numPreChars = 0;
 		int numPostChars = 0;
 		if (usePrePost) {
 			numPreChars = Math.max(gramLength - gramNum - 1, 0);
 			numPostChars = (gramNum > totalGrams - gramLength) ? gramLength
 					- totalGrams + gramNum : 0;
 		}
 		gramNum++;

 		concreteToken.setNumPrePostChars(numPreChars, numPostChars);
 		if (numPreChars == 0) {
 			index += StringUtils.charSize(data, index);
 		}

 		// compute token count
 		// ignore pre and post grams for duplicate detection
 		if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
 			int tmpIndex = start;
 			while (tmpIndex < currentTokenStart) {
 				tokenCount++; // assume found
 				int offset = 0;
 				for (int j = 0; j < gramLength; j++) {
 					if (StringUtils.toLowerCase(StringUtils.charAt(data,
 							currentTokenStart + offset)) != StringUtils
 							.toLowerCase(StringUtils.charAt(data, tmpIndex
 									+ offset))) {
 						tokenCount--;
 						break;
 					}
 					offset += StringUtils.charSize(data, tmpIndex + offset);
 				}
 				tmpIndex += StringUtils.charSize(data, tmpIndex);
 			}
 		}

 		// set token
 		token.reset(data, currentTokenStart, length, gramLength, tokenCount);
 	}

 	@Override
 	public void reset(byte[] data, int start, int length) {
 		super.reset(data, start, length);
 		gramNum = 0;

 		int numChars = 0;
 		int pos = index;
 		int end = pos + utf8Length;
 		while (pos < end) {
 			numChars++;
 			pos += StringUtils.charSize(data, pos);
 		}

 		if (usePrePost) {
 			totalGrams = numChars + gramLength - 1;
 		} else {
 			totalGrams = numChars - gramLength + 1;
 		}
 	}

 	public void setGramlength(int gramLength) {
 		this.gramLength = gramLength;
 	}

 	public void setPrePost(boolean usePrePost) {
 		this.usePrePost = usePrePost;
 	}
 }
	/**
	* Copyright 2010-2011 The Regents of the University of California
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on
	* an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations under
	* the License.
	*
	* Author: Alexander Behm <abehm (at) ics.uci.edu>
	*/

	package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;

	import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;

	public class NGramUTF8StringBinaryTokenizer extends
	AbstractUTF8StringBinaryTokenizer {

	private int gramLength;
	private boolean usePrePost;

	private int gramNum;
	private int totalGrams;

	private final INGramToken concreteToken;

	public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost,
	boolean ignoreTokenCount, boolean sourceHasTypeTag,
	ITokenFactory tokenFactory) {
	super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
	this.gramLength = gramLength;
	this.usePrePost = usePrePost;
	concreteToken = (INGramToken) token;
	}

	@Override
	public boolean hasNext() {
	if (gramNum < totalGrams) {
	return true;
	} else {
	return false;
	}
	}

	@Override
	public void next() {
	int currentTokenStart = index;
	int tokenCount = 1;
	int numPreChars = 0;
	int numPostChars = 0;
	if (usePrePost) {
	numPreChars = Math.max(gramLength - gramNum - 1, 0);
	numPostChars = (gramNum > totalGrams - gramLength) ? gramLength
	- totalGrams + gramNum : 0;
	}
	gramNum++;

	concreteToken.setNumPrePostChars(numPreChars, numPostChars);
	if (numPreChars == 0) {
	index += StringUtils.charSize(data, index);
	}

	// compute token count
	// ignore pre and post grams for duplicate detection
	if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
	int tmpIndex = start;
	while (tmpIndex < currentTokenStart) {
	tokenCount++; // assume found
	int offset = 0;
	for (int j = 0; j < gramLength; j++) {
	if (StringUtils.toLowerCase(StringUtils.charAt(data,
	currentTokenStart + offset)) != StringUtils
	.toLowerCase(StringUtils.charAt(data, tmpIndex
	+ offset))) {
	tokenCount--;
	break;
	}
	offset += StringUtils.charSize(data, tmpIndex + offset);
	}
	tmpIndex += StringUtils.charSize(data, tmpIndex);
	}
	}

	// set token
	token.reset(data, currentTokenStart, length, gramLength, tokenCount);
	}

	@Override
	public void reset(byte[] data, int start, int length) {
	super.reset(data, start, length);
	gramNum = 0;

	int numChars = 0;
	int pos = index;
	int end = pos + utf8Length;
	while (pos < end) {
	numChars++;
	pos += StringUtils.charSize(data, pos);
	}

	if (usePrePost) {
	totalGrams = numChars + gramLength - 1;
	} else {
	totalGrams = numChars - gramLength + 1;
	}
	}

	public void setGramlength(int gramLength) {
	this.gramLength = gramLength;
	}

	public void setPrePost(boolean usePrePost) {
	this.usePrePost = usePrePost;
	}
	}