blob: 0af0335b25630cf6134a6aae1cc67ca6584e2b78 [file] [log] [blame]
/**
* Copyright 2010-2011 The Regents of the University of California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under
* the License.
*
* Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
import edu.uci.ics.hyracks.data.std.primitive.UTF8StringPointable;
/**
 * Tokenizer that walks a UTF-8 encoded string and emits one n-gram token per call to
 * {@link #next()}.
 *
 * <p>When {@code usePrePost} is enabled, {@code gramLength - 1} additional grams are produced
 * at each end of the string; for those grams the number of missing leading/trailing
 * characters is handed to the token via {@code setNumPrePostChars} (how they are rendered is
 * up to the token implementation). Without pre/post grams a string of {@code n} characters
 * yields {@code n - gramLength + 1} grams, i.e. none if the string is shorter than a gram.
 *
 * <p>Unless {@code ignoreTokenCount} is set, each full gram (no pre/post characters) is
 * given a token count of one plus the number of earlier, case-insensitively equal grams in
 * the same string.
 */
public class NGramUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {

    private int gramLength;
    private boolean usePrePost;

    /** Number of grams handed out since the last {@link #reset(byte[], int, int)}. */
    private int gramNum;
    /** Total number of grams for the current string, computed in reset(). */
    private int totalGrams;
    /** Byte position of the first character of the current string, captured in reset(). */
    private int firstCharPos;

    private final INGramToken concreteToken;

    /**
     * @param gramLength number of characters per gram
     * @param usePrePost whether grams overlapping the beginning and end of the string are produced
     * @param ignoreTokenCount passed to the superclass; when true, duplicate counting is skipped
     * @param sourceHasTypeTag passed to the superclass
     * @param tokenFactory passed to the superclass; the resulting token must be an
     *        {@code INGramToken}, otherwise the cast below fails with a ClassCastException
     */
    public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost, boolean ignoreTokenCount,
            boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
        super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
        this.gramLength = gramLength;
        this.usePrePost = usePrePost;
        concreteToken = (INGramToken) token;
    }

    /**
     * @return true while fewer grams have been produced than were counted in reset()
     */
    @Override
    public boolean hasNext() {
        return gramNum < totalGrams;
    }

    /**
     * Positions the token on the next gram and advances the tokenizer.
     * Callers are expected to check {@link #hasNext()} first; no bounds check is done here.
     */
    @Override
    public void next() {
        int currentTokenStart = index;
        int tokenCount = 1;
        int numPreChars = 0;
        int numPostChars = 0;
        if (usePrePost) {
            // The first gramLength - 1 grams lack leading characters, the last gramLength - 1
            // grams lack trailing characters.
            numPreChars = Math.max(gramLength - gramNum - 1, 0);
            numPostChars = (gramNum > totalGrams - gramLength) ? gramLength - totalGrams + gramNum : 0;
        }
        gramNum++;

        concreteToken.setNumPrePostChars(numPreChars, numPostChars);
        // Grams with pre characters all begin at the first character, so the read position
        // only moves forward once the pre grams are exhausted.
        if (numPreChars == 0) {
            index += UTF8StringPointable.charSize(data, index);
        }

        // Pre and post grams are ignored for duplicate detection.
        if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
            tokenCount += countEarlierOccurrences(currentTokenStart);
        }

        token.reset(data, currentTokenStart, length, gramLength, tokenCount);
    }

    /**
     * Counts the grams that start before {@code gramStart} and are case-insensitively equal
     * to the gram starting at {@code gramStart}.
     *
     * <p>The scan begins at the first character recorded by reset() rather than at
     * {@code start}: reset() treats the superclass' {@code index} as the first character,
     * and {@code start} may precede it if the superclass skips header bytes (not visible in
     * this class), in which case those bytes must not be decoded as characters.
     */
    private int countEarlierOccurrences(int gramStart) {
        int occurrences = 0;
        int candidate = firstCharPos;
        while (candidate < gramStart) {
            if (gramsEqualIgnoreCase(gramStart, candidate)) {
                occurrences++;
            }
            candidate += UTF8StringPointable.charSize(data, candidate);
        }
        return occurrences;
    }

    /**
     * Compares the {@code gramLength} characters starting at the two byte positions after
     * lower-casing each character.
     *
     * <p>Each side is advanced by the encoded size of its own character. Characters that are
     * equal after lower-casing may still differ in their encoded size, so a single shared
     * offset would leave one side misaligned.
     */
    private boolean gramsEqualIgnoreCase(int posA, int posB) {
        for (int j = 0; j < gramLength; j++) {
            if (Character.toLowerCase(UTF8StringPointable.charAt(data, posA)) != Character
                    .toLowerCase(UTF8StringPointable.charAt(data, posB))) {
                return false;
            }
            posA += UTF8StringPointable.charSize(data, posA);
            posB += UTF8StringPointable.charSize(data, posB);
        }
        return true;
    }

    /**
     * Points the tokenizer at a new string and precomputes how many grams it will produce.
     *
     * @param data buffer holding the string
     * @param start passed to the superclass together with {@code data} and {@code length}
     * @param length passed to the superclass
     */
    @Override
    public void reset(byte[] data, int start, int length) {
        super.reset(data, start, length);
        gramNum = 0;
        firstCharPos = index;

        // Count characters (not bytes) in the utf8Length bytes that follow index.
        int numChars = 0;
        int pos = index;
        int end = pos + utf8Length;
        while (pos < end) {
            numChars++;
            pos += UTF8StringPointable.charSize(data, pos);
        }

        totalGrams = usePrePost ? numChars + gramLength - 1 : numChars - gramLength + 1;
    }

    /**
     * Changes the gram length. The total gram count is only recomputed by the next call to
     * {@link #reset(byte[], int, int)}.
     */
    public void setGramlength(int gramLength) {
        this.gramLength = gramLength;
    }

    /**
     * Enables or disables pre/post grams. The total gram count is only recomputed by the
     * next call to {@link #reset(byte[], int, int)}.
     */
    public void setPrePost(boolean usePrePost) {
        this.usePrePost = usePrePost;
    }
}