blob: 4c1152333a96b6c50283275a86d21b7e6b1a2681 [file] [log] [blame]
/**
* Copyright 2010-2011 The Regents of the University of California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on
* an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under
* the License.
*
* Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
import edu.uci.ics.hyracks.data.std.primitive.UTF8StringPointable;
public class DelimitedUTF8StringBinaryTokenizer extends AbstractUTF8StringBinaryTokenizer {
public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount, boolean sourceHasTypeTag,
ITokenFactory tokenFactory) {
super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
}
@Override
public boolean hasNext() {
// skip delimiters
while (index < length && isSeparator(UTF8StringPointable.charAt(data, index))) {
index += UTF8StringPointable.charSize(data, index);
}
return index < length;
}
private boolean isSeparator(char c) {
return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER);
}
@Override
public void next() {
tokenLength = 0;
int currentTokenStart = index;
while (index < length && !isSeparator(UTF8StringPointable.charAt(data, index))) {
index += UTF8StringPointable.charSize(data, index);
tokenLength++;
}
int tokenCount = 1;
if (tokenLength > 0 && !ignoreTokenCount) {
// search if we got the same token before
for (int i = 0; i < tokensStart.length(); ++i) {
if (tokenLength == tokensLength.get(i)) {
int tokenStart = tokensStart.get(i);
tokenCount++; // assume we found it
int offset = 0;
int currLength = 0;
while (currLength < tokenLength) {
// case insensitive comparison
if (Character.toLowerCase(UTF8StringPointable.charAt(data, currentTokenStart + offset)) != Character
.toLowerCase(UTF8StringPointable.charAt(data, tokenStart + offset))) {
tokenCount--;
break;
}
offset += UTF8StringPointable.charSize(data, currentTokenStart + offset);
currLength++;
}
}
}
// add the new token to the list of seen tokens
tokensStart.add(currentTokenStart);
tokensLength.add(tokenLength);
}
// set token
token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
}
}