blob: de9ad2c55db02392b02ec02a1eae6e64178d85a8 [file] [log] [blame]
/**
* Copyright 2010-2011 The Regents of the University of California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under
* the License.
*
* Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
/**
 * Tokenizer that splits the source string into maximal runs of
 * non-separator characters. A character is a separator unless it is a
 * letter, a digit, or has the Unicode general category OTHER_NUMBER (see
 * {@link #isSeparator(char)}).
 *
 * <p>
 * Unless {@code ignoreTokenCount} is set, every token is additionally
 * tagged with a count: 1 plus the number of earlier tokens from the same
 * source that are equal to it under {@code StringUtils.toLowerCase}.
 *
 * <p>
 * The cursor and bookkeeping state ({@code data}, {@code index},
 * {@code length}, {@code tokenLength}, {@code tokensStart},
 * {@code tokensLength}, {@code token}) are inherited from the superclass.
 */
public class DelimitedUTF8StringBinaryTokenizer extends
        AbstractUTF8StringBinaryTokenizer {

    /**
     * Forwards all arguments unchanged to the superclass constructor.
     *
     * @param ignoreTokenCount
     *            when true, {@link #next()} skips the search for earlier
     *            occurrences and always reports a token count of 1
     * @param sourceHasTypeTag
     *            passed through to the superclass; not used in this class
     * @param tokenFactory
     *            passed through to the superclass; not used in this class
     */
    public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
            boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
        super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
    }

    /**
     * Advances the cursor past any separators and reports whether a token
     * starts at the resulting position.
     *
     * @return true if the cursor stopped before the end of the input, i.e.
     *         on a non-separator character
     */
    @Override
    public boolean hasNext() {
        // Skip delimiters; characters may be wider than one position, so
        // advance by the encoded size of each one.
        while (index < length && isSeparator(StringUtils.charAt(data, index))) {
            index += StringUtils.charSize(data, index);
        }
        return index < length;
    }

    /**
     * Decides whether a character delimits tokens.
     *
     * @param c
     *            character to classify
     * @return true unless {@code c} is a letter, a digit, or of category
     *         OTHER_NUMBER
     */
    private boolean isSeparator(char c) {
        // Character.isLetterOrDigit already accepts OTHER_LETTER, so only
        // OTHER_NUMBER (which Character.isDigit rejects) needs its own test.
        return !(Character.isLetterOrDigit(c)
                || Character.getType(c) == Character.OTHER_NUMBER);
    }

    /**
     * Consumes the run of non-separator characters at the cursor and
     * stores it in {@code token}. If the cursor is on a separator or at
     * the end of the input, the resulting token has length 0.
     */
    @Override
    public void next() {
        tokenLength = 0;
        int currentTokenStart = index;
        // tokenLength counts characters, while index advances by each
        // character's encoded size.
        while (index < length && !isSeparator(StringUtils.charAt(data, index))) {
            index += StringUtils.charSize(data, index);
            tokenLength++;
        }

        int tokenCount = 1;
        if (tokenLength > 0 && !ignoreTokenCount) {
            // Count every earlier token that equals this one; only tokens
            // with the same number of characters can match.
            for (int i = 0; i < tokensStart.length(); ++i) {
                if (tokenLength == tokensLength.get(i)
                        && tokensMatchIgnoreCase(currentTokenStart,
                                tokensStart.get(i), tokenLength)) {
                    tokenCount++;
                }
            }
            // add the new token to the list of seen tokens
            tokensStart.add(currentTokenStart);
            tokensLength.add(tokenLength);
        }

        // set token
        token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
    }

    /**
     * Compares two tokens of equal character count, character by
     * character, after mapping each character through
     * {@code StringUtils.toLowerCase}.
     *
     * @param firstStart
     *            position in {@code data} where the first token begins
     * @param secondStart
     *            position in {@code data} where the second token begins
     * @param numChars
     *            number of characters to compare
     * @return true if all {@code numChars} character pairs are equal
     */
    private boolean tokensMatchIgnoreCase(int firstStart, int secondStart,
            int numChars) {
        // Each token gets its own cursor. A single shared offset would only
        // be correct if characters that compare equal always had the same
        // encoded size, which nothing in this class guarantees.
        int firstPos = firstStart;
        int secondPos = secondStart;
        for (int compared = 0; compared < numChars; compared++) {
            char firstChar = StringUtils.toLowerCase(StringUtils.charAt(data,
                    firstPos));
            char secondChar = StringUtils.toLowerCase(StringUtils.charAt(data,
                    secondPos));
            if (firstChar != secondChar) {
                return false;
            }
            firstPos += StringUtils.charSize(data, firstPos);
            secondPos += StringUtils.charSize(data, secondPos);
        }
        return true;
    }
}