blob: d8ecb77d401e96f5f43cc14c52150f3702d4ac08 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.icu.segmentation;
import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
/**
* Contain all the issues surrounding BreakIterators in ICU in one place.
* Basically this boils down to the fact that they aren't very friendly to any
* sort of OO design.
* <p>
* http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
* BreakIterator from RuleBasedBreakIterator
* <p>
* DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
* doesn't actually behave as a subclass: it always returns 0 for
* getRuleStatus():
* http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
* tags
* @lucene.experimental
*/
abstract class BreakIteratorWrapper {
protected final CharArrayIterator textIterator = new CharArrayIterator();
protected char text[];
protected int start;
protected int length;
abstract int next();
abstract int current();
abstract int getRuleStatus();
abstract void setText(CharacterIterator text);
void setText(char text[], int start, int length) {
this.text = text;
this.start = start;
this.length = length;
textIterator.setText(text, start, length);
setText(textIterator);
}
/**
* If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
* any other BreakIterator, the rulestatus method is not available, so treat
* it like a generic BreakIterator.
*/
static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
if (breakIterator instanceof RuleBasedBreakIterator)
return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
else
return new BIWrapper(breakIterator);
}
/**
* RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
* a DictionaryBasedBreakIterator) behaves correctly.
*/
static final class RBBIWrapper extends BreakIteratorWrapper {
private final RuleBasedBreakIterator rbbi;
RBBIWrapper(RuleBasedBreakIterator rbbi) {
this.rbbi = rbbi;
}
@Override
int current() {
return rbbi.current();
}
@Override
int getRuleStatus() {
return rbbi.getRuleStatus();
}
@Override
int next() {
return rbbi.next();
}
@Override
void setText(CharacterIterator text) {
rbbi.setText(text);
}
}
/**
* Generic BreakIterator wrapper: Either the rulestatus method is not
* available or always returns 0. Calculate a rulestatus here so it behaves
* like RuleBasedBreakIterator.
*
* Note: This is slower than RuleBasedBreakIterator.
*/
static final class BIWrapper extends BreakIteratorWrapper {
private final BreakIterator bi;
private int status;
BIWrapper(BreakIterator bi) {
this.bi = bi;
}
@Override
int current() {
return bi.current();
}
@Override
int getRuleStatus() {
return status;
}
@Override
int next() {
int current = bi.current();
int next = bi.next();
status = calcStatus(current, next);
return next;
}
private int calcStatus(int current, int next) {
if (current == BreakIterator.DONE || next == BreakIterator.DONE)
return RuleBasedBreakIterator.WORD_NONE;
int begin = start + current;
int end = start + next;
int codepoint;
for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
codepoint = UTF16.charAt(text, 0, end, begin);
if (UCharacter.isDigit(codepoint))
return RuleBasedBreakIterator.WORD_NUMBER;
else if (UCharacter.isLetter(codepoint)) {
// TODO: try to separately specify ideographic, kana?
// [currently all bundled as letter for this case]
return RuleBasedBreakIterator.WORD_LETTER;
}
}
return RuleBasedBreakIterator.WORD_NONE;
}
@Override
void setText(CharacterIterator text) {
bi.setText(text);
status = RuleBasedBreakIterator.WORD_NONE;
}
}
}