| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.icu.segmentation; |
| |
| |
| import java.text.CharacterIterator; |
| |
| import com.ibm.icu.lang.UCharacter; |
| import com.ibm.icu.text.BreakIterator; |
| import com.ibm.icu.text.RuleBasedBreakIterator; |
| import com.ibm.icu.text.UTF16; |
| |
| /** |
| * Contain all the issues surrounding BreakIterators in ICU in one place. |
| * Basically this boils down to the fact that they aren't very friendly to any |
| * sort of OO design. |
| * <p> |
| * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to |
| * BreakIterator from RuleBasedBreakIterator |
| * <p> |
| * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but |
| * doesn't actually behave as a subclass: it always returns 0 for |
| * getRuleStatus(): |
| * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type |
| * tags |
| * @lucene.experimental |
| */ |
| abstract class BreakIteratorWrapper { |
| protected final CharArrayIterator textIterator = new CharArrayIterator(); |
| protected char text[]; |
| protected int start; |
| protected int length; |
| |
| abstract int next(); |
| abstract int current(); |
| abstract int getRuleStatus(); |
| abstract void setText(CharacterIterator text); |
| |
| void setText(char text[], int start, int length) { |
| this.text = text; |
| this.start = start; |
| this.length = length; |
| textIterator.setText(text, start, length); |
| setText(textIterator); |
| } |
| |
| /** |
| * If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's |
| * any other BreakIterator, the rulestatus method is not available, so treat |
| * it like a generic BreakIterator. |
| */ |
| static BreakIteratorWrapper wrap(BreakIterator breakIterator) { |
| if (breakIterator instanceof RuleBasedBreakIterator) |
| return new RBBIWrapper((RuleBasedBreakIterator) breakIterator); |
| else |
| return new BIWrapper(breakIterator); |
| } |
| |
| /** |
| * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not |
| * a DictionaryBasedBreakIterator) behaves correctly. |
| */ |
| static final class RBBIWrapper extends BreakIteratorWrapper { |
| private final RuleBasedBreakIterator rbbi; |
| |
| RBBIWrapper(RuleBasedBreakIterator rbbi) { |
| this.rbbi = rbbi; |
| } |
| |
| @Override |
| int current() { |
| return rbbi.current(); |
| } |
| |
| @Override |
| int getRuleStatus() { |
| return rbbi.getRuleStatus(); |
| } |
| |
| @Override |
| int next() { |
| return rbbi.next(); |
| } |
| |
| @Override |
| void setText(CharacterIterator text) { |
| rbbi.setText(text); |
| } |
| } |
| |
| /** |
| * Generic BreakIterator wrapper: Either the rulestatus method is not |
| * available or always returns 0. Calculate a rulestatus here so it behaves |
| * like RuleBasedBreakIterator. |
| * |
| * Note: This is slower than RuleBasedBreakIterator. |
| */ |
| static final class BIWrapper extends BreakIteratorWrapper { |
| private final BreakIterator bi; |
| private int status; |
| |
| BIWrapper(BreakIterator bi) { |
| this.bi = bi; |
| } |
| |
| @Override |
| int current() { |
| return bi.current(); |
| } |
| |
| @Override |
| int getRuleStatus() { |
| return status; |
| } |
| |
| @Override |
| int next() { |
| int current = bi.current(); |
| int next = bi.next(); |
| status = calcStatus(current, next); |
| return next; |
| } |
| |
| private int calcStatus(int current, int next) { |
| if (current == BreakIterator.DONE || next == BreakIterator.DONE) |
| return RuleBasedBreakIterator.WORD_NONE; |
| |
| int begin = start + current; |
| int end = start + next; |
| |
| int codepoint; |
| for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) { |
| codepoint = UTF16.charAt(text, 0, end, begin); |
| |
| if (UCharacter.isDigit(codepoint)) |
| return RuleBasedBreakIterator.WORD_NUMBER; |
| else if (UCharacter.isLetter(codepoint)) { |
| // TODO: try to separately specify ideographic, kana? |
| // [currently all bundled as letter for this case] |
| return RuleBasedBreakIterator.WORD_LETTER; |
| } |
| } |
| |
| return RuleBasedBreakIterator.WORD_NONE; |
| } |
| |
| @Override |
| void setText(CharacterIterator text) { |
| bi.setText(text); |
| status = RuleBasedBreakIterator.WORD_NONE; |
| } |
| } |
| } |