| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.util; |
| |
| import java.text.BreakIterator; // javadoc |
| import java.text.CharacterIterator; |
| import java.util.Locale; |
| |
/**
 * A {@link CharacterIterator} over a region of a {@code char[]}, used internally for use with
 * {@link BreakIterator}.
 *
 * <p>All positions exposed through the {@link CharacterIterator} methods are relative to the start
 * of the region: {@link #getBeginIndex()} is always {@code 0} and {@link #getEndIndex()} is the
 * region length. Every character handed out passes through {@link #jreBugWorkaround(char)} first.
 *
 * @lucene.internal
 */
public abstract class CharArrayIterator implements CharacterIterator {
  /** Backing buffer; only the region {@code [start, limit)} is iterated. */
  private char[] array;

  /** Absolute offset into {@link #array} of the first character of the region. */
  private int start;

  /** Absolute offset into {@link #array} of the current position. */
  private int index;

  /** Number of characters in the region. */
  private int length;

  /** Absolute offset one past the last character of the region ({@code start + length}). */
  private int limit;

  /**
   * Returns the backing buffer passed to {@link #setText(char[], int, int)}.
   *
   * @return the buffer itself, not a copy
   */
  public char[] getText() {
    return array;
  }

  /**
   * Returns the offset into the buffer at which the examined region begins.
   *
   * @return the {@code start} value passed to {@link #setText(char[], int, int)}
   */
  public int getStart() {
    return start;
  }

  /**
   * Returns the length of the examined region.
   *
   * @return the {@code length} value passed to {@link #setText(char[], int, int)}
   */
  public int getLength() {
    return length;
  }

  /**
   * Set a new region of text to be examined by this iterator. The current position is reset to the
   * beginning of the region. The arguments are stored as given; no range validation is performed.
   *
   * @param array text buffer to examine
   * @param start offset into buffer
   * @param length maximum length to examine
   */
  public void setText(final char[] array, int start, int length) {
    this.array = array;
    this.start = start;
    this.index = start;
    this.length = length;
    this.limit = start + length;
  }

  /**
   * Returns the character at the current position after applying {@link
   * #jreBugWorkaround(char)}.
   *
   * @return the (possibly substituted) current character, or {@link CharacterIterator#DONE} when
   *     the position is at the end of the region
   */
  @Override
  public char current() {
    return (index == limit) ? DONE : jreBugWorkaround(array[index]);
  }

  /**
   * Hook applied to every character before it is returned to the caller.
   *
   * @param ch the raw character read from the buffer
   * @return the character to report in its place
   */
  protected abstract char jreBugWorkaround(char ch);

  /**
   * Moves to the beginning of the region.
   *
   * @return the first character, or {@link CharacterIterator#DONE} if the region is empty
   */
  @Override
  public char first() {
    index = start;
    return current();
  }

  /**
   * Returns the first valid relative position.
   *
   * @return always {@code 0}
   */
  @Override
  public int getBeginIndex() {
    return 0;
  }

  /**
   * Returns the relative position one past the last character.
   *
   * @return the region length
   */
  @Override
  public int getEndIndex() {
    return length;
  }

  /**
   * Returns the current position relative to the start of the region.
   *
   * @return a value between {@link #getBeginIndex()} and {@link #getEndIndex()}
   */
  @Override
  public int getIndex() {
    return index - start;
  }

  /**
   * Moves to the last character of the region, or to the end position if the region is empty.
   *
   * @return the last character, or {@link CharacterIterator#DONE} if the region is empty
   */
  @Override
  public char last() {
    // An empty region has no last character: stay on the end position so current() reports DONE.
    index = (limit == start) ? limit : limit - 1;
    return current();
  }

  /**
   * Advances by one character.
   *
   * @return the character at the new position, or {@link CharacterIterator#DONE} if the end was
   *     reached, in which case the position is pinned to the end of the region
   */
  @Override
  public char next() {
    if (++index >= limit) {
      index = limit;
      return DONE;
    }
    return current();
  }

  /**
   * Steps back by one character.
   *
   * @return the character at the new position, or {@link CharacterIterator#DONE} if the position
   *     was already at the beginning, in which case it stays there
   */
  @Override
  public char previous() {
    if (--index < start) {
      index = start;
      return DONE;
    }
    return current();
  }

  /**
   * Moves to the given position, expressed relative to the start of the region.
   *
   * @param position target position in {@code [getBeginIndex(), getEndIndex()]}
   * @return the character at that position, or {@link CharacterIterator#DONE} if it is the end
   *     position
   * @throws IllegalArgumentException if {@code position} is outside the valid range
   */
  @Override
  public char setIndex(int position) {
    if (position < getBeginIndex() || position > getEndIndex()) {
      throw new IllegalArgumentException("Illegal Position: " + position);
    }
    index = start + position;
    return current();
  }

  /**
   * Returns a shallow copy of this iterator: the copy has its own position but refers to the same
   * backing buffer.
   *
   * @return the copy
   */
  @Override
  public CharArrayIterator clone() {
    try {
      return (CharArrayIterator) super.clone();
    } catch (CloneNotSupportedException e) {
      // CharacterIterator does not allow you to throw CloneNotSupported
      throw new RuntimeException(e);
    }
  }

  /**
   * Create a new CharArrayIterator that works around JRE bugs in a manner suitable for {@link
   * BreakIterator#getSentenceInstance()}
   *
   * @return an iterator that substitutes surrogates when {@link #HAS_BUGGY_BREAKITERATORS} is
   *     set, and otherwise reports characters unchanged
   */
  public static CharArrayIterator newSentenceInstance() {
    if (HAS_BUGGY_BREAKITERATORS) {
      // work around this for now by lying about all surrogates to
      // the sentence tokenizer, instead we treat them all as
      // SContinue so we won't break around them.
      return new SurrogateMaskingIterator((char) 0x002C);
    }
    return new PassThroughIterator();
  }

  /**
   * Create a new CharArrayIterator that works around JRE bugs in a manner suitable for {@link
   * BreakIterator#getWordInstance()}
   *
   * @return an iterator that substitutes surrogates when {@link #HAS_BUGGY_BREAKITERATORS} is
   *     set, and otherwise reports characters unchanged
   */
  public static CharArrayIterator newWordInstance() {
    if (HAS_BUGGY_BREAKITERATORS) {
      // work around this for now by lying about all surrogates to the word,
      // instead we treat them all as ALetter so we won't break around them.
      return new SurrogateMaskingIterator((char) 0x0041);
    }
    return new PassThroughIterator();
  }

  /** Iterator for JREs without the bug: every character is reported unchanged. */
  private static final class PassThroughIterator extends CharArrayIterator {
    @Override
    protected char jreBugWorkaround(char ch) {
      return ch;
    }
  }

  /** Iterator that reports a fixed stand-in character for every surrogate code unit. */
  private static final class SurrogateMaskingIterator extends CharArrayIterator {
    /** Character reported in place of any code unit in U+D800..U+DFFF. */
    private final char replacement;

    SurrogateMaskingIterator(char replacement) {
      this.replacement = replacement;
    }

    @Override
    protected char jreBugWorkaround(char ch) {
      // Character.isSurrogate covers exactly the range 0xD800..0xDFFF.
      return Character.isSurrogate(ch) ? replacement : ch;
    }
  }

  /** True if this JRE has a buggy BreakIterator implementation */
  public static final boolean HAS_BUGGY_BREAKITERATORS;

  static {
    // Probe once at class-load time: feed a surrogate pair to a sentence BreakIterator and see
    // whether advancing over it throws.
    boolean buggy;
    try {
      BreakIterator probe = BreakIterator.getSentenceInstance(Locale.US);
      probe.setText("\udb40\udc53");
      probe.next();
      buggy = false;
    } catch (Exception ignored) {
      // Deliberately swallowed: the exception itself is the signal that the JRE is affected.
      buggy = true;
    }
    HAS_BUGGY_BREAKITERATORS = buggy;
  }
}