| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.icu; |
| |
| import com.ibm.icu.text.Normalizer2; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.util.Objects; |
| import org.apache.lucene.analysis.CharacterUtils; |
| import org.apache.lucene.analysis.charfilter.BaseCharFilter; |
| |
| /** Normalize token text with ICU's {@link Normalizer2}. */ |
| public final class ICUNormalizer2CharFilter extends BaseCharFilter { |
| |
| private final Normalizer2 normalizer; |
| private final StringBuilder inputBuffer = new StringBuilder(); |
| private final StringBuilder resultBuffer = new StringBuilder(); |
| |
| private boolean inputFinished; |
| private boolean afterQuickCheckYes; |
| private int checkedInputBoundary; |
| private int charCount; |
| |
| /** |
| * Create a new Normalizer2CharFilter that combines NFKC normalization, Case Folding, and removes |
| * Default Ignorables (NFKC_Casefold) |
| */ |
| public ICUNormalizer2CharFilter(Reader in) { |
| this(in, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)); |
| } |
| |
| /** |
| * Create a new Normalizer2CharFilter with the specified Normalizer2 |
| * |
| * @param in text |
| * @param normalizer normalizer to use |
| */ |
| public ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer) { |
| this(in, normalizer, 128); |
| } |
| |
| // for testing ONLY |
| ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) { |
| super(in); |
| this.normalizer = Objects.requireNonNull(normalizer); |
| this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize); |
| } |
| |
| @Override |
| public int read(char[] cbuf, int off, int len) throws IOException { |
| if (off < 0) throw new IllegalArgumentException("off < 0"); |
| if (off >= cbuf.length) throw new IllegalArgumentException("off >= cbuf.length"); |
| if (len <= 0) throw new IllegalArgumentException("len <= 0"); |
| |
| while (!inputFinished || inputBuffer.length() > 0 || resultBuffer.length() > 0) { |
| int retLen; |
| |
| if (resultBuffer.length() > 0) { |
| retLen = outputFromResultBuffer(cbuf, off, len); |
| if (retLen > 0) { |
| return retLen; |
| } |
| } |
| |
| int resLen = readAndNormalizeFromInput(); |
| if (resLen > 0) { |
| retLen = outputFromResultBuffer(cbuf, off, len); |
| if (retLen > 0) { |
| return retLen; |
| } |
| } |
| |
| readInputToBuffer(); |
| } |
| |
| return -1; |
| } |
| |
| private final CharacterUtils.CharacterBuffer tmpBuffer; |
| |
| private void readInputToBuffer() throws IOException { |
| while (true) { |
| // CharacterUtils.fill is supplementary char aware |
| final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input); |
| |
| assert tmpBuffer.getOffset() == 0; |
| inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength()); |
| |
| if (hasRemainingChars == false) { |
| inputFinished = true; |
| break; |
| } |
| |
| final int lastCodePoint = |
| Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength(), 0); |
| if (normalizer.isInert(lastCodePoint)) { |
| // we require an inert char so that we can normalize content before and |
| // after this character independently |
| break; |
| } |
| } |
| |
| // if checkedInputBoundary was at the end of a buffer, we need to check that char again |
| checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0); |
| } |
| |
| private int readAndNormalizeFromInput() { |
| if (inputBuffer.length() <= 0) { |
| afterQuickCheckYes = false; |
| return 0; |
| } |
| if (!afterQuickCheckYes) { |
| int resLen = readFromInputWhileSpanQuickCheckYes(); |
| afterQuickCheckYes = true; |
| if (resLen > 0) return resLen; |
| } |
| int resLen = readFromIoNormalizeUptoBoundary(); |
| if (resLen > 0) { |
| afterQuickCheckYes = false; |
| } |
| return resLen; |
| } |
| |
| private int readFromInputWhileSpanQuickCheckYes() { |
| int end = normalizer.spanQuickCheckYes(inputBuffer); |
| if (end > 0) { |
| resultBuffer.append(inputBuffer.subSequence(0, end)); |
| inputBuffer.delete(0, end); |
| checkedInputBoundary = Math.max(checkedInputBoundary - end, 0); |
| charCount += end; |
| } |
| return end; |
| } |
| |
| private int readFromIoNormalizeUptoBoundary() { |
| // if there's no buffer to normalize, return 0 |
| if (inputBuffer.length() <= 0) { |
| return 0; |
| } |
| |
| boolean foundBoundary = false; |
| final int bufLen = inputBuffer.length(); |
| |
| while (checkedInputBoundary <= bufLen - 1) { |
| int charLen = Character.charCount(inputBuffer.codePointAt(checkedInputBoundary)); |
| checkedInputBoundary += charLen; |
| if (checkedInputBoundary < bufLen |
| && normalizer.hasBoundaryBefore(inputBuffer.codePointAt(checkedInputBoundary))) { |
| foundBoundary = true; |
| break; |
| } |
| } |
| if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished) { |
| foundBoundary = true; |
| checkedInputBoundary = bufLen; |
| } |
| |
| if (!foundBoundary) { |
| return 0; |
| } |
| |
| return normalizeInputUpto(checkedInputBoundary); |
| } |
| |
| private int normalizeInputUpto(final int length) { |
| final int destOrigLen = resultBuffer.length(); |
| normalizer.normalizeSecondAndAppend(resultBuffer, inputBuffer.subSequence(0, length)); |
| inputBuffer.delete(0, length); |
| checkedInputBoundary = Math.max(checkedInputBoundary - length, 0); |
| final int resultLength = resultBuffer.length() - destOrigLen; |
| recordOffsetDiff(length, resultLength); |
| return resultLength; |
| } |
| |
| private void recordOffsetDiff(int inputLength, int outputLength) { |
| if (inputLength == outputLength) { |
| charCount += outputLength; |
| return; |
| } |
| final int diff = inputLength - outputLength; |
| final int cumuDiff = getLastCumulativeDiff(); |
| if (diff < 0) { |
| for (int i = 1; i <= -diff; ++i) { |
| addOffCorrectMap(charCount + i, cumuDiff - i); |
| } |
| } else { |
| addOffCorrectMap(charCount + outputLength, cumuDiff + diff); |
| } |
| charCount += outputLength; |
| } |
| |
| private int outputFromResultBuffer(char[] cbuf, int begin, int len) { |
| len = Math.min(resultBuffer.length(), len); |
| resultBuffer.getChars(0, len, cbuf, begin); |
| if (len > 0) { |
| resultBuffer.delete(0, len); |
| } |
| return len; |
| } |
| } |