blob: 4dbe7a7abc8321f4fa9f65e94c6a4b3931fe6d42 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.util.Objects;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.charfilter.BaseCharFilter;
* Normalize token text with ICU's {@link Normalizer2}.
public final class ICUNormalizer2CharFilter extends BaseCharFilter {
private final Normalizer2 normalizer;
private final StringBuilder inputBuffer = new StringBuilder();
private final StringBuilder resultBuffer = new StringBuilder();
private boolean inputFinished;
private boolean afterQuickCheckYes;
private int checkedInputBoundary;
private int charCount;
* Create a new Normalizer2CharFilter that combines NFKC normalization, Case
* Folding, and removes Default Ignorables (NFKC_Casefold)
public ICUNormalizer2CharFilter(Reader in) {
this(in, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
* Create a new Normalizer2CharFilter with the specified Normalizer2
* @param in text
* @param normalizer normalizer to use
public ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer) {
this(in, normalizer, 128);
// for testing ONLY
ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
this.normalizer = Objects.requireNonNull(normalizer);
this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
public int read(char[] cbuf, int off, int len) throws IOException {
if (off < 0) throw new IllegalArgumentException("off < 0");
if (off >= cbuf.length) throw new IllegalArgumentException("off >= cbuf.length");
if (len <= 0) throw new IllegalArgumentException("len <= 0");
while (!inputFinished || inputBuffer.length() > 0 || resultBuffer.length() > 0) {
int retLen;
if (resultBuffer.length() > 0) {
retLen = outputFromResultBuffer(cbuf, off, len);
if (retLen > 0) {
return retLen;
int resLen = readAndNormalizeFromInput();
if (resLen > 0) {
retLen = outputFromResultBuffer(cbuf, off, len);
if (retLen > 0) {
return retLen;
return -1;
private final CharacterUtils.CharacterBuffer tmpBuffer;
private void readInputToBuffer() throws IOException {
// CharacterUtils.fill is supplementary char aware
if (!CharacterUtils.fill(tmpBuffer, input)) {
inputFinished = true;
assert tmpBuffer.getOffset() == 0;
inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());
// if checkedInputBoundary was at the end of a buffer, we need to check that char again
checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
private int readAndNormalizeFromInput() {
if (inputBuffer.length() <= 0) {
afterQuickCheckYes = false;
return 0;
if (!afterQuickCheckYes) {
int resLen = readFromInputWhileSpanQuickCheckYes();
if (resLen > 0) return resLen;
int resLen = readFromIoNormalizeUptoBoundary();
if(resLen > 0){
afterQuickCheckYes = false;
return resLen;
private int readFromInputWhileSpanQuickCheckYes() {
afterQuickCheckYes = true;
int end = normalizer.spanQuickCheckYes(inputBuffer);
if (end > 0) {
int cp;
if (end == inputBuffer.length()
&& !normalizer.hasBoundaryAfter(cp = inputBuffer.codePointBefore(end))) {
our quickCheckYes result is valid thru the end of current buffer, but we need to back off
because we're not at a normalization boundary. At a minimum this is relevant wrt imposing
canonical ordering of combining characters across the buffer boundary.
afterQuickCheckYes = false;
end -= Character.charCount(cp);
// NOTE: for the loop, we pivot to using `hasBoundaryBefore()` because per the docs for
// `Normalizer2.hasBoundaryAfter()`:
// "Note that this operation may be significantly slower than hasBoundaryBefore()"
while (end > 0 && !normalizer.hasBoundaryBefore(cp)) {
cp = inputBuffer.codePointBefore(end);
end -= Character.charCount(cp);
if (end == 0) {
return 0;
resultBuffer.append(inputBuffer.subSequence(0, end));
inputBuffer.delete(0, end);
checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
charCount += end;
return end;
private int readFromIoNormalizeUptoBoundary() {
// if there's no buffer to normalize, return 0
if (inputBuffer.length() <= 0) {
return 0;
boolean foundBoundary = false;
final int bufLen = inputBuffer.length();
while (checkedInputBoundary <= bufLen - 1) {
int charLen = Character.charCount(inputBuffer.codePointAt(checkedInputBoundary));
checkedInputBoundary += charLen;
if (checkedInputBoundary < bufLen && normalizer.hasBoundaryBefore(inputBuffer
.codePointAt(checkedInputBoundary))) {
foundBoundary = true;
if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished) {
foundBoundary = true;
checkedInputBoundary = bufLen;
if (!foundBoundary) {
return 0;
return normalizeInputUpto(checkedInputBoundary);
private int normalizeInputUpto(final int length) {
final int destOrigLen = resultBuffer.length();
inputBuffer.subSequence(0, length));
inputBuffer.delete(0, length);
checkedInputBoundary = Math.max(checkedInputBoundary - length, 0);
final int resultLength = resultBuffer.length() - destOrigLen;
recordOffsetDiff(length, resultLength);
return resultLength;
private void recordOffsetDiff(int inputLength, int outputLength) {
if (inputLength == outputLength) {
charCount += outputLength;
final int diff = inputLength - outputLength;
final int cumuDiff = getLastCumulativeDiff();
if (diff < 0) {
for (int i = 1; i <= -diff; ++i) {
addOffCorrectMap(charCount + i, cumuDiff - i);
} else {
addOffCorrectMap(charCount + outputLength, cumuDiff + diff);
charCount += outputLength;
private int outputFromResultBuffer(char[] cbuf, int begin, int len) {
len = Math.min(resultBuffer.length(), len);
resultBuffer.getChars(0, len, cbuf, begin);
if (len > 0) {
resultBuffer.delete(0, len);
return len;