lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/ICUNormalizer2CharFilter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.icu;

 import com.ibm.icu.text.Normalizer2;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.Objects;
 import org.apache.lucene.analysis.CharacterUtils;
 import org.apache.lucene.analysis.charfilter.BaseCharFilter;

 /**
  * Char filter that runs the wrapped character stream through an ICU {@link Normalizer2}.
  *
  * <p>Characters are pulled from the wrapped reader in chunks, normalized piece by piece, and staged
  * in an internal result buffer from which {@link #read(char[], int, int)} is served. Whenever a
  * normalized piece has a different length than its source, an offset correction is registered
  * through {@code addOffCorrectMap}.
  */
 public final class ICUNormalizer2CharFilter extends BaseCharFilter {

   /** Normalizer applied to the input. */
   private final Normalizer2 normalizer;

   /** Scratch buffer each chunk of the wrapped reader is read into. */
   private final CharacterUtils.CharacterBuffer tmpBuffer;

   /** Characters read from the wrapped reader that have not been moved to the result yet. */
   private final StringBuilder inputBuffer = new StringBuilder();

   /** Output characters waiting to be copied to the caller of {@code read}. */
   private final StringBuilder resultBuffer = new StringBuilder();

   /** Set once the wrapped reader reported that no more characters remain. */
   private boolean inputFinished;

   /** True while the quick-check pass has already been tried on the current buffer head. */
   private boolean afterQuickCheckYes;

   /** Index into {@link #inputBuffer} up to which the boundary scan has advanced. */
   private int checkedInputBoundary;

   /** Number of output characters produced so far; base position for offset corrections. */
   private int charCount;

   /**
    * Create a new Normalizer2CharFilter that combines NFKC normalization, Case Folding, and removes
    * Default Ignorables (NFKC_Casefold)
    *
    * @param in text
    */
   public ICUNormalizer2CharFilter(Reader in) {
     this(in, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
   }

   /**
    * Create a new Normalizer2CharFilter with the specified Normalizer2, reading the input in chunks
    * of 128 chars.
    *
    * @param in text
    * @param normalizer normalizer to use
    */
   public ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer) {
     this(in, normalizer, 128);
   }

   // for testing ONLY: lets the caller pick the size of the chunk buffer
   ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
     super(in);
     this.normalizer = Objects.requireNonNull(normalizer);
     this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
   }

   /**
    * Copies up to {@code len} normalized characters into {@code cbuf}, starting at {@code off}.
    *
    * @return the number of characters copied, or {@code -1} once the input is finished and both
    *     internal buffers are empty
    * @throws IllegalArgumentException if {@code off < 0}, {@code off >= cbuf.length} or {@code len
    *     <= 0}
    */
   @Override
   public int read(char[] cbuf, int off, int len) throws IOException {
     if (off < 0) {
       throw new IllegalArgumentException("off < 0");
     }
     if (off >= cbuf.length) {
       throw new IllegalArgumentException("off >= cbuf.length");
     }
     if (len <= 0) {
       throw new IllegalArgumentException("len <= 0");
     }

     while (hasPendingWork()) {
       // 1) hand out whatever is already normalized
       if (resultBuffer.length() > 0) {
         final int delivered = outputFromResultBuffer(cbuf, off, len);
         if (delivered > 0) {
           return delivered;
         }
       }

       // 2) otherwise try to turn buffered input into output
       if (readAndNormalizeFromInput() > 0) {
         final int delivered = outputFromResultBuffer(cbuf, off, len);
         if (delivered > 0) {
           return delivered;
         }
       }

       // 3) nothing could be produced: pull more characters from the wrapped reader
       readInputToBuffer();
     }

     return -1;
   }

   /** Tells whether more input may arrive or buffered characters are still waiting. */
   private boolean hasPendingWork() {
     return !(inputFinished && inputBuffer.length() == 0 && resultBuffer.length() == 0);
   }

   /**
    * Appends chunks from the wrapped reader to {@link #inputBuffer} until either the reader has no
    * remaining characters or a chunk ends with a code point the normalizer reports as inert.
    */
   private void readInputToBuffer() throws IOException {
     boolean needAnotherChunk;
     do {
       // CharacterUtils.fill is supplementary char aware
       final boolean moreAvailable = CharacterUtils.fill(tmpBuffer, input);

       assert tmpBuffer.getOffset() == 0;
       final char[] chunk = tmpBuffer.getBuffer();
       final int chunkLength = tmpBuffer.getLength();
       inputBuffer.append(chunk, 0, chunkLength);

       if (moreAvailable) {
         // we require an inert char so that we can normalize content before and
         // after this character independently
         final int trailingCodePoint = Character.codePointBefore(chunk, chunkLength, 0);
         needAnotherChunk = !normalizer.isInert(trailingCodePoint);
       } else {
         inputFinished = true;
         needAnotherChunk = false;
       }
     } while (needAnotherChunk);

     // if checkedInputBoundary was at the end of a buffer, we need to check that char again
     checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
   }

   /**
    * Moves the next piece of {@link #inputBuffer} into {@link #resultBuffer}: first by trying the
    * quick-check pass-through, then by normalizing up to the next boundary.
    *
    * @return the number of chars reported by the step that produced output, or 0 if none did
    */
   private int readAndNormalizeFromInput() {
     if (inputBuffer.length() <= 0) {
       afterQuickCheckYes = false;
       return 0;
     }

     if (afterQuickCheckYes == false) {
       final int passedThrough = readFromInputWhileSpanQuickCheckYes();
       afterQuickCheckYes = true;
       if (passedThrough > 0) {
         return passedThrough;
       }
     }

     final int normalized = readFromIoNormalizeUptoBoundary();
     if (normalized > 0) {
       // output was produced, so the quick check may be tried again on the new buffer head
       afterQuickCheckYes = false;
     }
     return normalized;
   }

   /**
    * Copies the leading span of {@link #inputBuffer} reported by {@code
    * normalizer.spanQuickCheckYes} to the result unchanged and removes it from the input.
    *
    * @return the length of that span (the value returned by {@code spanQuickCheckYes})
    */
   private int readFromInputWhileSpanQuickCheckYes() {
     final int spanEnd = normalizer.spanQuickCheckYes(inputBuffer);
     if (spanEnd <= 0) {
       return spanEnd;
     }
     final CharSequence untouched = inputBuffer.subSequence(0, spanEnd);
     resultBuffer.append(untouched);
     inputBuffer.delete(0, spanEnd);
     checkedInputBoundary = Math.max(checkedInputBoundary - spanEnd, 0);
     charCount += spanEnd;
     return spanEnd;
   }

   /**
    * Advances {@link #checkedInputBoundary} code point by code point until the normalizer reports a
    * boundary before the following code point, then normalizes everything up to that position. If
    * the scan reaches the end of the buffer and the input is finished, the whole buffer is
    * normalized.
    *
    * @return the number of chars added to the result, or 0 if no boundary is known yet
    */
   private int readFromIoNormalizeUptoBoundary() {
     final int available = inputBuffer.length();
     // if there's no buffer to normalize, return 0
     if (available <= 0) {
       return 0;
     }

     while (checkedInputBoundary < available) {
       checkedInputBoundary += Character.charCount(inputBuffer.codePointAt(checkedInputBoundary));
       if (checkedInputBoundary < available
           && normalizer.hasBoundaryBefore(inputBuffer.codePointAt(checkedInputBoundary))) {
         return normalizeInputUpto(checkedInputBoundary);
       }
     }

     // no boundary inside the buffer; the end of the input counts as one
     if (checkedInputBoundary >= available && inputFinished) {
       checkedInputBoundary = available;
       return normalizeInputUpto(available);
     }

     return 0;
   }

   /**
    * Normalizes the first {@code length} chars of {@link #inputBuffer} onto the end of {@link
    * #resultBuffer}, drops them from the input and records the resulting offset difference.
    *
    * @return the growth of the result buffer caused by this call
    */
   private int normalizeInputUpto(final int length) {
     final int lengthBefore = resultBuffer.length();
     final CharSequence segment = inputBuffer.subSequence(0, length);
     normalizer.normalizeSecondAndAppend(resultBuffer, segment);
     final int produced = resultBuffer.length() - lengthBefore;

     inputBuffer.delete(0, length);
     checkedInputBoundary = Math.max(checkedInputBoundary - length, 0);

     recordOffsetDiff(length, produced);
     return produced;
   }

   /**
    * Registers offset corrections for a piece whose normalized length differs from its source
    * length, and advances {@link #charCount} by the output length.
    */
   private void recordOffsetDiff(int inputLength, int outputLength) {
     final int diff = inputLength - outputLength;
     if (diff != 0) {
       final int cumuDiff = getLastCumulativeDiff();
       if (diff > 0) {
         // output is shorter: one correction at the end of the piece
         addOffCorrectMap(charCount + outputLength, cumuDiff + diff);
       } else {
         // output is longer: one correction per additional char
         final int extraChars = -diff;
         for (int step = 1; step <= extraChars; step++) {
           addOffCorrectMap(charCount + step, cumuDiff - step);
         }
       }
     }
     charCount += outputLength;
   }

   /**
    * Moves up to {@code len} chars from the head of {@link #resultBuffer} into {@code cbuf} at
    * {@code begin}.
    *
    * @return the number of chars moved
    */
   private int outputFromResultBuffer(char[] cbuf, int begin, int len) {
     final int count = Math.min(resultBuffer.length(), len);
     resultBuffer.getChars(0, count, cbuf, begin);
     if (count > 0) {
       resultBuffer.delete(0, count);
     }
     return count;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.icu;

	import com.ibm.icu.text.Normalizer2;
	import java.io.IOException;
	import java.io.Reader;
	import java.util.Objects;
	import org.apache.lucene.analysis.CharacterUtils;
	import org.apache.lucene.analysis.charfilter.BaseCharFilter;

	/** Normalize token text with ICU's {@link Normalizer2}. */
	public final class ICUNormalizer2CharFilter extends BaseCharFilter {

	private final Normalizer2 normalizer;
	private final StringBuilder inputBuffer = new StringBuilder();
	private final StringBuilder resultBuffer = new StringBuilder();

	private boolean inputFinished;
	private boolean afterQuickCheckYes;
	private int checkedInputBoundary;
	private int charCount;

	/**
	* Create a new Normalizer2CharFilter that combines NFKC normalization, Case Folding, and removes
	* Default Ignorables (NFKC_Casefold)
	*/
	public ICUNormalizer2CharFilter(Reader in) {
	this(in, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
	}

	/**
	* Create a new Normalizer2CharFilter with the specified Normalizer2
	*
	* @param in text
	* @param normalizer normalizer to use
	*/
	public ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer) {
	this(in, normalizer, 128);
	}

	// for testing ONLY
	ICUNormalizer2CharFilter(Reader in, Normalizer2 normalizer, int bufferSize) {
	super(in);
	this.normalizer = Objects.requireNonNull(normalizer);
	this.tmpBuffer = CharacterUtils.newCharacterBuffer(bufferSize);
	}

	@Override
	public int read(char[] cbuf, int off, int len) throws IOException {
	if (off < 0) throw new IllegalArgumentException("off < 0");
	if (off >= cbuf.length) throw new IllegalArgumentException("off >= cbuf.length");
	if (len <= 0) throw new IllegalArgumentException("len <= 0");

	while (!inputFinished \|\| inputBuffer.length() > 0 \|\| resultBuffer.length() > 0) {
	int retLen;

	if (resultBuffer.length() > 0) {
	retLen = outputFromResultBuffer(cbuf, off, len);
	if (retLen > 0) {
	return retLen;
	}
	}

	int resLen = readAndNormalizeFromInput();
	if (resLen > 0) {
	retLen = outputFromResultBuffer(cbuf, off, len);
	if (retLen > 0) {
	return retLen;
	}
	}

	readInputToBuffer();
	}

	return -1;
	}

	private final CharacterUtils.CharacterBuffer tmpBuffer;

	private void readInputToBuffer() throws IOException {
	while (true) {
	// CharacterUtils.fill is supplementary char aware
	final boolean hasRemainingChars = CharacterUtils.fill(tmpBuffer, input);

	assert tmpBuffer.getOffset() == 0;
	inputBuffer.append(tmpBuffer.getBuffer(), 0, tmpBuffer.getLength());

	if (hasRemainingChars == false) {
	inputFinished = true;
	break;
	}

	final int lastCodePoint =
	Character.codePointBefore(tmpBuffer.getBuffer(), tmpBuffer.getLength(), 0);
	if (normalizer.isInert(lastCodePoint)) {
	// we require an inert char so that we can normalize content before and
	// after this character independently
	break;
	}
	}

	// if checkedInputBoundary was at the end of a buffer, we need to check that char again
	checkedInputBoundary = Math.max(checkedInputBoundary - 1, 0);
	}

	private int readAndNormalizeFromInput() {
	if (inputBuffer.length() <= 0) {
	afterQuickCheckYes = false;
	return 0;
	}
	if (!afterQuickCheckYes) {
	int resLen = readFromInputWhileSpanQuickCheckYes();
	afterQuickCheckYes = true;
	if (resLen > 0) return resLen;
	}
	int resLen = readFromIoNormalizeUptoBoundary();
	if (resLen > 0) {
	afterQuickCheckYes = false;
	}
	return resLen;
	}

	private int readFromInputWhileSpanQuickCheckYes() {
	int end = normalizer.spanQuickCheckYes(inputBuffer);
	if (end > 0) {
	resultBuffer.append(inputBuffer.subSequence(0, end));
	inputBuffer.delete(0, end);
	checkedInputBoundary = Math.max(checkedInputBoundary - end, 0);
	charCount += end;
	}
	return end;
	}

	private int readFromIoNormalizeUptoBoundary() {
	// if there's no buffer to normalize, return 0
	if (inputBuffer.length() <= 0) {
	return 0;
	}

	boolean foundBoundary = false;
	final int bufLen = inputBuffer.length();

	while (checkedInputBoundary <= bufLen - 1) {
	int charLen = Character.charCount(inputBuffer.codePointAt(checkedInputBoundary));
	checkedInputBoundary += charLen;
	if (checkedInputBoundary < bufLen
	&& normalizer.hasBoundaryBefore(inputBuffer.codePointAt(checkedInputBoundary))) {
	foundBoundary = true;
	break;
	}
	}
	if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished) {
	foundBoundary = true;
	checkedInputBoundary = bufLen;
	}

	if (!foundBoundary) {
	return 0;
	}

	return normalizeInputUpto(checkedInputBoundary);
	}

	private int normalizeInputUpto(final int length) {
	final int destOrigLen = resultBuffer.length();
	normalizer.normalizeSecondAndAppend(resultBuffer, inputBuffer.subSequence(0, length));
	inputBuffer.delete(0, length);
	checkedInputBoundary = Math.max(checkedInputBoundary - length, 0);
	final int resultLength = resultBuffer.length() - destOrigLen;
	recordOffsetDiff(length, resultLength);
	return resultLength;
	}

	private void recordOffsetDiff(int inputLength, int outputLength) {
	if (inputLength == outputLength) {
	charCount += outputLength;
	return;
	}
	final int diff = inputLength - outputLength;
	final int cumuDiff = getLastCumulativeDiff();
	if (diff < 0) {
	for (int i = 1; i <= -diff; ++i) {
	addOffCorrectMap(charCount + i, cumuDiff - i);
	}
	} else {
	addOffCorrectMap(charCount + outputLength, cumuDiff + diff);
	}
	charCount += outputLength;
	}

	private int outputFromResultBuffer(char[] cbuf, int begin, int len) {
	len = Math.min(resultBuffer.length(), len);
	resultBuffer.getChars(0, len, cbuf, begin);
	if (len > 0) {
	resultBuffer.delete(0, len);
	}
	return len;
	}
	}