lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.tr;


 import java.io.IOException;

 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

 /**
  * Normalizes Turkish token text to lower case.
  * <p>
  * Turkish and Azeri have unique casing behavior for some characters. This
  * filter applies Turkish lowercase rules. For more information, see <a
  * href="http://en.wikipedia.org/wiki/Turkish_dotted_and_dotless_I"
  * >http://en.wikipedia.org/wiki/Turkish_dotted_and_dotless_I</a>
  * </p>
  */
 public final class TurkishLowerCaseFilter extends TokenFilter {
   private static final int LATIN_CAPITAL_LETTER_I = '\u0049';
   private static final int LATIN_SMALL_LETTER_I = '\u0069';
   private static final int LATIN_SMALL_LETTER_DOTLESS_I = '\u0131';
   private static final int COMBINING_DOT_ABOVE = '\u0307';
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

   /**
    * Builds a filter that lower-cases the term text of {@code in} using
    * Turkish casing rules.
    *
    * @param in TokenStream to filter
    */
   public TurkishLowerCaseFilter(TokenStream in) {
     super(in);
   }

   /**
    * Pulls the next token from the wrapped stream and lower-cases its term
    * buffer in place.
    * <p>
    * A capital I becomes a dotted small i when a COMBINING DOT ABOVE follows
    * it (possibly after other non-spacing marks); that dot is then removed
    * from the buffer. Otherwise the capital I becomes a dotless small i.
    * Every other code point goes through {@link Character#toLowerCase(int)}.
    *
    * @return {@code false} when the wrapped stream has no more tokens,
    *         {@code true} otherwise
    */
   @Override
   public final boolean incrementToken() throws IOException {
     if (!input.incrementToken()) {
       return false;
     }

     final char[] chars = termAtt.buffer();
     int end = termAtt.length();
     // true while we sit on, or in the run of non-spacing marks right after,
     // a capital I that is still waiting for its combining dot
     boolean inDottedRun = false;
     int cursor = 0;

     while (cursor < end) {
       final int cp = Character.codePointAt(chars, cursor, end);

       if (cp == LATIN_CAPITAL_LETTER_I) {
         // the replacement depends on whether a combining dot follows
         inDottedRun = isBeforeDot(chars, cursor + 1, end);
         if (inDottedRun) {
           // the dot itself gets removed once the loop reaches it
           chars[cursor] = LATIN_SMALL_LETTER_I;
         } else {
           // no dot ahead: the marks that follow need no special inspection
           chars[cursor] = LATIN_SMALL_LETTER_DOTLESS_I;
         }
         cursor++;
       } else if (inDottedRun && Character.getType(cp) == Character.NON_SPACING_MARK) {
         if (cp == COMBINING_DOT_ABOVE) {
           // drop the dot to mimic the composed lowercase form; the cursor
           // stays put because the tail of the buffer has shifted left
           end = delete(chars, cursor, end);
         } else {
           cursor += Character.toChars(Character.toLowerCase(cp), chars, cursor);
         }
       } else {
         // an ordinary code point ends any run started by a capital I
         inDottedRun = false;
         cursor += Character.toChars(Character.toLowerCase(cp), chars, cursor);
       }
     }

     termAtt.setLength(end);
     return true;
   }

   /**
    * Looks ahead from {@code pos} for a COMBINING DOT ABOVE, skipping over
    * any other non-spacing marks on the way.
    *
    * @return {@code true} if the dot is reached before a code point that is
    *         not a non-spacing mark and before {@code len}
    */
   private boolean isBeforeDot(char[] s, int pos, int len) {
     int cursor = pos;
     while (cursor < len) {
       final int cp = Character.codePointAt(s, cursor, len);
       // the dot is itself a non-spacing mark, so it can be tested first
       if (cp == COMBINING_DOT_ABOVE) {
         return true;
       }
       if (Character.getType(cp) != Character.NON_SPACING_MARK) {
         return false;
       }
       cursor += Character.charCount(cp);
     }
     return false;
   }

   /**
    * Removes the single char at {@code pos} by shifting the rest of the
    * buffer one slot to the left. Only used to drop a COMBINING DOT ABOVE
    * found after a capital I.
    *
    * @return the new logical length, {@code len - 1}
    */
   private int delete(char[] s, int pos, int len) {
     if (pos < len) {
       final int tail = len - pos - 1;
       System.arraycopy(s, pos + 1, s, pos, tail);
     }
     return len - 1;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.tr;


	import java.io.IOException;

	import org.apache.lucene.analysis.TokenFilter;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

	/**
	* Normalizes Turkish token text to lower case.
	* <p>
	* Turkish and Azeri have unique casing behavior for some characters. This
	* filter applies Turkish lowercase rules. For more information, see <a
	* href="http://en.wikipedia.org/wiki/Turkish_dotted_and_dotless_I"
	* >http://en.wikipedia.org/wiki/Turkish_dotted_and_dotless_I</a>
	* </p>
	*/
	public final class TurkishLowerCaseFilter extends TokenFilter {
	private static final int LATIN_CAPITAL_LETTER_I = '\u0049';
	private static final int LATIN_SMALL_LETTER_I = '\u0069';
	private static final int LATIN_SMALL_LETTER_DOTLESS_I = '\u0131';
	private static final int COMBINING_DOT_ABOVE = '\u0307';
	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

	/**
	* Create a new TurkishLowerCaseFilter, that normalizes Turkish token text
	* to lower case.
	*
	* @param in TokenStream to filter
	*/
	public TurkishLowerCaseFilter(TokenStream in) {
	super(in);
	}

	@Override
	public final boolean incrementToken() throws IOException {
	boolean iOrAfter = false;

	if (input.incrementToken()) {
	final char[] buffer = termAtt.buffer();
	int length = termAtt.length();
	for (int i = 0; i < length;) {
	final int ch = Character.codePointAt(buffer, i, length);

	iOrAfter = (ch == LATIN_CAPITAL_LETTER_I \|\|
	(iOrAfter && Character.getType(ch) == Character.NON_SPACING_MARK));

	if (iOrAfter) { // all the special I turkish handling happens here.
	switch(ch) {
	// remove COMBINING_DOT_ABOVE to mimic composed lowercase
	case COMBINING_DOT_ABOVE:
	length = delete(buffer, i, length);
	continue;
	// i itself, it depends if it is followed by COMBINING_DOT_ABOVE
	// if it is, we will make it small i and later remove the dot
	case LATIN_CAPITAL_LETTER_I:
	if (isBeforeDot(buffer, i + 1, length)) {
	buffer[i] = LATIN_SMALL_LETTER_I;
	} else {
	buffer[i] = LATIN_SMALL_LETTER_DOTLESS_I;
	// below is an optimization. no COMBINING_DOT_ABOVE follows,
	// so don't waste time calculating Character.getType(), etc
	iOrAfter = false;
	}
	i++;
	continue;
	}
	}

	i += Character.toChars(Character.toLowerCase(ch), buffer, i);
	}

	termAtt.setLength(length);
	return true;
	} else
	return false;
	}


	/**
	* lookahead for a combining dot above.
	* other NSMs may be in between.
	*/
	private boolean isBeforeDot(char s[], int pos, int len) {
	for (int i = pos; i < len;) {
	final int ch = Character.codePointAt(s, i, len);
	if (Character.getType(ch) != Character.NON_SPACING_MARK)
	return false;
	if (ch == COMBINING_DOT_ABOVE)
	return true;
	i += Character.charCount(ch);
	}

	return false;
	}

	/**
	* delete a character in-place.
	* rarely happens, only if COMBINING_DOT_ABOVE is found after an i
	*/
	private int delete(char s[], int pos, int len) {
	if (pos < len)
	System.arraycopy(s, pos + 1, s, pos, len - pos - 1);

	return len - 1;
	}
	}