docs/attachments/LUCENE-1343/UnicodeNormalizationFilter.java - lucene-jira-archive - Git at Google

 package schema;
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;
 import java.lang.reflect.Field;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;

 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;


 /**
  * Token filter that applies Unicode compatibility normalization (NFKC or
  * NFKD) to the term text of every token and can optionally post-process the
  * normalized text character by character:
  * <ul>
  *   <li>drop characters reported by <code>UnicodeCharUtil.isCombiningCharacter</code>,</li>
  *   <li>drop characters reported by <code>UnicodeCharUtil.isSpacingModifier</code>,</li>
  *   <li>replace characters with the non-zero result of
  *       <code>UnicodeCharUtil.foldNonDiacriticChar</code>.</li>
  * </ul>
  * The normalizer implementation is looked up reflectively, so neither
  * <code>com.ibm.icu.text.Normalizer</code> nor <code>java.text.Normalizer</code>
  * is a compile-time dependency of this class.
  */
 public class UnicodeNormalizationFilter extends TokenFilter
 {
     /** Static <code>normalize</code> method of the selected Normalizer class. */
     private final Method normalize;
     /** Normalization form constant handed to {@link #normalize} as second argument. */
     private final Object mode;
     /** <code>true</code> selects NFKC, <code>false</code> selects NFKD. */
     private final boolean composed;
     private final boolean removeDiacritics;
     private final boolean removeSpacingModifiers;
     private final boolean fold;

     /**
      * Creates the filter and resolves the normalizer via reflection.
      *
      * @param in            upstream token stream
      * @param icu4jVersion  <code>true</code> to use <code>com.ibm.icu.text.Normalizer</code>,
      *                      <code>false</code> to use <code>java.text.Normalizer</code>
      * @param composedForm  <code>true</code> for NFKC, <code>false</code> for NFKD
      * @param removeDia     drop characters flagged by <code>UnicodeCharUtil.isCombiningCharacter</code>
      * @param removeMod     drop characters flagged by <code>UnicodeCharUtil.isSpacingModifier</code>
      * @param fold          substitute characters using <code>UnicodeCharUtil.foldNonDiacriticChar</code>
      * @throws ClassNotFoundException    if the selected Normalizer class cannot be loaded
      * @throws NoSuchMethodException     if the expected <code>normalize</code> (or <code>valueOf</code>) method is missing
      * @throws NoSuchFieldException      if the ICU4J form constant is missing
      * @throws IllegalAccessException    if the reflective field or method access is denied
      * @throws InvocationTargetException if <code>Normalizer.Form.valueOf</code> itself throws
      */
     public UnicodeNormalizationFilter(TokenStream in, boolean icu4jVersion, boolean composedForm,
                                       boolean removeDia, boolean removeMod, boolean fold)
            throws ClassNotFoundException, SecurityException, NoSuchMethodException,
            NoSuchFieldException, IllegalArgumentException, IllegalAccessException, InvocationTargetException
     {
         super(in);
         composed = composedForm;
         removeDiacritics = removeDia;
         removeSpacingModifiers = removeMod;
         this.fold = fold;

         // Both implementations use the same names for the compatibility forms.
         final String formName = composed ? "NFKC" : "NFKD";
         if (icu4jVersion)
         {
             // ICU4J exposes the form as a public static field; its declared
             // type is also the second parameter type of normalize().
             Class<?> normalizerClass = Class.forName("com.ibm.icu.text.Normalizer");
             Field modeField = normalizerClass.getField(formName);
             mode = modeField.get(null);
             normalize = normalizerClass.getMethod("normalize", String.class, modeField.getType());
         }
         else
         {
             // The JDK exposes the form as an enum constant, resolved through valueOf().
             Class<?> normalizerClass = Class.forName("java.text.Normalizer");
             Class<?> formClass = Class.forName("java.text.Normalizer$Form");
             normalize = normalizerClass.getMethod("normalize", CharSequence.class, formClass);
             Method valueOf = formClass.getMethod("valueOf", String.class);
             mode = valueOf.invoke(null, formName);
         }
     }

     /**
      * Pulls the next token from the wrapped stream and rewrites its term text
      * with the normalized (and optionally stripped/folded) form.
      *
      * @return the next token, or <code>null</code> when the wrapped stream is exhausted
      * @throws IOException if the wrapped stream fails
      */
     public Token next() throws IOException
     {
         final Token token = input.next();
         if (token == null)
         {
             return null;
         }

         final String original = token.termText();
         String normalized;
         try
         {
             normalized = normalize.invoke(null, original, mode).toString();
             if (removeDiacritics || removeSpacingModifiers || fold)
             {
                 normalized = stripAndFold(normalized);
             }
         }
         catch (Exception e)
         {
             // Best effort: any failure while normalizing leaves the token
             // untouched instead of aborting the whole token stream.
             return token;
         }

         // Only touch the token when the text actually changed.
         if (!normalized.equals(original))
         {
             token.setTermText(normalized);
         }
         return token;
     }

     /**
      * Applies the enabled per-character rules to already normalized text:
      * removal of combining characters, removal of spacing modifiers, and folding.
      */
     private String stripAndFold(String text)
     {
         final StringBuilder out = new StringBuilder(text.length());
         for (int i = 0; i < text.length(); i++)
         {
             final char c = text.charAt(i);
             if (removeDiacritics && UnicodeCharUtil.isCombiningCharacter(c))
             {
                 continue;
             }
             if (removeSpacingModifiers && UnicodeCharUtil.isSpacingModifier(c))
             {
                 continue;
             }

             char folded;
             // A 0x00 result means no replacement is available; keep the character.
             if (fold && (folded = UnicodeCharUtil.foldNonDiacriticChar(c)) != 0x00)
             {
                 out.append(folded);
             }
             else
             {
                 out.append(c);
             }
         }
         return out.toString();
     }
 }
	package schema;
	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.IOException;
	import java.lang.reflect.Field;
	import java.lang.reflect.InvocationTargetException;
	import java.lang.reflect.Method;

	import org.apache.lucene.analysis.Token;
	import org.apache.lucene.analysis.TokenFilter;
	import org.apache.lucene.analysis.TokenStream;


	/**
	* Simple filter that runs all tokens through icu4j unicode normalizer
	* transforming them into the Unicode Composed Normalized Form.
	* Changing letters followed by Combining characters for accent marks into
	* a single character consisting of the accented form of the letter.
	*
	*/
	public class UnicodeNormalizationFilter extends TokenFilter
	{
	private Method normalize;
	private Object mode;
	private boolean composed;
	private boolean removeDiacritics;
	private boolean removeSpacingModifiers;
	private boolean fold;

	public UnicodeNormalizationFilter(TokenStream in, boolean icu4jVersion, boolean composedForm,
	boolean removeDia, boolean removeMod, boolean fold)
	throws ClassNotFoundException, SecurityException, NoSuchMethodException,
	NoSuchFieldException, IllegalArgumentException, IllegalAccessException, InvocationTargetException
	{
	super(in);
	composed = composedForm;
	removeDiacritics = removeDia;
	removeSpacingModifiers = removeMod;
	this.fold = fold;
	if (icu4jVersion)
	{
	Class cl = Class.forName("com.ibm.icu.text.Normalizer");
	Field field = cl.getField(composed ? "NFKC" : "NFKD");
	mode = field.get(null);
	normalize = cl.getMethod("normalize", String.class, field.getType());
	}
	else
	{
	Class cl = Class.forName("java.text.Normalizer");
	Class cl1 = Class.forName("java.text.Normalizer$Form");
	normalize = cl.getMethod("normalize", CharSequence.class, cl1);
	Method getMode = cl1.getMethod("valueOf", String.class);
	mode = getMode.invoke(null, composed ? "NFKC" : "NFKD");
	}
	}
	/**
	* Uses the static <i>normalize</i> method from the Normalizer class
	* to convert tokens to a standard format for searching/faceting.
	*/
	public Token next() throws IOException
	{
	final Token t = input.next();
	if (t == null) return null;

	// see Normalizer.Form enum to choose a normalization form
	String termtext = t.termText();
	String normtext;

	try
	{
	normtext = normalize.invoke(null, termtext, mode).toString();
	if (removeDiacritics \|\| removeSpacingModifiers \|\| fold)
	{
	StringBuffer newNormText = new StringBuffer();
	for (int i = 0; i < normtext.length(); i++)
	{
	char c = normtext.charAt(i);
	char foldC;
	if (removeDiacritics && UnicodeCharUtil.isCombiningCharacter(c))
	{

	}
	else if (removeSpacingModifiers && UnicodeCharUtil.isSpacingModifier(c))
	{

	}
	else if (fold && (foldC = UnicodeCharUtil.foldNonDiacriticChar(c)) != 0x00)
	{
	newNormText.append(foldC);
	}
	else
	{
	newNormText.append(c);
	}
	}
	normtext = newNormText.toString();
	}
	}
	catch (Exception e)
	{
	return(t);
	}

	if (!normtext.equals(termtext))
	{
	t.setTermText(normtext);
	}
	return(t);
	}

	}