lucene/analysis/common/src/java/org/apache/lucene/analysis/bn/BengaliNormalizer.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.bn;


 import static org.apache.lucene.analysis.util.StemmerUtil.delete;

 /**
  * Normalizer for Bengali.
  * <p>
  * Implements the Bengali-language specific algorithm specified in:
  * <i>A Double Metaphone encoding for Bangla and its application in spelling checker</i>
  * Naushad UzZaman and Mumit Khan.
  * http://www.panl10n.net/english/final%20reports/pdf%20files/Bangladesh/BAN16.pdf
  * </p>
  */
 public class BengaliNormalizer {
   /** Hoshonto (virama, U+09CD): the sign that joins consonants into a conjunct. */
   private static final char HOSHONTO = '\u09CD';

   /**
    * Normalize an input buffer of Bengali text.
    * <p>
    * The buffer is rewritten in place: some characters are replaced by a
    * canonical equivalent and others are removed through {@code delete}, so the
    * returned length can be smaller than {@code len}. Only the first
    * {@code len} characters of {@code s} are examined.
    * </p>
    *
    * @param s   input buffer
    * @param len length of input buffer
    * @return length of input buffer after normalization
    */
   public int normalize(char[] s, int len) {

     for (int i = 0; i < len; i++) {
       switch (s[i]) {
         // delete Chandrabindu (U+0981)
         case '\u0981':
           len = delete(s, i, len);
           // step back so the character now sitting at position i is examined next
           i--;
           break;

         // DirghoI kar (U+09C0) -> RosshoI kar (U+09BF)
         case '\u09C0':
           s[i] = '\u09BF';
           break;

         // DirghoU kar (U+09C2) -> RosshoU kar (U+09C1)
         case '\u09C2':
           s[i] = '\u09C1';
           break;

         // Khio (Ka + Hoshonto + Murdorno Sh)
         case '\u0995':
           // The third character must be Murdorno Sh (U+09B7). U+09BF is the
           // vowel sign I, which never follows a Hoshonto in this conjunct.
           if (i + 2 < len && s[i + 1] == HOSHONTO && s[i + 2] == '\u09B7') {
             if (i == 0) {
               // at the start of the buffer the whole conjunct becomes Kha
               s[i] = '\u0996';
               len = delete(s, i + 2, len);
               len = delete(s, i + 1, len);
             } else {
               // elsewhere Ka is kept and the rest of the conjunct becomes Kha
               s[i + 1] = '\u0996';
               len = delete(s, i + 2, len);
             }
           }
           break;

         // Nga (U+0999) to Anusvara (U+0982)
         case '\u0999':
           s[i] = '\u0982';
           break;

         // Ja Phala (Hoshonto + Ya)
         case '\u09AF':
           if (i == 2 && s[i - 1] == HOSHONTO) {
             // on the first consonant: the Hoshonto becomes E kar (U+09C7)
             s[i - 1] = '\u09C7';

             // a following Aa kar (U+09BE) is dropped as well
             if (i + 1 < len && s[i + 1] == '\u09BE') {
               len = delete(s, i + 1, len);
             }
             len = delete(s, i, len);
             i--;
           } else if (i >= 1 && s[i - 1] == HOSHONTO) {
             // anywhere else: drop both the Ya and the Hoshonto before it
             len = delete(s, i, len);
             len = delete(s, i - 1, len);
             i -= 2;
           }
           break;

         // Ba Phalaa (Hoshonto + Ba)
         case '\u09AC':
           // a Ba that does not follow a Hoshonto is an ordinary consonant
           if (i == 0 || s[i - 1] != HOSHONTO) {
             break;
           }
           if (i == 2) {
             // on the first consonant: drop the Ba and the Hoshonto
             len = delete(s, i, len);
             len = delete(s, i - 1, len);
             i -= 2;
           } else if (i >= 5 && s[i - 3] == HOSHONTO) {
             // the consonant before it is itself joined by a Hoshonto: drop the Ba Phalaa
             len = delete(s, i, len);
             len = delete(s, i - 1, len);
             i -= 2;
           } else if (i >= 2) {
             // otherwise the preceding consonant is doubled in place of the Hoshonto
             s[i - 1] = s[i - 2];
             len = delete(s, i, len);
             i--;
           }
           break;

         // Visarga (U+0983)
         case '\u0983':
           if (i == len - 1) {
             if (len <= 3) {
               // final Visarga in a buffer of at most three characters becomes Ha (U+09B9)
               s[i] = '\u09B9';
             } else {
               len = delete(s, i, len);
             }
           } else {
             // inside the buffer it is replaced by a copy of the next character
             s[i] = s[i + 1];
           }
           break;

         // All sh: Sha (U+09B6) and Murdorno Sh (U+09B7) -> Sa (U+09B8)
         case '\u09B6':
         case '\u09B7':
           s[i] = '\u09B8';
           break;

         // check na: Nna (U+09A3) -> Na (U+09A8)
         case '\u09A3':
           s[i] = '\u09A8';
           break;

         // check ra: Rra (U+09DC) and Rha (U+09DD) -> Ra (U+09B0)
         case '\u09DC':
         case '\u09DD':
           s[i] = '\u09B0';
           break;

         // Khanda Ta (U+09CE) -> Ta (U+09A4)
         case '\u09CE':
           s[i] = '\u09A4';
           break;

         default:
           break;
       }
     }

     return len;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.bn;


	import static org.apache.lucene.analysis.util.StemmerUtil.delete;

	/**
	* Normalizer for Bengali.
	* <p>
	* Implements the Bengali-language specific algorithm specified in:
	* <i>A Double Metaphone encoding for Bangla and its application in spelling checker</i>
	* Naushad UzZaman and Mumit Khan.
	* http://www.panl10n.net/english/final%20reports/pdf%20files/Bangladesh/BAN16.pdf
	* </p>
	*/
	public class BengaliNormalizer {
	/**
	* Normalize an input buffer of Bengali text
	*
	* @param s input buffer
	* @param len length of input buffer
	* @return length of input buffer after normalization
	*/
	public int normalize(char s[], int len) {

	for (int i = 0; i < len; i++) {
	switch (s[i]) {
	// delete Chandrabindu
	case '\u0981':
	len = delete(s, i, len);
	i--;
	break;

	// DirghoI kar -> RosshoI kar
	case '\u09C0':
	s[i] = '\u09BF';
	break;

	// DirghoU kar -> RosshoU kar
	case '\u09C2':
	s[i] = '\u09C1';
	break;

	// Khio (Ka + Hoshonto + Murdorno Sh)
	case '\u0995':
	if(i + 2 < len && s[i+1] == '\u09CD' && s[i+2] == '\u09BF') {
	if (i == 0) {
	s[i] = '\u0996';
	len = delete(s, i + 2, len);
	len = delete(s, i + 1, len);
	} else {
	s[i+1] = '\u0996';
	len = delete(s, i + 2, len);
	}
	}
	break;

	// Nga to Anusvara
	case '\u0999':
	s[i] = '\u0982';
	break;

	// Ja Phala
	case '\u09AF':
	if(i - 2 == 0 && s[i-1] == '\u09CD') {
	s[i - 1] = '\u09C7';

	if(i + 1 < len && s[i+1] == '\u09BE') {
	len = delete(s, i+1, len);
	}
	len = delete(s, i, len);
	i --;
	} else if(i - 1 >= 0 && s[i-1] == '\u09CD' ){
	len = delete(s, i, len);
	len = delete(s, i-1, len);
	i -=2;
	}
	break;

	// Ba Phalaa
	case '\u09AC':
	if((i >= 1 && s[i-1] != '\u09CD') \|\| i == 0)
	break;
	if(i - 2 == 0) {
	len = delete(s, i, len);
	len = delete(s, i - 1, len);
	i -= 2;
	} else if(i - 5 >= 0 && s[i - 3] == '\u09CD') {
	len = delete(s, i, len);
	len = delete(s, i-1, len);
	i -=2;
	} else if(i - 2 >= 0){
	s[i - 1] = s[i - 2];
	len = delete(s, i, len);
	i --;
	}
	break;

	// Visarga
	case '\u0983':
	if(i == len -1) {
	if(len <= 3) {
	s[i] = '\u09B9';
	} else {
	len = delete(s, i, len);
	}
	} else {
	s[i] = s[i+1];
	}
	break;

	//All sh
	case '\u09B6':
	case '\u09B7':
	s[i] = '\u09B8';
	break;

	//check na
	case '\u09A3':
	s[i] = '\u09A8';
	break;

	//check ra
	case '\u09DC':
	case '\u09DD':
	s[i] = '\u09B0';
	break;

	case '\u09CE':
	s[i] = '\u09A4';
	break;

	default:
	break;
	}
	}

	return len;
	}
	}