lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechStemmer.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.cz;


 import static org.apache.lucene.analysis.util.StemmerUtil.*;

 /**
  * Light Stemmer for Czech.
  * <p>
  * Implements the algorithm described in:
  * <i>
  * Indexing and stemming approaches for the Czech language
  * </i>
  * http://portal.acm.org/citation.cfm?id=1598600
  * </p>
  */
 public class CzechStemmer {

   /** A case ending is only stripped when at least this many characters remain. */
   private static final int MIN_STEM_LENGTH = 3;

   /**
    * Case endings grouped by length, longest group first. All suffixes within
    * a group have the same length, so a match anywhere in a group removes the
    * same number of characters.
    */
   private static final String[][] CASE_ENDINGS = {
     {"atech"},
     {"ětem", "etem", "atům"},
     {"ech", "ich", "ích", "ého", "ěmi", "emi", "ému", "ěte", "ete", "ěti", "eti",
      "ího", "iho", "ími", "ímu", "imu", "ách", "ata", "aty", "ých", "ama", "ami",
      "ové", "ovi", "ými"},
     {"em", "es", "ém", "ím", "ům", "at", "ám", "os", "us", "ým", "mi", "ou"},
   };

   /** Single trailing characters that are treated as a case ending. */
   private static final String FINAL_VOWELS = "aeiouůyáéíýě";

   /** Two-character possessive endings. */
   private static final String[] POSSESSIVE_ENDINGS = {"ov", "in", "ův"};

   /**
    * Stems a buffer of Czech text in place.
    *
    * <p>Three passes run in order: case-ending removal, possessive-ending
    * removal and, if anything is left, normalization of the stem's tail.</p>
    *
    * <p><b>NOTE</b>: Input is expected to be in lowercase,
    * but with diacritical marks</p>
    *
    * @param s input buffer; may be modified by the normalization pass
    * @param len length of input buffer
    * @return length of input buffer after normalization
    */
   public int stem(char s[], int len) {
     final int afterCase = removeCase(s, len);
     final int afterPossessive = removePossessives(s, afterCase);
     return afterPossessive > 0 ? normalize(s, afterPossessive) : afterPossessive;
   }

   /**
    * Reports whether any entry of {@code suffixes} is accepted by the
    * statically imported {@code endsWith} helper for the given buffer/length.
    * Candidates are tried in array order and the scan stops at the first hit.
    */
   private boolean endsWithAny(char[] s, int len, String[] suffixes) {
     for (String candidate : suffixes) {
       if (endsWith(s, len, candidate)) {
         return true;
       }
     }
     return false;
   }

   /**
    * Strips at most one case ending and returns the resulting length.
    * Longer endings are tried before shorter ones; the buffer itself is not
    * written to.
    */
   private int removeCase(char[] s, int len) {
     for (String[] group : CASE_ENDINGS) {
       final int remaining = len - group[0].length();
       if (remaining >= MIN_STEM_LENGTH && endsWithAny(s, len, group)) {
         return remaining;
       }
     }

     // No multi-character ending matched: fall back to dropping a lone final vowel.
     final int remaining = len - 1;
     if (remaining >= MIN_STEM_LENGTH && FINAL_VOWELS.indexOf(s[remaining]) >= 0) {
       return remaining;
     }

     return len;
   }

   /**
    * Strips a two-character possessive ending from words longer than five
    * characters and returns the resulting length.
    */
   private int removePossessives(char[] s, int len) {
     final boolean strip = len > 5 && endsWithAny(s, len, POSSESSIVE_ENDINGS);
     return strip ? len - 2 : len;
   }

   /**
    * Rewrites the tail of the stem in place and returns the new length.
    * Exactly one rule is applied (the first that matches); {@code len} must be
    * positive, which {@link #stem} guarantees.
    */
   private int normalize(char[] s, int len) {
     final int last = len - 1;
     final int prev = len - 2;

     // čt -> ck, št -> sk
     final boolean endsInCt = endsWith(s, len, "čt");
     if (endsInCt || endsWith(s, len, "št")) {
       s[prev] = endsInCt ? 'c' : 's';
       s[last] = 'k';
       return len;
     }

     final char tail = s[last];
     if (tail == 'c' || tail == 'č') { // [cč] -> k
       s[last] = 'k';
       return len;
     }
     if (tail == 'z' || tail == 'ž') { // [zž] -> h
       s[last] = 'h';
       return len;
     }

     if (len > 1) {
       if (s[prev] == 'e') { // e* > * : drop the 'e' by shifting the last char left
         s[prev] = tail;
         return last;
       }
       if (len > 2 && s[prev] == 'ů') { // *ů* -> *o*
         s[prev] = 'o';
       }
     }

     return len;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.cz;


	import static org.apache.lucene.analysis.util.StemmerUtil.*;

	/**
	* Light Stemmer for Czech.
	* <p>
	* Implements the algorithm described in:
	* <i>
	* Indexing and stemming approaches for the Czech language
	* </i>
	* http://portal.acm.org/citation.cfm?id=1598600
	* </p>
	*/
	public class CzechStemmer {

	/**
	* Stem an input buffer of Czech text.
	*
	* @param s input buffer
	* @param len length of input buffer
	* @return length of input buffer after normalization
	*
	* <p><b>NOTE</b>: Input is expected to be in lowercase,
	* but with diacritical marks</p>
	*/
	public int stem(char s[], int len) {
	len = removeCase(s, len);
	len = removePossessives(s, len);
	if (len > 0) {
	len = normalize(s, len);
	}
	return len;
	}

	private int removeCase(char s[], int len) {
	if (len > 7 && endsWith(s, len, "atech"))
	return len - 5;

	if (len > 6 &&
	(endsWith(s, len,"ětem") \|\|
	endsWith(s, len,"etem") \|\|
	endsWith(s, len,"atům")))
	return len - 4;

	if (len > 5 &&
	(endsWith(s, len, "ech") \|\|
	endsWith(s, len, "ich") \|\|
	endsWith(s, len, "ích") \|\|
	endsWith(s, len, "ého") \|\|
	endsWith(s, len, "ěmi") \|\|
	endsWith(s, len, "emi") \|\|
	endsWith(s, len, "ému") \|\|
	endsWith(s, len, "ěte") \|\|
	endsWith(s, len, "ete") \|\|
	endsWith(s, len, "ěti") \|\|
	endsWith(s, len, "eti") \|\|
	endsWith(s, len, "ího") \|\|
	endsWith(s, len, "iho") \|\|
	endsWith(s, len, "ími") \|\|
	endsWith(s, len, "ímu") \|\|
	endsWith(s, len, "imu") \|\|
	endsWith(s, len, "ách") \|\|
	endsWith(s, len, "ata") \|\|
	endsWith(s, len, "aty") \|\|
	endsWith(s, len, "ých") \|\|
	endsWith(s, len, "ama") \|\|
	endsWith(s, len, "ami") \|\|
	endsWith(s, len, "ové") \|\|
	endsWith(s, len, "ovi") \|\|
	endsWith(s, len, "ými")))
	return len - 3;

	if (len > 4 &&
	(endsWith(s, len, "em") \|\|
	endsWith(s, len, "es") \|\|
	endsWith(s, len, "ém") \|\|
	endsWith(s, len, "ím") \|\|
	endsWith(s, len, "ům") \|\|
	endsWith(s, len, "at") \|\|
	endsWith(s, len, "ám") \|\|
	endsWith(s, len, "os") \|\|
	endsWith(s, len, "us") \|\|
	endsWith(s, len, "ým") \|\|
	endsWith(s, len, "mi") \|\|
	endsWith(s, len, "ou")))
	return len - 2;

	if (len > 3) {
	switch (s[len - 1]) {
	case 'a':
	case 'e':
	case 'i':
	case 'o':
	case 'u':
	case 'ů':
	case 'y':
	case 'á':
	case 'é':
	case 'í':
	case 'ý':
	case 'ě':
	return len - 1;
	}
	}

	return len;
	}

	private int removePossessives(char s[], int len) {
	if (len > 5 &&
	(endsWith(s, len, "ov") \|\|
	endsWith(s, len, "in") \|\|
	endsWith(s, len, "ův")))
	return len - 2;

	return len;
	}

	private int normalize(char s[], int len) {
	if (endsWith(s, len, "čt")) { // čt -> ck
	s[len - 2] = 'c';
	s[len - 1] = 'k';
	return len;
	}

	if (endsWith(s, len, "št")) { // št -> sk
	s[len - 2] = 's';
	s[len - 1] = 'k';
	return len;
	}

	switch(s[len - 1]) {
	case 'c': // [cč] -> k
	case 'č':
	s[len - 1] = 'k';
	return len;
	case 'z': // [zž] -> h
	case 'ž':
	s[len - 1] = 'h';
	return len;
	}

	if (len > 1 && s[len - 2] == 'e') {
	s[len - 2] = s[len - 1]; // e* > *
	return len - 1;
	}

	if (len > 2 && s[len - 2] == 'ů') {
	s[len - 2] = 'o'; // ů -> o
	return len;
	}

	return len;
	}
	}