| package org.apache.lucene.analysis.fr; |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /** |
| * A stemmer for French words. |
| * <p> |
| * The algorithm is based on the work of |
| * Dr Martin Porter on his snowball project<br> |
| * refer to http://snowball.sourceforge.net/french/stemmer.html<br> |
| * (French stemming algorithm) for details |
| * </p> |
| * @deprecated Use {@link org.tartarus.snowball.ext.FrenchStemmer} instead, |
| * which has the same functionality. This filter will be removed in Lucene 5.0 |
| */ |
| @Deprecated |
| public class FrenchStemmer { |
| |
| /** |
| * Buffer for the terms while stemming them. |
| */ |
| private StringBuilder sb = new StringBuilder(); |
| |
| /** |
| * A temporary buffer, used to reconstruct R2 |
| */ |
| private StringBuilder tb = new StringBuilder(); |
| |
| /** |
| * Region R0 is equal to the whole buffer |
| */ |
| private String R0; |
| |
| /** |
| * Region RV |
| * "If the word begins with two vowels, RV is the region after the third letter, |
| * otherwise the region after the first vowel not at the beginning of the word, |
| * or the end of the word if these positions cannot be found." |
| */ |
| private String RV; |
| |
| /** |
| * Region R1 |
| * "R1 is the region after the first non-vowel following a vowel |
| * or is the null region at the end of the word if there is no such non-vowel" |
| */ |
| private String R1; |
| |
| /** |
| * Region R2 |
| * "R2 is the region after the first non-vowel in R1 following a vowel |
| * or is the null region at the end of the word if there is no such non-vowel" |
| */ |
| private String R2; |
| |
| |
| /** |
| * Set to true if we need to perform step 2 |
| */ |
| private boolean suite; |
| |
| /** |
| * Set to true if the buffer was modified |
| */ |
| private boolean modified; |
| |
| |
| /** |
| * Stems the given term to a unique <tt>discriminator</tt>. |
| * |
| * @param term java.langString The term that should be stemmed |
| * @return java.lang.String Discriminator for <tt>term</tt> |
| */ |
| protected String stem( String term ) { |
| if ( !isStemmable( term ) ) { |
| return term; |
| } |
| |
| // Use lowercase for medium stemming. |
| term = term.toLowerCase(); |
| |
| // Reset the StringBuilder. |
| sb.delete( 0, sb.length() ); |
| sb.insert( 0, term ); |
| |
| // reset the booleans |
| modified = false; |
| suite = false; |
| |
| sb = treatVowels( sb ); |
| |
| setStrings(); |
| |
| step1(); |
| |
| if (!modified || suite) |
| { |
| if (RV != null) |
| { |
| suite = step2a(); |
| if (!suite) |
| step2b(); |
| } |
| } |
| |
| if (modified || suite) |
| step3(); |
| else |
| step4(); |
| |
| step5(); |
| |
| step6(); |
| |
| return sb.toString(); |
| } |
| |
| /** |
| * Sets the search region Strings<br> |
| * it needs to be done each time the buffer was modified |
| */ |
| private void setStrings() { |
| // set the strings |
| R0 = sb.toString(); |
| RV = retrieveRV( sb ); |
| R1 = retrieveR( sb ); |
| if ( R1 != null ) |
| { |
| tb.delete( 0, tb.length() ); |
| tb.insert( 0, R1 ); |
| R2 = retrieveR( tb ); |
| } |
| else |
| R2 = null; |
| } |
| |
| /** |
| * First step of the Porter Algorithm<br> |
| * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation |
| */ |
| private void step1( ) { |
| String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" }; |
| deleteFrom( R2, suffix ); |
| |
| replaceFrom( R2, new String[] { "logies", "logie" }, "log" ); |
| replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" ); |
| replaceFrom( R2, new String[] { "ences", "ence" }, "ent" ); |
| |
| String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"}; |
| deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" ); |
| |
| deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" ); |
| deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false ); |
| deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false ); |
| deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false ); |
| deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false ); |
| |
| deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 ); |
| deleteFrom( RV, new String[] { "ements", "ement" } ); |
| |
| deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" ); |
| deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" ); |
| deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true ); |
| |
| String[] autre = { "ifs", "ives", "if", "ive" }; |
| deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" ); |
| deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" ); |
| |
| replaceFrom( R0, new String[] { "eaux" }, "eau" ); |
| |
| replaceFrom( R1, new String[] { "aux" }, "al" ); |
| |
| deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" ); |
| |
| deleteFrom( R2, new String[] { "eux" } ); |
| |
| // if one of the next steps is performed, we will need to perform step2a |
| boolean temp = false; |
| temp = replaceFrom( RV, new String[] { "amment" }, "ant" ); |
| if (temp == true) |
| suite = true; |
| temp = replaceFrom( RV, new String[] { "emment" }, "ent" ); |
| if (temp == true) |
| suite = true; |
| temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV ); |
| if (temp == true) |
| suite = true; |
| |
| } |
| |
| /** |
| * Second step (A) of the Porter Algorithm<br> |
| * Will be performed if nothing changed from the first step |
| * or changed were done in the amment, emment, ments or ment suffixes<br> |
| * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation |
| * |
| * @return boolean - true if something changed in the StringBuilder |
| */ |
| private boolean step2a() { |
| String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira", |
| "irent", "iriez", "irez", "irions", "irons", "iront", |
| "issaIent", "issais", "issantes", "issante", "issants", "issant", |
| "issait", "issais", "issions", "issons", "issiez", "issez", "issent", |
| "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" }; |
| return deleteFromIfTestVowelBeforeIn( RV, search, false, RV ); |
| } |
| |
| /** |
| * Second step (B) of the Porter Algorithm<br> |
| * Will be performed if step 2 A was performed unsuccessfully<br> |
| * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation |
| */ |
| private void step2b() { |
| String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez", |
| "erons", "eront","erez", "èrent", "era", "ées", "iez", |
| "ée", "és", "er", "ez", "é" }; |
| deleteFrom( RV, suffix ); |
| |
| String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent", |
| "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant", |
| "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" }; |
| deleteButSuffixFrom( RV, search, "e", true ); |
| |
| deleteFrom( R2, new String[] { "ions" } ); |
| } |
| |
| /** |
| * Third step of the Porter Algorithm<br> |
| * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation |
| */ |
| private void step3() { |
| if (sb.length()>0) |
| { |
| char ch = sb.charAt( sb.length()-1 ); |
| if (ch == 'Y') |
| { |
| sb.setCharAt( sb.length()-1, 'i' ); |
| setStrings(); |
| } |
| else if (ch == 'ç') |
| { |
| sb.setCharAt( sb.length()-1, 'c' ); |
| setStrings(); |
| } |
| } |
| } |
| |
| /** |
| * Fourth step of the Porter Algorithm<br> |
| * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation |
| */ |
| private void step4() { |
| if (sb.length() > 1) |
| { |
| char ch = sb.charAt( sb.length()-1 ); |
| if (ch == 's') |
| { |
| char b = sb.charAt( sb.length()-2 ); |
| if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's') |
| { |
| sb.delete( sb.length() - 1, sb.length()); |
| setStrings(); |
| } |
| } |
| } |
| boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" ); |
| if (!found) |
| found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" ); |
| |
| replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" ); |
| deleteFrom( RV, new String[] { "e" } ); |
| deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" ); |
| } |
| |
| /** |
| * Fifth step of the Porter Algorithm<br> |
| * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation |
| */ |
| private void step5() { |
| if (R0 != null) |
| { |
| if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill")) |
| { |
| sb.delete( sb.length() - 1, sb.length() ); |
| setStrings(); |
| } |
| } |
| } |
| |
| /** |
| * Sixth (and last!) step of the Porter Algorithm<br> |
| * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation |
| */ |
| private void step6() { |
| if (R0!=null && R0.length()>0) |
| { |
| boolean seenVowel = false; |
| boolean seenConson = false; |
| int pos = -1; |
| for (int i = R0.length()-1; i > -1; i--) |
| { |
| char ch = R0.charAt(i); |
| if (isVowel(ch)) |
| { |
| if (!seenVowel) |
| { |
| if (ch == 'é' || ch == 'è') |
| { |
| pos = i; |
| break; |
| } |
| } |
| seenVowel = true; |
| } |
| else |
| { |
| if (seenVowel) |
| break; |
| else |
| seenConson = true; |
| } |
| } |
| if (pos > -1 && seenConson && !seenVowel) |
| sb.setCharAt(pos, 'e'); |
| } |
| } |
| |
| /** |
| * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string |
| * |
| * @param source java.lang.String - the primary source zone for search |
| * @param search java.lang.String[] - the strings to search for suppression |
| * @param from java.lang.String - the secondary source zone for search |
| * @param prefix java.lang.String - the prefix to add to the search string to test |
| * @return boolean - true if modified |
| */ |
| private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) { |
| boolean found = false; |
| if (source!=null ) |
| { |
| for (int i = 0; i < search.length; i++) { |
| if ( source.endsWith( search[i] )) |
| { |
| if (from!=null && from.endsWith( prefix + search[i] )) |
| { |
| sb.delete( sb.length() - search[i].length(), sb.length()); |
| found = true; |
| setStrings(); |
| break; |
| } |
| } |
| } |
| } |
| return found; |
| } |
| |
| /** |
| * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel |
| * |
| * @param source java.lang.String - the primary source zone for search |
| * @param search java.lang.String[] - the strings to search for suppression |
| * @param vowel boolean - true if we need a vowel before the search string |
| * @param from java.lang.String - the secondary source zone for search (where vowel could be) |
| * @return boolean - true if modified |
| */ |
| private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) { |
| boolean found = false; |
| if (source!=null && from!=null) |
| { |
| for (int i = 0; i < search.length; i++) { |
| if ( source.endsWith( search[i] )) |
| { |
| if ((search[i].length() + 1) <= from.length()) |
| { |
| boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1))); |
| if (test == vowel) |
| { |
| sb.delete( sb.length() - search[i].length(), sb.length()); |
| modified = true; |
| found = true; |
| setStrings(); |
| break; |
| } |
| } |
| } |
| } |
| } |
| return found; |
| } |
| |
| /** |
| * Delete a suffix searched in zone "source" if preceded by the prefix |
| * |
| * @param source java.lang.String - the primary source zone for search |
| * @param search java.lang.String[] - the strings to search for suppression |
| * @param prefix java.lang.String - the prefix to add to the search string to test |
| * @param without boolean - true if it will be deleted even without prefix found |
| */ |
| private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) { |
| if (source!=null) |
| { |
| for (int i = 0; i < search.length; i++) { |
| if ( source.endsWith( prefix + search[i] )) |
| { |
| sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() ); |
| modified = true; |
| setStrings(); |
| break; |
| } |
| else if ( without && source.endsWith( search[i] )) |
| { |
| sb.delete( sb.length() - search[i].length(), sb.length() ); |
| modified = true; |
| setStrings(); |
| break; |
| } |
| } |
| } |
| } |
| |
| /** |
| * Delete a suffix searched in zone "source" if preceded by prefix<br> |
| * or replace it with the replace string if preceded by the prefix in the zone "from"<br> |
| * or delete the suffix if specified |
| * |
| * @param source java.lang.String - the primary source zone for search |
| * @param search java.lang.String[] - the strings to search for suppression |
| * @param prefix java.lang.String - the prefix to add to the search string to test |
| * @param without boolean - true if it will be deleted even without prefix found |
| */ |
| private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) { |
| if (source!=null) |
| { |
| for (int i = 0; i < search.length; i++) { |
| if ( source.endsWith( prefix + search[i] )) |
| { |
| sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() ); |
| modified = true; |
| setStrings(); |
| break; |
| } |
| else if ( from!=null && from.endsWith( prefix + search[i] )) |
| { |
| sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace ); |
| modified = true; |
| setStrings(); |
| break; |
| } |
| else if ( without && source.endsWith( search[i] )) |
| { |
| sb.delete( sb.length() - search[i].length(), sb.length() ); |
| modified = true; |
| setStrings(); |
| break; |
| } |
| } |
| } |
| } |
| |
| /** |
| * Replace a search string with another within the source zone |
| * |
| * @param source java.lang.String - the source zone for search |
| * @param search java.lang.String[] - the strings to search for replacement |
| * @param replace java.lang.String - the replacement string |
| */ |
| private boolean replaceFrom( String source, String[] search, String replace ) { |
| boolean found = false; |
| if (source!=null) |
| { |
| for (int i = 0; i < search.length; i++) { |
| if ( source.endsWith( search[i] )) |
| { |
| sb.replace( sb.length() - search[i].length(), sb.length(), replace ); |
| modified = true; |
| found = true; |
| setStrings(); |
| break; |
| } |
| } |
| } |
| return found; |
| } |
| |
| /** |
| * Delete a search string within the source zone |
| * |
| * @param source the source zone for search |
| * @param suffix the strings to search for suppression |
| */ |
| private void deleteFrom(String source, String[] suffix ) { |
| if (source!=null) |
| { |
| for (int i = 0; i < suffix.length; i++) { |
| if (source.endsWith( suffix[i] )) |
| { |
| sb.delete( sb.length() - suffix[i].length(), sb.length()); |
| modified = true; |
| setStrings(); |
| break; |
| } |
| } |
| } |
| } |
| |
| /** |
| * Test if a char is a french vowel, including accentuated ones |
| * |
| * @param ch the char to test |
| * @return boolean - true if the char is a vowel |
| */ |
| private boolean isVowel(char ch) { |
| switch (ch) |
| { |
| case 'a': |
| case 'e': |
| case 'i': |
| case 'o': |
| case 'u': |
| case 'y': |
| case 'â': |
| case 'à': |
| case 'ë': |
| case 'é': |
| case 'ê': |
| case 'è': |
| case 'ï': |
| case 'î': |
| case 'ô': |
| case 'ü': |
| case 'ù': |
| case 'û': |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| /** |
| * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br> |
| * "R is the region after the first non-vowel following a vowel |
| * or is the null region at the end of the word if there is no such non-vowel"<br> |
| * @param buffer java.lang.StringBuilder - the in buffer |
| * @return java.lang.String - the resulting string |
| */ |
| private String retrieveR( StringBuilder buffer ) { |
| int len = buffer.length(); |
| int pos = -1; |
| for (int c = 0; c < len; c++) { |
| if (isVowel( buffer.charAt( c ))) |
| { |
| pos = c; |
| break; |
| } |
| } |
| if (pos > -1) |
| { |
| int consonne = -1; |
| for (int c = pos; c < len; c++) { |
| if (!isVowel(buffer.charAt( c ))) |
| { |
| consonne = c; |
| break; |
| } |
| } |
| if (consonne > -1 && (consonne+1) < len) |
| return buffer.substring( consonne+1, len ); |
| else |
| return null; |
| } |
| else |
| return null; |
| } |
| |
| /** |
| * Retrieve the "RV zone" from a buffer an return the corresponding string<br> |
| * "If the word begins with two vowels, RV is the region after the third letter, |
| * otherwise the region after the first vowel not at the beginning of the word, |
| * or the end of the word if these positions cannot be found."<br> |
| * @param buffer java.lang.StringBuilder - the in buffer |
| * @return java.lang.String - the resulting string |
| */ |
| private String retrieveRV( StringBuilder buffer ) { |
| int len = buffer.length(); |
| if ( buffer.length() > 3) |
| { |
| if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) { |
| return buffer.substring(3,len); |
| } |
| else |
| { |
| int pos = 0; |
| for (int c = 1; c < len; c++) { |
| if (isVowel( buffer.charAt( c ))) |
| { |
| pos = c; |
| break; |
| } |
| } |
| if ( pos+1 < len ) |
| return buffer.substring( pos+1, len ); |
| else |
| return null; |
| } |
| } |
| else |
| return null; |
| } |
| |
| |
| |
| /** |
| * Turns u and i preceded AND followed by a vowel to UpperCase<br> |
| * Turns y preceded OR followed by a vowel to UpperCase<br> |
| * Turns u preceded by q to UpperCase<br> |
| * |
| * @param buffer java.util.StringBuilder - the buffer to treat |
| * @return java.util.StringBuilder - the treated buffer |
| */ |
| private StringBuilder treatVowels( StringBuilder buffer ) { |
| for ( int c = 0; c < buffer.length(); c++ ) { |
| char ch = buffer.charAt( c ); |
| |
| if (c == 0) // first char |
| { |
| if (buffer.length()>1) |
| { |
| if (ch == 'y' && isVowel(buffer.charAt( c + 1 ))) |
| buffer.setCharAt( c, 'Y' ); |
| } |
| } |
| else if (c == buffer.length()-1) // last char |
| { |
| if (ch == 'u' && buffer.charAt( c - 1 ) == 'q') |
| buffer.setCharAt( c, 'U' ); |
| if (ch == 'y' && isVowel(buffer.charAt( c - 1 ))) |
| buffer.setCharAt( c, 'Y' ); |
| } |
| else // other cases |
| { |
| if (ch == 'u') |
| { |
| if (buffer.charAt( c - 1) == 'q') |
| buffer.setCharAt( c, 'U' ); |
| else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 ))) |
| buffer.setCharAt( c, 'U' ); |
| } |
| if (ch == 'i') |
| { |
| if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 ))) |
| buffer.setCharAt( c, 'I' ); |
| } |
| if (ch == 'y') |
| { |
| if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 ))) |
| buffer.setCharAt( c, 'Y' ); |
| } |
| } |
| } |
| |
| return buffer; |
| } |
| |
| /** |
| * Checks a term if it can be processed correctly. |
| * |
| * @return boolean - true if, and only if, the given term consists in letters. |
| */ |
| private boolean isStemmable( String term ) { |
| boolean upper = false; |
| int first = -1; |
| for ( int c = 0; c < term.length(); c++ ) { |
| // Discard terms that contain non-letter characters. |
| if ( !Character.isLetter( term.charAt( c ) ) ) { |
| return false; |
| } |
| // Discard terms that contain multiple uppercase letters. |
| if ( Character.isUpperCase( term.charAt( c ) ) ) { |
| if ( upper ) { |
| return false; |
| } |
| // First encountered uppercase letter, set flag and save |
| // position. |
| else { |
| first = c; |
| upper = true; |
| } |
| } |
| } |
| // Discard the term if it contains a single uppercase letter that |
| // is not starting the term. |
| if ( first > 0 ) { |
| return false; |
| } |
| return true; |
| } |
| } |