// Source: lucene/analysis/common/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.snowball;

 import java.io.IOException;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
 import org.tartarus.snowball.SnowballStemmer;

 /**
  * A filter that stems words using a Snowball-generated stemmer.
  *
  * <p>Available stemmers are listed in {@link org.tartarus.snowball.ext}.
  *
  * <p><b>NOTE</b>: SnowballFilter expects lowercased text.
  *
  * <ul>
  *   <li>For the Turkish language, see {@link TurkishLowerCaseFilter}.
  *   <li>For other languages, see {@link LowerCaseFilter}.
  * </ul>
  *
  * <p>Note: This filter is aware of the {@link KeywordAttribute}. To prevent certain terms from
  * being passed to the stemmer {@link KeywordAttribute#isKeyword()} should be set to <code>true
  * </code> in a previous {@link TokenStream}.
  *
  * <p>Note: For including the original term as well as the stemmed version, see {@link
  * org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
  */
 public final class SnowballFilter extends TokenFilter {

   /** Stemmer that every non-keyword term is passed through. */
   private final SnowballStemmer stemmer;

   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

   /**
    * Construct a stemming filter around an already-instantiated stemmer.
    *
    * @param input the input tokens to stem
    * @param stemmer the stemmer applied to each term
    */
   public SnowballFilter(TokenStream input, SnowballStemmer stemmer) {
     super(input);
     this.stemmer = stemmer;
   }

   /**
    * Construct the named stemming filter.
    *
    * <p>Available stemmers are listed in {@link org.tartarus.snowball.ext}. The name of a stemmer is
    * the part of the class name before "Stemmer", e.g., the stemmer in {@link
    * org.tartarus.snowball.ext.EnglishStemmer} is named "English".
    *
    * @param in the input tokens to stem
    * @param name the name of a stemmer
    * @throws IllegalArgumentException if no stemmer can be instantiated for {@code name}
    */
   public SnowballFilter(TokenStream in, String name) {
     super(in);
     this.stemmer = loadStemmer(name);
   }

   /**
    * Reflectively instantiates {@code org.tartarus.snowball.ext.<name>Stemmer} through its no-arg
    * constructor.
    *
    * @param name the stemmer name, i.e. the class name without the "Stemmer" suffix
    * @return a new stemmer instance
    * @throws IllegalArgumentException wrapping any exception raised while loading or instantiating
    */
   private static SnowballStemmer loadStemmer(String name) {
     // Class.forName is frowned upon in place of the ResourceLoader but in this case,
     // the factory will use the other constructor so that the program is already loaded.
     final String className = "org.tartarus.snowball.ext." + name + "Stemmer";
     try {
       final Class<? extends SnowballStemmer> stemmerType =
           Class.forName(className).asSubclass(SnowballStemmer.class);
       return stemmerType.getConstructor().newInstance();
     } catch (Exception e) {
       throw new IllegalArgumentException("Invalid stemmer class specified: " + name, e);
     }
   }

   /**
    * Advances the input and, unless the token is marked as a keyword, replaces its term with the
    * stemmer's output.
    *
    * @return {@code true} if the input produced a token, {@code false} otherwise
    */
   @Override
   public final boolean incrementToken() throws IOException {
     if (!input.incrementToken()) {
       return false;
     }
     if (keywordAttr.isKeyword()) {
       // Keyword tokens bypass the stemmer and are passed on unchanged.
       return true;
     }

     final char[] original = termAtt.buffer();
     stemmer.setCurrent(original, termAtt.length());
     stemmer.stem();

     final char[] stemmed = stemmer.getCurrentBuffer();
     final int stemmedLength = stemmer.getCurrentBufferLength();
     if (stemmed == original) {
       // Same array instance as the term buffer: only the length needs adjusting.
       termAtt.setLength(stemmedLength);
     } else {
       // The stemmer reports a different array: copy its contents into the term.
       termAtt.copyBuffer(stemmed, 0, stemmedLength);
     }
     return true;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.snowball;

	import java.io.IOException;
	import org.apache.lucene.analysis.LowerCaseFilter;
	import org.apache.lucene.analysis.TokenFilter;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
	import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
	import org.tartarus.snowball.SnowballStemmer;

	/**
	* A filter that stems words using a Snowball-generated stemmer.
	*
	* <p>Available stemmers are listed in {@link org.tartarus.snowball.ext}.
	*
	* <p><b>NOTE</b>: SnowballFilter expects lowercased text.
	*
	* <ul>
	* <li>For the Turkish language, see {@link TurkishLowerCaseFilter}.
	* <li>For other languages, see {@link LowerCaseFilter}.
	* </ul>
	*
	* <p>Note: This filter is aware of the {@link KeywordAttribute}. To prevent certain terms from
	* being passed to the stemmer {@link KeywordAttribute#isKeyword()} should be set to <code>true
	* </code> in a previous {@link TokenStream}.
	*
	* <p>Note: For including the original term as well as the stemmed version, see {@link
	* org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
	*/
	public final class SnowballFilter extends TokenFilter {

	  /** Stemmer used for every term that is not flagged as a keyword. */
	  private final SnowballStemmer stemmer;

	  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

	  /**
	   * Construct a stemming filter that uses the supplied stemmer instance.
	   *
	   * @param input the input tokens to stem
	   * @param stemmer the stemmer applied to each term
	   */
	  public SnowballFilter(TokenStream input, SnowballStemmer stemmer) {
	    super(input);
	    this.stemmer = stemmer;
	  }

	  /**
	   * Construct the named stemming filter.
	   *
	   * <p>Available stemmers are listed in {@link org.tartarus.snowball.ext}. The name of a stemmer is
	   * the part of the class name before "Stemmer", e.g., the stemmer in {@link
	   * org.tartarus.snowball.ext.EnglishStemmer} is named "English".
	   *
	   * @param in the input tokens to stem
	   * @param name the name of a stemmer
	   * @throws IllegalArgumentException if the stemmer class cannot be loaded or instantiated
	   */
	  public SnowballFilter(TokenStream in, String name) {
	    super(in);
	    // Class.forName is frowned upon in place of the ResourceLoader but in this case,
	    // the factory will use the other constructor so that the program is already loaded.
	    final String qualifiedName = "org.tartarus.snowball.ext." + name + "Stemmer";
	    try {
	      final Class<?> loaded = Class.forName(qualifiedName);
	      final Class<? extends SnowballStemmer> stemmerClass =
	          loaded.asSubclass(SnowballStemmer.class);
	      this.stemmer = stemmerClass.getConstructor().newInstance();
	    } catch (Exception e) {
	      throw new IllegalArgumentException("Invalid stemmer class specified: " + name, e);
	    }
	  }

	  /**
	   * Pulls the next token from the input and stems its term, skipping tokens whose {@link
	   * KeywordAttribute} marks them as keywords.
	   *
	   * @return {@code true} if the input produced a token, {@code false} otherwise
	   */
	  @Override
	  public final boolean incrementToken() throws IOException {
	    final boolean hasToken = input.incrementToken();
	    if (!hasToken) {
	      return false;
	    }

	    if (!keywordAttr.isKeyword()) {
	      final char[] before = termAtt.buffer();
	      final int beforeLength = termAtt.length();
	      stemmer.setCurrent(before, beforeLength);
	      stemmer.stem();

	      final char[] after = stemmer.getCurrentBuffer();
	      final int afterLength = stemmer.getCurrentBufferLength();
	      if (after == before) {
	        // The stemmer still refers to the term's own array, so adjusting the length suffices.
	        termAtt.setLength(afterLength);
	      } else {
	        // A different array came back: copy its contents into the term attribute.
	        termAtt.copyBuffer(after, 0, afterLength);
	      }
	    }
	    return true;
	  }
	}