src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java - lucene-solr - Git at Google

 package org.apache.lucene.analysis.de;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import java.io.File;
 import java.io.Reader;
 import java.util.Hashtable;

 /**
  * Analyzer for german language. Supports an external list of stopwords (words that
  * will not be indexed at all) and an external list of exclusions (word that will
  * not be stemmed, but indexed).
  *
  * @author    Gerhard Schwarz
  * @version   $Id$
  */
 public final class GermanAnalyzer extends Analyzer {

 	/**
 	 * List of typical german stopwords.
 	 */
 	private String[] GERMAN_STOP_WORDS = {
 		"einer", "eine", "eines", "einem", "einen",
 		"der", "die", "das", "dass", "daß",
 		"du", "er", "sie", "es",
 		"was", "wer", "wie", "wir",
 		"und", "oder", "ohne", "mit",
 		"am", "im", "in", "aus", "auf",
 		"ist", "sein", "war", "wird",
 		"ihr", "ihre", "ihres",
 		"als", "für", "von", "mit",
 		"dich", "dir", "mich", "mir",
 		"mein", "sein", "kein",
 		"durch", "wegen"
 		};

 	/**
 	 * Contains the stopwords used with the StopFilter.
 	 */
 	private Hashtable stoptable = new Hashtable();
 	/**
 	 * Contains words that should be indexed but not stemmed.
 	 */
 	private Hashtable excltable = new Hashtable();

 	/**
 	 * Builds an analyzer.
 	 */
 	public GermanAnalyzer() {
 		stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS );
 	}

 	/**
 	 * Builds an analyzer with the given stop words.
 	 */
 	public GermanAnalyzer( String[] stopwords ) {
 		stoptable = StopFilter.makeStopTable( stopwords );
 	}

 	/**
 	 * Builds an analyzer with the given stop words.
 	 */
 	public GermanAnalyzer( Hashtable stopwords ) {
 		stoptable = stopwords;
 	}

 	/**
 	 * Builds an analyzer with the given stop words.
 	 */
 	public GermanAnalyzer( File stopwords ) {
 		stoptable = WordlistLoader.getWordtable( stopwords );
 	}

 	/**
 	 * Builds an exclusionlist from an array of Strings.
 	 */
 	public void setStemExclusionTable( String[] exclusionlist ) {
 		excltable = StopFilter.makeStopTable( exclusionlist );
 	}
 	/**
 	 * Builds an exclusionlist from a Hashtable.
 	 */
 	public void setStemExclusionTable( Hashtable exclusionlist ) {
 		excltable = exclusionlist;
 	}
 	/**
 	 * Builds an exclusionlist from the words contained in the given file.
 	 */
 	public void setStemExclusionTable( File exclusionlist ) {
 		excltable = WordlistLoader.getWordtable( exclusionlist );
 	}

 	/**
 	 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 	 *
 	 * @return  A TokenStream build from a StandardTokenizer filtered with
 	 * 			StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.
 	 */
 	public final TokenStream tokenStream(String fieldName, Reader reader) {
 		TokenStream result = new StandardTokenizer( reader );
 		result = new StandardFilter( result );
 		result = new StopFilter( result, stoptable );
 		result = new GermanStemFilter( result, excltable );
 		// Convert to lowercase after stemming!
 		result = new LowerCaseFilter( result );
 		return result;
 	}
 }
	package org.apache.lucene.analysis.de;

	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.LowerCaseFilter;
	import org.apache.lucene.analysis.StopFilter;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.standard.StandardFilter;
	import org.apache.lucene.analysis.standard.StandardTokenizer;
	import java.io.File;
	import java.io.Reader;
	import java.util.Hashtable;

	/**
	* Analyzer for german language. Supports an external list of stopwords (words that
	* will not be indexed at all) and an external list of exclusions (word that will
	* not be stemmed, but indexed).
	*
	* @author Gerhard Schwarz
	* @version $Id$
	*/
	public final class GermanAnalyzer extends Analyzer {

	/**
	* List of typical german stopwords.
	*/
	private String[] GERMAN_STOP_WORDS = {
	"einer", "eine", "eines", "einem", "einen",
	"der", "die", "das", "dass", "daß",
	"du", "er", "sie", "es",
	"was", "wer", "wie", "wir",
	"und", "oder", "ohne", "mit",
	"am", "im", "in", "aus", "auf",
	"ist", "sein", "war", "wird",
	"ihr", "ihre", "ihres",
	"als", "für", "von", "mit",
	"dich", "dir", "mich", "mir",
	"mein", "sein", "kein",
	"durch", "wegen"
	};

	/**
	* Contains the stopwords used with the StopFilter.
	*/
	private Hashtable stoptable = new Hashtable();
	/**
	* Contains words that should be indexed but not stemmed.
	*/
	private Hashtable excltable = new Hashtable();

	/**
	* Builds an analyzer.
	*/
	public GermanAnalyzer() {
	stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS );
	}

	/**
	* Builds an analyzer with the given stop words.
	*/
	public GermanAnalyzer( String[] stopwords ) {
	stoptable = StopFilter.makeStopTable( stopwords );
	}

	/**
	* Builds an analyzer with the given stop words.
	*/
	public GermanAnalyzer( Hashtable stopwords ) {
	stoptable = stopwords;
	}

	/**
	* Builds an analyzer with the given stop words.
	*/
	public GermanAnalyzer( File stopwords ) {
	stoptable = WordlistLoader.getWordtable( stopwords );
	}

	/**
	* Builds an exclusionlist from an array of Strings.
	*/
	public void setStemExclusionTable( String[] exclusionlist ) {
	excltable = StopFilter.makeStopTable( exclusionlist );
	}
	/**
	* Builds an exclusionlist from a Hashtable.
	*/
	public void setStemExclusionTable( Hashtable exclusionlist ) {
	excltable = exclusionlist;
	}
	/**
	* Builds an exclusionlist from the words contained in the given file.
	*/
	public void setStemExclusionTable( File exclusionlist ) {
	excltable = WordlistLoader.getWordtable( exclusionlist );
	}

	/**
	* Creates a TokenStream which tokenizes all the text in the provided Reader.
	*
	* @return A TokenStream build from a StandardTokenizer filtered with
	* StandardFilter, StopFilter, GermanStemFilter and LowerCaseFilter.
	*/
	public final TokenStream tokenStream(String fieldName, Reader reader) {
	TokenStream result = new StandardTokenizer( reader );
	result = new StandardFilter( result );
	result = new StopFilter( result, stoptable );
	result = new GermanStemFilter( result, excltable );
	// Convert to lowercase after stemming!
	result = new LowerCaseFilter( result );
	return result;
	}
	}