// src/contrib/Analyzers/Fa/PersianAnalyzer.cs - lucenenet - Git at Google

 /*
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
 */

 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
 using Lucene.Net.Analysis.AR;
 using Version = Lucene.Net.Util.Version;

 namespace Lucene.Net.Analysis.Fa
 {
     /// <summary>
     /// <see cref="Analyzer"/> for Persian.
     /// <para>
     /// This Analyzer uses <see cref="ArabicLetterTokenizer"/>, which implies tokenizing around
     /// zero-width non-joiner in addition to whitespace. Some Persian-specific variant forms (such as Farsi
     /// yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
     /// </para>
     /// </summary>
     public sealed class PersianAnalyzer : Analyzer
     {
         /// <summary>
         /// Name of the resource holding the default Persian stopwords. It is appended
         /// to the "Lucene.Net.Analyzers.Fa." prefix to form the manifest resource name
         /// that is looked up in the assembly defining <see cref="PersianAnalyzer"/>.
         /// </summary>
         /// <remarks>
         /// The default stopword list is from
         /// http://members.unine.ch/jacques.savoy/clef/index.html and is BSD-licensed.
         /// </remarks>
         public static readonly String DEFAULT_STOPWORD_FILE = "stopwords.txt";

         /// <summary>
         /// The stopwords handed to the <see cref="StopFilter"/> at the end of the chain.
         /// </summary>
         private readonly ISet<string> stoptable;

         /// <summary>
         /// The comment marker passed to <see cref="WordlistLoader"/> when reading a
         /// stopword file; lines prefixed with it are meant to be ignored.
         /// </summary>
         public static readonly String STOPWORDS_COMMENT = "#";

         /// <summary>
         /// Returns the default stop-word set.
         /// </summary>
         /// <returns>
         /// The shared default set, wrapped with <c>CharArraySet.UnmodifiableSet</c>
         /// when it was loaded.
         /// </returns>
         public static ISet<string> getDefaultStopSet()
         {
             return DefaultSetHolder.DEFAULT_STOP_SET;
         }

         /// <summary>
         /// Holds the default stop-word set. The set is assigned in the static
         /// constructor, so the embedded resource is only read once this holder type
         /// is first accessed.
         /// </summary>
         private static class DefaultSetHolder
         {
             internal static readonly ISet<string> DEFAULT_STOP_SET;

             static DefaultSetHolder()
             {
                 try
                 {
                     DEFAULT_STOP_SET = LoadDefaultStopWordSet();
                 }
                 catch (IOException ex)
                 {
                     // The default set ships as an embedded resource of the assembly, so a
                     // failure here is unexpected. Keep the original exception type and
                     // message, but attach the cause so the failure can be diagnosed.
                     throw new Exception("Unable to load default stopword set", ex);
                 }
             }

             /// <summary>
             /// Reads the default stopword list from the embedded resource (UTF-8).
             /// </summary>
             /// <returns>An unmodifiable set built from the resource contents.</returns>
             /// <exception cref="FileNotFoundException">
             /// The embedded resource could not be found in the assembly.
             /// </exception>
             static ISet<String> LoadDefaultStopWordSet()
             {
                 string resourceName = "Lucene.Net.Analyzers.Fa." + DEFAULT_STOPWORD_FILE;

                 using (Stream stream = System.Reflection.Assembly.GetAssembly(typeof(PersianAnalyzer)).GetManifestResourceStream(resourceName))
                 {
                     // GetManifestResourceStream yields null for a missing resource. Report
                     // that as an IOException subtype so the static constructor's handler
                     // sees it, instead of failing later with a NullReferenceException.
                     if (stream == null)
                     {
                         throw new FileNotFoundException("Embedded stopword resource not found: " + resourceName, resourceName);
                     }

                     using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8))
                     {
                         // make sure it is unmodifiable as we expose it in the outer class
                         return CharArraySet.UnmodifiableSet(new CharArraySet(WordlistLoader.GetWordSet(reader, STOPWORDS_COMMENT), true));
                     }
                 }
             }
         }

         /// <summary>
         /// Compatibility version used to pick the position-increment default of the
         /// <see cref="StopFilter"/>.
         /// </summary>
         private readonly Version matchVersion;

         /// <summary>
         /// Builds an analyzer with the default stop words
         /// (see <see cref="DEFAULT_STOPWORD_FILE"/>).
         /// </summary>
         /// <param name="matchVersion">Lucene compatibility version.</param>
         public PersianAnalyzer(Version matchVersion)
             : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
         {
         }

         /// <summary>
         /// Builds an analyzer with the given stop words. The set is copied and the
         /// copy is wrapped as unmodifiable before being stored.
         /// </summary>
         /// <param name="matchVersion">Lucene compatibility version.</param>
         /// <param name="stopwords">A stopword set.</param>
         public PersianAnalyzer(Version matchVersion, ISet<string> stopwords)
         {
             stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
             this.matchVersion = matchVersion;
         }

         /// <summary>
         /// Builds an analyzer with the given stop words.
         /// Deprecated: prefer the constructor taking a stopword set.
         /// </summary>
         /// <param name="matchVersion">Lucene compatibility version.</param>
         /// <param name="stopwords">The stopwords.</param>
         public PersianAnalyzer(Version matchVersion, params string[] stopwords)
             : this(matchVersion, StopFilter.MakeStopSet(stopwords))
         {
         }

         /// <summary>
         /// Builds an analyzer using the keys of the given dictionary as stop words.
         /// Deprecated: prefer the constructor taking a stopword set.
         /// </summary>
         /// <param name="matchVersion">Lucene compatibility version.</param>
         /// <param name="stopwords">Dictionary whose keys are the stopwords.</param>
         public PersianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
             : this(matchVersion, stopwords.Keys.ToArray())
         {
         }

         /// <summary>
         /// Builds an analyzer with the stop words read from the given file. Lines can
         /// be commented out using <see cref="STOPWORDS_COMMENT"/>.
         /// Deprecated: prefer the constructor taking a stopword set.
         /// </summary>
         /// <param name="matchVersion">Lucene compatibility version.</param>
         /// <param name="stopwords">File containing the stopwords.</param>
         public PersianAnalyzer(Version matchVersion, FileInfo stopwords)
             : this(matchVersion, WordlistLoader.GetWordSet(stopwords, STOPWORDS_COMMENT))
         {
         }

         /// <summary>
         /// Wraps <paramref name="source"/> with the analyzer's filters. Shared by
         /// <c>TokenStream</c> and <c>ReusableTokenStream</c> so both always build the
         /// same chain.
         /// </summary>
         /// <param name="source">The stream to filter.</param>
         /// <returns>
         /// <paramref name="source"/> filtered with <see cref="LowerCaseFilter"/>,
         /// <see cref="ArabicNormalizationFilter"/>,
         /// <see cref="PersianNormalizationFilter"/> and the Persian stop words.
         /// </returns>
         private TokenStream BuildFilterChain(TokenStream source)
         {
             TokenStream result = new LowerCaseFilter(source);
             result = new ArabicNormalizationFilter(result);
             /* additional persian-specific normalization */
             result = new PersianNormalizationFilter(result);
             /*
              * the order here is important: the stopword list is normalized with the
              * above!
              */
             result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                     result, stoptable);
             return result;
         }

         /// <summary>
         /// Creates a token stream which tokenizes all the text in the provided reader.
         /// </summary>
         /// <param name="fieldName">Field name; not used by this analyzer.</param>
         /// <param name="reader">Source of the text to tokenize.</param>
         /// <returns>
         /// A token stream built from an <see cref="ArabicLetterTokenizer"/> filtered
         /// with <see cref="LowerCaseFilter"/>, <see cref="ArabicNormalizationFilter"/>,
         /// <see cref="PersianNormalizationFilter"/> and Persian stop words.
         /// </returns>
         public override TokenStream TokenStream(String fieldName, TextReader reader)
         {
             return BuildFilterChain(new ArabicLetterTokenizer(reader));
         }

         /// <summary>
         /// Pairs the tokenizer with the end of its filter chain so that both can be
         /// stored in <c>PreviousTokenStream</c> and reused.
         /// </summary>
         private sealed class SavedStreams
         {
             internal Tokenizer source;
             internal TokenStream result;
         }

         /// <summary>
         /// Returns a (possibly reused) token stream which tokenizes all the text in
         /// the provided reader. On first use the chain is built and stored in
         /// <c>PreviousTokenStream</c>; afterwards the stored tokenizer is reset onto
         /// the new reader.
         /// </summary>
         /// <param name="fieldName">Field name; not used by this analyzer.</param>
         /// <param name="reader">Source of the text to tokenize.</param>
         /// <returns>
         /// A token stream built from an <see cref="ArabicLetterTokenizer"/> filtered
         /// with <see cref="LowerCaseFilter"/>, <see cref="ArabicNormalizationFilter"/>,
         /// <see cref="PersianNormalizationFilter"/> and Persian stop words.
         /// </returns>
         public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
         {
             SavedStreams streams = (SavedStreams)PreviousTokenStream;
             if (streams == null)
             {
                 streams = new SavedStreams();
                 streams.source = new ArabicLetterTokenizer(reader);
                 streams.result = BuildFilterChain(streams.source);
                 PreviousTokenStream = streams;
             }
             else
             {
                 streams.source.Reset(reader);
             }
             return streams.result;
         }
     }
 }
	/*
	*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	*/

	using System;
	using System.Collections.Generic;
	using System.IO;
	using System.Linq;
	using Lucene.Net.Analysis.AR;
	using Version = Lucene.Net.Util.Version;

	namespace Lucene.Net.Analysis.Fa
	{
	/// <summary>
	/// <see cref="Analyzer"/> for Persian.
	/// <para>
	/// This Analyzer uses <see cref="ArabicLetterTokenizer"/>, which implies tokenizing around
	/// zero-width non-joiner in addition to whitespace. Some Persian-specific variant forms (such as Farsi
	/// yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
	/// </para>
	/// </summary>
	public sealed class PersianAnalyzer : Analyzer
	{
		/// <summary>
		/// Name of the resource holding the default Persian stopwords. It is appended
		/// to the "Lucene.Net.Analyzers.Fa." prefix to form the manifest resource name
		/// that is looked up in the assembly defining <see cref="PersianAnalyzer"/>.
		/// </summary>
		/// <remarks>
		/// The default stopword list is from
		/// http://members.unine.ch/jacques.savoy/clef/index.html and is BSD-licensed.
		/// </remarks>
		public static readonly String DEFAULT_STOPWORD_FILE = "stopwords.txt";

		/// <summary>
		/// The stopwords handed to the <see cref="StopFilter"/> at the end of the chain.
		/// </summary>
		private readonly ISet<string> stoptable;

		/// <summary>
		/// The comment marker passed to <see cref="WordlistLoader"/> when reading a
		/// stopword file; lines prefixed with it are meant to be ignored.
		/// </summary>
		public static readonly String STOPWORDS_COMMENT = "#";

		/// <summary>
		/// Returns the default stop-word set.
		/// </summary>
		/// <returns>
		/// The shared default set, wrapped with <c>CharArraySet.UnmodifiableSet</c>
		/// when it was loaded.
		/// </returns>
		public static ISet<string> getDefaultStopSet()
		{
			return DefaultSetHolder.DEFAULT_STOP_SET;
		}

		/// <summary>
		/// Holds the default stop-word set. The set is assigned in the static
		/// constructor, so the embedded resource is only read once this holder type
		/// is first accessed.
		/// </summary>
		private static class DefaultSetHolder
		{
			internal static readonly ISet<string> DEFAULT_STOP_SET;

			static DefaultSetHolder()
			{
				try
				{
					DEFAULT_STOP_SET = LoadDefaultStopWordSet();
				}
				catch (IOException ex)
				{
					// The default set ships as an embedded resource of the assembly, so a
					// failure here is unexpected. Keep the original exception type and
					// message, but attach the cause so the failure can be diagnosed.
					throw new Exception("Unable to load default stopword set", ex);
				}
			}

			/// <summary>
			/// Reads the default stopword list from the embedded resource (UTF-8).
			/// </summary>
			/// <returns>An unmodifiable set built from the resource contents.</returns>
			/// <exception cref="FileNotFoundException">
			/// The embedded resource could not be found in the assembly.
			/// </exception>
			static ISet<String> LoadDefaultStopWordSet()
			{
				string resourceName = "Lucene.Net.Analyzers.Fa." + DEFAULT_STOPWORD_FILE;

				using (Stream stream = System.Reflection.Assembly.GetAssembly(typeof(PersianAnalyzer)).GetManifestResourceStream(resourceName))
				{
					// GetManifestResourceStream yields null for a missing resource. Report
					// that as an IOException subtype so the static constructor's handler
					// sees it, instead of failing later with a NullReferenceException.
					if (stream == null)
					{
						throw new FileNotFoundException("Embedded stopword resource not found: " + resourceName, resourceName);
					}

					using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.UTF8))
					{
						// make sure it is unmodifiable as we expose it in the outer class
						return CharArraySet.UnmodifiableSet(new CharArraySet(WordlistLoader.GetWordSet(reader, STOPWORDS_COMMENT), true));
					}
				}
			}
		}

		/// <summary>
		/// Compatibility version used to pick the position-increment default of the
		/// <see cref="StopFilter"/>.
		/// </summary>
		private readonly Version matchVersion;

		/// <summary>
		/// Builds an analyzer with the default stop words
		/// (see <see cref="DEFAULT_STOPWORD_FILE"/>).
		/// </summary>
		/// <param name="matchVersion">Lucene compatibility version.</param>
		public PersianAnalyzer(Version matchVersion)
			: this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
		{
		}

		/// <summary>
		/// Builds an analyzer with the given stop words. The set is copied and the
		/// copy is wrapped as unmodifiable before being stored.
		/// </summary>
		/// <param name="matchVersion">Lucene compatibility version.</param>
		/// <param name="stopwords">A stopword set.</param>
		public PersianAnalyzer(Version matchVersion, ISet<string> stopwords)
		{
			stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
			this.matchVersion = matchVersion;
		}

		/// <summary>
		/// Builds an analyzer with the given stop words.
		/// Deprecated: prefer the constructor taking a stopword set.
		/// </summary>
		/// <param name="matchVersion">Lucene compatibility version.</param>
		/// <param name="stopwords">The stopwords.</param>
		public PersianAnalyzer(Version matchVersion, params string[] stopwords)
			: this(matchVersion, StopFilter.MakeStopSet(stopwords))
		{
		}

		/// <summary>
		/// Builds an analyzer using the keys of the given dictionary as stop words.
		/// Deprecated: prefer the constructor taking a stopword set.
		/// </summary>
		/// <param name="matchVersion">Lucene compatibility version.</param>
		/// <param name="stopwords">Dictionary whose keys are the stopwords.</param>
		public PersianAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
			: this(matchVersion, stopwords.Keys.ToArray())
		{
		}

		/// <summary>
		/// Builds an analyzer with the stop words read from the given file. Lines can
		/// be commented out using <see cref="STOPWORDS_COMMENT"/>.
		/// Deprecated: prefer the constructor taking a stopword set.
		/// </summary>
		/// <param name="matchVersion">Lucene compatibility version.</param>
		/// <param name="stopwords">File containing the stopwords.</param>
		public PersianAnalyzer(Version matchVersion, FileInfo stopwords)
			: this(matchVersion, WordlistLoader.GetWordSet(stopwords, STOPWORDS_COMMENT))
		{
		}

		/// <summary>
		/// Wraps <paramref name="source"/> with the analyzer's filters. Shared by
		/// <c>TokenStream</c> and <c>ReusableTokenStream</c> so both always build the
		/// same chain.
		/// </summary>
		/// <param name="source">The stream to filter.</param>
		/// <returns>
		/// <paramref name="source"/> filtered with <see cref="LowerCaseFilter"/>,
		/// <see cref="ArabicNormalizationFilter"/>,
		/// <see cref="PersianNormalizationFilter"/> and the Persian stop words.
		/// </returns>
		private TokenStream BuildFilterChain(TokenStream source)
		{
			TokenStream result = new LowerCaseFilter(source);
			result = new ArabicNormalizationFilter(result);
			/* additional persian-specific normalization */
			result = new PersianNormalizationFilter(result);
			/*
			 * the order here is important: the stopword list is normalized with the
			 * above!
			 */
			result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
				result, stoptable);
			return result;
		}

		/// <summary>
		/// Creates a token stream which tokenizes all the text in the provided reader.
		/// </summary>
		/// <param name="fieldName">Field name; not used by this analyzer.</param>
		/// <param name="reader">Source of the text to tokenize.</param>
		/// <returns>
		/// A token stream built from an <see cref="ArabicLetterTokenizer"/> filtered
		/// with <see cref="LowerCaseFilter"/>, <see cref="ArabicNormalizationFilter"/>,
		/// <see cref="PersianNormalizationFilter"/> and Persian stop words.
		/// </returns>
		public override TokenStream TokenStream(String fieldName, TextReader reader)
		{
			return BuildFilterChain(new ArabicLetterTokenizer(reader));
		}

		/// <summary>
		/// Pairs the tokenizer with the end of its filter chain so that both can be
		/// stored in <c>PreviousTokenStream</c> and reused.
		/// </summary>
		private sealed class SavedStreams
		{
			internal Tokenizer source;
			internal TokenStream result;
		}

		/// <summary>
		/// Returns a (possibly reused) token stream which tokenizes all the text in
		/// the provided reader. On first use the chain is built and stored in
		/// <c>PreviousTokenStream</c>; afterwards the stored tokenizer is reset onto
		/// the new reader.
		/// </summary>
		/// <param name="fieldName">Field name; not used by this analyzer.</param>
		/// <param name="reader">Source of the text to tokenize.</param>
		/// <returns>
		/// A token stream built from an <see cref="ArabicLetterTokenizer"/> filtered
		/// with <see cref="LowerCaseFilter"/>, <see cref="ArabicNormalizationFilter"/>,
		/// <see cref="PersianNormalizationFilter"/> and Persian stop words.
		/// </returns>
		public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
		{
			SavedStreams streams = (SavedStreams)PreviousTokenStream;
			if (streams == null)
			{
				streams = new SavedStreams();
				streams.source = new ArabicLetterTokenizer(reader);
				streams.result = BuildFilterChain(streams.source);
				PreviousTokenStream = streams;
			}
			else
			{
				streams.source.Reset(reader);
			}
			return streams.result;
		}
	}
	}