src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs - lucenenet - Git at Google

 using Lucene.Net.Analysis.Core;
 using Lucene.Net.Analysis.Miscellaneous;
 using Lucene.Net.Analysis.Snowball;
 using Lucene.Net.Analysis.Standard;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Util;
 using System;
 using System.IO;
 using System.Text;

 namespace Lucene.Net.Analysis.Nl
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// <see cref="Analyzer"/> for Dutch language.
     /// <para>
     /// Supports an external list of stopwords (words that
     /// will not be indexed at all), an external list of exclusions (words that will
     /// not be stemmed, but indexed) and an external list of word-stem pairs that overrule
     /// the algorithm (dictionary stemming).
     /// A default set of stopwords is used unless an alternative list is specified, but the
     /// exclusion list is empty by default.
     /// </para>
     ///
     /// <para>You must specify the required <see cref="LuceneVersion"/>
     /// compatibility when creating <see cref="DutchAnalyzer"/>:
     /// <list type="bullet">
     ///   <item><description> As of 3.6, <see cref="DutchAnalyzer(LuceneVersion, CharArraySet)"/> and
     ///        <see cref="DutchAnalyzer(LuceneVersion, CharArraySet, CharArraySet)"/> also populate
     ///        the default entries for the stem override dictionary</description></item>
     ///   <item><description> As of 3.1, Snowball stemming is done with SnowballFilter,
     ///        LowerCaseFilter is used prior to StopFilter, and Snowball
     ///        stopwords are used by default.</description></item>
     ///   <item><description> As of 2.9, StopFilter preserves position
     ///        increments</description></item>
     /// </list>
     ///
     /// </para>
     /// <para><b>NOTE</b>: This class uses the same <see cref="LuceneVersion"/>
     /// dependent settings as <see cref="StandardAnalyzer"/>.</para>
     /// </summary>
     public sealed class DutchAnalyzer : Analyzer
     {
         /// <summary>
         /// File containing default Dutch stopwords. </summary>
         public const string DEFAULT_STOPWORD_FILE = "dutch_stop.txt";

         /// <summary>
         /// Returns an unmodifiable instance of the default stop-words set. </summary>
         /// <returns> an unmodifiable instance of the default stop-words set. </returns>
         public static CharArraySet DefaultStopSet => DefaultSetHolder.DEFAULT_STOP_SET;

         /// <summary>
         /// Holds the default stop word set and the default stem override dictionary.
         /// Both are created by static field initializers rather than a static constructor.
         /// </summary>
         private static class DefaultSetHolder
         {
             internal static readonly CharArraySet DEFAULT_STOP_SET = LoadDefaultStopSet();
             internal static readonly CharArrayMap<string> DEFAULT_STEM_DICT = LoadDefaultStemDict();

             /// <summary>
             /// Reads <see cref="DEFAULT_STOPWORD_FILE"/> as a UTF-8 Snowball word list.
             /// </summary>
             /// <exception cref="Exception">If the stopword file cannot be read; the
             /// <see cref="IOException"/> is attached as the inner exception.</exception>
             private static CharArraySet LoadDefaultStopSet() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
             {
                 try
                 {
                     return WordlistLoader.GetSnowballWordSet(
                         IOUtils.GetDecodingReader(typeof(SnowballFilter), DEFAULT_STOPWORD_FILE, Encoding.UTF8),
 #pragma warning disable 612, 618
                         LuceneVersion.LUCENE_CURRENT);
 #pragma warning restore 612, 618
                 }
                 catch (IOException ex)
                 {
                     // the default set should always be present as it ships with the distribution
                     throw new Exception("Unable to load default stopword set", ex);
                 }
             }

             /// <summary>
             /// Builds the default word-to-stem override entries.
             /// </summary>
             private static CharArrayMap<string> LoadDefaultStemDict() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
             {
 #pragma warning disable 612, 618
                 var dict = new CharArrayMap<string>(LuceneVersion.LUCENE_CURRENT, 4, false);
 #pragma warning restore 612, 618
                 dict.Put("fiets", "fiets"); //otherwise fiet
                 dict.Put("bromfiets", "bromfiets"); //otherwise bromfiet
                 dict.Put("ei", "eier");
                 dict.Put("kind", "kinder");
                 return dict;
             }
         }


         /// <summary>
         /// Contains the stopwords used with the <see cref="StopFilter"/>.
         /// </summary>
         private readonly CharArraySet stoptable;

         /// <summary>
         /// Contains words that should be indexed but not stemmed.
         /// </summary>
         private readonly CharArraySet excltable;

         /// <summary>
         /// Compiled override map used by the 3.1+ filter chain; <c>null</c> when the supplied
         /// override dictionary is empty or the match version is older than 3.1.
         /// </summary>
         private readonly StemmerOverrideFilter.StemmerOverrideMap stemdict;

         // Copy of the override dictionary handed to DutchStemFilter (backwards compatibility);
         // null whenever stemdict has been built instead.
         private readonly CharArrayMap<string> origStemdict;
         private readonly LuceneVersion matchVersion;

         /// <summary>
         /// Builds an analyzer with the default stop words (<see cref="DefaultStopSet"/>),
         /// an empty stem exclusion set and the default entries for the stem override dictionary.
         /// </summary>
         /// <param name="matchVersion">Lucene compatibility version.</param>
         public DutchAnalyzer(LuceneVersion matchVersion)
               : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT)
         {
             // historically, only this ctor populated the stem dict!!!!!
         }

         /// <summary>
         /// Builds an analyzer with the given stop words and an empty stem exclusion set.
         /// The default stem override entries are used only when
         /// <paramref name="matchVersion"/> is 3.6 or later.
         /// </summary>
         /// <param name="matchVersion">Lucene compatibility version.</param>
         /// <param name="stopwords">Stop words to remove from the token stream.</param>
         public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
               : this(matchVersion, stopwords, CharArraySet.EMPTY_SET,
 #pragma warning disable 612, 618
                     matchVersion.OnOrAfter(LuceneVersion.LUCENE_36) ?
 #pragma warning restore 612, 618
                     DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap<string>.EmptyMap())
         {
             // historically, this ctor never populated the stem dict!!!!!
             // so we populate it only for >= 3.6
         }

         /// <summary>
         /// Builds an analyzer with the given stop words and stem exclusion set.
         /// The default stem override entries are used only when
         /// <paramref name="matchVersion"/> is 3.6 or later.
         /// </summary>
         /// <param name="matchVersion">Lucene compatibility version.</param>
         /// <param name="stopwords">Stop words to remove from the token stream.</param>
         /// <param name="stemExclusionTable">Words that are indexed but not stemmed.</param>
         public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable)
               : this(matchVersion, stopwords, stemExclusionTable,
 #pragma warning disable 612, 618
                     matchVersion.OnOrAfter(LuceneVersion.LUCENE_36) ?
 #pragma warning restore 612, 618
                     DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap<string>.EmptyMap())
         {
             // historically, this ctor never populated the stem dict!!!!!
             // so we populate it only for >= 3.6
         }

         /// <summary>
         /// Builds an analyzer with the given stop words, stem exclusion set and
         /// stem override dictionary. The stop word and exclusion sets are copied.
         /// </summary>
         /// <param name="matchVersion">Lucene compatibility version.</param>
         /// <param name="stopwords">Stop words to remove from the token stream.</param>
         /// <param name="stemExclusionTable">Words that are indexed but not stemmed.</param>
         /// <param name="stemOverrideDict">Word-to-stem pairs that overrule the stemmer.</param>
         /// <exception cref="ArgumentNullException"><paramref name="stemOverrideDict"/> is <c>null</c>.</exception>
         /// <exception cref="Exception">If the stem override map cannot be built.</exception>
         public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<string> stemOverrideDict)
         {
             // Fail with a descriptive exception instead of a NullReferenceException at .Count below.
             if (stemOverrideDict == null)
             {
                 throw new ArgumentNullException(nameof(stemOverrideDict));
             }

             this.matchVersion = matchVersion;
             this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopwords));
             this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
 #pragma warning disable 612, 618
             if (stemOverrideDict.Count == 0 || !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
 #pragma warning restore 612, 618
             {
                 this.stemdict = null;
                 this.origStemdict = CharArrayMap.UnmodifiableMap(CharArrayMap.Copy(matchVersion, stemOverrideDict));
             }
             else
             {
                 this.origStemdict = null;
                 // we don't need to ignore case here since we lowercase in this analyzer anyway
                 StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
                 using (CharArrayMap<string>.EntryIterator iter = (CharArrayMap<string>.EntryIterator)stemOverrideDict.EntrySet().GetEnumerator())
                 {
                     CharsRef spare = new CharsRef();
                     while (iter.HasNext)
                     {
                         char[] nextKey = iter.NextKey();
                         spare.CopyChars(nextKey, 0, nextKey.Length);
                         // The reused buffer may be longer than the current key, so only the
                         // [Offset, Offset + Length) window is converted to a string.
                         builder.Add(new string(spare.Chars, spare.Offset, spare.Length), iter.CurrentValue);
                     }
                 }
                 try
                 {
                     this.stemdict = builder.Build();
                 }
                 catch (IOException ex)
                 {
                     throw new Exception("can not build stem dict", ex);
                 }
             }
         }

         /// <summary>
         /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the
         /// text in the provided <see cref="TextReader"/>.
         /// </summary>
         /// <returns> A <see cref="TokenStream"/> built from a <see cref="StandardTokenizer"/>
         ///   filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>,
         ///   <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
         ///   <see cref="StemmerOverrideFilter"/>, and <see cref="SnowballFilter"/> </returns>
         protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
         {
 #pragma warning disable 612, 618
             if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
 #pragma warning restore 612, 618
             {
                 Tokenizer source = new StandardTokenizer(matchVersion, aReader);
                 TokenStream result = new StandardFilter(matchVersion, source);
                 result = new LowerCaseFilter(matchVersion, result);
                 result = new StopFilter(matchVersion, result, stoptable);
                 if (excltable.Count > 0)
                 {
                     result = new SetKeywordMarkerFilter(result, excltable);
                 }
                 if (stemdict != null)
                 {
                     result = new StemmerOverrideFilter(result, stemdict);
                 }
                 result = new SnowballFilter(result, new Tartarus.Snowball.Ext.DutchStemmer());
                 return new TokenStreamComponents(source, result);
             }
             else
             {
                 // Pre-3.1 chain: no lowercasing, and stemming is done by DutchStemFilter.
                 Tokenizer source = new StandardTokenizer(matchVersion, aReader);
                 TokenStream result = new StandardFilter(matchVersion, source);
                 result = new StopFilter(matchVersion, result, stoptable);
                 if (excltable.Count > 0)
                 {
                     result = new SetKeywordMarkerFilter(result, excltable);
                 }
 #pragma warning disable 612, 618
                 result = new DutchStemFilter(result, origStemdict);
 #pragma warning restore 612, 618
                 return new TokenStreamComponents(source, result);
             }
         }
     }
 }
	using Lucene.Net.Analysis.Core;
	using Lucene.Net.Analysis.Miscellaneous;
	using Lucene.Net.Analysis.Snowball;
	using Lucene.Net.Analysis.Standard;
	using Lucene.Net.Analysis.Util;
	using Lucene.Net.Util;
	using System;
	using System.IO;
	using System.Text;

	namespace Lucene.Net.Analysis.Nl
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// <see cref="Analyzer"/> for Dutch language.
	/// <para>
	/// Supports an external list of stopwords (words that
	/// will not be indexed at all), an external list of exclusions (words that will
	/// not be stemmed, but indexed) and an external list of word-stem pairs that overrule
	/// the algorithm (dictionary stemming).
	/// A default set of stopwords is used unless an alternative list is specified, but the
	/// exclusion list is empty by default.
	/// </para>
	///
	/// <para>You must specify the required <see cref="LuceneVersion"/>
	/// compatibility when creating <see cref="DutchAnalyzer"/>:
	/// <list type="bullet">
	/// <item><description> As of 3.6, <see cref="DutchAnalyzer(LuceneVersion, CharArraySet)"/> and
	/// <see cref="DutchAnalyzer(LuceneVersion, CharArraySet, CharArraySet)"/> also populate
	/// the default entries for the stem override dictionary</description></item>
	/// <item><description> As of 3.1, Snowball stemming is done with SnowballFilter,
	/// LowerCaseFilter is used prior to StopFilter, and Snowball
	/// stopwords are used by default.</description></item>
	/// <item><description> As of 2.9, StopFilter preserves position
	/// increments</description></item>
	/// </list>
	///
	/// </para>
	/// <para><b>NOTE</b>: This class uses the same <see cref="LuceneVersion"/>
	/// dependent settings as <see cref="StandardAnalyzer"/>.</para>
	/// </summary>
	public sealed class DutchAnalyzer : Analyzer
	{
		/// <summary>
		/// File containing default Dutch stopwords. </summary>
		public const string DEFAULT_STOPWORD_FILE = "dutch_stop.txt";

		/// <summary>
		/// Returns an unmodifiable instance of the default stop-words set. </summary>
		/// <returns> an unmodifiable instance of the default stop-words set. </returns>
		public static CharArraySet DefaultStopSet => DefaultSetHolder.DEFAULT_STOP_SET;

		/// <summary>
		/// Holds the default stop word set and the default stem override dictionary.
		/// Both are created by static field initializers rather than a static constructor.
		/// </summary>
		private static class DefaultSetHolder
		{
			internal static readonly CharArraySet DEFAULT_STOP_SET = LoadDefaultStopSet();
			internal static readonly CharArrayMap<string> DEFAULT_STEM_DICT = LoadDefaultStemDict();

			/// <summary>
			/// Reads <see cref="DEFAULT_STOPWORD_FILE"/> as a UTF-8 Snowball word list.
			/// </summary>
			/// <exception cref="Exception">If the stopword file cannot be read; the
			/// <see cref="IOException"/> is attached as the inner exception.</exception>
			private static CharArraySet LoadDefaultStopSet() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
			{
				try
				{
					return WordlistLoader.GetSnowballWordSet(
						IOUtils.GetDecodingReader(typeof(SnowballFilter), DEFAULT_STOPWORD_FILE, Encoding.UTF8),
	#pragma warning disable 612, 618
						LuceneVersion.LUCENE_CURRENT);
	#pragma warning restore 612, 618
				}
				catch (IOException ex)
				{
					// the default set should always be present as it ships with the distribution
					throw new Exception("Unable to load default stopword set", ex);
				}
			}

			/// <summary>
			/// Builds the default word-to-stem override entries.
			/// </summary>
			private static CharArrayMap<string> LoadDefaultStemDict() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
			{
	#pragma warning disable 612, 618
				var dict = new CharArrayMap<string>(LuceneVersion.LUCENE_CURRENT, 4, false);
	#pragma warning restore 612, 618
				dict.Put("fiets", "fiets"); //otherwise fiet
				dict.Put("bromfiets", "bromfiets"); //otherwise bromfiet
				dict.Put("ei", "eier");
				dict.Put("kind", "kinder");
				return dict;
			}
		}


		/// <summary>
		/// Contains the stopwords used with the <see cref="StopFilter"/>.
		/// </summary>
		private readonly CharArraySet stoptable;

		/// <summary>
		/// Contains words that should be indexed but not stemmed.
		/// </summary>
		private readonly CharArraySet excltable;

		/// <summary>
		/// Compiled override map used by the 3.1+ filter chain; <c>null</c> when the supplied
		/// override dictionary is empty or the match version is older than 3.1.
		/// </summary>
		private readonly StemmerOverrideFilter.StemmerOverrideMap stemdict;

		// Copy of the override dictionary handed to DutchStemFilter (backwards compatibility);
		// null whenever stemdict has been built instead.
		private readonly CharArrayMap<string> origStemdict;
		private readonly LuceneVersion matchVersion;

		/// <summary>
		/// Builds an analyzer with the default stop words (<see cref="DefaultStopSet"/>),
		/// an empty stem exclusion set and the default entries for the stem override dictionary.
		/// </summary>
		/// <param name="matchVersion">Lucene compatibility version.</param>
		public DutchAnalyzer(LuceneVersion matchVersion)
			: this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT)
		{
			// historically, only this ctor populated the stem dict!!!!!
		}

		/// <summary>
		/// Builds an analyzer with the given stop words and an empty stem exclusion set.
		/// The default stem override entries are used only when
		/// <paramref name="matchVersion"/> is 3.6 or later.
		/// </summary>
		/// <param name="matchVersion">Lucene compatibility version.</param>
		/// <param name="stopwords">Stop words to remove from the token stream.</param>
		public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords)
			: this(matchVersion, stopwords, CharArraySet.EMPTY_SET,
	#pragma warning disable 612, 618
				matchVersion.OnOrAfter(LuceneVersion.LUCENE_36) ?
	#pragma warning restore 612, 618
				DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap<string>.EmptyMap())
		{
			// historically, this ctor never populated the stem dict!!!!!
			// so we populate it only for >= 3.6
		}

		/// <summary>
		/// Builds an analyzer with the given stop words and stem exclusion set.
		/// The default stem override entries are used only when
		/// <paramref name="matchVersion"/> is 3.6 or later.
		/// </summary>
		/// <param name="matchVersion">Lucene compatibility version.</param>
		/// <param name="stopwords">Stop words to remove from the token stream.</param>
		/// <param name="stemExclusionTable">Words that are indexed but not stemmed.</param>
		public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable)
			: this(matchVersion, stopwords, stemExclusionTable,
	#pragma warning disable 612, 618
				matchVersion.OnOrAfter(LuceneVersion.LUCENE_36) ?
	#pragma warning restore 612, 618
				DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap<string>.EmptyMap())
		{
			// historically, this ctor never populated the stem dict!!!!!
			// so we populate it only for >= 3.6
		}

		/// <summary>
		/// Builds an analyzer with the given stop words, stem exclusion set and
		/// stem override dictionary. The stop word and exclusion sets are copied.
		/// </summary>
		/// <param name="matchVersion">Lucene compatibility version.</param>
		/// <param name="stopwords">Stop words to remove from the token stream.</param>
		/// <param name="stemExclusionTable">Words that are indexed but not stemmed.</param>
		/// <param name="stemOverrideDict">Word-to-stem pairs that overrule the stemmer.</param>
		/// <exception cref="ArgumentNullException"><paramref name="stemOverrideDict"/> is <c>null</c>.</exception>
		/// <exception cref="Exception">If the stem override map cannot be built.</exception>
		public DutchAnalyzer(LuceneVersion matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<string> stemOverrideDict)
		{
			// Fail with a descriptive exception instead of a NullReferenceException at .Count below.
			if (stemOverrideDict == null)
			{
				throw new ArgumentNullException(nameof(stemOverrideDict));
			}

			this.matchVersion = matchVersion;
			this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stopwords));
			this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(matchVersion, stemExclusionTable));
	#pragma warning disable 612, 618
			if (stemOverrideDict.Count == 0 || !matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
	#pragma warning restore 612, 618
			{
				this.stemdict = null;
				this.origStemdict = CharArrayMap.UnmodifiableMap(CharArrayMap.Copy(matchVersion, stemOverrideDict));
			}
			else
			{
				this.origStemdict = null;
				// we don't need to ignore case here since we lowercase in this analyzer anyway
				StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
				using (CharArrayMap<string>.EntryIterator iter = (CharArrayMap<string>.EntryIterator)stemOverrideDict.EntrySet().GetEnumerator())
				{
					CharsRef spare = new CharsRef();
					while (iter.HasNext)
					{
						char[] nextKey = iter.NextKey();
						spare.CopyChars(nextKey, 0, nextKey.Length);
						// The reused buffer may be longer than the current key, so only the
						// [Offset, Offset + Length) window is converted to a string.
						builder.Add(new string(spare.Chars, spare.Offset, spare.Length), iter.CurrentValue);
					}
				}
				try
				{
					this.stemdict = builder.Build();
				}
				catch (IOException ex)
				{
					throw new Exception("can not build stem dict", ex);
				}
			}
		}

		/// <summary>
		/// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the
		/// text in the provided <see cref="TextReader"/>.
		/// </summary>
		/// <returns> A <see cref="TokenStream"/> built from a <see cref="StandardTokenizer"/>
		/// filtered with <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>,
		/// <see cref="StopFilter"/>, <see cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
		/// <see cref="StemmerOverrideFilter"/>, and <see cref="SnowballFilter"/> </returns>
		protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
		{
	#pragma warning disable 612, 618
			if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_31))
	#pragma warning restore 612, 618
			{
				Tokenizer source = new StandardTokenizer(matchVersion, aReader);
				TokenStream result = new StandardFilter(matchVersion, source);
				result = new LowerCaseFilter(matchVersion, result);
				result = new StopFilter(matchVersion, result, stoptable);
				if (excltable.Count > 0)
				{
					result = new SetKeywordMarkerFilter(result, excltable);
				}
				if (stemdict != null)
				{
					result = new StemmerOverrideFilter(result, stemdict);
				}
				result = new SnowballFilter(result, new Tartarus.Snowball.Ext.DutchStemmer());
				return new TokenStreamComponents(source, result);
			}
			else
			{
				// Pre-3.1 chain: no lowercasing, and stemming is done by DutchStemFilter.
				Tokenizer source = new StandardTokenizer(matchVersion, aReader);
				TokenStream result = new StandardFilter(matchVersion, source);
				result = new StopFilter(matchVersion, result, stoptable);
				if (excltable.Count > 0)
				{
					result = new SetKeywordMarkerFilter(result, excltable);
				}
	#pragma warning disable 612, 618
				result = new DutchStemFilter(result, origStemdict);
	#pragma warning restore 612, 618
				return new TokenStreamComponents(source, result);
			}
		}
	}
	}