// src/Lucene.Net.Analysis.Common/Analysis/Util/WordlistLoader.cs - lucenenet - Git at Google

 using J2N.Text;
 using Lucene.Net.Util;
 using System;
 using System.Collections.Generic;
 using System.IO;
 using System.Text;
 using System.Text.RegularExpressions;

 namespace Lucene.Net.Analysis.Util
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Loader for text files that represent a list of stopwords.
     /// <para/>
     /// <see cref="IOUtils"/> to obtain <see cref="TextReader"/> instances.
     /// @lucene.internal
     /// </summary>
     public class WordlistLoader
     {
         // Passed as the second constructor argument of every CharArraySet this class
         // creates itself (the initial capacity, judging by the name).
         private const int INITIAL_CAPACITY = 16;

         // LUCENENET specific: one or more consecutive whitespace characters. Used to break
         // a Snowball-format line into its individual words.
         private static readonly Regex WHITESPACE = new Regex("\\s+", RegexOptions.Compiled);

         /// <summary>
         /// Private on purpose: all members of this class are static, so no instance is ever needed.
         /// </summary>
         private WordlistLoader()
         {
         }

         // LUCENENET TODO: Add .NET overloads that accept a file name? Or at least a FileInfo object as was done in 3.0.3?

         /// <summary>
         /// Fills a <see cref="CharArraySet"/> from a <see cref="TextReader"/> that holds one word per line.
         /// <para/>
         /// Every line read (blank ones included) is stripped of leading and trailing whitespace and
         /// handed to the set. The words need to be in lowercase if you make use of an
         /// <see cref="Analyzer"/> which uses <see cref="Core.LowerCaseFilter"/> (like <see cref="Standard.StandardAnalyzer"/>).
         /// <para/>
         /// The reader is passed to <see cref="IOUtils"/> for disposal when reading ends, whether it
         /// completed normally or threw.
         /// </summary>
         /// <param name="reader"> <see cref="TextReader"/> supplying the word list </param>
         /// <param name="result"> the <see cref="CharArraySet"/> that receives the words </param>
         /// <returns> the same <see cref="CharArraySet"/> instance that was passed in as <paramref name="result"/> </returns>
         public static CharArraySet GetWordSet(TextReader reader, CharArraySet result)
         {
             try
             {
                 for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                 {
                     string trimmed = line.Trim();
                     result.Add(trimmed);
                 }
                 return result;
             }
             finally
             {
                 // Runs on both the normal and the exceptional path.
                 IOUtils.Dispose(reader);
             }
         }

         /// <summary>
         /// Convenience overload: creates a new <see cref="CharArraySet"/> for the given
         /// <see cref="LuceneVersion"/> and fills it from <paramref name="reader"/>, one trimmed line per entry.
         /// <para/>
         /// The words need to be in lowercase if you make use of an
         /// <see cref="Analyzer"/> which uses <see cref="Core.LowerCaseFilter"/> (like <see cref="Standard.StandardAnalyzer"/>).
         /// </summary>
         /// <param name="reader"> <see cref="TextReader"/> supplying the word list </param>
         /// <param name="matchVersion"> the <see cref="LuceneVersion"/> used to construct the set </param>
         /// <returns> A newly created <see cref="CharArraySet"/> holding the reader's words </returns>
         public static CharArraySet GetWordSet(TextReader reader, LuceneVersion matchVersion)
         {
             // NOTE(review): the trailing 'false' is presumably the ignoreCase flag -- confirm against the CharArraySet constructor.
             CharArraySet freshSet = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
             return GetWordSet(reader, freshSet);
         }

         /// <summary>
         /// Convenience overload: creates a new <see cref="CharArraySet"/> for the given
         /// <see cref="LuceneVersion"/> and fills it with every non-comment line of <paramref name="reader"/>
         /// (leading and trailing whitespace removed).
         /// <para/>
         /// The words need to be in lowercase if you make use of an
         /// <see cref="Analyzer"/> which uses <see cref="Core.LowerCaseFilter"/> (like <see cref="Standard.StandardAnalyzer"/>).
         /// </summary>
         /// <param name="reader"> <see cref="TextReader"/> supplying the word list </param>
         /// <param name="comment"> Prefix that marks a line as a comment. </param>
         /// <param name="matchVersion"> the <see cref="LuceneVersion"/> used to construct the set </param>
         /// <returns> A newly created <see cref="CharArraySet"/> holding the reader's words </returns>
         public static CharArraySet GetWordSet(TextReader reader, string comment, LuceneVersion matchVersion)
         {
             // NOTE(review): the trailing 'false' is presumably the ignoreCase flag -- confirm against the CharArraySet constructor.
             CharArraySet freshSet = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
             return GetWordSet(reader, comment, freshSet);
         }

         /// <summary>
         /// Fills a <see cref="CharArraySet"/> from a one-word-per-line <see cref="TextReader"/>, ignoring comment lines.
         /// <para/>
         /// A line is treated as a comment when it begins (ordinal comparison, before any trimming) with
         /// <paramref name="comment"/>; a comment marker preceded by whitespace is therefore not recognised.
         /// All other lines are trimmed and added. The words need to be in lowercase if you make use of an
         /// <see cref="Analyzer"/> which uses <see cref="Core.LowerCaseFilter"/> (like <see cref="Standard.StandardAnalyzer"/>).
         /// <para/>
         /// The reader is passed to <see cref="IOUtils"/> for disposal when reading ends, whether it
         /// completed normally or threw.
         /// </summary>
         /// <param name="reader"> <see cref="TextReader"/> supplying the word list </param>
         /// <param name="comment"> Prefix that marks a line as a comment. </param>
         /// <param name="result"> the <see cref="CharArraySet"/> that receives the words </param>
         /// <returns> the same <see cref="CharArraySet"/> instance that was passed in as <paramref name="result"/> </returns>
         public static CharArraySet GetWordSet(TextReader reader, string comment, CharArraySet result)
         {
             try
             {
                 for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                 {
                     if (line.StartsWith(comment, StringComparison.Ordinal))
                     {
                         continue; // comment line
                     }

                     string trimmed = line.Trim();
                     result.Add(trimmed);
                 }
                 return result;
             }
             finally
             {
                 // Runs on both the normal and the exceptional path.
                 IOUtils.Dispose(reader);
             }
         }


         /// <summary>
         /// Loads stopwords written in the Snowball list format into an existing <see cref="CharArraySet"/>.
         /// <para>
         /// Rules of the Snowball format:
         /// <list type="bullet">
         ///     <item><description>A line may hold several words, separated by whitespace.</description></item>
         ///     <item><description>The vertical line (&#124;) starts a comment.</description></item>
         ///     <item><description>A comment may follow words on the same line; everything from the vertical line onwards is dropped.</description></item>
         /// </list>
         /// </para>
         /// The reader is passed to <see cref="IOUtils"/> for disposal when reading ends, whether it
         /// completed normally or threw.
         /// </summary>
         /// <param name="reader"> <see cref="TextReader"/> supplying a Snowball stopword list </param>
         /// <param name="result"> the <see cref="CharArraySet"/> that receives the words </param>
         /// <returns> the same <see cref="CharArraySet"/> instance that was passed in as <paramref name="result"/> </returns>
         public static CharArraySet GetSnowballWordSet(TextReader reader, CharArraySet result)
         {
             try
             {
                 string raw;
                 while ((raw = reader.ReadLine()) != null)
                 {
                     // Keep only the part in front of the first '|', if there is one.
                     int bar = raw.IndexOf('|');
                     string content = bar >= 0 ? raw.Substring(0, bar) : raw;

                     foreach (string token in WHITESPACE.Split(content).TrimEnd())
                     {
                         // Splitting can yield empty pieces (e.g. for leading whitespace); skip them.
                         if (token.Length == 0)
                         {
                             continue;
                         }
                         result.Add(token);
                     }
                 }
                 return result;
             }
             finally
             {
                 // Runs on both the normal and the exceptional path.
                 IOUtils.Dispose(reader);
             }
         }

         /// <summary>
         /// Convenience overload: creates a new <see cref="CharArraySet"/> for the given
         /// <see cref="LuceneVersion"/> and fills it from a stopword list in Snowball format.
         /// <para>
         /// Rules of the Snowball format:
         /// <list type="bullet">
         ///     <item><description>A line may hold several words, separated by whitespace.</description></item>
         ///     <item><description>The vertical line (&#124;) starts a comment.</description></item>
         ///     <item><description>A comment may follow words on the same line.</description></item>
         /// </list>
         /// </para>
         /// </summary>
         /// <param name="reader"> <see cref="TextReader"/> supplying a Snowball stopword list </param>
         /// <param name="matchVersion"> the <see cref="LuceneVersion"/> used to construct the set </param>
         /// <returns> A newly created <see cref="CharArraySet"/> holding the reader's words </returns>
         public static CharArraySet GetSnowballWordSet(TextReader reader, LuceneVersion matchVersion)
         {
             // NOTE(review): the trailing 'false' is presumably the ignoreCase flag -- confirm against the CharArraySet constructor.
             CharArraySet freshSet = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
             return GetSnowballWordSet(reader, freshSet);
         }


         /// <summary>
         /// Loads a stem dictionary into <paramref name="result"/>. Each line must have the form
         /// <code>word<b>\t</b>stem</code>
         /// i.e. a word and its stem separated by a tab. The line is split at the first tab only, so
         /// any further tabs stay part of the stem. A line containing no tab makes the stem lookup
         /// fail with an <see cref="IndexOutOfRangeException"/>.
         /// <para/>
         /// The reader is passed to <see cref="IOUtils"/> for disposal when reading ends, whether it
         /// completed normally or threw.
         /// </summary>
         /// <param name="reader"> <see cref="TextReader"/> supplying the tab separated entries </param>
         /// <param name="result"> the map that receives one word-to-stem entry per line </param>
         /// <returns> stem dictionary that overrules the stemming algorithm (the instance passed in as <paramref name="result"/>) </returns>
         /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
         public static CharArrayMap<string> GetStemDict(TextReader reader, CharArrayMap<string> result)
         {
             char[] tab = { '\t' };
             try
             {
                 for (string entry = reader.ReadLine(); entry != null; entry = reader.ReadLine())
                 {
                     // At most two pieces: the word, then everything after the first tab.
                     string[] pieces = entry.Split(tab, 2);
                     string word = pieces[0];
                     string stem = pieces[1];
                     result.Put(word, stem);
                 }
                 return result;
             }
             finally
             {
                 // Runs on both the normal and the exceptional path.
                 IOUtils.Dispose(reader);
             }
         }

         /// <summary>
         /// Reads <paramref name="stream"/> as text in the given <paramref name="encoding"/> and returns
         /// its data lines.
         /// <para>
         /// A comment line is any line that starts with the character "#" (checked before trimming, so a
         /// "#" preceded by whitespace does not count). Comment lines and lines that are empty after
         /// trimming are left out. Until the first data line has been collected, a leading U+FEFF
         /// (byte order mark) character is removed from a line before it is examined.
         /// </para>
         /// The stream is wrapped in a <see cref="StreamReader"/> that is disposed before this method returns.
         /// </summary>
         /// <param name="stream"> the <see cref="Stream"/> to read from </param>
         /// <param name="encoding"> the character <see cref="Encoding"/> used to decode the stream </param>
         /// <returns> a list of non-blank non-comment lines with whitespace trimmed </returns>
         /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
         public static IList<string> GetLines(Stream stream, Encoding encoding)
         {
             var collected = new List<string>();

             using (var input = new StreamReader(stream, encoding))
             {
                 for (string raw = input.ReadLine(); raw != null; raw = input.ReadLine())
                 {
                     string text = raw;

                     // Drop a byte order mark that survived decoding; only tried while nothing has been collected yet.
                     bool nothingCollectedYet = collected.Count == 0;
                     if (nothingCollectedYet && text.Length > 0 && text[0] == '\uFEFF')
                     {
                         text = text.Substring(1);
                     }

                     if (!text.StartsWith("#", StringComparison.Ordinal))
                     {
                         string trimmed = text.Trim();
                         if (trimmed.Length != 0)
                         {
                             collected.Add(trimmed);
                         }
                     }
                 }
             }

             return collected;
         }
     }
 }
	using J2N.Text;
	using Lucene.Net.Util;
	using System;
	using System.Collections.Generic;
	using System.IO;
	using System.Text;
	using System.Text.RegularExpressions;

	namespace Lucene.Net.Analysis.Util
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Loader for text files that represent a list of stopwords.
	/// <para/>
	/// <see cref="IOUtils"/> to obtain <see cref="TextReader"/> instances.
	/// @lucene.internal
	/// </summary>
	public class WordlistLoader
	{
	// Passed as the second constructor argument of every CharArraySet this class
	// creates itself (the initial capacity, judging by the name).
	private const int INITIAL_CAPACITY = 16;

	// LUCENENET specific: one or more consecutive whitespace characters. Used to break
	// a Snowball-format line into its individual words.
	private static readonly Regex WHITESPACE = new Regex("\\s+", RegexOptions.Compiled);

	/// <summary>
	/// Private on purpose: all members of this class are static, so no instance is ever needed.
	/// </summary>
	private WordlistLoader()
	{
	}

	// LUCENENET TODO: Add .NET overloads that accept a file name? Or at least a FileInfo object as was done in 3.0.3?

	/// <summary>
	/// Fills a <see cref="CharArraySet"/> from a <see cref="TextReader"/> that holds one word per line.
	/// <para/>
	/// Every line read (blank ones included) is stripped of leading and trailing whitespace and
	/// handed to the set. The words need to be in lowercase if you make use of an
	/// <see cref="Analyzer"/> which uses <see cref="Core.LowerCaseFilter"/> (like <see cref="Standard.StandardAnalyzer"/>).
	/// <para/>
	/// The reader is passed to <see cref="IOUtils"/> for disposal when reading ends, whether it
	/// completed normally or threw.
	/// </summary>
	/// <param name="reader"> <see cref="TextReader"/> supplying the word list </param>
	/// <param name="result"> the <see cref="CharArraySet"/> that receives the words </param>
	/// <returns> the same <see cref="CharArraySet"/> instance that was passed in as <paramref name="result"/> </returns>
	public static CharArraySet GetWordSet(TextReader reader, CharArraySet result)
	{
		try
		{
			for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
			{
				string trimmed = line.Trim();
				result.Add(trimmed);
			}
			return result;
		}
		finally
		{
			// Runs on both the normal and the exceptional path.
			IOUtils.Dispose(reader);
		}
	}

	/// <summary>
	/// Convenience overload: creates a new <see cref="CharArraySet"/> for the given
	/// <see cref="LuceneVersion"/> and fills it from <paramref name="reader"/>, one trimmed line per entry.
	/// <para/>
	/// The words need to be in lowercase if you make use of an
	/// <see cref="Analyzer"/> which uses <see cref="Core.LowerCaseFilter"/> (like <see cref="Standard.StandardAnalyzer"/>).
	/// </summary>
	/// <param name="reader"> <see cref="TextReader"/> supplying the word list </param>
	/// <param name="matchVersion"> the <see cref="LuceneVersion"/> used to construct the set </param>
	/// <returns> A newly created <see cref="CharArraySet"/> holding the reader's words </returns>
	public static CharArraySet GetWordSet(TextReader reader, LuceneVersion matchVersion)
	{
		// NOTE(review): the trailing 'false' is presumably the ignoreCase flag -- confirm against the CharArraySet constructor.
		CharArraySet freshSet = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
		return GetWordSet(reader, freshSet);
	}

	/// <summary>
	/// Convenience overload: creates a new <see cref="CharArraySet"/> for the given
	/// <see cref="LuceneVersion"/> and fills it with every non-comment line of <paramref name="reader"/>
	/// (leading and trailing whitespace removed).
	/// <para/>
	/// The words need to be in lowercase if you make use of an
	/// <see cref="Analyzer"/> which uses <see cref="Core.LowerCaseFilter"/> (like <see cref="Standard.StandardAnalyzer"/>).
	/// </summary>
	/// <param name="reader"> <see cref="TextReader"/> supplying the word list </param>
	/// <param name="comment"> Prefix that marks a line as a comment. </param>
	/// <param name="matchVersion"> the <see cref="LuceneVersion"/> used to construct the set </param>
	/// <returns> A newly created <see cref="CharArraySet"/> holding the reader's words </returns>
	public static CharArraySet GetWordSet(TextReader reader, string comment, LuceneVersion matchVersion)
	{
		// NOTE(review): the trailing 'false' is presumably the ignoreCase flag -- confirm against the CharArraySet constructor.
		CharArraySet freshSet = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
		return GetWordSet(reader, comment, freshSet);
	}

	/// <summary>
	/// Fills a <see cref="CharArraySet"/> from a one-word-per-line <see cref="TextReader"/>, ignoring comment lines.
	/// <para/>
	/// A line is treated as a comment when it begins (ordinal comparison, before any trimming) with
	/// <paramref name="comment"/>; a comment marker preceded by whitespace is therefore not recognised.
	/// All other lines are trimmed and added. The words need to be in lowercase if you make use of an
	/// <see cref="Analyzer"/> which uses <see cref="Core.LowerCaseFilter"/> (like <see cref="Standard.StandardAnalyzer"/>).
	/// <para/>
	/// The reader is passed to <see cref="IOUtils"/> for disposal when reading ends, whether it
	/// completed normally or threw.
	/// </summary>
	/// <param name="reader"> <see cref="TextReader"/> supplying the word list </param>
	/// <param name="comment"> Prefix that marks a line as a comment. </param>
	/// <param name="result"> the <see cref="CharArraySet"/> that receives the words </param>
	/// <returns> the same <see cref="CharArraySet"/> instance that was passed in as <paramref name="result"/> </returns>
	public static CharArraySet GetWordSet(TextReader reader, string comment, CharArraySet result)
	{
		try
		{
			for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
			{
				if (line.StartsWith(comment, StringComparison.Ordinal))
				{
					continue; // comment line
				}

				string trimmed = line.Trim();
				result.Add(trimmed);
			}
			return result;
		}
		finally
		{
			// Runs on both the normal and the exceptional path.
			IOUtils.Dispose(reader);
		}
	}


	/// <summary>
	/// Reads stopwords from a stopword list in Snowball format.
	/// <para>
	/// The snowball format is the following:
	/// <list type="bullet">
	/// <item><description>Lines may contain multiple words separated by whitespace.</description></item>
	/// <item><description>The comment character is the vertical line (&#124;).</description></item>
	/// <item><description>Lines may contain trailing comments; everything from the vertical line onwards is dropped.</description></item>
	/// </list>
	/// </para>
	/// The reader is passed to <see cref="IOUtils"/> for disposal when reading ends, whether it
	/// completed normally or threw.
	/// </summary>
	/// <param name="reader"> <see cref="TextReader"/> containing a Snowball stopword list </param>
	/// <param name="result"> the <see cref="CharArraySet"/> to fill with the readers words </param>
	/// <returns> the given <see cref="CharArraySet"/> with the reader's words </returns>
	public static CharArraySet GetSnowballWordSet(TextReader reader, CharArraySet result)
	{
		try
		{
			string line = null;
			while ((line = reader.ReadLine()) != null)
			{
				// '|' needs no escaping in a C# character literal; '\|' is an unrecognized escape sequence.
				int comment = line.IndexOf('|');
				if (comment >= 0)
				{
					// Keep only the text in front of the comment marker.
					line = line.Substring(0, comment);
				}
				string[] words = WHITESPACE.Split(line).TrimEnd();
				foreach (var word in words)
				{
					// Splitting can yield empty pieces (e.g. for leading whitespace); skip them.
					if (word.Length > 0)
					{
						result.Add(word);
					}
				}
			}
		}
		finally
		{
			// Runs on both the normal and the exceptional path.
			IOUtils.Dispose(reader);
		}
		return result;
	}

	/// <summary>
	/// Convenience overload: creates a new <see cref="CharArraySet"/> for the given
	/// <see cref="LuceneVersion"/> and fills it from a stopword list in Snowball format.
	/// <para>
	/// Rules of the Snowball format:
	/// <list type="bullet">
	/// <item><description>A line may hold several words, separated by whitespace.</description></item>
	/// <item><description>The vertical line (&#124;) starts a comment.</description></item>
	/// <item><description>A comment may follow words on the same line.</description></item>
	/// </list>
	/// </para>
	/// </summary>
	/// <param name="reader"> <see cref="TextReader"/> supplying a Snowball stopword list </param>
	/// <param name="matchVersion"> the <see cref="LuceneVersion"/> used to construct the set </param>
	/// <returns> A newly created <see cref="CharArraySet"/> holding the reader's words </returns>
	public static CharArraySet GetSnowballWordSet(TextReader reader, LuceneVersion matchVersion)
	{
		// NOTE(review): the trailing 'false' is presumably the ignoreCase flag -- confirm against the CharArraySet constructor.
		CharArraySet freshSet = new CharArraySet(matchVersion, INITIAL_CAPACITY, false);
		return GetSnowballWordSet(reader, freshSet);
	}


	/// <summary>
	/// Loads a stem dictionary into <paramref name="result"/>. Each line must have the form
	/// <code>word<b>\t</b>stem</code>
	/// i.e. a word and its stem separated by a tab. The line is split at the first tab only, so
	/// any further tabs stay part of the stem. A line containing no tab makes the stem lookup
	/// fail with an <see cref="IndexOutOfRangeException"/>.
	/// <para/>
	/// The reader is passed to <see cref="IOUtils"/> for disposal when reading ends, whether it
	/// completed normally or threw.
	/// </summary>
	/// <param name="reader"> <see cref="TextReader"/> supplying the tab separated entries </param>
	/// <param name="result"> the map that receives one word-to-stem entry per line </param>
	/// <returns> stem dictionary that overrules the stemming algorithm (the instance passed in as <paramref name="result"/>) </returns>
	/// <exception cref="IOException"> If there is a low-level I/O error. </exception>
	public static CharArrayMap<string> GetStemDict(TextReader reader, CharArrayMap<string> result)
	{
		char[] tab = { '\t' };
		try
		{
			for (string entry = reader.ReadLine(); entry != null; entry = reader.ReadLine())
			{
				// At most two pieces: the word, then everything after the first tab.
				string[] pieces = entry.Split(tab, 2);
				string word = pieces[0];
				string stem = pieces[1];
				result.Put(word, stem);
			}
			return result;
		}
		finally
		{
			// Runs on both the normal and the exceptional path.
			IOUtils.Dispose(reader);
		}
	}

	/// <summary>
	/// Reads <paramref name="stream"/> as text in the given <paramref name="encoding"/> and returns
	/// its data lines.
	/// <para>
	/// A comment line is any line that starts with the character "#" (checked before trimming, so a
	/// "#" preceded by whitespace does not count). Comment lines and lines that are empty after
	/// trimming are left out. Until the first data line has been collected, a leading U+FEFF
	/// (byte order mark) character is removed from a line before it is examined.
	/// </para>
	/// The stream is wrapped in a <see cref="StreamReader"/> that is disposed before this method returns.
	/// </summary>
	/// <param name="stream"> the <see cref="Stream"/> to read from </param>
	/// <param name="encoding"> the character <see cref="Encoding"/> used to decode the stream </param>
	/// <returns> a list of non-blank non-comment lines with whitespace trimmed </returns>
	/// <exception cref="IOException"> If there is a low-level I/O error. </exception>
	public static IList<string> GetLines(Stream stream, Encoding encoding)
	{
		var collected = new List<string>();

		using (var input = new StreamReader(stream, encoding))
		{
			for (string raw = input.ReadLine(); raw != null; raw = input.ReadLine())
			{
				string text = raw;

				// Drop a byte order mark that survived decoding; only tried while nothing has been collected yet.
				bool nothingCollectedYet = collected.Count == 0;
				if (nothingCollectedYet && text.Length > 0 && text[0] == '\uFEFF')
				{
					text = text.Substring(1);
				}

				if (!text.StartsWith("#", StringComparison.Ordinal))
				{
					string trimmed = text.Trim();
					if (trimmed.Length != 0)
					{
						collected.Add(trimmed);
					}
				}
			}
		}

		return collected;
	}
	}
	}