// src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/CapitalizationFilter.cs - lucenenet - Git at Google

 using System;
 using System.Collections.Generic;
 using System.Globalization;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Analysis.Util;

 namespace Lucene.Net.Analysis.Miscellaneous
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// A filter to apply normal capitalization rules to Tokens.  It will make the first letter
     /// capital and the rest lower case.
     /// <para/>
     /// Within a token, words are delimited by any character less than or equal to a space
     /// (<c>' '</c>) or by a period (<c>'.'</c>).
     /// <para/>
     /// This filter is particularly useful to build nice looking facet parameters.  This filter
     /// is not appropriate if you intend to use a prefix query.
     /// </summary>
     public sealed class CapitalizationFilter : TokenFilter
     {
         public static readonly int DEFAULT_MAX_WORD_COUNT = int.MaxValue;
         public static readonly int DEFAULT_MAX_TOKEN_LENGTH = int.MaxValue;

         private readonly bool onlyFirstWord;
         private readonly CharArraySet keep;
         private readonly bool forceFirstLetter;
         private readonly ICollection<char[]> okPrefix;

         private readonly int minWordLength;
         private readonly int maxWordCount;
         private readonly int maxTokenLength;

         private readonly ICharTermAttribute termAtt;

         // LUCENENET specific for specifying culture instead of using
         // invariant culture (which makes this class more generally useful).
         // Per MSDN, InvariantCulture shouldn't be used for cases such as this:
         // https://msdn.microsoft.com/en-us/library/dd465121(v=vs.110).aspx
         // However, it would seem unnatural to rely on the current culture as a default
         // when choosing a filter or analyzer. To match the behavior of other filters,
         // the invariant culture is used as a default, but we added constructors so the
         // user can specify to override the behavior, if needed.
         private readonly CultureInfo culture;

         /// <summary>
         /// Creates a <see cref="CapitalizationFilter"/> with the default parameters using the invariant culture.
         /// <para>
         /// Calls <see cref="CapitalizationFilter.CapitalizationFilter(TokenStream, bool, CharArraySet, bool, ICollection{char[]}, int, int, int, CultureInfo)">
         ///     CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, null)
         /// </see>
         /// </para>
         /// </summary>
         /// <param name="in"> input tokenstream </param>
         public CapitalizationFilter(TokenStream @in)
             : this(@in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, null)
         {
         }

         /// <summary>
         /// Creates a <see cref="CapitalizationFilter"/> with the default parameters and the specified <paramref name="culture"/>.
         /// <para>
         /// Calls <see cref="CapitalizationFilter.CapitalizationFilter(TokenStream, bool, CharArraySet, bool, ICollection{char[]}, int, int, int, CultureInfo)">
         ///     CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, culture)
         /// </see>
         /// </para>
         /// </summary>
         /// <param name="in"> input tokenstream </param>
         /// <param name="culture"> The culture to use for the casing operation. If null, <see cref="CultureInfo.InvariantCulture"/> will be used. </param>
         // LUCENENET specific overload for specifying culture instead of using
         // invariant culture (which makes this class more generally useful).
         public CapitalizationFilter(TokenStream @in, CultureInfo culture)
             : this(@in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, culture)
         {
         }

         /// <summary>
         /// Creates a <see cref="CapitalizationFilter"/> with the specified parameters using the invariant culture.
         /// </summary>
         /// <param name="in"> input tokenstream </param>
         /// <param name="onlyFirstWord"> if <c>true</c>, only the first word of the token is capitalized and every
         ///                      following word is lower-cased; if <c>false</c>, every word is capitalized. </param>
         /// <param name="keep"> a set of words that are left as they are (may be null). </param>
         /// <param name="forceFirstLetter"> Force the first letter of the first word to be capitalized even if
         ///                      that word is in the keep list. </param>
         /// <param name="okPrefix"> do not change word capitalization if a word begins with something in this list (may be null). </param>
         /// <param name="minWordLength"> how long the word needs to be to get capitalization applied.  If the
         ///                      minWordLength is 3, "and" becomes "And" but "or" stays "or". </param>
         /// <param name="maxWordCount"> if the token contains more than maxWordCount words, the capitalization is
         ///                     assumed to be correct and the token is restored to its original text. </param>
         /// <param name="maxTokenLength"> The length limit for an individual token. Only tokens shorter than this
         ///                     value have the capitalization operation performed. </param>
         /// <exception cref="ArgumentOutOfRangeException"> <paramref name="minWordLength"/> is negative, or
         /// <paramref name="maxWordCount"/> or <paramref name="maxTokenLength"/> is less than 1. </exception>
         public CapitalizationFilter(TokenStream @in, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
             : this(@in, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength, null)
         {
         }

         /// <summary>
         /// Creates a <see cref="CapitalizationFilter"/> with the specified parameters and the specified <paramref name="culture"/>.
         /// </summary>
         /// <param name="in"> input tokenstream </param>
         /// <param name="onlyFirstWord"> if <c>true</c>, only the first word of the token is capitalized and every
         ///                      following word is lower-cased; if <c>false</c>, every word is capitalized. </param>
         /// <param name="keep"> a set of words that are left as they are (may be null). </param>
         /// <param name="forceFirstLetter"> Force the first letter of the first word to be capitalized even if
         ///                      that word is in the keep list. </param>
         /// <param name="okPrefix"> do not change word capitalization if a word begins with something in this list (may be null). </param>
         /// <param name="minWordLength"> how long the word needs to be to get capitalization applied.  If the
         ///                      minWordLength is 3, "and" becomes "And" but "or" stays "or". </param>
         /// <param name="maxWordCount"> if the token contains more than maxWordCount words, the capitalization is
         ///                     assumed to be correct and the token is restored to its original text. </param>
         /// <param name="maxTokenLength"> The length limit for an individual token. Only tokens shorter than this
         ///                     value have the capitalization operation performed. </param>
         /// <param name="culture"> The culture to use for the casing operation. If null, <see cref="CultureInfo.InvariantCulture"/> will be used. </param>
         /// <exception cref="ArgumentOutOfRangeException"> <paramref name="minWordLength"/> is negative, or
         /// <paramref name="maxWordCount"/> or <paramref name="maxTokenLength"/> is less than 1. </exception>
         // LUCENENET specific overload for specifying culture instead of using
         // invariant culture (which makes this class more generally useful).
         public CapitalizationFilter(TokenStream @in, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength, CultureInfo culture)
             : base(@in)
         {
             // LUCENENET NOTE: The guard clauses were copied here from a later version of Lucene.
             // Apparently, the tests were not ported from 4.8.0 because they expected this and the
             // original tests did not. Adding them anyway because there is no downside to this.
             //
             // The (paramName, message) overload is used on purpose: the single-string overload
             // interprets its argument as the parameter name, not as the message.
             if (minWordLength < 0)
             {
                 throw new ArgumentOutOfRangeException(nameof(minWordLength), "minWordLength must be greater than or equal to zero");
             }
             if (maxWordCount < 1)
             {
                 throw new ArgumentOutOfRangeException(nameof(maxWordCount), "maxWordCount must be greater than zero");
             }
             if (maxTokenLength < 1)
             {
                 throw new ArgumentOutOfRangeException(nameof(maxTokenLength), "maxTokenLength must be greater than zero");
             }

             this.onlyFirstWord = onlyFirstWord;
             this.keep = keep;
             this.forceFirstLetter = forceFirstLetter;
             this.okPrefix = okPrefix;
             this.minWordLength = minWordLength;
             this.maxWordCount = maxWordCount;
             this.maxTokenLength = maxTokenLength;
             this.culture = culture ?? CultureInfo.InvariantCulture;
             termAtt = AddAttribute<ICharTermAttribute>();
         }

         /// <summary>
         /// Advances the input stream by one token and rewrites the capitalization of the
         /// term text in place.
         /// </summary>
         /// <returns> <c>false</c> when the input stream has no more tokens; otherwise <c>true</c>. </returns>
         public override bool IncrementToken()
         {
             if (!m_input.IncrementToken())
             {
                 return false;
             }

             char[] termBuffer = termAtt.Buffer;
             int termBufferLength = termAtt.Length;

             // Tokens that are not shorter than maxTokenLength pass through untouched.
             if (termBufferLength >= maxTokenLength)
             {
                 return true;
             }

             char[] backup = null;
             if (maxWordCount < DEFAULT_MAX_WORD_COUNT)
             {
                 // Make a backup in case we exceed the word count. When maxWordCount equals
                 // DEFAULT_MAX_WORD_COUNT (int.MaxValue) the count can never be exceeded,
                 // so no backup is needed.
                 backup = new char[termBufferLength];
                 Array.Copy(termBuffer, 0, backup, 0, termBufferLength);
             }

             int wordCount = 0;
             int lastWordStart = 0;
             for (int i = 0; i < termBufferLength; i++)
             {
                 char c = termBuffer[i];
                 if (c <= ' ' || c == '.')
                 {
                     int len = i - lastWordStart;
                     if (len > 0)
                     {
                         ProcessWord(termBuffer, lastWordStart, len, wordCount++);
                         lastWordStart = i + 1;
                         // The extra increment means the character right after a delimiter is
                         // never tested as a delimiter itself. Kept as-is to preserve the
                         // existing splitting behavior.
                         i++;
                     }
                 }
             }

             // process the last word
             if (lastWordStart < termBufferLength)
             {
                 ProcessWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
             }

             // Too many words: assume the original capitalization was correct and restore it.
             if (wordCount > maxWordCount)
             {
                 termAtt.CopyBuffer(backup, 0, termBufferLength);
             }

             return true;
         }

         /// <summary>
         /// Applies the capitalization rules to a single word stored in <paramref name="buffer"/>, in place.
         /// </summary>
         /// <param name="buffer"> the term buffer containing the word </param>
         /// <param name="offset"> index of the first character of the word </param>
         /// <param name="length"> number of characters in the word </param>
         /// <param name="wordCount"> zero-based position of the word within the token </param>
         private void ProcessWord(char[] buffer, int offset, int length, int wordCount)
         {
             if (length < 1)
             {
                 return;
             }

             TextInfo textInfo = culture.TextInfo;

             // Only the first word may be capitalized: lower-case every later word entirely.
             if (onlyFirstWord && wordCount > 0)
             {
                 for (int i = 0; i < length; i++)
                 {
                     buffer[offset + i] = textInfo.ToLower(buffer[offset + i]);
                 }
                 return;
             }

             // Words in the keep set are left alone, except for the optionally forced
             // first letter of the first word.
             if (keep != null && keep.Contains(buffer, offset, length))
             {
                 if (wordCount == 0 && forceFirstLetter)
                 {
                     buffer[offset] = textInfo.ToUpper(buffer[offset]);
                 }
                 return;
             }

             if (length < minWordLength)
             {
                 return;
             }

             if (okPrefix != null)
             {
                 foreach (char[] prefix in okPrefix)
                 {
                     if (length >= prefix.Length) //don't bother checking if the buffer length is less than the prefix
                     {
                         bool match = true;
                         for (int i = 0; i < prefix.Length; i++)
                         {
                             if (prefix[i] != buffer[offset + i])
                             {
                                 match = false;
                                 break;
                             }
                         }
                         if (match)
                         {
                             return;
                         }
                     }
                 }
             }

             // We know it has at least one character
             buffer[offset] = textInfo.ToUpper(buffer[offset]);

             for (int i = 1; i < length; i++)
             {
                 buffer[offset + i] = textInfo.ToLower(buffer[offset + i]);
             }
         }
     }
 }
	using System;
	using System.Collections.Generic;
	using System.Globalization;
	using Lucene.Net.Analysis.TokenAttributes;
	using Lucene.Net.Analysis.Util;

	namespace Lucene.Net.Analysis.Miscellaneous
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// A filter to apply normal capitalization rules to Tokens. It will make the first letter
	/// capital and the rest lower case.
	/// <para/>
	/// Within a token, words are delimited by any character less than or equal to a space
	/// (<c>' '</c>) or by a period (<c>'.'</c>).
	/// <para/>
	/// This filter is particularly useful to build nice looking facet parameters. This filter
	/// is not appropriate if you intend to use a prefix query.
	/// </summary>
	public sealed class CapitalizationFilter : TokenFilter
	{
	    public static readonly int DEFAULT_MAX_WORD_COUNT = int.MaxValue;
	    public static readonly int DEFAULT_MAX_TOKEN_LENGTH = int.MaxValue;

	    private readonly bool onlyFirstWord;
	    private readonly CharArraySet keep;
	    private readonly bool forceFirstLetter;
	    private readonly ICollection<char[]> okPrefix;

	    private readonly int minWordLength;
	    private readonly int maxWordCount;
	    private readonly int maxTokenLength;

	    private readonly ICharTermAttribute termAtt;

	    // LUCENENET specific for specifying culture instead of using
	    // invariant culture (which makes this class more generally useful).
	    // Per MSDN, InvariantCulture shouldn't be used for cases such as this:
	    // https://msdn.microsoft.com/en-us/library/dd465121(v=vs.110).aspx
	    // However, it would seem unnatural to rely on the current culture as a default
	    // when choosing a filter or analyzer. To match the behavior of other filters,
	    // the invariant culture is used as a default, but we added constructors so the
	    // user can specify to override the behavior, if needed.
	    private readonly CultureInfo culture;

	    /// <summary>
	    /// Creates a <see cref="CapitalizationFilter"/> with the default parameters using the invariant culture.
	    /// <para>
	    /// Calls <see cref="CapitalizationFilter.CapitalizationFilter(TokenStream, bool, CharArraySet, bool, ICollection{char[]}, int, int, int, CultureInfo)">
	    /// CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, null)
	    /// </see>
	    /// </para>
	    /// </summary>
	    /// <param name="in"> input tokenstream </param>
	    public CapitalizationFilter(TokenStream @in)
	        : this(@in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, null)
	    {
	    }

	    /// <summary>
	    /// Creates a <see cref="CapitalizationFilter"/> with the default parameters and the specified <paramref name="culture"/>.
	    /// <para>
	    /// Calls <see cref="CapitalizationFilter.CapitalizationFilter(TokenStream, bool, CharArraySet, bool, ICollection{char[]}, int, int, int, CultureInfo)">
	    /// CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, culture)
	    /// </see>
	    /// </para>
	    /// </summary>
	    /// <param name="in"> input tokenstream </param>
	    /// <param name="culture"> The culture to use for the casing operation. If null, <see cref="CultureInfo.InvariantCulture"/> will be used. </param>
	    // LUCENENET specific overload for specifying culture instead of using
	    // invariant culture (which makes this class more generally useful).
	    public CapitalizationFilter(TokenStream @in, CultureInfo culture)
	        : this(@in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, culture)
	    {
	    }

	    /// <summary>
	    /// Creates a <see cref="CapitalizationFilter"/> with the specified parameters using the invariant culture.
	    /// </summary>
	    /// <param name="in"> input tokenstream </param>
	    /// <param name="onlyFirstWord"> if <c>true</c>, only the first word of the token is capitalized and every
	    /// following word is lower-cased; if <c>false</c>, every word is capitalized. </param>
	    /// <param name="keep"> a set of words that are left as they are (may be null). </param>
	    /// <param name="forceFirstLetter"> Force the first letter of the first word to be capitalized even if
	    /// that word is in the keep list. </param>
	    /// <param name="okPrefix"> do not change word capitalization if a word begins with something in this list (may be null). </param>
	    /// <param name="minWordLength"> how long the word needs to be to get capitalization applied. If the
	    /// minWordLength is 3, "and" becomes "And" but "or" stays "or". </param>
	    /// <param name="maxWordCount"> if the token contains more than maxWordCount words, the capitalization is
	    /// assumed to be correct and the token is restored to its original text. </param>
	    /// <param name="maxTokenLength"> The length limit for an individual token. Only tokens shorter than this
	    /// value have the capitalization operation performed. </param>
	    /// <exception cref="ArgumentOutOfRangeException"> <paramref name="minWordLength"/> is negative, or
	    /// <paramref name="maxWordCount"/> or <paramref name="maxTokenLength"/> is less than 1. </exception>
	    public CapitalizationFilter(TokenStream @in, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength)
	        : this(@in, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength, null)
	    {
	    }

	    /// <summary>
	    /// Creates a <see cref="CapitalizationFilter"/> with the specified parameters and the specified <paramref name="culture"/>.
	    /// </summary>
	    /// <param name="in"> input tokenstream </param>
	    /// <param name="onlyFirstWord"> if <c>true</c>, only the first word of the token is capitalized and every
	    /// following word is lower-cased; if <c>false</c>, every word is capitalized. </param>
	    /// <param name="keep"> a set of words that are left as they are (may be null). </param>
	    /// <param name="forceFirstLetter"> Force the first letter of the first word to be capitalized even if
	    /// that word is in the keep list. </param>
	    /// <param name="okPrefix"> do not change word capitalization if a word begins with something in this list (may be null). </param>
	    /// <param name="minWordLength"> how long the word needs to be to get capitalization applied. If the
	    /// minWordLength is 3, "and" becomes "And" but "or" stays "or". </param>
	    /// <param name="maxWordCount"> if the token contains more than maxWordCount words, the capitalization is
	    /// assumed to be correct and the token is restored to its original text. </param>
	    /// <param name="maxTokenLength"> The length limit for an individual token. Only tokens shorter than this
	    /// value have the capitalization operation performed. </param>
	    /// <param name="culture"> The culture to use for the casing operation. If null, <see cref="CultureInfo.InvariantCulture"/> will be used. </param>
	    /// <exception cref="ArgumentOutOfRangeException"> <paramref name="minWordLength"/> is negative, or
	    /// <paramref name="maxWordCount"/> or <paramref name="maxTokenLength"/> is less than 1. </exception>
	    // LUCENENET specific overload for specifying culture instead of using
	    // invariant culture (which makes this class more generally useful).
	    public CapitalizationFilter(TokenStream @in, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength, CultureInfo culture)
	        : base(@in)
	    {
	        // LUCENENET NOTE: The guard clauses were copied here from a later version of Lucene.
	        // Apparently, the tests were not ported from 4.8.0 because they expected this and the
	        // original tests did not. Adding them anyway because there is no downside to this.
	        //
	        // The (paramName, message) overload is used on purpose: the single-string overload
	        // interprets its argument as the parameter name, not as the message.
	        if (minWordLength < 0)
	        {
	            throw new ArgumentOutOfRangeException(nameof(minWordLength), "minWordLength must be greater than or equal to zero");
	        }
	        if (maxWordCount < 1)
	        {
	            throw new ArgumentOutOfRangeException(nameof(maxWordCount), "maxWordCount must be greater than zero");
	        }
	        if (maxTokenLength < 1)
	        {
	            throw new ArgumentOutOfRangeException(nameof(maxTokenLength), "maxTokenLength must be greater than zero");
	        }

	        this.onlyFirstWord = onlyFirstWord;
	        this.keep = keep;
	        this.forceFirstLetter = forceFirstLetter;
	        this.okPrefix = okPrefix;
	        this.minWordLength = minWordLength;
	        this.maxWordCount = maxWordCount;
	        this.maxTokenLength = maxTokenLength;
	        this.culture = culture ?? CultureInfo.InvariantCulture;
	        termAtt = AddAttribute<ICharTermAttribute>();
	    }

	    /// <summary>
	    /// Advances the input stream by one token and rewrites the capitalization of the
	    /// term text in place.
	    /// </summary>
	    /// <returns> <c>false</c> when the input stream has no more tokens; otherwise <c>true</c>. </returns>
	    public override bool IncrementToken()
	    {
	        if (!m_input.IncrementToken())
	        {
	            return false;
	        }

	        char[] termBuffer = termAtt.Buffer;
	        int termBufferLength = termAtt.Length;

	        // Tokens that are not shorter than maxTokenLength pass through untouched.
	        if (termBufferLength >= maxTokenLength)
	        {
	            return true;
	        }

	        char[] backup = null;
	        if (maxWordCount < DEFAULT_MAX_WORD_COUNT)
	        {
	            // Make a backup in case we exceed the word count. When maxWordCount equals
	            // DEFAULT_MAX_WORD_COUNT (int.MaxValue) the count can never be exceeded,
	            // so no backup is needed.
	            backup = new char[termBufferLength];
	            Array.Copy(termBuffer, 0, backup, 0, termBufferLength);
	        }

	        int wordCount = 0;
	        int lastWordStart = 0;
	        for (int i = 0; i < termBufferLength; i++)
	        {
	            char c = termBuffer[i];
	            if (c <= ' ' || c == '.')
	            {
	                int len = i - lastWordStart;
	                if (len > 0)
	                {
	                    ProcessWord(termBuffer, lastWordStart, len, wordCount++);
	                    lastWordStart = i + 1;
	                    // The extra increment means the character right after a delimiter is
	                    // never tested as a delimiter itself. Kept as-is to preserve the
	                    // existing splitting behavior.
	                    i++;
	                }
	            }
	        }

	        // process the last word
	        if (lastWordStart < termBufferLength)
	        {
	            ProcessWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
	        }

	        // Too many words: assume the original capitalization was correct and restore it.
	        if (wordCount > maxWordCount)
	        {
	            termAtt.CopyBuffer(backup, 0, termBufferLength);
	        }

	        return true;
	    }

	    /// <summary>
	    /// Applies the capitalization rules to a single word stored in <paramref name="buffer"/>, in place.
	    /// </summary>
	    /// <param name="buffer"> the term buffer containing the word </param>
	    /// <param name="offset"> index of the first character of the word </param>
	    /// <param name="length"> number of characters in the word </param>
	    /// <param name="wordCount"> zero-based position of the word within the token </param>
	    private void ProcessWord(char[] buffer, int offset, int length, int wordCount)
	    {
	        if (length < 1)
	        {
	            return;
	        }

	        TextInfo textInfo = culture.TextInfo;

	        // Only the first word may be capitalized: lower-case every later word entirely.
	        if (onlyFirstWord && wordCount > 0)
	        {
	            for (int i = 0; i < length; i++)
	            {
	                buffer[offset + i] = textInfo.ToLower(buffer[offset + i]);
	            }
	            return;
	        }

	        // Words in the keep set are left alone, except for the optionally forced
	        // first letter of the first word.
	        if (keep != null && keep.Contains(buffer, offset, length))
	        {
	            if (wordCount == 0 && forceFirstLetter)
	            {
	                buffer[offset] = textInfo.ToUpper(buffer[offset]);
	            }
	            return;
	        }

	        if (length < minWordLength)
	        {
	            return;
	        }

	        if (okPrefix != null)
	        {
	            foreach (char[] prefix in okPrefix)
	            {
	                if (length >= prefix.Length) //don't bother checking if the buffer length is less than the prefix
	                {
	                    bool match = true;
	                    for (int i = 0; i < prefix.Length; i++)
	                    {
	                        if (prefix[i] != buffer[offset + i])
	                        {
	                            match = false;
	                            break;
	                        }
	                    }
	                    if (match)
	                    {
	                        return;
	                    }
	                }
	            }
	        }

	        // We know it has at least one character
	        buffer[offset] = textInfo.ToUpper(buffer[offset]);

	        for (int i = 1; i < length; i++)
	        {
	            buffer[offset + i] = textInfo.ToLower(buffer[offset + i]);
	        }
	    }
	}
	}