| using System; |
| using System.Collections.Generic; |
| using System.Globalization; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Analysis.Util; |
| |
| namespace Lucene.Net.Analysis.Miscellaneous |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// A filter to apply normal capitalization rules to Tokens. It will make the first letter |
| /// capital and the rest lower case. |
| /// <para/> |
| /// This filter is particularly useful to build nice looking facet parameters. This filter |
| /// is not appropriate if you intend to use a prefix query. |
| /// </summary> |
| public sealed class CapitalizationFilter : TokenFilter |
| { |
| public static readonly int DEFAULT_MAX_WORD_COUNT = int.MaxValue; |
| public static readonly int DEFAULT_MAX_TOKEN_LENGTH = int.MaxValue; |
| |
| private readonly bool onlyFirstWord; |
| private readonly CharArraySet keep; |
| private readonly bool forceFirstLetter; |
| private readonly ICollection<char[]> okPrefix; |
| |
| private readonly int minWordLength; |
| private readonly int maxWordCount; |
| private readonly int maxTokenLength; |
| |
| private readonly ICharTermAttribute termAtt; |
| |
| // LUCENENET specific for specifying culture instead of using |
| // invariant culture (which makes this class more generally useful). |
| // Per MSDN, InvariantCulture shouldn't be used for cases such as this: |
| // https://msdn.microsoft.com/en-us/library/dd465121(v=vs.110).aspx |
| // However, it would seem unnatural to rely on the current culture as a default |
| // when choosing a filter or analyzer. To match the behavior of other filters, |
| // the invariant culture is used as a default, but we added constructors so the |
| // user can specify to override the behavior, if needed. |
| private readonly CultureInfo culture; |
| |
| /// <summary> |
| /// Creates a <see cref="CapitalizationFilter"/> with the default parameters using the invariant culture. |
| /// <para> |
| /// Calls <see cref="CapitalizationFilter.CapitalizationFilter(TokenStream, bool, CharArraySet, bool, ICollection{char[]}, int, int, int)"> |
| /// CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, null) |
| /// </see> |
| /// </para> |
| /// </summary> |
| public CapitalizationFilter(TokenStream @in) |
| : this(@in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, null) |
| { |
| } |
| |
| /// <summary> |
| /// Creates a <see cref="CapitalizationFilter"/> with the default parameters and the specified <paramref name="culture"/>. |
| /// <para> |
| /// Calls <see cref="CapitalizationFilter.CapitalizationFilter(TokenStream, bool, CharArraySet, bool, ICollection{char[]}, int, int, int)"> |
| /// CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH) |
| /// </see> |
| /// </para> |
| /// </summary> |
| /// <param name="in"> input tokenstream </param> |
| /// <param name="culture"> The culture to use for the casing operation. If null, <see cref="CultureInfo.InvariantCulture"/> will be used. </param> |
| // LUCENENET specific overload for specifying culture instead of using |
| // invariant culture (which makes this class more generally useful). |
| public CapitalizationFilter(TokenStream @in, CultureInfo culture) |
| : this(@in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH, culture) |
| { |
| } |
| |
| /// <summary> |
| /// Creates a <see cref="CapitalizationFilter"/> with the specified parameters using the invariant culture.</summary> |
| /// <param name="in"> input tokenstream </param> |
| /// <param name="onlyFirstWord"> should each word be capitalized or all of the words? </param> |
| /// <param name="keep"> a keep word list. Each word that should be kept separated by whitespace. </param> |
| /// <param name="forceFirstLetter"> Force the first letter to be capitalized even if it is in the keep list. </param> |
| /// <param name="okPrefix"> do not change word capitalization if a word begins with something in this list. </param> |
| /// <param name="minWordLength"> how long the word needs to be to get capitalization applied. If the |
| /// minWordLength is 3, "and" > "And" but "or" stays "or". </param> |
| /// <param name="maxWordCount"> if the token contains more then maxWordCount words, the capitalization is |
| /// assumed to be correct. </param> |
| /// <param name="maxTokenLength"> The maximum length for an individual token. Tokens that exceed this length will not have the capitalization operation performed. </param> |
| public CapitalizationFilter(TokenStream @in, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength) |
| : this(@in, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength, null) |
| { |
| } |
| |
| /// <summary> |
| /// Creates a <see cref="CapitalizationFilter"/> with the specified parameters and the specified <paramref name="culture"/>. </summary> |
| /// <param name="in"> input tokenstream </param> |
| /// <param name="onlyFirstWord"> should each word be capitalized or all of the words? </param> |
| /// <param name="keep"> a keep word list. Each word that should be kept separated by whitespace. </param> |
| /// <param name="forceFirstLetter"> Force the first letter to be capitalized even if it is in the keep list. </param> |
| /// <param name="okPrefix"> do not change word capitalization if a word begins with something in this list. </param> |
| /// <param name="minWordLength"> how long the word needs to be to get capitalization applied. If the |
| /// minWordLength is 3, "and" > "And" but "or" stays "or". </param> |
| /// <param name="maxWordCount"> if the token contains more then maxWordCount words, the capitalization is |
| /// assumed to be correct. </param> |
| /// <param name="maxTokenLength"> The maximum length for an individual token. Tokens that exceed this length will not have the capitalization operation performed. </param> |
| /// <param name="culture"> The culture to use for the casing operation. If null, <see cref="CultureInfo.InvariantCulture"/> will be used. </param> |
| // LUCENENET specific overload for specifying culture instead of using |
| // invariant culture (which makes this class more generally useful). |
| public CapitalizationFilter(TokenStream @in, bool onlyFirstWord, CharArraySet keep, bool forceFirstLetter, ICollection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength, CultureInfo culture) |
| : base(@in) |
| { |
| // LUCENENET NOTE: The guard clauses were copied here from a later version of Lucene. |
| // Apparently, the tests were not ported from 4.8.0 because they expected this and the |
| // original tests did not. Adding them anyway because there is no downside to this. |
| if (minWordLength < 0) |
| { |
| throw new ArgumentOutOfRangeException("minWordLength must be greater than or equal to zero"); |
| } |
| if (maxWordCount < 1) |
| { |
| throw new ArgumentOutOfRangeException("maxWordCount must be greater than zero"); |
| } |
| if (maxTokenLength < 1) |
| { |
| throw new ArgumentOutOfRangeException("maxTokenLength must be greater than zero"); |
| } |
| |
| this.onlyFirstWord = onlyFirstWord; |
| this.keep = keep; |
| this.forceFirstLetter = forceFirstLetter; |
| this.okPrefix = okPrefix; |
| this.minWordLength = minWordLength; |
| this.maxWordCount = maxWordCount; |
| this.maxTokenLength = maxTokenLength; |
| this.culture = culture ?? CultureInfo.InvariantCulture; |
| termAtt = AddAttribute<ICharTermAttribute>(); |
| } |
| |
| public override bool IncrementToken() |
| { |
| if (!m_input.IncrementToken()) |
| { |
| return false; |
| } |
| |
| char[] termBuffer = termAtt.Buffer; |
| int termBufferLength = termAtt.Length; |
| char[] backup = null; |
| |
| if (maxWordCount < DEFAULT_MAX_WORD_COUNT) |
| { |
| //make a backup in case we exceed the word count |
| backup = new char[termBufferLength]; |
| Array.Copy(termBuffer, 0, backup, 0, termBufferLength); |
| } |
| |
| if (termBufferLength < maxTokenLength) |
| { |
| int wordCount = 0; |
| |
| int lastWordStart = 0; |
| for (int i = 0; i < termBufferLength; i++) |
| { |
| char c = termBuffer[i]; |
| if (c <= ' ' || c == '.') |
| { |
| int len = i - lastWordStart; |
| if (len > 0) |
| { |
| ProcessWord(termBuffer, lastWordStart, len, wordCount++); |
| lastWordStart = i + 1; |
| i++; |
| } |
| } |
| } |
| |
| // process the last word |
| if (lastWordStart < termBufferLength) |
| { |
| ProcessWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++); |
| } |
| |
| if (wordCount > maxWordCount) |
| { |
| termAtt.CopyBuffer(backup, 0, termBufferLength); |
| } |
| } |
| |
| return true; |
| } |
| |
| private void ProcessWord(char[] buffer, int offset, int length, int wordCount) |
| { |
| if (length < 1) |
| { |
| return; |
| } |
| |
| if (onlyFirstWord && wordCount > 0) |
| { |
| for (int i = 0; i < length; i++) |
| { |
| buffer[offset + i] = culture.TextInfo.ToLower(buffer[offset + i]); |
| } |
| return; |
| } |
| |
| if (keep != null && keep.Contains(buffer, offset, length)) |
| { |
| if (wordCount == 0 && forceFirstLetter) |
| { |
| buffer[offset] = culture.TextInfo.ToUpper(buffer[offset]); |
| } |
| return; |
| } |
| |
| if (length < minWordLength) |
| { |
| return; |
| } |
| |
| if (okPrefix != null) |
| { |
| foreach (char[] prefix in okPrefix) |
| { |
| if (length >= prefix.Length) //don't bother checking if the buffer length is less than the prefix |
| { |
| bool match = true; |
| for (int i = 0; i < prefix.Length; i++) |
| { |
| if (prefix[i] != buffer[offset + i]) |
| { |
| match = false; |
| break; |
| } |
| } |
| if (match) |
| { |
| return; |
| } |
| } |
| } |
| } |
| |
| // We know it has at least one character |
| /*char[] chars = w.toCharArray(); |
| StringBuilder word = new StringBuilder( w.length() ); |
| word.append( Character.toUpperCase( chars[0] ) );*/ |
| buffer[offset] = culture.TextInfo.ToUpper(buffer[offset]); |
| |
| for (int i = 1; i < length; i++) |
| { |
| buffer[offset + i] = culture.TextInfo.ToLower(buffer[offset + i]); |
| } |
| //return word.toString(); |
| } |
| } |
| } |