src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs - lucenenet - Git at Google

 using J2N;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Util;
 using Lucene.Net.Util.Fst;
 using System.Collections.Generic;

 namespace Lucene.Net.Analysis.Miscellaneous
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Provides the ability to override any <see cref="KeywordAttribute"/> aware stemmer
     /// with custom dictionary-based stemming.
     /// </summary>
     public sealed class StemmerOverrideFilter : TokenFilter
     {
         private readonly StemmerOverrideMap stemmerOverrideMap;

         private readonly ICharTermAttribute termAtt;
         private readonly IKeywordAttribute keywordAtt;
         private readonly FST.BytesReader fstReader;
         private readonly FST.Arc<BytesRef> scratchArc = new FST.Arc<BytesRef>();
         private readonly CharsRef spare = new CharsRef();

         /// <summary>
         /// Create a new <see cref="StemmerOverrideFilter"/>, performing dictionary-based stemming
         /// with the provided dictionary (<paramref name="stemmerOverrideMap"/>).
         /// <para>
         /// Any dictionary-stemmed terms will be marked with <see cref="KeywordAttribute"/>
         /// so that they will not be stemmed with stemmers down the chain.
         /// </para>
         /// </summary>
         /// <param name="input"> the upstream <see cref="TokenStream"/> whose tokens are filtered </param>
         /// <param name="stemmerOverrideMap"> the override dictionary; must not be <c>null</c> </param>
         /// <exception cref="System.ArgumentNullException"> if <paramref name="stemmerOverrideMap"/> is <c>null</c> </exception>
         public StemmerOverrideFilter(TokenStream input, StemmerOverrideMap stemmerOverrideMap)
               : base(input)
         {
             if (stemmerOverrideMap == null)
             {
                 // Fail with a descriptive exception instead of a NullReferenceException on the call below.
                 throw new System.ArgumentNullException(nameof(stemmerOverrideMap));
             }

             this.stemmerOverrideMap = stemmerOverrideMap;
             // The reader is null when the map has no FST; IncrementToken checks for that case.
             fstReader = stemmerOverrideMap.GetBytesReader();
             termAtt = AddAttribute<ICharTermAttribute>();
             keywordAtt = AddAttribute<IKeywordAttribute>();
         }

         /// <summary>
         /// Advances the wrapped stream by one token and, when the term has an entry in the
         /// override dictionary, replaces the term text with the mapped stem and flags the
         /// token as a keyword.
         /// </summary>
         /// <returns> <c>true</c> if the wrapped stream produced a token, otherwise <c>false</c>. </returns>
         public override bool IncrementToken()
         {
             if (!m_input.IncrementToken())
             {
                 return false;
             }

             // A null reader means there are no overrides; already-keyworded terms are left untouched.
             if (fstReader == null || keywordAtt.IsKeyword)
             {
                 return true;
             }

             BytesRef overrideStem = stemmerOverrideMap.Get(termAtt.Buffer, termAtt.Length, scratchArc, fstReader);
             if (overrideStem == null)
             {
                 return true;
             }

             // Decode the stem directly into the term's own buffer when possible.
             char[] termBuffer = termAtt.Buffer;
             spare.Chars = termBuffer;
             UnicodeUtil.UTF8toUTF16(overrideStem.Bytes, overrideStem.Offset, overrideStem.Length, spare);
             if (spare.Chars != termBuffer)
             {
                 // The decoder swapped in a different array, so copy the result back into the term.
                 termAtt.CopyBuffer(spare.Chars, spare.Offset, spare.Length);
             }
             termAtt.Length = spare.Length;
             keywordAtt.IsKeyword = true;
             return true;
         }

         /// <summary>
         /// A read-only 4-byte FST backed map that allows fast, optionally case-insensitive,
         /// key value lookups for <see cref="StemmerOverrideFilter"/>
         /// </summary>
         // TODO maybe we can generalize this and reuse this map somehow?
         public sealed class StemmerOverrideMap
         {
             private readonly FST<BytesRef> fst;
             private readonly bool ignoreCase;

             /// <summary>
             /// Creates a new <see cref="StemmerOverrideMap"/> </summary>
             /// <param name="fst"> the fst to lookup the overrides; may be <c>null</c>, in which case every lookup misses </param>
             /// <param name="ignoreCase"> if the keys case should be ignored </param>
             public StemmerOverrideMap(FST<BytesRef> fst, bool ignoreCase)
             {
                 this.fst = fst;
                 this.ignoreCase = ignoreCase;
             }

             /// <summary>
             /// Returns a <see cref="FST.BytesReader"/> to pass to the <see cref="Get(char[], int, FST.Arc{BytesRef}, FST.BytesReader)"/> method.
             /// </summary>
             /// <returns> a reader over the underlying FST, or <c>null</c> if this map has no FST </returns>
             public FST.BytesReader GetBytesReader()
             {
                 if (fst == null)
                 {
                     return null;
                 }
                 return fst.GetBytesReader();
             }

             /// <summary>
             /// Returns the value mapped to the given key or <c>null</c> if the key is not in the FST dictionary.
             /// </summary>
             /// <param name="buffer"> the characters holding the key, starting at index 0 </param>
             /// <param name="bufferLen"> the number of characters of <paramref name="buffer"/> that make up the key </param>
             /// <param name="scratchArc"> a reusable arc that is overwritten while walking the FST </param>
             /// <param name="fstReader"> the reader obtained from <see cref="GetBytesReader()"/> </param>
             /// <returns> the mapped output, or <c>null</c> if there is no match or this map has no FST </returns>
             public BytesRef Get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc, FST.BytesReader fstReader)
             {
                 if (fst == null)
                 {
                     // Consistent with GetBytesReader(): a map without an FST has no entries.
                     return null;
                 }

                 BytesRef pendingOutput = fst.Outputs.NoOutput;
                 int bufUpto = 0;
                 fst.GetFirstArc(scratchArc);
                 while (bufUpto < bufferLen)
                 {
                     // Walk the key one code point at a time so surrogate pairs are matched as a unit.
                     int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
                     if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
                     {
                         return null;
                     }
                     pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
                     bufUpto += Character.CharCount(codePoint);
                 }

                 // Only a final arc marks a complete key; a mere prefix of a longer key is a miss.
                 if (scratchArc.IsFinal)
                 {
                     return fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
                 }
                 return null;
             }
         }

         /// <summary>
         /// This builder builds an <see cref="FST"/> for the <see cref="StemmerOverrideFilter"/>
         /// </summary>
         public class Builder
         {
             // Distinct input keys (lower-cased first when ignoreCase is set), stored as UTF-8.
             private readonly BytesRefHash hash = new BytesRefHash();
             // Scratch buffer shared by Add and Build.
             private readonly BytesRef spare = new BytesRef();
             // One output per accepted key, appended in the order the keys were accepted.
             private readonly List<string> outputValues = new List<string>();
             private readonly bool ignoreCase;
             // Scratch buffer used by Add when lower-casing the input.
             private readonly CharsRef charsSpare = new CharsRef();

             /// <summary>
             /// Creates a new <see cref="Builder"/> with <c>ignoreCase</c> set to <c>false</c>
             /// </summary>
             public Builder()
                 : this(false)
             {
             }

             /// <summary>
             /// Creates a new <see cref="Builder"/> </summary>
             /// <param name="ignoreCase"> if the input case should be ignored. </param>
             public Builder(bool ignoreCase)
             {
                 this.ignoreCase = ignoreCase;
             }

             /// <summary>
             /// Adds an input string and its stemmer override output to this builder.
             /// </summary>
             /// <param name="input"> the input char sequence </param>
             /// <param name="output"> the stemmer override output char sequence </param>
             /// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns>
             /// <exception cref="System.ArgumentNullException"> if <paramref name="input"/> or <paramref name="output"/> is <c>null</c> </exception>
             public virtual bool Add(string input, string output)
             {
                 if (input == null)
                 {
                     throw new System.ArgumentNullException(nameof(input));
                 }
                 if (output == null)
                 {
                     // Reject here: a null output would otherwise only fail later, inside Build().
                     throw new System.ArgumentNullException(nameof(output));
                 }

                 int length = input.Length;
                 if (ignoreCase)
                 {
                     // convert on the fly to lowercase, one code point at a time
                     charsSpare.Grow(length);
                     char[] buffer = charsSpare.Chars;
                     for (int i = 0; i < length;)
                     {
                         i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i)), buffer, i);
                     }
                     UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
                 }
                 else
                 {
                     UnicodeUtil.UTF16toUTF8(input.ToCharArray(), 0, length, spare);
                 }

                 // A negative result from the hash means the key was already present.
                 if (hash.Add(spare) >= 0)
                 {
                     outputValues.Add(output);
                     return true;
                 }
                 return false;
             }

             /// <summary>
             /// Returns a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </summary>
             /// <returns> a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </returns>
             /// <exception cref="System.IO.IOException"> if an <see cref="System.IO.IOException"/> occurs; </exception>
             public virtual StemmerOverrideMap Build()
             {
                 ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
                 Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
                 // Keys are fed to the FST builder in the order produced by this sort.
                 int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
                 Int32sRef intsSpare = new Int32sRef();
                 int size = hash.Count;
                 for (int i = 0; i < size; i++)
                 {
                     int id = sort[i];
                     BytesRef bytesRef = hash.Get(id, spare);
                     // The FST is built with BYTE4 input, so convert the UTF-8 key to UTF-32 code points.
                     UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
                     builder.Add(intsSpare, new BytesRef(outputValues[id]));
                 }
                 return new StemmerOverrideMap(builder.Finish(), ignoreCase);
             }
         }
     }
 }
	using J2N;
	using Lucene.Net.Analysis.TokenAttributes;
	using Lucene.Net.Util;
	using Lucene.Net.Util.Fst;
	using System.Collections.Generic;

	namespace Lucene.Net.Analysis.Miscellaneous
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Provides the ability to override any <see cref="KeywordAttribute"/> aware stemmer
	/// with custom dictionary-based stemming.
	/// </summary>
	public sealed class StemmerOverrideFilter : TokenFilter
	{
	private readonly StemmerOverrideMap stemmerOverrideMap;

	private readonly ICharTermAttribute termAtt;
	private readonly IKeywordAttribute keywordAtt;
	private readonly FST.BytesReader fstReader;
	private readonly FST.Arc<BytesRef> scratchArc = new FST.Arc<BytesRef>();
	private readonly CharsRef spare = new CharsRef();

	/// <summary>
	/// Create a new <see cref="StemmerOverrideFilter"/>, performing dictionary-based stemming
	/// with the provided dictionary (<paramref name="stemmerOverrideMap"/>).
	/// <para>
	/// Any dictionary-stemmed terms will be marked with <see cref="KeywordAttribute"/>
	/// so that they will not be stemmed with stemmers down the chain.
	/// </para>
	/// </summary>
	/// <param name="input"> the upstream <see cref="TokenStream"/> whose tokens are filtered </param>
	/// <param name="stemmerOverrideMap"> the override dictionary; must not be <c>null</c> </param>
	/// <exception cref="System.ArgumentNullException"> if <paramref name="stemmerOverrideMap"/> is <c>null</c> </exception>
	public StemmerOverrideFilter(TokenStream input, StemmerOverrideMap stemmerOverrideMap)
	    : base(input)
	{
	    if (stemmerOverrideMap == null)
	    {
	        // Fail with a descriptive exception instead of a NullReferenceException on the call below.
	        throw new System.ArgumentNullException(nameof(stemmerOverrideMap));
	    }

	    this.stemmerOverrideMap = stemmerOverrideMap;
	    // The reader is null when the map has no FST; IncrementToken checks for that case.
	    fstReader = stemmerOverrideMap.GetBytesReader();
	    termAtt = AddAttribute<ICharTermAttribute>();
	    keywordAtt = AddAttribute<IKeywordAttribute>();
	}

	/// <summary>
	/// Advances the wrapped stream by one token and, when the term has an entry in the
	/// override dictionary, replaces the term text with the mapped stem and flags the
	/// token as a keyword.
	/// </summary>
	/// <returns> <c>true</c> if the wrapped stream produced a token, otherwise <c>false</c>. </returns>
	public override bool IncrementToken()
	{
	    if (!m_input.IncrementToken())
	    {
	        return false;
	    }

	    // A null reader means there are no overrides; already-keyworded terms are left untouched.
	    if (fstReader == null || keywordAtt.IsKeyword)
	    {
	        return true;
	    }

	    BytesRef overrideStem = stemmerOverrideMap.Get(termAtt.Buffer, termAtt.Length, scratchArc, fstReader);
	    if (overrideStem == null)
	    {
	        return true;
	    }

	    // Decode the stem directly into the term's own buffer when possible.
	    char[] termBuffer = termAtt.Buffer;
	    spare.Chars = termBuffer;
	    UnicodeUtil.UTF8toUTF16(overrideStem.Bytes, overrideStem.Offset, overrideStem.Length, spare);
	    if (spare.Chars != termBuffer)
	    {
	        // The decoder swapped in a different array, so copy the result back into the term.
	        termAtt.CopyBuffer(spare.Chars, spare.Offset, spare.Length);
	    }
	    termAtt.Length = spare.Length;
	    keywordAtt.IsKeyword = true;
	    return true;
	}

	/// <summary>
	/// A read-only 4-byte FST backed map that allows fast, optionally case-insensitive,
	/// key value lookups for <see cref="StemmerOverrideFilter"/>
	/// </summary>
	// TODO maybe we can generalize this and reuse this map somehow?
	public sealed class StemmerOverrideMap
	{
	    private readonly FST<BytesRef> fst;
	    private readonly bool ignoreCase;

	    /// <summary>
	    /// Creates a new <see cref="StemmerOverrideMap"/> </summary>
	    /// <param name="fst"> the fst to lookup the overrides; may be <c>null</c>, in which case every lookup misses </param>
	    /// <param name="ignoreCase"> if the keys case should be ignored </param>
	    public StemmerOverrideMap(FST<BytesRef> fst, bool ignoreCase)
	    {
	        this.fst = fst;
	        this.ignoreCase = ignoreCase;
	    }

	    /// <summary>
	    /// Returns a <see cref="FST.BytesReader"/> to pass to the <see cref="Get(char[], int, FST.Arc{BytesRef}, FST.BytesReader)"/> method.
	    /// </summary>
	    /// <returns> a reader over the underlying FST, or <c>null</c> if this map has no FST </returns>
	    public FST.BytesReader GetBytesReader()
	    {
	        if (fst == null)
	        {
	            return null;
	        }
	        return fst.GetBytesReader();
	    }

	    /// <summary>
	    /// Returns the value mapped to the given key or <c>null</c> if the key is not in the FST dictionary.
	    /// </summary>
	    /// <param name="buffer"> the characters holding the key, starting at index 0 </param>
	    /// <param name="bufferLen"> the number of characters of <paramref name="buffer"/> that make up the key </param>
	    /// <param name="scratchArc"> a reusable arc that is overwritten while walking the FST </param>
	    /// <param name="fstReader"> the reader obtained from <see cref="GetBytesReader()"/> </param>
	    /// <returns> the mapped output, or <c>null</c> if there is no match or this map has no FST </returns>
	    public BytesRef Get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc, FST.BytesReader fstReader)
	    {
	        if (fst == null)
	        {
	            // Consistent with GetBytesReader(): a map without an FST has no entries.
	            return null;
	        }

	        BytesRef pendingOutput = fst.Outputs.NoOutput;
	        int bufUpto = 0;
	        fst.GetFirstArc(scratchArc);
	        while (bufUpto < bufferLen)
	        {
	            // Walk the key one code point at a time so surrogate pairs are matched as a unit.
	            int codePoint = Character.CodePointAt(buffer, bufUpto, bufferLen);
	            if (fst.FindTargetArc(ignoreCase ? Character.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
	            {
	                return null;
	            }
	            pendingOutput = fst.Outputs.Add(pendingOutput, scratchArc.Output);
	            bufUpto += Character.CharCount(codePoint);
	        }

	        // Only a final arc marks a complete key; a mere prefix of a longer key is a miss.
	        if (scratchArc.IsFinal)
	        {
	            return fst.Outputs.Add(pendingOutput, scratchArc.NextFinalOutput);
	        }
	        return null;
	    }
	}

	/// <summary>
	/// This builder builds an <see cref="FST"/> for the <see cref="StemmerOverrideFilter"/>
	/// </summary>
	public class Builder
	{
	    // Distinct input keys (lower-cased first when ignoreCase is set), stored as UTF-8.
	    private readonly BytesRefHash hash = new BytesRefHash();
	    // Scratch buffer shared by Add and Build.
	    private readonly BytesRef spare = new BytesRef();
	    // One output per accepted key, appended in the order the keys were accepted.
	    private readonly List<string> outputValues = new List<string>();
	    private readonly bool ignoreCase;
	    // Scratch buffer used by Add when lower-casing the input.
	    private readonly CharsRef charsSpare = new CharsRef();

	    /// <summary>
	    /// Creates a new <see cref="Builder"/> with <c>ignoreCase</c> set to <c>false</c>
	    /// </summary>
	    public Builder()
	        : this(false)
	    {
	    }

	    /// <summary>
	    /// Creates a new <see cref="Builder"/> </summary>
	    /// <param name="ignoreCase"> if the input case should be ignored. </param>
	    public Builder(bool ignoreCase)
	    {
	        this.ignoreCase = ignoreCase;
	    }

	    /// <summary>
	    /// Adds an input string and its stemmer override output to this builder.
	    /// </summary>
	    /// <param name="input"> the input char sequence </param>
	    /// <param name="output"> the stemmer override output char sequence </param>
	    /// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns>
	    /// <exception cref="System.ArgumentNullException"> if <paramref name="input"/> or <paramref name="output"/> is <c>null</c> </exception>
	    public virtual bool Add(string input, string output)
	    {
	        if (input == null)
	        {
	            throw new System.ArgumentNullException(nameof(input));
	        }
	        if (output == null)
	        {
	            // Reject here: a null output would otherwise only fail later, inside Build().
	            throw new System.ArgumentNullException(nameof(output));
	        }

	        int length = input.Length;
	        if (ignoreCase)
	        {
	            // convert on the fly to lowercase, one code point at a time
	            charsSpare.Grow(length);
	            char[] buffer = charsSpare.Chars;
	            for (int i = 0; i < length;)
	            {
	                i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i)), buffer, i);
	            }
	            UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
	        }
	        else
	        {
	            UnicodeUtil.UTF16toUTF8(input.ToCharArray(), 0, length, spare);
	        }

	        // A negative result from the hash means the key was already present.
	        if (hash.Add(spare) >= 0)
	        {
	            outputValues.Add(output);
	            return true;
	        }
	        return false;
	    }

	    /// <summary>
	    /// Returns a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </summary>
	    /// <returns> a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </returns>
	    /// <exception cref="System.IO.IOException"> if an <see cref="System.IO.IOException"/> occurs; </exception>
	    public virtual StemmerOverrideMap Build()
	    {
	        ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
	        Builder<BytesRef> builder = new Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
	        // Keys are fed to the FST builder in the order produced by this sort.
	        int[] sort = hash.Sort(BytesRef.UTF8SortedAsUnicodeComparer);
	        Int32sRef intsSpare = new Int32sRef();
	        int size = hash.Count;
	        for (int i = 0; i < size; i++)
	        {
	            int id = sort[i];
	            BytesRef bytesRef = hash.Get(id, spare);
	            // The FST is built with BYTE4 input, so convert the UTF-8 key to UTF-32 code points.
	            UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
	            builder.Add(intsSpare, new BytesRef(outputValues[id]));
	        }
	        return new StemmerOverrideMap(builder.Finish(), ignoreCase);
	    }
	}
	}
	}