src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilter.cs - lucenenet - Git at Google

 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Util;
 using Lucene.Net.Util.Fst;
 using System;
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.IO;

 namespace Lucene.Net.Analysis.CharFilters
 {
     /*
 	 * Licensed to the Apache Software Foundation (ASF) under one or more
 	 * contributor license agreements.  See the NOTICE file distributed with
 	 * this work for additional information regarding copyright ownership.
 	 * The ASF licenses this file to You under the Apache License, Version 2.0
 	 * (the "License"); you may not use this file except in compliance with
 	 * the License.  You may obtain a copy of the License at
 	 *
 	 *     http://www.apache.org/licenses/LICENSE-2.0
 	 *
 	 * Unless required by applicable law or agreed to in writing, software
 	 * distributed under the License is distributed on an "AS IS" BASIS,
 	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 	 * See the License for the specific language governing permissions and
 	 * limitations under the License.
 	 */

     /// <summary>
     /// Simplistic <see cref="CharFilter"/> that applies the mappings
     /// contained in a <see cref="NormalizeCharMap"/> to the character
     /// stream, and correcting the resulting changes to the
     /// offsets.  Matching is greedy (longest pattern matching at
     /// a given point wins).  Replacement is allowed to be the
     /// empty string.
     /// </summary>
     public class MappingCharFilter : BaseCharFilter
     {
         private readonly Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
         private readonly FST<CharsRef> map;
         private readonly FST.BytesReader fstReader;
         private readonly RollingCharBuffer buffer = new RollingCharBuffer();
         private readonly FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>();
         private readonly IDictionary<char?, FST.Arc<CharsRef>> cachedRootArcs;

         private CharsRef replacement;
         private int replacementPointer;
         private int inputOff;

         /// <summary>
         /// LUCENENET specific support to buffer the reader.
         /// </summary>
         private readonly BufferedCharFilter _input;

         /// <summary>
         /// Default constructor that takes a <see cref="TextReader"/>. </summary>
         public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
             : base(@in)
         {
             //LUCENENET support to reset the reader.
             _input = GetBufferedReader(@in);
             _input.Mark(BufferedCharFilter.DEFAULT_CHAR_BUFFER_SIZE);
             buffer.Reset(_input);
             //buffer.Reset(@in);

             map = normMap.map;
             cachedRootArcs = normMap.cachedRootArcs;

             if (map != null)
             {
                 fstReader = map.GetBytesReader();
             }
             else
             {
                 fstReader = null;
             }
         }

         /// <summary>
         /// LUCENENET: Copied this method from the <see cref="WordlistLoader"/> class - this class requires readers
         /// with a Reset() method (which .NET readers don't support). So, we use the <see cref="BufferedCharFilter"/>
         /// (which is similar to Java BufferedReader) as a wrapper for whatever reader the user passes
         /// (unless it is already a <see cref="BufferedCharFilter"/>).
         /// </summary>
         /// <param name="reader"></param>
         /// <returns></returns>
         private static BufferedCharFilter GetBufferedReader(TextReader reader)
         {
             return (reader is BufferedCharFilter) ? (BufferedCharFilter)reader : new BufferedCharFilter(reader);
         }

         public override void Reset()
         {
             // LUCENENET: reset the BufferedCharFilter.
             _input.Reset();
             buffer.Reset(_input);
             replacement = null;
             inputOff = 0;
         }

         public override int Read()
         {

             //System.out.println("\nread");
             while (true)
             {

                 if (replacement != null && replacementPointer < replacement.Length)
                 {
                     //System.out.println("  return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
                     return replacement.Chars[replacement.Offset + replacementPointer++];
                 }

                 // TODO: a more efficient approach would be Aho/Corasick's
                 // algorithm
                 // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
                 // or this generalizatio: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
                 //
                 // I think this would be (almost?) equivalent to 1) adding
                 // epsilon arcs from all final nodes back to the init
                 // node in the FST, 2) adding a .* (skip any char)
                 // loop on the initial node, and 3) determinizing
                 // that.  Then we would not have to Restart matching
                 // at each position.

                 int lastMatchLen = -1;
                 CharsRef lastMatch = null;

                 int firstCH = buffer.Get(inputOff);
                 if (firstCH != -1)
                 {
                     // LUCENENET fix: Check the dictionary to ensure it contains a key before reading it.
                     char key = Convert.ToChar((char)firstCH);
                     if (cachedRootArcs.TryGetValue(key, out FST.Arc<CharsRef> arc) && arc != null)
                     {
                         if (!FST.TargetHasArcs(arc))
                         {
                             // Fast pass for single character match:
                             Debug.Assert(arc.IsFinal);
                             lastMatchLen = 1;
                             lastMatch = arc.Output;
                         }
                         else
                         {
                             int lookahead = 0;
                             CharsRef output = arc.Output;
                             while (true)
                             {
                                 lookahead++;

                                 if (arc.IsFinal)
                                 {
                                     // Match! (to node is final)
                                     lastMatchLen = lookahead;
                                     lastMatch = outputs.Add(output, arc.NextFinalOutput);
                                     // Greedy: keep searching to see if there's a
                                     // longer match...
                                 }

                                 if (!FST.TargetHasArcs(arc))
                                 {
                                     break;
                                 }

                                 int ch = buffer.Get(inputOff + lookahead);
                                 if (ch == -1)
                                 {
                                     break;
                                 }
                                 if ((arc = map.FindTargetArc(ch, arc, scratchArc, fstReader)) == null)
                                 {
                                     // Dead end
                                     break;
                                 }
                                 output = outputs.Add(output, arc.Output);
                             }
                         }
                     }
                 }

                 if (lastMatch != null)
                 {
                     inputOff += lastMatchLen;
                     //System.out.println("  match!  len=" + lastMatchLen + " repl=" + lastMatch);
                     int diff = lastMatchLen - lastMatch.Length;

                     if (diff != 0)
                     {
                         int prevCumulativeDiff = LastCumulativeDiff;
                         if (diff > 0)
                         {
                             // Replacement is shorter than matched input:
                             AddOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
                         }
                         else
                         {
                             // Replacement is longer than matched input: remap
                             // the "extra" chars all back to the same input
                             // offset:
                             int outputStart = inputOff - prevCumulativeDiff;
                             for (int extraIDX = 0; extraIDX < -diff; extraIDX++)
                             {
                                 AddOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
                             }
                         }
                     }

                     replacement = lastMatch;
                     replacementPointer = 0;

                 }
                 else
                 {
                     int ret = buffer.Get(inputOff);
                     if (ret != -1)
                     {
                         inputOff++;
                         buffer.FreeBefore(inputOff);
                     }
                     return ret;
                 }
             }
         }

         public override int Read(char[] cbuf, int off, int len)
         {
             int numRead = 0;
             for (int i = off; i < off + len; i++)
             {
                 int c = Read();
                 if (c == -1)
                 {
                     break;
                 }
                 cbuf[i] = (char)c;
                 numRead++;
             }

             return numRead == 0 ? -1 : numRead;
         }
     }
 }
	using Lucene.Net.Analysis.Util;
	using Lucene.Net.Util;
	using Lucene.Net.Util.Fst;
	using System;
	using System.Collections.Generic;
	using System.Diagnostics;
	using System.IO;

	namespace Lucene.Net.Analysis.CharFilters
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Simplistic <see cref="CharFilter"/> that applies the mappings
	/// contained in a <see cref="NormalizeCharMap"/> to the character
	/// stream, and correcting the resulting changes to the
	/// offsets. Matching is greedy (longest pattern matching at
	/// a given point wins). Replacement is allowed to be the
	/// empty string.
	/// </summary>
	public class MappingCharFilter : BaseCharFilter
	{
	private readonly Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
	private readonly FST<CharsRef> map;
	private readonly FST.BytesReader fstReader;
	private readonly RollingCharBuffer buffer = new RollingCharBuffer();
	private readonly FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>();
	private readonly IDictionary<char?, FST.Arc<CharsRef>> cachedRootArcs;

	private CharsRef replacement;
	private int replacementPointer;
	private int inputOff;

	/// <summary>
	/// LUCENENET specific support to buffer the reader.
	/// </summary>
	private readonly BufferedCharFilter _input;

	/// <summary>
	/// Default constructor that takes a <see cref="TextReader"/>. </summary>
	public MappingCharFilter(NormalizeCharMap normMap, TextReader @in)
	: base(@in)
	{
	//LUCENENET support to reset the reader.
	_input = GetBufferedReader(@in);
	_input.Mark(BufferedCharFilter.DEFAULT_CHAR_BUFFER_SIZE);
	buffer.Reset(_input);
	//buffer.Reset(@in);

	map = normMap.map;
	cachedRootArcs = normMap.cachedRootArcs;

	if (map != null)
	{
	fstReader = map.GetBytesReader();
	}
	else
	{
	fstReader = null;
	}
	}

	/// <summary>
	/// LUCENENET: Copied this method from the <see cref="WordlistLoader"/> class - this class requires readers
	/// with a Reset() method (which .NET readers don't support). So, we use the <see cref="BufferedCharFilter"/>
	/// (which is similar to Java BufferedReader) as a wrapper for whatever reader the user passes
	/// (unless it is already a <see cref="BufferedCharFilter"/>).
	/// </summary>
	/// <param name="reader"></param>
	/// <returns></returns>
	private static BufferedCharFilter GetBufferedReader(TextReader reader)
	{
	return (reader is BufferedCharFilter) ? (BufferedCharFilter)reader : new BufferedCharFilter(reader);
	}

	public override void Reset()
	{
	// LUCENENET: reset the BufferedCharFilter.
	_input.Reset();
	buffer.Reset(_input);
	replacement = null;
	inputOff = 0;
	}

	public override int Read()
	{

	//System.out.println("\nread");
	while (true)
	{

	if (replacement != null && replacementPointer < replacement.Length)
	{
	//System.out.println(" return repl[" + replacementPointer + "]=" + replacement.chars[replacement.offset + replacementPointer]);
	return replacement.Chars[replacement.Offset + replacementPointer++];
	}

	// TODO: a more efficient approach would be Aho/Corasick's
	// algorithm
	// (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
	// or this generalizatio: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
	//
	// I think this would be (almost?) equivalent to 1) adding
	// epsilon arcs from all final nodes back to the init
	// node in the FST, 2) adding a .* (skip any char)
	// loop on the initial node, and 3) determinizing
	// that. Then we would not have to Restart matching
	// at each position.

	int lastMatchLen = -1;
	CharsRef lastMatch = null;

	int firstCH = buffer.Get(inputOff);
	if (firstCH != -1)
	{
	// LUCENENET fix: Check the dictionary to ensure it contains a key before reading it.
	char key = Convert.ToChar((char)firstCH);
	if (cachedRootArcs.TryGetValue(key, out FST.Arc<CharsRef> arc) && arc != null)
	{
	if (!FST.TargetHasArcs(arc))
	{
	// Fast pass for single character match:
	Debug.Assert(arc.IsFinal);
	lastMatchLen = 1;
	lastMatch = arc.Output;
	}
	else
	{
	int lookahead = 0;
	CharsRef output = arc.Output;
	while (true)
	{
	lookahead++;

	if (arc.IsFinal)
	{
	// Match! (to node is final)
	lastMatchLen = lookahead;
	lastMatch = outputs.Add(output, arc.NextFinalOutput);
	// Greedy: keep searching to see if there's a
	// longer match...
	}

	if (!FST.TargetHasArcs(arc))
	{
	break;
	}

	int ch = buffer.Get(inputOff + lookahead);
	if (ch == -1)
	{
	break;
	}
	if ((arc = map.FindTargetArc(ch, arc, scratchArc, fstReader)) == null)
	{
	// Dead end
	break;
	}
	output = outputs.Add(output, arc.Output);
	}
	}
	}
	}

	if (lastMatch != null)
	{
	inputOff += lastMatchLen;
	//System.out.println(" match! len=" + lastMatchLen + " repl=" + lastMatch);
	int diff = lastMatchLen - lastMatch.Length;

	if (diff != 0)
	{
	int prevCumulativeDiff = LastCumulativeDiff;
	if (diff > 0)
	{
	// Replacement is shorter than matched input:
	AddOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
	}
	else
	{
	// Replacement is longer than matched input: remap
	// the "extra" chars all back to the same input
	// offset:
	int outputStart = inputOff - prevCumulativeDiff;
	for (int extraIDX = 0; extraIDX < -diff; extraIDX++)
	{
	AddOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
	}
	}
	}

	replacement = lastMatch;
	replacementPointer = 0;

	}
	else
	{
	int ret = buffer.Get(inputOff);
	if (ret != -1)
	{
	inputOff++;
	buffer.FreeBefore(inputOff);
	}
	return ret;
	}
	}
	}

	public override int Read(char[] cbuf, int off, int len)
	{
	int numRead = 0;
	for (int i = off; i < off + len; i++)
	{
	int c = Read();
	if (c == -1)
	{
	break;
	}
	cbuf[i] = (char)c;
	numRead++;
	}

	return numRead == 0 ? -1 : numRead;
	}
	}
	}