// blob: 9cf83b7ca25101612554cd58061ab312c935bdf5 [file] [log] [blame]
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
namespace Lucene.Net.Analysis.Synonym
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// <see cref="SynonymFilter"/> handles multi-token synonyms with variable position increment offsets.
/// <para>
/// The matched tokens from the input stream may be optionally passed through (includeOrig=true)
/// or discarded. If the original tokens are included, the position increments may be modified
/// to retain absolute positions after merging with the synonym tokenstream.
/// </para>
/// <para>
/// Generated synonyms will start at the same position as the first matched source token.
/// </para>
/// </summary>
/// @deprecated (3.4) use SynonymFilterFactory instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0
[Obsolete("(3.4) use SynonymFilterFactory instead. only for precise index backwards compatibility. this factory will be removed in Lucene 5.0")]
internal sealed class SlowSynonymFilter : TokenFilter
{
    private readonly SlowSynonymMap map; // root of the synonym trie (Map<String, SynonymMap>)
    private IEnumerator<AttributeSource> replacement; // iterator over generated tokens

    //
    // Defer creation of the buffer until the first time it is used to
    // optimize short fields with no matches.
    //
    private LinkedList<AttributeSource> buffer;  // tokens read ahead of the stream, pushed back by a failed match
    private LinkedList<AttributeSource> matched; // tokens consumed by the current (longest) match attempt
    private bool exhausted;                      // true once the wrapped stream has returned false

    /// <summary>
    /// Creates a filter over <paramref name="in"/> that rewrites token sequences
    /// according to <paramref name="map"/>.
    /// </summary>
    /// <param name="in">the wrapped token stream</param>
    /// <param name="map">root of the synonym mapping; must not be <c>null</c></param>
    /// <exception cref="ArgumentNullException"><paramref name="map"/> is <c>null</c></exception>
    public SlowSynonymFilter(TokenStream @in, SlowSynonymMap map)
        : base(@in)
    {
        if (map == null)
        {
            // ArgumentNullException derives from ArgumentException, so callers
            // that caught the previously thrown ArgumentException still work.
            throw new ArgumentNullException("map", "map is required");
        }
        this.map = map;
        // just ensuring these attributes exist...
        AddAttribute<ICharTermAttribute>();
        AddAttribute<IPositionIncrementAttribute>();
        AddAttribute<IOffsetAttribute>();
        AddAttribute<ITypeAttribute>();
    }

    /*
     * Need to worry about multiple scenarios:
     *  - need to go for the longest match
     *    a b => foo      #shouldn't match if "a b" is followed by "c d"
     *    a b c d => bar
     *  - need to backtrack - retry matches for tokens already read
     *     a b c d => foo
     *       b c => bar
     *     If the input stream is "a b c x", one will consume "a b c d"
     *     trying to match the first rule... all but "a" should be
     *     pushed back so a match may be made on "b c".
     *  - don't try and match generated tokens (thus need separate queue)
     *    matching is not recursive.
     *  - handle optional generation of original tokens in all these cases,
     *    merging token streams to preserve token positions.
     *  - preserve original positionIncrement of first matched token
     */
    public override bool IncrementToken()
    {
        while (true)
        {
            // if there are any generated tokens, return them... don't try any
            // matches against them, as we specifically don't want recursion.
            if (replacement != null && replacement.MoveNext())
            {
                Copy(this, replacement.Current);
                return true;
            }

            // common case fast-path of first token not matching anything
            AttributeSource firstTok = NextTok();
            if (firstTok == null)
            {
                return false;
            }
            var termAtt = firstTok.AddAttribute<ICharTermAttribute>();
            SlowSynonymMap result = map.Submap != null ? map.Submap.Get(termAtt.Buffer, 0, termAtt.Length) : null;
            if (result == null)
            {
                Copy(this, firstTok);
                return true;
            }

            // fast-path failed, clone ourselves if needed
            if (firstTok == this)
            {
                firstTok = CloneAttributes();
            }
            // OK, we matched a token, so find the longest match.
            matched = new LinkedList<AttributeSource>();
            result = Match(result);
            if (result == null)
            {
                // no match, simply return the first token read.
                Copy(this, firstTok);
                return true;
            }

            // reuse, or create new one each time?
            List<AttributeSource> generated = new List<AttributeSource>(result.Synonyms.Length + matched.Count + 1);

            //
            // there was a match... let's generate the new tokens, merging
            // in the matched tokens (position increments need adjusting)
            //
            AttributeSource lastTok = matched.Count == 0 ? firstTok : matched.Last.Value;
            bool includeOrig = result.IncludeOrig;

            AttributeSource origTok = includeOrig ? firstTok : null;
            IPositionIncrementAttribute firstPosIncAtt = firstTok.AddAttribute<IPositionIncrementAttribute>();
            int origPos = firstPosIncAtt.PositionIncrement; // position of origTok in the original stream
            int repPos = 0; // curr position in replacement token stream
            int pos = 0; // current position in merged token stream

            for (int i = 0; i < result.Synonyms.Length; i++)
            {
                Token repTok = result.Synonyms[i];
                AttributeSource newTok = firstTok.CloneAttributes();
                ICharTermAttribute newTermAtt = newTok.AddAttribute<ICharTermAttribute>();
                IOffsetAttribute newOffsetAtt = newTok.AddAttribute<IOffsetAttribute>();
                IPositionIncrementAttribute newPosIncAtt = newTok.AddAttribute<IPositionIncrementAttribute>();

                // generated tokens start at the first matched token's offset and
                // end at the last matched token's offset
                IOffsetAttribute lastOffsetAtt = lastTok.AddAttribute<IOffsetAttribute>();
                newOffsetAtt.SetOffset(newOffsetAtt.StartOffset, lastOffsetAtt.EndOffset);
                newTermAtt.CopyBuffer(repTok.Buffer, 0, repTok.Length);
                repPos += repTok.PositionIncrement;
                if (i == 0) // make position of first token equal to original
                {
                    repPos = origPos;
                }

                // if necessary, insert original tokens and adjust position increment
                while (origTok != null && origPos <= repPos)
                {
                    IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                    origPosInc.PositionIncrement = origPos - pos;
                    generated.Add(origTok);
                    pos += origPosInc.PositionIncrement;
                    origTok = RemoveFirstMatched();
                    if (origTok != null)
                    {
                        origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                        origPos += origPosInc.PositionIncrement;
                    }
                }

                newPosIncAtt.PositionIncrement = repPos - pos;
                generated.Add(newTok);
                pos += newPosIncAtt.PositionIncrement;
            }

            // finish up any leftover original tokens
            while (origTok != null)
            {
                IPositionIncrementAttribute origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                origPosInc.PositionIncrement = origPos - pos;
                generated.Add(origTok);
                pos += origPosInc.PositionIncrement;
                origTok = RemoveFirstMatched();
                if (origTok != null)
                {
                    origPosInc = origTok.AddAttribute<IPositionIncrementAttribute>();
                    origPos += origPosInc.PositionIncrement;
                }
            }

            // what if we replaced a longer sequence with a shorter one?
            // a/0 b/5 => foo/0
            // should I re-create the gap on the next buffered token?

            replacement = generated.GetEnumerator();
            // Now return to the top of the loop to read and return the first
            // generated token.. The reason this is done is that we may have generated
            // nothing at all, and may need to continue with more matching logic.
        }
    }

    /// <summary>
    /// Removes and returns the first queued matched token, or <c>null</c> when
    /// none remain. (Pop-front on <see cref="matched"/>.)
    /// </summary>
    private AttributeSource RemoveFirstMatched()
    {
        if (matched.Count == 0)
        {
            return null;
        }
        AttributeSource first = matched.First.Value;
        matched.RemoveFirst();
        return first;
    }

    /// <summary>
    /// Returns the next token to consider for matching: a previously pushed-back
    /// token if any are buffered, otherwise <c>this</c> after advancing the
    /// wrapped stream. Returns <c>null</c> once the input is exhausted.
    /// </summary>
    private AttributeSource NextTok()
    {
        if (buffer != null && buffer.Count > 0)
        {
            AttributeSource first = buffer.First.Value;
            buffer.RemoveFirst();
            return first;
        }
        if (!exhausted && m_input.IncrementToken())
        {
            return this;
        }
        exhausted = true;
        return null;
    }

    /// <summary>Pushes an unmatched token back so it is re-read by <see cref="NextTok()"/>.</summary>
    private void PushTok(AttributeSource t)
    {
        if (buffer == null)
        {
            buffer = new LinkedList<AttributeSource>();
        }
        buffer.AddFirst(t);
    }

    /// <summary>
    /// Recursively consumes tokens, walking down the synonym trie to find the
    /// longest matching sequence. Consumed tokens are appended to
    /// <see cref="matched"/>; tokens past the match are pushed back.
    /// </summary>
    /// <returns>the deepest matching node with synonyms, or <c>null</c> if no rule matched</returns>
    private SlowSynonymMap Match(SlowSynonymMap map)
    {
        SlowSynonymMap result = null;

        if (map.Submap != null)
        {
            AttributeSource tok = NextTok();
            if (tok != null)
            {
                // clone ourselves.
                if (tok == this)
                {
                    tok = CloneAttributes();
                }
                // check for positionIncrement!=1?  if>1, should not match, if==0, check multiple at this level?
                var termAtt = tok.GetAttribute<ICharTermAttribute>();
                SlowSynonymMap subMap = map.Submap.Get(termAtt.Buffer, 0, termAtt.Length);

                if (subMap != null)
                {
                    // recurse
                    result = Match(subMap);
                }

                if (result != null)
                {
                    // unwinding: prepend so matched ends up in stream order
                    matched.AddFirst(tok);
                }
                else
                {
                    // push back unmatched token
                    PushTok(tok);
                }
            }
        }

        // if no longer sequence matched, so if this node has synonyms, it's the match.
        if (result == null && map.Synonyms != null)
        {
            result = map;
        }

        return result;
    }

    /// <summary>Copies all attribute state from <paramref name="source"/> to <paramref name="target"/>.</summary>
    private void Copy(AttributeSource target, AttributeSource source)
    {
        if (target != source)
        {
            source.CopyTo(target);
        }
    }

    /// <summary>
    /// Resets this filter (and the wrapped stream) so it can be reused.
    /// </summary>
    public override void Reset()
    {
        m_input.Reset();
        replacement = null;
        exhausted = false;
        // Discard tokens pushed back by a match attempt that was abandoned
        // before the previous stream was fully consumed; otherwise they would
        // be replayed into the fresh stream.
        if (buffer != null)
        {
            buffer.Clear();
        }
    }
}
}