| using Lucene.Net.Analysis.Tokenattributes; |
| using System; |
| using System.Diagnostics; |
| using NUnit.Framework; |
| |
| namespace Lucene.Net.Analysis |
| { |
| using Lucene.Net.Support; |
| |
| //using RandomizedContext = com.carrotsearch.randomizedtesting.RandomizedContext; |
| using System.IO; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using CharacterRunAutomaton = Lucene.Net.Util.Automaton.CharacterRunAutomaton; |
| using RegExp = Lucene.Net.Util.Automaton.RegExp; |
| |
| /// <summary> |
| /// Tokenizer for testing. |
| /// <para/> |
| /// This tokenizer is a replacement for the <see cref="WHITESPACE"/>, <see cref="SIMPLE"/>, and <see cref="KEYWORD"/> |
| /// tokenizers. If you are writing a component such as a TokenFilter, it's a great idea to test |
| /// it by wrapping this tokenizer instead, for the extra checks. This tokenizer has the following behavior: |
| /// <ul> |
| /// <li>An internal state-machine is used for checking consumer consistency. These checks can |
| /// be disabled with <see cref="EnableChecks"/>.</li> |
| /// <li>For convenience, optionally lowercases terms that it outputs.</li> |
| /// </ul> |
| /// </summary> |
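| /// <example> |
| /// A minimal sketch of testing a TokenFilter by wrapping this tokenizer; |
| /// <c>MyTokenFilter</c> is a hypothetical filter under test, not part of this library: |
| /// <code> |
| /// TextReader reader = new StringReader("some test text"); |
| /// MockTokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); |
| /// TokenStream stream = new MyTokenFilter(tokenizer); // hypothetical filter under test |
| /// // consume as usual: Reset(), then IncrementToken() until false, then End() and Dispose() |
| /// </code> |
| /// </example> |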
| public class MockTokenizer : Tokenizer |
| { |
| /// <summary> |
| /// Acts similar to WhitespaceTokenizer. </summary> |
| public static readonly CharacterRunAutomaton WHITESPACE = new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").ToAutomaton()); |
| |
| /// <summary> |
| /// Acts similar to KeywordTokenizer. |
| /// TODO: Keyword returns an "empty" token for an empty reader... |
| /// </summary> |
| public static readonly CharacterRunAutomaton KEYWORD = new CharacterRunAutomaton(new RegExp(".*").ToAutomaton()); |
| |
| /// <summary> |
| /// Acts like LetterTokenizer. </summary> |
| // the ugly regex below is incomplete Unicode 5.2 [:Letter:] |
| public static readonly CharacterRunAutomaton SIMPLE = new CharacterRunAutomaton(new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁ一-鿌]+").ToAutomaton()); |
| |
| private readonly CharacterRunAutomaton RunAutomaton; |
| private readonly bool LowerCase; |
| private readonly int MaxTokenLength; |
| public static readonly int DEFAULT_MAX_TOKEN_LENGTH = int.MaxValue; |
| private int state; |
| |
| private readonly ICharTermAttribute TermAtt; |
| private readonly IOffsetAttribute OffsetAtt; |
| internal int Off = 0; |
| |
| // buffered state (previous codepoint and offset). we replay this once we |
| // hit a reject state, in case it's permissible as the start of a new term. |
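| // e.g. with SIMPLE and input "ab5cd": '5' rejects "ab" and is buffered; on the next |
| // IncrementToken() it is replayed, rejected as a token start, and "cd" is then emitted. |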
| internal int BufferedCodePoint = -1; // -1 indicates empty buffer |
| |
| internal int BufferedOff = -1; |
| |
| // TODO: "register" with LuceneTestCase to ensure all streams are closed() ? |
| // currently, we can only check that the lifecycle is correct if someone is reusing, |
| // but not for "one-offs". |
| private enum State |
| { |
| SETREADER, // consumer set a reader input either via ctor or via SetReader(TextReader) |
| RESET, // consumer has called reset() |
| INCREMENT, // consumer is consuming, has called IncrementToken() == true |
| INCREMENT_FALSE, // consumer has called IncrementToken() which returned false |
| END, // consumer has called end() to perform end of stream operations |
| CLOSE // consumer has called close() to release any resources |
| } |
| |
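| // Expected lifecycle (roughly): SETREADER -> RESET -> INCREMENT* -> INCREMENT_FALSE -> END -> CLOSE, |
| // then back to SETREADER when the tokenizer is reused on a new reader. |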
| private State StreamState = State.CLOSE; |
| private int LastOffset = 0; // only for asserting |
| private bool EnableChecks_Renamed = true; |
| |
| // evil: but we don't change the behavior with this random, we only switch up how we read |
| private readonly Random Random = new Random(/*RandomizedContext.Current.Random.nextLong()*/); |
| |
| public MockTokenizer(AttributeFactory factory, TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength) |
| : base(factory, input) |
| { |
| this.RunAutomaton = runAutomaton; |
| this.LowerCase = lowerCase; |
| this.state = runAutomaton.InitialState; |
| this.StreamState = State.SETREADER; |
| this.MaxTokenLength = maxTokenLength; |
| TermAtt = AddAttribute<ICharTermAttribute>(); |
| OffsetAtt = AddAttribute<IOffsetAttribute>(); |
| } |
| |
| public MockTokenizer(TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength) |
| : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, runAutomaton, lowerCase, maxTokenLength) |
| { |
| } |
| |
| public MockTokenizer(TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase) |
| : this(input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH) |
| { |
| } |
| |
| /// <summary> |
| /// Calls <see cref="MockTokenizer(TextReader, CharacterRunAutomaton, bool)"/> with <c>MockTokenizer(input, WHITESPACE, true)</c>. </summary> |
| public MockTokenizer(TextReader input) |
| : this(input, WHITESPACE, true) |
| { |
| } |
| |
| public MockTokenizer(AttributeFactory factory, TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase) |
| : this(factory, input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH) |
| { |
| } |
| |
| /// <summary> |
| /// Calls <see cref="MockTokenizer(AttributeFactory, TextReader, CharacterRunAutomaton, bool)"/> |
| /// with <c>MockTokenizer(factory, input, WHITESPACE, true)</c>. |
| /// </summary> |
| public MockTokenizer(AttributeFactory factory, TextReader input) |
| : this(factory, input, WHITESPACE, true) |
| { |
| } |
| |
| public sealed override bool IncrementToken() |
| { |
| //Debug.Assert(!EnableChecks_Renamed || (StreamState == State.RESET || StreamState == State.INCREMENT), "IncrementToken() called while in wrong state: " + StreamState); |
| ClearAttributes(); |
| for (; ; ) |
| { |
| int startOffset; |
| int cp; |
| if (BufferedCodePoint >= 0) |
| { |
| cp = BufferedCodePoint; |
| startOffset = BufferedOff; |
| BufferedCodePoint = -1; |
| } |
| else |
| { |
| startOffset = Off; |
| cp = ReadCodePoint(); |
| } |
| if (cp < 0) |
| { |
| break; |
| } |
| else if (IsTokenChar(cp)) |
| { |
| int endOffset; |
| do |
| { |
| char[] chars = Character.ToChars(Normalize(cp)); |
| for (int i = 0; i < chars.Length; i++) |
| { |
| TermAtt.Append(chars[i]); |
| } |
| endOffset = Off; |
| if (TermAtt.Length >= MaxTokenLength) |
| { |
| break; |
| } |
| cp = ReadCodePoint(); |
| } while (cp >= 0 && IsTokenChar(cp)); |
| |
| if (TermAtt.Length < MaxTokenLength) |
| { |
| // buffer up, in case the "rejected" char can start a new word of its own |
| BufferedCodePoint = cp; |
| BufferedOff = endOffset; |
| } |
| else |
| { |
| // otherwise, it's because we hit the term limit. |
| BufferedCodePoint = -1; |
| } |
| int correctedStartOffset = CorrectOffset(startOffset); |
| int correctedEndOffset = CorrectOffset(endOffset); |
| Assert.True(correctedStartOffset >= 0); |
| Assert.True(correctedEndOffset >= 0); |
| Assert.True(correctedStartOffset >= LastOffset); |
| LastOffset = correctedStartOffset; |
| Assert.True(correctedEndOffset >= correctedStartOffset); |
| OffsetAtt.SetOffset(correctedStartOffset, correctedEndOffset); |
| if (state == -1 || RunAutomaton.IsAccept(state)) |
| { |
| // either we hit a reject state (longest match), or end-of-text, but in an accept state |
| StreamState = State.INCREMENT; |
| return true; |
| } |
| } |
| } |
| StreamState = State.INCREMENT_FALSE; |
| return false; |
| } |
| |
| protected internal virtual int ReadCodePoint() |
| { |
| int ch = ReadChar(); |
| if (ch < 0) |
| { |
| return ch; |
| } |
| else |
| { |
| Assert.True(!char.IsLowSurrogate((char)ch), "unpaired low surrogate: " + ch.ToString("x")); |
| Off++; |
| if (char.IsHighSurrogate((char)ch)) |
| { |
| int ch2 = ReadChar(); |
| if (ch2 >= 0) |
| { |
| Off++; |
| Assert.True(char.IsLowSurrogate((char)ch2), "unpaired high surrogate: " + ch.ToString("x") + ", followed by: " + ch2.ToString("x")); |
| return Character.ToCodePoint((char)ch, (char)ch2); |
| } |
| else |
| { |
| Assert.True(false, "stream ends with unpaired high surrogate: " + ch.ToString("x")); |
| } |
| } |
| return ch; |
| } |
| } |
| |
| protected internal virtual int ReadChar() |
| { |
| switch (Random.Next(0, 10)) |
| { |
| case 0: |
| { |
| // read(char[]) |
| char[] c = new char[1]; |
| int ret = input.Read(c, 0, c.Length); |
| return ret <= 0 ? -1 : c[0]; |
| } |
| case 1: |
| { |
| // read(char[], int, int) |
| char[] c = new char[2]; |
| int ret = input.Read(c, 1, 1); |
| return ret <= 0 ? -1 : c[1]; |
| } |
| /* LUCENE TO-DO not sure if needed, CharBuffer not supported |
| case 2: |
| { |
| // read(CharBuffer) |
| char[] c = new char[1]; |
| CharBuffer cb = CharBuffer.Wrap(c); |
| int ret = Input.Read(cb); |
| return ret < 0 ? ret : c[0]; |
| }*/ |
| default: |
| // read() |
| return input.Read(); |
| } |
| } |
| |
| protected internal virtual bool IsTokenChar(int c) |
| { |
| if (state < 0) |
| { |
| state = RunAutomaton.InitialState; |
| } |
| state = RunAutomaton.Step(state, c); |
| return state >= 0; |
| } |
| |
| protected internal virtual int Normalize(int c) |
| { |
| return LowerCase ? Character.ToLowerCase(c) : c; |
| } |
| |
| public override void Reset() |
| { |
| base.Reset(); |
| state = RunAutomaton.InitialState; |
| LastOffset = Off = 0; |
| BufferedCodePoint = -1; |
| Assert.True(!EnableChecks_Renamed || StreamState != State.RESET, "double reset()"); |
| StreamState = State.RESET; |
| } |
| |
| public override void Dispose() |
| { |
| base.Dispose(); |
| // in some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely close() |
| // these tests should disable this check, by default we check the normal workflow. |
| // TODO: investigate the CachingTokenFilter "double-close"... for now we ignore this |
| Assert.True(!EnableChecks_Renamed || StreamState == State.END || StreamState == State.CLOSE, "close() called in wrong state: " + StreamState); |
| StreamState = State.CLOSE; |
| } |
| |
| internal bool SetReaderTestPoint() |
| { |
| Assert.True(!EnableChecks_Renamed || StreamState == State.CLOSE, "setReader() called in wrong state: " + StreamState); |
| StreamState = State.SETREADER; |
| return true; |
| } |
| |
| public override void End() |
| { |
| base.End(); |
| int finalOffset = CorrectOffset(Off); |
| OffsetAtt.SetOffset(finalOffset, finalOffset); |
| // some tokenizers, such as limiting tokenizers, call end() before IncrementToken() returns false. |
| // these tests should disable this check (in general you should consume the entire stream) |
| try |
| { |
| //Debug.Assert(!EnableChecks_Renamed || StreamState == State.INCREMENT_FALSE, "end() called before IncrementToken() returned false!"); |
| } |
| finally |
| { |
| StreamState = State.END; |
| } |
| } |
| |
| /// <summary> |
| /// Toggle consumer workflow checking: if your test consumes token streams normally, you |
| /// should leave this enabled. |
| /// </summary> |
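| /// <remarks> |
| /// A minimal sketch of how a test that deliberately violates the workflow |
| /// (e.g. by calling End() before the stream is exhausted) might disable the checks: |
| /// <code> |
| /// MockTokenizer tokenizer = new MockTokenizer(new StringReader("some text")); |
| /// tokenizer.EnableChecks = false; // this consumer knowingly breaks the contract |
| /// </code> |
| /// </remarks> |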
| public virtual bool EnableChecks |
| { |
| set |
| { |
| this.EnableChecks_Renamed = value; |
| } |
| } |
| } |
| } |