using Lucene.Net.Analysis.Tokenattributes;
using System;
using System.Diagnostics;
using NUnit.Framework;
namespace Lucene.Net.Analysis
{
using Lucene.Net.Support;
//using RandomizedContext = com.carrotsearch.randomizedtesting.RandomizedContext;
using System.IO;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using CharacterRunAutomaton = Lucene.Net.Util.Automaton.CharacterRunAutomaton;
using RegExp = Lucene.Net.Util.Automaton.RegExp;
/// <summary>
/// Tokenizer for testing.
/// <para/>
/// This tokenizer is a replacement for the <seealso cref="WHITESPACE"/>, <seealso cref="SIMPLE"/>, and <seealso cref="KEYWORD"/>
/// tokenizers. If you are writing a component such as a TokenFilter, it's a good idea to test
/// it by wrapping this tokenizer instead, for the extra checks. This tokenizer has the following behavior:
/// <ul>
/// <li>An internal state-machine is used for checking consumer consistency. These checks can
/// be disabled with <seealso cref="EnableChecks"/>.</li>
/// <li>For convenience, it optionally lowercases the terms it outputs.</li>
/// </ul>
/// </summary>
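/// <example>
/// A minimal usage sketch: wrap this tokenizer with the component under test and consume
/// the stream with the full reset/increment/end/dispose workflow (MyTokenFilter below is a
/// hypothetical filter standing in for whatever you are testing):
/// <code>
/// var tokenizer = new MockTokenizer(new StringReader("some test text"), MockTokenizer.WHITESPACE, true);
/// TokenStream stream = new MyTokenFilter(tokenizer); // hypothetical filter under test
/// stream.Reset();
/// while (stream.IncrementToken())
/// {
///     // inspect attributes here, e.g. stream.GetAttribute&lt;ICharTermAttribute&gt;()
/// }
/// stream.End();
/// stream.Dispose();
/// </code>
/// </example>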
public class MockTokenizer : Tokenizer
{
/// <summary>
/// Acts similarly to WhitespaceTokenizer. </summary>
public static readonly CharacterRunAutomaton WHITESPACE = new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").ToAutomaton());
/// <summary>
/// Acts similarly to KeywordTokenizer.
/// TODO: Keyword returns an "empty" token for an empty reader...
/// </summary>
public static readonly CharacterRunAutomaton KEYWORD = new CharacterRunAutomaton(new RegExp(".*").ToAutomaton());
/// <summary>
/// Acts like LetterTokenizer. </summary>
// The ugly regex below is an incomplete approximation of Unicode 5.2 [:Letter:]
public static readonly CharacterRunAutomaton SIMPLE = new CharacterRunAutomaton(new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁ一-鿌]+").ToAutomaton());
private readonly CharacterRunAutomaton RunAutomaton;
private readonly bool LowerCase;
private readonly int MaxTokenLength;
public static readonly int DEFAULT_MAX_TOKEN_LENGTH = int.MaxValue;
private int state;
private readonly ICharTermAttribute TermAtt;
private readonly IOffsetAttribute OffsetAtt;
internal int Off = 0;
// Buffered state (previous codepoint and offset). We replay this once we
// hit a reject state, in case it's permissible as the start of a new term.
internal int BufferedCodePoint = -1; // -1 indicates empty buffer
internal int BufferedOff = -1;
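// For example, with the SIMPLE automaton and input "ab1cd", the '1' drives the automaton
// into a reject state: "ab" is emitted, and '1' is buffered and replayed at the start of
// the next IncrementToken() call, where it is rejected again and 'c' begins "cd".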
// TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
// currently, we can only check that the lifecycle is correct if someone is reusing,
// but not for "one-offs".
private enum State
{
SETREADER, // consumer set a reader input either via ctor or via reset(Reader)
RESET, // consumer has called reset()
INCREMENT, // consumer is consuming, has called IncrementToken() == true
INCREMENT_FALSE, // consumer has called IncrementToken() which returned false
END, // consumer has called end() to perform end of stream operations
CLOSE // consumer has called close() to release any resources
}
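// Expected lifecycle: SETREADER -> RESET -> INCREMENT* -> INCREMENT_FALSE -> END -> CLOSE,
// then back to SETREADER when the tokenizer is reused on a new reader.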
private State StreamState = State.CLOSE;
private int LastOffset = 0; // only for asserting
private bool enableChecks = true;
// evil: but we don't change the behavior with this random, we only switch up how we read
private readonly Random Random = new Random(/*RandomizedContext.Current.Random.nextLong()*/);
public MockTokenizer(AttributeFactory factory, TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength)
: base(factory, input)
{
this.RunAutomaton = runAutomaton;
this.LowerCase = lowerCase;
this.state = runAutomaton.InitialState;
this.StreamState = State.SETREADER;
this.MaxTokenLength = maxTokenLength;
TermAtt = AddAttribute<ICharTermAttribute>();
OffsetAtt = AddAttribute<IOffsetAttribute>();
}
public MockTokenizer(TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength)
: this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, runAutomaton, lowerCase, maxTokenLength)
{
}
public MockTokenizer(TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase)
: this(input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH)
{
}
/// <summary>
/// Calls <see cref="MockTokenizer(TextReader, CharacterRunAutomaton, bool)"/> with WHITESPACE and lowerCase = true. </summary>
public MockTokenizer(TextReader input)
: this(input, WHITESPACE, true)
{
}
public MockTokenizer(AttributeFactory factory, TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase)
: this(factory, input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH)
{
}
/// <summary>
/// Calls <see cref="MockTokenizer(AttributeFactory, TextReader, CharacterRunAutomaton, bool)"/>
/// with WHITESPACE and lowerCase = true.
/// </summary>
public MockTokenizer(AttributeFactory factory, TextReader input)
: this(factory, input, WHITESPACE, true)
{
}
public sealed override bool IncrementToken()
{
//Debug.Assert(!enableChecks || (StreamState == State.RESET || StreamState == State.INCREMENT), "IncrementToken() called while in wrong state: " + StreamState);
ClearAttributes();
for (; ; )
{
int startOffset;
int cp;
if (BufferedCodePoint >= 0)
{
cp = BufferedCodePoint;
startOffset = BufferedOff;
BufferedCodePoint = -1;
}
else
{
startOffset = Off;
cp = ReadCodePoint();
}
if (cp < 0)
{
break;
}
else if (IsTokenChar(cp))
{
int endOffset;
do
{
char[] chars = Character.ToChars(Normalize(cp));
for (int i = 0; i < chars.Length; i++)
{
TermAtt.Append(chars[i]);
}
endOffset = Off;
if (TermAtt.Length >= MaxTokenLength)
{
break;
}
cp = ReadCodePoint();
} while (cp >= 0 && IsTokenChar(cp));
if (TermAtt.Length < MaxTokenLength)
{
// buffer up, in case the "rejected" char can start a new word of its own
BufferedCodePoint = cp;
BufferedOff = endOffset;
}
else
{
// otherwise, it's because we hit the term limit.
BufferedCodePoint = -1;
}
int correctedStartOffset = CorrectOffset(startOffset);
int correctedEndOffset = CorrectOffset(endOffset);
Assert.True(correctedStartOffset >= 0);
Assert.True(correctedEndOffset >= 0);
Assert.True(correctedStartOffset >= LastOffset);
LastOffset = correctedStartOffset;
Assert.True(correctedEndOffset >= correctedStartOffset);
OffsetAtt.SetOffset(correctedStartOffset, correctedEndOffset);
if (state == -1 || RunAutomaton.IsAccept(state))
{
// either we hit a reject state (longest match), or end-of-text, but in an accept state
StreamState = State.INCREMENT;
return true;
}
}
}
StreamState = State.INCREMENT_FALSE;
return false;
}
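// Reads a single Unicode codepoint, recombining surrogate pairs: e.g. U+1D11E (musical
// symbol G clef) arrives from the reader as the pair \uD834\uDD1E and is returned as one
// codepoint, with Off advanced by 2. Unpaired surrogates trigger an assertion failure.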
protected internal virtual int ReadCodePoint()
{
int ch = ReadChar();
if (ch < 0)
{
return ch;
}
else
{
Assert.True(!char.IsLowSurrogate((char)ch), "unpaired low surrogate: " + ch.ToString("x"));
Off++;
if (char.IsHighSurrogate((char)ch))
{
int ch2 = ReadChar();
if (ch2 >= 0)
{
Off++;
Assert.True(char.IsLowSurrogate((char)ch2), "unpaired high surrogate: " + ch.ToString("x") + ", followed by: " + ch2.ToString("x"));
return Character.ToCodePoint((char)ch, (char)ch2);
}
else
{
Assert.True(false, "stream ends with unpaired high surrogate: " + ch.ToString("x"));
}
}
return ch;
}
}
protected internal virtual int ReadChar()
{
switch (Random.Next(0, 10))
{
case 0:
{
// read(char[])
char[] c = new char[1];
int ret = input.Read(c, 0, c.Length);
return ret <= 0 ? -1 : c[0];
}
case 1:
{
// read(char[], int, int)
char[] c = new char[2];
int ret = input.Read(c, 1, 1);
return ret <= 0 ? -1 : c[1];
}
/* LUCENE TO-DO not sure if needed, CharBuffer not supported
case 2:
{
// read(CharBuffer)
char[] c = new char[1];
CharBuffer cb = CharBuffer.Wrap(c);
int ret = Input.Read(cb);
return ret < 0 ? ret : c[0];
}*/
default:
// read()
return input.Read();
}
}
protected internal virtual bool IsTokenChar(int c)
{
if (state < 0)
{
state = RunAutomaton.InitialState;
}
state = RunAutomaton.Step(state, c);
return state >= 0;
}
protected internal virtual int Normalize(int c)
{
return LowerCase ? Character.ToLowerCase(c) : c;
}
public override void Reset()
{
base.Reset();
state = RunAutomaton.InitialState;
LastOffset = Off = 0;
BufferedCodePoint = -1;
Assert.True(!enableChecks || StreamState != State.RESET, "double reset()");
StreamState = State.RESET;
}
public override void Dispose()
{
base.Dispose();
// In some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely close();
// such tests should disable this check. By default we check the normal workflow.
// TODO: investigate the CachingTokenFilter "double-close"... for now we ignore this
Assert.True(!enableChecks || StreamState == State.END || StreamState == State.CLOSE, "close() called in wrong state: " + StreamState);
StreamState = State.CLOSE;
}
internal bool SetReaderTestPoint()
{
Assert.True(!enableChecks || StreamState == State.CLOSE, "setReader() called in wrong state: " + StreamState);
StreamState = State.SETREADER;
return true;
}
public override void End()
{
base.End();
int finalOffset = CorrectOffset(Off);
OffsetAtt.SetOffset(finalOffset, finalOffset);
// Some tokenizers, such as limiting tokenizers, call end() before IncrementToken() returns false.
// Those tests should disable this check (in general you should consume the entire stream).
try
{
//Debug.Assert(!enableChecks || StreamState == State.INCREMENT_FALSE, "end() called before IncrementToken() returned false!");
}
finally
{
StreamState = State.END;
}
}
/// <summary>
/// Toggle consumer workflow checking: if your test consumes token streams normally, you
/// should leave this enabled.
/// </summary>
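/// <example>
/// A sketch of disabling the checks for a test that intentionally violates the normal
/// consumer workflow (for instance, by calling End() early):
/// <code>
/// var tokenizer = new MockTokenizer(new StringReader("some text"), MockTokenizer.WHITESPACE, false);
/// tokenizer.EnableChecks = false; // this test breaks the reset/increment/end/dispose order on purpose
/// </code>
/// </example>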
public virtual bool EnableChecks
{
set
{
this.enableChecks = value;
}
}
}
}