| using J2N; |
| using Lucene.Net.Analysis.TokenAttributes; |
| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Util; |
| using System; |
| using System.Globalization; |
| using System.IO; |
| using Assert = Lucene.Net.TestFramework.Assert; |
| using CharacterRunAutomaton = Lucene.Net.Util.Automaton.CharacterRunAutomaton; |
| using RegExp = Lucene.Net.Util.Automaton.RegExp; |
| |
| namespace Lucene.Net.Analysis |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /// <summary> |
| /// Tokenizer for testing. |
| /// <para/> |
| /// This tokenizer is a replacement for the <see cref="WHITESPACE"/>, <see cref="SIMPLE"/>, and <see cref="KEYWORD"/> |
| /// tokenizers. If you are writing a component such as a <see cref="TokenFilter"/>, it is a great idea to test |
| /// it by wrapping this tokenizer instead, for the extra checks. This tokenizer has the following behavior: |
| /// <list type="bullet"> |
| /// <item> |
| /// <description> |
| /// An internal state-machine is used for checking consumer consistency. These checks can |
| /// be disabled with <see cref="EnableChecks"/>. |
| /// </description> |
| /// </item> |
| /// <item> |
| /// <description> |
| /// For convenience, optionally lowercases terms that it outputs. |
| /// </description> |
| /// </item> |
| /// </list> |
| /// </summary> |
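| /// <example> |
| /// A minimal consumption sketch, following the Reset/IncrementToken/End/Dispose workflow that the |
| /// internal state machine expects (the input text and console output are illustrative only): |
| /// <code> |
| /// using (var ts = new MockTokenizer(new StringReader("some test text"), MockTokenizer.WHITESPACE, true)) |
| /// { |
| ///     var termAtt = ts.GetAttribute&lt;ICharTermAttribute&gt;(); |
| ///     ts.Reset(); |
| ///     while (ts.IncrementToken()) |
| ///     { |
| ///         Console.WriteLine(termAtt.ToString()); // each whitespace-delimited, lowercased term |
| ///     } |
| ///     ts.End(); |
| /// } // Dispose() is called by the using block, completing the expected lifecycle |
| /// </code> |
| /// </example> |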
| public class MockTokenizer : Tokenizer |
| { |
| /// <summary> |
| /// Acts similarly to <see cref="Analysis.Core.WhitespaceTokenizer"/>.</summary> |
| public static readonly CharacterRunAutomaton WHITESPACE = new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").ToAutomaton()); |
| |
| /// <summary> |
| /// Acts similarly to <see cref="Analysis.Core.KeywordTokenizer"/>. |
| /// TODO: Keyword returns an "empty" token for an empty reader... |
| /// </summary> |
| public static readonly CharacterRunAutomaton KEYWORD = new CharacterRunAutomaton(new RegExp(".*").ToAutomaton()); |
| |
| /// <summary> |
| /// Acts like <see cref="Analysis.Core.LetterTokenizer"/>. </summary> |
| // the ugly regex below is an incomplete approximation of Unicode 5.2 [:Letter:] |
| public static readonly CharacterRunAutomaton SIMPLE = new CharacterRunAutomaton(new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁ一-鿌]+").ToAutomaton()); |
| |
| private readonly CharacterRunAutomaton runAutomaton; |
| private readonly bool lowerCase; |
| private readonly int maxTokenLength; |
| public static readonly int DEFAULT_MAX_TOKEN_LENGTH = int.MaxValue; |
| private int state; |
| |
| private readonly ICharTermAttribute termAtt; |
| private readonly IOffsetAttribute offsetAtt; |
| internal int off = 0; |
| |
| // buffered state (previous code point and offset). we replay this once we |
| // hit a reject state, in case it's permissible as the start of a new term. |
| internal int bufferedCodePoint = -1; // -1 indicates empty buffer |
| |
| internal int bufferedOff = -1; |
| |
| // TODO: "register" with LuceneTestCase to ensure all streams are closed() ? |
| // currently, we can only check that the lifecycle is correct if someone is reusing, |
| // but not for "one-offs". |
| new private enum State // LUCENENET: new keyword required to hide AttributeSource.State |
| { |
| SETREADER, // consumer set the reader input, either via the ctor or via SetReader(TextReader) |
| RESET, // consumer has called Reset() |
| INCREMENT, // consumer is consuming, has called IncrementToken() == true |
| INCREMENT_FALSE, // consumer has called IncrementToken() which returned false |
| END, // consumer has called End() to perform end-of-stream operations |
| CLOSE // consumer has called Dispose() to release any resources |
| } |
| |
| private State streamState = State.CLOSE; |
| private int lastOffset = 0; // only for asserting |
| private bool enableChecks = true; |
| |
| // evil: we don't change the tokenizer's behavior with this Random; we only vary how the underlying reader is read |
| private readonly Random random = new Random(LuceneTestCase.Random.Next() /*RandomizedContext.Current.Random.nextLong()*/); // LUCENENET TODO: Random seed synchronization |
| |
| public MockTokenizer(AttributeFactory factory, TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength) |
| : base(factory, input) |
| { |
| this.runAutomaton = runAutomaton; |
| this.lowerCase = lowerCase; |
| this.state = runAutomaton.InitialState; |
| this.streamState = State.SETREADER; |
| this.maxTokenLength = maxTokenLength; |
| termAtt = AddAttribute<ICharTermAttribute>(); |
| offsetAtt = AddAttribute<IOffsetAttribute>(); |
| } |
| |
| public MockTokenizer(TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase, int maxTokenLength) |
| : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, runAutomaton, lowerCase, maxTokenLength) |
| { } |
| |
| public MockTokenizer(TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase) |
| : this(input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH) |
| { } |
| |
| /// <summary> |
| /// Calls <c>MockTokenizer(TextReader, WHITESPACE, true)</c>.</summary> |
| public MockTokenizer(TextReader input) |
| : this(input, WHITESPACE, true) |
| { } |
| |
| public MockTokenizer(AttributeFactory factory, TextReader input, CharacterRunAutomaton runAutomaton, bool lowerCase) |
| : this(factory, input, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH) |
| { } |
| |
| /// <summary> |
| /// Calls <c>MockTokenizer(AttributeFactory, TextReader, WHITESPACE, true)</c> |
| /// </summary> |
| public MockTokenizer(AttributeFactory factory, TextReader input) |
| : this(factory, input, WHITESPACE, true) |
| { } |
| |
| public sealed override bool IncrementToken() |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(!enableChecks || (streamState == State.RESET || streamState == State.INCREMENT), "IncrementToken() called while in wrong state: {0}", streamState); |
| ClearAttributes(); |
| for (; ; ) |
| { |
| int startOffset; |
| int cp; |
| if (bufferedCodePoint >= 0) |
| { |
| cp = bufferedCodePoint; |
| startOffset = bufferedOff; |
| bufferedCodePoint = -1; |
| } |
| else |
| { |
| startOffset = off; |
| cp = ReadCodePoint(); |
| } |
| if (cp < 0) |
| { |
| break; |
| } |
| else if (IsTokenChar(cp)) |
| { |
| int endOffset; |
| do |
| { |
| char[] chars = Character.ToChars(Normalize(cp)); |
| for (int i = 0; i < chars.Length; i++) |
| { |
| termAtt.Append(chars[i]); |
| } |
| endOffset = off; |
| if (termAtt.Length >= maxTokenLength) |
| { |
| break; |
| } |
| cp = ReadCodePoint(); |
| } while (cp >= 0 && IsTokenChar(cp)); |
| |
| if (termAtt.Length < maxTokenLength) |
| { |
| // buffer up, in case the "rejected" char can start a new word of its own |
| bufferedCodePoint = cp; |
| bufferedOff = endOffset; |
| } |
| else |
| { |
| // otherwise, it's because we hit the term limit. |
| bufferedCodePoint = -1; |
| } |
| int correctedStartOffset = CorrectOffset(startOffset); |
| int correctedEndOffset = CorrectOffset(endOffset); |
| Assert.True(correctedStartOffset >= 0); |
| Assert.True(correctedEndOffset >= 0); |
| Assert.True(correctedStartOffset >= lastOffset); |
| lastOffset = correctedStartOffset; |
| Assert.True(correctedEndOffset >= correctedStartOffset); |
| offsetAtt.SetOffset(correctedStartOffset, correctedEndOffset); |
| if (state == -1 || runAutomaton.IsAccept(state)) |
| { |
| // either we hit a reject state (longest match), or end-of-text, but in an accept state |
| streamState = State.INCREMENT; |
| return true; |
| } |
| } |
| } |
| streamState = State.INCREMENT_FALSE; |
| return false; |
| } |
| |
| protected virtual int ReadCodePoint() |
| { |
| int ch = ReadChar(); |
| if (ch < 0) |
| { |
| return ch; |
| } |
| else |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(!char.IsLowSurrogate((char)ch), "unpaired low surrogate: {0:x}", ch); |
| off++; |
| if (char.IsHighSurrogate((char)ch)) |
| { |
| int ch2 = ReadChar(); |
| if (ch2 >= 0) |
| { |
| off++; |
| if (Debugging.AssertsEnabled) Debugging.Assert(char.IsLowSurrogate((char)ch2), "unpaired high surrogate: {0:x}, followed by: {1:x}", ch, ch2); |
| return Character.ToCodePoint((char)ch, (char)ch2); |
| } |
| else |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(false, "stream ends with unpaired high surrogate: {0:x}", ch); |
| } |
| } |
| return ch; |
| } |
| } |
| |
| protected virtual int ReadChar() |
| { |
| switch (random.Next(0, 10)) |
| { |
| case 0: |
| { |
| // read(char[]) |
| char[] c = new char[1]; |
| int ret = m_input.Read(c, 0, c.Length); |
| return ret <= 0 ? -1 : c[0]; |
| } |
| case 1: |
| { |
| // read(char[], int, int) |
| char[] c = new char[2]; |
| int ret = m_input.Read(c, 1, 1); |
| return ret <= 0 ? -1 : c[1]; |
| } |
| // LUCENENET NOTE: CharBuffer not supported |
| //case 2: |
| // { |
| // // read(CharBuffer) |
| // char[] c = new char[1]; |
| // CharBuffer cb = CharBuffer.Wrap(c); |
| // int ret = m_input.Read(cb); |
| // return ret < 0 ? ret : c[0]; |
| // } |
| default: |
| // read() |
| return m_input.Read(); |
| } |
| } |
| |
| protected virtual bool IsTokenChar(int c) |
| { |
| if (state < 0) |
| { |
| state = runAutomaton.InitialState; |
| } |
| state = runAutomaton.Step(state, c); |
| if (state < 0) |
| { |
| return false; |
| } |
| else |
| { |
| return true; |
| } |
| } |
| |
| protected virtual int Normalize(int c) |
| { |
| return lowerCase ? Character.ToLower(c, CultureInfo.InvariantCulture) : c; // LUCENENET specific - need to use invariant culture to match Java |
| } |
| |
| public override void Reset() |
| { |
| base.Reset(); |
| state = runAutomaton.InitialState; |
| lastOffset = off = 0; |
| bufferedCodePoint = -1; |
| if (Debugging.AssertsEnabled) Debugging.Assert(!enableChecks || streamState != State.RESET, "Double Reset()"); |
| streamState = State.RESET; |
| } |
| |
| protected override void Dispose(bool disposing) |
| { |
| base.Dispose(disposing); |
| if (disposing) |
| { |
| // in some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely dispose the stream. |
| // such tests should disable this check; by default we check the normal workflow. |
| // TODO: investigate the CachingTokenFilter "double-close"... for now we ignore this |
| if (Debugging.AssertsEnabled) Debugging.Assert(!enableChecks || streamState == State.END || streamState == State.CLOSE, "Dispose() called in wrong state: {0}", streamState); |
| streamState = State.CLOSE; |
| } |
| } |
| |
| internal override bool SetReaderTestPoint() |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(!enableChecks || streamState == State.CLOSE, "SetReader() called in wrong state: {0}", streamState); |
| streamState = State.SETREADER; |
| return true; |
| } |
| |
| public override void End() |
| { |
| base.End(); |
| int finalOffset = CorrectOffset(off); |
| offsetAtt.SetOffset(finalOffset, finalOffset); |
| // some tokenizers, such as limiting tokenizers, call End() before IncrementToken() returns false. |
| // these tests should disable this check (in general you should consume the entire stream) |
| try |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(!enableChecks || streamState == State.INCREMENT_FALSE, "End() called before IncrementToken() returned false!"); |
| } |
| finally |
| { |
| streamState = State.END; |
| } |
| } |
| |
| /// <summary> |
| /// Toggle consumer workflow checking: if your test consumes token streams normally, you |
| /// should leave this enabled. |
| /// </summary> |
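| /// <example> |
| /// A sketch of relaxing the checks for a test whose consumer intentionally stops early, for example |
| /// one that calls End() or Dispose() before IncrementToken() has returned false: |
| /// <code> |
| /// var tokenizer = new MockTokenizer(new StringReader("abc def")); |
| /// tokenizer.EnableChecks = false; // the consumer under test does not follow the full workflow |
| /// </code> |
| /// </example> |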
| public virtual bool EnableChecks |
| { |
| get => enableChecks; // LUCENENET specific - added getter (to follow MSDN property guidelines) |
| set => enableChecks = value; |
| } |
| } |
| } |