/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.analysis; |
| |
| import java.io.IOException; |
| import java.nio.CharBuffer; |
| import java.util.Random; |
| |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.util.AttributeFactory; |
| import org.apache.lucene.util.automaton.CharacterRunAutomaton; |
| import org.apache.lucene.util.automaton.RegExp; |
| |
| import com.carrotsearch.randomizedtesting.RandomizedContext; |
| |
| /** |
| * Tokenizer for testing. |
| * <p> |
| * This tokenizer is a replacement for {@link #WHITESPACE}, {@link #SIMPLE}, and {@link #KEYWORD} |
| * tokenizers. If you are writing a component such as a TokenFilter, it's a great idea to test |
| * it wrapping this tokenizer instead for extra checks. This tokenizer has the following behavior: |
| * <ul> |
| * <li>An internal state-machine is used for checking consumer consistency. These checks can |
| * be disabled with {@link #setEnableChecks(boolean)}. |
| * <li>For convenience, optionally lowercases terms that it outputs. |
| * </ul> |
| */ |
| public class MockTokenizer extends Tokenizer { |
| /** Acts Similar to WhitespaceTokenizer */ |
| public static final CharacterRunAutomaton WHITESPACE = |
| new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").toAutomaton()); |
| /** Acts Similar to KeywordTokenizer. |
| * TODO: Keyword returns an "empty" token for an empty reader... |
| */ |
| public static final CharacterRunAutomaton KEYWORD = |
| new CharacterRunAutomaton(new RegExp(".*").toAutomaton()); |
| /** Acts like LetterTokenizer. */ |
| // the ugly regex below is incomplete Unicode 5.2 [:Letter:] |
| public static final CharacterRunAutomaton SIMPLE = |
| new CharacterRunAutomaton(new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁ一-鿌]+").toAutomaton()); |
| |
| private final CharacterRunAutomaton runAutomaton; |
| private final boolean lowerCase; |
| private final int maxTokenLength; |
| public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE; |
| private int state; |
| |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| int off = 0; |
| |
| // buffered state (previous codepoint and offset). we replay this once we |
| // hit a reject state in case it's permissible as the start of a new term. |
| int bufferedCodePoint = -1; // -1 indicates empty buffer |
| int bufferedOff = -1; |
| |
| // TODO: "register" with LuceneTestCase to ensure all streams are closed() ? |
| // currently, we can only check that the lifecycle is correct if someone is reusing, |
| // but not for "one-offs". |
| private static enum State { |
| SETREADER, // consumer set a reader input either via ctor or via reset(Reader) |
| RESET, // consumer has called reset() |
| INCREMENT, // consumer is consuming, has called incrementToken() == true |
| INCREMENT_FALSE, // consumer has called incrementToken() which returned false |
| END, // consumer has called end() to perform end of stream operations |
| CLOSE // consumer has called close() to release any resources |
| }; |
| |
| private State streamState = State.CLOSE; |
| private int lastOffset = 0; // only for checks |
| private boolean enableChecks = true; |
| |
| // evil: but we don't change the behavior with this random, we only switch up how we read |
| private final Random random = new Random(RandomizedContext.current().getRandom().nextLong()); |
| |
| public MockTokenizer(AttributeFactory factory, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) { |
| super(factory); |
| this.runAutomaton = runAutomaton; |
| this.lowerCase = lowerCase; |
| this.state = 0; |
| this.maxTokenLength = maxTokenLength; |
| } |
| |
| public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) { |
| this(BaseTokenStreamTestCase.newAttributeFactory(), runAutomaton, lowerCase, maxTokenLength); |
| } |
| |
| public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase) { |
| this(runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH); |
| } |
| |
| /** Calls {@link #MockTokenizer(CharacterRunAutomaton, boolean) MockTokenizer(Reader, WHITESPACE, true)} */ |
| public MockTokenizer() { |
| this(WHITESPACE, true); |
| } |
| |
| public MockTokenizer(AttributeFactory factory, CharacterRunAutomaton runAutomaton, boolean lowerCase) { |
| this(factory, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH); |
| } |
| |
| /** Calls {@link #MockTokenizer(AttributeFactory,CharacterRunAutomaton,boolean) |
| * MockTokenizer(AttributeFactory, Reader, WHITESPACE, true)} */ |
| public MockTokenizer(AttributeFactory factory) { |
| this(factory, WHITESPACE, true); |
| } |
| |
| // we allow some checks (e.g. state machine) to be turned off. |
| // turning off checks just means we suppress exceptions from them |
| private void fail(String message) { |
| if (enableChecks) { |
| throw new IllegalStateException(message); |
| } |
| } |
| |
| private void failAlways(String message) { |
| throw new IllegalStateException(message); |
| } |
| |
| @Override |
| public final boolean incrementToken() throws IOException { |
| if (streamState != State.RESET && streamState != State.INCREMENT) { |
| fail("incrementToken() called while in wrong state: " + streamState); |
| } |
| |
| clearAttributes(); |
| for (;;) { |
| int startOffset; |
| int cp; |
| if (bufferedCodePoint >= 0) { |
| cp = bufferedCodePoint; |
| startOffset = bufferedOff; |
| bufferedCodePoint = -1; |
| } else { |
| startOffset = off; |
| cp = readCodePoint(); |
| } |
| if (cp < 0) { |
| break; |
| } else if (isTokenChar(cp)) { |
| char chars[] = new char[2]; |
| int endOffset; |
| do { |
| int len = Character.toChars(normalize(cp), chars, 0); |
| for (int i = 0; i < len; i++) { |
| termAtt.append(chars[i]); |
| } |
| endOffset = off; |
| if (termAtt.length() >= maxTokenLength) { |
| break; |
| } |
| cp = readCodePoint(); |
| } while (cp >= 0 && isTokenChar(cp)); |
| |
| if (termAtt.length() < maxTokenLength) { |
| // buffer up, in case the "rejected" char can start a new word of its own |
| bufferedCodePoint = cp; |
| bufferedOff = endOffset; |
| } else { |
| // otherwise, it's because we hit term limit. |
| bufferedCodePoint = -1; |
| } |
| int correctedStartOffset = correctOffset(startOffset); |
| int correctedEndOffset = correctOffset(endOffset); |
| if (correctedStartOffset < 0) { |
| failAlways("invalid start offset: " + correctedStartOffset + ", before correction: " + startOffset); |
| } |
| if (correctedEndOffset < 0) { |
| failAlways("invalid end offset: " + correctedEndOffset + ", before correction: " + endOffset); |
| } |
| if (correctedStartOffset < lastOffset) { |
| failAlways("start offset went backwards: " + correctedStartOffset + ", before correction: " + startOffset + ", lastOffset: " + lastOffset); |
| } |
| lastOffset = correctedStartOffset; |
| if (correctedEndOffset < correctedStartOffset) { |
| failAlways("end offset: " + correctedEndOffset + " is before start offset: " + correctedStartOffset); |
| } |
| offsetAtt.setOffset(correctedStartOffset, correctedEndOffset); |
| if (state == -1 || runAutomaton.isAccept(state)) { |
| // either we hit a reject state (longest match), or end-of-text, but in an accept state |
| streamState = State.INCREMENT; |
| return true; |
| } |
| } |
| } |
| streamState = State.INCREMENT_FALSE; |
| return false; |
| } |
| |
| protected int readCodePoint() throws IOException { |
| int ch = readChar(); |
| if (ch < 0) { |
| return ch; |
| } else { |
| if (Character.isLowSurrogate((char) ch)) { |
| failAlways("unpaired low surrogate: " + Integer.toHexString(ch)); |
| } |
| off++; |
| if (Character.isHighSurrogate((char) ch)) { |
| int ch2 = readChar(); |
| if (ch2 >= 0) { |
| off++; |
| if (!Character.isLowSurrogate((char) ch2)) { |
| failAlways("unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2)); |
| } |
| return Character.toCodePoint((char) ch, (char) ch2); |
| } else { |
| failAlways("stream ends with unpaired high surrogate: " + Integer.toHexString(ch)); |
| } |
| } |
| return ch; |
| } |
| } |
| |
| protected int readChar() throws IOException { |
| switch(random.nextInt(10)) { |
| case 0: { |
| // read(char[]) |
| char c[] = new char[1]; |
| int ret = input.read(c); |
| return ret < 0 ? ret : c[0]; |
| } |
| case 1: { |
| // read(char[], int, int) |
| char c[] = new char[2]; |
| int ret = input.read(c, 1, 1); |
| return ret < 0 ? ret : c[1]; |
| } |
| case 2: { |
| // read(CharBuffer) |
| char c[] = new char[1]; |
| CharBuffer cb = CharBuffer.wrap(c); |
| int ret = input.read(cb); |
| return ret < 0 ? ret : c[0]; |
| } |
| default: |
| // read() |
| return input.read(); |
| } |
| } |
| |
| protected boolean isTokenChar(int c) { |
| if (state < 0) { |
| state = 0; |
| } |
| state = runAutomaton.step(state, c); |
| if (state < 0) { |
| return false; |
| } else { |
| return true; |
| } |
| } |
| |
| protected int normalize(int c) { |
| return lowerCase ? Character.toLowerCase(c) : c; |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| try { |
| super.reset(); |
| state = 0; |
| lastOffset = off = 0; |
| bufferedCodePoint = -1; |
| if (streamState == State.RESET) { |
| fail("double reset()"); |
| } |
| } finally { |
| streamState = State.RESET; |
| } |
| } |
| |
| @Override |
| public void close() throws IOException { |
| try { |
| super.close(); |
| // in some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely close() |
| // these tests should disable this check, by default we check the normal workflow. |
| // TODO: investigate the CachingTokenFilter "double-close"... for now we ignore this |
| if (!(streamState == State.END || streamState == State.CLOSE)) { |
| fail("close() called in wrong state: " + streamState); |
| } |
| } finally { |
| streamState = State.CLOSE; |
| } |
| } |
| |
| @Override |
| void setReaderTestPoint() { |
| try { |
| if (streamState != State.CLOSE) { |
| fail("setReader() called in wrong state: " + streamState); |
| } |
| } finally { |
| streamState = State.SETREADER; |
| } |
| } |
| |
| @Override |
| public void end() throws IOException { |
| try { |
| super.end(); |
| int finalOffset = correctOffset(off); |
| offsetAtt.setOffset(finalOffset, finalOffset); |
| // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false. |
| // these tests should disable this check (in general you should consume the entire stream) |
| if (streamState != State.INCREMENT_FALSE) { |
| fail("end() called in wrong state=" + streamState + "!"); |
| } |
| } finally { |
| streamState = State.END; |
| } |
| } |
| |
| /** |
| * Toggle consumer workflow checking: if your test consumes tokenstreams normally you |
| * should leave this enabled. |
| */ |
| public void setEnableChecks(boolean enableChecks) { |
| this.enableChecks = enableChecks; |
| } |
| } |