lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis;

 import java.io.IOException;
 import java.nio.CharBuffer;
 import java.util.Random;

 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;

 import com.carrotsearch.randomizedtesting.RandomizedContext;

 /**
  * Tokenizer for testing.
  * <p>
  * This tokenizer is a replacement for {@link #WHITESPACE}, {@link #SIMPLE}, and {@link #KEYWORD}
  * tokenizers. If you are writing a component such as a TokenFilter, it's a great idea to test
  * it wrapping this tokenizer instead for extra checks. This tokenizer has the following behavior:
  * <ul>
  *   <li>An internal state-machine is used for checking consumer consistency. These checks can
  *       be disabled with {@link #setEnableChecks(boolean)}.
  *   <li>For convenience, optionally lowercases terms that it outputs.
  * </ul>
  */
 public class MockTokenizer extends Tokenizer {
   /** Acts Similar to WhitespaceTokenizer */
   public static final CharacterRunAutomaton WHITESPACE =
     new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").toAutomaton());
   /** Acts Similar to KeywordTokenizer.
    * TODO: Keyword returns an "empty" token for an empty reader...
    */
   public static final CharacterRunAutomaton KEYWORD =
     new CharacterRunAutomaton(new RegExp(".*").toAutomaton());
   /** Acts like LetterTokenizer. */
   // the ugly regex below is incomplete Unicode 5.2 [:Letter:]
   public static final CharacterRunAutomaton SIMPLE =
     new CharacterRunAutomaton(new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁ一-鿌]+").toAutomaton());

   private final CharacterRunAutomaton runAutomaton;
   private final boolean lowerCase;
   private final int maxTokenLength;
   public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;
   private int state;

   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   int off = 0;

   // buffered state (previous codepoint and offset). we replay this once we
   // hit a reject state in case it's permissible as the start of a new term.
   int bufferedCodePoint = -1; // -1 indicates empty buffer
   int bufferedOff = -1;

   // TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
   // currently, we can only check that the lifecycle is correct if someone is reusing,
   // but not for "one-offs".
   private static enum State {
     SETREADER,       // consumer set a reader input either via ctor or via reset(Reader)
     RESET,           // consumer has called reset()
     INCREMENT,       // consumer is consuming, has called incrementToken() == true
     INCREMENT_FALSE, // consumer has called incrementToken() which returned false
     END,             // consumer has called end() to perform end of stream operations
     CLOSE            // consumer has called close() to release any resources
   };

   private State streamState = State.CLOSE;
   private int lastOffset = 0; // only for checks
   private boolean enableChecks = true;

   // evil: but we don't change the behavior with this random, we only switch up how we read
   private final Random random = new Random(RandomizedContext.current().getRandom().nextLong());

   public MockTokenizer(AttributeFactory factory, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
     super(factory);
     this.runAutomaton = runAutomaton;
     this.lowerCase = lowerCase;
     this.state = 0;
     this.maxTokenLength = maxTokenLength;
   }

   public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
     this(BaseTokenStreamTestCase.newAttributeFactory(), runAutomaton, lowerCase, maxTokenLength);
   }

   public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase) {
     this(runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH);
   }

   /** Calls {@link #MockTokenizer(CharacterRunAutomaton, boolean) MockTokenizer(Reader, WHITESPACE, true)} */
   public MockTokenizer() {
     this(WHITESPACE, true);
   }

   public MockTokenizer(AttributeFactory factory, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
     this(factory, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH);
   }

   /** Calls {@link #MockTokenizer(AttributeFactory,CharacterRunAutomaton,boolean)
    *                MockTokenizer(AttributeFactory, Reader, WHITESPACE, true)} */
   public MockTokenizer(AttributeFactory factory) {
     this(factory, WHITESPACE, true);
   }

   // we allow some checks (e.g. state machine) to be turned off.
   // turning off checks just means we suppress exceptions from them
   private void fail(String message) {
     if (enableChecks) {
       throw new IllegalStateException(message);
     }
   }

   private void failAlways(String message) {
     throw new IllegalStateException(message);
   }

   @Override
   public final boolean incrementToken() throws IOException {
     if (streamState != State.RESET && streamState != State.INCREMENT) {
       fail("incrementToken() called while in wrong state: " + streamState);
     }

     clearAttributes();
     for (;;) {
       int startOffset;
       int cp;
       if (bufferedCodePoint >= 0) {
         cp = bufferedCodePoint;
         startOffset = bufferedOff;
         bufferedCodePoint = -1;
       } else {
         startOffset = off;
         cp = readCodePoint();
       }
       if (cp < 0) {
         break;
       } else if (isTokenChar(cp)) {
         char chars[] = new char[2];
         int endOffset;
         do {
           int len = Character.toChars(normalize(cp), chars, 0);
           for (int i = 0; i < len; i++) {
             termAtt.append(chars[i]);
           }
           endOffset = off;
           if (termAtt.length() >= maxTokenLength) {
             break;
           }
           cp = readCodePoint();
         } while (cp >= 0 && isTokenChar(cp));

         if (termAtt.length() < maxTokenLength) {
           // buffer up, in case the "rejected" char can start a new word of its own
           bufferedCodePoint = cp;
           bufferedOff = endOffset;
         } else {
           // otherwise, it's because we hit term limit.
           bufferedCodePoint = -1;
         }
         int correctedStartOffset = correctOffset(startOffset);
         int correctedEndOffset = correctOffset(endOffset);
         if (correctedStartOffset < 0) {
           failAlways("invalid start offset: " + correctedStartOffset + ", before correction: " + startOffset);
         }
         if (correctedEndOffset < 0) {
           failAlways("invalid end offset: " + correctedEndOffset + ", before correction: " + endOffset);
         }
         if (correctedStartOffset < lastOffset) {
           failAlways("start offset went backwards: " + correctedStartOffset + ", before correction: " + startOffset + ", lastOffset: " + lastOffset);
         }
         lastOffset = correctedStartOffset;
         if (correctedEndOffset < correctedStartOffset) {
           failAlways("end offset: " + correctedEndOffset + " is before start offset: " + correctedStartOffset);
         }
         offsetAtt.setOffset(correctedStartOffset, correctedEndOffset);
         if (state == -1 || runAutomaton.isAccept(state)) {
           // either we hit a reject state (longest match), or end-of-text, but in an accept state
           streamState = State.INCREMENT;
           return true;
         }
       }
     }
     streamState = State.INCREMENT_FALSE;
     return false;
   }

   protected int readCodePoint() throws IOException {
     int ch = readChar();
     if (ch < 0) {
       return ch;
     } else {
       if (Character.isLowSurrogate((char) ch)) {
         failAlways("unpaired low surrogate: " + Integer.toHexString(ch));
       }
       off++;
       if (Character.isHighSurrogate((char) ch)) {
         int ch2 = readChar();
         if (ch2 >= 0) {
           off++;
           if (!Character.isLowSurrogate((char) ch2)) {
             failAlways("unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2));
           }
           return Character.toCodePoint((char) ch, (char) ch2);
         } else {
           failAlways("stream ends with unpaired high surrogate: " + Integer.toHexString(ch));
         }
       }
       return ch;
     }
   }

   protected int readChar() throws IOException {
     switch(random.nextInt(10)) {
       case 0: {
         // read(char[])
         char c[] = new char[1];
         int ret = input.read(c);
         return ret < 0 ? ret : c[0];
       }
       case 1: {
         // read(char[], int, int)
         char c[] = new char[2];
         int ret = input.read(c, 1, 1);
         return ret < 0 ? ret : c[1];
       }
       case 2: {
         // read(CharBuffer)
         char c[] = new char[1];
         CharBuffer cb = CharBuffer.wrap(c);
         int ret = input.read(cb);
         return ret < 0 ? ret : c[0];
       }
       default:
         // read()
         return input.read();
     }
   }

   protected boolean isTokenChar(int c) {
     if (state < 0) {
       state = 0;
     }
     state = runAutomaton.step(state, c);
     if (state < 0) {
       return false;
     } else {
       return true;
     }
   }

   protected int normalize(int c) {
     return lowerCase ? Character.toLowerCase(c) : c;
   }

   @Override
   public void reset() throws IOException {
     try {
       super.reset();
       state = 0;
       lastOffset = off = 0;
       bufferedCodePoint = -1;
       if (streamState == State.RESET) {
         fail("double reset()");
       }
     } finally {
       streamState = State.RESET;
     }
   }

   @Override
   public void close() throws IOException {
     try {
       super.close();
       // in some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely close()
       // these tests should disable this check, by default we check the normal workflow.
       // TODO: investigate the CachingTokenFilter "double-close"... for now we ignore this
       if (!(streamState == State.END || streamState == State.CLOSE)) {
         fail("close() called in wrong state: " + streamState);
       }
     } finally {
       streamState = State.CLOSE;
     }
   }

   @Override
   void setReaderTestPoint() {
     try {
       if (streamState != State.CLOSE) {
         fail("setReader() called in wrong state: " + streamState);
       }
     } finally {
       streamState = State.SETREADER;
     }
   }

   @Override
   public void end() throws IOException {
     try {
       super.end();
       int finalOffset = correctOffset(off);
       offsetAtt.setOffset(finalOffset, finalOffset);
       // some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
       // these tests should disable this check (in general you should consume the entire stream)
       if (streamState != State.INCREMENT_FALSE) {
         fail("end() called in wrong state=" + streamState + "!");
       }
     } finally {
       streamState = State.END;
     }
   }

   /**
    * Toggle consumer workflow checking: if your test consumes tokenstreams normally you
    * should leave this enabled.
    */
   public void setEnableChecks(boolean enableChecks) {
     this.enableChecks = enableChecks;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis;

	import java.io.IOException;
	import java.nio.CharBuffer;
	import java.util.Random;

	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
	import org.apache.lucene.util.AttributeFactory;
	import org.apache.lucene.util.automaton.CharacterRunAutomaton;
	import org.apache.lucene.util.automaton.RegExp;

	import com.carrotsearch.randomizedtesting.RandomizedContext;

	/**
	* Tokenizer for testing.
	* <p>
	* This tokenizer is a replacement for {@link #WHITESPACE}, {@link #SIMPLE}, and {@link #KEYWORD}
	* tokenizers. If you are writing a component such as a TokenFilter, it's a great idea to test
	* it wrapping this tokenizer instead for extra checks. This tokenizer has the following behavior:
	* <ul>
	* <li>An internal state-machine is used for checking consumer consistency. These checks can
	* be disabled with {@link #setEnableChecks(boolean)}.
	* <li>For convenience, optionally lowercases terms that it outputs.
	* </ul>
	*/
	public class MockTokenizer extends Tokenizer {
	/** Acts Similar to WhitespaceTokenizer */
	public static final CharacterRunAutomaton WHITESPACE =
	new CharacterRunAutomaton(new RegExp("[^ \t\r\n]+").toAutomaton());
	/** Acts Similar to KeywordTokenizer.
	* TODO: Keyword returns an "empty" token for an empty reader...
	*/
	public static final CharacterRunAutomaton KEYWORD =
	new CharacterRunAutomaton(new RegExp(".*").toAutomaton());
	/** Acts like LetterTokenizer. */
	// the ugly regex below is incomplete Unicode 5.2 [:Letter:]
	public static final CharacterRunAutomaton SIMPLE =
	new CharacterRunAutomaton(new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁ一-鿌]+").toAutomaton());

	private final CharacterRunAutomaton runAutomaton;
	private final boolean lowerCase;
	private final int maxTokenLength;
	public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;
	private int state;

	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
	int off = 0;

	// buffered state (previous codepoint and offset). we replay this once we
	// hit a reject state in case it's permissible as the start of a new term.
	int bufferedCodePoint = -1; // -1 indicates empty buffer
	int bufferedOff = -1;

	// TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
	// currently, we can only check that the lifecycle is correct if someone is reusing,
	// but not for "one-offs".
	private static enum State {
	SETREADER, // consumer set a reader input either via ctor or via reset(Reader)
	RESET, // consumer has called reset()
	INCREMENT, // consumer is consuming, has called incrementToken() == true
	INCREMENT_FALSE, // consumer has called incrementToken() which returned false
	END, // consumer has called end() to perform end of stream operations
	CLOSE // consumer has called close() to release any resources
	};

	private State streamState = State.CLOSE;
	private int lastOffset = 0; // only for checks
	private boolean enableChecks = true;

	// evil: but we don't change the behavior with this random, we only switch up how we read
	private final Random random = new Random(RandomizedContext.current().getRandom().nextLong());

	public MockTokenizer(AttributeFactory factory, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
	super(factory);
	this.runAutomaton = runAutomaton;
	this.lowerCase = lowerCase;
	this.state = 0;
	this.maxTokenLength = maxTokenLength;
	}

	public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
	this(BaseTokenStreamTestCase.newAttributeFactory(), runAutomaton, lowerCase, maxTokenLength);
	}

	public MockTokenizer(CharacterRunAutomaton runAutomaton, boolean lowerCase) {
	this(runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH);
	}

	/** Calls {@link #MockTokenizer(CharacterRunAutomaton, boolean) MockTokenizer(Reader, WHITESPACE, true)} */
	public MockTokenizer() {
	this(WHITESPACE, true);
	}

	public MockTokenizer(AttributeFactory factory, CharacterRunAutomaton runAutomaton, boolean lowerCase) {
	this(factory, runAutomaton, lowerCase, DEFAULT_MAX_TOKEN_LENGTH);
	}

	/** Calls {@link #MockTokenizer(AttributeFactory,CharacterRunAutomaton,boolean)
	* MockTokenizer(AttributeFactory, Reader, WHITESPACE, true)} */
	public MockTokenizer(AttributeFactory factory) {
	this(factory, WHITESPACE, true);
	}

	// we allow some checks (e.g. state machine) to be turned off.
	// turning off checks just means we suppress exceptions from them
	private void fail(String message) {
	if (enableChecks) {
	throw new IllegalStateException(message);
	}
	}

	private void failAlways(String message) {
	throw new IllegalStateException(message);
	}

	@Override
	public final boolean incrementToken() throws IOException {
	if (streamState != State.RESET && streamState != State.INCREMENT) {
	fail("incrementToken() called while in wrong state: " + streamState);
	}

	clearAttributes();
	for (;;) {
	int startOffset;
	int cp;
	if (bufferedCodePoint >= 0) {
	cp = bufferedCodePoint;
	startOffset = bufferedOff;
	bufferedCodePoint = -1;
	} else {
	startOffset = off;
	cp = readCodePoint();
	}
	if (cp < 0) {
	break;
	} else if (isTokenChar(cp)) {
	char chars[] = new char[2];
	int endOffset;
	do {
	int len = Character.toChars(normalize(cp), chars, 0);
	for (int i = 0; i < len; i++) {
	termAtt.append(chars[i]);
	}
	endOffset = off;
	if (termAtt.length() >= maxTokenLength) {
	break;
	}
	cp = readCodePoint();
	} while (cp >= 0 && isTokenChar(cp));

	if (termAtt.length() < maxTokenLength) {
	// buffer up, in case the "rejected" char can start a new word of its own
	bufferedCodePoint = cp;
	bufferedOff = endOffset;
	} else {
	// otherwise, it's because we hit term limit.
	bufferedCodePoint = -1;
	}
	int correctedStartOffset = correctOffset(startOffset);
	int correctedEndOffset = correctOffset(endOffset);
	if (correctedStartOffset < 0) {
	failAlways("invalid start offset: " + correctedStartOffset + ", before correction: " + startOffset);
	}
	if (correctedEndOffset < 0) {
	failAlways("invalid end offset: " + correctedEndOffset + ", before correction: " + endOffset);
	}
	if (correctedStartOffset < lastOffset) {
	failAlways("start offset went backwards: " + correctedStartOffset + ", before correction: " + startOffset + ", lastOffset: " + lastOffset);
	}
	lastOffset = correctedStartOffset;
	if (correctedEndOffset < correctedStartOffset) {
	failAlways("end offset: " + correctedEndOffset + " is before start offset: " + correctedStartOffset);
	}
	offsetAtt.setOffset(correctedStartOffset, correctedEndOffset);
	if (state == -1 \|\| runAutomaton.isAccept(state)) {
	// either we hit a reject state (longest match), or end-of-text, but in an accept state
	streamState = State.INCREMENT;
	return true;
	}
	}
	}
	streamState = State.INCREMENT_FALSE;
	return false;
	}

	protected int readCodePoint() throws IOException {
	int ch = readChar();
	if (ch < 0) {
	return ch;
	} else {
	if (Character.isLowSurrogate((char) ch)) {
	failAlways("unpaired low surrogate: " + Integer.toHexString(ch));
	}
	off++;
	if (Character.isHighSurrogate((char) ch)) {
	int ch2 = readChar();
	if (ch2 >= 0) {
	off++;
	if (!Character.isLowSurrogate((char) ch2)) {
	failAlways("unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2));
	}
	return Character.toCodePoint((char) ch, (char) ch2);
	} else {
	failAlways("stream ends with unpaired high surrogate: " + Integer.toHexString(ch));
	}
	}
	return ch;
	}
	}

	protected int readChar() throws IOException {
	switch(random.nextInt(10)) {
	case 0: {
	// read(char[])
	char c[] = new char[1];
	int ret = input.read(c);
	return ret < 0 ? ret : c[0];
	}
	case 1: {
	// read(char[], int, int)
	char c[] = new char[2];
	int ret = input.read(c, 1, 1);
	return ret < 0 ? ret : c[1];
	}
	case 2: {
	// read(CharBuffer)
	char c[] = new char[1];
	CharBuffer cb = CharBuffer.wrap(c);
	int ret = input.read(cb);
	return ret < 0 ? ret : c[0];
	}
	default:
	// read()
	return input.read();
	}
	}

	protected boolean isTokenChar(int c) {
	if (state < 0) {
	state = 0;
	}
	state = runAutomaton.step(state, c);
	if (state < 0) {
	return false;
	} else {
	return true;
	}
	}

	protected int normalize(int c) {
	return lowerCase ? Character.toLowerCase(c) : c;
	}

	@Override
	public void reset() throws IOException {
	try {
	super.reset();
	state = 0;
	lastOffset = off = 0;
	bufferedCodePoint = -1;
	if (streamState == State.RESET) {
	fail("double reset()");
	}
	} finally {
	streamState = State.RESET;
	}
	}

	@Override
	public void close() throws IOException {
	try {
	super.close();
	// in some exceptional cases (e.g. TestIndexWriterExceptions) a test can prematurely close()
	// these tests should disable this check, by default we check the normal workflow.
	// TODO: investigate the CachingTokenFilter "double-close"... for now we ignore this
	if (!(streamState == State.END \|\| streamState == State.CLOSE)) {
	fail("close() called in wrong state: " + streamState);
	}
	} finally {
	streamState = State.CLOSE;
	}
	}

	@Override
	void setReaderTestPoint() {
	try {
	if (streamState != State.CLOSE) {
	fail("setReader() called in wrong state: " + streamState);
	}
	} finally {
	streamState = State.SETREADER;
	}
	}

	@Override
	public void end() throws IOException {
	try {
	super.end();
	int finalOffset = correctOffset(off);
	offsetAtt.setOffset(finalOffset, finalOffset);
	// some tokenizers, such as limiting tokenizers, call end() before incrementToken() returns false.
	// these tests should disable this check (in general you should consume the entire stream)
	if (streamState != State.INCREMENT_FALSE) {
	fail("end() called in wrong state=" + streamState + "!");
	}
	} finally {
	streamState = State.END;
	}
	}

	/**
	* Toggle consumer workflow checking: if your test consumes tokenstreams normally you
	* should leave this enabled.
	*/
	public void setEnableChecks(boolean enableChecks) {
	this.enableChecks = enableChecks;
	}
	}