lucene/core/src/java/org/apache/lucene/analysis/TokenStream.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis;


 import java.io.IOException;
 import java.io.Closeable;
 import java.lang.reflect.Modifier;

 import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeFactory;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.AttributeSource;

 /**
  * A <code>TokenStream</code> enumerates the sequence of tokens, either from
  * {@link Field}s of a {@link Document} or from query text.
  * <p>
  * This is an abstract class; concrete subclasses are:
  * <ul>
  * <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and
  * <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another
  * <code>TokenStream</code>.
  * </ul>
  * <code>TokenStream</code> extends {@link AttributeSource}, which provides
  * access to all of the token {@link Attribute}s for the <code>TokenStream</code>.
  * Note that only one instance per {@link AttributeImpl} is created and reused
  * for every token. This approach reduces object creation and allows local
  * caching of references to the {@link AttributeImpl}s. See
  * {@link #incrementToken()} for further details.
  * <p>
  * <b>The workflow of the new <code>TokenStream</code> API is as follows:</b>
  * <ol>
  * <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get
  * attributes to/from the {@link AttributeSource}.
  * <li>The consumer calls {@link TokenStream#reset()}.
  * <li>The consumer retrieves attributes from the stream and stores local
  * references to all attributes it wants to access.
  * <li>The consumer calls {@link #incrementToken()} until it returns false
  * consuming the attributes after each call.
  * <li>The consumer calls {@link #end()} so that any end-of-stream operations
  * can be performed.
  * <li>The consumer calls {@link #close()} to release any resource when finished
  * using the <code>TokenStream</code>.
  * </ol>
  * To make sure that filters and consumers know which attributes are available,
  * the attributes must be added during instantiation. Filters and consumers are
  * not required to check for availability of attributes in
  * {@link #incrementToken()}.
  * <p>
  * You can find some example code for the new API in the analysis package level
  * Javadoc.
  * <p>
  * Sometimes it is desirable to capture a current state of a <code>TokenStream</code>,
  * e.g., for buffering purposes (see {@link CachingTokenFilter},
  * TeeSinkTokenFilter). For this use case
  * {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
  * can be used.
  * <p>The {@code TokenStream}-API in Lucene is based on the decorator pattern.
  * Therefore all non-abstract subclasses must be final or have at least a final
  * implementation of {@link #incrementToken}! This is checked when Java
  * assertions are enabled.
  */
 public abstract class TokenStream extends AttributeSource implements Closeable {

   /**
    * The {@link AttributeFactory} used by TokenStreams that are not given another one.
    * It is obtained from {@code AttributeFactory.getStaticImplementation} using
    * {@code AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY} and {@code PackedTokenAttributeImpl}.
    */
   public static final AttributeFactory DEFAULT_TOKEN_ATTRIBUTE_FACTORY =
       AttributeFactory.getStaticImplementation(
           AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, PackedTokenAttributeImpl.class);

   /**
    * Creates a TokenStream backed by {@link #DEFAULT_TOKEN_ATTRIBUTE_FACTORY}.
    */
   protected TokenStream() {
     super(DEFAULT_TOKEN_ATTRIBUTE_FACTORY);
     assert assertFinal();
   }

   /**
    * Creates a TokenStream that shares its attributes with the given source.
    *
    * @param input the {@link AttributeSource} whose attributes this stream uses
    */
   protected TokenStream(AttributeSource input) {
     super(input);
     assert assertFinal();
   }

   /**
    * Creates a TokenStream whose new {@link Attribute} instances are produced by the
    * given factory.
    *
    * @param factory the {@link AttributeFactory} used to create attribute instances
    */
   protected TokenStream(AttributeFactory factory) {
     super(factory);
     assert assertFinal();
   }

   /**
    * Verifies, when assertions are enabled for the runtime class, that the class
    * honours the "final class or final incrementToken()" rule.
    *
    * @return {@code true} when the check passes or assertions are disabled for the
    *         runtime class; {@code false} when {@code incrementToken} cannot be looked up
    */
   private boolean assertFinal() {
     final Class<?> implClass = getClass();
     if (!implClass.desiredAssertionStatus()) {
       // The implementing class runs without assertions: nothing to verify.
       return true;
     }
     try {
       assert satisfiesFinalRule(implClass) :
         "TokenStream implementation classes or at least their incrementToken() implementation must be final";
       return true;
     } catch (NoSuchMethodException lookupFailure) {
       return false;
     }
   }

   /**
    * Tells whether the class is anonymous, is declared final or private, or declares
    * a final {@code incrementToken()}. The reflective method lookup only happens when
    * the cheaper class-level checks have all failed.
    */
   private static boolean satisfiesFinalRule(Class<?> implClass) throws NoSuchMethodException {
     if (implClass.isAnonymousClass()) {
       return true;
     }
     final int classModifiers = implClass.getModifiers();
     if (Modifier.isFinal(classModifiers) || Modifier.isPrivate(classModifiers)) {
       return true;
     }
     final int methodModifiers = implClass.getMethod("incrementToken").getModifiers();
     return Modifier.isFinal(methodModifiers);
   }

   /**
    * Advances the stream to the next token. Consumers (i.e., {@link IndexWriter})
    * call this method; implementations must update the relevant
    * {@link AttributeImpl}s so that they describe the next token.
    * <p>
    * Once this method has returned, the producer must not rely on the attribute
    * values: the caller is free to modify them. A producer that needs the state in
    * later calls can take a copy of it with {@link #captureState}.
    * <p>
    * Because this method runs once per token of a document, its efficiency matters
    * for overall performance. References to every {@link AttributeImpl} the stream
    * uses should be obtained during instantiation so that
    * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} are not called here.
    * <p>
    * Attributes must be added during instantiation so that filters and consumers
    * know which ones exist; they are not required to check for attribute
    * availability inside {@link #incrementToken()}.
    *
    * @return false for end of stream; true otherwise
    * @throws IOException declared for implementations that perform I/O
    */
   public abstract boolean incrementToken() throws IOException;

   /**
    * Called by the consumer once the last token has been consumed, i.e. after
    * {@link #incrementToken()} returned <code>false</code> (using the new
    * <code>TokenStream</code> API). Streams implementing the old API should upgrade
    * to use this feature.
    * <p>
    * End-of-stream work belongs here, for instance setting the final offset of the
    * stream. That offset can differ from the offset of the last token, e.g. when a
    * WhitespaceTokenizer was used and whitespace followed the last token.
    * <p>
    * Skipped positions (such as those removed by a stopfilter) can also be applied
    * to the position increment here, as can adjustments to any other attribute
    * whose end-of-stream value may be important.
    * <p>
    * Overriding implementations must always call {@code super.end()}.
    *
    * @throws IOException If an I/O error occurs
    */
   public void end() throws IOException {
     // LUCENE-3849: don't consume dirty atts
     endAttributes();
   }

   /**
    * Called by a consumer before it starts consuming tokens with
    * {@link #incrementToken()}.
    * <p>
    * Puts the stream back into a clean state. Stateful implementations must
    * implement this method so that they can be reused as though freshly created.
    * <p>
    * Overriding implementations must always call {@code super.reset()}; otherwise
    * some internal state will not be correctly reset (e.g., {@link Tokenizer} will
    * throw {@link IllegalStateException} on further usage).
    *
    * @throws IOException declared for overriding implementations; this base
    *         implementation is empty
    */
   public void reset() throws IOException {
   }

   /**
    * Releases the resources associated with this stream.
    * <p>
    * Overriding implementations must always call {@code super.close()}; otherwise
    * some internal state will not be correctly reset (e.g., {@link Tokenizer} will
    * throw {@link IllegalStateException} on reuse).
    *
    * @throws IOException declared for overriding implementations; this base
    *         implementation is empty
    */
   @Override
   public void close() throws IOException {
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis;


	import java.io.IOException;
	import java.io.Closeable;
	import java.lang.reflect.Modifier;

	import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field;
	import org.apache.lucene.index.IndexWriter;
	import org.apache.lucene.util.Attribute;
	import org.apache.lucene.util.AttributeFactory;
	import org.apache.lucene.util.AttributeImpl;
	import org.apache.lucene.util.AttributeSource;

	/**
	* A <code>TokenStream</code> enumerates the sequence of tokens, either from
	* {@link Field}s of a {@link Document} or from query text.
	* <p>
	* This is an abstract class; concrete subclasses are:
	* <ul>
	* <li>{@link Tokenizer}, a <code>TokenStream</code> whose input is a Reader; and
	* <li>{@link TokenFilter}, a <code>TokenStream</code> whose input is another
	* <code>TokenStream</code>.
	* </ul>
	* <code>TokenStream</code> extends {@link AttributeSource}, which provides
	* access to all of the token {@link Attribute}s for the <code>TokenStream</code>.
	* Note that only one instance per {@link AttributeImpl} is created and reused
	* for every token. This approach reduces object creation and allows local
	* caching of references to the {@link AttributeImpl}s. See
	* {@link #incrementToken()} for further details.
	* <p>
	* <b>The workflow of the new <code>TokenStream</code> API is as follows:</b>
	* <ol>
	* <li>Instantiation of <code>TokenStream</code>/{@link TokenFilter}s which add/get
	* attributes to/from the {@link AttributeSource}.
	* <li>The consumer calls {@link TokenStream#reset()}.
	* <li>The consumer retrieves attributes from the stream and stores local
	* references to all attributes it wants to access.
	* <li>The consumer calls {@link #incrementToken()} until it returns false
	* consuming the attributes after each call.
	* <li>The consumer calls {@link #end()} so that any end-of-stream operations
	* can be performed.
	* <li>The consumer calls {@link #close()} to release any resource when finished
	* using the <code>TokenStream</code>.
	* </ol>
	* To make sure that filters and consumers know which attributes are available,
	* the attributes must be added during instantiation. Filters and consumers are
	* not required to check for availability of attributes in
	* {@link #incrementToken()}.
	* <p>
	* You can find some example code for the new API in the analysis package level
	* Javadoc.
	* <p>
	* Sometimes it is desirable to capture a current state of a <code>TokenStream</code>,
	* e.g., for buffering purposes (see {@link CachingTokenFilter},
	* TeeSinkTokenFilter). For this use case
	* {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
	* can be used.
	* <p>The {@code TokenStream}-API in Lucene is based on the decorator pattern.
	* Therefore all non-abstract subclasses must be final or have at least a final
	* implementation of {@link #incrementToken}! This is checked when Java
	* assertions are enabled.
	*/
	public abstract class TokenStream extends AttributeSource implements Closeable {

	/** Default {@link AttributeFactory} instance that should be used for TokenStreams. */
	public static final AttributeFactory DEFAULT_TOKEN_ATTRIBUTE_FACTORY =
	AttributeFactory.getStaticImplementation(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, PackedTokenAttributeImpl.class);

	/**
	* A TokenStream using the default attribute factory.
	*/
	protected TokenStream() {
	super(DEFAULT_TOKEN_ATTRIBUTE_FACTORY);
	assert assertFinal();
	}

	/**
	* A TokenStream that uses the same attributes as the supplied one.
	*/
	protected TokenStream(AttributeSource input) {
	super(input);
	assert assertFinal();
	}

	/**
	* A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
	*/
	protected TokenStream(AttributeFactory factory) {
	super(factory);
	assert assertFinal();
	}

	private boolean assertFinal() {
	try {
	final Class<?> clazz = getClass();
	if (!clazz.desiredAssertionStatus())
	return true;
	assert clazz.isAnonymousClass() \|\|
	(clazz.getModifiers() & (Modifier.FINAL \| Modifier.PRIVATE)) != 0 \|\|
	Modifier.isFinal(clazz.getMethod("incrementToken").getModifiers()) :
	"TokenStream implementation classes or at least their incrementToken() implementation must be final";
	return true;
	} catch (NoSuchMethodException nsme) {
	return false;
	}
	}

	/**
	* Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to
	* the next token. Implementing classes must implement this method and update
	* the appropriate {@link AttributeImpl}s with the attributes of the next
	* token.
	* <P>
	* The producer must make no assumptions about the attributes after the method
	* has been returned: the caller may arbitrarily change it. If the producer
	* needs to preserve the state for subsequent calls, it can use
	* {@link #captureState} to create a copy of the current attribute state.
	* <p>
	* This method is called for every token of a document, so an efficient
	* implementation is crucial for good performance. To avoid calls to
	* {@link #addAttribute(Class)} and {@link #getAttribute(Class)},
	* references to all {@link AttributeImpl}s that this stream uses should be
	* retrieved during instantiation.
	* <p>
	* To ensure that filters and consumers know which attributes are available,
	* the attributes must be added during instantiation. Filters and consumers
	* are not required to check for availability of attributes in
	* {@link #incrementToken()}.
	*
	* @return false for end of stream; true otherwise
	*/
	public abstract boolean incrementToken() throws IOException;

	/**
	* This method is called by the consumer after the last token has been
	* consumed, after {@link #incrementToken()} returned <code>false</code>
	* (using the new <code>TokenStream</code> API). Streams implementing the old API
	* should upgrade to use this feature.
	* <p>
	* This method can be used to perform any end-of-stream operations, such as
	* setting the final offset of a stream. The final offset of a stream might
	* differ from the offset of the last token eg in case one or more whitespaces
	* followed after the last token, but a WhitespaceTokenizer was used.
	* <p>
	* Additionally any skipped positions (such as those removed by a stopfilter)
	* can be applied to the position increment, or any adjustment of other
	* attributes where the end-of-stream value may be important.
	* <p>
	* If you override this method, always call {@code super.end()}.
	*
	* @throws IOException If an I/O error occurs
	*/
	public void end() throws IOException {
	endAttributes(); // LUCENE-3849: don't consume dirty atts
	}

	/**
	* This method is called by a consumer before it begins consumption using
	* {@link #incrementToken()}.
	* <p>
	* Resets this stream to a clean state. Stateful implementations must implement
	* this method so that they can be reused, just as if they had been created fresh.
	* <p>
	* If you override this method, always call {@code super.reset()}, otherwise
	* some internal state will not be correctly reset (e.g., {@link Tokenizer} will
	* throw {@link IllegalStateException} on further usage).
	*/
	public void reset() throws IOException {}

	/** Releases resources associated with this stream.
	* <p>
	* If you override this method, always call {@code super.close()}, otherwise
	* some internal state will not be correctly reset (e.g., {@link Tokenizer} will
	* throw {@link IllegalStateException} on reuse).
	*/
	@Override
	public void close() throws IOException {}

	}