Lucas/src/main/java/org/apache/uima/lucas/indexer/analysis/SelectFilter.java - uima-addons - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.uima.lucas.indexer.analysis;

 import java.io.IOException;
 import java.util.Set;

 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
  * The opposite to a stop word filter: This filter takes a set of strings. Only
  * incoming tokens which are contained in this set are returned by this filter.
  *
  * @author faessler
  */
 public class SelectFilter extends TokenFilter {

 	private TermAttribute termAtt;
 	private PositionIncrementAttribute posIncrAtt;
 	private CharArraySet includeWords;
 	boolean enablePositionIncrements;
 	/**
 	 * The select filter can sometimes - for example in conjunction with the
 	 * hypernyms or synonyms filter - cause the first token of the stream to
 	 * have a position increment of 0. This must not happen, which is why in
 	 * such a case we just set the position increment to 1. This is an indicator
 	 * variable to show whether we are at the first token or later.
 	 */
 	private boolean firstToken;

 	/**
 	 * Construct a token stream filtering the given input. If
 	 * <code>includeWords</code> is an instance of {@link CharArraySet} (true if
 	 * <code>makeStopSet()</code> was used to construct the set) it will be
 	 * directly used and <code>ignoreCase</code> will be ignored since
 	 * <code>CharArraySet</code> directly controls case sensitivity.
 	 * <p/>
 	 * If <code>includeWords</code> is not an instance of {@link CharArraySet},
 	 * a new CharArraySet will be constructed and <code>ignoreCase</code> will
 	 * be used to specify the case sensitivity of that set.
 	 *
 	 * @param enablePositionIncrements
 	 *            true if token positions should record the removed non-include
 	 *            words
 	 * @param input
 	 *            Input TokenStream
 	 * @param includeWords
 	 *            The set of words to include.
 	 * @param ignoreCase
 	 *            -Ignore case when filtering.
 	 */
 	public SelectFilter(boolean enablePositionIncrements, TokenStream input,
 			Set<?> includeWords, boolean ignoreCase) {
 		super(input);
 		if (includeWords instanceof CharArraySet) {
 			this.includeWords = (CharArraySet) includeWords;
 		} else {
 			this.includeWords = new CharArraySet(includeWords.size(),
 					ignoreCase);
 			this.includeWords.addAll(includeWords);
 		}
 		this.enablePositionIncrements = enablePositionIncrements;
 		termAtt = (TermAttribute) addAttribute(TermAttribute.class);
 		posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
 		firstToken = true;
 	}

 	/**
 	 * Returns the next input Token whose term() is an included word.
 	 */
 	public final boolean incrementToken() throws IOException {
 		// return the first word found to be included
 		int skippedPositions = 0;
 		while (input.incrementToken()) {
 			if (includeWords.contains(termAtt.termBuffer(), 0,
 					termAtt.termLength())) {
 				// The first token must not have a position increment of 0.
 				if (posIncrAtt.getPositionIncrement() == 0 && firstToken)
 					posIncrAtt.setPositionIncrement(1);
 				if (enablePositionIncrements) {
 					posIncrAtt.setPositionIncrement(posIncrAtt
 							.getPositionIncrement() + skippedPositions);
 				}
 				firstToken = false;
 				return true;
 			}
 			skippedPositions += posIncrAtt.getPositionIncrement();
 		}
 		// reached EOS -- return null
 		return false;
 	}

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.uima.lucas.indexer.analysis;

	import java.io.IOException;
	import java.util.Set;

	import org.apache.lucene.analysis.CharArraySet;
	import org.apache.lucene.analysis.TokenFilter;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
	import org.apache.lucene.analysis.tokenattributes.TermAttribute;

	/**
	* The opposite to a stop word filter: This filter takes a set of strings. Only
	* incoming tokens which are contained in this set are returned by this filter.
	*
	* @author faessler
	*/
	public class SelectFilter extends TokenFilter {

	private TermAttribute termAtt;
	private PositionIncrementAttribute posIncrAtt;
	private CharArraySet includeWords;
	boolean enablePositionIncrements;
	/**
	* The select filter can sometimes - for example in conjunction with the
	* hypernyms or synonyms filter - cause the first token of the stream to
	* have a position increment of 0. This must not happen, which is why in
	* such a case we just set the position increment to 1. This is an indicator
	* variable to show whether we are at the first token or later.
	*/
	private boolean firstToken;

	/**
	* Construct a token stream filtering the given input. If
	* <code>includeWords</code> is an instance of {@link CharArraySet} (true if
	* <code>makeStopSet()</code> was used to construct the set) it will be
	* directly used and <code>ignoreCase</code> will be ignored since
	* <code>CharArraySet</code> directly controls case sensitivity.
	* <p/>
	* If <code>includeWords</code> is not an instance of {@link CharArraySet},
	* a new CharArraySet will be constructed and <code>ignoreCase</code> will
	* be used to specify the case sensitivity of that set.
	*
	* @param enablePositionIncrements
	* true if token positions should record the removed non-include
	* words
	* @param input
	* Input TokenStream
	* @param includeWords
	* The set of words to include.
	* @param ignoreCase
	* -Ignore case when filtering.
	*/
	public SelectFilter(boolean enablePositionIncrements, TokenStream input,
	Set<?> includeWords, boolean ignoreCase) {
	super(input);
	if (includeWords instanceof CharArraySet) {
	this.includeWords = (CharArraySet) includeWords;
	} else {
	this.includeWords = new CharArraySet(includeWords.size(),
	ignoreCase);
	this.includeWords.addAll(includeWords);
	}
	this.enablePositionIncrements = enablePositionIncrements;
	termAtt = (TermAttribute) addAttribute(TermAttribute.class);
	posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
	firstToken = true;
	}

	/**
	* Returns the next input Token whose term() is an included word.
	*/
	public final boolean incrementToken() throws IOException {
	// return the first word found to be included
	int skippedPositions = 0;
	while (input.incrementToken()) {
	if (includeWords.contains(termAtt.termBuffer(), 0,
	termAtt.termLength())) {
	// The first token must not have a position increment of 0.
	if (posIncrAtt.getPositionIncrement() == 0 && firstToken)
	posIncrAtt.setPositionIncrement(1);
	if (enablePositionIncrements) {
	posIncrAtt.setPositionIncrement(posIncrAtt
	.getPositionIncrement() + skippedPositions);
	}
	firstToken = false;
	return true;
	}
	skippedPositions += posIncrAtt.getPositionIncrement();
	}
	// reached EOS -- return null
	return false;
	}

	}