blob: 4bb8f5fa3df6f20dc71c8a4257f70ea07113e6c3 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.lucas.indexer.analysis;
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* The opposite to a stop word filter: This filter takes a set of strings. Only
* incoming tokens which are contained in this set are returned by this filter.
*
* @author faessler
*/
public class SelectFilter extends TokenFilter {
private TermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
private CharArraySet includeWords;
boolean enablePositionIncrements;
/**
* The select filter can sometimes - for example in conjunction with the
* hypernyms or synonyms filter - cause the first token of the stream to
* have a position increment of 0. This must not happen, which is why in
* such a case we just set the position increment to 1. This is an indicator
* variable to show whether we are at the first token or later.
*/
private boolean firstToken;
/**
* Construct a token stream filtering the given input. If
* <code>includeWords</code> is an instance of {@link CharArraySet} (true if
* <code>makeStopSet()</code> was used to construct the set) it will be
* directly used and <code>ignoreCase</code> will be ignored since
* <code>CharArraySet</code> directly controls case sensitivity.
* <p/>
* If <code>includeWords</code> is not an instance of {@link CharArraySet},
* a new CharArraySet will be constructed and <code>ignoreCase</code> will
* be used to specify the case sensitivity of that set.
*
* @param enablePositionIncrements
* true if token positions should record the removed non-include
* words
* @param input
* Input TokenStream
* @param includeWords
* The set of words to include.
* @param ignoreCase
* -Ignore case when filtering.
*/
public SelectFilter(boolean enablePositionIncrements, TokenStream input,
Set<?> includeWords, boolean ignoreCase) {
super(input);
if (includeWords instanceof CharArraySet) {
this.includeWords = (CharArraySet) includeWords;
} else {
this.includeWords = new CharArraySet(includeWords.size(),
ignoreCase);
this.includeWords.addAll(includeWords);
}
this.enablePositionIncrements = enablePositionIncrements;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
firstToken = true;
}
/**
* Returns the next input Token whose term() is an included word.
*/
public final boolean incrementToken() throws IOException {
// return the first word found to be included
int skippedPositions = 0;
while (input.incrementToken()) {
if (includeWords.contains(termAtt.termBuffer(), 0,
termAtt.termLength())) {
// The first token must not have a position increment of 0.
if (posIncrAtt.getPositionIncrement() == 0 && firstToken)
posIncrAtt.setPositionIncrement(1);
if (enablePositionIncrements) {
posIncrAtt.setPositionIncrement(posIncrAtt
.getPositionIncrement() + skippedPositions);
}
firstToken = false;
return true;
}
skippedPositions += posIncrAtt.getPositionIncrement();
}
// reached EOS -- return null
return false;
}
}