| /* |
| * This software was produced for the U. S. Government |
| * under Contract No. W15P7T-11-C-F600, and is |
| * subject to the Rights in Noncommercial Computer Software |
| * and Noncommercial Computer Software Documentation |
| * Clause 252.227-7014 (JUN 1995) |
| * |
| * Copyright 2013 The MITRE Corporation. All Rights Reserved. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.solr.handler.tagger; |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| |
| /** |
| * Simple TokenFilter that lookup only Tokens with more as the parsed number |
| * of chars.<p> |
| * <b>NOTE:</b>This implementation is only intended to be used as an example |
| * and for unit testing the {@link TaggingAttribute} feature. Typically |
| * implementations will be based on NLP results (e.g. using POS tags or |
| * detected Named Entities). |
| * <p> |
| * <b>Example Usage:</b><p> |
| * Currently the usage requires to modify the Analyzer as defined by the |
| * <code>indexedField</code>. An alternative would be to allow the configuration |
| * of a special FieldType in the schema.xml and use this Analyzer for processing |
| * the text sent to the request.<p> |
| * While the current solution is fine for direct API usage, defining the |
| * Analyzer in the schema.xml would be better suitable for using this feature |
| * with the {@link TaggerRequestHandler}. |
| * |
| * <pre class="prettyprint"> |
| * Analyzer analyzer = req.getSchema().getField(indexedField).getType().getAnalyzer(); |
| * //get the TokenStream from the Analyzer |
| * TokenStream baseStream = analyzer.tokenStream("", reader); |
| * //add a FilterStream that sets the LookupAttribute to the end |
| * TokenStream filterStream = new WordLengthLookupFilter(baseStream); |
| * //create the Tagger using the modified analyzer chain. |
| * new Tagger(corpus, filterStream, tagClusterReducer) { |
| * |
| * protected void tagCallback(int startOffset, int endOffset, long docIdsKey) { |
| * //implement the callback |
| * } |
| * |
| * }.process(); |
| * </pre> |
| */ |
| public class WordLengthTaggingFilter extends TokenFilter { |
| |
| /** |
| * The default minimum length is <code>3</code> |
| */ |
| public static final int DEFAULT_MIN_LENGTH = 3; |
| private final TaggingAttribute lookupAtt = addAttribute(TaggingAttribute.class); |
| private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| private int minLength; |
| |
| /** |
| * TokenFilter only marks tokens to be looked up with equals or more as |
| * {@link #DEFAULT_MIN_LENGTH} characters |
| */ |
| public WordLengthTaggingFilter(TokenStream input) { |
| this(input, null); |
| } |
| |
| /** |
| * TokenFilter only marks tokens to be looked up with equals or more characters |
| * as the parsed minimum. |
| * |
| * @param input the TokenStream to consume tokens from |
| * @param minLength The minimum length to lookup a Token. <code>null</code> |
| * or <= 0 to use the #DEFAULT_MIN_LENGTH |
| */ |
| public WordLengthTaggingFilter(TokenStream input, Integer minLength) { |
| super(input); |
| if (minLength == null || minLength <= 0) { |
| this.minLength = DEFAULT_MIN_LENGTH; |
| } else { |
| this.minLength = minLength; |
| } |
| } |
| |
| @Override |
| public final boolean incrementToken() throws IOException { |
| if (input.incrementToken()) { |
| int size = offsetAtt.endOffset() - offsetAtt.startOffset(); |
| lookupAtt.setTaggable(size >= minLength); |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| } |