| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.standard; |
| |
| |
| import org.apache.lucene.analysis.TokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| |
| /** Normalizes tokens extracted with {@link ClassicTokenizer}. */ |
| |
| public class ClassicFilter extends TokenFilter { |
| |
| /** Construct filtering <i>in</i>. */ |
| public ClassicFilter(TokenStream in) { |
| super(in); |
| } |
| |
| private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE]; |
| private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM]; |
| |
| // this filters uses attribute type |
| private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| |
| /** Returns the next token in the stream, or null at EOS. |
| * <p>Removes <tt>'s</tt> from the end of words. |
| * <p>Removes dots from acronyms. |
| */ |
| @Override |
| public final boolean incrementToken() throws java.io.IOException { |
| if (!input.incrementToken()) { |
| return false; |
| } |
| |
| final char[] buffer = termAtt.buffer(); |
| final int bufferLength = termAtt.length(); |
| final String type = typeAtt.type(); |
| |
| if (type == APOSTROPHE_TYPE && // remove 's |
| bufferLength >= 2 && |
| buffer[bufferLength-2] == '\'' && |
| (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) { |
| // Strip last 2 characters off |
| termAtt.setLength(bufferLength - 2); |
| } else if (type == ACRONYM_TYPE) { // remove dots |
| int upto = 0; |
| for(int i=0;i<bufferLength;i++) { |
| char c = buffer[i]; |
| if (c != '.') |
| buffer[upto++] = c; |
| } |
| termAtt.setLength(upto); |
| } |
| |
| return true; |
| } |
| } |