| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.analysis.classic; |
| |
| import java.io.IOException; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.analysis.tokenattributes.TypeAttribute; |
| import org.apache.lucene.util.AttributeFactory; |
| |
| /** |
| * A grammar-based tokenizer constructed with JFlex |
| * |
| * <p>This should be a good tokenizer for most European-language documents: |
| * |
| * <ul> |
| * <li>Splits words at punctuation characters, removing punctuation. However, a dot that's not |
| * followed by whitespace is considered part of a token. |
| * <li>Splits words at hyphens, unless there's a number in the token, in which case the whole |
| * token is interpreted as a product number and is not split. |
| * <li>Recognizes email addresses and internet hostnames as one token. |
| * </ul> |
| * |
| * <p>Many applications have specific tokenizer needs. If this tokenizer does not suit your |
| * application, please consider copying this source code directory to your project and maintaining |
| * your own grammar-based tokenizer. |
| * |
| * <p>ClassicTokenizer was named StandardTokenizer in Lucene versions prior to 3.1. As of 3.1, |
| * {@link StandardTokenizer} implements Unicode text segmentation, as specified by UAX#29. |
| */ |
| public final class ClassicTokenizer extends Tokenizer { |
| /** A private instance of the JFlex-constructed scanner */ |
| private ClassicTokenizerImpl scanner; |
| |
| public static final int ALPHANUM = 0; |
| public static final int APOSTROPHE = 1; |
| public static final int ACRONYM = 2; |
| public static final int COMPANY = 3; |
| public static final int EMAIL = 4; |
| public static final int HOST = 5; |
| public static final int NUM = 6; |
| public static final int CJ = 7; |
| |
| public static final int ACRONYM_DEP = 8; |
| |
| /** String token types that correspond to token type int constants */ |
| public static final String[] TOKEN_TYPES = |
| new String[] { |
| "<ALPHANUM>", |
| "<APOSTROPHE>", |
| "<ACRONYM>", |
| "<COMPANY>", |
| "<EMAIL>", |
| "<HOST>", |
| "<NUM>", |
| "<CJ>", |
| "<ACRONYM_DEP>" |
| }; |
| |
| private int skippedPositions; |
| |
| private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; |
| |
| /** Set the max allowed token length. Any token longer than this is skipped. */ |
| public void setMaxTokenLength(int length) { |
| if (length < 1) { |
| throw new IllegalArgumentException("maxTokenLength must be greater than zero"); |
| } |
| this.maxTokenLength = length; |
| } |
| |
| /** @see #setMaxTokenLength */ |
| public int getMaxTokenLength() { |
| return maxTokenLength; |
| } |
| |
| /** |
| * Creates a new instance of the {@link ClassicTokenizer}. Attaches the <code>input</code> to the |
| * newly created JFlex scanner. |
| * |
| * <p>See http://issues.apache.org/jira/browse/LUCENE-1068 |
| */ |
| public ClassicTokenizer() { |
| init(); |
| } |
| |
| /** Creates a new ClassicTokenizer with a given {@link org.apache.lucene.util.AttributeFactory} */ |
| public ClassicTokenizer(AttributeFactory factory) { |
| super(factory); |
| init(); |
| } |
| |
| private void init() { |
| this.scanner = new ClassicTokenizerImpl(input); |
| } |
| |
| // this tokenizer generates three attributes: |
| // term offset, positionIncrement and type |
| private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); |
| private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); |
| private final PositionIncrementAttribute posIncrAtt = |
| addAttribute(PositionIncrementAttribute.class); |
| private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); |
| |
| /* |
| * (non-Javadoc) |
| * |
| * @see org.apache.lucene.analysis.TokenStream#next() |
| */ |
| @Override |
| public final boolean incrementToken() throws IOException { |
| clearAttributes(); |
| skippedPositions = 0; |
| |
| while (true) { |
| int tokenType = scanner.getNextToken(); |
| |
| if (tokenType == ClassicTokenizerImpl.YYEOF) { |
| return false; |
| } |
| |
| if (scanner.yylength() <= maxTokenLength) { |
| posIncrAtt.setPositionIncrement(skippedPositions + 1); |
| scanner.getText(termAtt); |
| final int start = scanner.yychar(); |
| offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length())); |
| |
| if (tokenType == ClassicTokenizer.ACRONYM_DEP) { |
| typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST]); |
| termAtt.setLength(termAtt.length() - 1); // remove extra '.' |
| } else { |
| typeAtt.setType(ClassicTokenizer.TOKEN_TYPES[tokenType]); |
| } |
| return true; |
| } else |
| // When we skip a too-long term, we still increment the |
| // position increment |
| skippedPositions++; |
| } |
| } |
| |
| @Override |
| public final void end() throws IOException { |
| super.end(); |
| // set final offset |
| int finalOffset = correctOffset(scanner.yychar() + scanner.yylength()); |
| offsetAtt.setOffset(finalOffset, finalOffset); |
| // adjust any skipped tokens |
| posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); |
| } |
| |
| @Override |
| public void close() throws IOException { |
| super.close(); |
| scanner.yyreset(input); |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| scanner.yyreset(input); |
| skippedPositions = 0; |
| } |
| } |