| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.scoring.similarity.util; |
| |
| import java.io.StringReader; |
| import java.util.List; |
| |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.core.LowerCaseFilter; |
| import org.apache.lucene.analysis.core.StopFilter; |
| import org.apache.lucene.analysis.en.EnglishMinimalStemFilter; |
| import org.apache.lucene.analysis.en.PorterStemFilter; |
| import org.apache.lucene.analysis.standard.ClassicTokenizer; |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; |
| import org.apache.lucene.analysis.standard.StandardTokenizer; |
| import org.apache.lucene.analysis.shingle.ShingleFilter; |
| import org.apache.lucene.analysis.CharArraySet; |
| import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType; |
| |
| public class LuceneTokenizer { |
| |
| private TokenStream tokenStream; |
| private TokenizerType tokenizer; |
| private StemFilterType stemFilterType; |
| private CharArraySet stopSet = null; |
| |
| public static enum TokenizerType {CLASSIC, STANDARD} |
| |
| /** |
| * Creates a tokenizer based on param values |
| * @param content - The text to tokenize |
| * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT |
| * @param useStopFilter - if set to true the token stream will be filtered using default Lucene stopset |
| * @param stemFilterType - Type of stemming to perform |
| */ |
| public LuceneTokenizer(String content, TokenizerType tokenizer, boolean useStopFilter, StemFilterType stemFilterType) { |
| this.tokenizer = tokenizer; |
| this.stemFilterType = stemFilterType; |
| if(useStopFilter) { |
| stopSet = StandardAnalyzer.STOP_WORDS_SET; |
| } |
| tokenStream = createTokenStream(content); |
| } |
| |
| /** |
| * Creates a tokenizer based on param values |
| * @param content - The text to tokenize |
| * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT |
| * @param stopWords - Provide a set of user defined stop words |
| * @param addToDefault - If set to true, the stopSet words will be added to the Lucene default stop set. |
| * If false, then only the user provided words will be used as the stop set |
| * @param stemFilterType |
| */ |
| public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault, StemFilterType stemFilterType) { |
| this.tokenizer = tokenizer; |
| this.stemFilterType = stemFilterType; |
| if(addToDefault) { |
| CharArraySet stopSet = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);; |
| for(String word : stopWords){ |
| stopSet.add(word); |
| } |
| this.stopSet = stopSet; |
| } |
| else { |
| stopSet = new CharArraySet(stopWords, true); |
| } |
| tokenStream = createTokenStream(content); |
| } |
| |
| /** |
| * Returns the tokenStream created by the Tokenizer |
| * @return |
| */ |
| public TokenStream getTokenStream() { |
| return tokenStream; |
| } |
| |
| /** |
| * Creates a tokenizer for the ngram model based on param values |
| * @param content - The text to tokenize |
| * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT |
| * @param stemFilterType - Type of stemming to perform |
| * @param mingram - Value of mingram for tokenizing |
| * @param maxgram - Value of maxgram for tokenizing |
| */ |
| public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int mingram, int maxgram) { |
| this.tokenizer = tokenizer; |
| this.stemFilterType = stemFilterType; |
| tokenStream = createNGramTokenStream(content, mingram, maxgram); |
| } |
| |
| private TokenStream createTokenStream(String content) { |
| tokenStream = generateTokenStreamFromText(content, tokenizer); |
| tokenStream = new LowerCaseFilter(tokenStream); |
| if(stopSet != null) { |
| tokenStream = applyStopFilter(stopSet); |
| } |
| tokenStream = applyStemmer(stemFilterType); |
| return tokenStream; |
| } |
| |
| private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizerType){ |
| Tokenizer tokenizer = null; |
| switch(tokenizerType){ |
| case CLASSIC: |
| tokenizer = new ClassicTokenizer(); |
| break; |
| |
| case STANDARD: |
| default: |
| tokenizer = new StandardTokenizer(); |
| } |
| |
| tokenizer.setReader(new StringReader(content)); |
| |
| tokenStream = tokenizer; |
| |
| return tokenStream; |
| } |
| |
| private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) { |
| Tokenizer tokenizer = new StandardTokenizer(); |
| tokenizer.setReader(new StringReader(content)); |
| tokenStream = new LowerCaseFilter(tokenizer); |
| tokenStream = applyStemmer(stemFilterType); |
| ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram); |
| shingleFilter.setOutputUnigrams(false); |
| tokenStream = (TokenStream)shingleFilter; |
| return tokenStream; |
| } |
| |
| private TokenStream applyStopFilter(CharArraySet stopWords) { |
| tokenStream = new StopFilter(tokenStream, stopWords); |
| return tokenStream; |
| } |
| |
| private TokenStream applyStemmer(StemFilterType stemFilterType) { |
| switch(stemFilterType){ |
| case ENGLISHMINIMALSTEM_FILTER: |
| tokenStream = new EnglishMinimalStemFilter(tokenStream); |
| break; |
| case PORTERSTEM_FILTER: |
| tokenStream = new PorterStemFilter(tokenStream); |
| break; |
| default: |
| break; |
| } |
| |
| return tokenStream; |
| } |
| } |