/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.scoring.similarity.util;

import java.io.StringReader;
import java.util.List;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
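
/**
 * Tokenization utility for Nutch's similarity scoring: builds a Lucene
 * {@link TokenStream} from raw text, applying lower-casing and, depending on
 * the constructor used, stop-word removal, stemming, and shingle (n-gram)
 * generation.
 *
 * <p>A minimal usage sketch (the input text is hypothetical; the iteration
 * idiom is standard Lucene, with {@code CharTermAttribute} coming from
 * {@code org.apache.lucene.analysis.tokenattributes}, and exception handling
 * is omitted):</p>
 * <pre>{@code
 * LuceneTokenizer lt = new LuceneTokenizer("Some page text",
 *     LuceneTokenizer.TokenizerType.STANDARD, true,
 *     StemFilterType.PORTERSTEM_FILTER);
 * TokenStream stream = lt.getTokenStream();
 * CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
 * stream.reset();
 * while (stream.incrementToken()) {
 *   System.out.println(term.toString());
 * }
 * stream.end();
 * stream.close();
 * }</pre>
 */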
public class LuceneTokenizer {

  private TokenStream tokenStream;
  private TokenizerType tokenizer;
  private StemFilterType stemFilterType;
  private CharArraySet stopSet = null;

  /** The tokenizer implementations supported by this class. */
  public enum TokenizerType {CLASSIC, STANDARD}

  /**
   * Creates a tokenizer based on the given parameter values.
   * @param content the text to tokenize
   * @param tokenizer the type of tokenizer to use, CLASSIC or STANDARD
   * @param useStopFilter if true, the token stream is filtered against Lucene's default stop set
   * @param stemFilterType the type of stemming to perform
   */
  public LuceneTokenizer(String content, TokenizerType tokenizer, boolean useStopFilter,
      StemFilterType stemFilterType) {
    this.tokenizer = tokenizer;
    this.stemFilterType = stemFilterType;
    if (useStopFilter) {
      stopSet = StandardAnalyzer.STOP_WORDS_SET;
    }
    tokenStream = createTokenStream(content);
  }

  /**
   * Creates a tokenizer based on the given parameter values.
   * @param content the text to tokenize
   * @param tokenizer the type of tokenizer to use, CLASSIC or STANDARD
   * @param stopWords a list of user-defined stop words
   * @param addToDefault if true, the user-defined stop words are added to Lucene's default stop
   *          set; if false, only the user-defined words form the stop set
   * @param stemFilterType the type of stemming to perform
   */
  public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords,
      boolean addToDefault, StemFilterType stemFilterType) {
    this.tokenizer = tokenizer;
    this.stemFilterType = stemFilterType;
    if (addToDefault) {
      // Merge the user-defined stop words into a copy of Lucene's default stop set
      CharArraySet stopSet = CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);
      for (String word : stopWords) {
        stopSet.add(word);
      }
      this.stopSet = stopSet;
    } else {
      // Use only the user-defined stop words, ignoring case
      stopSet = new CharArraySet(stopWords, true);
    }
    tokenStream = createTokenStream(content);
  }
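
  /*
   * A minimal usage sketch for the custom stop-word constructor (values are
   * hypothetical; exception handling omitted):
   *
   *   List<String> stopWords = Arrays.asList("nutch", "crawl");
   *   LuceneTokenizer lt = new LuceneTokenizer(text,
   *       LuceneTokenizer.TokenizerType.CLASSIC, stopWords, true,
   *       StemFilterType.ENGLISHMINIMALSTEM_FILTER);
   *   TokenStream stream = lt.getTokenStream();
   */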

  /**
   * Returns the token stream created by the tokenizer.
   * @return the {@link TokenStream} built from the constructor arguments
   */
  public TokenStream getTokenStream() {
    return tokenStream;
  }

  /**
   * Creates a tokenizer for the n-gram model based on the given parameter values.
   * @param content the text to tokenize
   * @param tokenizer the type of tokenizer to use, CLASSIC or STANDARD
   * @param stemFilterType the type of stemming to perform
   * @param mingram the minimum n-gram (shingle) size
   * @param maxgram the maximum n-gram (shingle) size
   */
  public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType,
      int mingram, int maxgram) {
    this.tokenizer = tokenizer;
    this.stemFilterType = stemFilterType;
    tokenStream = createNGramTokenStream(content, mingram, maxgram);
  }
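
  /*
   * A minimal usage sketch for the n-gram constructor (values are
   * hypothetical): with mingram=2 and maxgram=3 the resulting stream emits
   * word bigrams and trigrams rather than single tokens.
   *
   *   LuceneTokenizer lt = new LuceneTokenizer(text,
   *       LuceneTokenizer.TokenizerType.STANDARD,
   *       StemFilterType.PORTERSTEM_FILTER, 2, 3);
   *   TokenStream ngrams = lt.getTokenStream();
   */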

  private TokenStream createTokenStream(String content) {
    tokenStream = generateTokenStreamFromText(content, tokenizer);
    // Lower-case before stop filtering, since the default stop set is lower-case
    tokenStream = new LowerCaseFilter(tokenStream);
    if (stopSet != null) {
      tokenStream = applyStopFilter(stopSet);
    }
    tokenStream = applyStemmer(stemFilterType);
    return tokenStream;
  }

  private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizerType) {
    Tokenizer tokenizer = null;
    switch (tokenizerType) {
    case CLASSIC:
      tokenizer = new ClassicTokenizer();
      break;
    case STANDARD:
    default:
      tokenizer = new StandardTokenizer();
    }
    tokenizer.setReader(new StringReader(content));
    tokenStream = tokenizer;
    return tokenStream;
  }

  private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
    // Note: the n-gram stream always uses a StandardTokenizer and applies no stop filter
    Tokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader(content));
    tokenStream = new LowerCaseFilter(tokenizer);
    tokenStream = applyStemmer(stemFilterType);
    ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
    // Emit only the shingles themselves, not the underlying single tokens
    shingleFilter.setOutputUnigrams(false);
    tokenStream = shingleFilter;
    return tokenStream;
  }
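
  /*
   * Illustrative shingle output: for the input "the quick brown fox" with
   * mingram=2 and maxgram=3, the filter emits "the quick", "the quick brown",
   * "quick brown", "quick brown fox" and "brown fox"; single tokens are
   * suppressed by setOutputUnigrams(false) above.
   */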

  private TokenStream applyStopFilter(CharArraySet stopWords) {
    tokenStream = new StopFilter(tokenStream, stopWords);
    return tokenStream;
  }

  private TokenStream applyStemmer(StemFilterType stemFilterType) {
    switch (stemFilterType) {
    case ENGLISHMINIMALSTEM_FILTER:
      tokenStream = new EnglishMinimalStemFilter(tokenStream);
      break;
    case PORTERSTEM_FILTER:
      tokenStream = new PorterStemFilter(tokenStream);
      break;
    default:
      // No stemming requested; leave the stream unchanged
      break;
    }
    return tokenStream;
  }
}