/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.luke.models.analysis;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.lang.reflect.Modifier;
import java.net.URL;
import java.net.URLClassLoader;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.luke.models.LukeException;
import org.apache.lucene.luke.util.reflection.ClassScanner;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.IOUtils;
/**
 * Default implementation of {@link Analysis}.
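 *
 * <p>Typical usage, as a minimal sketch (the preset analyzer class and sample text here are
 * illustrative):
 *
 * <pre>
 * AnalysisImpl analysis = new AnalysisImpl();
 * analysis.createAnalyzerFromClassName("org.apache.lucene.analysis.standard.StandardAnalyzer");
 * List&lt;Token&gt; tokens = analysis.analyze("quick brown fox");
 * </pre>
 */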
public final class AnalysisImpl implements Analysis {
private List<Class<? extends Analyzer>> presetAnalyzerTypes;
private Analyzer analyzer;
@Override
public void addExternalJars(List<String> jarFiles) {
List<URL> urls = new ArrayList<>();
for (String jarFile : jarFiles) {
Path path = FileSystems.getDefault().getPath(jarFile);
if (!Files.exists(path) || !jarFile.endsWith(".jar")) {
throw new LukeException(String.format(Locale.ENGLISH, "Invalid jar file path: %s", jarFile));
}
try {
URL url = path.toUri().toURL();
urls.add(url);
} catch (IOException e) {
throw new LukeException(e.getMessage(), e);
}
}
// reload available tokenizers, charfilters, and tokenfilters
URLClassLoader classLoader = new URLClassLoader(
urls.toArray(new URL[0]), this.getClass().getClassLoader());
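    // handing the loader to each factory re-runs SPI discovery, so components packaged
    // in the added jars become available alongside the built-in ones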
CharFilterFactory.reloadCharFilters(classLoader);
TokenizerFactory.reloadTokenizers(classLoader);
TokenFilterFactory.reloadTokenFilters(classLoader);
}
@Override
public Collection<Class<? extends Analyzer>> getPresetAnalyzerTypes() {
if (Objects.isNull(presetAnalyzerTypes)) {
List<Class<? extends Analyzer>> types = new ArrayList<>();
for (Class<? extends Analyzer> clazz : getInstantiableSubTypesBuiltIn(Analyzer.class)) {
try {
// add to presets if no args constructor is available
clazz.getConstructor();
types.add(clazz);
} catch (NoSuchMethodException e) {
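        // no public no-arg constructor; skip this analyzer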
}
}
presetAnalyzerTypes = Collections.unmodifiableList(types);
}
return presetAnalyzerTypes;
}
@Override
public Collection<String> getAvailableCharFilters() {
return CharFilterFactory.availableCharFilters().stream().sorted().collect(Collectors.toList());
}
@Override
public Collection<String> getAvailableTokenizers() {
return TokenizerFactory.availableTokenizers().stream().sorted().collect(Collectors.toList());
}
@Override
public Collection<String> getAvailableTokenFilters() {
return TokenFilterFactory.availableTokenFilters().stream().sorted().collect(Collectors.toList());
}
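  /**
   * Scans the {@code org.apache.lucene.analysis} package hierarchy for concrete (non-abstract,
   * non-Mock) subtypes of the given type, sorted by class name.
   */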
private <T> List<Class<? extends T>> getInstantiableSubTypesBuiltIn(Class<T> superType) {
ClassScanner scanner = new ClassScanner("org.apache.lucene.analysis", getClass().getClassLoader());
Set<Class<? extends T>> types = scanner.scanSubTypes(superType);
return types.stream()
.filter(type -> !Modifier.isAbstract(type.getModifiers()))
.filter(type -> !type.getSimpleName().startsWith("Mock"))
.sorted(Comparator.comparing(Class::getName))
.collect(Collectors.toList());
}
@Override
public List<Token> analyze(String text) {
Objects.requireNonNull(text);
if (analyzer == null) {
throw new LukeException("Analyzer is not set.");
}
try {
List<Token> result = new ArrayList<>();
      // the field name passed to tokenStream() is irrelevant here; we only need the tokens
      try (TokenStream stream = analyzer.tokenStream("", text)) {
        stream.reset();
        CharTermAttribute charAtt = stream.getAttribute(CharTermAttribute.class);
        // iterate tokens
        while (stream.incrementToken()) {
          List<TokenAttribute> attributes = copyAttributes(stream, charAtt);
          result.add(new Token(charAtt.toString(), attributes));
        }
        // signal end-of-stream before the stream is auto-closed, per the TokenStream contract
        stream.end();
      }
return result;
} catch (IOException e) {
throw new LukeException(e.getMessage(), e);
}
}
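  /**
   * Takes a human-readable snapshot of every attribute currently on the stream, using each
   * AttributeImpl's reflection hook to extract its key/value pairs.
   */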
private List<TokenAttribute> copyAttributes(TokenStream tokenStream, CharTermAttribute charAtt) {
List<TokenAttribute> attributes = new ArrayList<>();
Iterator<AttributeImpl> itr = tokenStream.getAttributeImplsIterator();
    while (itr.hasNext()) {
      AttributeImpl att = itr.next();
      Map<String, String> attValues = new LinkedHashMap<>();
      att.reflectWith((attClass, key, value) -> {
        if (value != null) {
          attValues.put(key, value.toString());
        }
      });
attributes.add(new TokenAttribute(att.getClass().getSimpleName(), attValues));
}
return attributes;
}
@Override
public Analyzer createAnalyzerFromClassName(String analyzerType) {
Objects.requireNonNull(analyzerType);
try {
Class<? extends Analyzer> clazz = Class.forName(analyzerType).asSubclass(Analyzer.class);
      // instantiate via the no-arg constructor (Class#newInstance is deprecated)
      this.analyzer = clazz.getConstructor().newInstance();
return analyzer;
} catch (ReflectiveOperationException e) {
throw new LukeException(String.format(Locale.ENGLISH, "Failed to instantiate class: %s", analyzerType), e);
}
}
@Override
public Analyzer buildCustomAnalyzer(CustomAnalyzerConfig config) {
Objects.requireNonNull(config);
try {
// create builder
CustomAnalyzer.Builder builder = config.getConfigDir()
.map(path -> CustomAnalyzer.builder(FileSystems.getDefault().getPath(path)))
.orElse(CustomAnalyzer.builder());
// set tokenizer
builder.withTokenizer(config.getTokenizerConfig().getName(), config.getTokenizerConfig().getParams());
// add char filters
for (CustomAnalyzerConfig.ComponentConfig cfConf : config.getCharFilterConfigs()) {
builder.addCharFilter(cfConf.getName(), cfConf.getParams());
}
// add token filters
for (CustomAnalyzerConfig.ComponentConfig tfConf : config.getTokenFilterConfigs()) {
builder.addTokenFilter(tfConf.getName(), tfConf.getParams());
}
// build analyzer
this.analyzer = builder.build();
return analyzer;
} catch (Exception e) {
throw new LukeException("Failed to build custom analyzer.", e);
}
}
@Override
public Analyzer currentAnalyzer() {
if (analyzer == null) {
throw new LukeException("Analyzer is not set.");
}
return analyzer;
}
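  /**
   * Runs the current custom analyzer over the given text, capturing the intermediate output of
   * each char filter, the tokenizer, and each token filter along the way.
   */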
@Override
  public StepByStepResult analyzeStepByStep(String text) {
Objects.requireNonNull(text);
if (analyzer == null) {
throw new LukeException("Analyzer is not set.");
}
if (!(analyzer instanceof CustomAnalyzer)) {
throw new LukeException("Analyzer is not CustomAnalyzer.");
}
List<NamedTokens> namedTokens = new ArrayList<>();
List<CharfilteredText> charfilteredTexts = new ArrayList<>();
try {
CustomAnalyzer customAnalyzer = (CustomAnalyzer)analyzer;
final List<CharFilterFactory> charFilterFactories = customAnalyzer.getCharFilterFactories();
Reader reader = new StringReader(text);
String charFilteredSource = text;
      if (!charFilterFactories.isEmpty()) {
        Reader cs = reader;
        for (CharFilterFactory charFilterFactory : charFilterFactories) {
          // chain each char filter over the previous one, so the tokenizer ultimately
          // sees the output of the whole chain
          cs = charFilterFactory.create(cs);
          // run the same filter over the text filtered so far, so the intermediate
          // char-filtered form can be captured for display
          Reader readerForWriteOut = new StringReader(charFilteredSource);
          readerForWriteOut = charFilterFactory.create(readerForWriteOut);
          charFilteredSource = writeCharStream(readerForWriteOut);
          charfilteredTexts.add(
              new CharfilteredText(
                  CharFilterFactory.findSPIName(charFilterFactory.getClass()), charFilteredSource));
        }
        reader = cs;
      }
final TokenizerFactory tokenizerFactory = customAnalyzer.getTokenizerFactory();
final List<TokenFilterFactory> tokenFilterFactories = customAnalyzer.getTokenFilterFactories();
      Tokenizer tokenizer = tokenizerFactory.create();
      tokenizer.setReader(reader);
      TokenStream tokenStream = tokenizer;
List<Token> tokens = new ArrayList<>();
List<AttributeSource> attributeSources = analyzeTokenStream(tokenStream, tokens);
namedTokens.add(new NamedTokens(TokenizerFactory.findSPIName(tokenizerFactory.getClass()), tokens));
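      // wrap the captured tokenizer output so it can be replayed through each token filter
      // in turn, recording the tokens produced at every stage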
ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokenStream, attributeSources);
for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
tokenStream = tokenFilterFactory.create(listBasedTokenStream);
tokens = new ArrayList<>();
attributeSources = analyzeTokenStream(tokenStream, tokens);
namedTokens.add(new NamedTokens(TokenFilterFactory.findSPIName(tokenFilterFactory.getClass()), tokens));
try {
listBasedTokenStream.close();
} catch (IOException e) {
// do nothing;
}
listBasedTokenStream = new ListBasedTokenStream(listBasedTokenStream, attributeSources);
}
try {
listBasedTokenStream.close();
} catch (IOException e) {
// do nothing.
} finally {
reader.close();
}
return new StepByStepResult(charfilteredTexts, namedTokens);
} catch (Exception e) {
throw new LukeException(e.getMessage(), e);
}
}
  /**
   * Analyzes the given TokenStream, collecting the tokens it produces.
   *
   * @param tokenStream TokenStream to analyze
   * @param result receives a display Token for each token in the stream
   * @return per-token attribute snapshots, which allow the stream to be replayed into the next
   *     filter
   */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream, List<Token> result) {
final List<AttributeSource> tokens = new ArrayList<>();
try {
tokenStream.reset();
CharTermAttribute charAtt = tokenStream.getAttribute(CharTermAttribute.class);
while (tokenStream.incrementToken()) {
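        // snapshot the complete attribute state so this token can be replayed later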
tokens.add(tokenStream.cloneAttributes());
List<TokenAttribute> attributes = copyAttributes(tokenStream, charAtt);
result.add(new Token(charAtt.toString(), attributes));
}
tokenStream.end();
} catch (IOException ioe) {
throw new RuntimeException("Error occurred while iterating over TokenStream", ioe);
} finally {
IOUtils.closeWhileHandlingException(tokenStream);
}
return tokens;
}
  /**
   * TokenStream that iterates over a list of pre-existing Tokens; adapted from {@code
   * org.apache.solr.handler.AnalysisRequestHandlerBase.ListBasedTokenStream}.
   */
  protected static final class ListBasedTokenStream extends TokenStream {
private final List<AttributeSource> tokens;
private Iterator<AttributeSource> tokenIterator;
/**
* Creates a new ListBasedTokenStream which uses the given tokens as its token source.
*
* @param attributeSource source of the attribute factory and attribute impls
* @param tokens Source of tokens to be used
*/
ListBasedTokenStream(AttributeSource attributeSource, List<AttributeSource> tokens) {
super(attributeSource.getAttributeFactory());
this.tokens = tokens;
// Make sure all the attributes of the source are here too
addAttributes(attributeSource);
}
@Override
public void reset() throws IOException {
super.reset();
tokenIterator = tokens.iterator();
}
@Override
public boolean incrementToken() {
if (tokenIterator.hasNext()) {
clearAttributes();
AttributeSource next = tokenIterator.next();
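        // register any attribute impls the stored token has that this stream lacks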
addAttributes(next);
next.copyTo(this);
return true;
} else {
return false;
}
}
void addAttributes(AttributeSource attributeSource) {
Iterator<AttributeImpl> atts = attributeSource.getAttributeImplsIterator();
while (atts.hasNext()) {
addAttributeImpl(atts.next()); // adds both impl & interfaces
}
}
}
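  /** Drains the given Reader and returns everything it produces as a String. */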
  private static String writeCharStream(Reader input) {
    final int BUFFER_SIZE = 1024;
    char[] buf = new char[BUFFER_SIZE];
    int len;
    StringBuilder sb = new StringBuilder();
    try {
      // Reader#read may return fewer chars than requested before the end of the stream,
      // so keep reading until it signals end-of-input with -1
      while ((len = input.read(buf, 0, BUFFER_SIZE)) != -1) {
        sb.append(buf, 0, len);
      }
    } catch (IOException e) {
      throw new RuntimeException("Error occurred while reading the char filtered source", e);
    }
    return sb.toString();
  }
}