| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.luke.models.analysis; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.lang.reflect.Modifier; |
| import java.net.URL; |
| import java.net.URLClassLoader; |
| import java.nio.file.FileSystems; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.Iterator; |
| import java.util.LinkedHashMap; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Objects; |
| import java.util.Set; |
| import java.util.stream.Collectors; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.Tokenizer; |
| import org.apache.lucene.analysis.custom.CustomAnalyzer; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.util.CharFilterFactory; |
| import org.apache.lucene.analysis.util.TokenFilterFactory; |
| import org.apache.lucene.analysis.util.TokenizerFactory; |
| import org.apache.lucene.luke.models.LukeException; |
| import org.apache.lucene.luke.util.reflection.ClassScanner; |
| import org.apache.lucene.util.AttributeImpl; |
| import org.apache.lucene.util.AttributeSource; |
| import org.apache.lucene.util.IOUtils; |
| |
| /** Default implementation of {@link AnalysisImpl} */ |
| public final class AnalysisImpl implements Analysis { |
| |
| private List<Class<? extends Analyzer>> presetAnalyzerTypes; |
| |
| private Analyzer analyzer; |
| |
| @Override |
| public void addExternalJars(List<String> jarFiles) { |
| List<URL> urls = new ArrayList<>(); |
| |
| for (String jarFile : jarFiles) { |
| Path path = FileSystems.getDefault().getPath(jarFile); |
| if (!Files.exists(path) || !jarFile.endsWith(".jar")) { |
| throw new LukeException(String.format(Locale.ENGLISH, "Invalid jar file path: %s", jarFile)); |
| } |
| try { |
| URL url = path.toUri().toURL(); |
| urls.add(url); |
| } catch (IOException e) { |
| throw new LukeException(e.getMessage(), e); |
| } |
| } |
| |
| // reload available tokenizers, charfilters, and tokenfilters |
| URLClassLoader classLoader = new URLClassLoader( |
| urls.toArray(new URL[0]), this.getClass().getClassLoader()); |
| CharFilterFactory.reloadCharFilters(classLoader); |
| TokenizerFactory.reloadTokenizers(classLoader); |
| TokenFilterFactory.reloadTokenFilters(classLoader); |
| } |
| |
| @Override |
| public Collection<Class<? extends Analyzer>> getPresetAnalyzerTypes() { |
| if (Objects.isNull(presetAnalyzerTypes)) { |
| List<Class<? extends Analyzer>> types = new ArrayList<>(); |
| for (Class<? extends Analyzer> clazz : getInstantiableSubTypesBuiltIn(Analyzer.class)) { |
| try { |
| // add to presets if no args constructor is available |
| clazz.getConstructor(); |
| types.add(clazz); |
| } catch (NoSuchMethodException e) { |
| } |
| } |
| presetAnalyzerTypes = Collections.unmodifiableList(types); |
| } |
| return presetAnalyzerTypes; |
| } |
| |
| @Override |
| public Collection<String> getAvailableCharFilters() { |
| return CharFilterFactory.availableCharFilters().stream().sorted().collect(Collectors.toList()); |
| } |
| |
| @Override |
| public Collection<String> getAvailableTokenizers() { |
| return TokenizerFactory.availableTokenizers().stream().sorted().collect(Collectors.toList()); |
| } |
| |
| @Override |
| public Collection<String> getAvailableTokenFilters() { |
| return TokenFilterFactory.availableTokenFilters().stream().sorted().collect(Collectors.toList()); |
| } |
| |
| private <T> List<Class<? extends T>> getInstantiableSubTypesBuiltIn(Class<T> superType) { |
| ClassScanner scanner = new ClassScanner("org.apache.lucene.analysis", getClass().getClassLoader()); |
| Set<Class<? extends T>> types = scanner.scanSubTypes(superType); |
| return types.stream() |
| .filter(type -> !Modifier.isAbstract(type.getModifiers())) |
| .filter(type -> !type.getSimpleName().startsWith("Mock")) |
| .sorted(Comparator.comparing(Class::getName)) |
| .collect(Collectors.toList()); |
| } |
| |
| @Override |
| public List<Token> analyze(String text) { |
| Objects.requireNonNull(text); |
| |
| if (analyzer == null) { |
| throw new LukeException("Analyzer is not set."); |
| } |
| |
| try { |
| List<Token> result = new ArrayList<>(); |
| TokenStream stream = analyzer.tokenStream("", text); |
| stream.reset(); |
| |
| CharTermAttribute charAtt = stream.getAttribute(CharTermAttribute.class); |
| |
| // iterate tokens |
| while (stream.incrementToken()) { |
| List<TokenAttribute> attributes = copyAttributes(stream, charAtt); |
| result.add(new Token(charAtt.toString(), attributes)); |
| } |
| stream.close(); |
| |
| return result; |
| } catch (IOException e) { |
| throw new LukeException(e.getMessage(), e); |
| } |
| } |
| |
| private List<TokenAttribute> copyAttributes(TokenStream tokenStream, CharTermAttribute charAtt) { |
| List<TokenAttribute> attributes = new ArrayList<>(); |
| Iterator<AttributeImpl> itr = tokenStream.getAttributeImplsIterator(); |
| while(itr.hasNext()) { |
| AttributeImpl att = itr.next(); |
| Map<String, String> attValues = new LinkedHashMap<>(); |
| att.reflectWith((attClass, key, value) -> { |
| if (value != null) |
| attValues.put(key, value.toString()); |
| }); |
| attributes.add(new TokenAttribute(att.getClass().getSimpleName(), attValues)); |
| } |
| return attributes; |
| } |
| |
| @Override |
| public Analyzer createAnalyzerFromClassName(String analyzerType) { |
| Objects.requireNonNull(analyzerType); |
| |
| try { |
| Class<? extends Analyzer> clazz = Class.forName(analyzerType).asSubclass(Analyzer.class); |
| this.analyzer = clazz.newInstance(); |
| return analyzer; |
| } catch (ReflectiveOperationException e) { |
| throw new LukeException(String.format(Locale.ENGLISH, "Failed to instantiate class: %s", analyzerType), e); |
| } |
| } |
| |
| @Override |
| public Analyzer buildCustomAnalyzer(CustomAnalyzerConfig config) { |
| Objects.requireNonNull(config); |
| try { |
| // create builder |
| CustomAnalyzer.Builder builder = config.getConfigDir() |
| .map(path -> CustomAnalyzer.builder(FileSystems.getDefault().getPath(path))) |
| .orElse(CustomAnalyzer.builder()); |
| |
| // set tokenizer |
| builder.withTokenizer(config.getTokenizerConfig().getName(), config.getTokenizerConfig().getParams()); |
| |
| // add char filters |
| for (CustomAnalyzerConfig.ComponentConfig cfConf : config.getCharFilterConfigs()) { |
| builder.addCharFilter(cfConf.getName(), cfConf.getParams()); |
| } |
| |
| // add token filters |
| for (CustomAnalyzerConfig.ComponentConfig tfConf : config.getTokenFilterConfigs()) { |
| builder.addTokenFilter(tfConf.getName(), tfConf.getParams()); |
| } |
| |
| // build analyzer |
| this.analyzer = builder.build(); |
| return analyzer; |
| } catch (Exception e) { |
| throw new LukeException("Failed to build custom analyzer.", e); |
| } |
| } |
| |
| @Override |
| public Analyzer currentAnalyzer() { |
| if (analyzer == null) { |
| throw new LukeException("Analyzer is not set."); |
| } |
| return analyzer; |
| } |
| |
| @Override |
| public StepByStepResult analyzeStepByStep(String text){ |
| Objects.requireNonNull(text); |
| if (analyzer == null) { |
| throw new LukeException("Analyzer is not set."); |
| } |
| |
| if (!(analyzer instanceof CustomAnalyzer)) { |
| throw new LukeException("Analyzer is not CustomAnalyzer."); |
| } |
| |
| List<NamedTokens> namedTokens = new ArrayList<>(); |
| List<CharfilteredText> charfilteredTexts = new ArrayList<>(); |
| |
| try { |
| CustomAnalyzer customAnalyzer = (CustomAnalyzer)analyzer; |
| final List<CharFilterFactory> charFilterFactories = customAnalyzer.getCharFilterFactories(); |
| Reader reader = new StringReader(text); |
| String charFilteredSource = text; |
| if (charFilterFactories.size() > 0) { |
| Reader cs = reader; |
| for (CharFilterFactory charFilterFactory : charFilterFactories) { |
| cs = charFilterFactory.create(reader); |
| Reader readerForWriteOut = new StringReader(charFilteredSource); |
| readerForWriteOut = charFilterFactory.create(readerForWriteOut); |
| charFilteredSource = writeCharStream(readerForWriteOut); |
| charfilteredTexts.add(new CharfilteredText(CharFilterFactory.findSPIName(charFilterFactory.getClass()), charFilteredSource)); |
| } |
| reader = cs; |
| } |
| |
| final TokenizerFactory tokenizerFactory = customAnalyzer.getTokenizerFactory(); |
| final List<TokenFilterFactory> tokenFilterFactories = customAnalyzer.getTokenFilterFactories(); |
| |
| TokenStream tokenStream = tokenizerFactory.create(); |
| ((Tokenizer)tokenStream).setReader(reader); |
| List<Token> tokens = new ArrayList<>(); |
| List<AttributeSource> attributeSources = analyzeTokenStream(tokenStream, tokens); |
| namedTokens.add(new NamedTokens(TokenizerFactory.findSPIName(tokenizerFactory.getClass()), tokens)); |
| |
| ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokenStream, attributeSources); |
| for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) { |
| tokenStream = tokenFilterFactory.create(listBasedTokenStream); |
| tokens = new ArrayList<>(); |
| attributeSources = analyzeTokenStream(tokenStream, tokens); |
| namedTokens.add(new NamedTokens(TokenFilterFactory.findSPIName(tokenFilterFactory.getClass()), tokens)); |
| try { |
| listBasedTokenStream.close(); |
| } catch (IOException e) { |
| // do nothing; |
| } |
| listBasedTokenStream = new ListBasedTokenStream(listBasedTokenStream, attributeSources); |
| } |
| try { |
| listBasedTokenStream.close(); |
| } catch (IOException e) { |
| // do nothing. |
| } finally { |
| reader.close(); |
| } |
| return new StepByStepResult(charfilteredTexts, namedTokens); |
| } catch (Exception e) { |
| throw new LukeException(e.getMessage(), e); |
| } |
| } |
| |
| /** |
| * Analyzes the given TokenStream, collecting the Tokens it produces. |
| * |
| * @param tokenStream TokenStream to analyze |
| * |
| * @return List of tokens produced from the TokenStream |
| */ |
| private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream, List<Token> result) { |
| final List<AttributeSource> tokens = new ArrayList<>(); |
| try { |
| tokenStream.reset(); |
| CharTermAttribute charAtt = tokenStream.getAttribute(CharTermAttribute.class); |
| while (tokenStream.incrementToken()) { |
| tokens.add(tokenStream.cloneAttributes()); |
| List<TokenAttribute> attributes = copyAttributes(tokenStream, charAtt); |
| result.add(new Token(charAtt.toString(), attributes)); |
| } |
| tokenStream.end(); |
| } catch (IOException ioe) { |
| throw new RuntimeException("Error occurred while iterating over TokenStream", ioe); |
| } finally { |
| IOUtils.closeWhileHandlingException(tokenStream); |
| } |
| return tokens; |
| } |
| |
| /** |
| * TokenStream that iterates over a list of pre-existing Tokens |
| * see org.apache.solr.handler.AnalysisRequestHandlerBase#ListBasedTokenStream |
| */ |
| protected final static class ListBasedTokenStream extends TokenStream { |
| private final List<AttributeSource> tokens; |
| private Iterator<AttributeSource> tokenIterator; |
| |
| /** |
| * Creates a new ListBasedTokenStream which uses the given tokens as its token source. |
| * |
| * @param attributeSource source of the attribute factory and attribute impls |
| * @param tokens Source of tokens to be used |
| */ |
| ListBasedTokenStream(AttributeSource attributeSource, List<AttributeSource> tokens) { |
| super(attributeSource.getAttributeFactory()); |
| this.tokens = tokens; |
| // Make sure all the attributes of the source are here too |
| addAttributes(attributeSource); |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| super.reset(); |
| tokenIterator = tokens.iterator(); |
| } |
| |
| @Override |
| public boolean incrementToken() { |
| if (tokenIterator.hasNext()) { |
| clearAttributes(); |
| AttributeSource next = tokenIterator.next(); |
| addAttributes(next); |
| next.copyTo(this); |
| return true; |
| } else { |
| return false; |
| } |
| } |
| |
| void addAttributes(AttributeSource attributeSource) { |
| Iterator<AttributeImpl> atts = attributeSource.getAttributeImplsIterator(); |
| while (atts.hasNext()) { |
| addAttributeImpl(atts.next()); // adds both impl & interfaces |
| } |
| } |
| } |
| |
| private static String writeCharStream(Reader input ){ |
| final int BUFFER_SIZE = 1024; |
| char[] buf = new char[BUFFER_SIZE]; |
| int len = 0; |
| StringBuilder sb = new StringBuilder(); |
| do { |
| try { |
| len = input.read( buf, 0, BUFFER_SIZE ); |
| } catch (IOException e) { |
| throw new RuntimeException("Error occurred while iterating over charfiltering", e); |
| } |
| if( len > 0 ) |
| sb.append(buf, 0, len); |
| } while( len == BUFFER_SIZE ); |
| return sb.toString(); |
| } |
| |
| } |