| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.jackrabbit.oak.plugins.index.lucene; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.InputStreamReader; |
| import java.io.Reader; |
| import java.lang.reflect.Constructor; |
| import java.lang.reflect.InvocationTargetException; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.concurrent.atomic.AtomicBoolean; |
| |
| import com.google.common.collect.ImmutableSet; |
| import com.google.common.collect.Maps; |
| import org.apache.jackrabbit.JcrConstants; |
| import org.apache.jackrabbit.oak.api.Blob; |
| import org.apache.jackrabbit.oak.api.PropertyState; |
| import org.apache.jackrabbit.oak.api.Tree; |
| import org.apache.jackrabbit.oak.api.Type; |
| import org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain; |
| import org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil; |
| import org.apache.jackrabbit.oak.plugins.tree.factories.TreeFactory; |
| import org.apache.jackrabbit.oak.spi.state.NodeState; |
| import org.apache.jackrabbit.oak.spi.state.NodeStateUtils; |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.util.AbstractAnalysisFactory; |
| import org.apache.lucene.analysis.util.CharArraySet; |
| import org.apache.lucene.analysis.util.CharFilterFactory; |
| import org.apache.lucene.analysis.util.ClasspathResourceLoader; |
| import org.apache.lucene.analysis.util.ResourceLoader; |
| import org.apache.lucene.analysis.util.ResourceLoaderAware; |
| import org.apache.lucene.analysis.util.StopwordAnalyzerBase; |
| import org.apache.lucene.analysis.util.TokenFilterFactory; |
| import org.apache.lucene.analysis.util.TokenizerFactory; |
| import org.apache.lucene.analysis.util.WordlistLoader; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.Version; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import static com.google.common.base.Preconditions.checkNotNull; |
| import static com.google.common.collect.Lists.newArrayList; |
| |
| /** |
| * Constructs a Lucene Analyzer from nodes (based on NodeState content). |
| * |
| * Approach taken is similar to one taken in |
| * org.apache.solr.schema.FieldTypePluginLoader which is implemented for xml |
| * based config. Resource lookup are performed via binary property access |
| */ |
| final class NodeStateAnalyzerFactory { |
| private static final AtomicBoolean versionWarningAlreadyLogged = new AtomicBoolean(false); |
| |
| private static final Set<String> IGNORE_PROP_NAMES = ImmutableSet.of( |
| LuceneIndexConstants.ANL_CLASS, |
| LuceneIndexConstants.ANL_NAME, |
| JcrConstants.JCR_PRIMARYTYPE |
| ); |
| |
| private static final Logger log = LoggerFactory.getLogger(NodeStateAnalyzerFactory.class); |
| |
| private final ResourceLoader defaultLoader; |
| private final Version defaultVersion; |
| |
| NodeStateAnalyzerFactory(Version defaultVersion){ |
| this(new ClasspathResourceLoader(NodeStateAnalyzerFactory.class.getClassLoader()), defaultVersion); |
| } |
| |
| NodeStateAnalyzerFactory(ResourceLoader defaultLoader, Version defaultVersion) { |
| this.defaultLoader = defaultLoader; |
| this.defaultVersion = defaultVersion; |
| } |
| |
| public Analyzer createInstance(NodeState state) { |
| if (state.hasProperty(LuceneIndexConstants.ANL_CLASS)){ |
| return createAnalyzerViaReflection(state); |
| } |
| return composeAnalyzer(state); |
| } |
| |
| private Analyzer composeAnalyzer(NodeState state) { |
| TokenizerFactory tf = loadTokenizer(state.getChildNode(LuceneIndexConstants.ANL_TOKENIZER)); |
| CharFilterFactory[] cfs = loadCharFilterFactories(state.getChildNode(LuceneIndexConstants.ANL_CHAR_FILTERS)); |
| TokenFilterFactory[] tffs = loadTokenFilterFactories(state.getChildNode(LuceneIndexConstants.ANL_FILTERS)); |
| return new TokenizerChain(cfs, tf, tffs); |
| } |
| |
| private TokenFilterFactory[] loadTokenFilterFactories(NodeState tokenFiltersState) { |
| List<TokenFilterFactory> result = newArrayList(); |
| |
| Tree tree = TreeFactory.createReadOnlyTree(tokenFiltersState); |
| for (Tree t : tree.getChildren()){ |
| NodeState state = tokenFiltersState.getChildNode(t.getName()); |
| |
| String factoryType = getFactoryType(state, t.getName()); |
| Map<String, String> args = convertNodeState(state); |
| TokenFilterFactory cf = TokenFilterFactory.forName(factoryType, args); |
| init(cf, state); |
| result.add(cf); |
| } |
| |
| return result.toArray(new TokenFilterFactory[result.size()]); |
| } |
| |
| private CharFilterFactory[] loadCharFilterFactories(NodeState charFiltersState) { |
| List<CharFilterFactory> result = newArrayList(); |
| |
| //Need to read children in order |
| Tree tree = TreeFactory.createReadOnlyTree(charFiltersState); |
| for (Tree t : tree.getChildren()){ |
| NodeState state = charFiltersState.getChildNode(t.getName()); |
| |
| String factoryType = getFactoryType(state, t.getName()); |
| Map<String, String> args = convertNodeState(state); |
| CharFilterFactory cf = CharFilterFactory.forName(factoryType, args); |
| init(cf, state); |
| result.add(cf); |
| } |
| |
| return result.toArray(new CharFilterFactory[result.size()]); |
| } |
| |
| private TokenizerFactory loadTokenizer(NodeState state) { |
| String clazz = checkNotNull(state.getString(LuceneIndexConstants.ANL_NAME)); |
| Map<String, String> args = convertNodeState(state); |
| TokenizerFactory tf = TokenizerFactory.forName(clazz, args); |
| init(tf, state); |
| return tf; |
| } |
| |
| private Analyzer createAnalyzerViaReflection(NodeState state) { |
| String clazz = state.getString(LuceneIndexConstants.ANL_CLASS); |
| Class<? extends Analyzer> analyzerClazz = defaultLoader.findClass(clazz, Analyzer.class); |
| |
| Version matchVersion = getVersion(state); |
| CharArraySet stopwords = null; |
| if (StopwordAnalyzerBase.class.isAssignableFrom(analyzerClazz) |
| && state.hasChildNode(LuceneIndexConstants.ANL_STOPWORDS)) { |
| try { |
| stopwords = loadStopwordSet(state.getChildNode(LuceneIndexConstants.ANL_STOPWORDS), |
| LuceneIndexConstants.ANL_STOPWORDS, matchVersion); |
| } catch (IOException e) { |
| throw new RuntimeException("Error occurred while loading stopwords", e); |
| } |
| } |
| Constructor<? extends Analyzer> c = null; |
| |
| try { |
| if (stopwords != null) { |
| c = analyzerClazz.getConstructor(Version.class, CharArraySet.class); |
| return c.newInstance(matchVersion, stopwords); |
| } else { |
| c = analyzerClazz.getConstructor(Version.class); |
| return c.newInstance(matchVersion); |
| } |
| } catch (NoSuchMethodException e) { |
| throw new RuntimeException("Error occurred while instantiating Analyzer for " + analyzerClazz, e); |
| } catch (InstantiationException e) { |
| throw new RuntimeException("Error occurred while instantiating Analyzer for " + analyzerClazz, e); |
| } catch (IllegalAccessException e) { |
| throw new RuntimeException("Error occurred while instantiating Analyzer for " + analyzerClazz, e); |
| } catch (InvocationTargetException e) { |
| throw new RuntimeException("Error occurred while instantiating Analyzer for " + analyzerClazz, e); |
| } |
| } |
| |
| private void init(AbstractAnalysisFactory o, NodeState state) { |
| if (o instanceof ResourceLoaderAware) { |
| try { |
| ((ResourceLoaderAware) o).inform(new NodeStateResourceLoader(state, defaultLoader)); |
| } catch (IOException e) { |
| throw new IllegalArgumentException("Error occurred while initializing type " + o.getClass(), e); |
| } |
| } |
| |
| if (state.hasProperty(LuceneIndexConstants.ANL_LUCENE_MATCH_VERSION)){ |
| o.setExplicitLuceneMatchVersion(true); |
| } |
| } |
| |
| Map<String, String> convertNodeState(NodeState state) { |
| Map<String, String> result = Maps.newHashMap(); |
| for (PropertyState ps : state.getProperties()) { |
| String name = ps.getName(); |
| if (ps.getType() != Type.BINARY |
| && !ps.isArray() |
| && !(name != null && NodeStateUtils.isHidden(name)) |
| && !IGNORE_PROP_NAMES.contains(name)) { |
| result.put(name, ps.getValue(Type.STRING)); |
| } |
| } |
| result.put(LuceneIndexConstants.ANL_LUCENE_MATCH_VERSION, getVersion(state).toString()); |
| return result; |
| } |
| |
| private Version getVersion(NodeState state){ |
| Version version = defaultVersion; |
| if (state.hasProperty(LuceneIndexConstants.ANL_LUCENE_MATCH_VERSION)){ |
| version = parseLuceneVersionString(state.getString(LuceneIndexConstants.ANL_LUCENE_MATCH_VERSION)); |
| } |
| return version; |
| } |
| |
| private static String getFactoryType(NodeState state, String nodeStateName){ |
| String type = state.getString(LuceneIndexConstants.ANL_NAME); |
| return type != null ? type : nodeStateName; |
| } |
| |
| @SuppressWarnings("deprecation") |
| private static Version parseLuceneVersionString(final String matchVersion) { |
| final Version version = Version.parseLeniently(matchVersion); |
| if (version == Version.LUCENE_CURRENT && !versionWarningAlreadyLogged.getAndSet(true)) { |
| log.warn( |
| "You should not use LATEST as luceneMatchVersion property: "+ |
| "if you use this setting, and then Solr upgrades to a newer release of Lucene, "+ |
| "sizable changes may happen. If precise back compatibility is important "+ |
| "then you should instead explicitly specify an actual Lucene version." |
| ); |
| } |
| return version; |
| } |
| |
| private static CharArraySet loadStopwordSet(NodeState file, String name, |
| Version matchVersion) throws IOException { |
| Blob blob = ConfigUtil.getBlob(file, name); |
| Reader stopwords = new InputStreamReader(blob.getNewStream(), IOUtils.CHARSET_UTF_8); |
| try { |
| return WordlistLoader.getWordSet(stopwords, matchVersion); |
| } finally { |
| IOUtils.close(stopwords); |
| } |
| } |
| |
| static class NodeStateResourceLoader implements ResourceLoader { |
| private final NodeState state; |
| private final ResourceLoader delegate; |
| |
| public NodeStateResourceLoader(NodeState state, ResourceLoader delegate) { |
| this.state = state; |
| this.delegate = delegate; |
| } |
| |
| @Override |
| public InputStream openResource(String resource) throws IOException { |
| if (state.hasChildNode(resource)){ |
| return ConfigUtil.getBlob(state.getChildNode(resource), resource).getNewStream(); |
| } |
| return delegate.openResource(resource); |
| } |
| |
| @Override |
| public <T> Class<? extends T> findClass(String cname, Class<T> expectedType) { |
| //For factories the cname is not FQN. Instead its the name without suffix |
| //For e.g. for WhitespaceTokenizerFactory its 'whitespace' |
| if (CharFilterFactory.class.isAssignableFrom(expectedType)) { |
| return CharFilterFactory.lookupClass(cname).asSubclass(expectedType); |
| } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) { |
| return TokenizerFactory.lookupClass(cname).asSubclass(expectedType); |
| } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) { |
| return TokenFilterFactory.lookupClass(cname).asSubclass(expectedType); |
| } |
| return delegate.findClass(cname, expectedType); |
| } |
| |
| @Override |
| public <T> T newInstance(String cname, Class<T> expectedType) { |
| throw new UnsupportedOperationException(); |
| } |
| } |
| } |