blob: 051a8fc0bd194e4f7136807624d44ecfaa9de18e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.byTask.tasks;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory;
import org.apache.lucene.util.Version;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
/**
* Analyzer factory construction task. The name given to the constructed factory may
* be given to NewAnalyzerTask, which will call AnalyzerFactory.create().
*
* Params are in the form argname:argvalue or argname:"argvalue" or argname:'argvalue';
* use backslashes to escape '"' or "'" inside a quoted value when it's used as the enclosing
* quotation mark,
*
* Specify params in a comma separated list of the following, in order:
* <ol>
* <li>Analyzer args:
* <ul>
* <li><b>Required</b>: <code>name:<i>analyzer-factory-name</i></code></li>
* <li>Optional: <tt>positionIncrementGap:<i>int value</i></tt> (default: 0)</li>
* <li>Optional: <tt>offsetGap:<i>int value</i></tt> (default: 1)</li>
* </ul>
* </li>
* <li>zero or more CharFilterFactory's, followed by</li>
* <li>exactly one TokenizerFactory, followed by</li>
* <li>zero or more TokenFilterFactory's</li>
* </ol>
*
* Each component analysis factory may specify <tt>luceneMatchVersion</tt> (defaults to
* {@link Version#LATEST}) and any of the args understood by the specified
* *Factory class, in the above-describe param format.
* <p>
* Example:
* <pre>
* -AnalyzerFactory(name:'strip html, fold to ascii, whitespace tokenize, max 10k tokens',
* positionIncrementGap:100,
* HTMLStripCharFilter,
* MappingCharFilter(mapping:'mapping-FoldToASCII.txt'),
* WhitespaceTokenizer(luceneMatchVersion:LUCENE_5_0_0),
* TokenLimitFilter(maxTokenCount:10000, consumeAllTokens:false))
* [...]
* -NewAnalyzer('strip html, fold to ascii, whitespace tokenize, max 10k tokens')
* </pre>
* <p>
* AnalyzerFactory will direct analysis component factories to look for resources
* under the directory specified in the "work.dir" property.
*/
public class AnalyzerFactoryTask extends PerfTask {
private static final String LUCENE_ANALYSIS_PACKAGE_PREFIX = "org.apache.lucene.analysis.";
private static final Pattern ANALYSIS_COMPONENT_SUFFIX_PATTERN
= Pattern.compile("(?s:(?:(?:Token|Char)?Filter|Tokenizer)(?:Factory)?)$");
private static final Pattern TRAILING_DOT_ZERO_PATTERN = Pattern.compile("\\.0$");
private enum ArgType {ANALYZER_ARG, ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER, TOKENFILTER }
String factoryName = null;
Integer positionIncrementGap = null;
Integer offsetGap = null;
private List<CharFilterFactory> charFilterFactories = new ArrayList<>();
private TokenizerFactory tokenizerFactory = null;
private List<TokenFilterFactory> tokenFilterFactories = new ArrayList<>();
public AnalyzerFactoryTask(PerfRunData runData) {
super(runData);
}
@Override
public int doLogic() {
return 1;
}
/**
* Sets the params.
* Analysis component factory names may optionally include the "Factory" suffix.
*
* @param params analysis pipeline specification: name, (optional) positionIncrementGap,
* (optional) offsetGap, 0+ CharFilterFactory's, 1 TokenizerFactory,
* and 0+ TokenFilterFactory's
*/
@Override
@SuppressWarnings("fallthrough")
public void setParams(String params) {
super.setParams(params);
ArgType expectedArgType = ArgType.ANALYZER_ARG;
final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
stok.commentChar('#');
stok.quoteChar('"');
stok.quoteChar('\'');
stok.eolIsSignificant(false);
stok.ordinaryChar('(');
stok.ordinaryChar(')');
stok.ordinaryChar(':');
stok.ordinaryChar(',');
try {
while (stok.nextToken() != StreamTokenizer.TT_EOF) {
switch (stok.ttype) {
case ',': {
// Do nothing
break;
}
case StreamTokenizer.TT_WORD: {
if (expectedArgType.equals(ArgType.ANALYZER_ARG)) {
final String argName = stok.sval;
if ( ! argName.equalsIgnoreCase("name")
&& ! argName.equalsIgnoreCase("positionIncrementGap")
&& ! argName.equalsIgnoreCase("offsetGap")) {
throw new RuntimeException
("Line #" + lineno(stok) + ": Missing 'name' param to AnalyzerFactory: '" + params + "'");
}
stok.nextToken();
if (stok.ttype != ':') {
throw new RuntimeException
("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory");
}
stok.nextToken();
String argValue = stok.sval;
switch (stok.ttype) {
case StreamTokenizer.TT_NUMBER: {
argValue = Double.toString(stok.nval);
// Drop the ".0" from numbers, for integer arguments
argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst("");
// Intentional fallthrough
}
case '"':
case '\'':
case StreamTokenizer.TT_WORD: {
if (argName.equalsIgnoreCase("name")) {
factoryName = argValue;
expectedArgType = ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER;
} else {
int intArgValue = 0;
try {
intArgValue = Integer.parseInt(argValue);
} catch (NumberFormatException e) {
throw new RuntimeException
("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + argValue + "'", e);
}
if (argName.equalsIgnoreCase("positionIncrementGap")) {
positionIncrementGap = intArgValue;
} else if (argName.equalsIgnoreCase("offsetGap")) {
offsetGap = intArgValue;
}
}
break;
}
case StreamTokenizer.TT_EOF: {
throw new RuntimeException("Unexpected EOF: " + stok.toString());
}
default: {
throw new RuntimeException
("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
}
}
} else if (expectedArgType.equals(ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER)) {
final String argName = stok.sval;
if (argName.equalsIgnoreCase("positionIncrementGap")
|| argName.equalsIgnoreCase("offsetGap")) {
stok.nextToken();
if (stok.ttype != ':') {
throw new RuntimeException
("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to AnalyzerFactory");
}
stok.nextToken();
int intArgValue = (int)stok.nval;
switch (stok.ttype) {
case '"':
case '\'':
case StreamTokenizer.TT_WORD: {
intArgValue = 0;
try {
intArgValue = Integer.parseInt(stok.sval.trim());
} catch (NumberFormatException e) {
throw new RuntimeException
("Line #" + lineno(stok) + ": Exception parsing " + argName + " value '" + stok.sval + "'", e);
}
// Intentional fall-through
}
case StreamTokenizer.TT_NUMBER: {
if (argName.equalsIgnoreCase("positionIncrementGap")) {
positionIncrementGap = intArgValue;
} else if (argName.equalsIgnoreCase("offsetGap")) {
offsetGap = intArgValue;
}
break;
}
case StreamTokenizer.TT_EOF: {
throw new RuntimeException("Unexpected EOF: " + stok.toString());
}
default: {
throw new RuntimeException
("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
}
}
break;
}
try {
final Class<? extends CharFilterFactory> clazz;
clazz = lookupAnalysisClass(argName, CharFilterFactory.class);
createAnalysisPipelineComponent(stok, clazz);
} catch (IllegalArgumentException e) {
try {
final Class<? extends TokenizerFactory> clazz;
clazz = lookupAnalysisClass(argName, TokenizerFactory.class);
createAnalysisPipelineComponent(stok, clazz);
expectedArgType = ArgType.TOKENFILTER;
} catch (IllegalArgumentException e2) {
throw new RuntimeException("Line #" + lineno(stok) + ": Can't find class '"
+ argName + "' as CharFilterFactory or TokenizerFactory");
}
}
} else { // expectedArgType = ArgType.TOKENFILTER
final String className = stok.sval;
final Class<? extends TokenFilterFactory> clazz;
try {
clazz = lookupAnalysisClass(className, TokenFilterFactory.class);
} catch (IllegalArgumentException e) {
throw new RuntimeException
("Line #" + lineno(stok) + ": Can't find class '" + className + "' as TokenFilterFactory");
}
createAnalysisPipelineComponent(stok, clazz);
}
break;
}
default: {
throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
}
}
}
} catch (RuntimeException e) {
if (e.getMessage().startsWith("Line #")) {
throw e;
} else {
throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
}
} catch (Throwable t) {
throw new RuntimeException("Line #" + lineno(stok) + ": ", t);
}
final AnalyzerFactory analyzerFactory = new AnalyzerFactory
(charFilterFactories, tokenizerFactory, tokenFilterFactories);
analyzerFactory.setPositionIncrementGap(positionIncrementGap);
analyzerFactory.setOffsetGap(offsetGap);
getRunData().getAnalyzerFactories().put(factoryName, analyzerFactory);
}
/**
* Instantiates the given analysis factory class after pulling params from
* the given stream tokenizer, then stores the result in the appropriate
* pipeline component list.
*
* @param stok stream tokenizer from which to draw analysis factory params
* @param clazz analysis factory class to instantiate
*/
@SuppressWarnings("fallthrough")
private void createAnalysisPipelineComponent
(StreamTokenizer stok, Class<? extends AbstractAnalysisFactory> clazz) {
Map<String,String> argMap = new HashMap<>();
boolean parenthetical = false;
try {
WHILE_LOOP: while (stok.nextToken() != StreamTokenizer.TT_EOF) {
switch (stok.ttype) {
case ',': {
if (parenthetical) {
// Do nothing
break;
} else {
// Finished reading this analysis factory configuration
break WHILE_LOOP;
}
}
case '(': {
if (parenthetical) {
throw new RuntimeException
("Line #" + lineno(stok) + ": Unexpected opening parenthesis.");
}
parenthetical = true;
break;
}
case ')': {
if (parenthetical) {
parenthetical = false;
} else {
throw new RuntimeException
("Line #" + lineno(stok) + ": Unexpected closing parenthesis.");
}
break;
}
case StreamTokenizer.TT_WORD: {
if ( ! parenthetical) {
throw new RuntimeException("Line #" + lineno(stok) + ": Unexpected token '" + stok.sval + "'");
}
String argName = stok.sval;
stok.nextToken();
if (stok.ttype != ':') {
throw new RuntimeException
("Line #" + lineno(stok) + ": Missing ':' after '" + argName + "' param to " + clazz.getSimpleName());
}
stok.nextToken();
String argValue = stok.sval;
switch (stok.ttype) {
case StreamTokenizer.TT_NUMBER: {
argValue = Double.toString(stok.nval);
// Drop the ".0" from numbers, for integer arguments
argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst("");
// Intentional fall-through
}
case '"':
case '\'':
case StreamTokenizer.TT_WORD: {
argMap.put(argName, argValue);
break;
}
case StreamTokenizer.TT_EOF: {
throw new RuntimeException("Unexpected EOF: " + stok.toString());
}
default: {
throw new RuntimeException
("Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
}
}
}
}
}
if (!argMap.containsKey("luceneMatchVersion")) {
argMap.put("luceneMatchVersion", Version.LATEST.toString());
}
final AbstractAnalysisFactory instance;
try {
instance = clazz.getConstructor(Map.class).newInstance(argMap);
} catch (Exception e) {
throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
}
if (instance instanceof ResourceLoaderAware) {
Path baseDir = Paths.get(getRunData().getConfig().get("work.dir", "work"));
if (!Files.isDirectory(baseDir)) {
baseDir = Paths.get(".");
}
((ResourceLoaderAware)instance).inform(new FilesystemResourceLoader(baseDir));
}
if (CharFilterFactory.class.isAssignableFrom(clazz)) {
charFilterFactories.add((CharFilterFactory)instance);
} else if (TokenizerFactory.class.isAssignableFrom(clazz)) {
tokenizerFactory = (TokenizerFactory)instance;
} else if (TokenFilterFactory.class.isAssignableFrom(clazz)) {
tokenFilterFactories.add((TokenFilterFactory)instance);
}
} catch (RuntimeException e) {
if (e.getMessage().startsWith("Line #")) {
throw (e);
} else {
throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
}
} catch (Throwable t) {
throw new RuntimeException("Line #" + lineno(stok) + ": ", t);
}
}
/**
* This method looks up a class with its fully qualified name (FQN), or a short-name
* class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis."
* as the package prefix (e.g. "standard.ClassicTokenizerFactory" -&gt;
* "org.apache.lucene.analysis.standard.ClassicTokenizerFactory").
*
* If className contains a period, the class is first looked up as-is, assuming that it
* is an FQN. If this fails, lookup is retried after prepending the Lucene analysis
* package prefix to the class name.
*
* If className does not contain a period, the analysis SPI *Factory.lookupClass()
* methods are used to find the class.
*
* @param className The name or the short name of the class.
* @param expectedType The superclass className is expected to extend
* @return the loaded class.
* @throws ClassNotFoundException if lookup fails
*/
public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType)
throws ClassNotFoundException {
if (className.contains(".")) {
try {
// First, try className == FQN
return Class.forName(className).asSubclass(expectedType);
} catch (ClassNotFoundException e) {
try {
// Second, retry lookup after prepending the Lucene analysis package prefix
return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType);
} catch (ClassNotFoundException e1) {
throw new ClassNotFoundException("Can't find class '" + className
+ "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'");
}
}
}
// No dot - use analysis SPI lookup
final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst("");
if (CharFilterFactory.class.isAssignableFrom(expectedType)) {
return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
} else if (TokenizerFactory.class.isAssignableFrom(expectedType)) {
return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
} else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) {
return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
}
throw new ClassNotFoundException("Can't find class '" + className + "'");
}
/* (non-Javadoc)
* @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams()
*/
@Override
public boolean supportsParams() {
return true;
}
/** Returns the current line in the algorithm file */
public int lineno(StreamTokenizer stok) {
return getAlgLineNum() + stok.lineno();
}
}