// blob: 9120237a17d87c70a0e74c676ae94318af67c6aa [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.byTask.tasks;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.AbstractAnalysisFactory;
import org.apache.lucene.analysis.CharFilterFactory;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenizerFactory;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory;
import org.apache.lucene.util.ResourceLoaderAware;
import org.apache.lucene.util.Version;
/**
* Analyzer factory construction task. The name given to the constructed factory may be given to
* NewAnalyzerTask, which will call AnalyzerFactory.create().
*
* <p>Params are in the form argname:argvalue or argname:"argvalue" or argname:'argvalue'; use
* backslashes to escape '"' or "'" inside a quoted value when it's used as the enclosing quotation
* mark.
*
* <p>Specify params in a comma separated list of the following, in order:
*
* <ol>
* <li>Analyzer args:
* <ul>
* <li><b>Required</b>: <code>name:<i>analyzer-factory-name</i></code>
* <li>Optional: <code>positionIncrementGap:<i>int value</i></code> (default: 0)
* <li>Optional: <code>offsetGap:<i>int value</i></code> (default: 1)
* </ul>
* <li>zero or more CharFilterFactory's, followed by
* <li>exactly one TokenizerFactory, followed by
* <li>zero or more TokenFilterFactory's
* </ol>
*
* Each component analysis factory may specify <code>luceneMatchVersion</code> (defaults to {@link
* Version#LATEST}) and any of the args understood by the specified *Factory class, in the
* above-described param format.
*
* <p>Example:
*
* <pre>
* -AnalyzerFactory(name:'strip html, fold to ascii, whitespace tokenize, max 10k tokens',
* positionIncrementGap:100,
* HTMLStripCharFilter,
* MappingCharFilter(mapping:'mapping-FoldToASCII.txt'),
* WhitespaceTokenizer(luceneMatchVersion:LUCENE_5_0_0),
* TokenLimitFilter(maxTokenCount:10000, consumeAllTokens:false))
* [...]
* -NewAnalyzer('strip html, fold to ascii, whitespace tokenize, max 10k tokens')
* </pre>
*
* <p>AnalyzerFactory will direct analysis component factories to look for resources under the
* directory specified in the "work.dir" property.
*/
public class AnalyzerFactoryTask extends PerfTask {
private static final String LUCENE_ANALYSIS_PACKAGE_PREFIX = "org.apache.lucene.analysis.";
private static final Pattern ANALYSIS_COMPONENT_SUFFIX_PATTERN =
Pattern.compile("(?s:(?:(?:Token|Char)?Filter|Tokenizer)(?:Factory)?)$");
private static final Pattern TRAILING_DOT_ZERO_PATTERN = Pattern.compile("\\.0$");
private enum ArgType {
ANALYZER_ARG,
ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER,
TOKENFILTER
}
String factoryName = null;
Integer positionIncrementGap = null;
Integer offsetGap = null;
private List<CharFilterFactory> charFilterFactories = new ArrayList<>();
private TokenizerFactory tokenizerFactory = null;
private List<TokenFilterFactory> tokenFilterFactories = new ArrayList<>();
public AnalyzerFactoryTask(PerfRunData runData) {
super(runData);
}
@Override
public int doLogic() {
return 1;
}
/**
* Sets the params. Analysis component factory names may optionally include the "Factory" suffix.
*
* @param params analysis pipeline specification: name, (optional) positionIncrementGap,
* (optional) offsetGap, 0+ CharFilterFactory's, 1 TokenizerFactory, and 0+
* TokenFilterFactory's
*/
@Override
@SuppressWarnings("fallthrough")
public void setParams(String params) {
super.setParams(params);
ArgType expectedArgType = ArgType.ANALYZER_ARG;
final StreamTokenizer stok = new StreamTokenizer(new StringReader(params));
stok.commentChar('#');
stok.quoteChar('"');
stok.quoteChar('\'');
stok.eolIsSignificant(false);
stok.ordinaryChar('(');
stok.ordinaryChar(')');
stok.ordinaryChar(':');
stok.ordinaryChar(',');
try {
while (stok.nextToken() != StreamTokenizer.TT_EOF) {
switch (stok.ttype) {
case ',':
{
// Do nothing
break;
}
case StreamTokenizer.TT_WORD:
{
if (expectedArgType.equals(ArgType.ANALYZER_ARG)) {
final String argName = stok.sval;
if (!argName.equalsIgnoreCase("name")
&& !argName.equalsIgnoreCase("positionIncrementGap")
&& !argName.equalsIgnoreCase("offsetGap")) {
throw new RuntimeException(
"Line #"
+ lineno(stok)
+ ": Missing 'name' param to AnalyzerFactory: '"
+ params
+ "'");
}
stok.nextToken();
if (stok.ttype != ':') {
throw new RuntimeException(
"Line #"
+ lineno(stok)
+ ": Missing ':' after '"
+ argName
+ "' param to AnalyzerFactory");
}
stok.nextToken();
String argValue = stok.sval;
switch (stok.ttype) {
case StreamTokenizer.TT_NUMBER:
{
argValue = Double.toString(stok.nval);
// Drop the ".0" from numbers, for integer arguments
argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst("");
// Intentional fallthrough
}
case '"':
case '\'':
case StreamTokenizer.TT_WORD:
{
if (argName.equalsIgnoreCase("name")) {
factoryName = argValue;
expectedArgType = ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER;
} else {
int intArgValue = 0;
try {
intArgValue = Integer.parseInt(argValue);
} catch (NumberFormatException e) {
throw new RuntimeException(
"Line #"
+ lineno(stok)
+ ": Exception parsing "
+ argName
+ " value '"
+ argValue
+ "'",
e);
}
if (argName.equalsIgnoreCase("positionIncrementGap")) {
positionIncrementGap = intArgValue;
} else if (argName.equalsIgnoreCase("offsetGap")) {
offsetGap = intArgValue;
}
}
break;
}
case StreamTokenizer.TT_EOF:
{
throw new RuntimeException("Unexpected EOF: " + stok.toString());
}
default:
{
throw new RuntimeException(
"Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
}
}
} else if (expectedArgType.equals(ArgType.ANALYZER_ARG_OR_CHARFILTER_OR_TOKENIZER)) {
final String argName = stok.sval;
if (argName.equalsIgnoreCase("positionIncrementGap")
|| argName.equalsIgnoreCase("offsetGap")) {
stok.nextToken();
if (stok.ttype != ':') {
throw new RuntimeException(
"Line #"
+ lineno(stok)
+ ": Missing ':' after '"
+ argName
+ "' param to AnalyzerFactory");
}
stok.nextToken();
int intArgValue = (int) stok.nval;
switch (stok.ttype) {
case '"':
case '\'':
case StreamTokenizer.TT_WORD:
{
intArgValue = 0;
try {
intArgValue = Integer.parseInt(stok.sval.trim());
} catch (NumberFormatException e) {
throw new RuntimeException(
"Line #"
+ lineno(stok)
+ ": Exception parsing "
+ argName
+ " value '"
+ stok.sval
+ "'",
e);
}
// Intentional fall-through
}
case StreamTokenizer.TT_NUMBER:
{
if (argName.equalsIgnoreCase("positionIncrementGap")) {
positionIncrementGap = intArgValue;
} else if (argName.equalsIgnoreCase("offsetGap")) {
offsetGap = intArgValue;
}
break;
}
case StreamTokenizer.TT_EOF:
{
throw new RuntimeException("Unexpected EOF: " + stok.toString());
}
default:
{
throw new RuntimeException(
"Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
}
}
break;
}
try {
final Class<? extends CharFilterFactory> clazz;
clazz = lookupAnalysisClass(argName, CharFilterFactory.class);
createAnalysisPipelineComponent(stok, clazz);
} catch (IllegalArgumentException e) {
try {
final Class<? extends TokenizerFactory> clazz;
clazz = lookupAnalysisClass(argName, TokenizerFactory.class);
createAnalysisPipelineComponent(stok, clazz);
expectedArgType = ArgType.TOKENFILTER;
} catch (IllegalArgumentException e2) {
throw new RuntimeException(
"Line #"
+ lineno(stok)
+ ": Can't find class '"
+ argName
+ "' as CharFilterFactory or TokenizerFactory");
}
}
} else { // expectedArgType = ArgType.TOKENFILTER
final String className = stok.sval;
final Class<? extends TokenFilterFactory> clazz;
try {
clazz = lookupAnalysisClass(className, TokenFilterFactory.class);
} catch (IllegalArgumentException e) {
throw new RuntimeException(
"Line #"
+ lineno(stok)
+ ": Can't find class '"
+ className
+ "' as TokenFilterFactory");
}
createAnalysisPipelineComponent(stok, clazz);
}
break;
}
default:
{
throw new RuntimeException(
"Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
}
}
}
} catch (RuntimeException e) {
if (e.getMessage().startsWith("Line #")) {
throw e;
} else {
throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
}
} catch (Throwable t) {
throw new RuntimeException("Line #" + lineno(stok) + ": ", t);
}
final AnalyzerFactory analyzerFactory =
new AnalyzerFactory(charFilterFactories, tokenizerFactory, tokenFilterFactories);
analyzerFactory.setPositionIncrementGap(positionIncrementGap);
analyzerFactory.setOffsetGap(offsetGap);
getRunData().getAnalyzerFactories().put(factoryName, analyzerFactory);
}
/**
* Instantiates the given analysis factory class after pulling params from the given stream
* tokenizer, then stores the result in the appropriate pipeline component list.
*
* @param stok stream tokenizer from which to draw analysis factory params
* @param clazz analysis factory class to instantiate
*/
@SuppressWarnings("fallthrough")
private void createAnalysisPipelineComponent(
StreamTokenizer stok, Class<? extends AbstractAnalysisFactory> clazz) {
Map<String, String> argMap = new HashMap<>();
boolean parenthetical = false;
try {
WHILE_LOOP:
while (stok.nextToken() != StreamTokenizer.TT_EOF) {
switch (stok.ttype) {
case ',':
{
if (parenthetical) {
// Do nothing
break;
} else {
// Finished reading this analysis factory configuration
break WHILE_LOOP;
}
}
case '(':
{
if (parenthetical) {
throw new RuntimeException(
"Line #" + lineno(stok) + ": Unexpected opening parenthesis.");
}
parenthetical = true;
break;
}
case ')':
{
if (parenthetical) {
parenthetical = false;
} else {
throw new RuntimeException(
"Line #" + lineno(stok) + ": Unexpected closing parenthesis.");
}
break;
}
case StreamTokenizer.TT_WORD:
{
if (!parenthetical) {
throw new RuntimeException(
"Line #" + lineno(stok) + ": Unexpected token '" + stok.sval + "'");
}
String argName = stok.sval;
stok.nextToken();
if (stok.ttype != ':') {
throw new RuntimeException(
"Line #"
+ lineno(stok)
+ ": Missing ':' after '"
+ argName
+ "' param to "
+ clazz.getSimpleName());
}
stok.nextToken();
String argValue = stok.sval;
switch (stok.ttype) {
case StreamTokenizer.TT_NUMBER:
{
argValue = Double.toString(stok.nval);
// Drop the ".0" from numbers, for integer arguments
argValue = TRAILING_DOT_ZERO_PATTERN.matcher(argValue).replaceFirst("");
// Intentional fall-through
}
case '"':
case '\'':
case StreamTokenizer.TT_WORD:
{
argMap.put(argName, argValue);
break;
}
case StreamTokenizer.TT_EOF:
{
throw new RuntimeException("Unexpected EOF: " + stok.toString());
}
default:
{
throw new RuntimeException(
"Line #" + lineno(stok) + ": Unexpected token: " + stok.toString());
}
}
}
}
}
if (!argMap.containsKey("luceneMatchVersion")) {
argMap.put("luceneMatchVersion", Version.LATEST.toString());
}
final AbstractAnalysisFactory instance;
try {
instance = clazz.getConstructor(Map.class).newInstance(argMap);
} catch (Exception e) {
throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
}
if (instance instanceof ResourceLoaderAware) {
Path baseDir = Paths.get(getRunData().getConfig().get("work.dir", "work"));
if (!Files.isDirectory(baseDir)) {
baseDir = Paths.get(".");
}
((ResourceLoaderAware) instance).inform(new FilesystemResourceLoader(baseDir));
}
if (CharFilterFactory.class.isAssignableFrom(clazz)) {
charFilterFactories.add((CharFilterFactory) instance);
} else if (TokenizerFactory.class.isAssignableFrom(clazz)) {
tokenizerFactory = (TokenizerFactory) instance;
} else if (TokenFilterFactory.class.isAssignableFrom(clazz)) {
tokenFilterFactories.add((TokenFilterFactory) instance);
}
} catch (RuntimeException e) {
if (e.getMessage().startsWith("Line #")) {
throw (e);
} else {
throw new RuntimeException("Line #" + lineno(stok) + ": ", e);
}
} catch (Throwable t) {
throw new RuntimeException("Line #" + lineno(stok) + ": ", t);
}
}
/**
* This method looks up a class with its fully qualified name (FQN), or a short-name
* class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis." as the
* package prefix (e.g. "standard.ClassicTokenizerFactory" -&gt;
* "org.apache.lucene.analysis.standard.ClassicTokenizerFactory").
*
* <p>If className contains a period, the class is first looked up as-is, assuming that it is an
* FQN. If this fails, lookup is retried after prepending the Lucene analysis package prefix to
* the class name.
*
* <p>If className does not contain a period, the analysis SPI *Factory.lookupClass() methods are
* used to find the class.
*
* @param className The name or the short name of the class.
* @param expectedType The superclass className is expected to extend
* @return the loaded class.
* @throws ClassNotFoundException if lookup fails
*/
public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType)
throws ClassNotFoundException {
if (className.contains(".")) {
try {
// First, try className == FQN
return Class.forName(className).asSubclass(expectedType);
} catch (ClassNotFoundException e) {
try {
// Second, retry lookup after prepending the Lucene analysis package prefix
return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType);
} catch (ClassNotFoundException e1) {
throw new ClassNotFoundException(
"Can't find class '"
+ className
+ "' or '"
+ LUCENE_ANALYSIS_PACKAGE_PREFIX
+ className
+ "'");
}
}
}
// No dot - use analysis SPI lookup
final String analysisComponentName =
ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst("");
if (CharFilterFactory.class.isAssignableFrom(expectedType)) {
return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
} else if (TokenizerFactory.class.isAssignableFrom(expectedType)) {
return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
} else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) {
return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
}
throw new ClassNotFoundException("Can't find class '" + className + "'");
}
/* (non-Javadoc)
* @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams()
*/
@Override
public boolean supportsParams() {
return true;
}
/** Returns the current line in the algorithm file */
public int lineno(StreamTokenizer stok) {
return getAlgLineNum() + stok.lineno();
}
}