blob: d03e889f6cf42719fbfedf696b39a10826b59214 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
/**
* Factory for {@link ConcatenateGraphFilter}.
*
* <ul>
* <li><code>preserveSep</code>: Only read when the Lucene match version is earlier than {@link
* org.apache.lucene.util.Version#LUCENE_8_4_0}. Whether {@link
* ConcatenateGraphFilter#SEP_LABEL} should separate the input tokens in the concatenated
* token.
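* <li><code>tokenSeparator</code>: Only read when the Lucene match version is {@link
* org.apache.lucene.util.Version#LUCENE_8_4_0} or later. Separator to use for concatenation;
* must be a single character. If not present, {@link
* ConcatenateGraphFilter#DEFAULT_TOKEN_SEPARATOR} will be used. If empty, tokens will be
* concatenated without any separators.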
* <li><code>preservePositionIncrements</code>: Whether to add an empty token for missing
* positions. The effect is a consecutive {@link ConcatenateGraphFilter#SEP_LABEL}. When
* false, it's as if there were no missing positions (we pretend the surrounding tokens were
* adjacent).
* <li><code>maxGraphExpansions</code>: If the tokenStream graph has more than this many possible
* paths through, then we'll throw {@link TooComplexToDeterminizeException} to preserve the
* stability and memory of the machine.
* </ul>
*
* @see ConcatenateGraphFilter
* @since 7.4.0
* @lucene.spi {@value #NAME}
*/
public class ConcatenateGraphFilterFactory extends TokenFilterFactory {

  /** SPI name */
  public static final String NAME = "concatenateGraph";

  /** Separator handed to the filter; {@code null} when no separator was configured. */
  private Character tokenSeparator;

  /** Value of the {@code preservePositionIncrements} argument (or its default). */
  private boolean preservePositionIncrements;

  /** Value of the {@code maxGraphExpansions} argument (or its default). */
  private int maxGraphExpansions;

  /**
   * Builds the factory from its configuration arguments.
   *
   * <p>The separator is configured differently depending on the match version: on or after 8.4.0
   * the {@code tokenSeparator} argument is read, while earlier versions read the boolean {@code
   * preserveSep} argument and map it to either the default separator or no separator at all.
   *
   * @param args factory arguments; anything still present after the recognized keys have been
   *     read is rejected
   * @throws IllegalArgumentException if {@code tokenSeparator} is longer than one character, or
   *     if unrecognized arguments remain
   */
  public ConcatenateGraphFilterFactory(Map<String, String> args) {
    super(args);

    final Version matchVersion = getLuceneMatchVersion();
    @SuppressWarnings("deprecation")
    final Version separatorCutover = Version.LUCENE_8_4_0;

    if (matchVersion.onOrAfter(separatorCutover)) {
      tokenSeparator =
          getCharacter(args, "tokenSeparator", ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR);
    } else if (getBoolean(args, "preserveSep", ConcatenateGraphFilter.DEFAULT_PRESERVE_SEP)) {
      // Legacy configuration: keeping the separator means using the default one.
      tokenSeparator = ConcatenateGraphFilter.DEFAULT_TOKEN_SEPARATOR;
    } else {
      tokenSeparator = null;
    }

    preservePositionIncrements =
        getBoolean(
            args,
            "preservePositionIncrements",
            ConcatenateGraphFilter.DEFAULT_PRESERVE_POSITION_INCREMENTS);

    maxGraphExpansions =
        getInt(args, "maxGraphExpansions", ConcatenateGraphFilter.DEFAULT_MAX_GRAPH_EXPANSIONS);

    // Whatever is left in the map was not recognized above.
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /** Default ctor for compatibility with SPI */
  public ConcatenateGraphFilterFactory() {
    throw defaultCtorException();
  }

  /**
   * Wraps {@code input} in a {@link ConcatenateGraphFilter} configured with this factory's
   * settings.
   *
   * @param input the token stream to wrap
   * @return the newly created filter
   */
  @Override
  public TokenStream create(TokenStream input) {
    final ConcatenateGraphFilter filter =
        new ConcatenateGraphFilter(
            input, tokenSeparator, preservePositionIncrements, maxGraphExpansions);
    return filter;
  }

  /**
   * Removes {@code name} from {@code args} and interprets its value as a single character.
   *
   * @param args argument map; the entry for {@code name} is removed
   * @param name key of the argument to read
   * @param defaultVal value returned when {@code name} is absent from {@code args}
   * @return {@code defaultVal} if the argument is absent, {@code null} if it is the empty string,
   *     otherwise its single character
   * @throws IllegalArgumentException if the value is longer than one character
   */
  protected Character getCharacter(Map<String, String> args, String name, Character defaultVal) {
    final String raw = args.remove(name);
    if (raw == null) {
      return defaultVal;
    }
    if (raw.isEmpty()) {
      return null;
    }
    if (raw.length() == 1) {
      return raw.charAt(0);
    }
    throw new IllegalArgumentException(name + " should be a char. \"" + raw + "\" is invalid");
  }
}