blob: bfdfb4316bfb91f2b1c1ab7fe3a27334594b3a94 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Util;
/**
* Concatenates/Joins every incoming token with a separator into one output token for every path
* through the token stream (which is a graph). In simple cases this yields one token, but in the
* presence of any tokens with a zero positionIncrement (e.g. synonyms) it will be more. This filter
* uses the token bytes, position increment, and position length of the incoming stream. Other
* attributes are not used or manipulated.
*
* @lucene.experimental
*/
public final class ConcatenateGraphFilter extends TokenStream {
  /*
   * This filter consumes the provided token stream entirely (lazily, on the first call to
   * incrementToken()), converts it to an Automaton, and then emits one output token per
   * accepted string of that automaton through the
   * {@link org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute} attribute.
   * NOTE(review): an earlier version of this comment referenced a PayloadAttribute and a
   * setPayload(org.apache.lucene.util.BytesRef) method that do not exist in this class --
   * presumably leftover from a related completion token stream; verify against history.
   */

  /** Represents the default separator between tokens. */
  public static final int SEP_LABEL = TokenStreamToAutomaton.POS_SEP;

  /** Default cap on the number of graph paths expanded; also bounds determinization effort. */
  public static final int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES;

  /** Default separator character placed between concatenated tokens ({@link #SEP_LABEL}). */
  public static final Character DEFAULT_TOKEN_SEPARATOR = SEP_LABEL;

  /** By default, input tokens are separated by {@link #SEP_LABEL} in the output. */
  public static final boolean DEFAULT_PRESERVE_SEP = true;

  /** By default, holes (missing positions) remain visible as consecutive separators. */
  public static final boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;

  // Output term bytes (UTF-8) for the current concatenated path.
  private final BytesRefBuilderTermAttribute bytesAtt =
      addAttribute(BytesRefBuilderTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt =
      addAttribute(PositionIncrementAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private final TokenStream inputTokenStream;
  // null means tokens are concatenated with no separator at all
  private final Character tokenSeparator;
  private final boolean preservePositionIncrements;
  private final int maxGraphExpansions;

  // Lazily created in incrementToken(); iterates accepted strings of the automaton.
  private LimitedFiniteStringsIterator finiteStrings;
  // Only non-null if something else added a CharTermAttribute to this stream (see reset()).
  private CharTermAttribute charTermAttribute;
  private boolean wasReset = false;
  // End offset reported by the input stream once consumed; close() resets it to -1, and
  // end() skips the offset update in that case.
  private int endOffset;

  /**
   * Creates a token stream to convert <code>input</code> to a token stream of accepted strings by
   * its token stream graph.
   *
   * <p>This constructor uses the default settings of the constants in this class.
   */
  public ConcatenateGraphFilter(TokenStream inputTokenStream) {
    this(
        inputTokenStream,
        DEFAULT_TOKEN_SEPARATOR,
        DEFAULT_PRESERVE_POSITION_INCREMENTS,
        DEFAULT_MAX_GRAPH_EXPANSIONS);
  }

  /**
   * Creates a token stream to convert <code>input</code> to a token stream of accepted strings by
   * its token stream graph.
   *
   * @param inputTokenStream The input/incoming TokenStream
   * @param tokenSeparator Separator to use for concatenation. Can be null, in this case tokens will
   *     be concatenated without any separators.
   * @param preservePositionIncrements Whether to add an empty token for missing positions. The
   *     effect is a consecutive {@link #SEP_LABEL}. When false, it's as if there were no missing
   *     positions (we pretend the surrounding tokens were adjacent).
   * @param maxGraphExpansions If the tokenStream graph has more than this many possible paths
   *     through, then we'll throw {@link TooComplexToDeterminizeException} to preserve the
   *     stability and memory of the machine.
   * @throws TooComplexToDeterminizeException if the tokenStream graph has more than {@code
   *     maxGraphExpansions} expansions
   */
  public ConcatenateGraphFilter(
      TokenStream inputTokenStream,
      Character tokenSeparator,
      boolean preservePositionIncrements,
      int maxGraphExpansions) {
    // Don't call the super(input) ctor - this is a true delegate and has a new attribute source
    // since we consume
    // the input stream entirely in the first call to incrementToken
    this.inputTokenStream = inputTokenStream;
    this.tokenSeparator = tokenSeparator;
    this.preservePositionIncrements = preservePositionIncrements;
    this.maxGraphExpansions = maxGraphExpansions;
  }

  /**
   * Calls {@link #ConcatenateGraphFilter(org.apache.lucene.analysis.TokenStream,
   * java.lang.Character, boolean, int)}
   *
   * @param preserveSep Whether {@link #SEP_LABEL} should separate the input tokens in the
   *     concatenated token
   */
  public ConcatenateGraphFilter(
      TokenStream inputTokenStream,
      boolean preserveSep,
      boolean preservePositionIncrements,
      int maxGraphExpansions) {
    this(
        inputTokenStream,
        (preserveSep) ? DEFAULT_TOKEN_SEPARATOR : null,
        preservePositionIncrements,
        maxGraphExpansions);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    // we only capture this if we really need it to save the UTF-8 to UTF-16 conversion
    charTermAttribute = getAttribute(CharTermAttribute.class); // may return null
    wasReset = true;
  }

  /**
   * On the first call, consumes the entire input stream and builds the automaton; each call then
   * emits the next accepted string (one concatenated token per graph path) until exhausted.
   */
  @Override
  public boolean incrementToken() throws IOException {
    if (finiteStrings == null) {
      if (wasReset == false) {
        throw new IllegalStateException("reset() missing before incrementToken");
      }
      // lazy init/consume
      Automaton automaton =
          toAutomaton(); // calls reset(), incrementToken() repeatedly, and end() on
      // inputTokenStream
      finiteStrings = new LimitedFiniteStringsIterator(automaton, maxGraphExpansions);
      // note: would be nice to know the startOffset but toAutomaton doesn't capture it. We'll
      // assume 0
      endOffset = inputTokenStream.getAttribute(OffsetAttribute.class).endOffset();
    }

    IntsRef string = finiteStrings.next();
    if (string == null) {
      return false;
    }

    clearAttributes();

    if (finiteStrings.size() > 1) { // if number of iterated strings so far is more than one...
      posIncrAtt.setPositionIncrement(0); // stacked
    }

    offsetAtt.setOffset(0, endOffset);

    Util.toBytesRef(string, bytesAtt.builder()); // now we have UTF-8
    if (charTermAttribute != null) {
      // mirror the bytes into the CharTermAttribute so downstream consumers see the term text
      charTermAttribute.setLength(0);
      charTermAttribute.append(bytesAtt.toUTF16());
    }

    return true;
  }

  @Override
  public void end() throws IOException {
    super.end();
    if (finiteStrings == null) { // thus inputTokenStream hasn't yet received end()
      // the input TS may really want to see "end()" called even if incrementToken hasn't.
      inputTokenStream.end();
    } // else we already eagerly consumed inputTokenStream including end()
    if (endOffset != -1) {
      offsetAtt.setOffset(0, endOffset);
    }
  }

  @Override
  public void close() throws IOException {
    super.close();
    // delegate lifecycle. Note toAutomaton does not close the stream
    inputTokenStream.close();
    finiteStrings = null;
    wasReset = false; // reset
    endOffset = -1; // reset
  }

  /**
   * Converts the tokenStream to an automaton, treating the transition labels as utf-8. Does *not*
   * close it.
   */
  public Automaton toAutomaton() throws IOException {
    return toAutomaton(false);
  }

  /** Converts the tokenStream to an automaton. Does *not* close it. */
  public Automaton toAutomaton(boolean unicodeAware) throws IOException {
    // TODO refactor this
    // maybe we could hook up a modified automaton from TermAutomatonQuery here?

    // Create corresponding automaton: labels are bytes
    // from each analyzed token, with byte 0 used as
    // separator between tokens:
    final TokenStreamToAutomaton tsta;
    if (tokenSeparator != null) {
      tsta = new EscapingTokenStreamToAutomaton(tokenSeparator);
    } else {
      // When we're not preserving sep, we don't steal 0xff
      // byte, so we don't need to do any escaping:
      tsta = new TokenStreamToAutomaton();
    }
    tsta.setPreservePositionIncrements(preservePositionIncrements);
    tsta.setUnicodeArcs(unicodeAware);

    Automaton automaton = tsta.toAutomaton(inputTokenStream);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert

    // Replace POS_SEP arcs with the configured separator (or epsilon when none):
    automaton = replaceSep(automaton, tokenSeparator);
    // This automaton should not blow up during determinize:
    return Operations.determinize(automaton, maxGraphExpansions);
  }

  /** Just escapes the {@link #SEP_LABEL} byte with an extra. */
  private static final class EscapingTokenStreamToAutomaton extends TokenStreamToAutomaton {

    // scratch buffer reused across changeToken() calls
    final BytesRefBuilder spare = new BytesRefBuilder();
    final byte sepLabel;

    public EscapingTokenStreamToAutomaton(int sepLabel) {
      assert sepLabel <= Byte.MAX_VALUE;
      this.sepLabel = (byte) sepLabel;
    }

    @Override
    protected BytesRef changeToken(BytesRef in) {
      int upto = 0;
      for (int i = 0; i < in.length; i++) {
        byte b = in.bytes[in.offset + i];
        if (b == sepLabel) {
          // escape by doubling the separator byte
          spare.grow(upto + 2);
          spare.setByteAt(upto++, sepLabel);
          spare.setByteAt(upto++, b);
        } else {
          spare.grow(upto + 1);
          spare.setByteAt(upto++, b);
        }
      }
      spare.setLength(upto);
      return spare.get();
    }
  }

  // Replaces SEP with epsilon or remaps them if
  // we were asked to preserve them:
  private static Automaton replaceSep(Automaton a, Character tokenSeparator) {
    Automaton result = new Automaton();

    // Copy all states over
    int numStates = a.getNumStates();
    for (int s = 0; s < numStates; s++) {
      result.createState();
      result.setAccept(s, a.isAccept(s));
    }

    // Go in reverse topo sort so we know we only have to
    // make one pass:
    Transition t = new Transition();
    int[] topoSortStates = Operations.topoSortStates(a);
    for (int i = 0; i < topoSortStates.length; i++) {
      int state = topoSortStates[topoSortStates.length - 1 - i];
      int count = a.initTransition(state, t);
      for (int j = 0; j < count; j++) {
        a.getNextTransition(t);
        if (t.min == TokenStreamToAutomaton.POS_SEP) {
          assert t.max == TokenStreamToAutomaton.POS_SEP;
          if (tokenSeparator != null) {
            // Remap to tokenSeparator:
            result.addTransition(state, t.dest, tokenSeparator);
          } else {
            result.addEpsilon(state, t.dest);
          }
        } else if (t.min == TokenStreamToAutomaton.HOLE) {
          assert t.max == TokenStreamToAutomaton.HOLE;

          // Just remove the hole: there will then be two
          // SEP tokens next to each other, which will only
          // match another hole at search time. Note that
          // it will also match an empty-string token ... if
          // that's somehow a problem we can always map HOLE
          // to a dedicated byte (and escape it in the
          // input).
          result.addEpsilon(state, t.dest);
        } else {
          result.addTransition(state, t.dest, t.min, t.max);
        }
      }
    }

    result.finishState();

    return result;
  }

  /**
   * Attribute providing access to the term builder and UTF-16 conversion
   *
   * @lucene.internal
   */
  public interface BytesRefBuilderTermAttribute extends TermToBytesRefAttribute {
    /** Returns the builder from which the term is derived. */
    BytesRefBuilder builder();

    /** Returns the term represented as UTF-16 */
    CharSequence toUTF16();
  }

  /**
   * Implementation of {@link BytesRefBuilderTermAttribute}
   *
   * @lucene.internal
   */
  public static final class BytesRefBuilderTermAttributeImpl extends AttributeImpl
      implements BytesRefBuilderTermAttribute, TermToBytesRefAttribute {
    private final BytesRefBuilder bytes = new BytesRefBuilder();
    // lazily allocated; only needed when a UTF-16 view of the term is requested
    private transient CharsRefBuilder charsRef;

    /** Sole constructor no-op */
    public BytesRefBuilderTermAttributeImpl() {}

    @Override
    public BytesRefBuilder builder() {
      return bytes;
    }

    @Override
    public BytesRef getBytesRef() {
      return bytes.get();
    }

    @Override
    public void clear() {
      bytes.clear();
    }

    @Override
    public void copyTo(AttributeImpl target) {
      BytesRefBuilderTermAttributeImpl other = (BytesRefBuilderTermAttributeImpl) target;
      other.bytes.copyBytes(bytes);
    }

    @Override
    public AttributeImpl clone() {
      BytesRefBuilderTermAttributeImpl other = new BytesRefBuilderTermAttributeImpl();
      copyTo(other);
      return other;
    }

    @Override
    public void reflectWith(AttributeReflector reflector) {
      reflector.reflect(TermToBytesRefAttribute.class, "bytes", getBytesRef());
    }

    @Override
    public CharSequence toUTF16() {
      if (charsRef == null) {
        charsRef = new CharsRefBuilder();
      }
      charsRef.copyUTF8Bytes(getBytesRef());
      return charsRef.get();
    }
  }
}