package org.apache.lucene.analysis.stages;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.stages.attributes.ArcAttribute;
import org.apache.lucene.analysis.stages.attributes.DeletedAttribute;
import org.apache.lucene.analysis.stages.attributes.OffsetAttribute;
import org.apache.lucene.analysis.stages.attributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.ArrayUtil;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
// TODO
// - is there an adversary here? that can cause
// indefinite buffering?
// nocommit ToTokenizer instead? TokenFilter?
// nocommit make this more generic? ie not just atts for
// current indexer ... eg use captureState/restoreState to
// pass through any custom atts too
/** This is a compatibility class, to map the new {@link Stage} API to
* the legacy {@link TokenStream} API currently used/required by
* consumers like {@link IndexWriter} and query parsers. It takes
* a {@link Stage} as input and produces a {@link TokenStream} as
* output. This is not general purpose: it currently only sets
* the attributes that the (core, no custom indexing chain) indexer
* requires. */
public class StageToTokenStream extends TokenStream {

  // nocommit: cutover to the approach from SausageGraphFilter

  private final Stage stage;
  private final DeletedAttribute delAtt;
  private final TermAttribute termAttIn;
  private final ArcAttribute arcAttIn;
  private final OffsetAttribute offsetAttIn;
  private final CharTermAttribute termAttOut;
  private final org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute posIncAttOut;
  private final org.apache.lucene.analysis.tokenattributes.OffsetAttribute offsetAttOut;

  /** Set by {@link #reset}; {@link #incrementToken} refuses to run without it. */
  protected boolean resetCalled;

  /** Starts true: per the TokenStream workflow a stream begins in the "closed" state. */
  protected boolean closeCalled = true;

  // Non-null when we are iterating through previously
  // buffered tokens:
  private Node[] pendingNodes;

  // Cursor into pendingNodes / the current node's leaving arcs:
  private int nodeUpto;
  private int arcUpto;

  // Position of the last token we returned (used to compute posInc):
  private int lastPosition;

  // Position increment carried over from skipped (deleted) tokens, i.e. holes:
  private int pendingPosInc;

  // Last end offset seen from the stage, reported by end():
  private int finalEndOffset;

  // How many nodes in the current clump have no leaving arcs:
  private int frontierNodeCount;

  /** Holds a buffered node (a graph state) while we accumulate a clump of
   *  overlapping tokens; sorted by position before replay. */
  private static class Node implements Comparable<Node> {
    int position;
    final List<Arc> leaving = new ArrayList<Arc>();

    @Override
    public int compareTo(Node other) {
      // No tie break ... I think that's OK?
      // Integer.compare avoids the overflow hazard of plain subtraction:
      return Integer.compare(position, other.position);
    }
  }

  /** A buffered token: an arc from one node to another, carrying term text,
   *  offsets and the deleted flag. */
  private static class Arc {
    final Node to;
    final String term;
    final int startOffset, endOffset;
    final boolean deleted;

    public Arc(Node to, String term, int startOffset, int endOffset, boolean deleted) {
      this.to = to;
      this.term = term;
      this.startOffset = startOffset;
      this.endOffset = endOffset;
      this.deleted = deleted;
    }
  }

  /** Sole constructor; pulls the stage-side attributes and registers the
   *  TokenStream-side attributes the core indexer consumes. */
  public StageToTokenStream(Stage stage) {
    this.stage = stage;
    termAttIn = stage.get(TermAttribute.class);
    termAttOut = addAttribute(CharTermAttribute.class);
    posIncAttOut = addAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
    offsetAttIn = stage.get(OffsetAttribute.class);
    offsetAttOut = addAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    arcAttIn = stage.get(ArcAttribute.class);
    delAtt = stage.get(DeletedAttribute.class);
  }

  /** Returns the buffered Node for the given stage node id, creating it (and
   *  counting it as a new frontier node) on first sight. */
  private Node getNode(Map<Integer,Node> nodes, int node) {
    Node n = nodes.get(node);
    if (n == null) {
      n = new Node();
      nodes.put(node, n);
      frontierNodeCount++;
    }
    return n;
  }

  /** Buffers the stage's current token as an Arc; the from node must already
   *  exist in the map. */
  private void saveToken(Map<Integer,Node> nodes) {
    Node from = nodes.get(arcAttIn.from());
    Node to = getNode(nodes, arcAttIn.to());
    // A node's position is the longest path leading to it:
    to.position = Math.max(to.position, 1+from.position);
    if (from.leaving.isEmpty()) {
      // from is gaining its first leaving arc, so it stops being a frontier node:
      frontierNodeCount--;
      assert frontierNodeCount >= 0;
    }
    from.leaving.add(new Arc(to, termAttIn.toString(), offsetAttIn.startOffset(), offsetAttIn.endOffset(), delAtt != null && delAtt.deleted()));
  }

  /** Advances through the buffered (sorted) nodes/arcs, exporting the next
   *  non-deleted token; returns false when the clump is exhausted.  Deleted
   *  tokens are skipped but accumulate into pendingPosInc (holes). */
  private boolean nextSavedToken() {
    while (pendingNodes != null) {
      // restore state from pending node/arc:
      Node node = pendingNodes[nodeUpto];
      if (node.leaving.isEmpty()) {
        // Only the final (sink) node of the clump may have no leaving arcs:
        assert nodeUpto == pendingNodes.length-1;
        pendingPosInc = node.position - lastPosition - 1;
        assert pendingPosInc >= 0;
        break;
      }
      Arc arc = node.leaving.get(arcUpto);
      arcUpto++;
      if (arcUpto == node.leaving.size()) {
        // Done with this node's arcs; move to the next node:
        nodeUpto++;
        if (nodeUpto == pendingNodes.length) {
          pendingPosInc = node.position - lastPosition;
          pendingNodes = null;
        } else {
          arcUpto = 0;
        }
      }
      if (!arc.deleted) {
        termAttOut.setEmpty();
        termAttOut.append(arc.term);
        offsetAttOut.setOffset(arc.startOffset, arc.endOffset);
        posIncAttOut.setPositionIncrement(node.position - lastPosition);
        // TODO: it'd be trivial to also set PosLengthAtt
        // ... but since indexer is immediately after us, and
        // indexer ignores pos len, there's no point today
        //posLenAttOut.setPositionLength(arc.to.position - node.position);
        pendingPosInc = 0;
        lastPosition = node.position;
        return true;
      }
      // else: deleted token; loop and try the next arc, leaving the hole
      // to be folded into a later posInc.
    }
    return false;
  }

  // nocommit this can falsely join two clumps into one, eg
  // two back-to-back synonyms
  @Override
  public final boolean incrementToken() throws IOException {
    if (resetCalled == false) {
      throw new IllegalStateException("call reset first");
    }

    // This is pointless (we always set all of the attrs we
    // export), but tests disagree:
    clearAttributes();

    if (pendingNodes != null) {
      // Still iterating through buffered tokens from last
      // clump:
      if (nextSavedToken()) {
        return true;
      }
      // We can fall through to here, eg if the last
      // buffered token(s) were deleted (holes)
    }

    if (stage.next()) {
      // NOTE(review): this originally read stage.nodes.anyNodesCanChange();
      // normalized to stage.anyNodesCanChange() to match the two calls in the
      // buffering loop below -- confirm against the Stage API.
      if (stage.anyNodesCanChange()) {
        Map<Integer,Node> nodes = new HashMap<Integer,Node>();
        nodes.put(arcAttIn.from(), new Node());
        frontierNodeCount = 1;

        // Buffer up this "clump" of overlapping tokens
        // until it un-clumps itself:
        saveToken(nodes);
        while (true) {
          boolean result = stage.next();
          // So long as there are still nodes that can
          // change, there must be more tokens (hmm is this
          // really true...):
          assert result: "Stage.next ended without freezing all nodes";
          saveToken(nodes);
          if (!stage.anyNodesCanChange() && frontierNodeCount == 1) {
            // Clump is done: all nodes frozen and only the sink remains open.
            break;
          }
        }

        // Sort all nodes by position:
        pendingNodes = nodes.values().toArray(new Node[nodes.size()]);
        ArrayUtil.timSort(pendingNodes);
        nodeUpto = 0;
        arcUpto = 0;
        lastPosition = -1;
        return nextSavedToken();
      } else {
        // Fast path (pass through): no buffering necessary:
        termAttOut.setEmpty();
        termAttOut.append(termAttIn.get());
        offsetAttOut.setOffset(offsetAttIn.startOffset(),
                               offsetAttIn.endOffset());
        posIncAttOut.setPositionIncrement(1 + pendingPosInc);
        pendingPosInc = 0;
        return true;
      }
    } else {
      // Stage exhausted; remember the final offset for end():
      finalEndOffset = offsetAttIn.endOffset();
    }
    return false;
  }

  @Override
  public void end() throws IOException {
    super.end();
    offsetAttOut.setOffset(finalEndOffset, finalEndOffset);
    // NOTE(review): a nonzero pendingPosInc here would represent trailing
    // holes; legacy TokenStreams usually report that via posInc in end() --
    // TODO confirm whether the stage API can produce trailing deletions.
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    // Clear ALL per-stream state so this instance is safely reusable;
    // previously only pendingNodes was cleared, leaking pendingPosInc etc.
    // into the next pass:
    pendingNodes = null;
    nodeUpto = 0;
    arcUpto = 0;
    lastPosition = 0;
    pendingPosInc = 0;
    finalEndOffset = 0;
    frontierNodeCount = 0;
    resetCalled = true;
  }

  @Override
  public void close() throws IOException {
    // Honor the TokenStream workflow: let the superclass release its
    // resources too.
    super.close();
    closeCalled = true;
  }
}