blob: 474a8d94d9f8945f89e14b7d21c4549bee507824 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.search.TermAutomatonQuery.EnumAndScorer;
import org.apache.lucene.search.TermAutomatonQuery.TermAutomatonWeight;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RunAutomaton;
// TODO: add two-phase and needsScores support. maybe use conjunctionDISI internally?
class TermAutomatonScorer extends Scorer {
private final EnumAndScorer[] subs;
private final EnumAndScorer[] subsOnDoc;
private final PriorityQueue<EnumAndScorer> docIDQueue;
private final PriorityQueue<EnumAndScorer> posQueue;
private final RunAutomaton runAutomaton;
private final Map<Integer,BytesRef> idToTerm;
// We reuse this array to check for matches starting from an initial
// position; we increase posShift every time we move to a new possible
// start:
private PosState[] positions;
int posShift;
// This is -1 if wildcard (null) terms were not used, else it's the id
// of the wildcard term:
private final int anyTermID;
private final LeafSimScorer docScorer;
private int numSubsOnDoc;
private final long cost;
private int docID = -1;
private int freq;
public TermAutomatonScorer(TermAutomatonWeight weight, EnumAndScorer[] subs, int anyTermID, Map<Integer,BytesRef> idToTerm, LeafSimScorer docScorer) throws IOException {
super(weight);
//System.out.println(" automaton:\n" + weight.automaton.toDot());
this.runAutomaton = new TermRunAutomaton(weight.automaton, subs.length);
this.docScorer = docScorer;
this.idToTerm = idToTerm;
this.subs = subs;
this.docIDQueue = new DocIDQueue(subs.length);
this.posQueue = new PositionQueue(subs.length);
this.anyTermID = anyTermID;
this.subsOnDoc = new EnumAndScorer[subs.length];
this.positions = new PosState[4];
for(int i=0;i<this.positions.length;i++) {
this.positions[i] = new PosState();
}
long cost = 0;
// Init docIDQueue:
for(EnumAndScorer sub : subs) {
if (sub != null) {
cost += sub.posEnum.cost();
subsOnDoc[numSubsOnDoc++] = sub;
}
}
this.cost = cost;
}
/** Sorts by docID so we can quickly pull out all scorers that are on
* the same (lowest) docID. */
private static class DocIDQueue extends PriorityQueue<EnumAndScorer> {
public DocIDQueue(int maxSize) {
super(maxSize);
}
@Override
protected boolean lessThan(EnumAndScorer a, EnumAndScorer b) {
return a.posEnum.docID() < b.posEnum.docID();
}
}
/** Sorts by position so we can visit all scorers on one doc, by
* position. */
private static class PositionQueue extends PriorityQueue<EnumAndScorer> {
public PositionQueue(int maxSize) {
super(maxSize);
}
@Override
protected boolean lessThan(EnumAndScorer a, EnumAndScorer b) {
return a.pos < b.pos;
}
}
/** Pops all enums positioned on the current (minimum) doc */
private void popCurrentDoc() {
assert numSubsOnDoc == 0;
assert docIDQueue.size() > 0;
subsOnDoc[numSubsOnDoc++] = docIDQueue.pop();
docID = subsOnDoc[0].posEnum.docID();
while (docIDQueue.size() > 0 && docIDQueue.top().posEnum.docID() == docID) {
subsOnDoc[numSubsOnDoc++] = docIDQueue.pop();
}
}
/** Pushes all previously pop'd enums back into the docIDQueue */
private void pushCurrentDoc() {
for(int i=0;i<numSubsOnDoc;i++) {
docIDQueue.add(subsOnDoc[i]);
}
numSubsOnDoc = 0;
}
@Override
public DocIdSetIterator iterator() {
return new DocIdSetIterator() {
@Override
public int docID() {
return docID;
}
@Override
public long cost() {
return cost;
}
@Override
public int nextDoc() throws IOException {
// we only need to advance docs that are positioned since all docs in the
// pq are guaranteed to be beyond the current doc already
for(int i=0;i<numSubsOnDoc;i++) {
EnumAndScorer sub = subsOnDoc[i];
if (sub.posEnum.nextDoc() != NO_MORE_DOCS) {
sub.posLeft = sub.posEnum.freq()-1;
sub.pos = sub.posEnum.nextPosition();
}
}
pushCurrentDoc();
return doNext();
}
@Override
public int advance(int target) throws IOException {
// Both positioned docs and docs in the pq might be behind target
// 1. Advance the PQ
if (docIDQueue.size() > 0) {
EnumAndScorer top = docIDQueue.top();
while (top.posEnum.docID() < target) {
if (top.posEnum.advance(target) != NO_MORE_DOCS) {
top.posLeft = top.posEnum.freq()-1;
top.pos = top.posEnum.nextPosition();
}
top = docIDQueue.updateTop();
}
}
// 2. Advance subsOnDoc
for(int i=0;i<numSubsOnDoc;i++) {
EnumAndScorer sub = subsOnDoc[i];
if (sub.posEnum.advance(target) != NO_MORE_DOCS) {
sub.posLeft = sub.posEnum.freq()-1;
sub.pos = sub.posEnum.nextPosition();
}
}
pushCurrentDoc();
return doNext();
}
private int doNext() throws IOException {
assert numSubsOnDoc == 0;
assert docIDQueue.top().posEnum.docID() > docID;
while (true) {
//System.out.println(" doNext: cycle");
popCurrentDoc();
//System.out.println(" docID=" + docID);
if (docID == NO_MORE_DOCS) {
return docID;
}
countMatches();
if (freq > 0) {
return docID;
}
for(int i=0;i<numSubsOnDoc;i++) {
EnumAndScorer sub = subsOnDoc[i];
if (sub.posEnum.nextDoc() != NO_MORE_DOCS) {
sub.posLeft = sub.posEnum.freq()-1;
sub.pos = sub.posEnum.nextPosition();
}
}
pushCurrentDoc();
}
}
};
}
private PosState getPosition(int pos) {
return positions[pos-posShift];
}
private void shift(int pos) {
int limit = pos-posShift;
for(int i=0;i<limit;i++) {
positions[i].count = 0;
}
posShift = pos;
}
private void countMatches() throws IOException {
freq = 0;
for(int i=0;i<numSubsOnDoc;i++) {
posQueue.add(subsOnDoc[i]);
}
// System.out.println("\ncountMatches: " + numSubsOnDoc + " terms in doc=" + docID + " anyTermID=" + anyTermID + " id=" + reader.document(docID).get("id"));
// System.out.println("\ncountMatches: " + numSubsOnDoc + " terms in doc=" + docID + " anyTermID=" + anyTermID);
int lastPos = -1;
posShift = -1;
while (posQueue.size() != 0) {
EnumAndScorer sub = posQueue.pop();
// This is a graph intersection, and pos is the state this token
// leaves from. Until index stores posLength (which we could
// stuff into a payload using a simple TokenFilter), this token
// always transitions from state=pos to state=pos+1:
final int pos = sub.pos;
if (posShift == -1) {
posShift = pos;
}
if (pos+1-posShift >= positions.length) {
PosState[] newPositions = new PosState[ArrayUtil.oversize(pos+1-posShift, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(positions, 0, newPositions, 0, positions.length);
for(int i=positions.length;i<newPositions.length;i++) {
newPositions[i] = new PosState();
}
positions = newPositions;
}
// System.out.println(" term=" + idToTerm.get(sub.termID).utf8ToString() + " pos=" + pos + " (count=" + getPosition(pos).count + " lastPos=" + lastPos + ") posQueue.size=" + posQueue.size() + " posShift=" + posShift);
PosState posState;
PosState nextPosState;
// Maybe advance ANY matches:
if (lastPos != -1) {
if (anyTermID != -1) {
int startLastPos = lastPos;
while (lastPos < pos) {
posState = getPosition(lastPos);
if (posState.count == 0 && lastPos > startLastPos) {
// Petered out...
lastPos = pos;
break;
}
// System.out.println(" iter lastPos=" + lastPos + " count=" + posState.count);
nextPosState = getPosition(lastPos+1);
// Advance all states from lastPos -> pos, if they had an any arc:
for(int i=0;i<posState.count;i++) {
int state = runAutomaton.step(posState.states[i], anyTermID);
if (state != -1) {
// System.out.println(" add pos=" + (lastPos+1) + " state=" + state);
nextPosState.add(state);
}
}
lastPos++;
}
}
}
posState = getPosition(pos);
nextPosState = getPosition(pos+1);
// If there are no pending matches at neither this position or the
// next position, then it's safe to shift back to positions[0]:
if (posState.count == 0 && nextPosState.count == 0) {
shift(pos);
posState = getPosition(pos);
nextPosState = getPosition(pos+1);
}
// Match current token:
for(int i=0;i<posState.count;i++) {
// System.out.println(" check cur state=" + posState.states[i]);
int state = runAutomaton.step(posState.states[i], sub.termID);
if (state != -1) {
// System.out.println(" --> " + state);
nextPosState.add(state);
if (runAutomaton.isAccept(state)) {
// System.out.println(" *** (1)");
freq++;
}
}
}
// Also consider starting a new match from this position:
int state = runAutomaton.step(0, sub.termID);
if (state != -1) {
// System.out.println(" add init state=" + state);
nextPosState.add(state);
if (runAutomaton.isAccept(state)) {
// System.out.println(" *** (2)");
freq++;
}
}
if (sub.posLeft > 0) {
// Put this sub back into the posQueue:
sub.pos = sub.posEnum.nextPosition();
sub.posLeft--;
posQueue.add(sub);
}
lastPos = pos;
}
int limit = lastPos+1-posShift;
// reset
for(int i=0;i<=limit;i++) {
positions[i].count = 0;
}
}
@Override
public String toString() {
return "TermAutomatonScorer(" + weight + ")";
}
@Override
public int docID() {
return docID;
}
@Override
public float score() throws IOException {
// TODO: we could probably do better here, e.g. look @ freqs of actual terms involved in this doc and score differently
return docScorer.score(docID, freq);
}
@Override
public float getMaxScore(int upTo) throws IOException {
return docScorer.getSimScorer().score(Float.MAX_VALUE, 1L);
}
static class TermRunAutomaton extends RunAutomaton {
public TermRunAutomaton(Automaton a, int termCount) {
super(a, termCount);
}
}
private static class PosState {
// Which automaton states we are in at this position
int[] states = new int[2];
// How many states
int count;
public void add(int state) {
if (states.length == count) {
states = ArrayUtil.grow(states);
}
states[count++] = state;
}
}
}