package org.apache.lucene.search;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.search.similarities.Similarity;
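/**
 * A {@link Scorer} for exact (slop = 0) phrase queries.
 *
 * <p>Each term's document positions are shifted by the negated position of
 * the term within the phrase (see {@code ChunkState.offset}), so that an
 * exact phrase occurrence is simply every term reporting the same shifted
 * position. Shifted positions are consumed in windows of {@code CHUNK}
 * positions: the rarest term stamps candidate start positions into
 * {@code counts}, each subsequent term increments a candidate only if all
 * earlier terms already matched it, and a candidate reached by every term
 * counts as one phrase occurrence.
 *
 * <p>For example, for the phrase "a b" (with "a" the rarer term), a document
 * with "a" at positions {3, 7} and "b" at positions {4, 9} yields shifted
 * positions {3, 7} and {3, 8}; they agree only at 3, so the phrase
 * frequency is 1.
 */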
final class ExactPhraseScorer extends Scorer {
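  // index of the last phrase term, i.e. postings.length - 1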
private final int endMinus1;
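  // shifted positions are processed in windows of this many positions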
private final static int CHUNK = 4096;
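  // current generation; bumped once per window so stale counts[] entries are
  // detectable without clearing the array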
private int gen;
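  // counts[i]: how many leading terms matched a phrase start at position chunkStart + i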
private final int[] counts = new int[CHUNK];
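  // gens[i]: generation that last wrote counts[i]; counts[i] is valid only if gens[i] == gen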
private final int[] gens = new int[CHUNK];
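  // set when some phrase term matches no documents at all; this scorer then matches nothing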
boolean noDocs;
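  // cost estimate: docFreq of the rarest term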
private final long cost;
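  /**
   * Positions iteration state for one phrase term. {@code offset} is the
   * negated position of the term within the phrase, so {@code pos} (document
   * position plus {@code offset}) is the phrase start this occurrence would
   * imply.
   */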
private final static class ChunkState {
final DocsAndPositionsEnum posEnum;
final int offset;
final boolean useAdvance;
int posUpto;
int posLimit;
int pos;
int lastPos;
public ChunkState(DocsAndPositionsEnum posEnum, int offset, boolean useAdvance) {
this.posEnum = posEnum;
this.offset = offset;
this.useAdvance = useAdvance;
}
}
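  // one iteration state per phrase term, ordered by increasing docFreq (rarest first)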
private final ChunkState[] chunkStates;
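  // doc currently positioned on, and the phrase freq within that doc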
private int docID = -1;
private int freq;
private final Similarity.SimScorer docScorer;
ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
Similarity.SimScorer docScorer) throws IOException {
super(weight);
this.docScorer = docScorer;
chunkStates = new ChunkState[postings.length];
endMinus1 = postings.length-1;
    // cost of the rarest term; postings arrive sorted by increasing docFreq,
    // so postings[0] is the minimum
cost = postings[0].postings.cost();
for(int i=0;i<postings.length;i++) {
      // Coarse optimization: advance(target) is fairly
      // costly, so we only use it for terms whose
      // docFreq is more than 5x that of the rarest
      // term; otherwise plain .nextDoc() is cheaper
      // when ANDing. This buys ~15% gain for phrases
      // whose two rarest terms have close frequencies:
final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq;
chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance);
if (i > 0 && postings[i].postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
noDocs = true;
return;
}
}
}
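  // Leapfrog all enums to the next doc that contains every term (driven by
  // the rarest term), then confirm the phrase actually occurs there.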
@Override
public int nextDoc() throws IOException {
while(true) {
// first (rarest) term
final int doc = chunkStates[0].posEnum.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
docID = doc;
return doc;
}
// not-first terms
int i = 1;
while(i < chunkStates.length) {
final ChunkState cs = chunkStates[i];
int doc2 = cs.posEnum.docID();
if (cs.useAdvance) {
if (doc2 < doc) {
doc2 = cs.posEnum.advance(doc);
}
} else {
int iter = 0;
while(doc2 < doc) {
// safety net -- fallback to .advance if we've
// done too many .nextDocs
if (++iter == 50) {
doc2 = cs.posEnum.advance(doc);
break;
} else {
doc2 = cs.posEnum.nextDoc();
}
}
}
if (doc2 > doc) {
break;
}
i++;
}
if (i == chunkStates.length) {
// this doc has all the terms -- now test whether
// phrase occurs
docID = doc;
freq = phraseFreq();
if (freq != 0) {
return docID;
}
}
}
}
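  // Same doc-matching loop as nextDoc(), except the rarest term's enum is
  // first advanced directly to target.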
@Override
public int advance(int target) throws IOException {
// first term
int doc = chunkStates[0].posEnum.advance(target);
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
docID = DocIdSetIterator.NO_MORE_DOCS;
return doc;
}
while(true) {
// not-first terms
int i = 1;
while(i < chunkStates.length) {
int doc2 = chunkStates[i].posEnum.docID();
if (doc2 < doc) {
doc2 = chunkStates[i].posEnum.advance(doc);
}
if (doc2 > doc) {
break;
}
i++;
}
if (i == chunkStates.length) {
// this doc has all the terms -- now test whether
// phrase occurs
docID = doc;
freq = phraseFreq();
if (freq != 0) {
return docID;
}
}
doc = chunkStates[0].posEnum.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS) {
docID = doc;
return doc;
}
}
}
@Override
public String toString() {
return "ExactPhraseScorer(" + weight + ")";
}
@Override
public int freq() {
return freq;
}
@Override
public int docID() {
return docID;
}
@Override
public float score() {
return docScorer.score(docID, freq);
}
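  // Counts how many times the exact phrase occurs in the current doc by
  // scanning all terms' shifted positions in CHUNK-sized windows. Within a
  // window, counts[posIndex] records how many leading terms matched a phrase
  // start at chunkStart + posIndex; gens[posIndex] ties that entry to the
  // current generation, so counts[] never has to be cleared between windows.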
private int phraseFreq() throws IOException {
freq = 0;
// init chunks
for(int i=0;i<chunkStates.length;i++) {
final ChunkState cs = chunkStates[i];
cs.posLimit = cs.posEnum.freq();
cs.pos = cs.offset + cs.posEnum.nextPosition();
cs.posUpto = 1;
cs.lastPos = -1;
}
int chunkStart = 0;
int chunkEnd = CHUNK;
// process chunk by chunk
boolean end = false;
    // TODO: we could fold chunkStart into offset and
    // save one subtract per position increment
while(!end) {
gen++;
if (gen == 0) {
// wraparound
Arrays.fill(gens, 0);
gen++;
}
// first term
{
final ChunkState cs = chunkStates[0];
while(cs.pos < chunkEnd) {
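          // lastPos skips a position reported more than once (e.g. the same
          // term indexed twice at one position), which would otherwise trip
          // the gens assert below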
if (cs.pos > cs.lastPos) {
cs.lastPos = cs.pos;
final int posIndex = cs.pos - chunkStart;
counts[posIndex] = 1;
assert gens[posIndex] != gen;
gens[posIndex] = gen;
}
if (cs.posUpto == cs.posLimit) {
end = true;
break;
}
cs.posUpto++;
cs.pos = cs.offset + cs.posEnum.nextPosition();
}
}
// middle terms
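      // 'any' starts true so that a two-term phrase, which has no middle
      // terms, falls through to the last-term pass below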
boolean any = true;
for(int t=1;t<endMinus1;t++) {
final ChunkState cs = chunkStates[t];
any = false;
while(cs.pos < chunkEnd) {
if (cs.pos > cs.lastPos) {
cs.lastPos = cs.pos;
final int posIndex = cs.pos - chunkStart;
if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) {
// viable
counts[posIndex]++;
any = true;
}
}
if (cs.posUpto == cs.posLimit) {
end = true;
break;
}
cs.posUpto++;
cs.pos = cs.offset + cs.posEnum.nextPosition();
}
if (!any) {
break;
}
}
if (!any) {
// petered out for this chunk
chunkStart += CHUNK;
chunkEnd += CHUNK;
continue;
}
// last term
{
final ChunkState cs = chunkStates[endMinus1];
while(cs.pos < chunkEnd) {
if (cs.pos > cs.lastPos) {
cs.lastPos = cs.pos;
final int posIndex = cs.pos - chunkStart;
if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) {
freq++;
}
}
if (cs.posUpto == cs.posLimit) {
end = true;
break;
}
cs.posUpto++;
cs.pos = cs.offset + cs.posEnum.nextPosition();
}
}
chunkStart += CHUNK;
chunkEnd += CHUNK;
}
return freq;
}
@Override
public long cost() {
return cost;
}
}