/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.PriorityQueue;

/** Matches phrases where each term appears at its exact relative position (no slop). */
final class ExactPhraseMatcher extends PhraseMatcher {
  /** A term's postings plus its relative position (offset) within the phrase. */
  private static class PostingsAndPosition {
    private final PostingsEnum postings;
    private final int offset; // position of this term within the phrase
    private int freq, upTo, pos; // per-doc freq, positions consumed so far, current position
public PostingsAndPosition(PostingsEnum postings, int offset) {
this.postings = postings;
this.offset = offset;
}
}
  private final PostingsAndPosition[] postings;
  // Conjunction over all term iterators, used to find candidate docs.
  private final DocIdSetIterator approximation;
  // Impacts-aware view of the same conjunction, used for block-max score pruning.
  private final ImpactsDISI impactsApproximation;
  ExactPhraseMatcher(
      PhraseQuery.PostingsAndFreq[] postings, ScoreMode scoreMode, SimScorer scorer, float matchCost) {
    super(matchCost);
    final DocIdSetIterator approximation = ConjunctionDISI.intersectIterators(
        Arrays.stream(postings).map(p -> p.postings).collect(Collectors.toList()));
    final ImpactsSource impactsSource = mergeImpacts(
        Arrays.stream(postings).map(p -> p.impacts).toArray(ImpactsEnum[]::new));
if (scoreMode == ScoreMode.TOP_SCORES) {
this.approximation = this.impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
} else {
this.approximation = approximation;
this.impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
}
List<PostingsAndPosition> postingsAndPositions = new ArrayList<>();
    for (PhraseQuery.PostingsAndFreq posting : postings) {
postingsAndPositions.add(new PostingsAndPosition(posting.postings, posting.position));
}
this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]);
}
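
  // Illustrative usage sketch (not part of this class): a scorer such as
  // PhraseScorer typically positions the approximation on a candidate doc,
  // then verifies positions with this matcher:
  //
  //   matcher.reset();                  // once per candidate doc
  //   float freq = 0;
  //   while (matcher.nextMatch()) {
  //     freq += matcher.sloppyWeight(); // always 1 for exact phrases
  //   }
  //   // freq is now the exact phrase frequency within the doc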
@Override
DocIdSetIterator approximation() {
return approximation;
}
@Override
ImpactsDISI impactsApproximation() {
return impactsApproximation;
}
@Override
  float maxFreq() {
    // The phrase occurs at most as often as its least frequent term, so the
    // minimum term frequency is an upper bound on the phrase frequency.
    int minFreq = postings[0].freq;
for (int i = 1; i < postings.length; i++) {
minFreq = Math.min(minFreq, postings[i].freq);
}
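    // e.g. in a doc "a b a": freq(a) = 2, freq(b) = 1, so "a b" occurs at most once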
return minFreq;
}
  /** Advance the given pos enum to the first position on or after {@code target}.
   * Return {@code false} if the enum was exhausted before reaching
   * {@code target} and {@code true} otherwise.
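   * For example (illustrative), with remaining positions [1, 4, 7] and
   * {@code target = 5}, the enum stops on position 7 and returns {@code true};
   * with {@code target = 8}, it is exhausted at position 7 and returns
   * {@code false}. */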
private static boolean advancePosition(PostingsAndPosition posting, int target) throws IOException {
while (posting.pos < target) {
if (posting.upTo == posting.freq) {
return false;
} else {
posting.pos = posting.postings.nextPosition();
posting.upTo += 1;
}
}
return true;
}
@Override
public void reset() throws IOException {
for (PostingsAndPosition posting : postings) {
posting.freq = posting.postings.freq();
posting.pos = -1;
posting.upTo = 0;
}
}
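
  // Worked example (illustrative) for nextMatch(): for the phrase "a b"
  // (offsets 0 and 1), suppose "a" has positions [1, 5] and "b" has [3, 6]
  // in the current doc. The lead "a" moves to 1, so "b" is expected at
  // position 2; "b" advances to 3, overshooting, so the lead is advanced to
  // the first position >= 3 - 1 = 2, landing on 5. "b" is then expected at 6,
  // advances exactly to 6, and the method returns true with
  // startPosition() == 5 and endPosition() == 6.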
@Override
public boolean nextMatch() throws IOException {
final PostingsAndPosition lead = postings[0];
if (lead.upTo < lead.freq) {
lead.pos = lead.postings.nextPosition();
lead.upTo += 1;
    } else {
      return false;
    }
advanceHead:
while (true) {
final int phrasePos = lead.pos - lead.offset;
for (int j = 1; j < postings.length; ++j) {
final PostingsAndPosition posting = postings[j];
final int expectedPos = phrasePos + posting.offset;
// advance up to the same position as the lead
if (advancePosition(posting, expectedPos) == false) {
break advanceHead;
}
if (posting.pos != expectedPos) { // we advanced too far
if (advancePosition(lead, posting.pos - posting.offset + lead.offset)) {
continue advanceHead;
} else {
break advanceHead;
}
}
}
return true;
}
return false;
}
@Override
float sloppyWeight() {
return 1;
}
@Override
public int startPosition() {
return postings[0].pos;
}
@Override
public int endPosition() {
return postings[postings.length - 1].pos;
}
@Override
public int startOffset() throws IOException {
return postings[0].postings.startOffset();
}
@Override
public int endOffset() throws IOException {
return postings[postings.length - 1].postings.endOffset();
}
/**
* Merge impacts for multiple terms of an exact phrase.
*/
static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums) {
// Iteration of block boundaries uses the impacts enum with the lower cost.
// This is consistent with BlockMaxConjunctionScorer.
int tmpLeadIndex = -1;
for (int i = 0; i < impactsEnums.length; ++i) {
if (tmpLeadIndex == -1 || impactsEnums[i].cost() < impactsEnums[tmpLeadIndex].cost()) {
tmpLeadIndex = i;
}
}
final int leadIndex = tmpLeadIndex;
return new ImpactsSource() {
      // Iterates over one term's impact list, keeping track of the current entry.
      class SubIterator {
final Iterator<Impact> iterator;
Impact current;
SubIterator(List<Impact> impacts) {
this.iterator = impacts.iterator();
this.current = iterator.next();
}
boolean next() {
if (iterator.hasNext() == false) {
current = null;
return false;
} else {
current = iterator.next();
return true;
}
}
}
@Override
public Impacts getImpacts() throws IOException {
final Impacts[] impacts = new Impacts[impactsEnums.length];
for (int i = 0; i < impactsEnums.length; ++i) {
impacts[i] = impactsEnums[i].getImpacts();
}
final Impacts lead = impacts[leadIndex];
return new Impacts() {
@Override
public int numLevels() {
// Delegate to the lead
return lead.numLevels();
}
@Override
public int getDocIdUpTo(int level) {
// Delegate to the lead
return lead.getDocIdUpTo(level);
}
/**
* Return the minimum level whose impacts are valid up to {@code docIdUpTo},
* or {@code -1} if there is no such level.
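             * For instance, if levels 0, 1 and 2 are valid up to docs 128, 4096
             * and maxDoc respectively, then a {@code docIdUpTo} of 1000 returns
             * level 1.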
*/
private int getLevel(Impacts impacts, int docIdUpTo) {
for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) {
if (impacts.getDocIdUpTo(level) >= docIdUpTo) {
return level;
}
}
return -1;
}
@Override
public List<Impact> getImpacts(int level) {
final int docIdUpTo = getDocIdUpTo(level);
PriorityQueue<SubIterator> pq = new PriorityQueue<SubIterator>(impacts.length) {
@Override
protected boolean lessThan(SubIterator a, SubIterator b) {
return a.current.freq < b.current.freq;
}
};
boolean hasImpacts = false;
List<Impact> onlyImpactList = null;
for (int i = 0; i < impacts.length; ++i) {
int impactsLevel = getLevel(impacts[i], docIdUpTo);
if (impactsLevel == -1) {
// This instance doesn't have useful impacts, ignore it: this is safe.
continue;
}
List<Impact> impactList = impacts[i].getImpacts(impactsLevel);
Impact firstImpact = impactList.get(0);
if (firstImpact.freq == Integer.MAX_VALUE && firstImpact.norm == 1L) {
// Dummy impacts, ignore it too.
continue;
}
SubIterator subIterator = new SubIterator(impactList);
pq.add(subIterator);
if (hasImpacts == false) {
hasImpacts = true;
onlyImpactList = impactList;
} else {
onlyImpactList = null; // there are multiple impacts
}
}
if (hasImpacts == false) {
return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
} else if (onlyImpactList != null) {
return onlyImpactList;
}
// Idea: merge impacts by freq. The tricky thing is that we need to
// consider freq values that are not in the impacts too. For
// instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}],
// there might well be a document that has a freq of 2 and a length of 11,
// which was just not added to the list of impacts because {freq=2,norm=10}
// is more competitive.
// We walk impacts in parallel through a PQ ordered by freq. At any time,
// the competitive impact consists of the lowest freq among all entries of
// the PQ (the top) and the highest norm (tracked separately).
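          // Worked example (illustrative): merging A = [{freq=1, norm=10},
          // {freq=3, norm=15}] with B = [{freq=2, norm=12}, {freq=5, norm=20}]
          // yields [{freq=1, norm=12}, {freq=2, norm=15}, {freq=3, norm=20}]:
          // each merged entry pairs a freq boundary with the largest norm seen
          // so far across all clauses, and merging stops as soon as one clause
          // runs out of impacts.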
List<Impact> mergedImpacts = new ArrayList<>();
SubIterator top = pq.top();
int currentFreq = top.current.freq;
long currentNorm = 0;
for (SubIterator it : pq) {
if (Long.compareUnsigned(it.current.norm, currentNorm) > 0) {
currentNorm = it.current.norm;
}
}
          outer:
          while (true) {
            if (mergedImpacts.size() > 0
                && mergedImpacts.get(mergedImpacts.size() - 1).norm == currentNorm) {
mergedImpacts.get(mergedImpacts.size() - 1).freq = currentFreq;
} else {
mergedImpacts.add(new Impact(currentFreq, currentNorm));
}
do {
if (top.next() == false) {
// At least one clause doesn't have any more documents below the current norm,
// so we can safely ignore further clauses. The only reason why they have more
// impacts is because they cover more documents that we are not interested in.
break outer;
}
if (Long.compareUnsigned(top.current.norm, currentNorm) > 0) {
currentNorm = top.current.norm;
}
top = pq.updateTop();
} while (top.current.freq == currentFreq);
currentFreq = top.current.freq;
}
return mergedImpacts;
}
};
}
@Override
public void advanceShallow(int target) throws IOException {
for (ImpactsEnum impactsEnum : impactsEnums) {
impactsEnum.advanceShallow(target);
}
}
};
}
}