| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search; |
| |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.stream.Collectors; |
| |
| import org.apache.lucene.index.Impact; |
| import org.apache.lucene.index.Impacts; |
| import org.apache.lucene.index.ImpactsEnum; |
| import org.apache.lucene.index.ImpactsSource; |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| import org.apache.lucene.util.PriorityQueue; |
| |
| final class ExactPhraseMatcher extends PhraseMatcher { |
| |
| private static class PostingsAndPosition { |
| private final PostingsEnum postings; |
| private final int offset; |
| private int freq, upTo, pos; |
| |
| public PostingsAndPosition(PostingsEnum postings, int offset) { |
| this.postings = postings; |
| this.offset = offset; |
| } |
| } |
| |
| private final PostingsAndPosition[] postings; |
| private final DocIdSetIterator approximation; |
| private final ImpactsDISI impactsApproximation; |
| |
| ExactPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, ScoreMode scoreMode, SimScorer scorer, float matchCost) { |
| super(matchCost); |
| |
| final DocIdSetIterator approximation = ConjunctionDISI.intersectIterators(Arrays.stream(postings).map(p -> p.postings).collect(Collectors.toList())); |
| final ImpactsSource impactsSource = mergeImpacts(Arrays.stream(postings).map(p -> p.impacts).toArray(ImpactsEnum[]::new)); |
| |
| if (scoreMode == ScoreMode.TOP_SCORES) { |
| this.approximation = this.impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer); |
| } else { |
| this.approximation = approximation; |
| this.impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer); |
| } |
| |
| List<PostingsAndPosition> postingsAndPositions = new ArrayList<>(); |
| for(PhraseQuery.PostingsAndFreq posting : postings) { |
| postingsAndPositions.add(new PostingsAndPosition(posting.postings, posting.position)); |
| } |
| this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]); |
| } |
| |
| @Override |
| DocIdSetIterator approximation() { |
| return approximation; |
| } |
| |
| @Override |
| ImpactsDISI impactsApproximation() { |
| return impactsApproximation; |
| } |
| |
| @Override |
| float maxFreq() { |
| int minFreq = postings[0].freq; |
| for (int i = 1; i < postings.length; i++) { |
| minFreq = Math.min(minFreq, postings[i].freq); |
| } |
| return minFreq; |
| } |
| |
| /** Advance the given pos enum to the first doc on or after {@code target}. |
| * Return {@code false} if the enum was exhausted before reaching |
| * {@code target} and {@code true} otherwise. */ |
| private static boolean advancePosition(PostingsAndPosition posting, int target) throws IOException { |
| while (posting.pos < target) { |
| if (posting.upTo == posting.freq) { |
| return false; |
| } else { |
| posting.pos = posting.postings.nextPosition(); |
| posting.upTo += 1; |
| } |
| } |
| return true; |
| } |
| |
| @Override |
| public void reset() throws IOException { |
| for (PostingsAndPosition posting : postings) { |
| posting.freq = posting.postings.freq(); |
| posting.pos = -1; |
| posting.upTo = 0; |
| } |
| } |
| |
| @Override |
| public boolean nextMatch() throws IOException { |
| final PostingsAndPosition lead = postings[0]; |
| if (lead.upTo < lead.freq) { |
| lead.pos = lead.postings.nextPosition(); |
| lead.upTo += 1; |
| } |
| else { |
| return false; |
| } |
| advanceHead: |
| while (true) { |
| final int phrasePos = lead.pos - lead.offset; |
| for (int j = 1; j < postings.length; ++j) { |
| final PostingsAndPosition posting = postings[j]; |
| final int expectedPos = phrasePos + posting.offset; |
| |
| // advance up to the same position as the lead |
| if (advancePosition(posting, expectedPos) == false) { |
| break advanceHead; |
| } |
| |
| if (posting.pos != expectedPos) { // we advanced too far |
| if (advancePosition(lead, posting.pos - posting.offset + lead.offset)) { |
| continue advanceHead; |
| } else { |
| break advanceHead; |
| } |
| } |
| } |
| return true; |
| } |
| return false; |
| } |
| |
| @Override |
| float sloppyWeight() { |
| return 1; |
| } |
| |
| @Override |
| public int startPosition() { |
| return postings[0].pos; |
| } |
| |
| @Override |
| public int endPosition() { |
| return postings[postings.length - 1].pos; |
| } |
| |
| @Override |
| public int startOffset() throws IOException { |
| return postings[0].postings.startOffset(); |
| } |
| |
| @Override |
| public int endOffset() throws IOException { |
| return postings[postings.length - 1].postings.endOffset(); |
| } |
| |
| /** |
| * Merge impacts for multiple terms of an exact phrase. |
| */ |
| static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums) { |
| // Iteration of block boundaries uses the impacts enum with the lower cost. |
| // This is consistent with BlockMaxConjunctionScorer. |
| int tmpLeadIndex = -1; |
| for (int i = 0; i < impactsEnums.length; ++i) { |
| if (tmpLeadIndex == -1 || impactsEnums[i].cost() < impactsEnums[tmpLeadIndex].cost()) { |
| tmpLeadIndex = i; |
| } |
| } |
| final int leadIndex = tmpLeadIndex; |
| |
| return new ImpactsSource() { |
| |
| class SubIterator { |
| final Iterator<Impact> iterator; |
| Impact current; |
| |
| SubIterator(List<Impact> impacts) { |
| this.iterator = impacts.iterator(); |
| this.current = iterator.next(); |
| } |
| |
| boolean next() { |
| if (iterator.hasNext() == false) { |
| current = null; |
| return false; |
| } else { |
| current = iterator.next(); |
| return true; |
| } |
| } |
| } |
| |
| @Override |
| public Impacts getImpacts() throws IOException { |
| final Impacts[] impacts = new Impacts[impactsEnums.length]; |
| for (int i = 0; i < impactsEnums.length; ++i) { |
| impacts[i] = impactsEnums[i].getImpacts(); |
| } |
| final Impacts lead = impacts[leadIndex]; |
| return new Impacts() { |
| |
| @Override |
| public int numLevels() { |
| // Delegate to the lead |
| return lead.numLevels(); |
| } |
| |
| @Override |
| public int getDocIdUpTo(int level) { |
| // Delegate to the lead |
| return lead.getDocIdUpTo(level); |
| } |
| |
| /** |
| * Return the minimum level whose impacts are valid up to {@code docIdUpTo}, |
| * or {@code -1} if there is no such level. |
| */ |
| private int getLevel(Impacts impacts, int docIdUpTo) { |
| for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { |
| if (impacts.getDocIdUpTo(level) >= docIdUpTo) { |
| return level; |
| } |
| } |
| return -1; |
| } |
| |
| @Override |
| public List<Impact> getImpacts(int level) { |
| final int docIdUpTo = getDocIdUpTo(level); |
| |
| PriorityQueue<SubIterator> pq = new PriorityQueue<SubIterator>(impacts.length) { |
| @Override |
| protected boolean lessThan(SubIterator a, SubIterator b) { |
| return a.current.freq < b.current.freq; |
| } |
| }; |
| |
| boolean hasImpacts = false; |
| List<Impact> onlyImpactList = null; |
| for (int i = 0; i < impacts.length; ++i) { |
| int impactsLevel = getLevel(impacts[i], docIdUpTo); |
| if (impactsLevel == -1) { |
| // This instance doesn't have useful impacts, ignore it: this is safe. |
| continue; |
| } |
| |
| List<Impact> impactList = impacts[i].getImpacts(impactsLevel); |
| Impact firstImpact = impactList.get(0); |
| if (firstImpact.freq == Integer.MAX_VALUE && firstImpact.norm == 1L) { |
| // Dummy impacts, ignore it too. |
| continue; |
| } |
| |
| SubIterator subIterator = new SubIterator(impactList); |
| pq.add(subIterator); |
| if (hasImpacts == false) { |
| hasImpacts = true; |
| onlyImpactList = impactList; |
| } else { |
| onlyImpactList = null; // there are multiple impacts |
| } |
| } |
| |
| if (hasImpacts == false) { |
| return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); |
| } else if (onlyImpactList != null) { |
| return onlyImpactList; |
| } |
| |
| // Idea: merge impacts by freq. The tricky thing is that we need to |
| // consider freq values that are not in the impacts too. For |
| // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}], |
| // there might well be a document that has a freq of 2 and a length of 11, |
| // which was just not added to the list of impacts because {freq=2,norm=10} |
| // is more competitive. |
| // We walk impacts in parallel through a PQ ordered by freq. At any time, |
| // the competitive impact consists of the lowest freq among all entries of |
| // the PQ (the top) and the highest norm (tracked separately). |
| List<Impact> mergedImpacts = new ArrayList<>(); |
| SubIterator top = pq.top(); |
| int currentFreq = top.current.freq; |
| long currentNorm = 0; |
| for (SubIterator it : pq) { |
| if (Long.compareUnsigned(it.current.norm, currentNorm) > 0) { |
| currentNorm = it.current.norm; |
| } |
| } |
| |
| outer: while (true) { |
| if (mergedImpacts.size() > 0 && mergedImpacts.get(mergedImpacts.size() - 1).norm == currentNorm) { |
| mergedImpacts.get(mergedImpacts.size() - 1).freq = currentFreq; |
| } else { |
| mergedImpacts.add(new Impact(currentFreq, currentNorm)); |
| } |
| |
| do { |
| if (top.next() == false) { |
| // At least one clause doesn't have any more documents below the current norm, |
| // so we can safely ignore further clauses. The only reason why they have more |
| // impacts is because they cover more documents that we are not interested in. |
| break outer; |
| } |
| if (Long.compareUnsigned(top.current.norm, currentNorm) > 0) { |
| currentNorm = top.current.norm; |
| } |
| top = pq.updateTop(); |
| } while (top.current.freq == currentFreq); |
| |
| currentFreq = top.current.freq; |
| } |
| |
| return mergedImpacts; |
| } |
| }; |
| } |
| |
| @Override |
| public void advanceShallow(int target) throws IOException { |
| for (ImpactsEnum impactsEnum : impactsEnums) { |
| impactsEnum.advanceShallow(target); |
| } |
| } |
| }; |
| } |
| |
| } |