| package org.apache.lucene.search; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.util.*; |
| |
| import org.apache.lucene.index.AtomicReaderContext; |
| import org.apache.lucene.index.DocsAndPositionsEnum; |
| import org.apache.lucene.index.AtomicReader; |
| import org.apache.lucene.index.DocsEnum; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexReaderContext; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.index.TermContext; |
| import org.apache.lucene.index.TermState; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.search.similarities.Similarity.SimScorer; |
| import org.apache.lucene.search.similarities.Similarity; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.PriorityQueue; |
| import org.apache.lucene.util.ToStringUtils; |
| |
| /** |
| * MultiPhraseQuery is a generalized version of PhraseQuery, with an added |
| * method {@link #add(Term[])}. |
| * To use this class, to search for the phrase "Microsoft app*" first use |
| * add(Term) on the term "Microsoft", then find all terms that have "app" as |
| * prefix using IndexReader.terms(Term), and use MultiPhraseQuery.add(Term[] |
| * terms) to add them to the query. |
| * |
| */ |
| public class MultiPhraseQuery extends Query { |
| private String field; |
| private ArrayList<Term[]> termArrays = new ArrayList<>(); |
| private ArrayList<Integer> positions = new ArrayList<>(); |
| |
| private int slop = 0; |
| |
| /** Sets the phrase slop for this query. |
| * @see PhraseQuery#setSlop(int) |
| */ |
| public void setSlop(int s) { |
| if (s < 0) { |
| throw new IllegalArgumentException("slop value cannot be negative"); |
| } |
| slop = s; |
| } |
| |
| /** Sets the phrase slop for this query. |
| * @see PhraseQuery#getSlop() |
| */ |
| public int getSlop() { return slop; } |
| |
| /** Add a single term at the next position in the phrase. |
| * @see PhraseQuery#add(Term) |
| */ |
| public void add(Term term) { add(new Term[]{term}); } |
| |
| /** Add multiple terms at the next position in the phrase. Any of the terms |
| * may match. |
| * |
| * @see PhraseQuery#add(Term) |
| */ |
| public void add(Term[] terms) { |
| int position = 0; |
| if (positions.size() > 0) |
| position = positions.get(positions.size()-1).intValue() + 1; |
| |
| add(terms, position); |
| } |
| |
| /** |
| * Allows to specify the relative position of terms within the phrase. |
| * |
| * @see PhraseQuery#add(Term, int) |
| */ |
| public void add(Term[] terms, int position) { |
| if (termArrays.size() == 0) |
| field = terms[0].field(); |
| |
| for (int i = 0; i < terms.length; i++) { |
| if (!terms[i].field().equals(field)) { |
| throw new IllegalArgumentException( |
| "All phrase terms must be in the same field (" + field + "): " |
| + terms[i]); |
| } |
| } |
| |
| termArrays.add(terms); |
| positions.add(Integer.valueOf(position)); |
| } |
| |
| /** |
| * Returns a List of the terms in the multiphrase. |
| * Do not modify the List or its contents. |
| */ |
| public List<Term[]> getTermArrays() { |
| return Collections.unmodifiableList(termArrays); |
| } |
| |
| /** |
| * Returns the relative positions of terms in this phrase. |
| */ |
| public int[] getPositions() { |
| int[] result = new int[positions.size()]; |
| for (int i = 0; i < positions.size(); i++) |
| result[i] = positions.get(i).intValue(); |
| return result; |
| } |
| |
| // inherit javadoc |
| @Override |
| public void extractTerms(Set<Term> terms) { |
| for (final Term[] arr : termArrays) { |
| for (final Term term: arr) { |
| terms.add(term); |
| } |
| } |
| } |
| |
| |
| private class MultiPhraseWeight extends Weight { |
| private final Similarity similarity; |
| private final Similarity.SimWeight stats; |
| private final Map<Term,TermContext> termContexts = new HashMap<>(); |
| |
| public MultiPhraseWeight(IndexSearcher searcher) |
| throws IOException { |
| this.similarity = searcher.getSimilarity(); |
| final IndexReaderContext context = searcher.getTopReaderContext(); |
| |
| // compute idf |
| ArrayList<TermStatistics> allTermStats = new ArrayList<>(); |
| for(final Term[] terms: termArrays) { |
| for (Term term: terms) { |
| TermContext termContext = termContexts.get(term); |
| if (termContext == null) { |
| termContext = TermContext.build(context, term); |
| termContexts.put(term, termContext); |
| } |
| allTermStats.add(searcher.termStatistics(term, termContext)); |
| } |
| } |
| stats = similarity.computeWeight(getBoost(), |
| searcher.collectionStatistics(field), |
| allTermStats.toArray(new TermStatistics[allTermStats.size()])); |
| } |
| |
| @Override |
| public Query getQuery() { return MultiPhraseQuery.this; } |
| |
| @Override |
| public float getValueForNormalization() { |
| return stats.getValueForNormalization(); |
| } |
| |
| @Override |
| public void normalize(float queryNorm, float topLevelBoost) { |
| stats.normalize(queryNorm, topLevelBoost); |
| } |
| |
| @Override |
| public Scorer scorer(AtomicReaderContext context, Bits acceptDocs) throws IOException { |
| assert !termArrays.isEmpty(); |
| final AtomicReader reader = context.reader(); |
| final Bits liveDocs = acceptDocs; |
| |
| PhraseQuery.PostingsAndFreq[] postingsFreqs = new PhraseQuery.PostingsAndFreq[termArrays.size()]; |
| |
| final Terms fieldTerms = reader.terms(field); |
| if (fieldTerms == null) { |
| return null; |
| } |
| |
| // Reuse single TermsEnum below: |
| final TermsEnum termsEnum = fieldTerms.iterator(null); |
| |
| for (int pos=0; pos<postingsFreqs.length; pos++) { |
| Term[] terms = termArrays.get(pos); |
| |
| final DocsAndPositionsEnum postingsEnum; |
| int docFreq; |
| |
| if (terms.length > 1) { |
| postingsEnum = new UnionDocsAndPositionsEnum(liveDocs, context, terms, termContexts, termsEnum); |
| |
| // coarse -- this overcounts since a given doc can |
| // have more than one term: |
| docFreq = 0; |
| for(int termIdx=0;termIdx<terms.length;termIdx++) { |
| final Term term = terms[termIdx]; |
| TermState termState = termContexts.get(term).get(context.ord); |
| if (termState == null) { |
| // Term not in reader |
| continue; |
| } |
| termsEnum.seekExact(term.bytes(), termState); |
| docFreq += termsEnum.docFreq(); |
| } |
| |
| if (docFreq == 0) { |
| // None of the terms are in this reader |
| return null; |
| } |
| } else { |
| final Term term = terms[0]; |
| TermState termState = termContexts.get(term).get(context.ord); |
| if (termState == null) { |
| // Term not in reader |
| return null; |
| } |
| termsEnum.seekExact(term.bytes(), termState); |
| postingsEnum = termsEnum.docsAndPositions(liveDocs, null, DocsEnum.FLAG_NONE); |
| |
| if (postingsEnum == null) { |
| // term does exist, but has no positions |
| assert termsEnum.docs(liveDocs, null, DocsEnum.FLAG_NONE) != null: "termstate found but no term exists in reader"; |
| throw new IllegalStateException("field \"" + term.field() + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.text() + ")"); |
| } |
| |
| docFreq = termsEnum.docFreq(); |
| } |
| |
| postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, docFreq, positions.get(pos).intValue(), terms); |
| } |
| |
| // sort by increasing docFreq order |
| if (slop == 0) { |
| ArrayUtil.timSort(postingsFreqs); |
| } |
| |
| if (slop == 0) { |
| ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context)); |
| if (s.noDocs) { |
| return null; |
| } else { |
| return s; |
| } |
| } else { |
| return new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.simScorer(stats, context)); |
| } |
| } |
| |
| @Override |
| public Explanation explain(AtomicReaderContext context, int doc) throws IOException { |
| Scorer scorer = scorer(context, context.reader().getLiveDocs()); |
| if (scorer != null) { |
| int newDoc = scorer.advance(doc); |
| if (newDoc == doc) { |
| float freq = slop == 0 ? scorer.freq() : ((SloppyPhraseScorer)scorer).sloppyFreq(); |
| SimScorer docScorer = similarity.simScorer(stats, context); |
| ComplexExplanation result = new ComplexExplanation(); |
| result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:"); |
| Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq)); |
| result.addDetail(scoreExplanation); |
| result.setValue(scoreExplanation.getValue()); |
| result.setMatch(true); |
| return result; |
| } |
| } |
| |
| return new ComplexExplanation(false, 0.0f, "no matching term"); |
| } |
| } |
| |
| @Override |
| public Query rewrite(IndexReader reader) { |
| if (termArrays.isEmpty()) { |
| BooleanQuery bq = new BooleanQuery(); |
| bq.setBoost(getBoost()); |
| return bq; |
| } else if (termArrays.size() == 1) { // optimize one-term case |
| Term[] terms = termArrays.get(0); |
| BooleanQuery boq = new BooleanQuery(true); |
| for (int i=0; i<terms.length; i++) { |
| boq.add(new TermQuery(terms[i]), BooleanClause.Occur.SHOULD); |
| } |
| boq.setBoost(getBoost()); |
| return boq; |
| } else { |
| return this; |
| } |
| } |
| |
| @Override |
| public Weight createWeight(IndexSearcher searcher) throws IOException { |
| return new MultiPhraseWeight(searcher); |
| } |
| |
| /** Prints a user-readable version of this query. */ |
| @Override |
| public final String toString(String f) { |
| StringBuilder buffer = new StringBuilder(); |
| if (field == null || !field.equals(f)) { |
| buffer.append(field); |
| buffer.append(":"); |
| } |
| |
| buffer.append("\""); |
| int k = 0; |
| Iterator<Term[]> i = termArrays.iterator(); |
| int lastPos = -1; |
| boolean first = true; |
| while (i.hasNext()) { |
| Term[] terms = i.next(); |
| int position = positions.get(k); |
| if (first) { |
| first = false; |
| } else { |
| buffer.append(" "); |
| for (int j=1; j<(position-lastPos); j++) { |
| buffer.append("? "); |
| } |
| } |
| if (terms.length > 1) { |
| buffer.append("("); |
| for (int j = 0; j < terms.length; j++) { |
| buffer.append(terms[j].text()); |
| if (j < terms.length-1) |
| buffer.append(" "); |
| } |
| buffer.append(")"); |
| } else { |
| buffer.append(terms[0].text()); |
| } |
| lastPos = position; |
| ++k; |
| } |
| buffer.append("\""); |
| |
| if (slop != 0) { |
| buffer.append("~"); |
| buffer.append(slop); |
| } |
| |
| buffer.append(ToStringUtils.boost(getBoost())); |
| |
| return buffer.toString(); |
| } |
| |
| |
| /** Returns true if <code>o</code> is equal to this. */ |
| @Override |
| public boolean equals(Object o) { |
| if (!(o instanceof MultiPhraseQuery)) return false; |
| MultiPhraseQuery other = (MultiPhraseQuery)o; |
| return this.getBoost() == other.getBoost() |
| && this.slop == other.slop |
| && termArraysEquals(this.termArrays, other.termArrays) |
| && this.positions.equals(other.positions); |
| } |
| |
| /** Returns a hash code value for this object.*/ |
| @Override |
| public int hashCode() { |
| return Float.floatToIntBits(getBoost()) |
| ^ slop |
| ^ termArraysHashCode() |
| ^ positions.hashCode() |
| ^ 0x4AC65113; |
| } |
| |
| // Breakout calculation of the termArrays hashcode |
| private int termArraysHashCode() { |
| int hashCode = 1; |
| for (final Term[] termArray: termArrays) { |
| hashCode = 31 * hashCode |
| + (termArray == null ? 0 : Arrays.hashCode(termArray)); |
| } |
| return hashCode; |
| } |
| |
| // Breakout calculation of the termArrays equals |
| private boolean termArraysEquals(List<Term[]> termArrays1, List<Term[]> termArrays2) { |
| if (termArrays1.size() != termArrays2.size()) { |
| return false; |
| } |
| ListIterator<Term[]> iterator1 = termArrays1.listIterator(); |
| ListIterator<Term[]> iterator2 = termArrays2.listIterator(); |
| while (iterator1.hasNext()) { |
| Term[] termArray1 = iterator1.next(); |
| Term[] termArray2 = iterator2.next(); |
| if (!(termArray1 == null ? termArray2 == null : Arrays.equals(termArray1, |
| termArray2))) { |
| return false; |
| } |
| } |
| return true; |
| } |
| } |
| |
| /** |
| * Takes the logical union of multiple DocsEnum iterators. |
| */ |
| |
| // TODO: if ever we allow subclassing of the *PhraseScorer |
| class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum { |
| |
| private static final class DocsQueue extends PriorityQueue<DocsAndPositionsEnum> { |
| DocsQueue(List<DocsAndPositionsEnum> docsEnums) throws IOException { |
| super(docsEnums.size()); |
| |
| Iterator<DocsAndPositionsEnum> i = docsEnums.iterator(); |
| while (i.hasNext()) { |
| DocsAndPositionsEnum postings = i.next(); |
| if (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { |
| add(postings); |
| } |
| } |
| } |
| |
| @Override |
| public final boolean lessThan(DocsAndPositionsEnum a, DocsAndPositionsEnum b) { |
| return a.docID() < b.docID(); |
| } |
| } |
| |
| private static final class IntQueue { |
| private int _arraySize = 16; |
| private int _index = 0; |
| private int _lastIndex = 0; |
| private int[] _array = new int[_arraySize]; |
| |
| final void add(int i) { |
| if (_lastIndex == _arraySize) |
| growArray(); |
| |
| _array[_lastIndex++] = i; |
| } |
| |
| final int next() { |
| return _array[_index++]; |
| } |
| |
| final void sort() { |
| Arrays.sort(_array, _index, _lastIndex); |
| } |
| |
| final void clear() { |
| _index = 0; |
| _lastIndex = 0; |
| } |
| |
| final int size() { |
| return (_lastIndex - _index); |
| } |
| |
| private void growArray() { |
| int[] newArray = new int[_arraySize * 2]; |
| System.arraycopy(_array, 0, newArray, 0, _arraySize); |
| _array = newArray; |
| _arraySize *= 2; |
| } |
| } |
| |
| private int _doc; |
| private int _freq; |
| private DocsQueue _queue; |
| private IntQueue _posList; |
| private long cost; |
| |
| public UnionDocsAndPositionsEnum(Bits liveDocs, AtomicReaderContext context, Term[] terms, Map<Term,TermContext> termContexts, TermsEnum termsEnum) throws IOException { |
| List<DocsAndPositionsEnum> docsEnums = new LinkedList<>(); |
| for (int i = 0; i < terms.length; i++) { |
| final Term term = terms[i]; |
| TermState termState = termContexts.get(term).get(context.ord); |
| if (termState == null) { |
| // Term doesn't exist in reader |
| continue; |
| } |
| termsEnum.seekExact(term.bytes(), termState); |
| DocsAndPositionsEnum postings = termsEnum.docsAndPositions(liveDocs, null, DocsEnum.FLAG_NONE); |
| if (postings == null) { |
| // term does exist, but has no positions |
| throw new IllegalStateException("field \"" + term.field() + "\" was indexed without position data; cannot run PhraseQuery (term=" + term.text() + ")"); |
| } |
| cost += postings.cost(); |
| docsEnums.add(postings); |
| } |
| |
| _queue = new DocsQueue(docsEnums); |
| _posList = new IntQueue(); |
| } |
| |
| @Override |
| public final int nextDoc() throws IOException { |
| if (_queue.size() == 0) { |
| return NO_MORE_DOCS; |
| } |
| |
| // TODO: move this init into positions(): if the search |
| // doesn't need the positions for this doc then don't |
| // waste CPU merging them: |
| _posList.clear(); |
| _doc = _queue.top().docID(); |
| |
| // merge sort all positions together |
| DocsAndPositionsEnum postings; |
| do { |
| postings = _queue.top(); |
| |
| final int freq = postings.freq(); |
| for (int i = 0; i < freq; i++) { |
| _posList.add(postings.nextPosition()); |
| } |
| |
| if (postings.nextDoc() != NO_MORE_DOCS) { |
| _queue.updateTop(); |
| } else { |
| _queue.pop(); |
| } |
| } while (_queue.size() > 0 && _queue.top().docID() == _doc); |
| |
| _posList.sort(); |
| _freq = _posList.size(); |
| |
| return _doc; |
| } |
| |
| @Override |
| public int nextPosition() { |
| return _posList.next(); |
| } |
| |
| @Override |
| public int startOffset() { |
| return -1; |
| } |
| |
| @Override |
| public int endOffset() { |
| return -1; |
| } |
| |
| @Override |
| public BytesRef getPayload() { |
| return null; |
| } |
| |
| @Override |
| public final int advance(int target) throws IOException { |
| while (_queue.top() != null && target > _queue.top().docID()) { |
| DocsAndPositionsEnum postings = _queue.pop(); |
| if (postings.advance(target) != NO_MORE_DOCS) { |
| _queue.add(postings); |
| } |
| } |
| return nextDoc(); |
| } |
| |
| @Override |
| public final int freq() { |
| return _freq; |
| } |
| |
| @Override |
| public final int docID() { |
| return _doc; |
| } |
| |
| @Override |
| public long cost() { |
| return cost; |
| } |
| } |