blob: b5da7b39195056e73e21b13f5995d14835a5a1db [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
/**
* A generalized version of {@link PhraseQuery}, with the possibility of
* adding more than one term at the same position that are treated as a disjunction (OR).
* To use this class to search for the phrase "Microsoft app*" first create a Builder and use
* {@link Builder#add(Term)} on the term "microsoft" (assuming lowercase analysis), then
* find all terms that have "app" as prefix using {@link LeafReader#terms(String)},
* seeking to "app" then iterating and collecting terms until there is no longer
* that prefix, and finally use {@link Builder#add(Term[])} to add them.
* {@link Builder#build()} returns the fully constructed (and immutable) MultiPhraseQuery.
*/
public class MultiPhraseQuery extends Query {
/** A builder for multi-phrase queries */
public static class Builder {
private String field; // becomes non-null on first add() then is unmodified
private final ArrayList<Term[]> termArrays;
private final ArrayList<Integer> positions;
private int slop;
/** Default constructor. */
public Builder() {
this.field = null;
this.termArrays = new ArrayList<>();
this.positions = new ArrayList<>();
this.slop = 0;
}
/** Copy constructor: this will create a builder that has the same
* configuration as the provided builder. */
public Builder(MultiPhraseQuery multiPhraseQuery) {
this.field = multiPhraseQuery.field;
int length = multiPhraseQuery.termArrays.length;
this.termArrays = new ArrayList<>(length);
this.positions = new ArrayList<>(length);
for (int i = 0 ; i < length ; ++i) {
this.termArrays.add(multiPhraseQuery.termArrays[i]);
this.positions.add(multiPhraseQuery.positions[i]);
}
this.slop = multiPhraseQuery.slop;
}
/** Sets the phrase slop for this query.
* @see PhraseQuery#getSlop()
*/
public Builder setSlop(int s) {
if (s < 0) {
throw new IllegalArgumentException("slop value cannot be negative");
}
slop = s;
return this;
}
/** Add a single term at the next position in the phrase.
*/
public Builder add(Term term) { return add(new Term[]{term}); }
/** Add multiple terms at the next position in the phrase. Any of the terms
* may match (a disjunction).
* The array is not copied or mutated, the caller should consider it
* immutable subsequent to calling this method.
*/
public Builder add(Term[] terms) {
int position = 0;
if (positions.size() > 0)
position = positions.get(positions.size() - 1) + 1;
return add(terms, position);
}
/**
* Allows to specify the relative position of terms within the phrase.
* The array is not copied or mutated, the caller should consider it
* immutable subsequent to calling this method.
*/
public Builder add(Term[] terms, int position) {
Objects.requireNonNull(terms, "Term array must not be null");
if (termArrays.size() == 0)
field = terms[0].field();
for (Term term : terms) {
if (!term.field().equals(field)) {
throw new IllegalArgumentException(
"All phrase terms must be in the same field (" + field + "): " + term);
}
}
termArrays.add(terms);
positions.add(position);
return this;
}
/** Builds a {@link MultiPhraseQuery}. */
public MultiPhraseQuery build() {
int[] positionsArray = new int[this.positions.size()];
for (int i = 0; i < this.positions.size(); ++i) {
positionsArray[i] = this.positions.get(i);
}
Term[][] termArraysArray = termArrays.toArray(new Term[termArrays.size()][]);
return new MultiPhraseQuery(field, termArraysArray, positionsArray, slop);
}
}
private final String field;
private final Term[][] termArrays;
private final int[] positions;
private final int slop;
private MultiPhraseQuery(String field, Term[][] termArrays, int[] positions, int slop) {
// No argument checks here since they are provided by the MultiPhraseQuery.Builder
this.field = field;
this.termArrays = termArrays;
this.positions = positions;
this.slop = slop;
}
/** Sets the phrase slop for this query.
* @see PhraseQuery#getSlop()
*/
public int getSlop() { return slop; }
/**
* Returns the arrays of arrays of terms in the multi-phrase.
* Do not modify!
*/
public Term[][] getTermArrays() {
return termArrays;
}
/**
* Returns the relative positions of terms in this phrase.
* Do not modify!
*/
public int[] getPositions() {
return positions;
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
if (termArrays.length == 0) {
return new MatchNoDocsQuery("empty MultiPhraseQuery");
} else if (termArrays.length == 1) { // optimize one-term case
Term[] terms = termArrays[0];
BooleanQuery.Builder builder = new BooleanQuery.Builder();
for (Term term : terms) {
builder.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
}
return builder.build();
} else {
return super.rewrite(reader);
}
}
@Override
public void visit(QueryVisitor visitor) {
if (visitor.acceptField(field) == false) {
return;
}
QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.MUST, this);
for (Term[] terms : termArrays) {
QueryVisitor sv = v.getSubVisitor(BooleanClause.Occur.SHOULD, this);
sv.consumeTerms(this, terms);
}
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
final Map<Term,TermStates> termStates = new HashMap<>();
return new PhraseWeight(this, field, searcher, scoreMode) {
@Override
protected Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException {
final IndexReaderContext context = searcher.getTopReaderContext();
// compute idf
ArrayList<TermStatistics> allTermStats = new ArrayList<>();
for(final Term[] terms: termArrays) {
for (Term term: terms) {
TermStates ts = termStates.get(term);
if (ts == null) {
ts = TermStates.build(context, term, scoreMode.needsScores());
termStates.put(term, ts);
}
if (scoreMode.needsScores() && ts.docFreq() > 0) {
allTermStats.add(searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq()));
}
}
}
if (allTermStats.isEmpty()) {
return null; // none of the terms were found, we won't use sim at all
} else {
return similarity.scorer(
boost,
searcher.collectionStatistics(field),
allTermStats.toArray(new TermStatistics[allTermStats.size()]));
}
}
@Override
protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException {
assert termArrays.length != 0;
final LeafReader reader = context.reader();
PhraseQuery.PostingsAndFreq[] postingsFreqs = new PhraseQuery.PostingsAndFreq[termArrays.length];
final Terms fieldTerms = reader.terms(field);
if (fieldTerms == null) {
return null;
}
// TODO: move this check to createWeight to happen earlier to the user?
if (fieldTerms.hasPositions() == false) {
throw new IllegalStateException("field \"" + field + "\" was indexed without position data;" +
" cannot run MultiPhraseQuery (phrase=" + getQuery() + ")");
}
// Reuse single TermsEnum below:
final TermsEnum termsEnum = fieldTerms.iterator();
float totalMatchCost = 0;
for (int pos=0; pos<postingsFreqs.length; pos++) {
Term[] terms = termArrays[pos];
List<PostingsEnum> postings = new ArrayList<>();
for (Term term : terms) {
TermState termState = termStates.get(term).get(context);
if (termState != null) {
termsEnum.seekExact(term.bytes(), termState);
postings.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS));
totalMatchCost += PhraseQuery.termPositionsCost(termsEnum);
}
}
if (postings.isEmpty()) {
return null;
}
final PostingsEnum postingsEnum;
if (postings.size() == 1) {
postingsEnum = postings.get(0);
} else {
postingsEnum = exposeOffsets ? new UnionFullPostingsEnum(postings) : new UnionPostingsEnum(postings);
}
postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, new SlowImpactsEnum(postingsEnum), positions[pos], terms);
}
// sort by increasing docFreq order
if (slop == 0) {
ArrayUtil.timSort(postingsFreqs);
return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost);
}
else {
return new SloppyPhraseMatcher(postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets);
}
}
@Override
public void extractTerms(Set<Term> terms) {
for (final Term[] arr : termArrays) {
Collections.addAll(terms, arr);
}
}
};
}
/** Prints a user-readable version of this query. */
@Override
public final String toString(String f) {
StringBuilder buffer = new StringBuilder();
if (field == null || !field.equals(f)) {
buffer.append(field);
buffer.append(":");
}
buffer.append("\"");
int lastPos = -1;
for (int i = 0 ; i < termArrays.length ; ++i) {
Term[] terms = termArrays[i];
int position = positions[i];
if (i != 0) {
buffer.append(" ");
for (int j=1; j<(position-lastPos); j++) {
buffer.append("? ");
}
}
if (terms.length > 1) {
buffer.append("(");
for (int j = 0; j < terms.length; j++) {
buffer.append(terms[j].text());
if (j < terms.length-1)
buffer.append(" ");
}
buffer.append(")");
} else {
buffer.append(terms[0].text());
}
lastPos = position;
}
buffer.append("\"");
if (slop != 0) {
buffer.append("~");
buffer.append(slop);
}
return buffer.toString();
}
/** Returns true if <code>o</code> is equal to this. */
@Override
public boolean equals(Object other) {
return sameClassAs(other) &&
equalsTo(getClass().cast(other));
}
private boolean equalsTo(MultiPhraseQuery other) {
return this.slop == other.slop &&
termArraysEquals(this.termArrays, other.termArrays) && /* terms equal implies field equal */
Arrays.equals(this.positions, other.positions);
}
/** Returns a hash code value for this object.*/
@Override
public int hashCode() {
return classHash()
^ slop
^ termArraysHashCode() // terms equal implies field equal
^ Arrays.hashCode(positions);
}
// Breakout calculation of the termArrays hashcode
private int termArraysHashCode() {
int hashCode = 1;
for (final Term[] termArray: termArrays) {
hashCode = 31 * hashCode
+ (termArray == null ? 0 : Arrays.hashCode(termArray));
}
return hashCode;
}
// Breakout calculation of the termArrays equals
private boolean termArraysEquals(Term[][] termArrays1, Term[][] termArrays2) {
if (termArrays1.length != termArrays2.length) {
return false;
}
for (int i = 0 ; i < termArrays1.length ; ++i) {
Term[] termArray1 = termArrays1[i];
Term[] termArray2 = termArrays2[i];
if (!(termArray1 == null ? termArray2 == null : Arrays.equals(termArray1,
termArray2))) {
return false;
}
}
return true;
}
/**
* Takes the logical union of multiple PostingsEnum iterators.
* <p>
* Note: positions are merged during freq()
*/
static class UnionPostingsEnum extends PostingsEnum {
/** queue ordered by docid */
final DocsQueue docsQueue;
/** cost of this enum: sum of its subs */
final long cost;
/** queue ordered by position for current doc */
final PositionsQueue posQueue = new PositionsQueue();
/** current doc posQueue is working */
int posQueueDoc = -2;
/** list of subs (unordered) */
final PostingsEnum[] subs;
UnionPostingsEnum(Collection<PostingsEnum> subs) {
docsQueue = new DocsQueue(subs.size());
long cost = 0;
for (PostingsEnum sub : subs) {
docsQueue.add(sub);
cost += sub.cost();
}
this.cost = cost;
this.subs = subs.toArray(new PostingsEnum[subs.size()]);
}
@Override
public int freq() throws IOException {
int doc = docID();
if (doc != posQueueDoc) {
posQueue.clear();
for (PostingsEnum sub : subs) {
if (sub.docID() == doc) {
int freq = sub.freq();
for (int i = 0; i < freq; i++) {
posQueue.add(sub.nextPosition());
}
}
}
posQueue.sort();
posQueueDoc = doc;
}
return posQueue.size();
}
@Override
public int nextPosition() throws IOException {
return posQueue.next();
}
@Override
public int docID() {
return docsQueue.top().docID();
}
@Override
public int nextDoc() throws IOException {
PostingsEnum top = docsQueue.top();
int doc = top.docID();
do {
top.nextDoc();
top = docsQueue.updateTop();
} while (top.docID() == doc);
return top.docID();
}
@Override
public int advance(int target) throws IOException {
PostingsEnum top = docsQueue.top();
do {
top.advance(target);
top = docsQueue.updateTop();
} while (top.docID() < target);
return top.docID();
}
@Override
public long cost() {
return cost;
}
@Override
public int startOffset() throws IOException {
return -1; // offsets are unsupported
}
@Override
public int endOffset() throws IOException {
return -1; // offsets are unsupported
}
@Override
public BytesRef getPayload() throws IOException {
return null; // payloads are unsupported
}
/**
* disjunction of postings ordered by docid.
*/
static class DocsQueue extends PriorityQueue<PostingsEnum> {
DocsQueue(int size) {
super(size);
}
@Override
public final boolean lessThan(PostingsEnum a, PostingsEnum b) {
return a.docID() < b.docID();
}
}
/**
* queue of terms for a single document. its a sorted array of
* all the positions from all the postings
*/
static class PositionsQueue {
private int arraySize = 16;
private int index = 0;
private int size = 0;
private int[] array = new int[arraySize];
void add(int i) {
if (size == arraySize)
growArray();
array[size++] = i;
}
int next() {
return array[index++];
}
void sort() {
Arrays.sort(array, index, size);
}
void clear() {
index = 0;
size = 0;
}
int size() {
return size;
}
private void growArray() {
int[] newArray = new int[arraySize * 2];
System.arraycopy(array, 0, newArray, 0, arraySize);
array = newArray;
arraySize *= 2;
}
}
}
static class PostingsAndPosition {
final PostingsEnum pe;
int pos;
int upto;
PostingsAndPosition(PostingsEnum pe) {
this.pe = pe;
}
}
// Slower version of UnionPostingsEnum that delegates offsets and positions, for
// use by MatchesIterator
static class UnionFullPostingsEnum extends UnionPostingsEnum {
int freq = -1;
boolean started = false;
final PriorityQueue<PostingsAndPosition> posQueue;
final Collection<PostingsAndPosition> subs;
UnionFullPostingsEnum(List<PostingsEnum> subs) {
super(subs);
this.posQueue = new PriorityQueue<PostingsAndPosition>(subs.size()) {
@Override
protected boolean lessThan(PostingsAndPosition a, PostingsAndPosition b) {
return a.pos < b.pos;
}
};
this.subs = new ArrayList<>();
for (PostingsEnum pe : subs) {
this.subs.add(new PostingsAndPosition(pe));
}
}
@Override
public int freq() throws IOException {
int doc = docID();
if (doc == posQueueDoc) {
return freq;
}
freq = 0;
started = false;
posQueue.clear();
for (PostingsAndPosition pp : subs) {
if (pp.pe.docID() == doc) {
pp.pos = pp.pe.nextPosition();
pp.upto = pp.pe.freq();
posQueue.add(pp);
freq += pp.upto;
}
}
return freq;
}
@Override
public int nextPosition() throws IOException {
if (started == false) {
started = true;
return posQueue.top().pos;
}
if (posQueue.top().upto == 1) {
posQueue.pop();
return posQueue.top().pos;
}
posQueue.top().pos = posQueue.top().pe.nextPosition();
posQueue.top().upto--;
posQueue.updateTop();
return posQueue.top().pos;
}
@Override
public int startOffset() throws IOException {
return posQueue.top().pe.startOffset();
}
@Override
public int endOffset() throws IOException {
return posQueue.top().pe.endOffset();
}
@Override
public BytesRef getPayload() throws IOException {
return posQueue.top().pe.getPayload();
}
}
}