blob: aedf14aa7c2188436fb3ea1cd7cf56c9c3d106b7 [file] [log] [blame]
package org.apache.lucene.util;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
/**
* Creates queries from the {@link Analyzer} chain.
* <p>
* Example usage:
* <pre class="prettyprint">
* QueryBuilder builder = new QueryBuilder(analyzer);
* Query a = builder.createBooleanQuery("body", "just a test");
* Query b = builder.createPhraseQuery("body", "another test");
* Query c = builder.createMinShouldMatchQuery("body", "another test", 0.5f);
* </pre>
* <p>
* This can also be used as a subclass for query parsers to make it easier
* to interact with the analysis chain. Factory methods such as {@code newTermQuery}
* are provided so that the generated queries can be customized.
*/
public class QueryBuilder {
  // Not final: the analyzer can be replaced through setAnalyzer(Analyzer).
  private Analyzer analyzer;
  private boolean enablePositionIncrements = true;

  /** Creates a new QueryBuilder using the given analyzer. */
  public QueryBuilder(Analyzer analyzer) {
    this.analyzer = analyzer;
  }

  /**
   * Creates a boolean query from the query text.
   * <p>
   * This is equivalent to {@code createBooleanQuery(field, queryText, Occur.SHOULD)}
   * @param field field name
   * @param queryText text to be passed to the analyzer
   * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis
   *         of {@code queryText}, or {@code null} if the analysis produced no tokens
   */
  public Query createBooleanQuery(String field, String queryText) {
    return createBooleanQuery(field, queryText, BooleanClause.Occur.SHOULD);
  }

  /**
   * Creates a boolean query from the query text.
   * @param field field name
   * @param queryText text to be passed to the analyzer
   * @param operator operator used for clauses between analyzer tokens.
   * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis
   *         of {@code queryText}, or {@code null} if the analysis produced no tokens
   * @throws IllegalArgumentException if {@code operator} is neither
   *         {@code SHOULD} nor {@code MUST}
   */
  public Query createBooleanQuery(String field, String queryText, BooleanClause.Occur operator) {
    if (operator != BooleanClause.Occur.SHOULD && operator != BooleanClause.Occur.MUST) {
      throw new IllegalArgumentException("invalid operator: only SHOULD or MUST are allowed");
    }
    return createFieldQuery(analyzer, operator, field, queryText, false, 0);
  }

  /**
   * Creates a phrase query from the query text.
   * <p>
   * This is equivalent to {@code createPhraseQuery(field, queryText, 0)}
   * @param field field name
   * @param queryText text to be passed to the analyzer
   * @return {@code TermQuery}, {@code BooleanQuery}, {@code PhraseQuery}, or
   *         {@code MultiPhraseQuery}, based on the analysis of {@code queryText},
   *         or {@code null} if the analysis produced no tokens
   */
  public Query createPhraseQuery(String field, String queryText) {
    return createPhraseQuery(field, queryText, 0);
  }

  /**
   * Creates a phrase query from the query text.
   * @param field field name
   * @param queryText text to be passed to the analyzer
   * @param phraseSlop number of other words permitted between words in query phrase
   * @return {@code TermQuery}, {@code BooleanQuery}, {@code PhraseQuery}, or
   *         {@code MultiPhraseQuery}, based on the analysis of {@code queryText},
   *         or {@code null} if the analysis produced no tokens
   */
  public Query createPhraseQuery(String field, String queryText, int phraseSlop) {
    return createFieldQuery(analyzer, BooleanClause.Occur.MUST, field, queryText, true, phraseSlop);
  }

  /**
   * Creates a minimum-should-match query from the query text.
   * @param field field name
   * @param queryText text to be passed to the analyzer
   * @param fraction of query terms {@code [0..1]} that should match
   * @return {@code TermQuery} or {@code BooleanQuery}, based on the analysis
   *         of {@code queryText}, or {@code null} if the analysis produced no tokens
   * @throws IllegalArgumentException if {@code fraction} is NaN or outside {@code [0..1]}
   */
  public Query createMinShouldMatchQuery(String field, String queryText, float fraction) {
    if (Float.isNaN(fraction) || fraction < 0 || fraction > 1) {
      throw new IllegalArgumentException("fraction should be >= 0 and <= 1");
    }

    // TODO: weird that BQ equals/rewrite/scorer doesn't handle this?
    if (fraction == 1) {
      return createBooleanQuery(field, queryText, BooleanClause.Occur.MUST);
    }

    Query query = createFieldQuery(analyzer, BooleanClause.Occur.SHOULD, field, queryText, false, 0);
    if (query instanceof BooleanQuery) {
      BooleanQuery bq = (BooleanQuery) query;
      // Truncates towards zero: e.g. 0.5 of 3 clauses requires 1 match.
      bq.setMinimumNumberShouldMatch((int) (fraction * bq.clauses().size()));
    }
    return query;
  }

  /**
   * Returns the analyzer.
   * @see #setAnalyzer(Analyzer)
   */
  public Analyzer getAnalyzer() {
    return analyzer;
  }

  /**
   * Sets the analyzer used to tokenize text.
   */
  public void setAnalyzer(Analyzer analyzer) {
    this.analyzer = analyzer;
  }

  /**
   * Returns true if position increments are enabled.
   * @see #setEnablePositionIncrements(boolean)
   */
  public boolean getEnablePositionIncrements() {
    return enablePositionIncrements;
  }

  /**
   * Set to <code>true</code> to enable position increments in result query.
   * <p>
   * When set, result phrase and multi-phrase queries will
   * be aware of position increments.
   * Useful when e.g. a StopFilter increases the position increment of
   * the token that follows an omitted token.
   * <p>
   * Default: true.
   */
  public void setEnablePositionIncrements(boolean enable) {
    this.enablePositionIncrements = enable;
  }

  /**
   * Creates a query from the analysis chain.
   * <p>
   * Expert: this is more useful for subclasses such as queryparsers.
   * If using this class directly, just use {@link #createBooleanQuery(String, String)}
   * and {@link #createPhraseQuery(String, String)}
   * <p>
   * The text is analyzed once while counting tokens and positions; the buffered
   * tokens are then replayed to build the query whose shape fits those counts.
   * @param analyzer analyzer used for this query
   * @param operator default boolean operator used for this query
   * @param field field to create queries against
   * @param queryText text to be passed to the analysis chain
   * @param quoted true if phrases should be generated when terms occur at more than one position
   * @param phraseSlop slop factor for phrase/multiphrase queries
   * @return the generated query, or {@code null} if the analysis chain produced no
   *         tokens or exposes no {@code TermToBytesRefAttribute}
   * @throws RuntimeException if the analysis chain fails with an {@code IOException}
   */
  protected final Query createFieldQuery(Analyzer analyzer, BooleanClause.Occur operator, String field, String queryText, boolean quoted, int phraseSlop) {
    assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;

    CachingTokenFilter buffer = null;
    TermToBytesRefAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;
    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;

    // First pass: run the analyzer and count tokens and distinct positions.
    try (TokenStream source = analyzer.tokenStream(field, queryText)) {
      source.reset();
      buffer = new CachingTokenFilter(source);
      buffer.reset();

      if (buffer.hasAttribute(TermToBytesRefAttribute.class)) {
        termAtt = buffer.getAttribute(TermToBytesRefAttribute.class);
      }
      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
      }

      if (termAtt != null) {
        // An IOException here is deliberately NOT swallowed: ignoring it would
        // silently build a query from a truncated token list.
        while (buffer.incrementToken()) {
          numTokens++;
          int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;
          } else {
            severalTokensAtSamePosition = true;
          }
        }
      }
    } catch (IOException e) {
      throw new RuntimeException("Error analyzing query text", e);
    }

    // rewind the buffer stream
    buffer.reset();

    if (numTokens == 0) {
      return null;
    }

    // numTokens > 0 implies termAtt != null (tokens are only counted when it exists).
    final BytesRef bytes = termAtt.getBytesRef();

    // Second pass: replay the buffered tokens into the appropriate query type.
    try {
      if (numTokens == 1) {
        return newTermQuery(nextTerm(buffer, termAtt, bytes, field));
      }

      if (quoted && !severalTokensAtSamePosition) {
        // plain phrase: every token sits at its own position
        PhraseQuery pq = newPhraseQuery();
        pq.setSlop(phraseSlop);
        int position = -1;
        for (int i = 0; i < numTokens; i++) {
          Term term = nextTerm(buffer, termAtt, bytes, field);
          if (enablePositionIncrements) {
            position += (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
            pq.add(term, position);
          } else {
            pq.add(term);
          }
        }
        return pq;
      }

      if (positionCount == 1) {
        // simple case: only one position, with synonyms
        BooleanQuery q = newBooleanQuery(true);
        for (int i = 0; i < numTokens; i++) {
          q.add(newTermQuery(nextTerm(buffer, termAtt, bytes, field)), BooleanClause.Occur.SHOULD);
        }
        return q;
      }

      if (!quoted) {
        // multiple positions, no phrase: one clause per position, where tokens
        // sharing a position are grouped into a nested SHOULD query
        BooleanQuery q = newBooleanQuery(false);
        Query currentQuery = null;
        for (int i = 0; i < numTokens; i++) {
          Term term = nextTerm(buffer, termAtt, bytes, field);
          boolean samePosition = posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0;
          // currentQuery is null only for the first token; if that token reports
          // an increment of 0 it starts the first position rather than being
          // stacked onto a (non-existent) previous clause, which would add a
          // null clause to the query.
          if (samePosition && currentQuery != null) {
            if (!(currentQuery instanceof BooleanQuery)) {
              Query first = currentQuery;
              currentQuery = newBooleanQuery(true);
              ((BooleanQuery) currentQuery).add(first, BooleanClause.Occur.SHOULD);
            }
            ((BooleanQuery) currentQuery).add(newTermQuery(term), BooleanClause.Occur.SHOULD);
          } else {
            if (currentQuery != null) {
              q.add(currentQuery, operator);
            }
            currentQuery = newTermQuery(term);
          }
        }
        q.add(currentQuery, operator);
        return q;
      }

      // quoted, several tokens at the same position, more than one position:
      // multi-phrase query
      MultiPhraseQuery mpq = newMultiPhraseQuery();
      mpq.setSlop(phraseSlop);
      List<Term> multiTerms = new ArrayList<>();
      int position = -1;
      for (int i = 0; i < numTokens; i++) {
        Term term = nextTerm(buffer, termAtt, bytes, field);
        int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
        if (positionIncrement > 0 && !multiTerms.isEmpty()) {
          // the position advanced: flush the terms collected for the previous one
          addPhraseTerms(mpq, multiTerms, position);
          multiTerms.clear();
        }
        position += positionIncrement;
        multiTerms.add(term);
      }
      addPhraseTerms(mpq, multiTerms, position);
      return mpq;
    } catch (IOException e) {
      // Not expected when replaying already buffered tokens, but reported the
      // same way as a failure of the first pass instead of being swallowed.
      throw new RuntimeException("Error analyzing query text", e);
    }
  }

  /** Advances the buffered stream by one token and returns its term as an independent copy. */
  private static Term nextTerm(CachingTokenFilter buffer, TermToBytesRefAttribute termAtt, BytesRef bytes, String field) throws IOException {
    boolean hasNext = buffer.incrementToken();
    assert hasNext;
    termAtt.fillBytesRef();
    // bytes is reused for every token, so each Term needs its own copy
    return new Term(field, BytesRef.deepCopyOf(bytes));
  }

  /** Adds the terms of one position to the multi-phrase query, honoring enablePositionIncrements. */
  private void addPhraseTerms(MultiPhraseQuery mpq, List<Term> terms, int position) {
    Term[] termArray = terms.toArray(new Term[0]);
    if (enablePositionIncrements) {
      mpq.add(termArray, position);
    } else {
      mpq.add(termArray);
    }
  }

  /**
   * Builds a new BooleanQuery instance.
   * <p>
   * This is intended for subclasses that wish to customize the generated queries.
   * @param disableCoord disable coord
   * @return new BooleanQuery instance
   */
  protected BooleanQuery newBooleanQuery(boolean disableCoord) {
    return new BooleanQuery(disableCoord);
  }

  /**
   * Builds a new TermQuery instance.
   * <p>
   * This is intended for subclasses that wish to customize the generated queries.
   * @param term term
   * @return new TermQuery instance
   */
  protected Query newTermQuery(Term term) {
    return new TermQuery(term);
  }

  /**
   * Builds a new PhraseQuery instance.
   * <p>
   * This is intended for subclasses that wish to customize the generated queries.
   * @return new PhraseQuery instance
   */
  protected PhraseQuery newPhraseQuery() {
    return new PhraseQuery();
  }

  /**
   * Builds a new MultiPhraseQuery instance.
   * <p>
   * This is intended for subclasses that wish to customize the generated queries.
   * @return new MultiPhraseQuery instance
   */
  protected MultiPhraseQuery newMultiPhraseQuery() {
    return new MultiPhraseQuery();
  }
}