| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.queryparser.complexPhrase; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Objects; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.queryparser.classic.ParseException; |
| import org.apache.lucene.queryparser.classic.QueryParser; |
| import org.apache.lucene.search.BooleanClause; |
| import org.apache.lucene.search.BooleanClause.Occur; |
| import org.apache.lucene.search.BooleanQuery; |
| import org.apache.lucene.search.BoostQuery; |
| import org.apache.lucene.search.IndexSearcher; |
| import org.apache.lucene.search.MatchNoDocsQuery; |
| import org.apache.lucene.search.MultiTermQuery; |
| import org.apache.lucene.search.MultiTermQuery.RewriteMethod; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.QueryVisitor; |
| import org.apache.lucene.search.SynonymQuery; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.search.spans.SpanBoostQuery; |
| import org.apache.lucene.search.spans.SpanNearQuery; |
| import org.apache.lucene.search.spans.SpanNotQuery; |
| import org.apache.lucene.search.spans.SpanOrQuery; |
| import org.apache.lucene.search.spans.SpanQuery; |
| import org.apache.lucene.search.spans.SpanTermQuery; |
| |
| /** |
| * QueryParser which permits complex phrase query syntax eg "(john jon |
| * jonathan~) peters*". |
| * <p> |
| * Performs potentially multiple passes over Query text to parse any nested |
| * logic in PhraseQueries. - First pass takes any PhraseQuery content between |
| * quotes and stores for subsequent pass. All other query content is parsed as |
| * normal - Second pass parses any stored PhraseQuery content, checking all |
| * embedded clauses are referring to the same field and therefore can be |
| * rewritten as Span queries. All PhraseQuery clauses are expressed as |
| * ComplexPhraseQuery objects |
| * </p> |
| * <p> |
| * This could arguably be done in one pass using a new QueryParser but here I am |
| * working within the constraints of the existing parser as a base class. This |
| * currently simply feeds all phrase content through an analyzer to select |
| * phrase terms - any "special" syntax such as * ~ * etc are not given special |
| * status |
| * </p> |
| * |
| */ |
| public class ComplexPhraseQueryParser extends QueryParser { |
| private ArrayList<ComplexPhraseQuery> complexPhrases = null; |
| |
| private boolean isPass2ResolvingPhrases; |
| |
| private boolean inOrder = true; |
| |
| /** |
| * When <code>inOrder</code> is true, the search terms must |
| * exists in the documents as the same order as in query. |
| * |
| * @param inOrder parameter to choose between ordered or un-ordered proximity search |
| */ |
| public void setInOrder(final boolean inOrder) { |
| this.inOrder = inOrder; |
| } |
| |
| private ComplexPhraseQuery currentPhraseQuery = null; |
| |
| public ComplexPhraseQueryParser(String f, Analyzer a) { |
| super(f, a); |
| } |
| |
| @Override |
| protected Query getFieldQuery(String field, String queryText, int slop) { |
| ComplexPhraseQuery cpq = new ComplexPhraseQuery(field, queryText, slop, inOrder); |
| complexPhrases.add(cpq); // add to list of phrases to be parsed once |
| // we |
| // are through with this pass |
| return cpq; |
| } |
| |
| @Override |
| public Query parse(String query) throws ParseException { |
| if (isPass2ResolvingPhrases) { |
| MultiTermQuery.RewriteMethod oldMethod = getMultiTermRewriteMethod(); |
| try { |
| // Temporarily force BooleanQuery rewrite so that Parser will |
| // generate visible |
| // collection of terms which we can convert into SpanQueries. |
| // ConstantScoreRewrite mode produces an |
| // opaque ConstantScoreQuery object which cannot be interrogated for |
| // terms in the same way a BooleanQuery can. |
| // QueryParser is not guaranteed threadsafe anyway so this temporary |
| // state change should not |
| // present an issue |
| setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE); |
| return super.parse(query); |
| } finally { |
| setMultiTermRewriteMethod(oldMethod); |
| } |
| } |
| |
| // First pass - parse the top-level query recording any PhraseQuerys |
| // which will need to be resolved |
| complexPhrases = new ArrayList<>(); |
| Query q = super.parse(query); |
| |
| // Perform second pass, using this QueryParser to parse any nested |
| // PhraseQueries with different |
| // set of syntax restrictions (i.e. all fields must be same) |
| isPass2ResolvingPhrases = true; |
| try { |
| for (Iterator<ComplexPhraseQuery> iterator = complexPhrases.iterator(); iterator.hasNext();) { |
| currentPhraseQuery = iterator.next(); |
| // in each phrase, now parse the contents between quotes as a |
| // separate parse operation |
| currentPhraseQuery.parsePhraseElements(this); |
| } |
| } finally { |
| isPass2ResolvingPhrases = false; |
| } |
| return q; |
| } |
| |
| // There is No "getTermQuery throws ParseException" method to override so |
| // unfortunately need |
| // to throw a runtime exception here if a term for another field is embedded |
| // in phrase query |
| @Override |
| protected Query newTermQuery(Term term, float boost) { |
| if (isPass2ResolvingPhrases) { |
| try { |
| checkPhraseClauseIsForSameField(term.field()); |
| } catch (ParseException pe) { |
| throw new RuntimeException("Error parsing complex phrase", pe); |
| } |
| } |
| return super.newTermQuery(term, boost); |
| } |
| |
| // Helper method used to report on any clauses that appear in query syntax |
| private void checkPhraseClauseIsForSameField(String field) |
| throws ParseException { |
| if (!field.equals(currentPhraseQuery.field)) { |
| throw new ParseException("Cannot have clause for field \"" + field |
| + "\" nested in phrase " + " for field \"" + currentPhraseQuery.field |
| + "\""); |
| } |
| } |
| |
| @Override |
| protected Query getWildcardQuery(String field, String termStr) |
| throws ParseException { |
| if (isPass2ResolvingPhrases) { |
| checkPhraseClauseIsForSameField(field); |
| } |
| return super.getWildcardQuery(field, termStr); |
| } |
| |
| @Override |
| protected Query getRangeQuery(String field, String part1, String part2, |
| boolean startInclusive, boolean endInclusive) throws ParseException { |
| if (isPass2ResolvingPhrases) { |
| checkPhraseClauseIsForSameField(field); |
| } |
| return super.getRangeQuery(field, part1, part2, startInclusive, endInclusive); |
| } |
| |
| @Override |
| protected Query newRangeQuery(String field, String part1, String part2, |
| boolean startInclusive, boolean endInclusive) { |
| RewriteMethod originalRewriteMethod = getMultiTermRewriteMethod(); |
| try { |
| if (isPass2ResolvingPhrases) { |
| setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_REWRITE); |
| } |
| return super.newRangeQuery(field, part1, part2, startInclusive, endInclusive); |
| } finally { |
| setMultiTermRewriteMethod(originalRewriteMethod); |
| } |
| } |
| |
| @Override |
| protected Query getFuzzyQuery(String field, String termStr, |
| float minSimilarity) throws ParseException { |
| if (isPass2ResolvingPhrases) { |
| checkPhraseClauseIsForSameField(field); |
| } |
| return super.getFuzzyQuery(field, termStr, minSimilarity); |
| } |
| |
| /* |
| * Used to handle the query content in between quotes and produced Span-based |
| * interpretations of the clauses. |
| */ |
| static class ComplexPhraseQuery extends Query { |
| |
| final String field; |
| |
| final String phrasedQueryStringContents; |
| |
| final int slopFactor; |
| |
| private final boolean inOrder; |
| |
| private final Query[] contents = new Query[1]; |
| |
| public ComplexPhraseQuery(String field, String phrasedQueryStringContents, |
| int slopFactor, boolean inOrder) { |
| this.field = Objects.requireNonNull(field); |
| this.phrasedQueryStringContents = Objects.requireNonNull(phrasedQueryStringContents); |
| this.slopFactor = slopFactor; |
| this.inOrder = inOrder; |
| } |
| |
| // Called by ComplexPhraseQueryParser for each phrase after the main |
| // parse |
| // thread is through |
| protected void parsePhraseElements(ComplexPhraseQueryParser qp) throws ParseException { |
| // TODO ensure that field-sensitivity is preserved ie the query |
| // string below is parsed as |
| // field+":("+phrasedQueryStringContents+")" |
| // but this will need code in rewrite to unwrap the first layer of |
| // boolean query |
| |
| String oldDefaultParserField = qp.field; |
| try { |
| //temporarily set the QueryParser to be parsing the default field for this phrase e.g author:"fred* smith" |
| qp.field = this.field; |
| contents[0] = qp.parse(phrasedQueryStringContents); |
| } |
| finally { |
| qp.field = oldDefaultParserField; |
| } |
| } |
| |
| @Override |
| public void visit(QueryVisitor visitor) { |
| visitor.visitLeaf(this); |
| } |
| |
| @Override |
| public Query rewrite(IndexReader reader) throws IOException { |
| final Query contents = this.contents[0]; |
| // ArrayList spanClauses = new ArrayList(); |
| if (contents instanceof TermQuery |
| || contents instanceof MultiTermQuery |
| || contents instanceof SynonymQuery |
| ) { |
| return contents; |
| } |
| // Build a sequence of Span clauses arranged in a SpanNear - child |
| // clauses can be complex |
| // Booleans e.g. nots and ors etc |
| int numNegatives = 0; |
| if (!(contents instanceof BooleanQuery)) { |
| throw new IllegalArgumentException("Unknown query type \"" |
| + contents.getClass().getName() |
| + "\" found in phrase query string \"" + phrasedQueryStringContents |
| + "\""); |
| } |
| BooleanQuery bq = (BooleanQuery) contents; |
| SpanQuery[] allSpanClauses = new SpanQuery[bq.clauses().size()]; |
| // For all clauses e.g. one* two~ |
| int i = 0; |
| for (BooleanClause clause : bq) { |
| // HashSet bclauseterms=new HashSet(); |
| Query qc = clause.getQuery(); |
| // Rewrite this clause e.g one* becomes (one OR onerous) |
| qc = new IndexSearcher(reader).rewrite(qc); |
| if (clause.getOccur().equals(BooleanClause.Occur.MUST_NOT)) { |
| numNegatives++; |
| } |
| |
| while (qc instanceof BoostQuery) { |
| qc = ((BoostQuery) qc).getQuery(); |
| } |
| |
| if (qc instanceof BooleanQuery || qc instanceof SynonymQuery) { |
| ArrayList<SpanQuery> sc = new ArrayList<>(); |
| BooleanQuery booleanCaluse = qc instanceof BooleanQuery ? |
| (BooleanQuery) qc : convert((SynonymQuery) qc); |
| addComplexPhraseClause(sc, booleanCaluse); |
| if (sc.size() > 0) { |
| allSpanClauses[i] = sc.get(0); |
| } else { |
| // Insert fake term e.g. phrase query was for "Fred Smithe*" and |
| // there were no "Smithe*" terms - need to |
| // prevent match on just "Fred". |
| allSpanClauses[i] = new SpanTermQuery(new Term(field, |
| "Dummy clause because no terms found - must match nothing")); |
| } |
| } else if (qc instanceof MatchNoDocsQuery) { |
| // Insert fake term e.g. phrase query was for "Fred Smithe*" and |
| // there were no "Smithe*" terms - need to |
| // prevent match on just "Fred". |
| allSpanClauses[i] = new SpanTermQuery(new Term(field, |
| "Dummy clause because no terms found - must match nothing")); |
| } else { |
| if (qc instanceof TermQuery) { |
| TermQuery tq = (TermQuery) qc; |
| allSpanClauses[i] = new SpanTermQuery(tq.getTerm()); |
| } else { |
| throw new IllegalArgumentException("Unknown query type \"" |
| + qc.getClass().getName() |
| + "\" found in phrase query string \"" |
| + phrasedQueryStringContents + "\""); |
| } |
| } |
| |
| i += 1; |
| } |
| if (numNegatives == 0) { |
| // The simple case - no negative elements in phrase |
| return new SpanNearQuery(allSpanClauses, slopFactor, inOrder); |
| } |
| // Complex case - we have mixed positives and negatives in the |
| // sequence. |
| // Need to return a SpanNotQuery |
| ArrayList<SpanQuery> positiveClauses = new ArrayList<>(); |
| i = 0; |
| for (BooleanClause clause : bq) { |
| if (!clause.getOccur().equals(BooleanClause.Occur.MUST_NOT)) { |
| positiveClauses.add(allSpanClauses[i]); |
| } |
| i += 1; |
| } |
| |
| SpanQuery[] includeClauses = positiveClauses |
| .toArray(new SpanQuery[positiveClauses.size()]); |
| |
| SpanQuery include = null; |
| if (includeClauses.length == 1) { |
| include = includeClauses[0]; // only one positive clause |
| } else { |
| // need to increase slop factor based on gaps introduced by |
| // negatives |
| include = new SpanNearQuery(includeClauses, slopFactor + numNegatives, |
| inOrder); |
| } |
| // Use sequence of positive and negative values as the exclude. |
| SpanNearQuery exclude = new SpanNearQuery(allSpanClauses, slopFactor, |
| inOrder); |
| SpanNotQuery snot = new SpanNotQuery(include, exclude); |
| return snot; |
| } |
| |
| private BooleanQuery convert(SynonymQuery qc) { |
| BooleanQuery.Builder bqb = new BooleanQuery.Builder(); |
| for (Term t : qc.getTerms()){ |
| bqb.add(new BooleanClause(new TermQuery(t), Occur.SHOULD)); |
| } |
| return bqb.build(); |
| } |
| |
| private void addComplexPhraseClause(List<SpanQuery> spanClauses, BooleanQuery qc) { |
| ArrayList<SpanQuery> ors = new ArrayList<>(); |
| ArrayList<SpanQuery> nots = new ArrayList<>(); |
| |
| // For all clauses e.g. one* two~ |
| for (BooleanClause clause : qc) { |
| Query childQuery = clause.getQuery(); |
| |
| float boost = 1f; |
| while (childQuery instanceof BoostQuery) { |
| BoostQuery bq = (BoostQuery) childQuery; |
| boost *= bq.getBoost(); |
| childQuery = bq.getQuery(); |
| } |
| |
| // select the list to which we will add these options |
| ArrayList<SpanQuery> chosenList = ors; |
| if (clause.getOccur() == BooleanClause.Occur.MUST_NOT) { |
| chosenList = nots; |
| } |
| |
| if (childQuery instanceof TermQuery) { |
| TermQuery tq = (TermQuery) childQuery; |
| SpanQuery stq = new SpanTermQuery(tq.getTerm()); |
| if (boost != 1f) { |
| stq = new SpanBoostQuery(stq, boost); |
| } |
| chosenList.add(stq); |
| } else if (childQuery instanceof BooleanQuery) { |
| BooleanQuery cbq = (BooleanQuery) childQuery; |
| addComplexPhraseClause(chosenList, cbq); |
| } else if (childQuery instanceof MatchNoDocsQuery) { |
| // Insert fake term e.g. phrase query was for "Fred Smithe*" and |
| // there were no "Smithe*" terms - need to |
| // prevent match on just "Fred". |
| SpanQuery stq = new SpanTermQuery(new Term(field, |
| "Dummy clause because no terms found - must match nothing")); |
| chosenList.add(stq); |
| } else { |
| // TODO alternatively could call extract terms here? |
| throw new IllegalArgumentException("Unknown query type:" |
| + childQuery.getClass().getName()); |
| } |
| } |
| if (ors.size() == 0) { |
| return; |
| } |
| SpanOrQuery soq = new SpanOrQuery(ors |
| .toArray(new SpanQuery[ors.size()])); |
| if (nots.size() == 0) { |
| spanClauses.add(soq); |
| } else { |
| SpanOrQuery snqs = new SpanOrQuery(nots |
| .toArray(new SpanQuery[nots.size()])); |
| SpanNotQuery snq = new SpanNotQuery(soq, snqs); |
| spanClauses.add(snq); |
| } |
| } |
| |
| @Override |
| public String toString(String field) { |
| StringBuilder sb = new StringBuilder(); |
| if (!this.field.equals(field)) { |
| sb.append(this.field).append(":"); |
| } |
| sb.append("\"").append(phrasedQueryStringContents).append("\""); |
| if (slopFactor != 0) { |
| sb.append("~").append(slopFactor); |
| } |
| return sb.toString(); |
| } |
| |
| @Override |
| public int hashCode() { |
| final int prime = 31; |
| int result = classHash(); |
| result = prime * result + field.hashCode(); |
| result = prime * result + phrasedQueryStringContents.hashCode(); |
| result = prime * result + slopFactor; |
| result = prime * result + (inOrder ? 1 : 0); |
| return result; |
| } |
| |
| @Override |
| public boolean equals(Object other) { |
| return sameClassAs(other) && |
| equalsTo(getClass().cast(other)); |
| } |
| |
| private boolean equalsTo(ComplexPhraseQuery other) { |
| return field.equals(other.field) && |
| phrasedQueryStringContents.equals(other.phrasedQueryStringContents) && |
| slopFactor == other.slopFactor && |
| inOrder == other.inOrder; |
| } |
| } |
| } |