| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.vectorhighlight; |
| import java.io.IOException; |
| import java.util.Collection; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.LinkedHashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.queries.function.FunctionScoreQuery; |
| import org.apache.lucene.search.BooleanClause; |
| import org.apache.lucene.search.BooleanQuery; |
| import org.apache.lucene.search.BoostQuery; |
| import org.apache.lucene.search.ConstantScoreQuery; |
| import org.apache.lucene.search.DisjunctionMaxQuery; |
| import org.apache.lucene.search.MultiTermQuery; |
| import org.apache.lucene.search.PhraseQuery; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.SynonymQuery; |
| import org.apache.lucene.search.TermQuery; |
| import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; |
| |
| /** |
| * FieldQuery breaks down query object into terms/phrases and keeps |
| * them in a QueryPhraseMap structure. |
| */ |
| public class FieldQuery { |
| |
| final boolean fieldMatch; |
| |
| // fieldMatch==true, Map<fieldName,QueryPhraseMap> |
| // fieldMatch==false, Map<null,QueryPhraseMap> |
| Map<String, QueryPhraseMap> rootMaps = new HashMap<>(); |
| |
| // fieldMatch==true, Map<fieldName,setOfTermsInQueries> |
| // fieldMatch==false, Map<null,setOfTermsInQueries> |
| Map<String, Set<String>> termSetMap = new HashMap<>(); |
| |
| int termOrPhraseNumber; // used for colored tag support |
| |
| // The maximum number of different matching terms accumulated from any one MultiTermQuery |
| private static final int MAX_MTQ_TERMS = 1024; |
| |
| public FieldQuery(Query query, IndexReader reader, boolean phraseHighlight, boolean fieldMatch) throws IOException { |
| this.fieldMatch = fieldMatch; |
| Set<Query> flatQueries = new LinkedHashSet<>(); |
| flatten( query, reader, flatQueries, 1f ); |
| saveTerms( flatQueries, reader ); |
| Collection<Query> expandQueries = expand( flatQueries ); |
| |
| for( Query flatQuery : expandQueries ){ |
| QueryPhraseMap rootMap = getRootMap( flatQuery ); |
| rootMap.add( flatQuery, reader ); |
| float boost = 1f; |
| while (flatQuery instanceof BoostQuery) { |
| BoostQuery bq = (BoostQuery) flatQuery; |
| flatQuery = bq.getQuery(); |
| boost *= bq.getBoost(); |
| } |
| if( !phraseHighlight && flatQuery instanceof PhraseQuery ){ |
| PhraseQuery pq = (PhraseQuery)flatQuery; |
| if( pq.getTerms().length > 1 ){ |
| for( Term term : pq.getTerms() ) |
| rootMap.addTerm( term, boost ); |
| } |
| } |
| } |
| } |
| |
| /** For backwards compatibility you can initialize FieldQuery without |
| * an IndexReader, which is only required to support MultiTermQuery |
| */ |
| FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ) throws IOException { |
| this (query, null, phraseHighlight, fieldMatch); |
| } |
| |
| protected void flatten( Query sourceQuery, IndexReader reader, Collection<Query> flatQueries, float boost ) throws IOException { |
| while (sourceQuery instanceof BoostQuery) { |
| BoostQuery bq = (BoostQuery) sourceQuery; |
| sourceQuery = bq.getQuery(); |
| boost *= bq.getBoost(); |
| } |
| if( sourceQuery instanceof BooleanQuery ){ |
| BooleanQuery bq = (BooleanQuery)sourceQuery; |
| for( BooleanClause clause : bq ) { |
| if( !clause.isProhibited() ) { |
| flatten( clause.getQuery(), reader, flatQueries, boost ); |
| } |
| } |
| } else if( sourceQuery instanceof DisjunctionMaxQuery ){ |
| DisjunctionMaxQuery dmq = (DisjunctionMaxQuery)sourceQuery; |
| for( Query query : dmq ){ |
| flatten( query, reader, flatQueries, boost ); |
| } |
| } |
| else if( sourceQuery instanceof TermQuery ){ |
| if (boost != 1f) { |
| sourceQuery = new BoostQuery(sourceQuery, boost); |
| } |
| if( !flatQueries.contains( sourceQuery ) ) |
| flatQueries.add( sourceQuery ); |
| } |
| else if ( sourceQuery instanceof SynonymQuery ){ |
| SynonymQuery synQuery = (SynonymQuery) sourceQuery; |
| for( Term term : synQuery.getTerms()) { |
| flatten( new TermQuery(term), reader, flatQueries, boost); |
| } |
| } |
| else if( sourceQuery instanceof PhraseQuery ){ |
| PhraseQuery pq = (PhraseQuery)sourceQuery; |
| if( pq.getTerms().length == 1 ) |
| sourceQuery = new TermQuery( pq.getTerms()[0] ); |
| if (boost != 1f) { |
| sourceQuery = new BoostQuery(sourceQuery, boost); |
| } |
| flatQueries.add(sourceQuery); |
| } else if (sourceQuery instanceof ConstantScoreQuery) { |
| final Query q = ((ConstantScoreQuery) sourceQuery).getQuery(); |
| if (q != null) { |
| flatten( q, reader, flatQueries, boost); |
| } |
| } else if (sourceQuery instanceof FunctionScoreQuery) { |
| final Query q = ((FunctionScoreQuery) sourceQuery).getWrappedQuery(); |
| if (q != null) { |
| flatten(q, reader, flatQueries, boost); |
| } |
| } else if (reader != null) { |
| Query query = sourceQuery; |
| Query rewritten; |
| if (sourceQuery instanceof MultiTermQuery) { |
| rewritten = new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(MAX_MTQ_TERMS).rewrite(reader, (MultiTermQuery) query); |
| } else { |
| rewritten = query.rewrite(reader); |
| } |
| if (rewritten != query) { |
| // only rewrite once and then flatten again - the rewritten query could have a speacial treatment |
| // if this method is overwritten in a subclass. |
| flatten(rewritten, reader, flatQueries, boost); |
| |
| } |
| // if the query is already rewritten we discard it |
| } |
| // else discard queries |
| } |
| |
| /* |
| * Create expandQueries from flatQueries. |
| * |
| * expandQueries := flatQueries + overlapped phrase queries |
| * |
| * ex1) flatQueries={a,b,c} |
| * => expandQueries={a,b,c} |
| * ex2) flatQueries={a,"b c","c d"} |
| * => expandQueries={a,"b c","c d","b c d"} |
| */ |
| Collection<Query> expand( Collection<Query> flatQueries ){ |
| Set<Query> expandQueries = new LinkedHashSet<>(); |
| for( Iterator<Query> i = flatQueries.iterator(); i.hasNext(); ){ |
| Query query = i.next(); |
| i.remove(); |
| expandQueries.add( query ); |
| float queryBoost = 1f; |
| while (query instanceof BoostQuery) { |
| BoostQuery bq = (BoostQuery) query; |
| queryBoost *= bq.getBoost(); |
| query = bq.getQuery(); |
| } |
| if( !( query instanceof PhraseQuery ) ) continue; |
| for( Iterator<Query> j = flatQueries.iterator(); j.hasNext(); ){ |
| Query qj = j.next(); |
| float qjBoost = 1f; |
| while (qj instanceof BoostQuery) { |
| BoostQuery bq = (BoostQuery) qj; |
| qjBoost *= bq.getBoost(); |
| qj = bq.getQuery(); |
| } |
| if( !( qj instanceof PhraseQuery ) ) continue; |
| checkOverlap( expandQueries, (PhraseQuery)query, queryBoost, (PhraseQuery)qj, qjBoost ); |
| } |
| } |
| return expandQueries; |
| } |
| |
| /* |
| * Check if PhraseQuery A and B have overlapped part. |
| * |
| * ex1) A="a b", B="b c" => overlap; expandQueries={"a b c"} |
| * ex2) A="b c", B="a b" => overlap; expandQueries={"a b c"} |
| * ex3) A="a b", B="c d" => no overlap; expandQueries={} |
| */ |
| private void checkOverlap( Collection<Query> expandQueries, PhraseQuery a, float aBoost, PhraseQuery b, float bBoost ){ |
| if( a.getSlop() != b.getSlop() ) return; |
| Term[] ats = a.getTerms(); |
| Term[] bts = b.getTerms(); |
| if( fieldMatch && !ats[0].field().equals( bts[0].field() ) ) return; |
| checkOverlap( expandQueries, ats, bts, a.getSlop(), aBoost); |
| checkOverlap( expandQueries, bts, ats, b.getSlop(), bBoost ); |
| } |
| |
| /* |
| * Check if src and dest have overlapped part and if it is, create PhraseQueries and add expandQueries. |
| * |
| * ex1) src="a b", dest="c d" => no overlap |
| * ex2) src="a b", dest="a b c" => no overlap |
| * ex3) src="a b", dest="b c" => overlap; expandQueries={"a b c"} |
| * ex4) src="a b c", dest="b c d" => overlap; expandQueries={"a b c d"} |
| * ex5) src="a b c", dest="b c" => no overlap |
| * ex6) src="a b c", dest="b" => no overlap |
| * ex7) src="a a a a", dest="a a a" => overlap; |
| * expandQueries={"a a a a a","a a a a a a"} |
| * ex8) src="a b c d", dest="b c" => no overlap |
| */ |
| private void checkOverlap( Collection<Query> expandQueries, Term[] src, Term[] dest, int slop, float boost ){ |
| // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms |
| // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten() |
| // converts PhraseQuery to TermQuery) |
| for( int i = 1; i < src.length; i++ ){ |
| boolean overlap = true; |
| for( int j = i; j < src.length; j++ ){ |
| if( ( j - i ) < dest.length && !src[j].text().equals( dest[j-i].text() ) ){ |
| overlap = false; |
| break; |
| } |
| } |
| if( overlap && src.length - i < dest.length ){ |
| PhraseQuery.Builder pqBuilder = new PhraseQuery.Builder(); |
| for( Term srcTerm : src ) |
| pqBuilder.add( srcTerm ); |
| for( int k = src.length - i; k < dest.length; k++ ){ |
| pqBuilder.add( new Term( src[0].field(), dest[k].text() ) ); |
| } |
| pqBuilder.setSlop( slop ); |
| Query pq = pqBuilder.build(); |
| if (boost != 1f) { |
| pq = new BoostQuery(pq, 1f); |
| } |
| if(!expandQueries.contains( pq ) ) |
| expandQueries.add( pq ); |
| } |
| } |
| } |
| |
| QueryPhraseMap getRootMap( Query query ){ |
| String key = getKey( query ); |
| QueryPhraseMap map = rootMaps.get( key ); |
| if( map == null ){ |
| map = new QueryPhraseMap( this ); |
| rootMaps.put( key, map ); |
| } |
| return map; |
| } |
| |
| /* |
| * Return 'key' string. 'key' is the field name of the Query. |
| * If not fieldMatch, 'key' will be null. |
| */ |
| private String getKey( Query query ){ |
| if( !fieldMatch ) return null; |
| while (query instanceof BoostQuery) { |
| query = ((BoostQuery) query).getQuery(); |
| } |
| if( query instanceof TermQuery ) |
| return ((TermQuery)query).getTerm().field(); |
| else if ( query instanceof PhraseQuery ){ |
| PhraseQuery pq = (PhraseQuery)query; |
| Term[] terms = pq.getTerms(); |
| return terms[0].field(); |
| } |
| else if (query instanceof MultiTermQuery) { |
| return ((MultiTermQuery)query).getField(); |
| } |
| else |
| throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); |
| } |
| |
| /* |
| * Save the set of terms in the queries to termSetMap. |
| * |
| * ex1) q=name:john |
| * - fieldMatch==true |
| * termSetMap=Map<"name",Set<"john">> |
| * - fieldMatch==false |
| * termSetMap=Map<null,Set<"john">> |
| * |
| * ex2) q=name:john title:manager |
| * - fieldMatch==true |
| * termSetMap=Map<"name",Set<"john">, |
| * "title",Set<"manager">> |
| * - fieldMatch==false |
| * termSetMap=Map<null,Set<"john","manager">> |
| * |
| * ex3) q=name:"john lennon" |
| * - fieldMatch==true |
| * termSetMap=Map<"name",Set<"john","lennon">> |
| * - fieldMatch==false |
| * termSetMap=Map<null,Set<"john","lennon">> |
| */ |
| void saveTerms( Collection<Query> flatQueries, IndexReader reader ) throws IOException{ |
| for( Query query : flatQueries ){ |
| while (query instanceof BoostQuery) { |
| query = ((BoostQuery) query).getQuery(); |
| } |
| Set<String> termSet = getTermSet( query ); |
| if( query instanceof TermQuery ) |
| termSet.add( ((TermQuery)query).getTerm().text() ); |
| else if( query instanceof PhraseQuery ){ |
| for( Term term : ((PhraseQuery)query).getTerms() ) |
| termSet.add( term.text() ); |
| } |
| else if (query instanceof MultiTermQuery && reader != null) { |
| BooleanQuery mtqTerms = (BooleanQuery) query.rewrite(reader); |
| for (BooleanClause clause : mtqTerms) { |
| termSet.add (((TermQuery) clause.getQuery()).getTerm().text()); |
| } |
| } |
| else |
| throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); |
| } |
| } |
| |
| private Set<String> getTermSet( Query query ){ |
| String key = getKey( query ); |
| Set<String> set = termSetMap.get( key ); |
| if( set == null ){ |
| set = new HashSet<>(); |
| termSetMap.put( key, set ); |
| } |
| return set; |
| } |
| |
| Set<String> getTermSet( String field ){ |
| return termSetMap.get( fieldMatch ? field : null ); |
| } |
| |
| /** |
| * |
| * @return QueryPhraseMap |
| */ |
| public QueryPhraseMap getFieldTermMap( String fieldName, String term ){ |
| QueryPhraseMap rootMap = getRootMap( fieldName ); |
| return rootMap == null ? null : rootMap.subMap.get( term ); |
| } |
| |
| /** |
| * |
| * @return QueryPhraseMap |
| */ |
| public QueryPhraseMap searchPhrase( String fieldName, final List<TermInfo> phraseCandidate ){ |
| QueryPhraseMap root = getRootMap( fieldName ); |
| if( root == null ) return null; |
| return root.searchPhrase( phraseCandidate ); |
| } |
| |
| private QueryPhraseMap getRootMap( String fieldName ){ |
| return rootMaps.get( fieldMatch ? fieldName : null ); |
| } |
| |
| int nextTermOrPhraseNumber(){ |
| return termOrPhraseNumber++; |
| } |
| |
| /** |
| * Internal structure of a query for highlighting: represents |
| * a nested query structure |
| */ |
| public static class QueryPhraseMap { |
| |
| boolean terminal; |
| int slop; // valid if terminal == true and phraseHighlight == true |
| float boost; // valid if terminal == true |
| int termOrPhraseNumber; // valid if terminal == true |
| FieldQuery fieldQuery; |
| Map<String, QueryPhraseMap> subMap = new HashMap<>(); |
| |
| public QueryPhraseMap( FieldQuery fieldQuery ){ |
| this.fieldQuery = fieldQuery; |
| } |
| |
| void addTerm( Term term, float boost ){ |
| QueryPhraseMap map = getOrNewMap( subMap, term.text() ); |
| map.markTerminal( boost ); |
| } |
| |
| private QueryPhraseMap getOrNewMap( Map<String, QueryPhraseMap> subMap, String term ){ |
| QueryPhraseMap map = subMap.get( term ); |
| if( map == null ){ |
| map = new QueryPhraseMap( fieldQuery ); |
| subMap.put( term, map ); |
| } |
| return map; |
| } |
| |
| void add( Query query, IndexReader reader ) { |
| float boost = 1f; |
| while (query instanceof BoostQuery) { |
| BoostQuery bq = (BoostQuery) query; |
| query = bq.getQuery(); |
| boost = bq.getBoost(); |
| } |
| if( query instanceof TermQuery ){ |
| addTerm( ((TermQuery)query).getTerm(), boost ); |
| } |
| else if( query instanceof PhraseQuery ){ |
| PhraseQuery pq = (PhraseQuery)query; |
| Term[] terms = pq.getTerms(); |
| Map<String, QueryPhraseMap> map = subMap; |
| QueryPhraseMap qpm = null; |
| for( Term term : terms ){ |
| qpm = getOrNewMap( map, term.text() ); |
| map = qpm.subMap; |
| } |
| qpm.markTerminal( pq.getSlop(), boost ); |
| } |
| else |
| throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); |
| } |
| |
| public QueryPhraseMap getTermMap( String term ){ |
| return subMap.get( term ); |
| } |
| |
| private void markTerminal( float boost ){ |
| markTerminal( 0, boost ); |
| } |
| |
| private void markTerminal( int slop, float boost ){ |
| this.terminal = true; |
| this.slop = slop; |
| this.boost = boost; |
| this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber(); |
| } |
| |
| public boolean isTerminal(){ |
| return terminal; |
| } |
| |
| public int getSlop(){ |
| return slop; |
| } |
| |
| public float getBoost(){ |
| return boost; |
| } |
| |
| public int getTermOrPhraseNumber(){ |
| return termOrPhraseNumber; |
| } |
| |
| public QueryPhraseMap searchPhrase( final List<TermInfo> phraseCandidate ){ |
| QueryPhraseMap currMap = this; |
| for( TermInfo ti : phraseCandidate ){ |
| currMap = currMap.subMap.get( ti.getText() ); |
| if( currMap == null ) return null; |
| } |
| return currMap.isValidTermOrPhrase( phraseCandidate ) ? currMap : null; |
| } |
| |
| public boolean isValidTermOrPhrase( final List<TermInfo> phraseCandidate ){ |
| // check terminal |
| if( !terminal ) return false; |
| |
| // if the candidate is a term, it is valid |
| if( phraseCandidate.size() == 1 ) return true; |
| |
| // else check whether the candidate is valid phrase |
| // compare position-gaps between terms to slop |
| int pos = phraseCandidate.get( 0 ).getPosition(); |
| for( int i = 1; i < phraseCandidate.size(); i++ ){ |
| int nextPos = phraseCandidate.get( i ).getPosition(); |
| if( Math.abs( nextPos - pos - 1 ) > slop ) return false; |
| pos = nextPos; |
| } |
| return true; |
| } |
| } |
| } |