/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.monitor;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
/**
* A TermFilteredPresearcher that indexes queries multiple times, with terms collected
* from different routes through a querytree. Each route will produce a set of terms
* that are *sufficient* to select the query, and are indexed into a separate, suffixed field.
* <p>
* Incoming documents are then converted to one disjunction query per pass, over the fields
* carrying that pass's suffix, and these disjunctions are combined into a conjunction query,
* so that the document's set of terms must match a term from each route.
* <p>
* This allows, for example, filtering out documents that contain only one half of a two-term
* phrase query. The query {@code "hello world"} will be indexed twice, once under 'hello' and once
* under 'world'. A document containing the terms "hello there" would match the first field,
* but not the second, and so would not be selected for matching.
* <p>
* The number of passes the presearcher makes is configurable. More passes will improve the
* selected/matched ratio, but will take longer to index and will use more RAM.
* <p>
* A minimum weight can be set for terms to be chosen for the second and subsequent passes. This
* allows users to avoid indexing stopwords, for example.
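* <p>
* A sketch of typical usage (illustrative only; the analyzer, field name and query id
* below are placeholders):
*
* <pre>{@code
* Presearcher presearcher = new MultipassTermFilteredPresearcher(2);
* Monitor monitor = new Monitor(new StandardAnalyzer(), presearcher);
* monitor.register(new MonitorQuery("query1", new PhraseQuery("field", "hello", "world")));
*
* Document doc = new Document();
* doc.add(new TextField("field", "hello there", Field.Store.NO));
* // "hello there" supplies a term for the 'hello' pass but not the 'world' pass,
* // so the phrase query is filtered out before full matching
* MatchingQueries<QueryMatch> matches = monitor.match(doc, QueryMatch.SIMPLE_MATCHER);
* }</pre>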
*/
public class MultipassTermFilteredPresearcher extends TermFilteredPresearcher {
private final int passes;
private final float minWeight;
/**
* Construct a new MultipassTermFilteredPresearcher
*
* @param passes the number of times a query should be indexed
* @param minWeight the minimum weight a querytree should be advanced over
* @param weightor the TreeWeightor to use
* @param queryHandlers a list of custom query handlers
* @param filterFields a set of fields to use as filters
*/
public MultipassTermFilteredPresearcher(int passes, float minWeight, TermWeightor weightor,
List<CustomQueryHandler> queryHandlers, Set<String> filterFields) {
super(weightor, queryHandlers, filterFields);
this.passes = passes;
this.minWeight = minWeight;
}
/**
* Construct a new MultipassTermFilteredPresearcher using {@link TermFilteredPresearcher#DEFAULT_WEIGHTOR}
* <p>
* Note that this will be constructed with a minimum advance weight of zero
*
* @param passes the number of times a query should be indexed
*/
public MultipassTermFilteredPresearcher(int passes) {
this(passes, 0, DEFAULT_WEIGHTOR, Collections.emptyList(), Collections.emptySet());
}
@Override
protected DocumentQueryBuilder getQueryBuilder() {
return new MultipassDocumentQueryBuilder();
}
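// name of the field holding terms collected on a given pass, e.g. "f" -> "f_0", "f_1", ...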
private static String field(String field, int pass) {
return field + "_" + pass;
}
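// Collects the terms of an incoming document per field and builds the
// conjunction-of-per-pass-disjunctions query described in the class javadoc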
private class MultipassDocumentQueryBuilder implements DocumentQueryBuilder {
BooleanQuery.Builder[] queries = new BooleanQuery.Builder[passes];
Map<String, BytesRefHash> terms = new HashMap<>();
MultipassDocumentQueryBuilder() {
for (int i = 0; i < queries.length; i++) {
queries[i] = new BooleanQuery.Builder();
}
}
@Override
public void addTerm(String field, BytesRef term) {
BytesRefHash t = terms.computeIfAbsent(field, f -> new BytesRefHash());
t.add(term);
}
@Override
public Query build() {
Map<String, BytesRef[]> collectedTerms = new HashMap<>();
for (Map.Entry<String, BytesRefHash> entry : terms.entrySet()) {
collectedTerms.put(entry.getKey(), convertHash(entry.getValue()));
}
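// one disjunction per pass over that pass's suffixed fields, ANDed together:
// the document must supply at least one matching term for every pass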
BooleanQuery.Builder parent = new BooleanQuery.Builder();
for (int i = 0; i < passes; i++) {
BooleanQuery.Builder child = new BooleanQuery.Builder();
for (String field : terms.keySet()) {
child.add(new TermInSetQuery(field(field, i), collectedTerms.get(field)), BooleanClause.Occur.SHOULD);
}
parent.add(child.build(), BooleanClause.Occur.MUST);
}
return parent.build();
}
}
@Override
public Document buildQueryDocument(QueryTree querytree) {
Document doc = new Document();
for (int i = 0; i < passes; i++) {
Map<String, BytesRefHash> fieldTerms = collectTerms(querytree);
for (Map.Entry<String, BytesRefHash> entry : fieldTerms.entrySet()) {
// we add the index terms once under a suffixed field for the multipass query, and
// once under the plain field name for the TermsEnumTokenFilter
doc.add(new Field(field(entry.getKey(), i),
new TermsEnumTokenStream(new BytesRefHashIterator(entry.getValue())), QUERYFIELDTYPE));
doc.add(new Field(entry.getKey(),
new TermsEnumTokenStream(new BytesRefHashIterator(entry.getValue())), QUERYFIELDTYPE));
}
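// advance the query tree so the next pass collects a different sufficient set of
// terms; terms weighted below minWeight will not be chosen on subsequent passes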
querytree.advancePhase(minWeight);
}
return doc;
}
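// flatten a BytesRefHash into a BytesRef[] for use with TermInSetQuery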
private static BytesRef[] convertHash(BytesRefHash hash) {
BytesRef[] terms = new BytesRef[hash.size()];
for (int i = 0; i < terms.length; i++) {
BytesRef t = new BytesRef();
terms[i] = hash.get(i, t);
}
return terms;
}
}