| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.monitor; |
| |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.search.BooleanClause; |
| import org.apache.lucene.search.BooleanQuery; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.TermInSetQuery; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefHash; |
| |
| /** |
| * A TermFilteredPresearcher that indexes queries multiple times, with terms collected |
| * from different routes through a querytree. Each route will produce a set of terms |
| * that are *sufficient* to select the query, and are indexed into a separate, suffixed field. |
| * <p> |
| * Incoming documents are then converted to a set of Disjunction queries over each |
| * suffixed field, and these queries are combined into a conjunction query, such that the |
| * document's set of terms must match a term from each route. |
| * <p> |
| * This allows filtering out of documents that contain one half of a two-term phrase query, for |
| * example. The query {@code "hello world"} will be indexed twice, once under 'hello' and once |
| * under 'world'. A document containing the terms "hello there" would match the first field, |
| * but not the second, and so would not be selected for matching. |
| * <p> |
| * The number of passes the presearcher makes is configurable. More passes will improve the |
| * selected/matched ratio, but will take longer to index and will use more RAM. |
| * <p> |
| * A minimum weight can be set for terms to be chosen for the second and subsequent passes. This |
| * allows users to avoid indexing stopwords, for example. |
| */ |
| public class MultipassTermFilteredPresearcher extends TermFilteredPresearcher { |
| |
| private final int passes; |
| private final float minWeight; |
| |
| /** |
| * Construct a new MultipassTermFilteredPresearcher |
| * |
| * @param passes the number of times a query should be indexed |
| * @param minWeight the minimum weight a querytree should be advanced over |
| * @param weightor the TreeWeightor to use |
| * @param queryHandlers a list of custom query handlers |
| * @param filterFields a set of fields to use as filters |
| */ |
| public MultipassTermFilteredPresearcher(int passes, float minWeight, TermWeightor weightor, |
| List<CustomQueryHandler> queryHandlers, Set<String> filterFields) { |
| super(weightor, queryHandlers, filterFields); |
| this.passes = passes; |
| this.minWeight = minWeight; |
| } |
| |
| /** |
| * Construct a new MultipassTermFilteredPresearcher using {@link TermFilteredPresearcher#DEFAULT_WEIGHTOR} |
| * <p> |
| * Note that this will be constructed with a minimum advance weight of zero |
| * |
| * @param passes the number of times a query should be indexed |
| */ |
| public MultipassTermFilteredPresearcher(int passes) { |
| this(passes, 0, DEFAULT_WEIGHTOR, Collections.emptyList(), Collections.emptySet()); |
| } |
| |
| @Override |
| protected DocumentQueryBuilder getQueryBuilder() { |
| return new MultipassDocumentQueryBuilder(); |
| } |
| |
| private static String field(String field, int pass) { |
| return field + "_" + pass; |
| } |
| |
| private class MultipassDocumentQueryBuilder implements DocumentQueryBuilder { |
| |
| BooleanQuery.Builder[] queries = new BooleanQuery.Builder[passes]; |
| Map<String, BytesRefHash> terms = new HashMap<>(); |
| |
| MultipassDocumentQueryBuilder() { |
| for (int i = 0; i < queries.length; i++) { |
| queries[i] = new BooleanQuery.Builder(); |
| } |
| } |
| |
| @Override |
| public void addTerm(String field, BytesRef term) { |
| BytesRefHash t = terms.computeIfAbsent(field, f -> new BytesRefHash()); |
| t.add(term); |
| } |
| |
| @Override |
| public Query build() { |
| Map<String, BytesRef[]> collectedTerms = new HashMap<>(); |
| for (Map.Entry<String, BytesRefHash> entry : terms.entrySet()) { |
| collectedTerms.put(entry.getKey(), convertHash(entry.getValue())); |
| } |
| BooleanQuery.Builder parent = new BooleanQuery.Builder(); |
| for (int i = 0; i < passes; i++) { |
| BooleanQuery.Builder child = new BooleanQuery.Builder(); |
| for (String field : terms.keySet()) { |
| child.add(new TermInSetQuery(field(field, i), collectedTerms.get(field)), BooleanClause.Occur.SHOULD); |
| } |
| parent.add(child.build(), BooleanClause.Occur.MUST); |
| } |
| return parent.build(); |
| } |
| } |
| |
| @Override |
| public Document buildQueryDocument(QueryTree querytree) { |
| |
| Document doc = new Document(); |
| |
| for (int i = 0; i < passes; i++) { |
| Map<String, BytesRefHash> fieldTerms = collectTerms(querytree); |
| for (Map.Entry<String, BytesRefHash> entry : fieldTerms.entrySet()) { |
| // we add the index terms once under a suffixed field for the multipass query, and |
| // once under the plan field name for the TermsEnumTokenFilter |
| doc.add(new Field(field(entry.getKey(), i), |
| new TermsEnumTokenStream(new BytesRefHashIterator(entry.getValue())), QUERYFIELDTYPE)); |
| doc.add(new Field(entry.getKey(), |
| new TermsEnumTokenStream(new BytesRefHashIterator(entry.getValue())), QUERYFIELDTYPE)); |
| } |
| querytree.advancePhase(minWeight); |
| } |
| |
| return doc; |
| } |
| |
| private static BytesRef[] convertHash(BytesRefHash hash) { |
| BytesRef[] terms = new BytesRef[hash.size()]; |
| for (int i = 0; i < terms.length; i++) { |
| BytesRef t = new BytesRef(); |
| terms[i] = hash.get(i, t); |
| } |
| return terms; |
| } |
| |
| } |