/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.monitor;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
/**
* A TermFilteredPresearcher that indexes queries multiple times, with terms collected
* from different routes through a querytree. Each route will produce a set of terms
* that are *sufficient* to select the query, and are indexed into a separate, suffixed field.
* <p>
* Incoming documents are then converted to one disjunction query per pass, over the fields
* carrying that pass's suffix, and these disjunctions are combined into a conjunction query,
* so that the document's set of terms must match a term from each route.
* <p>
* This allows, for example, filtering out documents that contain only one half of a two-term
* phrase query. The query {@code "hello world"} will be indexed twice, once under 'hello' and once
* under 'world'. A document containing the terms "hello there" would match the first field,
* but not the second, and so would not be selected for matching.
* <p>
* The number of passes the presearcher makes is configurable. More passes will improve the
* selected/matched ratio, but will take longer to index and will use more RAM.
* <p>
* A minimum weight can be set for terms to be chosen for the second and subsequent passes. This
* allows users to avoid indexing stopwords, for example.
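* <p>
* A sketch of typical usage (illustrative only; the analyzer, field name and query id
* below are placeholders):
*
* <pre>{@code
* Presearcher presearcher = new MultipassTermFilteredPresearcher(2);
* Monitor monitor = new Monitor(new StandardAnalyzer(), presearcher);
* monitor.register(new MonitorQuery("query1", new PhraseQuery("field", "hello", "world")));
*
* Document doc = new Document();
* doc.add(new TextField("field", "hello there", Field.Store.NO));
* // "hello there" supplies a term for the 'hello' pass but not the 'world' pass,
* // so the phrase query is filtered out before full matching
* MatchingQueries<QueryMatch> matches = monitor.match(doc, QueryMatch.SIMPLE_MATCHER);
* }</pre>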
*/
public class MultipassTermFilteredPresearcher extends TermFilteredPresearcher {
private final int passes;
private final float minWeight;
/**
* Construct a new MultipassTermFilteredPresearcher
*
* @param passes the number of times a query should be indexed
* @param minWeight the minimum weight a querytree should be advanced over
* @param weightor the TreeWeightor to use
* @param queryHandlers a list of custom query handlers
* @param filterFields a set of fields to use as filters
*/
public MultipassTermFilteredPresearcher(int passes, float minWeight, TermWeightor weightor,
List<CustomQueryHandler> queryHandlers, Set<String> filterFields) {
super(weightor, queryHandlers, filterFields);
this.passes = passes;
this.minWeight = minWeight;
}
/**
* Construct a new MultipassTermFilteredPresearcher using {@link TermFilteredPresearcher#DEFAULT_WEIGHTOR}
* <p>
* Note that this will be constructed with a minimum advance weight of zero
*
* @param passes the number of times a query should be indexed
*/
public MultipassTermFilteredPresearcher(int passes) {
this(passes, 0, DEFAULT_WEIGHTOR, Collections.emptyList(), Collections.emptySet());
}
@Override
protected DocumentQueryBuilder getQueryBuilder() {
return new MultipassDocumentQueryBuilder();
}
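// name of the field holding terms collected on a given pass, e.g. "f" -> "f_0", "f_1", ...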
private static String field(String field, int pass) {
return field + "_" + pass;
}
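// Collects the terms of an incoming document per field and builds the
// conjunction-of-per-pass-disjunctions query described in the class javadoc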
private class MultipassDocumentQueryBuilder implements DocumentQueryBuilder {
BooleanQuery.Builder[] queries = new BooleanQuery.Builder[passes];
Map<String, BytesRefHash> terms = new HashMap<>();
MultipassDocumentQueryBuilder() {
for (int i = 0; i < queries.length; i++) {
queries[i] = new BooleanQuery.Builder();
}
}
@Override
public void addTerm(String field, BytesRef term) {
BytesRefHash t = terms.computeIfAbsent(field, f -> new BytesRefHash());
t.add(term);
}
@Override
public Query build() {
Map<String, BytesRef[]> collectedTerms = new HashMap<>();
for (Map.Entry<String, BytesRefHash> entry : terms.entrySet()) {
collectedTerms.put(entry.getKey(), convertHash(entry.getValue()));
}
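// one disjunction per pass over that pass's suffixed fields, ANDed together:
// the document must supply at least one matching term for every pass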
BooleanQuery.Builder parent = new BooleanQuery.Builder();
for (int i = 0; i < passes; i++) {
BooleanQuery.Builder child = new BooleanQuery.Builder();
for (String field : terms.keySet()) {
child.add(new TermInSetQuery(field(field, i), collectedTerms.get(field)), BooleanClause.Occur.SHOULD);
}
parent.add(child.build(), BooleanClause.Occur.MUST);
}
return parent.build();
}
}
@Override
public Document buildQueryDocument(QueryTree querytree) {
Document doc = new Document();
for (int i = 0; i < passes; i++) {
Map<String, BytesRefHash> fieldTerms = collectTerms(querytree);
for (Map.Entry<String, BytesRefHash> entry : fieldTerms.entrySet()) {
// we add the index terms once under a suffixed field for the multipass query, and
// once under the plain field name for the TermsEnumTokenFilter
doc.add(new Field(field(entry.getKey(), i),
new TermsEnumTokenStream(new BytesRefHashIterator(entry.getValue())), QUERYFIELDTYPE));
doc.add(new Field(entry.getKey(),
new TermsEnumTokenStream(new BytesRefHashIterator(entry.getValue())), QUERYFIELDTYPE));
}
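// advance the query tree so the next pass collects a different sufficient set of
// terms; terms weighted below minWeight will not be chosen on subsequent passes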
querytree.advancePhase(minWeight);
}
return doc;
}
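// flatten a BytesRefHash into a BytesRef[] for use with TermInSetQuery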
private static BytesRef[] convertHash(BytesRefHash hash) {
BytesRef[] terms = new BytesRef[hash.size()];
for (int i = 0; i < terms.length; i++) {
BytesRef t = new BytesRef();
terms[i] = hash.get(i, t);
}
return terms;
}
}