blob: 4a27ec343cc708ac009442e0c79641d1a00ab092 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.util.FrequencyTrackingRingBuffer;
/**
* A {@link QueryCachingPolicy} that tracks usage statistics of recently-used
* filters in order to decide on which filters are worth caching.
*
* @lucene.experimental
*/
public class UsageTrackingQueryCachingPolicy implements QueryCachingPolicy {
// the hash code that we use as a sentinel in the ring buffer.
private static final int SENTINEL = Integer.MIN_VALUE;
private static boolean isPointQuery(Query query) {
// we need to check for super classes because we occasionally use anonymous
// sub classes of eg. PointRangeQuery
for (Class<?> clazz = query.getClass(); clazz != Query.class; clazz = clazz.getSuperclass()) {
final String simpleName = clazz.getSimpleName();
if (simpleName.startsWith("Point") && simpleName.endsWith("Query")) {
return true;
}
}
return false;
}
static boolean isCostly(Query query) {
// This does not measure the cost of iterating over the filter (for this we
// already have the DocIdSetIterator#cost API) but the cost to build the
// DocIdSet in the first place
return query instanceof MultiTermQuery ||
query instanceof MultiTermQueryConstantScoreWrapper ||
query instanceof TermInSetQuery ||
isPointQuery(query);
}
private static boolean shouldNeverCache(Query query) {
if (query instanceof TermQuery) {
// We do not bother caching term queries since they are already plenty fast.
return true;
}
if (query instanceof DocValuesFieldExistsQuery) {
// We do not bother caching DocValuesFieldExistsQuery queries since they are already plenty fast.
return true;
}
if (query instanceof MatchAllDocsQuery) {
// MatchAllDocsQuery has an iterator that is faster than what a bit set could do.
return true;
}
// For the below queries, it's cheap to notice they cannot match any docs so
// we do not bother caching them.
if (query instanceof MatchNoDocsQuery) {
return true;
}
if (query instanceof BooleanQuery) {
BooleanQuery bq = (BooleanQuery) query;
if (bq.clauses().isEmpty()) {
return true;
}
}
if (query instanceof DisjunctionMaxQuery) {
DisjunctionMaxQuery dmq = (DisjunctionMaxQuery) query;
if (dmq.getDisjuncts().isEmpty()) {
return true;
}
}
return false;
}
private final FrequencyTrackingRingBuffer recentlyUsedFilters;
/**
* Expert: Create a new instance with a configurable history size. Beware of
* passing too large values for the size of the history, either
* {@link #minFrequencyToCache} returns low values and this means some filters
* that are rarely used will be cached, which would hurt performance. Or
* {@link #minFrequencyToCache} returns high values that are function of the
* size of the history but then filters will be slow to make it to the cache.
*
* @param historySize the number of recently used filters to track
*/
public UsageTrackingQueryCachingPolicy(int historySize) {
this.recentlyUsedFilters = new FrequencyTrackingRingBuffer(historySize, SENTINEL);
}
/** Create a new instance with an history size of 256. This should be a good
* default for most cases. */
public UsageTrackingQueryCachingPolicy() {
this(256);
}
/**
* For a given filter, return how many times it should appear in the history
* before being cached. The default implementation returns 2 for filters that
* need to evaluate against the entire index to build a {@link DocIdSetIterator},
* like {@link MultiTermQuery}, point-based queries or {@link TermInSetQuery},
* and 5 for other filters.
*/
protected int minFrequencyToCache(Query query) {
if (isCostly(query)) {
return 2;
} else {
// default: cache after the filter has been seen 5 times
int minFrequency = 5;
if (query instanceof BooleanQuery
|| query instanceof DisjunctionMaxQuery) {
// Say you keep reusing a boolean query that looks like "A OR B" and
// never use the A and B queries out of that context. 5 times after it
// has been used, we would cache both A, B and A OR B, which is
// wasteful. So instead we cache compound queries a bit earlier so that
// we would only cache "A OR B" in that case.
minFrequency--;
}
return minFrequency;
}
}
@Override
public void onUse(Query query) {
assert query instanceof BoostQuery == false;
assert query instanceof ConstantScoreQuery == false;
if (shouldNeverCache(query)) {
return;
}
// call hashCode outside of sync block
// in case it's somewhat expensive:
int hashCode = query.hashCode();
// we only track hash codes to avoid holding references to possible
// large queries; this may cause rare false positives, but at worse
// this just means we cache a query that was not in fact used enough:
synchronized (this) {
recentlyUsedFilters.add(hashCode);
}
}
int frequency(Query query) {
assert query instanceof BoostQuery == false;
assert query instanceof ConstantScoreQuery == false;
// call hashCode outside of sync block
// in case it's somewhat expensive:
int hashCode = query.hashCode();
synchronized (this) {
return recentlyUsedFilters.frequency(hashCode);
}
}
@Override
public boolean shouldCache(Query query) throws IOException {
if (shouldNeverCache(query)) {
return false;
}
final int frequency = frequency(query);
final int minFrequency = minFrequencyToCache(query);
return frequency >= minFrequency;
}
}