// blob: 07e578eacb1f691afb763aa3f9cb1bacd17ab860
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.index.TermDocs; // for javadocs
/**
* A {@link Filter} that only accepts documents whose single
* term value in the specified field is contained in the
* provided set of allowed terms.
*
* <p/>
*
* This is the same functionality as TermsFilter (from
* contrib/queries), except this filter requires that the
* field contains only a single term for all documents.
* Because of drastically different implementations, they
* also have different performance characteristics, as
* described below.
*
* <p/>
*
* The first invocation of this filter on a given field will
* be slower, since a {@link FieldCache.StringIndex} must be
* created. Subsequent invocations using the same field
* will re-use this cache. However, as with all
* functionality based on {@link FieldCache}, persistent RAM
* is consumed to hold the cache, and is not freed until the
* {@link IndexReader} is closed. In contrast, TermsFilter
* has no persistent RAM consumption.
*
*
* <p/>
*
* With each search, this filter translates the specified
* set of Terms into a private {@link OpenBitSet} keyed by
* term number per unique {@link IndexReader} (normally one
* reader per segment). Then, during matching, the term
* number for each docID is retrieved from the cache and
* then checked for inclusion using the {@link OpenBitSet}.
* Since all testing is done using RAM resident data
* structures, performance should be very fast, most likely
* fast enough to not require further caching of the
* DocIdSet for each possible combination of terms.
* However, because docIDs are simply scanned linearly, an
* index with a great many small documents may find this
* linear scan too costly.
*
* <p/>
*
* In contrast, TermsFilter builds up an {@link OpenBitSet},
* keyed by docID, every time it's created, by enumerating
* through all matching docs using {@link TermDocs} to seek
* and scan through each term's docID list. While there is
* no linear scan of all docIDs, besides the allocation of
* the underlying array in the {@link OpenBitSet}, this
* approach requires a number of "disk seeks" in proportion
* to the number of terms, which can be exceptionally costly
* when there are cache misses in the OS's IO cache.
*
* <p/>
*
* Generally, this filter will be slower on the first
* invocation for a given field, but subsequent invocations,
* even if you change the allowed set of Terms, should be
* faster than TermsFilter, especially as the number of
* Terms being matched increases. If you are matching only
* a very small number of terms, and those terms in turn
* match a very small number of documents, TermsFilter may
* perform faster.
*
* <p/>
*
* Which filter is best is very application dependent.
*/
public class FieldCacheTermsFilter extends Filter {
  /** Name of the field whose cached per-document term numbers are tested. */
  private final String field;
  /** Allowed term values; a document is accepted when its term is one of these. */
  private final String[] terms;

  /**
   * Creates a filter over {@code field} that accepts the given term values.
   *
   * @param field name of the field passed to {@link FieldCache#getStringIndex}
   * @param terms allowed term values; the array reference is stored as-is (not copied)
   */
  public FieldCacheTermsFilter(String field, String... terms) {
    this.field = field;
    this.terms = terms;
  }

  /**
   * Returns the {@link FieldCache} used to obtain the string index.
   * This implementation returns {@link FieldCache#DEFAULT}.
   */
  public FieldCache getFieldCache() {
    return FieldCache.DEFAULT;
  }

  /**
   * Builds the set of accepted documents for {@code reader} from the string
   * index that {@link #getFieldCache()} supplies for this filter's field.
   *
   * @param reader reader whose documents are to be filtered
   * @return a {@link DocIdSet} backed by the cached string index
   * @throws IOException if obtaining the string index fails
   */
  @Override
  public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    return new FieldCacheTermsFilterDocIdSet(getFieldCache().getStringIndex(reader, field));
  }

  /**
   * {@link DocIdSet} that marks the term numbers of the allowed terms in an
   * {@link OpenBitSet} and tests each document's term number against it.
   */
  protected class FieldCacheTermsFilterDocIdSet extends DocIdSet {
    /** Cached string index: {@code order[doc]} yields the term number of a document. */
    private final FieldCache.StringIndex fcsi;
    /** One bit per term number; a set bit means the term is allowed. */
    private final OpenBitSet openBitSet;

    /**
     * Translates the enclosing filter's terms into term numbers of {@code fcsi}
     * and records them in the bit set.
     *
     * @param fcsi string index for the filtered field
     */
    public FieldCacheTermsFilterDocIdSet(FieldCache.StringIndex fcsi) {
      this.fcsi = fcsi;
      openBitSet = new OpenBitSet(this.fcsi.lookup.length);
      for (String term : terms) {
        int termNumber = this.fcsi.binarySearchLookup(term);
        // Only strictly positive results are recorded. Presumably non-positive
        // values mean "term absent" (negative) or the reserved "no term" slot 0;
        // NOTE(review): confirm against FieldCache.StringIndex.
        if (termNumber > 0) {
          openBitSet.fastSet(termNumber);
        }
      }
    }

    /** Returns a fresh iterator positioned before the first document. */
    @Override
    public DocIdSetIterator iterator() {
      return new FieldCacheTermsFilterDocIdSetIterator();
    }

    /** This DocIdSet implementation is cacheable. */
    @Override
    public boolean isCacheable() {
      return true;
    }

    /**
     * Iterator that scans {@code fcsi.order} linearly and stops on documents
     * whose term number is set in the bit set.
     */
    protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator {
      /** Current document, {@code -1} before iteration starts. */
      private int doc = -1;

      /** Returns the current document, or {@code NO_MORE_DOCS} once exhausted. */
      @Override
      public int docID() {
        return doc;
      }

      /**
       * Moves to the next accepted document after the current one.
       *
       * @return the new current document, or {@code NO_MORE_DOCS} if none is left
       */
      @Override
      public int nextDoc() {
        // Stay exhausted once exhausted; otherwise doc + 1 would overflow.
        doc = (doc == NO_MORE_DOCS) ? NO_MORE_DOCS : firstMatchFrom(doc + 1);
        return doc;
      }

      /**
       * Moves to the first accepted document at or after {@code target}.
       *
       * @param target document to start scanning from (inclusive)
       * @return the new current document, or {@code NO_MORE_DOCS} if none is left
       */
      @Override
      public int advance(int target) {
        doc = firstMatchFrom(target);
        return doc;
      }

      /**
       * Scans forward from {@code start} for a document whose term number is allowed.
       * Explicit range checks replace the former reliance on catching
       * {@link ArrayIndexOutOfBoundsException}, so a genuine indexing error is
       * no longer silently reported as "no more documents".
       *
       * @param start first candidate document (inclusive)
       * @return the first accepted document, or {@code NO_MORE_DOCS} when
       *         {@code start} is negative or no accepted document remains
       */
      private int firstMatchFrom(int start) {
        if (start < 0) {
          return NO_MORE_DOCS;
        }
        for (int candidate = start; candidate < fcsi.order.length; candidate++) {
          if (openBitSet.fastGet(fcsi.order[candidate])) {
            return candidate;
          }
        }
        return NO_MORE_DOCS;
      }
    }
  }
}