lucene/backwards/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java - lucene-solr - Git at Google

 package org.apache.lucene.search;

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;

 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.util.OpenBitSet;
 import org.apache.lucene.index.TermDocs;  // for javadocs

 /**
  * A {@link Filter} that only accepts documents whose single
  * term value in the specified field is contained in the
  * provided set of allowed terms.
  *
  * <p/>
  *
  * This is the same functionality as TermsFilter (from
  * contrib/queries), except this filter requires that the
  * field contains only a single term in each document.
  * Because of drastically different implementations, they
  * also have different performance characteristics, as
  * described below.
  *
  * <p/>
  *
  * The first invocation of this filter on a given field will
  * be slower, since a {@link FieldCache.StringIndex} must be
  * created.  Subsequent invocations using the same field
  * will re-use this cache.  However, as with all
  * functionality based on {@link FieldCache}, persistent RAM
  * is consumed to hold the cache, and is not freed until the
  * {@link IndexReader} is closed.  In contrast, TermsFilter
  * has no persistent RAM consumption.
  *
  *
  * <p/>
  *
  * With each search, this filter translates the specified
  * set of Terms into a private {@link OpenBitSet} keyed by
  * term number per unique {@link IndexReader} (normally one
  * reader per segment).  Then, during matching, the term
  * number for each docID is retrieved from the cache and
  * then checked for inclusion using the {@link OpenBitSet}.
  * Since all testing is done using RAM resident data
  * structures, performance should be very fast, most likely
  * fast enough to not require further caching of the
  * DocIdSet for each possible combination of terms.
  * However, because docIDs are simply scanned linearly, an
  * index with a great many small documents may find this
  * linear scan too costly.
  *
  * <p/>
  *
  * In contrast, TermsFilter builds up an {@link OpenBitSet},
  * keyed by docID, every time it's created, by enumerating
  * through all matching docs using {@link TermDocs} to seek
  * and scan through each term's docID list.  While there is
  * no linear scan of all docIDs, besides the allocation of
  * the underlying array in the {@link OpenBitSet}, this
  * approach requires a number of "disk seeks" in proportion
  * to the number of terms, which can be exceptionally costly
  * when there are cache misses in the OS's IO cache.
  *
  * <p/>
  *
  * Generally, this filter will be slower on the first
  * invocation for a given field, but subsequent invocations,
  * even if you change the allowed set of Terms, should be
  * faster than TermsFilter, especially as the number of
  * Terms being matched increases.  If you are matching only
  * a very small number of terms, and those terms in turn
  * match a very small number of documents, TermsFilter may
  * perform faster.
  *
  * <p/>
  *
  * Which filter is best is very application dependent.
  */

 public class FieldCacheTermsFilter extends Filter {
   /** Name of the field whose per-document term is tested. */
   private final String field;
   /** Terms a document's field value is allowed to have. */
   private final String[] terms;

   /**
    * Creates a filter on {@code field} accepting the given {@code terms}.
    *
    * @param field name of the field to look up in the field cache
    * @param terms allowed term values; the array is stored as given (not copied)
    */
   public FieldCacheTermsFilter(String field, String... terms) {
     this.field = field;
     this.terms = terms;
   }

   /**
    * Returns the {@link FieldCache} used to obtain the string index.
    *
    * @return {@link FieldCache#DEFAULT}
    */
   public FieldCache getFieldCache() {
     return FieldCache.DEFAULT;
   }

   /**
    * Builds the set of matching documents for {@code reader} from the
    * string index of the configured field.
    *
    * @param reader reader whose string index is fetched from {@link #getFieldCache()}
    * @return a {@link DocIdSet} over the documents whose term is one of the allowed terms
    * @throws IOException if fetching the string index fails
    */
   @Override
   public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
     return new FieldCacheTermsFilterDocIdSet(getFieldCache().getStringIndex(reader, field));
   }

   /**
    * {@link DocIdSet} that marks the allowed term numbers in a bit set and
    * tests each document's term number against it while iterating.
    */
   protected class FieldCacheTermsFilterDocIdSet extends DocIdSet {
     private final FieldCache.StringIndex fcsi;

     /** One bit per entry of {@code fcsi.lookup}; set bits are accepted term numbers. */
     private final OpenBitSet openBitSet;

     /**
      * Translates the enclosing filter's terms into term numbers of
      * {@code fcsi} and records them in the bit set.
      *
      * @param fcsi string index supplying {@code lookup} and {@code order}
      */
     public FieldCacheTermsFilterDocIdSet(FieldCache.StringIndex fcsi) {
       this.fcsi = fcsi;
       openBitSet = new OpenBitSet(this.fcsi.lookup.length);
       for (String term : terms) {
         int termNumber = this.fcsi.binarySearchLookup(term);
         // Non-positive results are skipped.
         // NOTE(review): presumably negative means "term not present" and 0 is
         // the slot reserved for documents without a term -- confirm against
         // FieldCache.StringIndex.
         if (termNumber > 0) {
           openBitSet.fastSet(termNumber);
         }
       }
     }

     @Override
     public DocIdSetIterator iterator() {
       return new FieldCacheTermsFilterDocIdSetIterator();
     }

     /** This DocIdSet implementation is cacheable. */
     @Override
     public boolean isCacheable() {
       return true;
     }

     /**
      * Iterator that scans {@code fcsi.order} linearly and stops at documents
      * whose term number is set in the bit set.
      */
     protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator {
       private int doc = -1;

       @Override
       public int docID() {
         return doc;
       }

       /**
        * Moves to the next matching document after the current one.
        *
        * @return the matching docID, or {@code NO_MORE_DOCS} when none is left
        */
       @Override
       public int nextDoc() {
         // Stay exhausted once exhausted; this also keeps doc + 1 from overflowing.
         if (doc == NO_MORE_DOCS) {
           return doc;
         }
         return doc = firstMatchFrom(doc + 1);
       }

       /**
        * Moves to the first matching document at or after {@code target}.
        *
        * @param target docID to start scanning from
        * @return the matching docID, or {@code NO_MORE_DOCS} when none is left
        */
       @Override
       public int advance(int target) {
         return doc = firstMatchFrom(target);
       }

       /**
        * Scans forward from {@code start} with explicit bounds checks instead
        * of relying on a caught ArrayIndexOutOfBoundsException, so a genuine
        * out-of-range access is no longer silently treated as end of iteration.
        * A start outside {@code [0, order.length)} yields {@code NO_MORE_DOCS}.
        */
       private int firstMatchFrom(int start) {
         final int[] order = fcsi.order;
         final int maxDoc = order.length;
         if (start >= 0) {
           for (int candidate = start; candidate < maxDoc; candidate++) {
             if (openBitSet.fastGet(order[candidate])) {
               return candidate;
             }
           }
         }
         return NO_MORE_DOCS;
       }
     }
   }
 }
	package org.apache.lucene.search;

	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.IOException;

	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.util.OpenBitSet;
	import org.apache.lucene.index.TermDocs; // for javadocs

	/**
	* A {@link Filter} that only accepts documents whose single
	* term value in the specified field is contained in the
	* provided set of allowed terms.
	*
	* <p/>
	*
	* This is the same functionality as TermsFilter (from
	* contrib/queries), except this filter requires that the
	* field contains only a single term in each document.
	* Because of drastically different implementations, they
	* also have different performance characteristics, as
	* described below.
	*
	* <p/>
	*
	* The first invocation of this filter on a given field will
	* be slower, since a {@link FieldCache.StringIndex} must be
	* created. Subsequent invocations using the same field
	* will re-use this cache. However, as with all
	* functionality based on {@link FieldCache}, persistent RAM
	* is consumed to hold the cache, and is not freed until the
	* {@link IndexReader} is closed. In contrast, TermsFilter
	* has no persistent RAM consumption.
	*
	*
	* <p/>
	*
	* With each search, this filter translates the specified
	* set of Terms into a private {@link OpenBitSet} keyed by
	* term number per unique {@link IndexReader} (normally one
	* reader per segment). Then, during matching, the term
	* number for each docID is retrieved from the cache and
	* then checked for inclusion using the {@link OpenBitSet}.
	* Since all testing is done using RAM resident data
	* structures, performance should be very fast, most likely
	* fast enough to not require further caching of the
	* DocIdSet for each possible combination of terms.
	* However, because docIDs are simply scanned linearly, an
	* index with a great many small documents may find this
	* linear scan too costly.
	*
	* <p/>
	*
	* In contrast, TermsFilter builds up an {@link OpenBitSet},
	* keyed by docID, every time it's created, by enumerating
	* through all matching docs using {@link TermDocs} to seek
	* and scan through each term's docID list. While there is
	* no linear scan of all docIDs, besides the allocation of
	* the underlying array in the {@link OpenBitSet}, this
	* approach requires a number of "disk seeks" in proportion
	* to the number of terms, which can be exceptionally costly
	* when there are cache misses in the OS's IO cache.
	*
	* <p/>
	*
	* Generally, this filter will be slower on the first
	* invocation for a given field, but subsequent invocations,
	* even if you change the allowed set of Terms, should be
	* faster than TermsFilter, especially as the number of
	* Terms being matched increases. If you are matching only
	* a very small number of terms, and those terms in turn
	* match a very small number of documents, TermsFilter may
	* perform faster.
	*
	* <p/>
	*
	* Which filter is best is very application dependent.
	*/

	public class FieldCacheTermsFilter extends Filter {
	  /** Name of the field whose per-document term is tested. */
	  private final String field;
	  /** Terms a document's field value is allowed to have. */
	  private final String[] terms;

	  /**
	   * Creates a filter on {@code field} accepting the given {@code terms}.
	   *
	   * @param field name of the field to look up in the field cache
	   * @param terms allowed term values; the array is stored as given (not copied)
	   */
	  public FieldCacheTermsFilter(String field, String... terms) {
	    this.field = field;
	    this.terms = terms;
	  }

	  /**
	   * Returns the {@link FieldCache} used to obtain the string index.
	   *
	   * @return {@link FieldCache#DEFAULT}
	   */
	  public FieldCache getFieldCache() {
	    return FieldCache.DEFAULT;
	  }

	  /**
	   * Builds the set of matching documents for {@code reader} from the
	   * string index of the configured field.
	   *
	   * @param reader reader whose string index is fetched from {@link #getFieldCache()}
	   * @return a {@link DocIdSet} over the documents whose term is one of the allowed terms
	   * @throws IOException if fetching the string index fails
	   */
	  @Override
	  public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
	    return new FieldCacheTermsFilterDocIdSet(getFieldCache().getStringIndex(reader, field));
	  }

	  /**
	   * {@link DocIdSet} that marks the allowed term numbers in a bit set and
	   * tests each document's term number against it while iterating.
	   */
	  protected class FieldCacheTermsFilterDocIdSet extends DocIdSet {
	    private final FieldCache.StringIndex fcsi;

	    /** One bit per entry of {@code fcsi.lookup}; set bits are accepted term numbers. */
	    private final OpenBitSet openBitSet;

	    /**
	     * Translates the enclosing filter's terms into term numbers of
	     * {@code fcsi} and records them in the bit set.
	     *
	     * @param fcsi string index supplying {@code lookup} and {@code order}
	     */
	    public FieldCacheTermsFilterDocIdSet(FieldCache.StringIndex fcsi) {
	      this.fcsi = fcsi;
	      openBitSet = new OpenBitSet(this.fcsi.lookup.length);
	      for (String term : terms) {
	        int termNumber = this.fcsi.binarySearchLookup(term);
	        // Non-positive results are skipped.
	        // NOTE(review): presumably negative means "term not present" and 0 is
	        // the slot reserved for documents without a term -- confirm against
	        // FieldCache.StringIndex.
	        if (termNumber > 0) {
	          openBitSet.fastSet(termNumber);
	        }
	      }
	    }

	    @Override
	    public DocIdSetIterator iterator() {
	      return new FieldCacheTermsFilterDocIdSetIterator();
	    }

	    /** This DocIdSet implementation is cacheable. */
	    @Override
	    public boolean isCacheable() {
	      return true;
	    }

	    /**
	     * Iterator that scans {@code fcsi.order} linearly and stops at documents
	     * whose term number is set in the bit set.
	     */
	    protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator {
	      private int doc = -1;

	      @Override
	      public int docID() {
	        return doc;
	      }

	      /**
	       * Moves to the next matching document after the current one.
	       *
	       * @return the matching docID, or {@code NO_MORE_DOCS} when none is left
	       */
	      @Override
	      public int nextDoc() {
	        // Stay exhausted once exhausted; this also keeps doc + 1 from overflowing.
	        if (doc == NO_MORE_DOCS) {
	          return doc;
	        }
	        return doc = firstMatchFrom(doc + 1);
	      }

	      /**
	       * Moves to the first matching document at or after {@code target}.
	       *
	       * @param target docID to start scanning from
	       * @return the matching docID, or {@code NO_MORE_DOCS} when none is left
	       */
	      @Override
	      public int advance(int target) {
	        return doc = firstMatchFrom(target);
	      }

	      /**
	       * Scans forward from {@code start} with explicit bounds checks instead
	       * of relying on a caught ArrayIndexOutOfBoundsException, so a genuine
	       * out-of-range access is no longer silently treated as end of iteration.
	       * A start outside {@code [0, order.length)} yields {@code NO_MORE_DOCS}.
	       */
	      private int firstMatchFrom(int start) {
	        final int[] order = fcsi.order;
	        final int maxDoc = order.length;
	        if (start >= 0) {
	          for (int candidate = start; candidate < maxDoc; candidate++) {
	            if (openBitSet.fastGet(order[candidate])) {
	              return candidate;
	            }
	          }
	        }
	        return NO_MORE_DOCS;
	      }
	    }
	  }
	}