// src/Lucene.Net/Search/FieldCacheTermsFilter.cs - lucenenet - Git at Google

 namespace Lucene.Net.Search
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
     using IBits = Lucene.Net.Util.IBits;
     using BytesRef = Lucene.Net.Util.BytesRef;
     using FixedBitSet = Lucene.Net.Util.FixedBitSet;
     using SortedDocValues = Lucene.Net.Index.SortedDocValues;

     /// <summary>
     /// A <see cref="Filter"/> that only accepts documents whose single
     /// term value in the specified field is contained in the
     /// provided set of allowed terms.
     ///
     /// <para/>
     ///
     /// This is the same functionality as TermsFilter (from
     /// queries/), except this filter requires that the
     /// field contains only a single term for all documents.
     /// Because of drastically different implementations, they
     /// also have different performance characteristics, as
     /// described below.
     ///
     /// <para/>
     ///
     /// The first invocation of this filter on a given field will
     /// be slower, since a <see cref="SortedDocValues"/> must be
     /// created.  Subsequent invocations using the same field
     /// will re-use this cache.  However, as with all
     /// functionality based on <see cref="IFieldCache"/>, persistent RAM
     /// is consumed to hold the cache, and is not freed until the
     /// <see cref="Index.IndexReader"/> is disposed.  In contrast, TermsFilter
     /// has no persistent RAM consumption.
     ///
     ///
     /// <para/>
     ///
     /// With each search, this filter translates the specified
     /// set of <see cref="Index.Terms"/> into a private <see cref="FixedBitSet"/> keyed by
     /// term number per unique <see cref="Index.IndexReader"/> (normally one
     /// reader per segment).  Then, during matching, the term
     /// number for each docID is retrieved from the cache and
     /// then checked for inclusion using the <see cref="FixedBitSet"/>.
     /// Since all testing is done using RAM resident data
     /// structures, performance should be very fast, most likely
     /// fast enough to not require further caching of the
     /// <see cref="DocIdSet"/> for each possible combination of terms.
     /// However, because docIDs are simply scanned linearly, an
     /// index with a great many small documents may find this
     /// linear scan too costly.
     ///
     /// <para/>
     ///
     /// In contrast, TermsFilter builds up a <see cref="FixedBitSet"/>,
     /// keyed by docID, every time it's created, by enumerating
     /// through all matching docs using <see cref="Index.DocsEnum"/> to seek
     /// and scan through each term's docID list.  While there is
     /// no linear scan of all docIDs, besides the allocation of
     /// the underlying array in the <see cref="FixedBitSet"/>, this
     /// approach requires a number of "disk seeks" in proportion
     /// to the number of terms, which can be exceptionally costly
     /// when there are cache misses in the OS's IO cache.
     ///
     /// <para/>
     ///
     /// Generally, this filter will be slower on the first
     /// invocation for a given field, but subsequent invocations,
     /// even if you change the allowed set of <see cref="Index.Terms"/>, should be
     /// faster than TermsFilter, especially as the number of
     /// <see cref="Index.Terms"/> being matched increases.  If you are matching only
     /// a very small number of terms, and those terms in turn
     /// match a very small number of documents, TermsFilter may
     /// perform faster.
     ///
     /// <para/>
     ///
     /// Which filter is best is very application dependent.
     /// </summary>
     public class FieldCacheTermsFilter : Filter
     {
         private readonly string field;
         private readonly BytesRef[] terms;

         /// <summary>
         /// Creates a filter that accepts documents whose value in
         /// <paramref name="field"/> is one of the given <paramref name="terms"/>.
         /// </summary>
         /// <param name="field">Name of the field whose term index is consulted.</param>
         /// <param name="terms">The allowed terms. The array is stored as passed in; it is not copied.</param>
         /// <exception cref="System.ArgumentNullException"><paramref name="terms"/> is <c>null</c>.</exception>
         public FieldCacheTermsFilter(string field, params BytesRef[] terms)
         {
             // Fail fast: GetDocIdSet reads terms.Length, so a null array would
             // otherwise only surface at search time as a NullReferenceException.
             // The exception type is fully qualified because no "using System;"
             // directive is visible in this file.
             if (terms == null)
             {
                 throw new global::System.ArgumentNullException(nameof(terms));
             }

             this.field = field;
             this.terms = terms;
         }

         /// <summary>
         /// Creates a filter that accepts documents whose value in
         /// <paramref name="field"/> is one of the given <paramref name="terms"/>.
         /// Each string is wrapped in a new <see cref="BytesRef"/>.
         /// </summary>
         /// <param name="field">Name of the field whose term index is consulted.</param>
         /// <param name="terms">The allowed terms, as strings.</param>
         /// <exception cref="System.ArgumentNullException"><paramref name="terms"/> is <c>null</c>.</exception>
         public FieldCacheTermsFilter(string field, params string[] terms)
         {
             // Report a null array as an argument error instead of letting
             // terms.Length below raise a bare NullReferenceException.
             if (terms == null)
             {
                 throw new global::System.ArgumentNullException(nameof(terms));
             }

             this.field = field;

             BytesRef[] converted = new BytesRef[terms.Length];
             for (int i = 0; i < converted.Length; i++)
             {
                 converted[i] = new BytesRef(terms[i]);
             }
             this.terms = converted;
         }

         /// <summary>
         /// The <see cref="IFieldCache"/> used to obtain the term index.
         /// Returns <c>Search.FieldCache.DEFAULT</c>; declared virtual so a
         /// subclass can supply a different cache.
         /// </summary>
         public virtual IFieldCache FieldCache => Search.FieldCache.DEFAULT;

         /// <summary>
         /// Builds the <see cref="DocIdSet"/> of documents in the given reader
         /// context whose term ordinal for the filtered field belongs to one of
         /// the allowed terms.
         /// </summary>
         /// <param name="context">Reader context to filter; its atomic reader supplies the term index and its reader supplies <c>MaxDoc</c>.</param>
         /// <param name="acceptDocs">Passed through unchanged to the returned <see cref="FieldCacheDocIdSet"/>.</param>
         /// <returns>A <see cref="FieldCacheDocIdSet"/> that tests each document against the allowed ordinals.</returns>
         public override DocIdSet GetDocIdSet(AtomicReaderContext context, IBits acceptDocs)
         {
             SortedDocValues termsIndex = FieldCache.GetTermsIndex(context.AtomicReader, field);

             // One bit per term ordinal in this reader; a set bit means the
             // ordinal belongs to one of the allowed terms.
             FixedBitSet allowedOrds = new FixedBitSet(termsIndex.ValueCount);
             foreach (BytesRef term in terms)
             {
                 int ord = termsIndex.LookupTerm(term);

                 // A negative result is treated as "term not present in this reader".
                 if (ord >= 0)
                 {
                     allowedOrds.Set(ord);
                 }
             }

             return new FieldCacheDocIdSet(context.Reader.MaxDoc, acceptDocs, (doc) =>
             {
                 int docOrd = termsIndex.GetOrd(doc);

                 // -1 marks a document with no value for the field; it never matches.
                 return docOrd != -1 && allowedOrds.Get(docOrd);
             });
         }
     }
 }
	namespace Lucene.Net.Search
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
	using IBits = Lucene.Net.Util.IBits;
	using BytesRef = Lucene.Net.Util.BytesRef;
	using FixedBitSet = Lucene.Net.Util.FixedBitSet;
	using SortedDocValues = Lucene.Net.Index.SortedDocValues;

	/// <summary>
	/// A <see cref="Filter"/> that only accepts documents whose single
	/// term value in the specified field is contained in the
	/// provided set of allowed terms.
	///
	/// <para/>
	///
	/// This is the same functionality as TermsFilter (from
	/// queries/), except this filter requires that the
	/// field contains only a single term for all documents.
	/// Because of drastically different implementations, they
	/// also have different performance characteristics, as
	/// described below.
	///
	/// <para/>
	///
	/// The first invocation of this filter on a given field will
	/// be slower, since a <see cref="SortedDocValues"/> must be
	/// created. Subsequent invocations using the same field
	/// will re-use this cache. However, as with all
	/// functionality based on <see cref="IFieldCache"/>, persistent RAM
	/// is consumed to hold the cache, and is not freed until the
	/// <see cref="Index.IndexReader"/> is disposed. In contrast, TermsFilter
	/// has no persistent RAM consumption.
	///
	///
	/// <para/>
	///
	/// With each search, this filter translates the specified
	/// set of <see cref="Index.Terms"/> into a private <see cref="FixedBitSet"/> keyed by
	/// term number per unique <see cref="Index.IndexReader"/> (normally one
	/// reader per segment). Then, during matching, the term
	/// number for each docID is retrieved from the cache and
	/// then checked for inclusion using the <see cref="FixedBitSet"/>.
	/// Since all testing is done using RAM resident data
	/// structures, performance should be very fast, most likely
	/// fast enough to not require further caching of the
	/// <see cref="DocIdSet"/> for each possible combination of terms.
	/// However, because docIDs are simply scanned linearly, an
	/// index with a great many small documents may find this
	/// linear scan too costly.
	///
	/// <para/>
	///
	/// In contrast, TermsFilter builds up a <see cref="FixedBitSet"/>,
	/// keyed by docID, every time it's created, by enumerating
	/// through all matching docs using <see cref="Index.DocsEnum"/> to seek
	/// and scan through each term's docID list. While there is
	/// no linear scan of all docIDs, besides the allocation of
	/// the underlying array in the <see cref="FixedBitSet"/>, this
	/// approach requires a number of "disk seeks" in proportion
	/// to the number of terms, which can be exceptionally costly
	/// when there are cache misses in the OS's IO cache.
	///
	/// <para/>
	///
	/// Generally, this filter will be slower on the first
	/// invocation for a given field, but subsequent invocations,
	/// even if you change the allowed set of <see cref="Index.Terms"/>, should be
	/// faster than TermsFilter, especially as the number of
	/// <see cref="Index.Terms"/> being matched increases. If you are matching only
	/// a very small number of terms, and those terms in turn
	/// match a very small number of documents, TermsFilter may
	/// perform faster.
	///
	/// <para/>
	///
	/// Which filter is best is very application dependent.
	/// </summary>
	public class FieldCacheTermsFilter : Filter
	{
		private readonly string field;
		private readonly BytesRef[] terms;

		/// <summary>
		/// Creates a filter that accepts documents whose value in
		/// <paramref name="field"/> is one of the given <paramref name="terms"/>.
		/// </summary>
		/// <param name="field">Name of the field whose term index is consulted.</param>
		/// <param name="terms">The allowed terms. The array is stored as passed in; it is not copied.</param>
		/// <exception cref="System.ArgumentNullException"><paramref name="terms"/> is <c>null</c>.</exception>
		public FieldCacheTermsFilter(string field, params BytesRef[] terms)
		{
			// Fail fast: GetDocIdSet reads terms.Length, so a null array would
			// otherwise only surface at search time as a NullReferenceException.
			// The exception type is fully qualified because no "using System;"
			// directive is visible in this file.
			if (terms == null)
			{
				throw new global::System.ArgumentNullException(nameof(terms));
			}

			this.field = field;
			this.terms = terms;
		}

		/// <summary>
		/// Creates a filter that accepts documents whose value in
		/// <paramref name="field"/> is one of the given <paramref name="terms"/>.
		/// Each string is wrapped in a new <see cref="BytesRef"/>.
		/// </summary>
		/// <param name="field">Name of the field whose term index is consulted.</param>
		/// <param name="terms">The allowed terms, as strings.</param>
		/// <exception cref="System.ArgumentNullException"><paramref name="terms"/> is <c>null</c>.</exception>
		public FieldCacheTermsFilter(string field, params string[] terms)
		{
			// Report a null array as an argument error instead of letting
			// terms.Length below raise a bare NullReferenceException.
			if (terms == null)
			{
				throw new global::System.ArgumentNullException(nameof(terms));
			}

			this.field = field;

			BytesRef[] converted = new BytesRef[terms.Length];
			for (int i = 0; i < converted.Length; i++)
			{
				converted[i] = new BytesRef(terms[i]);
			}
			this.terms = converted;
		}

		/// <summary>
		/// The <see cref="IFieldCache"/> used to obtain the term index.
		/// Returns <c>Search.FieldCache.DEFAULT</c>; declared virtual so a
		/// subclass can supply a different cache.
		/// </summary>
		public virtual IFieldCache FieldCache => Search.FieldCache.DEFAULT;

		/// <summary>
		/// Builds the <see cref="DocIdSet"/> of documents in the given reader
		/// context whose term ordinal for the filtered field belongs to one of
		/// the allowed terms.
		/// </summary>
		/// <param name="context">Reader context to filter; its atomic reader supplies the term index and its reader supplies <c>MaxDoc</c>.</param>
		/// <param name="acceptDocs">Passed through unchanged to the returned <see cref="FieldCacheDocIdSet"/>.</param>
		/// <returns>A <see cref="FieldCacheDocIdSet"/> that tests each document against the allowed ordinals.</returns>
		public override DocIdSet GetDocIdSet(AtomicReaderContext context, IBits acceptDocs)
		{
			SortedDocValues termsIndex = FieldCache.GetTermsIndex(context.AtomicReader, field);

			// One bit per term ordinal in this reader; a set bit means the
			// ordinal belongs to one of the allowed terms.
			FixedBitSet allowedOrds = new FixedBitSet(termsIndex.ValueCount);
			foreach (BytesRef term in terms)
			{
				int ord = termsIndex.LookupTerm(term);

				// A negative result is treated as "term not present in this reader".
				if (ord >= 0)
				{
					allowedOrds.Set(ord);
				}
			}

			return new FieldCacheDocIdSet(context.Reader.MaxDoc, acceptDocs, (doc) =>
			{
				int docOrd = termsIndex.GetOrd(doc);

				// -1 marks a document with no value for the field; it never matches.
				return docOrd != -1 && allowedOrds.Get(docOrd);
			});
		}
	}
	}