| using Lucene.Net.Index; |
| using Lucene.Net.Search; |
| using Lucene.Net.Util; |
| using System; |
| |
| namespace Lucene.Net.Sandbox.Queries |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
/// <summary>
/// Filter to remove duplicate values from search results.
/// <para/>
/// For every term of the configured field, only one of the documents containing
/// that term is kept (either the first or the last one enumerated, depending on
/// <see cref="KeepMode"/>).
/// <para/>
/// WARNING: for this to work correctly, you may have to wrap
/// your reader as it cannot currently deduplicate across different
/// index segments.
/// </summary>
/// <seealso cref="SlowCompositeReaderWrapper"/>
public class DuplicateFilter : Filter
{
    // TODO: make duplicate filter aware of ReaderContext such that we can
    // filter duplicates across segments

    // LUCENENET NOTE: KeepMode enum moved outside of this class to avoid naming collisions

    private KeepMode keepMode;

    // LUCENENET NOTE: ProcessingMode enum moved outside of this class to avoid naming collisions

    private ProcessingMode processingMode;

    private string fieldName;

    /// <summary>
    /// Creates a filter on <paramref name="fieldName"/> that keeps the last occurrence
    /// of each value and uses full validation.
    /// </summary>
    /// <param name="fieldName">Name of the field whose terms identify duplicates.</param>
    public DuplicateFilter(string fieldName)
        : this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION)
    {
    }

    /// <summary>
    /// Creates a filter on <paramref name="fieldName"/> with explicit keep and processing modes.
    /// </summary>
    /// <param name="fieldName">Name of the field whose terms identify duplicates.</param>
    /// <param name="keepMode">Which document to keep for each term.</param>
    /// <param name="processingMode">Strategy used to build the resulting bit set.</param>
    public DuplicateFilter(string fieldName, KeepMode keepMode, ProcessingMode processingMode)
    {
        this.fieldName = fieldName;
        this.keepMode = keepMode;
        this.processingMode = processingMode;
    }

    /// <summary>
    /// Builds the set of non-duplicate documents for the given segment, dispatching to
    /// the fast or the full strategy according to <see cref="ProcessingMode"/>.
    /// </summary>
    /// <param name="context">Segment context whose reader is scanned.</param>
    /// <param name="acceptDocs">Passed through to the per-term document enumeration.</param>
    /// <returns>A bit set with one bit per document of the segment reader.</returns>
    public override DocIdSet GetDocIdSet(AtomicReaderContext context, IBits acceptDocs)
    {
        if (processingMode == ProcessingMode.PM_FAST_INVALIDATION)
        {
            return FastBits(context.AtomicReader, acceptDocs);
        }

        return CorrectBits(context.AtomicReader, acceptDocs);
    }

    /// <summary>
    /// Full validation: starts from an empty bit set and sets exactly one bit per term
    /// (the first or last enumerated document, depending on the keep mode).
    /// </summary>
    private FixedBitSet CorrectBits(AtomicReader reader, IBits acceptDocs)
    {
        FixedBitSet bits = new FixedBitSet(reader.MaxDoc); //assume all are INvalid
        Terms terms = reader.Fields.GetTerms(fieldName);

        if (terms == null)
        {
            return bits;
        }

        TermsEnum termsEnum = terms.GetEnumerator();
        DocsEnum docs = null;
        while (termsEnum.MoveNext())
        {
            docs = termsEnum.Docs(acceptDocs, docs, DocsFlags.NONE);
            int doc = docs.NextDoc();
            if (doc == DocIdSetIterator.NO_MORE_DOCS)
            {
                // No document enumerated for this term: nothing to keep.
                continue;
            }

            if (keepMode != KeepMode.KM_USE_FIRST_OCCURRENCE)
            {
                // Advance to the last enumerated document for this term.
                int next;
                while ((next = docs.NextDoc()) != DocIdSetIterator.NO_MORE_DOCS)
                {
                    doc = next;
                }
            }

            bits.Set(doc);
        }
        return bits;
    }

    /// <summary>
    /// Fast invalidation: starts with every bit set and clears the duplicates of terms
    /// whose document frequency is greater than one. Documents that do not contain the
    /// field at all therefore stay set.
    /// </summary>
    private FixedBitSet FastBits(AtomicReader reader, IBits acceptDocs)
    {
        FixedBitSet bits = new FixedBitSet(reader.MaxDoc);
        bits.Set(0, reader.MaxDoc); //assume all are valid
        Terms terms = reader.Fields.GetTerms(fieldName);

        if (terms == null)
        {
            return bits;
        }

        TermsEnum termsEnum = terms.GetEnumerator();
        DocsEnum docs = null;
        while (termsEnum.MoveNext())
        {
            if (termsEnum.DocFreq <= 1)
            {
                // Terms seen in at most one document cannot have duplicates.
                continue;
            }

            // unset potential duplicates
            docs = termsEnum.Docs(acceptDocs, docs, DocsFlags.NONE);
            int doc = docs.NextDoc();
            if (doc == DocIdSetIterator.NO_MORE_DOCS)
            {
                // DocFreq was checked before the acceptDocs-filtered enumeration, so the
                // enumeration can still be empty. NO_MORE_DOCS must never reach the bit set.
                continue;
            }

            if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE)
            {
                // Skip (keep) the first document; everything after it is a duplicate.
                doc = docs.NextDoc();
                if (doc == DocIdSetIterator.NO_MORE_DOCS)
                {
                    // Only one document was enumerated: no duplicates to clear.
                    continue;
                }
            }

            int lastDoc;
            do
            {
                lastDoc = doc;
                bits.Clear(lastDoc);
                doc = docs.NextDoc();
            }
            while (doc != DocIdSetIterator.NO_MORE_DOCS);

            if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE)
            {
                // restore the last bit
                bits.Set(lastDoc);
            }
        }

        return bits;
    }

    /// <summary>
    /// Gets or sets the name of the field whose terms identify duplicates.
    /// </summary>
    public virtual string FieldName
    {
        get => fieldName;
        set => this.fieldName = value;
    }

    /// <summary>
    /// Gets or sets which document is kept for each term.
    /// </summary>
    public KeepMode KeepMode
    {
        get => keepMode;
        set => keepMode = value;
    }

    /// <summary>
    /// Two filters are equal when they have the same runtime type, keep mode,
    /// processing mode and (ordinal) field name. A filter whose field name is
    /// <c>null</c> is only equal to itself.
    /// </summary>
    public override bool Equals(object obj)
    {
        if (ReferenceEquals(this, obj))
        {
            return true;
        }
        if (obj is null || obj.GetType() != this.GetType())
        {
            return false;
        }

        DuplicateFilter other = (DuplicateFilter)obj;
        return keepMode == other.keepMode &&
            processingMode == other.processingMode &&
            fieldName != null && fieldName.Equals(other.fieldName, StringComparison.Ordinal);
    }

    /// <summary>
    /// Hash code combining keep mode, processing mode and field name.
    /// A <c>null</c> field name contributes 0 instead of throwing.
    /// </summary>
    public override int GetHashCode()
    {
        // Wrap-around is intended; make it explicit so a checked build cannot throw.
        unchecked
        {
            int hash = 217;
            hash = 31 * hash + keepMode.GetHashCode();
            hash = 31 * hash + processingMode.GetHashCode();
            hash = 31 * hash + (fieldName?.GetHashCode() ?? 0);
            return hash;
        }
    }

    /// <summary>
    /// Gets or sets the strategy used to build the resulting bit set.
    /// </summary>
    public ProcessingMode ProcessingMode
    {
        get => processingMode;
        set => processingMode = value;
    }
}
| |
/// <summary>
/// Selects which document is treated as the master copy for a given value; every
/// other document carrying the same value is regarded as a duplicate. Choosing the
/// "first occurrence" can potentially save on IO.
/// </summary>
public enum KeepMode
{
    /// <summary>
    /// Keep the first document enumerated for each value.
    /// </summary>
    KM_USE_FIRST_OCCURRENCE = 0,

    /// <summary>
    /// Keep the last document enumerated for each value.
    /// </summary>
    KM_USE_LAST_OCCURRENCE = 1
}
| |
/// <summary>
/// Strategy used to build the bit set of non-duplicate documents.
/// <para/>
/// "Full" validation begins with every bit cleared and only sets bits for documents
/// that contain the given field and are identified as non-duplicates.
/// <para/>
/// "Fast" invalidation begins with every bit set and then clears the duplicate
/// documents found for the given field. Terms whose document frequency is exactly "1"
/// (i.e. no duplicates) never need their DocsEnum read. Although potentially faster,
/// the resulting bit set also has bits set for documents that do not actually
/// contain the given field.
/// </summary>
public enum ProcessingMode
{
    /// <summary>
    /// "Full" validation: begin with every bit cleared and only set bits for documents
    /// that contain the given field and are identified as non-duplicates.
    /// </summary>
    PM_FULL_VALIDATION = 0,

    /// <summary>
    /// "Fast" invalidation: begin with every bit set, then clear the duplicate documents
    /// found for the given field. Terms whose document frequency is exactly "1"
    /// (i.e. no duplicates) never need their DocsEnum read. Although potentially faster,
    /// the resulting bit set also has bits set for documents that do not actually
    /// contain the given field.
    /// </summary>
    PM_FAST_INVALIDATION = 1
}
| } |