blob: c4610d971327b34c2c9d73c4b90e2938a846bc40 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Search;
using Lucene.Net.Index;
using Lucene.Net.Util;
namespace Lucene.Net.Search
{
/// <summary>
/// Filter that, for every term of a given field, lets through only one of the
/// documents containing that term (either the first or the last one returned
/// by the reader), treating the remaining documents as duplicates.
/// </summary>
public class DuplicateFilter : Filter
{
    String fieldName;

    /// <summary>
    /// KeepMode determines which document id to consider as the master, all others being
    /// identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
    /// Note that the single-argument constructor overrides this initial value with
    /// <see cref="KM_USE_LAST_OCCURRENCE"/>.
    /// </summary>
    int keepMode = KM_USE_FIRST_OCCURRENCE;
    public static int KM_USE_FIRST_OCCURRENCE = 1;
    public static int KM_USE_LAST_OCCURRENCE = 2;

    /// <summary>
    /// "Full" processing mode starts by setting all bits to false and only setting bits
    /// for documents that contain the given field and are identified as non-duplicates.
    /// "Fast" processing sets all bits to true then unsets all duplicate docs found for the
    /// given field. This approach avoids the need to read TermDocs for terms that are seen
    /// to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
    /// faster approach, the downside is that bitsets produced will include bits set for
    /// documents that do not actually contain the field given.
    /// </summary>
    int processingMode = PM_FULL_VALIDATION;
    public static int PM_FULL_VALIDATION = 1;
    public static int PM_FAST_INVALIDATION = 2;

    /// <summary>
    /// Creates a filter on <paramref name="fieldName"/> that keeps the last occurrence
    /// of each term and uses full validation.
    /// </summary>
    /// <param name="fieldName">Name of the field whose terms identify duplicates.</param>
    public DuplicateFilter(String fieldName) : this(fieldName, KM_USE_LAST_OCCURRENCE, PM_FULL_VALIDATION)
    {
    }

    /// <summary>
    /// Creates a filter with explicit keep and processing modes.
    /// </summary>
    /// <param name="fieldName">Name of the field whose terms identify duplicates.</param>
    /// <param name="keepMode">One of the <c>KM_*</c> constants.</param>
    /// <param name="processingMode">One of the <c>PM_*</c> constants.</param>
    public DuplicateFilter(String fieldName, int keepMode, int processingMode)
    {
        this.fieldName = fieldName;
        this.keepMode = keepMode;
        this.processingMode = processingMode;
    }

    /// <summary>
    /// Builds the bit set of documents accepted by this filter for the given reader.
    /// </summary>
    /// <param name="reader">Reader whose terms and documents are inspected.</param>
    /// <returns>
    /// The result of the fast algorithm when the processing mode is
    /// <see cref="PM_FAST_INVALIDATION"/>; otherwise the result of the full algorithm.
    /// </returns>
    public override DocIdSet GetDocIdSet(IndexReader reader)
    {
        if (processingMode == PM_FAST_INVALIDATION)
        {
            return FastBits(reader);
        }
        return CorrectBits(reader);
    }

    /// <summary>
    /// Full validation: starts with every bit cleared and sets exactly one bit per term
    /// of the field - the first document returned for the term when keeping the first
    /// occurrence, otherwise the last one.
    /// </summary>
    private OpenBitSet CorrectBits(IndexReader reader)
    {
        OpenBitSet bits = new OpenBitSet(reader.MaxDoc); // assume all are invalid
        Term startTerm = new Term(fieldName);

        // The enumerators are disposed deterministically instead of being left to the GC.
        using (TermEnum te = reader.Terms(startTerm))
        {
            if (te == null)
            {
                return bits;
            }

            Term currTerm = te.Term;
            // Stop as soon as the enumeration leaves the requested field.
            while ((currTerm != null) && (currTerm.Field == startTerm.Field))
            {
                using (TermDocs td = reader.TermDocs(currTerm))
                {
                    if (td.Next())
                    {
                        if (keepMode == KM_USE_FIRST_OCCURRENCE)
                        {
                            // The first document is the master; no need to read further.
                            bits.Set(td.Doc);
                        }
                        else
                        {
                            // Walk to the end of the postings; the final document is the master.
                            int lastDoc = -1;
                            do
                            {
                                lastDoc = td.Doc;
                            } while (td.Next());
                            bits.Set(lastDoc);
                        }
                    }
                }

                if (!te.Next())
                {
                    break;
                }
                currTerm = te.Term;
            }
        }
        return bits;
    }

    /// <summary>
    /// Fast invalidation: starts with every bit set and clears the bits of duplicate
    /// documents, looking only at terms whose document frequency is greater than one.
    /// </summary>
    private OpenBitSet FastBits(IndexReader reader)
    {
        OpenBitSet bits = new OpenBitSet(reader.MaxDoc);
        bits.Set(0, reader.MaxDoc); // assume all are valid
        Term startTerm = new Term(fieldName);

        using (TermEnum te = reader.Terms(startTerm))
        {
            if (te == null)
            {
                return bits;
            }

            Term currTerm = te.Term;
            // Stop as soon as the enumeration leaves the requested field.
            while ((currTerm != null) && (currTerm.Field == startTerm.Field))
            {
                if (te.DocFreq() > 1)
                {
                    // unset potential duplicates
                    using (TermDocs td = reader.TermDocs(currTerm))
                    {
                        // DocFreq() > 1 does not guarantee that the enumeration below yields
                        // two documents (presumably e.g. when documents were deleted - TODO
                        // confirm), so every Next() result is checked before Doc is read.
                        bool hasDoc = td.Next();
                        if (hasDoc && keepMode == KM_USE_FIRST_OCCURRENCE)
                        {
                            // Skip the first document: it is the one to keep.
                            hasDoc = td.Next();
                        }

                        int lastDoc = -1;
                        while (hasDoc)
                        {
                            lastDoc = td.Doc;
                            bits.Clear(lastDoc);
                            hasDoc = td.Next();
                        }

                        if (keepMode == KM_USE_LAST_OCCURRENCE && lastDoc >= 0)
                        {
                            // restore the last bit
                            bits.Set(lastDoc);
                        }
                    }
                }

                if (!te.Next())
                {
                    break;
                }
                currTerm = te.Term;
            }
        }
        return bits;
    }

    /// <summary>Name of the field whose terms identify duplicates.</summary>
    public string FieldName
    {
        get { return fieldName; }
        set { this.fieldName = value; }
    }

    /// <summary>Which occurrence of a term is kept; one of the <c>KM_*</c> constants.</summary>
    public int KeepMode
    {
        get { return keepMode; }
        set { this.keepMode = value; }
    }

    /// <summary>
    /// Two filters are equal when they have the same runtime type, keep mode,
    /// processing mode and field name (a null field name equals only another null).
    /// </summary>
    public override bool Equals(Object obj)
    {
        if (ReferenceEquals(this, obj))
        {
            return true;
        }
        if ((obj == null) || (obj.GetType() != this.GetType()))
        {
            return false;
        }
        DuplicateFilter other = (DuplicateFilter)obj;
        return keepMode == other.keepMode &&
               processingMode == other.processingMode &&
               string.Equals(fieldName, other.fieldName);
    }

    /// <summary>
    /// Hash code consistent with <see cref="Equals(Object)"/>; a null field name
    /// contributes zero instead of throwing.
    /// </summary>
    public override int GetHashCode()
    {
        // Wrap-around is intended; unchecked keeps this safe in checked builds.
        unchecked
        {
            int hash = 217;
            hash = 31 * hash + keepMode;
            hash = 31 * hash + processingMode;
            hash = 31 * hash + (fieldName == null ? 0 : fieldName.GetHashCode());
            return hash;
        }
    }

    /// <summary>Which algorithm builds the bit set; one of the <c>PM_*</c> constants.</summary>
    public int ProcessingMode
    {
        get { return processingMode; }
        set { this.processingMode = value; }
    }
}
}