blob: f44765c96e9d871160d430f1e6231a4aa04770c8 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Documents;
using Lucene.Net.Store;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using NUnit.Framework;
namespace Lucene.Net.Search
{
    /// <summary>
    /// Exercises <c>DuplicateFilter</c> against a small in-memory index in which
    /// several documents share the same value in the <c>url</c> key field.
    /// </summary>
    [TestFixture]
    public class DuplicateFilterTest : TestCase
    {
        /// <summary>Name of the field whose value identifies duplicate documents.</summary>
        private static readonly String KEY_FIELD = "url";

        private RAMDirectory directory;
        private IndexReader reader;

        /// <summary>Query used by every test: documents whose "text" field contains the term "lucene".</summary>
        private readonly TermQuery tq = new TermQuery(new Term("text", "lucene"));

        private IndexSearcher searcher;

        /// <summary>
        /// Builds the in-memory index, commits it, and opens a read-only reader and a searcher on it.
        /// </summary>
        [SetUp]
        public void SetUp()
        {
            directory = new RAMDirectory();
            IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Util.Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.UNLIMITED);

            // Add series of docs with filterable fields: url, text and date flags.
            // The tests below need (a) duplicate url keys among the documents that match
            // "lucene" (TestNoFilter) and (b) exactly two distinct url keys among those
            // documents (TestFastFilter), so two url values are used here.
            // NOTE(review): the url literals were empty in the reviewed source; confirm
            // these values against the intended fixture data.
            AddDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
            AddDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
            AddDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
            AddDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
            AddDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
            AddDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
            AddDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
            AddDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");

            // The writer must be closed (committing the documents) before the reader is
            // opened, otherwise the reader does not see the documents added above.
            writer.Close();

            reader = IndexReader.Open(directory, true);
            searcher = new IndexSearcher(reader);
        }

        /// <summary>
        /// Releases the searcher, reader and directory created by <see cref="SetUp"/>.
        /// </summary>
        [TearDown]
        public void TearDown()
        {
            // Each field is null-checked because SetUp may have failed part-way through.
            if (searcher != null)
            {
                searcher.Close();
                searcher = null;
            }
            if (reader != null)
            {
                reader.Close();
                reader = null;
            }
            if (directory != null)
            {
                directory.Close();
                directory = null;
            }
        }

        /// <summary>
        /// Creates a document with url, text and date fields and adds it to the index.
        /// </summary>
        /// <param name="writer">Writer the document is added to.</param>
        /// <param name="url">Value of the key field; stored and indexed without analysis.</param>
        /// <param name="text">Value of the "text" field; stored and analyzed.</param>
        /// <param name="date">Value of the "date" field; stored and analyzed.</param>
        private void AddDoc(IndexWriter writer, String url, String text, String date)
        {
            Document doc = new Document();
            doc.Add(new Field(KEY_FIELD, url, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("date", date, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);
        }

        /// <summary>
        /// With a filter left at its default settings, no url may appear twice in the hits.
        /// </summary>
        [Test]
        public void TestDefaultFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
            HashSet<string> seenUrls = new HashSet<string>();
            ScoreDoc[] h = searcher.Search(tq, df, 1000).ScoreDocs;
            // Guard against a vacuous pass when the search returns nothing.
            Assert.IsTrue(h.Length > 0, "Filtered searching should have found some matches");
            for (int i = 0; i < h.Length; i++)
            {
                Document d = searcher.Doc(h[i].Doc);
                String url = d.Get(KEY_FIELD);
                // HashSet.Add returns false when the url was already recorded.
                Assert.IsTrue(seenUrls.Add(url), "No duplicate urls should be returned");
            }
        }

        /// <summary>
        /// Without any filter, the hits are expected to contain at least one repeated url.
        /// </summary>
        [Test]
        public void TestNoFilter()
        {
            HashSet<string> seenUrls = new HashSet<string>();
            ScoreDoc[] h = searcher.Search(tq, null, 1000).ScoreDocs;
            Assert.IsTrue(h.Length > 0, "Default searching should have found some matches");
            bool dupsFound = false;
            for (int i = 0; i < h.Length; i++)
            {
                Document d = searcher.Doc(h[i].Doc);
                String url = d.Get(KEY_FIELD);
                // Record every url; a failed Add means this url was seen before.
                if (!seenUrls.Add(url))
                {
                    dupsFound = true;
                }
            }
            Assert.IsTrue(dupsFound, "Default searching should have found duplicate urls");
        }

        /// <summary>
        /// With <c>PM_FAST_INVALIDATION</c> processing, no url may repeat and exactly
        /// two distinct urls must be returned.
        /// </summary>
        [Test]
        public void TestFastFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
            df.ProcessingMode = DuplicateFilter.PM_FAST_INVALIDATION;
            HashSet<string> seenUrls = new HashSet<string>();
            ScoreDoc[] h = searcher.Search(tq, df, 1000).ScoreDocs;
            Assert.IsTrue(h.Length > 0, "Filtered searching should have found some matches");
            for (int i = 0; i < h.Length; i++)
            {
                Document d = searcher.Doc(h[i].Doc);
                String url = d.Get(KEY_FIELD);
                Assert.IsTrue(seenUrls.Add(url), "No duplicate urls should be returned");
            }
            Assert.AreEqual(2, seenUrls.Count, "Two urls found");
        }

        /// <summary>
        /// With <c>KM_USE_LAST_OCCURRENCE</c>, each hit must be the last document
        /// enumerated for its url term.
        /// </summary>
        [Test]
        public void TestKeepsLastFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
            df.KeepMode = DuplicateFilter.KM_USE_LAST_OCCURRENCE;
            ScoreDoc[] h = searcher.Search(tq, df, 1000).ScoreDocs;
            Assert.IsTrue(h.Length > 0, "Filtered searching should have found some matches");
            for (int i = 0; i < h.Length; i++)
            {
                Document d = searcher.Doc(h[i].Doc);
                String url = d.Get(KEY_FIELD);
                TermDocs td = reader.TermDocs(new Term(KEY_FIELD, url));
                int lastDoc = 0;
                // Walk every document for this url; the value left in lastDoc is the
                // final document the enumerator returned.
                while (td.Next())
                {
                    lastDoc = td.Doc;
                }
                Assert.AreEqual(lastDoc, h[i].Doc, "Duplicate urls should return last doc");
            }
        }

        /// <summary>
        /// With <c>KM_USE_FIRST_OCCURRENCE</c>, each hit must be the first document
        /// enumerated for its url term.
        /// </summary>
        [Test]
        public void TestKeepsFirstFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
            df.KeepMode = DuplicateFilter.KM_USE_FIRST_OCCURRENCE;
            ScoreDoc[] h = searcher.Search(tq, df, 1000).ScoreDocs;
            Assert.IsTrue(h.Length > 0, "Filtered searching should have found some matches");
            for (int i = 0; i < h.Length; i++)
            {
                Document d = searcher.Doc(h[i].Doc);
                String url = d.Get(KEY_FIELD);
                TermDocs td = reader.TermDocs(new Term(KEY_FIELD, url));
                // The enumerator must be advanced once before Doc is read; Doc is not
                // positioned on any document until Next() has returned true.
                Assert.IsTrue(td.Next(), "Url term of a returned hit should match at least one doc");
                int firstDoc = td.Doc;
                Assert.AreEqual(firstDoc, h[i].Doc, "Duplicate urls should return first doc");
            }
        }
    }
}