/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using Lucene.Net.Documents;
using Lucene.Net.Store;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;

using NUnit.Framework;

namespace Lucene.Net.Search
{
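    /// <summary>
    /// Tests for the contrib DuplicateFilter, which restricts results to a single document
    /// per unique value of a key field (here "url"), keeping either the first or the last
    /// occurrence depending on the configured KeepMode.
    /// </summary>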
    [TestFixture]
    public class DuplicateFilterTest : TestCase
    {
        private static readonly String KEY_FIELD = "url";
        private RAMDirectory directory;
        private IndexReader reader;
        private TermQuery tq = new TermQuery(new Term("text", "lucene"));
        private IndexSearcher searcher;

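        // Build a small in-memory index with several documents per url so that the
        // "text:lucene" query used by the tests returns duplicates.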
        [SetUp]
        public void SetUp()
        {
            directory = new RAMDirectory();
            IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Util.Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.UNLIMITED);

            // Add a series of docs with filterable fields: url, text and date
            AddDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
            AddDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
            AddDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
            AddDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
            AddDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
            AddDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
            AddDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
            AddDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");

            writer.Close();
            reader = IndexReader.Open(directory, true);
            searcher = new IndexSearcher(reader);
        }

        [TearDown]
        public void TearDown()
        {
            searcher.Close();
            reader.Close();
            directory.Close();
        }

        private void AddDoc(IndexWriter writer, String url, String text, String date)
        {
            Document doc = new Document();
            doc.Add(new Field(KEY_FIELD, url, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("date", date, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);
        }

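        // With the default KeepMode, the filter should collapse hits so that each url
        // appears at most once in the results for the "text:lucene" query.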
        [Test]
        public void TestDefaultFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
            var results = new HashSet<string>();
            ScoreDoc[] h = searcher.Search(tq, df, 1000).ScoreDocs;
            for (int i = 0; i < h.Length; i++)
            {
                Document d = searcher.Doc(h[i].Doc);
                String url = d.Get(KEY_FIELD);
                Assert.IsFalse(results.Contains(url), "No duplicate urls should be returned");
                results.Add(url);
            }
        }

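        // Sanity check: without the DuplicateFilter the same query must return duplicate
        // urls, otherwise the filter tests prove nothing.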
        [Test]
        public void TestNoFilter()
        {
            var results = new HashSet<string>();
            ScoreDoc[] h = searcher.Search(tq, null, 1000).ScoreDocs;
            Assert.IsTrue(h.Length > 0, "Default searching should have found some matches");
            bool dupsFound = false;
            for (int i = 0; i < h.Length; i++)
            {
                Document d = searcher.Doc(h[i].Doc);
                String url = d.Get(KEY_FIELD);
                if (!dupsFound)
                    dupsFound = results.Contains(url);
                results.Add(url);
            }
            Assert.IsTrue(dupsFound, "Default searching should have found duplicate urls");
        }

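        // PM_FAST_INVALIDATION is the alternative processing mode; it should still return
        // only one hit per url, and exactly two distinct urls match the query.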
        [Test]
        public void TestFastFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
            df.ProcessingMode = DuplicateFilter.PM_FAST_INVALIDATION;
            var results = new HashSet<string>();
            ScoreDoc[] h = searcher.Search(tq, df, 1000).ScoreDocs;
            Assert.IsTrue(h.Length > 0, "Filtered searching should have found some matches");
            for (int i = 0; i < h.Length; i++)
            {
                Document d = searcher.Doc(h[i].Doc);
                String url = d.Get(KEY_FIELD);
                Assert.IsFalse(results.Contains(url), "No duplicate urls should be returned");
                results.Add(url);
            }
            Assert.AreEqual(2, results.Count, "Two urls found");
        }

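        // KM_USE_LAST_OCCURRENCE: for each url the returned hit should be the last document
        // indexed with that url, i.e. the highest doc id in the TermDocs enumeration.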
        [Test]
        public void TestKeepsLastFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
            df.KeepMode = DuplicateFilter.KM_USE_LAST_OCCURRENCE;
            ScoreDoc[] h = searcher.Search(tq, df, 1000).ScoreDocs;
            Assert.IsTrue(h.Length > 0, "Filtered searching should have found some matches");
            for (int i = 0; i < h.Length; i++)
            {
                Document d = searcher.Doc(h[i].Doc);
                String url = d.Get(KEY_FIELD);
                TermDocs td = reader.TermDocs(new Term(KEY_FIELD, url));
                int lastDoc = 0;
                while (td.Next())
                {
                    lastDoc = td.Doc;
                }
                Assert.AreEqual(lastDoc, h[i].Doc, "Duplicate urls should return last doc");
            }
        }

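        // KM_USE_FIRST_OCCURRENCE: for each url the returned hit should be the first document
        // indexed with that url.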
        [Test]
        public void TestKeepsFirstFilter()
        {
            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
            df.KeepMode = DuplicateFilter.KM_USE_FIRST_OCCURRENCE;
            ScoreDoc[] h = searcher.Search(tq, df, 1000).ScoreDocs;
            Assert.IsTrue(h.Length > 0, "Filtered searching should have found some matches");
            for (int i = 0; i < h.Length; i++)
            {
                Document d = searcher.Doc(h[i].Doc);
                String url = d.Get(KEY_FIELD);
                TermDocs td = reader.TermDocs(new Term(KEY_FIELD, url));
                td.Next();
                int firstDoc = td.Doc;
                Assert.AreEqual(firstDoc, h[i].Doc, "Duplicate urls should return first doc");
            }
        }
    }
}