// blob: 65ab7d4e14470e0d5c96c8b1a79664a52a15d574
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Index.Extensions;
using Lucene.Net.Store;
using Lucene.Net.Util;
using NUnit.Framework;
using System;
using System.Globalization;
using System.Text;
namespace Lucene.Net.Misc
{
[SuppressCodecs("Lucene3x")]
public class TestHighFreqTerms : LuceneTestCase
{
// Shared fixture state: assigned once in BeforeClass() and reset to null in AfterClass().
// Writer used to build the test index; IndexDocs() disposes it after the last document is added.
private static IndexWriter writer = null;
// Directory that holds the test index.
private static Directory dir = null;
// Reader opened over dir once indexing has finished; it is the reader every test passes to HighFreqTerms.
private static IndexReader reader = null;
/// <summary>
/// One-time fixture setup: creates the directory, indexes the fixture documents
/// through <see cref="IndexDocs"/>, opens the shared reader and runs an index check.
/// </summary>
[OneTimeSetUp]
public override void BeforeClass() // LUCENENET specific - renamed from SetUpClass() to ensure calling order vs base class
{
    base.BeforeClass();

    dir = NewDirectory();

    // Whitespace tokenization: the fixture content is a sequence of space-separated tokens.
    var analyzer = new MockAnalyzer(Random, MockTokenizer.WHITESPACE, false);
    var config = NewIndexWriterConfig(Random, TEST_VERSION_CURRENT, analyzer)
        .SetMaxBufferedDocs(2);
    writer = new IndexWriter(dir, config);

    // IndexDocs disposes the writer once all documents have been added.
    IndexDocs(writer);

    reader = DirectoryReader.Open(dir);
    TestUtil.CheckIndex(dir);
}
/// <summary>
/// One-time fixture teardown: disposes the shared reader and directory, clears the
/// static fields and then delegates to the base class teardown.
/// </summary>
[OneTimeTearDown]
public override void AfterClass() // LUCENENET specific - renamed from TearDownClass() to ensure calling order vs base class
{
    try
    {
        // The fields stay null when BeforeClass() failed before assigning them; guarding here
        // keeps a NullReferenceException in teardown from masking the original setup failure.
        if (reader != null)
        {
            reader.Dispose();
        }
        if (dir != null)
        {
            dir.Dispose();
        }
    }
    finally
    {
        // Always drop the static references and run the base teardown, even if a Dispose() call threw.
        dir = null;
        reader = null;
        writer = null;
        base.AfterClass();
    }
}
/******************** Tests for getHighFreqTerms **********************************/
/// <summary>
/// Passing a null field asks for the top terms across all fields. The term "diff" in
/// "different_field" is indexed in 20 documents, so it must come first by docFreq.
/// </summary>
[Test]
public void TestFirstTermHighestDocFreqAllFields()
{
    const int requested = 12;
    string everyField = null; // null field: examine all fields

    TermStats[] ranked = HighFreqTerms.GetHighFreqTerms(
        reader, requested, everyField, HighFreqTerms.DocFreqComparer.Default);

    assertEquals("Term with highest docfreq is first", 20, ranked[0].DocFreq);
}
/// <summary>
/// Restricted to "FIELD_1", the first term returned by the docFreq comparer
/// must have a docFreq of 10.
/// </summary>
[Test]
public void TestFirstTermHighestDocFreq()
{
    const int requested = 12;
    const string fieldName = "FIELD_1";

    TermStats[] ranked = HighFreqTerms.GetHighFreqTerms(
        reader, requested, fieldName, HighFreqTerms.DocFreqComparer.Default);

    assertEquals("Term with highest docfreq is first", 10, ranked[0].DocFreq);
}
/// <summary>
/// Verifies that the terms returned for "FIELD_1" with the docFreq comparer are in
/// non-increasing docFreq order.
/// </summary>
[Test]
public void TestOrderedByDocFreqDescending()
{
    int numTerms = 12;
    string field = "FIELD_1";
    TermStats[] terms = HighFreqTerms.GetHighFreqTerms(reader, numTerms, field, HighFreqTerms.DocFreqComparer.Default);

    // Start at 1: each entry is compared with its predecessor, so index 0 has nothing to check.
    for (int i = 1; i < terms.Length; i++)
    {
        var previous = terms[i - 1].DocFreq;
        var current = terms[i].DocFreq;
        assertTrue("out of order " + previous + " should be >= " + current, previous >= current);
    }
}
/// <summary>
/// Asking for 12 terms across all fields (null field) must return exactly 12 entries.
/// </summary>
[Test]
public void TestNumTerms()
{
    const int requested = 12;
    string everyField = null;

    TermStats[] ranked = HighFreqTerms.GetHighFreqTerms(
        reader, requested, everyField, HighFreqTerms.DocFreqComparer.Default);

    string message = "length of terms array equals numTerms :" + requested;
    assertEquals(message, requested, ranked.Length);
}
/// <summary>
/// Checks the docFreq of every term returned for "FIELD_1": the "highTF" term is in 1 document,
/// the "highTFmedDF" term in 5, and each numeric term n in n documents.
/// </summary>
[Test]
public void TestGetHighFreqTerms()
{
    int numTerms = 12;
    string field = "FIELD_1";
    TermStats[] terms = HighFreqTerms.GetHighFreqTerms(reader, numTerms, field, HighFreqTerms.DocFreqComparer.Default);

    foreach (TermStats stats in terms)
    {
        string termtext = stats.TermText.Utf8ToString();
        int expectedDocFreq;

        // hardcoded highTF or highTFmedDF
        if (termtext.Contains("highTF"))
        {
            expectedDocFreq = termtext.Contains("medDF") ? 5 : 1;
        }
        else
        {
            // Remaining terms are the numbers written by GetContent(); parse them with the
            // invariant culture so the result does not depend on the culture the test runs under.
            int n = Convert.ToInt32(termtext, CultureInfo.InvariantCulture);
            expectedDocFreq = GetExpecteddocFreq(n);
        }

        assertEquals("doc freq is not as expected", expectedDocFreq, stats.DocFreq);
    }
}
/********************Test sortByTotalTermFreq**********************************/
/// <summary>
/// Across all fields (null field), the first term returned by the totalTermFreq comparer
/// must have a totalTermFreq of 200.
/// </summary>
[Test]
public void TestFirstTermHighestTotalTermFreq()
{
    const int requested = 20;
    string everyField = null;

    TermStats[] ranked = HighFreqTerms.GetHighFreqTerms(
        reader, requested, everyField, HighFreqTerms.TotalTermFreqComparer.Default);

    assertEquals("Term with highest totalTermFreq is first", 200, ranked[0].TotalTermFreq);
}
/// <summary>
/// Restricted to "different_field", the first term returned by the totalTermFreq comparer
/// must have a totalTermFreq of 150.
/// </summary>
[Test]
public void TestFirstTermHighestTotalTermFreqDifferentField()
{
    int numTerms = 20;
    string field = "different_field";
    TermStats[] terms = HighFreqTerms.GetHighFreqTerms(reader, numTerms, field, HighFreqTerms.TotalTermFreqComparer.Default);

    // The ": " separator keeps the reported term text from running into the message.
    assertEquals("Term with highest totalTermFreq is first: " + terms[0].GetTermText(), 150, terms[0].TotalTermFreq);
}
/// <summary>
/// Verifies that the terms returned for "FIELD_1" with the totalTermFreq comparer are in
/// non-increasing totalTermFreq order.
/// </summary>
[Test]
public void TestOrderedByTermFreqDescending()
{
    int numTerms = 12;
    string field = "FIELD_1";
    TermStats[] terms = HighFreqTerms.GetHighFreqTerms(reader, numTerms, field, HighFreqTerms.TotalTermFreqComparer.Default);

    // Start at 1: each entry is compared with its predecessor, so index 0 has nothing to check.
    for (int i = 1; i < terms.Length; i++)
    {
        // The message states the same relation the assertion checks (>=).
        assertTrue("out of order " + terms[i - 1] + " should be >= " + terms[i], terms[i - 1].TotalTermFreq >= terms[i].TotalTermFreq);
    }
}
/// <summary>
/// Checks the statistics of every term returned for "FIELD_1" with the totalTermFreq comparer:
/// "highTF" has a totalTermFreq of 200, "highTFmedDF" 125, and each numeric term n has
/// a docFreq of n and a totalTermFreq of n * n.
/// </summary>
[Test]
public void TestGetTermFreqOrdered()
{
    int numTerms = 12;
    string field = "FIELD_1";
    TermStats[] terms = HighFreqTerms.GetHighFreqTerms(reader, numTerms, field, HighFreqTerms.TotalTermFreqComparer.Default);

    foreach (TermStats stats in terms)
    {
        string text = stats.TermText.Utf8ToString();

        if (text.Contains("highTF"))
        {
            int expectedTotal = text.Contains("medDF") ? 125 : 200;
            assertEquals("total term freq is expected", expectedTotal, stats.TotalTermFreq);
            continue;
        }

        // Remaining terms are the numbers written by GetContent(); parse them with the
        // invariant culture so the result does not depend on the culture the test runs under.
        int n = Convert.ToInt32(text, CultureInfo.InvariantCulture);
        assertEquals("doc freq is expected", GetExpecteddocFreq(n), stats.DocFreq);
        assertEquals("total term freq is expected", GetExpectedtotalTermFreq(n), stats.TotalTermFreq);
    }
}
/********************Testing Utils**********************************/
/// <summary>
/// Adds the fixture documents to <paramref name="writer"/> and then disposes it.
/// <para/>
/// LUCENENET NOTE: Made non-static because it depends on NewIndexField that is also non-static
/// </summary>
/// <param name="writer">Writer that receives the documents; it is disposed before this method returns.</param>
private void IndexDocs(IndexWriter writer)
{
    Random rnd = Random;

    // Generate 10 documents where term n has a docFreq of n and a totalTermFreq of n*n (n squared).
    for (int i = 1; i <= 10; i++)
    {
        Document doc = new Document();
        string content = GetContent(i);
        doc.Add(NewTextField(rnd, "FIELD_1", content, Field.Store.YES));
        //add a different field
        doc.Add(NewTextField(rnd, "different_field", "diff", Field.Store.YES));
        writer.AddDocument(doc);
    }

    //add 10 more docs with the term "diff" this will make it have the highest docFreq if we don't ask for the
    //highest freq terms for a specific field.
    for (int i = 1; i <= 10; i++)
    {
        Document doc = new Document();
        doc.Add(NewTextField(rnd, "different_field", "diff", Field.Store.YES));
        writer.AddDocument(doc);
    }

    // Add docs whose terms have a term frequency far above their doc frequency, so that
    // ordering by totalTermFreq differs from ordering by docFreq.
    // highTF low df: one document containing "highTF" 200 times
    int highTF = 200;
    Document highTfDoc = new Document();
    highTfDoc.Add(NewTextField(rnd, "FIELD_1", RepeatToken("highTF", highTF), Field.Store.YES));
    writer.AddDocument(highTfDoc);

    // highTF medium df = 5: five documents containing "highTFmedDF" 25 times each
    int medium_df = 5;
    int tf = 25;
    string mediumContent = RepeatToken("highTFmedDF", tf); // identical for every document, so build it once
    for (int i = 0; i < medium_df; i++)
    {
        Document newdoc = new Document();
        newdoc.Add(NewTextField(rnd, "FIELD_1", mediumContent, Field.Store.YES));
        writer.AddDocument(newdoc);
    }

    // add a doc with high tf in field different_field
    int targetTF = 150;
    Document targetDoc = new Document();
    targetDoc.Add(NewTextField(rnd, "different_field", RepeatToken("TF150", targetTF), Field.Store.YES));
    writer.AddDocument(targetDoc);

    writer.Dispose();
}

/// <summary>
/// Returns <paramref name="token"/> followed by a single space, repeated <paramref name="count"/> times.
/// </summary>
private static string RepeatToken(string token, int count)
{
    StringBuilder sb = new StringBuilder((token.Length + 1) * count);
    for (int i = 0; i < count; i++)
    {
        sb.Append(token).Append(' ');
    }
    return sb.ToString();
}
/// <summary>
/// Builds the field content for document <paramref name="i"/>: every number j from 10 down to
/// <paramref name="i"/> (inclusive), each written j times and followed by a single space.
/// For example, an input of 9 returns ten "10 " tokens followed by nine "9 " tokens,
/// and an input greater than 10 returns the empty string.
/// </summary>
/// <param name="i">Lowest number to include in the content.</param>
/// <returns>The space-separated (and space-terminated) token string.</returns>
private static string GetContent(int i)
{
    StringBuilder sb = new StringBuilder();
    for (int j = 10; j >= i; j--)
    {
        // Format once per number, independent of the current culture.
        string token = j.ToString(CultureInfo.InvariantCulture) + " ";
        for (int k = 0; k < j; k++)
        {
            // if j is 3 we append "3 3 3 "
            sb.Append(token);
        }
    }
    return sb.ToString();
}
/// <summary>
/// Expected totalTermFreq of the numeric term <paramref name="i"/>: its expected docFreq multiplied by i.
/// </summary>
private static int GetExpectedtotalTermFreq(int i)
{
    int docFreq = GetExpecteddocFreq(i);
    return docFreq * i;
}

/// <summary>
/// Expected docFreq of the numeric term <paramref name="i"/>, which is i itself.
/// </summary>
private static int GetExpecteddocFreq(int i)
{
    int expected = i;
    return expected;
}
}
}