blob: fba52709c49ce7da6567a19773682504b1c06d07 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.misc;
import java.util.Random;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
/**
 * Tests for {@link HighFreqTerms#getHighFreqTerms}, sorted either by document
 * frequency or by total term frequency.
 *
 * <p>All tests share one read-only index built once by {@link #indexDocs(IndexWriter)}:
 * <ul>
 *   <li>field {@code FIELD_1}: the numeric terms {@code 1..10}, where term {@code n}
 *       has docFreq {@code n} and totalTermFreq {@code n * n}; the term {@code highTF}
 *       (docFreq 1, totalTermFreq 200); and the term {@code highTFmedDF}
 *       (docFreq 5, totalTermFreq 125);</li>
 *   <li>field {@code different_field}: the term {@code diff} (docFreq 20) and the
 *       term {@code TF150} (docFreq 1, totalTermFreq 150).</li>
 * </ul>
 * The analyzer is a whitespace {@link MockAnalyzer} with lowercasing disabled, so the
 * mixed-case terms above are indexed verbatim.
 */
public class TestHighFreqTerms extends LuceneTestCase {

  private static IndexWriter writer = null;
  private static Directory dir = null;
  private static IndexReader reader = null;

  /** Builds the shared index, opens a reader on it and sanity-checks the index. */
  @BeforeClass
  public static void setUpClass() throws Exception {
    dir = newDirectory();
    writer = new IndexWriter(dir, newIndexWriterConfig(random(),
        new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false))
        .setMaxBufferedDocs(2));
    // try-with-resources: the writer is closed even if indexing fails part-way.
    try (IndexWriter w = writer) {
      indexDocs(w);
    }
    reader = DirectoryReader.open(dir);
    TestUtil.checkIndex(dir);
  }

  /**
   * Releases the shared reader and directory. Null-safe so that a failure in
   * {@link #setUpClass()} is not masked by a NullPointerException here, and the
   * directory is still closed if closing the reader throws.
   */
  @AfterClass
  public static void tearDownClass() throws Exception {
    try {
      if (reader != null) {
        reader.close();
      }
    } finally {
      try {
        if (dir != null) {
          dir.close();
        }
      } finally {
        dir = null;
        reader = null;
        writer = null;
      }
    }
  }

  /******************** Tests for getHighFreqTerms **********************************/

  /**
   * Passing {@code field == null} examines all fields: the term "diff" in
   * "different_field" occurs in 20 documents and is the highest-docFreq term.
   */
  public void testFirstTermHighestDocFreqAllFields() throws Exception {
    int numTerms = 12;
    String field = null;
    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
    assertEquals("Term with highest docfreq is first", 20, terms[0].docFreq);
  }

  /** Restricted to FIELD_1, the highest docFreq is 10 (the numeric term "10"). */
  public void testFirstTermHighestDocFreq() throws Exception {
    int numTerms = 12;
    String field = "FIELD_1";
    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
    assertEquals("Term with highest docfreq is first", 10, terms[0].docFreq);
  }

  /** Results sorted with the docFreq comparator must be in non-increasing docFreq order. */
  public void testOrderedByDocFreqDescending() throws Exception {
    int numTerms = 12;
    String field = "FIELD_1";
    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
    // Start at 1: each element is compared with its predecessor.
    for (int i = 1; i < terms.length; i++) {
      assertTrue("out of order " + terms[i - 1].docFreq + " should be >= " + terms[i].docFreq,
          terms[i - 1].docFreq >= terms[i].docFreq);
    }
  }

  /** The returned array has exactly {@code numTerms} entries when the index has at least that many terms. */
  public void testNumTerms() throws Exception {
    int numTerms = 12;
    String field = null;
    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
    assertEquals("length of terms array equals numTerms :" + numTerms, numTerms, terms.length);
  }

  /** Every FIELD_1 term reports the docFreq that {@link #indexDocs(IndexWriter)} gave it. */
  public void testGetHighFreqTerms() throws Exception {
    int numTerms = 12;
    String field = "FIELD_1";
    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
    for (int i = 0; i < terms.length; i++) {
      String termtext = terms[i].termtext.utf8ToString();
      // The two non-numeric terms have hard-coded expectations; check "highTF" first
      // because "highTFmedDF" also contains it.
      if (termtext.contains("highTF")) {
        if (termtext.contains("medDF")) {
          assertEquals("doc freq is not as expected", 5, terms[i].docFreq);
        } else {
          assertEquals("doc freq is not as expected", 1, terms[i].docFreq);
        }
      } else {
        // Every other FIELD_1 term is a number n with docFreq n.
        int n = Integer.parseInt(termtext);
        assertEquals("doc freq is not as expected", getExpecteddocFreq(n),
            terms[i].docFreq);
      }
    }
  }

  /******************** Test sortByTotalTermFreq **********************************/

  /** Across all fields the largest totalTermFreq is 200 (the term "highTF"). */
  public void testFirstTermHighestTotalTermFreq() throws Exception {
    int numTerms = 20;
    String field = null;
    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
    assertEquals("Term with highest totalTermFreq is first", 200, terms[0].totalTermFreq);
  }

  /** Restricted to "different_field" the largest totalTermFreq is 150 (the term "TF150"). */
  public void testFirstTermHighestTotalTermFreqDifferentField() throws Exception {
    int numTerms = 20;
    String field = "different_field";
    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
    assertEquals("Term with highest totalTermFreq is first: " + terms[0].getTermText(),
        150, terms[0].totalTermFreq);
  }

  /** Results sorted with the totalTermFreq comparator must be in non-increasing totalTermFreq order. */
  public void testOrderedByTermFreqDescending() throws Exception {
    int numTerms = 12;
    String field = "FIELD_1";
    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
    // Start at 1: each element is compared with its predecessor.
    for (int i = 1; i < terms.length; i++) {
      assertTrue("out of order " + terms[i - 1].totalTermFreq + " should be >= " + terms[i].totalTermFreq,
          terms[i - 1].totalTermFreq >= terms[i].totalTermFreq);
    }
  }

  /** Every FIELD_1 term reports the docFreq and totalTermFreq that {@link #indexDocs(IndexWriter)} gave it. */
  public void testGetTermFreqOrdered() throws Exception {
    int numTerms = 12;
    String field = "FIELD_1";
    TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
    for (int i = 0; i < terms.length; i++) {
      String text = terms[i].termtext.utf8ToString();
      if (text.contains("highTF")) {
        if (text.contains("medDF")) {
          // 5 documents x 25 occurrences each
          assertEquals("total term freq is expected", 125,
              terms[i].totalTermFreq);
        } else {
          // 1 document x 200 occurrences
          assertEquals("total term freq is expected", 200,
              terms[i].totalTermFreq);
        }
      } else {
        int n = Integer.parseInt(text);
        assertEquals("doc freq is expected", getExpecteddocFreq(n),
            terms[i].docFreq);
        assertEquals("total term freq is expected", getExpectedtotalTermFreq(n),
            terms[i].totalTermFreq);
      }
    }
  }

  /******************** Testing Utils **********************************/

  /**
   * Adds the fixture documents described in the class comment to {@code writer}.
   * The caller owns the writer and is responsible for closing it.
   */
  private static void indexDocs(IndexWriter writer) throws Exception {
    Random rnd = random();

    // 10 documents in which the numeric term n ends up with a docFreq of n and a
    // totalTermFreq of n * n (n occurrences in each of n documents).
    for (int i = 1; i <= 10; i++) {
      Document doc = new Document();
      doc.add(newTextField(rnd, "FIELD_1", getContent(i), Field.Store.YES));
      // add a different field
      doc.add(newTextField(rnd, "different_field", "diff", Field.Store.YES));
      writer.addDocument(doc);
    }

    // 10 more docs with the term "diff": this gives it the highest docFreq (20)
    // when the high-frequency terms are not restricted to a specific field.
    for (int i = 1; i <= 10; i++) {
      Document doc = new Document();
      doc.add(newTextField(rnd, "different_field", "diff", Field.Store.YES));
      writer.addDocument(doc);
    }

    // Terms whose term frequency is much higher than their doc frequency, so that
    // sorting by totalTermFreq gives a different order than sorting by docFreq.

    // high tf, low df: one document containing "highTF" 200 times
    int highTF = 200;
    Document doc = new Document();
    doc.add(newTextField(rnd, "FIELD_1", repeat("highTF ", highTF), Field.Store.YES));
    writer.addDocument(doc);

    // high tf, medium df: 5 documents each containing "highTFmedDF" 25 times
    int medium_df = 5;
    int tf = 25;
    for (int i = 0; i < medium_df; i++) {
      Document newdoc = new Document();
      newdoc.add(newTextField(rnd, "FIELD_1", repeat("highTFmedDF ", tf), Field.Store.YES));
      writer.addDocument(newdoc);
    }

    // one document with a high tf in field different_field
    int targetTF = 150;
    doc = new Document();
    doc.add(newTextField(rnd, "different_field", repeat("TF150 ", targetTF), Field.Store.YES));
    writer.addDocument(doc);
  }

  /** Returns {@code token} concatenated {@code count} times (empty string for {@code count <= 0}). */
  private static String repeat(String token, int count) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < count; i++) {
      sb.append(token);
    }
    return sb.toString();
  }

  /**
   * Returns a string containing the numbers 10 down to {@code i}, each number j
   * repeated j times and every occurrence followed by a single space.
   * E.g. for an input of 9 the result is ten "10 " tokens followed by nine "9 " tokens.
   */
  private static String getContent(int i) {
    StringBuilder sb = new StringBuilder();
    for (int j = 10; j >= i; j--) {
      for (int k = 0; k < j; k++) {
        sb.append(j).append(' ');
      }
    }
    return sb.toString();
  }

  /** Expected totalTermFreq of numeric term {@code i}: i occurrences in each of i documents. */
  private static int getExpectedtotalTermFreq(int i) {
    return getExpecteddocFreq(i) * i;
  }

  /** Expected docFreq of numeric term {@code i}, which is {@code i} itself. */
  private static int getExpecteddocFreq(int i) {
    return i;
  }
}