/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.misc; |
| |
| import java.util.Random; |
| |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.analysis.MockTokenizer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.TestUtil; |
| import org.junit.AfterClass; |
| import org.junit.BeforeClass; |
| |
| public class TestHighFreqTerms extends LuceneTestCase { |
| |
| private static IndexWriter writer =null; |
| private static Directory dir = null; |
| private static IndexReader reader =null; |
| |
| @BeforeClass |
| public static void setUpClass() throws Exception { |
| dir = newDirectory(); |
| writer = new IndexWriter(dir, newIndexWriterConfig(random(), |
| new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)) |
| .setMaxBufferedDocs(2)); |
| indexDocs(writer); |
| reader = DirectoryReader.open(dir); |
| TestUtil.checkIndex(dir); |
| } |
| |
| @AfterClass |
| public static void tearDownClass() throws Exception{ |
| reader.close(); |
| dir.close(); |
| dir = null; |
| reader = null; |
| writer = null; |
| } |
| /******************** Tests for getHighFreqTerms **********************************/ |
| |
| // test without specifying field (i.e. if we pass in field=null it should examine all fields) |
| // the term "diff" in the field "different_field" occurs 20 times and is the highest df term |
| public void testFirstTermHighestDocFreqAllFields () throws Exception{ |
| int numTerms = 12; |
| String field =null; |
| TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator()); |
| assertEquals("Term with highest docfreq is first", 20,terms[0].docFreq ); |
| } |
| |
| public void testFirstTermHighestDocFreq () throws Exception{ |
| int numTerms = 12; |
| String field="FIELD_1"; |
| TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator()); |
| assertEquals("Term with highest docfreq is first", 10,terms[0].docFreq ); |
| } |
| |
| public void testOrderedByDocFreqDescending () throws Exception{ |
| int numTerms = 12; |
| String field="FIELD_1"; |
| TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator()); |
| for (int i = 0; i < terms.length; i++) { |
| if (i > 0) { |
| assertTrue ("out of order " + terms[i-1].docFreq + "should be >= " + terms[i].docFreq,terms[i-1].docFreq >= terms[i].docFreq); |
| } |
| } |
| } |
| |
| public void testNumTerms () throws Exception{ |
| int numTerms = 12; |
| String field = null; |
| TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator()); |
| assertEquals("length of terms array equals numTerms :" + numTerms, numTerms, terms.length); |
| } |
| |
| public void testGetHighFreqTerms () throws Exception{ |
| int numTerms=12; |
| String field="FIELD_1"; |
| TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.DocFreqComparator()); |
| |
| for (int i = 0; i < terms.length; i++) { |
| String termtext = terms[i].termtext.utf8ToString(); |
| // hardcoded highTF or highTFmedDF |
| if (termtext.contains("highTF")) { |
| if (termtext.contains("medDF")) { |
| assertEquals("doc freq is not as expected", 5, terms[i].docFreq); |
| } else { |
| assertEquals("doc freq is not as expected", 1, terms[i].docFreq); |
| } |
| } else { |
| int n = Integer.parseInt(termtext); |
| assertEquals("doc freq is not as expected", getExpecteddocFreq(n), |
| terms[i].docFreq); |
| } |
| } |
| } |
| |
| /********************Test sortByTotalTermFreq**********************************/ |
| |
| public void testFirstTermHighestTotalTermFreq () throws Exception{ |
| int numTerms = 20; |
| String field = null; |
| TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator()); |
| assertEquals("Term with highest totalTermFreq is first",200, terms[0].totalTermFreq); |
| } |
| |
| public void testFirstTermHighestTotalTermFreqDifferentField () throws Exception{ |
| int numTerms = 20; |
| String field = "different_field"; |
| TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator()); |
| assertEquals("Term with highest totalTermFreq is first"+ terms[0].getTermText(),150, terms[0].totalTermFreq); |
| } |
| |
| public void testOrderedByTermFreqDescending () throws Exception{ |
| int numTerms = 12; |
| String field = "FIELD_1"; |
| TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator()); |
| |
| for (int i = 0; i < terms.length; i++) { |
| // check that they are sorted by descending termfreq |
| // order |
| if (i > 0) { |
| assertTrue ("out of order" +terms[i-1]+ " > " +terms[i],terms[i-1].totalTermFreq >= terms[i].totalTermFreq); |
| } |
| } |
| } |
| |
| public void testGetTermFreqOrdered () throws Exception{ |
| int numTerms = 12; |
| String field = "FIELD_1"; |
| TermStats[] terms = HighFreqTerms.getHighFreqTerms(reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator()); |
| |
| for (int i = 0; i < terms.length; i++) { |
| String text = terms[i].termtext.utf8ToString(); |
| if (text.contains("highTF")) { |
| if (text.contains("medDF")) { |
| assertEquals("total term freq is expected", 125, |
| terms[i].totalTermFreq); |
| } else { |
| assertEquals("total term freq is expected", 200, |
| terms[i].totalTermFreq); |
| } |
| |
| } else { |
| int n = Integer.parseInt(text); |
| assertEquals("doc freq is expected", getExpecteddocFreq(n), |
| terms[i].docFreq); |
| assertEquals("total term freq is expected", getExpectedtotalTermFreq(n), |
| terms[i].totalTermFreq); |
| } |
| } |
| } |
| |
| /********************Testing Utils**********************************/ |
| |
| private static void indexDocs(IndexWriter writer) throws Exception { |
| Random rnd = random(); |
| |
| /** |
| * Generate 10 documents where term n has a docFreq of n and a totalTermFreq of n*2 (squared). |
| */ |
| for (int i = 1; i <= 10; i++) { |
| Document doc = new Document(); |
| String content = getContent(i); |
| |
| doc.add(newTextField(rnd, "FIELD_1", content, Field.Store.YES)); |
| //add a different field |
| doc.add(newTextField(rnd, "different_field", "diff", Field.Store.YES)); |
| writer.addDocument(doc); |
| } |
| |
| //add 10 more docs with the term "diff" this will make it have the highest docFreq if we don't ask for the |
| //highest freq terms for a specific field. |
| for (int i = 1; i <= 10; i++) { |
| Document doc = new Document(); |
| doc.add(newTextField(rnd, "different_field", "diff", Field.Store.YES)); |
| writer.addDocument(doc); |
| } |
| // add some docs where tf < df so we can see if sorting works |
| // highTF low df |
| int highTF = 200; |
| Document doc = new Document(); |
| String content = ""; |
| for (int i = 0; i < highTF; i++) { |
| content += "highTF "; |
| } |
| doc.add(newTextField(rnd, "FIELD_1", content, Field.Store.YES)); |
| writer.addDocument(doc); |
| // highTF medium df =5 |
| int medium_df = 5; |
| for (int i = 0; i < medium_df; i++) { |
| int tf = 25; |
| Document newdoc = new Document(); |
| String newcontent = ""; |
| for (int j = 0; j < tf; j++) { |
| newcontent += "highTFmedDF "; |
| } |
| newdoc.add(newTextField(rnd, "FIELD_1", newcontent, Field.Store.YES)); |
| writer.addDocument(newdoc); |
| } |
| // add a doc with high tf in field different_field |
| int targetTF =150; |
| doc = new Document(); |
| content = ""; |
| for (int i = 0; i < targetTF; i++) { |
| content += "TF150 "; |
| } |
| doc.add(newTextField(rnd, "different_field", content, Field.Store.YES)); |
| writer.addDocument(doc); |
| writer.close(); |
| |
| } |
| |
| /** |
| * getContent |
| * return string containing numbers 1 to i with each number n occurring n times. |
| * i.e. for input of 3 return string "3 3 3 2 2 1" |
| */ |
| |
| private static String getContent(int i) { |
| String s = ""; |
| for (int j = 10; j >= i; j--) { |
| for (int k = 0; k < j; k++) { |
| // if j is 3 we return "3 3 3" |
| s += String.valueOf(j) + " "; |
| } |
| } |
| return s; |
| } |
| |
| private static int getExpectedtotalTermFreq(int i) { |
| return getExpecteddocFreq(i) * i; |
| } |
| |
| private static int getExpecteddocFreq(int i) { |
| return i; |
| } |
| } |