// lucene/misc/src/test/org/apache/lucene/misc/TestHighFreqTerms.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.misc;

 import java.util.Random;

 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TestUtil;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;

 /**
  * Tests for {@code HighFreqTerms.getHighFreqTerms}, run against a small fixed index that is
  * built once per class by {@code indexDocs}.
  *
  * <p>Index contents:
  * <ul>
  *   <li>{@code FIELD_1}: the numeric terms "1".."10", where term n has docFreq n and
  *       totalTermFreq n*n; "highTF" (docFreq 1, totalTermFreq 200); and "highTFmedDF"
  *       (docFreq 5, totalTermFreq 5 * 25 = 125).</li>
  *   <li>{@code different_field}: "diff" (docFreq 20, once per document) and "TF150"
  *       (docFreq 1, totalTermFreq 150).</li>
  * </ul>
  */
 public class TestHighFreqTerms extends LuceneTestCase {

   private static Directory dir = null;
   private static IndexReader reader = null;

   /**
    * Builds the shared index, opens a reader on it and runs an index consistency check.
    *
    * <p>The writer is only needed while indexing ({@code indexDocs} closes it), so it is kept
    * in a local variable rather than in a static field.
    */
   @BeforeClass
   public static void setUpClass() throws Exception {
     dir = newDirectory();
     IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(random(),
         new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false))
         .setMaxBufferedDocs(2));
     indexDocs(writer);
     reader = DirectoryReader.open(dir);
     TestUtil.checkIndex(dir);
   }

   /**
    * Releases the shared reader and directory and clears the static references.
    *
    * <p>Null checks are needed because this method also runs when {@link #setUpClass()} failed
    * part-way; the nested {@code finally} blocks make sure the directory is closed and the
    * fields are cleared even if closing the reader throws.
    */
   @AfterClass
   public static void tearDownClass() throws Exception {
     try {
       if (reader != null) {
         reader.close();
       }
     } finally {
       try {
         if (dir != null) {
           dir.close();
         }
       } finally {
         dir = null;
         reader = null;
       }
     }
   }

   /******************** Tests for getHighFreqTerms **********************************/

   // test without specifying field (i.e. if we pass in field=null it should examine all fields)
   // the term "diff" in the field "different_field" occurs in 20 documents and is the highest df term
   public void testFirstTermHighestDocFreqAllFields() throws Exception {
     int numTerms = 12;
     String field = null;
     TermStats[] terms = HighFreqTerms.getHighFreqTerms(
         reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
     assertEquals("Term with highest docfreq is first", 20, terms[0].docFreq);
   }

   // restricted to FIELD_1 the highest df term is "10", which occurs in 10 documents
   public void testFirstTermHighestDocFreq() throws Exception {
     int numTerms = 12;
     String field = "FIELD_1";
     TermStats[] terms = HighFreqTerms.getHighFreqTerms(
         reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
     assertEquals("Term with highest docfreq is first", 10, terms[0].docFreq);
   }

   public void testOrderedByDocFreqDescending() throws Exception {
     int numTerms = 12;
     String field = "FIELD_1";
     TermStats[] terms = HighFreqTerms.getHighFreqTerms(
         reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
     // start at 1: every entry is compared with its predecessor
     for (int i = 1; i < terms.length; i++) {
       assertTrue("out of order " + terms[i - 1].docFreq + "should be >= " + terms[i].docFreq,
           terms[i - 1].docFreq >= terms[i].docFreq);
     }
   }

   // the index holds more than 12 distinct terms, so exactly numTerms entries must come back
   public void testNumTerms() throws Exception {
     int numTerms = 12;
     String field = null;
     TermStats[] terms = HighFreqTerms.getHighFreqTerms(
         reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
     assertEquals("length of terms array equals numTerms :" + numTerms, numTerms, terms.length);
   }

   public void testGetHighFreqTerms() throws Exception {
     int numTerms = 12;
     String field = "FIELD_1";
     TermStats[] terms = HighFreqTerms.getHighFreqTerms(
         reader, numTerms, field, new HighFreqTerms.DocFreqComparator());

     for (TermStats stats : terms) {
       String termtext = stats.termtext.utf8ToString();
       // hardcoded highTF or highTFmedDF
       if (termtext.contains("highTF")) {
         if (termtext.contains("medDF")) {
           assertEquals("doc freq is not as expected", 5, stats.docFreq);
         } else {
           assertEquals("doc freq is not as expected", 1, stats.docFreq);
         }
       } else {
         // every other FIELD_1 term is a number n whose docFreq is n
         int n = Integer.parseInt(termtext);
         assertEquals("doc freq is not as expected", getExpecteddocFreq(n), stats.docFreq);
       }
     }
   }

   /********************Test sortByTotalTermFreq**********************************/

   // across all fields "highTF" (200 occurrences in one document) has the highest totalTermFreq
   public void testFirstTermHighestTotalTermFreq() throws Exception {
     int numTerms = 20;
     String field = null;
     TermStats[] terms = HighFreqTerms.getHighFreqTerms(
         reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
     assertEquals("Term with highest totalTermFreq is first", 200, terms[0].totalTermFreq);
   }

   // within different_field "TF150" (150 occurrences in one document) beats "diff" (20 occurrences)
   public void testFirstTermHighestTotalTermFreqDifferentField() throws Exception {
     int numTerms = 20;
     String field = "different_field";
     TermStats[] terms = HighFreqTerms.getHighFreqTerms(
         reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
     assertEquals("Term with highest totalTermFreq is first" + terms[0].getTermText(),
         150, terms[0].totalTermFreq);
   }

   public void testOrderedByTermFreqDescending() throws Exception {
     int numTerms = 12;
     String field = "FIELD_1";
     TermStats[] terms = HighFreqTerms.getHighFreqTerms(
         reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());

     // check that they are sorted by descending totalTermFreq;
     // start at 1: every entry is compared with its predecessor
     for (int i = 1; i < terms.length; i++) {
       assertTrue("out of order" + terms[i - 1] + " > " + terms[i],
           terms[i - 1].totalTermFreq >= terms[i].totalTermFreq);
     }
   }

   public void testGetTermFreqOrdered() throws Exception {
     int numTerms = 12;
     String field = "FIELD_1";
     TermStats[] terms = HighFreqTerms.getHighFreqTerms(
         reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());

     for (TermStats stats : terms) {
       String text = stats.termtext.utf8ToString();
       if (text.contains("highTF")) {
         if (text.contains("medDF")) {
           // 5 documents with 25 occurrences each
           assertEquals("total term freq is expected", 125, stats.totalTermFreq);
         } else {
           assertEquals("total term freq is expected", 200, stats.totalTermFreq);
         }
       } else {
         int n = Integer.parseInt(text);
         assertEquals("doc freq is expected", getExpecteddocFreq(n), stats.docFreq);
         assertEquals("total term freq is expected", getExpectedtotalTermFreq(n),
             stats.totalTermFreq);
       }
     }
   }

   /********************Testing Utils**********************************/

   /**
    * Adds the fixture documents described in the class comment to {@code writer} and then
    * closes the writer.
    *
    * @param writer the writer to index into; it is closed before this method returns normally
    */
   private static void indexDocs(IndexWriter writer) throws Exception {
     Random rnd = random();

     // Generate 10 documents where term n has a docFreq of n and a totalTermFreq of n*n
     // (n squared): document i contains every number from 10 down to i, each n repeated n times.
     for (int i = 1; i <= 10; i++) {
       Document doc = new Document();
       String content = getContent(i);

       doc.add(newTextField(rnd, "FIELD_1", content, Field.Store.YES));
       // add a different field
       doc.add(newTextField(rnd, "different_field", "diff", Field.Store.YES));
       writer.addDocument(doc);
     }

     // add 10 more docs with the term "diff"; this will make it have the highest docFreq if we
     // don't ask for the highest freq terms for a specific field.
     for (int i = 1; i <= 10; i++) {
       Document doc = new Document();
       doc.add(newTextField(rnd, "different_field", "diff", Field.Store.YES));
       writer.addDocument(doc);
     }

     // add some docs where the term frequency is much higher than the doc frequency, so we can
     // see that sorting by totalTermFreq differs from sorting by docFreq
     // highTF low df
     int highTF = 200;
     Document doc = new Document();
     doc.add(newTextField(rnd, "FIELD_1", repeat("highTF ", highTF), Field.Store.YES));
     writer.addDocument(doc);

     // highTF medium df = 5
     int medium_df = 5;
     int tf = 25;
     for (int i = 0; i < medium_df; i++) {
       Document newdoc = new Document();
       newdoc.add(newTextField(rnd, "FIELD_1", repeat("highTFmedDF ", tf), Field.Store.YES));
       writer.addDocument(newdoc);
     }

     // add a doc with high tf in field different_field
     int targetTF = 150;
     doc = new Document();
     doc.add(newTextField(rnd, "different_field", repeat("TF150 ", targetTF), Field.Store.YES));
     writer.addDocument(doc);
     writer.close();
   }

   /**
    * Returns {@code token} concatenated {@code count} times.
    *
    * <p>Uses a {@link StringBuilder} instead of repeated {@code String} concatenation, which
    * would copy the growing string on every iteration.
    */
   private static String repeat(String token, int count) {
     StringBuilder sb = new StringBuilder(token.length() * count);
     for (int i = 0; i < count; i++) {
       sb.append(token);
     }
     return sb.toString();
   }

   /**
    * Returns a string containing the numbers from 10 down to {@code i}, with each number n
    * occurring n times and every occurrence followed by a single space.
    *
    * <p>For example, an input of 9 returns ten "10 " tokens followed by nine "9 " tokens.
    */
   private static String getContent(int i) {
     StringBuilder sb = new StringBuilder();
     for (int j = 10; j >= i; j--) {
       for (int k = 0; k < j; k++) {
         // if j is 3 this appends "3 3 3 "
         sb.append(j).append(' ');
       }
     }
     return sb.toString();
   }

   /** Expected totalTermFreq of numeric term {@code i}: it occurs i times in each of i docs. */
   private static int getExpectedtotalTermFreq(int i) {
     return getExpecteddocFreq(i) * i;
   }

   /** Expected docFreq of numeric term {@code i}: it occurs in documents 1 through i. */
   private static int getExpecteddocFreq(int i) {
     return i;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.misc;

	import java.util.Random;

	import org.apache.lucene.analysis.MockAnalyzer;
	import org.apache.lucene.analysis.MockTokenizer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field;
	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.IndexWriter;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.util.LuceneTestCase;
	import org.apache.lucene.util.TestUtil;
	import org.junit.AfterClass;
	import org.junit.BeforeClass;

	/**
	 * Tests for {@code HighFreqTerms.getHighFreqTerms}, run against a small fixed index that is
	 * built once per class by {@code indexDocs}.
	 *
	 * <p>Index contents:
	 * <ul>
	 *   <li>{@code FIELD_1}: the numeric terms "1".."10", where term n has docFreq n and
	 *       totalTermFreq n*n; "highTF" (docFreq 1, totalTermFreq 200); and "highTFmedDF"
	 *       (docFreq 5, totalTermFreq 5 * 25 = 125).</li>
	 *   <li>{@code different_field}: "diff" (docFreq 20, once per document) and "TF150"
	 *       (docFreq 1, totalTermFreq 150).</li>
	 * </ul>
	 */
	public class TestHighFreqTerms extends LuceneTestCase {

	  private static Directory dir = null;
	  private static IndexReader reader = null;

	  /**
	   * Builds the shared index, opens a reader on it and runs an index consistency check.
	   *
	   * <p>The writer is only needed while indexing ({@code indexDocs} closes it), so it is kept
	   * in a local variable rather than in a static field.
	   */
	  @BeforeClass
	  public static void setUpClass() throws Exception {
	    dir = newDirectory();
	    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(random(),
	        new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false))
	        .setMaxBufferedDocs(2));
	    indexDocs(writer);
	    reader = DirectoryReader.open(dir);
	    TestUtil.checkIndex(dir);
	  }

	  /**
	   * Releases the shared reader and directory and clears the static references.
	   *
	   * <p>Null checks are needed because this method also runs when {@link #setUpClass()} failed
	   * part-way; the nested {@code finally} blocks make sure the directory is closed and the
	   * fields are cleared even if closing the reader throws.
	   */
	  @AfterClass
	  public static void tearDownClass() throws Exception {
	    try {
	      if (reader != null) {
	        reader.close();
	      }
	    } finally {
	      try {
	        if (dir != null) {
	          dir.close();
	        }
	      } finally {
	        dir = null;
	        reader = null;
	      }
	    }
	  }

	  /****************** Tests for getHighFreqTerms ********************************/

	  // test without specifying field (i.e. if we pass in field=null it should examine all fields)
	  // the term "diff" in the field "different_field" occurs in 20 documents and is the highest df term
	  public void testFirstTermHighestDocFreqAllFields() throws Exception {
	    int numTerms = 12;
	    String field = null;
	    TermStats[] terms = HighFreqTerms.getHighFreqTerms(
	        reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
	    assertEquals("Term with highest docfreq is first", 20, terms[0].docFreq);
	  }

	  // restricted to FIELD_1 the highest df term is "10", which occurs in 10 documents
	  public void testFirstTermHighestDocFreq() throws Exception {
	    int numTerms = 12;
	    String field = "FIELD_1";
	    TermStats[] terms = HighFreqTerms.getHighFreqTerms(
	        reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
	    assertEquals("Term with highest docfreq is first", 10, terms[0].docFreq);
	  }

	  public void testOrderedByDocFreqDescending() throws Exception {
	    int numTerms = 12;
	    String field = "FIELD_1";
	    TermStats[] terms = HighFreqTerms.getHighFreqTerms(
	        reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
	    // start at 1: every entry is compared with its predecessor
	    for (int i = 1; i < terms.length; i++) {
	      assertTrue("out of order " + terms[i - 1].docFreq + "should be >= " + terms[i].docFreq,
	          terms[i - 1].docFreq >= terms[i].docFreq);
	    }
	  }

	  // the index holds more than 12 distinct terms, so exactly numTerms entries must come back
	  public void testNumTerms() throws Exception {
	    int numTerms = 12;
	    String field = null;
	    TermStats[] terms = HighFreqTerms.getHighFreqTerms(
	        reader, numTerms, field, new HighFreqTerms.DocFreqComparator());
	    assertEquals("length of terms array equals numTerms :" + numTerms, numTerms, terms.length);
	  }

	  public void testGetHighFreqTerms() throws Exception {
	    int numTerms = 12;
	    String field = "FIELD_1";
	    TermStats[] terms = HighFreqTerms.getHighFreqTerms(
	        reader, numTerms, field, new HighFreqTerms.DocFreqComparator());

	    for (TermStats stats : terms) {
	      String termtext = stats.termtext.utf8ToString();
	      // hardcoded highTF or highTFmedDF
	      if (termtext.contains("highTF")) {
	        if (termtext.contains("medDF")) {
	          assertEquals("doc freq is not as expected", 5, stats.docFreq);
	        } else {
	          assertEquals("doc freq is not as expected", 1, stats.docFreq);
	        }
	      } else {
	        // every other FIELD_1 term is a number n whose docFreq is n
	        int n = Integer.parseInt(termtext);
	        assertEquals("doc freq is not as expected", getExpecteddocFreq(n), stats.docFreq);
	      }
	    }
	  }

	  /******************Test sortByTotalTermFreq********************************/

	  // across all fields "highTF" (200 occurrences in one document) has the highest totalTermFreq
	  public void testFirstTermHighestTotalTermFreq() throws Exception {
	    int numTerms = 20;
	    String field = null;
	    TermStats[] terms = HighFreqTerms.getHighFreqTerms(
	        reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
	    assertEquals("Term with highest totalTermFreq is first", 200, terms[0].totalTermFreq);
	  }

	  // within different_field "TF150" (150 occurrences in one document) beats "diff" (20 occurrences)
	  public void testFirstTermHighestTotalTermFreqDifferentField() throws Exception {
	    int numTerms = 20;
	    String field = "different_field";
	    TermStats[] terms = HighFreqTerms.getHighFreqTerms(
	        reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());
	    assertEquals("Term with highest totalTermFreq is first" + terms[0].getTermText(),
	        150, terms[0].totalTermFreq);
	  }

	  public void testOrderedByTermFreqDescending() throws Exception {
	    int numTerms = 12;
	    String field = "FIELD_1";
	    TermStats[] terms = HighFreqTerms.getHighFreqTerms(
	        reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());

	    // check that they are sorted by descending totalTermFreq;
	    // start at 1: every entry is compared with its predecessor
	    for (int i = 1; i < terms.length; i++) {
	      assertTrue("out of order" + terms[i - 1] + " > " + terms[i],
	          terms[i - 1].totalTermFreq >= terms[i].totalTermFreq);
	    }
	  }

	  public void testGetTermFreqOrdered() throws Exception {
	    int numTerms = 12;
	    String field = "FIELD_1";
	    TermStats[] terms = HighFreqTerms.getHighFreqTerms(
	        reader, numTerms, field, new HighFreqTerms.TotalTermFreqComparator());

	    for (TermStats stats : terms) {
	      String text = stats.termtext.utf8ToString();
	      if (text.contains("highTF")) {
	        if (text.contains("medDF")) {
	          // 5 documents with 25 occurrences each
	          assertEquals("total term freq is expected", 125, stats.totalTermFreq);
	        } else {
	          assertEquals("total term freq is expected", 200, stats.totalTermFreq);
	        }
	      } else {
	        int n = Integer.parseInt(text);
	        assertEquals("doc freq is expected", getExpecteddocFreq(n), stats.docFreq);
	        assertEquals("total term freq is expected", getExpectedtotalTermFreq(n),
	            stats.totalTermFreq);
	      }
	    }
	  }

	  /******************Testing Utils********************************/

	  /**
	   * Adds the fixture documents described in the class comment to {@code writer} and then
	   * closes the writer.
	   *
	   * @param writer the writer to index into; it is closed before this method returns normally
	   */
	  private static void indexDocs(IndexWriter writer) throws Exception {
	    Random rnd = random();

	    // Generate 10 documents where term n has a docFreq of n and a totalTermFreq of n*n
	    // (n squared): document i contains every number from 10 down to i, each n repeated n times.
	    for (int i = 1; i <= 10; i++) {
	      Document doc = new Document();
	      String content = getContent(i);

	      doc.add(newTextField(rnd, "FIELD_1", content, Field.Store.YES));
	      // add a different field
	      doc.add(newTextField(rnd, "different_field", "diff", Field.Store.YES));
	      writer.addDocument(doc);
	    }

	    // add 10 more docs with the term "diff"; this will make it have the highest docFreq if we
	    // don't ask for the highest freq terms for a specific field.
	    for (int i = 1; i <= 10; i++) {
	      Document doc = new Document();
	      doc.add(newTextField(rnd, "different_field", "diff", Field.Store.YES));
	      writer.addDocument(doc);
	    }

	    // add some docs where the term frequency is much higher than the doc frequency, so we can
	    // see that sorting by totalTermFreq differs from sorting by docFreq
	    // highTF low df
	    int highTF = 200;
	    Document doc = new Document();
	    doc.add(newTextField(rnd, "FIELD_1", repeat("highTF ", highTF), Field.Store.YES));
	    writer.addDocument(doc);

	    // highTF medium df = 5
	    int medium_df = 5;
	    int tf = 25;
	    for (int i = 0; i < medium_df; i++) {
	      Document newdoc = new Document();
	      newdoc.add(newTextField(rnd, "FIELD_1", repeat("highTFmedDF ", tf), Field.Store.YES));
	      writer.addDocument(newdoc);
	    }

	    // add a doc with high tf in field different_field
	    int targetTF = 150;
	    doc = new Document();
	    doc.add(newTextField(rnd, "different_field", repeat("TF150 ", targetTF), Field.Store.YES));
	    writer.addDocument(doc);
	    writer.close();
	  }

	  /**
	   * Returns {@code token} concatenated {@code count} times.
	   *
	   * <p>Uses a {@link StringBuilder} instead of repeated {@code String} concatenation, which
	   * would copy the growing string on every iteration.
	   */
	  private static String repeat(String token, int count) {
	    StringBuilder sb = new StringBuilder(token.length() * count);
	    for (int i = 0; i < count; i++) {
	      sb.append(token);
	    }
	    return sb.toString();
	  }

	  /**
	   * Returns a string containing the numbers from 10 down to {@code i}, with each number n
	   * occurring n times and every occurrence followed by a single space.
	   *
	   * <p>For example, an input of 9 returns ten "10 " tokens followed by nine "9 " tokens.
	   */
	  private static String getContent(int i) {
	    StringBuilder sb = new StringBuilder();
	    for (int j = 10; j >= i; j--) {
	      for (int k = 0; k < j; k++) {
	        // if j is 3 this appends "3 3 3 "
	        sb.append(j).append(' ');
	      }
	    }
	    return sb.toString();
	  }

	  /** Expected totalTermFreq of numeric term {@code i}: it occurs i times in each of i docs. */
	  private static int getExpectedtotalTermFreq(int i) {
	    return getExpecteddocFreq(i) * i;
	  }

	  /** Expected docFreq of numeric term {@code i}: it occurs in documents 1 through i. */
	  private static int getExpecteddocFreq(int i) {
	    return i;
	  }
	}