lucene/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.benchmark.quality.utils;

 import java.io.IOException;
 import java.nio.file.Paths;

 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiTerms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.PriorityQueue;

 /**
  * Suggests quality-benchmark queries based on the contents of an index.
  *
  * <p>Utility class, used for making quality test benchmarks. It collects terms
  * of a field whose document frequency is below one tenth of the reader's
  * {@code maxDoc()}, keeps the ones with the highest document frequency,
  * combines them four at a time into query strings and prints every query
  * as a TREC-style topic.
  */
 public class QualityQueriesFinder {

   private static final String newline = System.getProperty("line.separator");

   /** Directory holding the index that is inspected; not closed by this class' instance methods. */
   private final Directory dir;

   /**
    * Constructor over a directory containing the index.
    * @param dir directory containing the index we search for the quality test.
    */
   private QualityQueriesFinder(Directory dir) {
     this.dir = dir;
   }

   /**
    * Prints 20 suggested queries for the "body" field of the given index,
    * each formatted as a TREC topic, to standard output.
    *
    * @param args {index-dir}
    * @throws IOException  if cannot access the index.
    */
   public static void main(String[] args) throws IOException {
     if (args.length<1) {
       System.err.println("Usage: java QualityQueriesFinder <index-dir>");
       System.exit(1);
     }
     // The directory is opened here, so it is also released here.
     try (Directory indexDir = FSDirectory.open(Paths.get(args[0]))) {
       QualityQueriesFinder qqf = new QualityQueriesFinder(indexDir);
       String[] q = qqf.bestQueries("body",20);
       for (int i=0; i<q.length; i++) {
         System.out.println(newline+formatQueryAsTrecTopic(i,q[i],null,null));
       }
     }
   }

   /**
    * Builds query strings of four terms each from the best terms of a field.
    *
    * <p>Up to {@code 4*numQueries} terms are requested; if fewer are available,
    * fewer queries ({@code terms/4}, rounded down) are returned.
    *
    * @param field index field whose terms are used
    * @param numQueries maximal number of queries to build
    * @return the query strings, possibly an empty array
    * @throws IOException if the index cannot be read
    */
   private String[] bestQueries(String field,int numQueries) throws IOException {
     // Use the requested field; it used to be hard-coded to "body".
     String[] words = bestTerms(field,4*numQueries);
     int n = words.length;
     int m = n/4;
     String[] res = new String[m];
     for (int i=0; i<res.length; i++) {
       // Two terms counted from the start of the array, two mirrored from its end.
       res[i] = words[i] + " " + words[m+i]+ "  " + words[n-1-m-i]  + " " + words[n-1-i];
     }
     return res;
   }

   /**
    * Formats a query as a TREC topic; {@code null} text parts are rendered as empty strings.
    *
    * @param qnum topic number
    * @param title topic title (the query text), may be null
    * @param description topic description, may be null
    * @param narrative topic narrative, may be null
    * @return the topic text, from {@code <top>} to {@code </top>}
    */
   private static String formatQueryAsTrecTopic (int qnum, String title, String description, String narrative) {
     return
       "<top>" + newline +
       "<num> Number: " + qnum             + newline + newline +
       "<title> " + (title==null?"":title) + newline + newline +
       "<desc> Description:"               + newline +
       (description==null?"":description)  + newline + newline +
       "<narr> Narrative:"                 + newline +
       (narrative==null?"":narrative)      + newline + newline +
       "</top>";
   }

   /**
    * Collects up to {@code numTerms} terms of a field, skipping terms that are too common.
    *
    * <p>Each returned term is also printed, with its document frequency, to standard output.
    *
    * @param field index field whose terms are scanned
    * @param numTerms capacity of the queue that retains the terms
    * @return the retained terms in the order the queue pops them;
    *         empty if the field has no terms or none passes the threshold
    * @throws IOException if the index cannot be read
    */
   private String[] bestTerms(String field,int numTerms) throws IOException {
     PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
     // try-with-resources closes the reader on every path, like the former finally block.
     try (IndexReader ir = DirectoryReader.open(dir)) {
       int threshold = ir.maxDoc() / 10; // ignore words too common.
       Terms terms = MultiTerms.getTerms(ir, field);
       if (terms != null) {
         TermsEnum termsEnum = terms.iterator();
         while (termsEnum.next() != null) {
           int df = termsEnum.docFreq();
           if (df<threshold) {
             String ttxt = termsEnum.term().utf8ToString();
             pq.insertWithOverflow(new TermDf(ttxt,df));
           }
         }
       }
     }
     String[] res = new String[pq.size()];
     int i = 0;
     while (pq.size()>0) {
       TermDf tdf = pq.pop();
       res[i++] = tdf.word;
       // i was already advanced, so the printed rank is 1-based.
       System.out.println(i+".   word:  "+tdf.df+"   "+tdf.word);
     }
     return res;
   }

   /** A term text together with its document frequency. */
   private static class TermDf {
     final String word;
     final int df;
     TermDf (String word, int freq) {
       this.word = word;
       this.df = freq;
     }
   }

   /** Bounded queue of {@link TermDf} ordered by document frequency. */
   private static class TermsDfQueue extends PriorityQueue<TermDf> {
     TermsDfQueue (int maxSize) {
       super(maxSize);
     }
     @Override
     protected boolean lessThan(TermDf tf1, TermDf tf2) {
       return tf1.df < tf2.df;
     }
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.benchmark.quality.utils;

	import java.io.IOException;
	import java.nio.file.Paths;

	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.index.MultiTerms;
	import org.apache.lucene.index.TermsEnum;
	import org.apache.lucene.index.Terms;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.FSDirectory;
	import org.apache.lucene.util.PriorityQueue;

	/**
	 * Suggests quality-benchmark queries based on the contents of an index.
	 *
	 * <p>Utility class, used for making quality test benchmarks. It collects terms
	 * of a field whose document frequency is below one tenth of the reader's
	 * {@code maxDoc()}, keeps the ones with the highest document frequency,
	 * combines them four at a time into query strings and prints every query
	 * as a TREC-style topic.
	 */
	public class QualityQueriesFinder {

	  private static final String newline = System.getProperty("line.separator");

	  /** Directory holding the index that is inspected; not closed by this class' instance methods. */
	  private final Directory dir;

	  /**
	   * Constructor over a directory containing the index.
	   * @param dir directory containing the index we search for the quality test.
	   */
	  private QualityQueriesFinder(Directory dir) {
	    this.dir = dir;
	  }

	  /**
	   * Prints 20 suggested queries for the "body" field of the given index,
	   * each formatted as a TREC topic, to standard output.
	   *
	   * @param args {index-dir}
	   * @throws IOException if cannot access the index.
	   */
	  public static void main(String[] args) throws IOException {
	    if (args.length<1) {
	      System.err.println("Usage: java QualityQueriesFinder <index-dir>");
	      System.exit(1);
	    }
	    // The directory is opened here, so it is also released here.
	    try (Directory indexDir = FSDirectory.open(Paths.get(args[0]))) {
	      QualityQueriesFinder qqf = new QualityQueriesFinder(indexDir);
	      String[] q = qqf.bestQueries("body",20);
	      for (int i=0; i<q.length; i++) {
	        System.out.println(newline+formatQueryAsTrecTopic(i,q[i],null,null));
	      }
	    }
	  }

	  /**
	   * Builds query strings of four terms each from the best terms of a field.
	   *
	   * <p>Up to {@code 4*numQueries} terms are requested; if fewer are available,
	   * fewer queries ({@code terms/4}, rounded down) are returned.
	   *
	   * @param field index field whose terms are used
	   * @param numQueries maximal number of queries to build
	   * @return the query strings, possibly an empty array
	   * @throws IOException if the index cannot be read
	   */
	  private String[] bestQueries(String field,int numQueries) throws IOException {
	    // Use the requested field; it used to be hard-coded to "body".
	    String[] words = bestTerms(field,4*numQueries);
	    int n = words.length;
	    int m = n/4;
	    String[] res = new String[m];
	    for (int i=0; i<res.length; i++) {
	      // Two terms counted from the start of the array, two mirrored from its end.
	      res[i] = words[i] + " " + words[m+i]+ " " + words[n-1-m-i] + " " + words[n-1-i];
	    }
	    return res;
	  }

	  /**
	   * Formats a query as a TREC topic; {@code null} text parts are rendered as empty strings.
	   *
	   * @param qnum topic number
	   * @param title topic title (the query text), may be null
	   * @param description topic description, may be null
	   * @param narrative topic narrative, may be null
	   * @return the topic text, from {@code <top>} to {@code </top>}
	   */
	  private static String formatQueryAsTrecTopic (int qnum, String title, String description, String narrative) {
	    return
	      "<top>" + newline +
	      "<num> Number: " + qnum + newline + newline +
	      "<title> " + (title==null?"":title) + newline + newline +
	      "<desc> Description:" + newline +
	      (description==null?"":description) + newline + newline +
	      "<narr> Narrative:" + newline +
	      (narrative==null?"":narrative) + newline + newline +
	      "</top>";
	  }

	  /**
	   * Collects up to {@code numTerms} terms of a field, skipping terms that are too common.
	   *
	   * <p>Each returned term is also printed, with its document frequency, to standard output.
	   *
	   * @param field index field whose terms are scanned
	   * @param numTerms capacity of the queue that retains the terms
	   * @return the retained terms in the order the queue pops them;
	   *         empty if the field has no terms or none passes the threshold
	   * @throws IOException if the index cannot be read
	   */
	  private String[] bestTerms(String field,int numTerms) throws IOException {
	    PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
	    // try-with-resources closes the reader on every path, like the former finally block.
	    try (IndexReader ir = DirectoryReader.open(dir)) {
	      int threshold = ir.maxDoc() / 10; // ignore words too common.
	      Terms terms = MultiTerms.getTerms(ir, field);
	      if (terms != null) {
	        TermsEnum termsEnum = terms.iterator();
	        while (termsEnum.next() != null) {
	          int df = termsEnum.docFreq();
	          if (df<threshold) {
	            String ttxt = termsEnum.term().utf8ToString();
	            pq.insertWithOverflow(new TermDf(ttxt,df));
	          }
	        }
	      }
	    }
	    String[] res = new String[pq.size()];
	    int i = 0;
	    while (pq.size()>0) {
	      TermDf tdf = pq.pop();
	      res[i++] = tdf.word;
	      // i was already advanced, so the printed rank is 1-based.
	      System.out.println(i+". word: "+tdf.df+" "+tdf.word);
	    }
	    return res;
	  }

	  /** A term text together with its document frequency. */
	  private static class TermDf {
	    final String word;
	    final int df;
	    TermDf (String word, int freq) {
	      this.word = word;
	      this.df = freq;
	    }
	  }

	  /** Bounded queue of {@link TermDf} ordered by document frequency. */
	  private static class TermsDfQueue extends PriorityQueue<TermDf> {
	    TermsDfQueue (int maxSize) {
	      super(maxSize);
	    }
	    @Override
	    protected boolean lessThan(TermDf tf1, TermDf tf2) {
	      return tf1.df < tf2.df;
	    }
	  }

	}