| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.misc; |
| |
| import java.io.IOException; |
| import java.nio.file.Paths; |
| import java.util.Collection; |
| import java.util.Comparator; |
| import java.util.Locale; |
| |
| import org.apache.lucene.index.DirectoryReader; |
| import org.apache.lucene.index.FieldInfos; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.MultiTerms; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.FSDirectory; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.PriorityQueue; |
| import org.apache.lucene.util.SuppressForbidden; |
| |
| /** |
| * <code>HighFreqTerms</code> class extracts the top n most frequent terms |
| * (by document frequency) from an existing Lucene index and reports their |
| * document frequency. |
| * <p> |
| * If the -t flag is given, both document frequency and total tf (total |
| * number of occurrences) are reported, ordered by descending total tf. |
| * |
| */ |
| public class HighFreqTerms { |
| |
| // The top numTerms will be displayed |
| public static final int DEFAULT_NUMTERMS = 100; |
| |
| @SuppressForbidden(reason = "System.out required: command line tool") |
| public static void main(String[] args) throws Exception { |
| String field = null; |
| int numTerms = DEFAULT_NUMTERMS; |
| |
| if (args.length == 0 || args.length > 4) { |
| usage(); |
| System.exit(1); |
| } |
| |
| Directory dir = FSDirectory.open(Paths.get(args[0])); |
| |
| Comparator<TermStats> comparator = new DocFreqComparator(); |
| |
| for (int i = 1; i < args.length; i++) { |
| if (args[i].equals("-t")) { |
| comparator = new TotalTermFreqComparator(); |
| } |
| else{ |
| try { |
| numTerms = Integer.parseInt(args[i]); |
| } catch (NumberFormatException e) { |
| field=args[i]; |
| } |
| } |
| } |
| |
| IndexReader reader = DirectoryReader.open(dir); |
| TermStats[] terms = getHighFreqTerms(reader, numTerms, field, comparator); |
| |
| for (int i = 0; i < terms.length; i++) { |
| System.out.printf(Locale.ROOT, "%s:%s \t totalTF = %,d \t docFreq = %,d \n", |
| terms[i].field, terms[i].termtext.utf8ToString(), terms[i].totalTermFreq, terms[i].docFreq); |
| } |
| reader.close(); |
| } |
| |
| @SuppressForbidden(reason = "System.out required: command line tool") |
| private static void usage() { |
| System.out |
| .println("\n\n" |
| + "java org.apache.lucene.misc.HighFreqTerms <index dir> [-t] [number_terms] [field]\n\t -t: order by totalTermFreq\n\n"); |
| } |
| |
| /** |
| * Returns TermStats[] ordered by the specified comparator |
| */ |
| public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field, Comparator<TermStats> comparator) throws Exception { |
| TermStatsQueue tiq = null; |
| |
| if (field != null) { |
| Terms terms = MultiTerms.getTerms(reader, field); |
| if (terms == null) { |
| throw new RuntimeException("field " + field + " not found"); |
| } |
| |
| TermsEnum termsEnum = terms.iterator(); |
| tiq = new TermStatsQueue(numTerms, comparator); |
| tiq.fill(field, termsEnum); |
| } else { |
| Collection<String> fields = FieldInfos.getIndexedFields(reader); |
| if (fields.size() == 0) { |
| throw new RuntimeException("no fields found for this index"); |
| } |
| tiq = new TermStatsQueue(numTerms, comparator); |
| for (String fieldName : fields) { |
| Terms terms = MultiTerms.getTerms(reader, fieldName); |
| if (terms != null) { |
| tiq.fill(fieldName, terms.iterator()); |
| } |
| } |
| } |
| |
| TermStats[] result = new TermStats[tiq.size()]; |
| // we want highest first so we read the queue and populate the array |
| // starting at the end and work backwards |
| int count = tiq.size() - 1; |
| while (tiq.size() != 0) { |
| result[count] = tiq.pop(); |
| count--; |
| } |
| return result; |
| } |
| |
| /** |
| * Compares terms by docTermFreq |
| */ |
| public static final class DocFreqComparator implements Comparator<TermStats> { |
| |
| @Override |
| public int compare(TermStats a, TermStats b) { |
| int res = Long.compare(a.docFreq, b.docFreq); |
| if (res == 0) { |
| res = a.field.compareTo(b.field); |
| if (res == 0) { |
| res = a.termtext.compareTo(b.termtext); |
| } |
| } |
| return res; |
| } |
| } |
| |
| /** |
| * Compares terms by totalTermFreq |
| */ |
| public static final class TotalTermFreqComparator implements Comparator<TermStats> { |
| |
| @Override |
| public int compare(TermStats a, TermStats b) { |
| int res = Long.compare(a.totalTermFreq, b.totalTermFreq); |
| if (res == 0) { |
| res = a.field.compareTo(b.field); |
| if (res == 0) { |
| res = a.termtext.compareTo(b.termtext); |
| } |
| } |
| return res; |
| } |
| } |
| |
| /** |
| * Priority queue for TermStats objects |
| **/ |
| static final class TermStatsQueue extends PriorityQueue<TermStats> { |
| final Comparator<TermStats> comparator; |
| |
| TermStatsQueue(int size, Comparator<TermStats> comparator) { |
| super(size); |
| this.comparator = comparator; |
| } |
| |
| @Override |
| protected boolean lessThan(TermStats termInfoA, TermStats termInfoB) { |
| return comparator.compare(termInfoA, termInfoB) < 0; |
| } |
| |
| protected void fill(String field, TermsEnum termsEnum) throws IOException { |
| BytesRef term = null; |
| while ((term = termsEnum.next()) != null) { |
| insertWithOverflow(new TermStats(field, term, termsEnum.docFreq(), termsEnum.totalTermFreq())); |
| } |
| } |
| } |
| } |