/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.eval.app.tools;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
//SlowCompositeReaderWrapper is provided by the lucene-misc module
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.tika.eval.core.tokens.AnalyzerManager;
import org.apache.tika.eval.core.tokens.URLEmailNormalizingFilterFactory;
import org.apache.tika.utils.ProcessUtils;
/**
* Utility class that reads in UTF-8 input files with one document per row
* and outputs the TOP_N (currently 30,000) tokens with the highest document frequencies.
* <p>
* The CommonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
* but it does include bigrams for CJK.
* <p>
* It also has an include list for __email__ and __url__ and a skip list
* for common HTML markup terms.
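* <p>
* Example invocation (class path and file names are illustrative only):
* <pre>
* java -cp tika-eval-app-X.Y.Z.jar org.apache.tika.eval.app.tools.TopCommonTokenCounter \
*     common_tokens/eng eng-sentences.txt.gz eng_wikipedia.txt.gz
* </pre>
* The first argument is the common tokens file to write; any remaining arguments are
* the input files to read.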
*/
public class TopCommonTokenCounter {
private static final String FIELD = "f";
//these should exist in every list
static final Set<String> INCLUDE_LIST = new HashSet<>(Arrays.asList(
URLEmailNormalizingFilterFactory.URL,
URLEmailNormalizingFilterFactory.EMAIL));
//words to ignore
//these are common 4 letter html markup words that we do
//not want to count in case of failed markup processing.
//see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
static final Set<String> SKIP_LIST = new HashSet<>(
Arrays.asList("span", "table", "href", "head", "title", "body", "html", "tagname",
"lang", "style", "script", "strong", "blockquote", "form", "iframe", "section",
"colspan", "rowspan"));
private static final String LICENSE =
"# Licensed to the Apache Software Foundation (ASF) under one or more\n" +
"# contributor license agreements. See the NOTICE file distributed with\n" +
"# this work for additional information regarding copyright ownership.\n" +
"# The ASF licenses this file to You under the Apache License, Version 2.0\n" +
"# (the \"License\"); you may not use this file except in compliance with\n" +
"# the License. You may obtain a copy of the License at\n" + "#\n" +
"# http://www.apache.org/licenses/LICENSE-2.0\n" + "#\n" +
"# Unless required by applicable law or agreed to in writing, software\n" +
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n" +
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" +
"# See the License for the specific language governing permissions and\n" +
"# limitations under the License.\n" + "#\n";
private static final int TOP_N = 30000;
private static final int MIN_DOC_FREQ = 10;
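//args[0] is the common tokens file to create; args[1..n] are UTF-8 input files,
//one document per line, optionally gzip compressed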
public static void main(String[] args) throws Exception {
Path commonTokensFile = Paths.get(args[0]);
List<Path> inputFiles = new ArrayList<>();
for (int i = 1; i < args.length; i++) {
inputFiles.add(Paths.get(ProcessUtils.unescapeCommandLine(args[i])));
}
TopCommonTokenCounter counter = new TopCommonTokenCounter();
if (Files.exists(commonTokensFile)) {
System.err.println(
commonTokensFile.getFileName().toString() + " exists. I'm skipping this.");
return;
}
counter.execute(commonTokensFile, inputFiles);
}
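//writes the license header, corpus-level stats, the tokens from the include list,
//and then one token<TAB>docfreq<TAB>termfreq row per queue entry in descending
//document frequency order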
private static void writeTopN(Path path, long totalDocs, long sumDocFreqs,
long sumTotalTermFreqs, long uniqueTerms,
AbstractTokenTFDFPriorityQueue queue) throws IOException {
if (Files.isRegularFile(path)) {
System.err.println("File " + path.getFileName() + " already exists. Skipping.");
return;
}
Files.createDirectories(path.getParent());
try (BufferedWriter writer = Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
StringBuilder sb = new StringBuilder();
writer.write(LICENSE);
writer.write("#DOC_COUNT\t" + totalDocs + "\n");
writer.write("#SUM_DOC_FREQS\t" + sumDocFreqs + "\n");
writer.write("#SUM_TERM_FREQS\t" + sumTotalTermFreqs + "\n");
writer.write("#UNIQUE_TERMS\t" + uniqueTerms + "\n");
writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
//add these tokens no matter what
for (String t : INCLUDE_LIST) {
writer.write(t);
writer.newLine();
}
for (TokenDFTF tp : queue.getArray()) {
writer.write(getRow(sb, tp) + "\n");
}
writer.flush();
}
}
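//formats a single output row as token<TAB>docfreq<TAB>termfreq, reusing the caller's StringBuilder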
private static String getRow(StringBuilder sb, TokenDFTF tp) {
sb.setLength(0);
sb.append(clean(tp.token));
sb.append("\t").append(tp.df);
sb.append("\t").append(tp.tf);
return sb.toString();
}
private static String clean(String s) {
if (s == null) {
return "";
}
return s.replaceAll("\\s+", " ").trim();
}
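//Two passes: first, index every input line as a single-field document in a temporary
//Lucene index; second, walk that index's term dictionary and collect the highest
//document-frequency tokens in a bounded priority queue. The temporary index is
//deleted when finished.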
private void execute(Path commonTokensFile, List<Path> inputFiles) throws Exception {
Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
long totalDocs = -1;
long sumDocFreqs = -1;
long sumTotalTermFreqs = -1;
long uniqueTerms = -1;
try (Directory directory = FSDirectory.open(luceneDir)) {
AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
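//buffer documents in memory and flush them to the index roughly every million characters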
int maxLen = 1000000;
int len = 0;
try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
List<Document> docs = new ArrayList<>();
for (Path inputFile : inputFiles) {
//special case for Leipzig corpora "-sentences.txt" files: each line is assumed to be
//"<id>\t<sentence>", so everything before the first tab is stripped before indexing
boolean isLeipzig = false;
if (inputFile.getFileName().toString().contains("-sentences.txt")) {
isLeipzig = true;
}
int lines = 0;
try (BufferedReader reader = getReader(inputFile)) {
String line = reader.readLine();
while (line != null) {
if (isLeipzig) {
int tab = line.indexOf("\t");
if (tab > -1) {
line = line.substring(tab + 1);
}
}
len += line.length();
Document document = new Document();
document.add(new TextField(FIELD, line, Field.Store.NO));
docs.add(document);
if (len > maxLen) {
writer.addDocuments(docs);
docs.clear();
len = 0;
}
line = reader.readLine();
if (++lines % 100000 == 0) {
System.out.println(
"processed " + lines + " for " + inputFile.getFileName() +
" :: " + commonTokensFile.toAbsolutePath());
}
}
}
}
if (docs.size() > 0) {
writer.addDocuments(docs);
}
writer.commit();
writer.flush();
}
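//second pass: open a composite (single-leaf) view of the temporary index and walk its term dictionary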
try (IndexReader reader = DirectoryReader.open(directory)) {
LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
totalDocs = wrappedReader.getDocCount(FIELD);
sumDocFreqs = wrappedReader.getSumDocFreq(FIELD);
sumTotalTermFreqs = wrappedReader.getSumTotalTermFreq(FIELD);
Terms terms = wrappedReader.terms(FIELD);
TermsEnum termsEnum = terms.iterator();
BytesRef bytesRef = termsEnum.next();
//start counting unique terms from zero now that the index is readable
uniqueTerms = 0;
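//keep terms that meet the MIN_DOC_FREQ threshold and are not in the skip list;
//insertWithOverflow retains only the TOP_N terms with the highest document frequencies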
while (bytesRef != null) {
uniqueTerms++;
int df = termsEnum.docFreq();
long tf = termsEnum.totalTermFreq();
if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
bytesRef = termsEnum.next();
continue;
}
if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) {
String t = bytesRef.utf8ToString();
if (!SKIP_LIST.contains(t)) {
queue.insertWithOverflow(new TokenDFTF(t, df, tf));
}
}
bytesRef = termsEnum.next();
}
}
} finally {
FileUtils.deleteDirectory(luceneDir.toFile());
}
writeTopN(commonTokensFile, totalDocs, sumDocFreqs, sumTotalTermFreqs, uniqueTerms, queue);
}
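//opens a UTF-8 reader over the input file, transparently decompressing ".gz" files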
private BufferedReader getReader(Path inputFile) throws IOException {
InputStream is = Files.newInputStream(inputFile);
if (inputFile.toString().endsWith(".gz")) {
is = new GzipCompressorInputStream(is);
}
return new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
}
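//bounded priority queue (backed by Lucene's PriorityQueue) holding the current top tokens;
//getArray() drains the queue from least to greatest and reverses the result so callers
//receive tokens sorted by descending document frequency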
private abstract class AbstractTokenTFDFPriorityQueue extends PriorityQueue<TokenDFTF> {
AbstractTokenTFDFPriorityQueue(int maxSize) {
super(maxSize);
}
public TokenDFTF[] getArray() {
TokenDFTF[] topN = new TokenDFTF[size()];
//now we reverse the queue
TokenDFTF term = pop();
int i = topN.length - 1;
while (term != null && i > -1) {
topN[i--] = term;
term = pop();
}
return topN;
}
}
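//simple value object holding a token, its document frequency (df), and its total term frequency (tf)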
private static class TokenDFTF {
final String token;
final int df;
final long tf;
public TokenDFTF(String token, int df, long tf) {
this.token = token;
this.df = df;
this.tf = tf;
}
public long getTF() {
return tf;
}
public int getDF() {
return df;
}
public String getToken() {
return token;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
TokenDFTF tokenDFTF = (TokenDFTF) o;
if (df != tokenDFTF.df) {
return false;
}
if (tf != tokenDFTF.tf) {
return false;
}
return Objects.equals(token, tokenDFTF.token);
}
@Override
public int hashCode() {
int result = token != null ? token.hashCode() : 0;
result = 31 * result + df;
result = 31 * result + (int) (tf ^ (tf >>> 32));
return result;
}
@Override
public String toString() {
return "TokenDFTF{" + "token='" + token + '\'' + ", df=" + df + ", tf=" + tf + '}';
}
}
private class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {
TokenDFPriorityQueue(int maxSize) {
super(maxSize);
}
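//rank primarily by document frequency; on a tie, the lexicographically greater token
//ranks lower and is therefore evicted from the queue first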
@Override
protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) {
if (arg0.df < arg1.df) {
return true;
} else if (arg0.df > arg1.df) {
return false;
}
return arg1.token.compareTo(arg0.token) < 0;
}
}
}