/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.eval.app.tools;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
//SlowCompositeReaderWrapper is provided by the lucene-misc module
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.tika.eval.core.tokens.AnalyzerManager;
import org.apache.tika.eval.core.tokens.URLEmailNormalizingFilterFactory;
import org.apache.tika.utils.ProcessUtils;
/**
* Utility class that reads in UTF-8 input files with one document per row
* and outputs the TOP_N (currently 30,000) tokens with the highest document frequencies.
* <p>
* The CommonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
* but it does include bigrams for CJK.
* <p>
* It also has an include list for __email__ and __url__ and a skip list
* for common HTML markup terms.
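* <p>
* Example invocation (class path and file names are illustrative only):
* <pre>
* java -cp tika-eval-app-X.Y.Z.jar org.apache.tika.eval.app.tools.TopCommonTokenCounter \
*     common_tokens/eng eng-sentences.txt.gz eng_wikipedia.txt.gz
* </pre>
* The first argument is the common tokens file to write; any remaining arguments are
* the input files to read.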
*/
public class TopCommonTokenCounter {
private static final String FIELD = "f";
//these should exist in every list
static final Set<String> INCLUDE_LIST = new HashSet<>(Arrays.asList(
URLEmailNormalizingFilterFactory.URL,
URLEmailNormalizingFilterFactory.EMAIL));
//words to ignore
//these are common 4 letter html markup words that we do
//not want to count in case of failed markup processing.
//see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
static final Set<String> SKIP_LIST = new HashSet<>(
Arrays.asList("span", "table", "href", "head", "title", "body", "html", "tagname",
"lang", "style", "script", "strong", "blockquote", "form", "iframe", "section",
"colspan", "rowspan"));
private static final String LICENSE =
"# Licensed to the Apache Software Foundation (ASF) under one or more\n" +
"# contributor license agreements. See the NOTICE file distributed with\n" +
"# this work for additional information regarding copyright ownership.\n" +
"# The ASF licenses this file to You under the Apache License, Version 2.0\n" +
"# (the \"License\"); you may not use this file except in compliance with\n" +
"# the License. You may obtain a copy of the License at\n" + "#\n" +
"# http://www.apache.org/licenses/LICENSE-2.0\n" + "#\n" +
"# Unless required by applicable law or agreed to in writing, software\n" +
"# distributed under the License is distributed on an \"AS IS\" BASIS,\n" +
"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" +
"# See the License for the specific language governing permissions and\n" +
"# limitations under the License.\n" + "#\n";
private static final int TOP_N = 30000;
private static final int MIN_DOC_FREQ = 10;
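//args[0] is the common tokens file to create; args[1..n] are UTF-8 input files,
//one document per line, optionally gzip compressed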
public static void main(String[] args) throws Exception {
Path commonTokensFile = Paths.get(args[0]);
List<Path> inputFiles = new ArrayList<>();
for (int i = 1; i < args.length; i++) {
inputFiles.add(Paths.get(ProcessUtils.unescapeCommandLine(args[i])));
}
TopCommonTokenCounter counter = new TopCommonTokenCounter();
if (Files.exists(commonTokensFile)) {
System.err.println(
commonTokensFile.getFileName().toString() + " exists. I'm skipping this.");
return;
}
counter.execute(commonTokensFile, inputFiles);
}
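//writes the license header, corpus-level stats, the tokens from the include list,
//and then one token<TAB>docfreq<TAB>termfreq row per queue entry in descending
//document frequency order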
private static void writeTopN(Path path, long totalDocs, long sumDocFreqs,
long sumTotalTermFreqs, long uniqueTerms,
AbstractTokenTFDFPriorityQueue queue) throws IOException {
if (Files.isRegularFile(path)) {
System.err.println("File " + path.getFileName() + " already exists. Skipping.");
return;
}
Files.createDirectories(path.getParent());
try (BufferedWriter writer = Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
StringBuilder sb = new StringBuilder();
writer.write(LICENSE);
writer.write("#DOC_COUNT\t" + totalDocs + "\n");
writer.write("#SUM_DOC_FREQS\t" + sumDocFreqs + "\n");
writer.write("#SUM_TERM_FREQS\t" + sumTotalTermFreqs + "\n");
writer.write("#UNIQUE_TERMS\t" + uniqueTerms + "\n");
writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
//add these tokens no matter what
for (String t : INCLUDE_LIST) {
writer.write(t);
writer.newLine();
}
for (TokenDFTF tp : queue.getArray()) {
writer.write(getRow(sb, tp) + "\n");
}
writer.flush();
}
}
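//formats a single output row as token<TAB>docfreq<TAB>termfreq, reusing the caller's StringBuilder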
private static String getRow(StringBuilder sb, TokenDFTF tp) {
sb.setLength(0);
sb.append(clean(tp.token));
sb.append("\t").append(tp.df);
sb.append("\t").append(tp.tf);
return sb.toString();
}
private static String clean(String s) {
if (s == null) {
return "";
}
return s.replaceAll("\\s+", " ").trim();
}
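//Two passes: first, index every input line as a single-field document in a temporary
//Lucene index; second, walk that index's term dictionary and collect the highest
//document-frequency tokens in a bounded priority queue. The temporary index is
//deleted when finished.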
private void execute(Path commonTokensFile, List<Path> inputFiles) throws Exception {
Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
long totalDocs = -1;
long sumDocFreqs = -1;
long sumTotalTermFreqs = -1;
long uniqueTerms = -1;
try (Directory directory = FSDirectory.open(luceneDir)) {
AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
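//buffer documents in memory and flush them to the index roughly every million characters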
int maxLen = 1000000;
int len = 0;
try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
List<Document> docs = new ArrayList<>();
for (Path inputFile : inputFiles) {
//special case for Leipzig corpora "-sentences.txt" files: each line is assumed to be
//"<id>\t<sentence>", so everything before the first tab is stripped before indexing
boolean isLeipzig = false;
if (inputFile.getFileName().toString().contains("-sentences.txt")) {
isLeipzig = true;
}
int lines = 0;
try (BufferedReader reader = getReader(inputFile)) {
String line = reader.readLine();
while (line != null) {
if (isLeipzig) {
int tab = line.indexOf("\t");
if (tab > -1) {
line = line.substring(tab + 1);
}
}
len += line.length();
Document document = new Document();
document.add(new TextField(FIELD, line, Field.Store.NO));
docs.add(document);
if (len > maxLen) {
writer.addDocuments(docs);
docs.clear();
len = 0;
}
line = reader.readLine();
if (++lines % 100000 == 0) {
System.out.println(
"processed " + lines + " for " + inputFile.getFileName() +
" :: " + commonTokensFile.toAbsolutePath());
}
}
}
}
if (docs.size() > 0) {
writer.addDocuments(docs);
}
writer.commit();
writer.flush();
}
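//second pass: open a composite (single-leaf) view of the temporary index and walk its term dictionary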
try (IndexReader reader = DirectoryReader.open(directory)) {
LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
totalDocs = wrappedReader.getDocCount(FIELD);
sumDocFreqs = wrappedReader.getSumDocFreq(FIELD);
sumTotalTermFreqs = wrappedReader.getSumTotalTermFreq(FIELD);
Terms terms = wrappedReader.terms(FIELD);
TermsEnum termsEnum = terms.iterator();
BytesRef bytesRef = termsEnum.next();
//start counting unique terms from zero now that the index is readable
uniqueTerms = 0;
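//keep terms that meet the MIN_DOC_FREQ threshold and are not in the skip list;
//insertWithOverflow retains only the TOP_N terms with the highest document frequencies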
while (bytesRef != null) {
uniqueTerms++;
int df = termsEnum.docFreq();
long tf = termsEnum.totalTermFreq();
if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
bytesRef = termsEnum.next();
continue;
}
if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) {
String t = bytesRef.utf8ToString();
if (!SKIP_LIST.contains(t)) {
queue.insertWithOverflow(new TokenDFTF(t, df, tf));
}
}
bytesRef = termsEnum.next();
}
}
} finally {
FileUtils.deleteDirectory(luceneDir.toFile());
}
writeTopN(commonTokensFile, totalDocs, sumDocFreqs, sumTotalTermFreqs, uniqueTerms, queue);
}
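//opens a UTF-8 reader over the input file, transparently decompressing ".gz" files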
private BufferedReader getReader(Path inputFile) throws IOException {
InputStream is = Files.newInputStream(inputFile);
if (inputFile.toString().endsWith(".gz")) {
is = new GzipCompressorInputStream(is);
}
return new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
}
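//bounded priority queue (backed by Lucene's PriorityQueue) holding the current top tokens;
//getArray() drains the queue from least to greatest and reverses the result so callers
//receive tokens sorted by descending document frequency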
private abstract class AbstractTokenTFDFPriorityQueue extends PriorityQueue<TokenDFTF> {
AbstractTokenTFDFPriorityQueue(int maxSize) {
super(maxSize);
}
public TokenDFTF[] getArray() {
TokenDFTF[] topN = new TokenDFTF[size()];
//now we reverse the queue
TokenDFTF term = pop();
int i = topN.length - 1;
while (term != null && i > -1) {
topN[i--] = term;
term = pop();
}
return topN;
}
}
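//simple value object holding a token, its document frequency (df), and its total term frequency (tf)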
private static class TokenDFTF {
final String token;
final int df;
final long tf;
public TokenDFTF(String token, int df, long tf) {
this.token = token;
this.df = df;
this.tf = tf;
}
public long getTF() {
return tf;
}
public int getDF() {
return df;
}
public String getToken() {
return token;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
TokenDFTF tokenDFTF = (TokenDFTF) o;
if (df != tokenDFTF.df) {
return false;
}
if (tf != tokenDFTF.tf) {
return false;
}
return Objects.equals(token, tokenDFTF.token);
}
@Override
public int hashCode() {
int result = token != null ? token.hashCode() : 0;
result = 31 * result + df;
result = 31 * result + (int) (tf ^ (tf >>> 32));
return result;
}
@Override
public String toString() {
return "TokenDFTF{" + "token='" + token + '\'' + ", df=" + df + ", tf=" + tf + '}';
}
}
private class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {
TokenDFPriorityQueue(int maxSize) {
super(maxSize);
}
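//rank primarily by document frequency; on a tie, the lexicographically greater token
//ranks lower and is therefore evicted from the queue first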
@Override
protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) {
if (arg0.df < arg1.df) {
return true;
} else if (arg0.df > arg1.df) {
return false;
}
return arg1.token.compareTo(arg0.token) < 0;
}
}
}