// blob: 89e2649a6d8014527011fb683771cc7f6f5d62dc [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.jackrabbit.oak.plugins.tika;
import com.google.common.base.Stopwatch;
import com.google.common.io.Closer;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import static com.google.common.base.Charsets.UTF_8;
import static org.apache.jackrabbit.JcrConstants.JCR_PATH;
import static org.apache.jackrabbit.oak.plugins.index.search.FieldNames.FULLTEXT;
import static org.apache.jackrabbit.oak.plugins.index.search.FieldNames.PATH;
import static org.apache.jackrabbit.oak.plugins.tika.CSVFileBinaryResourceProvider.FORMAT;
/**
 * Feeds a {@link TextWriter} with text that is already stored in a Lucene index.
 *
 * <p>Each record of a CSV data file supplies a blob id and a JCR path. For every blob
 * the writer has not processed yet, the path is looked up in the index and the stored
 * fulltext value (if any) is written, marked empty, or marked as an error.
 */
class TextPopulator {
    private static final Logger log = LoggerFactory.getLogger(TextPopulator.class);

    /** Name of the CSV column from which the blob id is read. */
    static final String BLOB_ID = "blobId";

    /** Stored fulltext value that is treated as an extraction error marker. */
    static final String ERROR_TEXT = "TextExtractionError";

    private final TextWriter textWriter;
    private PopulatorStats stats;

    /**
     * @param textWriter receiver of the text (or empty/error markers) found per blob
     */
    TextPopulator(TextWriter textWriter) {
        this.textWriter = textWriter;
        this.stats = new PopulatorStats();
    }

    // exposed for test purposes only
    void setStats(PopulatorStats stats) {
        this.stats = stats;
    }

    /**
     * Iterates over all records of {@code dataFile} and populates the text writer from
     * the index located at {@code indexDir}. Counters are accumulated in the current
     * {@link PopulatorStats} and logged once at the end.
     *
     * @param dataFile CSV file parsed as UTF-8 using {@code CSVFileBinaryResourceProvider.FORMAT}
     * @param indexDir directory of the Lucene index to read the fulltext from
     * @throws IOException if the data file or index cannot be opened, or an I/O failure
     *                     propagates while processing a record or closing the resources
     */
    void populate(File dataFile, File indexDir) throws IOException {
        try (Closer closer = Closer.create()) {
            Iterable<CSVRecord> csvRecords = closer.register(CSVParser.parse(dataFile, UTF_8, FORMAT));
            final FSDirectory dir = closer.register(FSDirectory.open(indexDir));
            final DirectoryReader reader = closer.register(DirectoryReader.open(dir));
            final IndexSearcher searcher = new IndexSearcher(reader);

            for (CSVRecord record : csvRecords) {
                String blobId = record.get(BLOB_ID);
                String jcrPath = record.get(JCR_PATH);

                if (!textWriter.isProcessed(blobId)) {
                    String text = getText(reader, searcher, jcrPath);
                    stats.processed++;

                    if (text == null) {
                        // Count it, but deliberately do not mark the blob as errored in
                        // the writer: we might be processing a partial OR incorrect index.
                        stats.errored++;
                    } else if (ERROR_TEXT.equals(text)) {
                        textWriter.markError(blobId);
                        stats.errored++;
                    } else if (text.length() == 0) {
                        textWriter.markEmpty(blobId);
                        stats.empty++;
                    } else {
                        textWriter.write(blobId, text);
                        stats.parsed++;
                    }
                } else {
                    stats.ignored++;
                }

                stats.readAndDumpStatsIfRequired(jcrPath);
            }

            // Parameterized form keeps the stats text out of the format-string position.
            log.info("{}", stats);
        }
    }

    /**
     * Looks up the single document whose {@code PATH} term equals {@code path} and
     * returns its trimmed stored {@code FULLTEXT} value.
     *
     * @return the trimmed text, or {@code null} when the search does not yield exactly
     *         one hit, the document does not carry exactly one fulltext value, or the
     *         index could not be read
     */
    private static String getText(DirectoryReader reader, IndexSearcher searcher, String path) {
        try {
            TopDocs topDocs = searcher.search(new TermQuery(new Term(PATH, path)), 1);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            if (scoreDocs.length != 1) {
                return null;
            }

            Document doc = reader.document(scoreDocs[0].doc);
            String[] ftVals = doc.getValues(FULLTEXT);
            if (ftVals.length != 1) {
                // being conservative... expecting only one stored fulltext field
                return null;
            }

            return ftVals[0].trim();
        } catch (IOException e) {
            // Best effort: the caller treats null as "no usable text". Still leave a
            // trace so read failures can be told apart from plain misses.
            log.warn("Failed to read fulltext for path {} from index", path, e);
            return null;
        }
    }

    /**
     * Mutable counters describing the progress of a {@link TextPopulator} run.
     */
    static class PopulatorStats {
        int read = 0;
        int ignored = 0;
        int processed = 0;
        int parsed = 0;
        int errored = 0;
        int empty = 0;
        // started when the stats object is created; rendered as part of toString()
        Stopwatch w = Stopwatch.createStarted();

        /**
         * Counts one more record as read and logs the current stats together with
         * {@code path} on every 10000th record.
         */
        void readAndDumpStatsIfRequired(String path) {
            read++;
            if (read % 10000 == 0) {
                log.info("{} - currently at {}", this, path);
            }
        }

        @Override
        public String toString() {
            return String.format("Text populator stats - " +
                            "Read: %s; Ignored: %s; Processed: %s; Parsed: %s; Errored: %s; Empty: %s (in %s)",
                    read, ignored, processed, parsed, errored, empty, w);
        }
    }
}