trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java - jackrabbit-oak - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.jackrabbit.oak.plugins.tika;

 import com.google.common.base.Stopwatch;
 import com.google.common.io.Closer;
 import org.apache.commons.csv.CSVParser;
 import org.apache.commons.csv.CSVRecord;
 import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.FSDirectory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.File;
 import java.io.IOException;

 import static com.google.common.base.Charsets.UTF_8;
 import static org.apache.jackrabbit.JcrConstants.JCR_PATH;
 import static org.apache.jackrabbit.oak.plugins.index.search.FieldNames.FULLTEXT;
 import static org.apache.jackrabbit.oak.plugins.index.search.FieldNames.PATH;
 import static org.apache.jackrabbit.oak.plugins.tika.CSVFileBinaryResourceProvider.FORMAT;

 /**
  * Copies text stored in a Lucene index into a {@link TextWriter}.
  *
  * <p>The blobs to handle are listed in a CSV file; for each record the
  * stored {@code FULLTEXT} value of the index document matching the record's
  * path is looked up and handed to the writer under the record's blob id.
  */
 class TextPopulator {
     private static final Logger log = LoggerFactory.getLogger(TextPopulator.class);

     /** Name of the CSV column holding the blob id. */
     static final String BLOB_ID = "blobId";

     /** Stored text value that is treated as an extraction-error marker. */
     static final String ERROR_TEXT = "TextExtractionError";

     private final TextWriter textWriter;

     private PopulatorStats stats;

     /**
      * @param textWriter destination for the text read from the index
      */
     TextPopulator(TextWriter textWriter) {
         this.textWriter = textWriter;
         this.stats = new PopulatorStats();
     }

     // exposed for test purposes only
     void setStats(PopulatorStats stats) {
         this.stats = stats;
     }

     /**
      * Walks every record of {@code dataFile} and, for blobs the writer does
      * not report as processed, stores the text found in the index.
      *
      * <p>Per record the outcome is one of: ignored (already processed),
      * errored (no usable text, or the stored text equals {@link #ERROR_TEXT}),
      * empty (stored text is empty after trimming) or parsed (text written).
      * The counters are logged once all records have been read.
      *
      * @param dataFile CSV file parsed with the {@code FORMAT} of
      *                 {@code CSVFileBinaryResourceProvider}
      * @param indexDir directory of the Lucene index to read text from
      * @throws IOException propagated from the I/O calls made while opening,
      *                     reading, writing or closing the resources used here
      */
     void populate(File dataFile, File indexDir) throws IOException {
         try (Closer closer = Closer.create()) {
             Iterable<CSVRecord> csvRecords = closer.register(CSVParser.parse(dataFile, UTF_8, FORMAT));

             final FSDirectory dir = closer.register(FSDirectory.open(indexDir));
             final DirectoryReader reader = closer.register(DirectoryReader.open(dir));
             final IndexSearcher searcher = new IndexSearcher(reader);

             for (CSVRecord record : csvRecords) {
                 String blobId = record.get(BLOB_ID);
                 String jcrPath = record.get(JCR_PATH);

                 if (!textWriter.isProcessed(blobId)) {
                     String text = getText(reader, searcher, jcrPath);

                     stats.processed++;

                     if (text == null) {
                         // The index may be partial or incorrect, so a missing text is
                         // only counted; the blob is deliberately NOT marked as an error
                         // in the writer.
                         stats.errored++;
                     } else if (ERROR_TEXT.equals(text)) {
                         textWriter.markError(blobId);
                         stats.errored++;
                     } else if (text.length() == 0) {
                         textWriter.markEmpty(blobId);
                         stats.empty++;
                     } else {
                         textWriter.write(blobId, text);
                         stats.parsed++;
                     }
                 } else {
                     stats.ignored++;
                 }

                 stats.readAndDumpStatsIfRequired(jcrPath);
             }
             log.info("{}", stats);
         }
     }

     /**
      * Looks up the stored fulltext of the index document whose {@code PATH}
      * term equals {@code path}.
      *
      * @return the trimmed text, or {@code null} when the search does not
      *         return exactly one hit, the document does not carry exactly one
      *         stored {@code FULLTEXT} value, or reading the index fails
      */
     private static String getText(DirectoryReader reader, IndexSearcher searcher, String path) {
         try {
             TopDocs topDocs = searcher.search(new TermQuery(new Term(PATH, path)), 1);

             ScoreDoc[] scoreDocs = topDocs.scoreDocs;
             if (scoreDocs.length != 1) {
                 return null;
             }

             Document doc = reader.document(scoreDocs[0].doc);

             String[] ftVals = doc.getValues(FULLTEXT);
             if (ftVals.length != 1) {
                 // being conservative... expecting only one stored fulltext field
                 return null;
             }

             return ftVals[0].trim();
         } catch (IOException e) {
             // Stay best-effort (the caller counts this as errored), but leave a
             // trace so that index read failures are not mistaken for missing paths.
             log.warn("Unable to read text for path {} from the index", path, e);
             return null;
         }
     }

     /**
      * Counters describing the progress of a {@link #populate(File, File)} run.
      */
     static class PopulatorStats {
         /** Records read from the CSV file. */
         int read = 0;
         /** Records skipped because the writer reported the blob as processed. */
         int ignored = 0;
         /** Records for which the index was consulted. */
         int processed = 0;
         /** Records whose text was written. */
         int parsed = 0;
         /** Records without usable text or carrying the error marker. */
         int errored = 0;
         /** Records whose stored text was empty. */
         int empty = 0;

         Stopwatch w = Stopwatch.createStarted();

         /**
          * Counts one more record as read and logs the current counters on
          * every 10000th record.
          *
          * @param path path of the record just handled, included in the log line
          */
         void readAndDumpStatsIfRequired(String path) {
             read++;

             if (read % 10000 == 0) {
                 log.info("{} - currently at {}", this, path);
             }
         }

         @Override
         public String toString() {
             return String.format("Text populator stats - " +
                             "Read: %s; Ignored: %s; Processed: %s; Parsed: %s; Errored: %s; Empty: %s (in %s)",
                     read, ignored, processed, parsed, errored, empty, w);
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.jackrabbit.oak.plugins.tika;

	import com.google.common.base.Stopwatch;
	import com.google.common.io.Closer;
	import org.apache.commons.csv.CSVParser;
	import org.apache.commons.csv.CSVRecord;
	import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.ScoreDoc;
	import org.apache.lucene.search.TermQuery;
	import org.apache.lucene.search.TopDocs;
	import org.apache.lucene.store.FSDirectory;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import java.io.File;
	import java.io.IOException;

	import static com.google.common.base.Charsets.UTF_8;
	import static org.apache.jackrabbit.JcrConstants.JCR_PATH;
	import static org.apache.jackrabbit.oak.plugins.index.search.FieldNames.FULLTEXT;
	import static org.apache.jackrabbit.oak.plugins.index.search.FieldNames.PATH;
	import static org.apache.jackrabbit.oak.plugins.tika.CSVFileBinaryResourceProvider.FORMAT;

	/**
	 * Copies text stored in a Lucene index into a {@link TextWriter}.
	 *
	 * <p>The blobs to handle are listed in a CSV file; for each record the
	 * stored {@code FULLTEXT} value of the index document matching the record's
	 * path is looked up and handed to the writer under the record's blob id.
	 */
	class TextPopulator {
	    private static final Logger log = LoggerFactory.getLogger(TextPopulator.class);

	    /** Name of the CSV column holding the blob id. */
	    static final String BLOB_ID = "blobId";

	    /** Stored text value that is treated as an extraction-error marker. */
	    static final String ERROR_TEXT = "TextExtractionError";

	    private final TextWriter textWriter;

	    private PopulatorStats stats;

	    /**
	     * @param textWriter destination for the text read from the index
	     */
	    TextPopulator(TextWriter textWriter) {
	        this.textWriter = textWriter;
	        this.stats = new PopulatorStats();
	    }

	    // exposed for test purposes only
	    void setStats(PopulatorStats stats) {
	        this.stats = stats;
	    }

	    /**
	     * Walks every record of {@code dataFile} and, for blobs the writer does
	     * not report as processed, stores the text found in the index.
	     *
	     * <p>Per record the outcome is one of: ignored (already processed),
	     * errored (no usable text, or the stored text equals {@link #ERROR_TEXT}),
	     * empty (stored text is empty after trimming) or parsed (text written).
	     * The counters are logged once all records have been read.
	     *
	     * @param dataFile CSV file parsed with the {@code FORMAT} of
	     *                 {@code CSVFileBinaryResourceProvider}
	     * @param indexDir directory of the Lucene index to read text from
	     * @throws IOException propagated from the I/O calls made while opening,
	     *                     reading, writing or closing the resources used here
	     */
	    void populate(File dataFile, File indexDir) throws IOException {
	        try (Closer closer = Closer.create()) {
	            Iterable<CSVRecord> csvRecords = closer.register(CSVParser.parse(dataFile, UTF_8, FORMAT));

	            final FSDirectory dir = closer.register(FSDirectory.open(indexDir));
	            final DirectoryReader reader = closer.register(DirectoryReader.open(dir));
	            final IndexSearcher searcher = new IndexSearcher(reader);

	            for (CSVRecord record : csvRecords) {
	                String blobId = record.get(BLOB_ID);
	                String jcrPath = record.get(JCR_PATH);

	                if (!textWriter.isProcessed(blobId)) {
	                    String text = getText(reader, searcher, jcrPath);

	                    stats.processed++;

	                    if (text == null) {
	                        // The index may be partial or incorrect, so a missing text is
	                        // only counted; the blob is deliberately NOT marked as an error
	                        // in the writer.
	                        stats.errored++;
	                    } else if (ERROR_TEXT.equals(text)) {
	                        textWriter.markError(blobId);
	                        stats.errored++;
	                    } else if (text.length() == 0) {
	                        textWriter.markEmpty(blobId);
	                        stats.empty++;
	                    } else {
	                        textWriter.write(blobId, text);
	                        stats.parsed++;
	                    }
	                } else {
	                    stats.ignored++;
	                }

	                stats.readAndDumpStatsIfRequired(jcrPath);
	            }
	            log.info("{}", stats);
	        }
	    }

	    /**
	     * Looks up the stored fulltext of the index document whose {@code PATH}
	     * term equals {@code path}.
	     *
	     * @return the trimmed text, or {@code null} when the search does not
	     *         return exactly one hit, the document does not carry exactly one
	     *         stored {@code FULLTEXT} value, or reading the index fails
	     */
	    private static String getText(DirectoryReader reader, IndexSearcher searcher, String path) {
	        try {
	            TopDocs topDocs = searcher.search(new TermQuery(new Term(PATH, path)), 1);

	            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
	            if (scoreDocs.length != 1) {
	                return null;
	            }

	            Document doc = reader.document(scoreDocs[0].doc);

	            String[] ftVals = doc.getValues(FULLTEXT);
	            if (ftVals.length != 1) {
	                // being conservative... expecting only one stored fulltext field
	                return null;
	            }

	            return ftVals[0].trim();
	        } catch (IOException e) {
	            // Stay best-effort (the caller counts this as errored), but leave a
	            // trace so that index read failures are not mistaken for missing paths.
	            log.warn("Unable to read text for path {} from the index", path, e);
	            return null;
	        }
	    }

	    /**
	     * Counters describing the progress of a {@link #populate(File, File)} run.
	     */
	    static class PopulatorStats {
	        /** Records read from the CSV file. */
	        int read = 0;
	        /** Records skipped because the writer reported the blob as processed. */
	        int ignored = 0;
	        /** Records for which the index was consulted. */
	        int processed = 0;
	        /** Records whose text was written. */
	        int parsed = 0;
	        /** Records without usable text or carrying the error marker. */
	        int errored = 0;
	        /** Records whose stored text was empty. */
	        int empty = 0;

	        Stopwatch w = Stopwatch.createStarted();

	        /**
	         * Counts one more record as read and logs the current counters on
	         * every 10000th record.
	         *
	         * @param path path of the record just handled, included in the log line
	         */
	        void readAndDumpStatsIfRequired(String path) {
	            read++;

	            if (read % 10000 == 0) {
	                log.info("{} - currently at {}", this, path);
	            }
	        }

	        @Override
	        public String toString() {
	            return String.format("Text populator stats - " +
	                            "Read: %s; Ignored: %s; Processed: %s; Parsed: %s; Errored: %s; Empty: %s (in %s)",
	                    read, ignored, processed, parsed, errored, empty, w);
	        }
	    }
	}