solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.solr.handler.extraction;

 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringWriter;
 import java.lang.invoke.MethodHandles;
 import java.util.Locale;

 import org.apache.commons.io.IOUtils;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.params.UpdateParams;
 import org.apache.solr.common.util.ContentStream;
 import org.apache.solr.common.util.ContentStreamBase;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.handler.loader.ContentStreamLoader;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
 import org.apache.solr.update.AddUpdateCommand;
 import org.apache.solr.update.processor.UpdateRequestProcessor;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.DefaultParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.html.HtmlMapper;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.sax.xpath.Matcher;
 import org.apache.tika.sax.xpath.MatchingContentHandler;
 import org.apache.tika.sax.xpath.XPathParser;
 import org.apache.xml.serialize.BaseMarkupSerializer;
 import org.apache.xml.serialize.OutputFormat;
 import org.apache.xml.serialize.TextSerializer;
 import org.apache.xml.serialize.XMLSerializer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;


 /**
  * Loads content extracted by Tika from a {@link ContentStream} into Solr.
  *
  * <p>{@link #load} parses one stream and then either hands the resulting document to the
  * {@link UpdateRequestProcessor} supplied at construction time or, when
  * {@link ExtractingParams#EXTRACT_ONLY} is set, writes the serialized content and the
  * collected metadata into the response without adding a document.
  */
 public class ExtractingDocumentLoader extends ContentStreamLoader {

   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

   /**
    * Extract Only supported format
    */
   public static final String TEXT_FORMAT = "text";
   /**
    * Extract Only supported format.  Default
    */
   public static final String XML_FORMAT = "xml";
   /**
    * XHTML XPath parser.
    */
   private static final XPathParser PARSER =
           new XPathParser("xhtml", XHTMLContentHandler.XHTML);

   /** Core of the request this loader was created for. */
   final SolrCore core;
   /** Parameters of the request this loader was created for. */
   final SolrParams params;
   /** Processor that receives every document built by this loader. */
   final UpdateRequestProcessor processor;
   /** When true, a {@link TikaException} during parsing is logged instead of rethrown. */
   final boolean ignoreTikaException;
   /** Parser used whenever the request does not name an explicit stream type. */
   protected AutoDetectParser autoDetectParser;

   /** Add command that is cleared and reused for each document. */
   private final AddUpdateCommand templateAdd;

   protected TikaConfig config;
   protected ParseContextConfig parseContextConfig;
   protected SolrContentHandlerFactory factory;

   /**
    * Creates a loader bound to a single request.
    *
    * @param req request supplying the parameters, the core and the template add command
    * @param processor processor that receives the documents produced by {@link #load}
    * @param config Tika configuration used to build parsers
    * @param parseContextConfig source of the {@link ParseContext} used for each parse
    * @param factory creates the {@link SolrContentHandler} for each stream
    */
   public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
                                   TikaConfig config, ParseContextConfig parseContextConfig,
                                   SolrContentHandlerFactory factory) {
     this.params = req.getParams();
     this.core = req.getCore();
     this.config = config;
     this.parseContextConfig = parseContextConfig;
     this.processor = processor;

     templateAdd = new AddUpdateCommand(req);
     templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true);
     templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);

     //this is lightweight
     autoDetectParser = new AutoDetectParser(config);
     this.factory = factory;

     ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
   }


   /**
    * this must be MT safe... may be called concurrently from multiple threads.
    *
    * <p>Stores the handler's new document in {@code template} and passes the command to the
    * processor.
    *
    * @param handler handler holding the parsed content
    * @param template add command to populate and submit
    * @throws IOException if the processor fails to add the document
    */
   void doAdd(SolrContentHandler handler, AddUpdateCommand template)
           throws IOException {
     template.solrDoc = handler.newDocument();
     processor.processAdd(template);
   }

   /**
    * Clears the reusable template command and submits the handler's document with it.
    *
    * @param handler handler holding the parsed content
    * @throws IOException if the processor fails to add the document
    */
   void addDoc(SolrContentHandler handler) throws IOException {
     templateAdd.clear();
     doAdd(handler, templateAdd);
   }

   /**
    * Parses {@code stream} with Tika and either adds the resulting document or, in extract-only
    * mode, puts the serialized content and the metadata into {@code rsp}.
    *
    * @param req current request; supplies stream type, resource name, schema and resource loader
    * @param rsp response that receives the extract-only output
    * @param stream content to parse; its stream is closed before this method returns
    * @param processor not referenced here; documents go to the processor given to the constructor
    * @throws SolrException with {@code BAD_REQUEST} when no parser matches the requested stream
    *     type, or with {@code SERVER_ERROR} when a {@link SAXException} occurs or a
    *     {@link TikaException} occurs and is not being ignored
    */
   @Override
   @SuppressWarnings({"unchecked"})
   public void load(SolrQueryRequest req, SolrQueryResponse rsp,
       ContentStream stream, UpdateRequestProcessor processor) throws Exception {
     String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
     Parser parser = selectParser(streamType);
     if (parser == null) {
       throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers.  Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
     }

     Metadata metadata = new Metadata();

     // If you specify the resource name (the filename, roughly) with this parameter,
     // then Tika can make use of it in guessing the appropriate MIME type:
     String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
     if (resourceName != null) {
       metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
     }
     // Provide stream's content type as hint for auto detection
     if (stream.getContentType() != null) {
       metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
     }

     InputStream inputStream = null;
     try {
       inputStream = stream.getStream();
       metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
       metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
       metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
       metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
       // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
       String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
       if (charset != null) {
         metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
       }

       String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
       boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
       SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
       ContentHandler parsingHandler = handler;

       StringWriter writer = null;
       BaseMarkupSerializer serializer = null;
       if (extractOnly) {
         String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, XML_FORMAT);
         writer = new StringWriter();
         if (TEXT_FORMAT.equals(extractFormat)) {
           serializer = new TextSerializer();
           serializer.setOutputCharStream(writer);
           serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
         } else {
           serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
         }
         if (xpathExpr != null) {
           Matcher matcher = PARSER.parse(xpathExpr);
           serializer.startDocument();//The MatchingContentHandler does not invoke startDocument.  See http://tika.markmail.org/message/kknu3hw7argwiqin
           parsingHandler = new MatchingContentHandler(serializer, matcher);
         } else {
           parsingHandler = serializer;
         }
       } else if (xpathExpr != null) {
         Matcher matcher = PARSER.parse(xpathExpr);
         parsingHandler = new MatchingContentHandler(handler, matcher);
       } //else leave it as is

       try {
         //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
         ParseContext context = parseContextConfig.create();

         context.set(Parser.class, parser);
         context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

         // Password handling
         RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
         String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
         if (pwMapFile != null && pwMapFile.length() > 0) {
           // try-with-resources so the rules file is released even when parsing it fails;
           // a null resource is skipped by the construct.
           try (InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile)) {
             if (is != null) {
               log.debug("Password file supplied: {}", pwMapFile);
               epp.parse(is);
             }
           }
         }
         context.set(PasswordProvider.class, epp);
         String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
         if (resourcePassword != null) {
           epp.setExplicitPassword(resourcePassword);
           log.debug("Literal password supplied for file {}", resourceName);
         }
         parser.parse(inputStream, parsingHandler, metadata, context);
       } catch (TikaException e) {
         if (!ignoreTikaException) {
           throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
         }
         // Failure is tolerated on request: report it and continue with the steps below.
         log.warn("skip extracting text due to {}. metadata={}", e.getLocalizedMessage(), metadata); // nowarn
       }

       if (!extractOnly) {
         addDoc(handler);
       } else {
         //serializer is not null, so we need to call endDoc on it if using xpath
         if (xpathExpr != null) {
           serializer.endDocument();
         }
         rsp.add(stream.getName(), writer.toString());
         writer.close();
         @SuppressWarnings({"rawtypes"})
         NamedList metadataNL = new NamedList();
         for (String name : metadata.names()) {
           metadataNL.add(name, metadata.getValues(name));
         }
         rsp.add(stream.getName() + "_metadata", metadataNL);
       }
     } catch (SAXException e) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
     } finally {
       IOUtils.closeQuietly(inputStream);
     }
   }

   /**
    * Picks the parser for a request.
    *
    * @param streamType explicit stream type from the request, or null when none was given
    * @return the auto-detecting parser when {@code streamType} is null; otherwise the parser
    *     registered for that media type, or null when there is none
    */
   private Parser selectParser(String streamType) {
     if (streamType == null) {
       return autoDetectParser;
     }
     //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
     MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
     return new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
   }

   /**
    * {@link HtmlMapper} that keeps every element and lower-cases element and attribute names,
    * dropping only the &lt;BR&gt; and &lt;BODY&gt; element events.
    */
   public static class MostlyPassthroughHtmlMapper implements HtmlMapper {
     public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();

     /**
      * Keep all elements and their content.
      *
      * Apparently &lt;SCRIPT&gt; and &lt;STYLE&gt; elements are blocked elsewhere
      */
     @Override
     public boolean isDiscardElement(String name) {
       return false;
     }

     /** Lowercases the attribute name */
     @Override
     public String mapSafeAttribute(String elementName, String attributeName) {
       // Locale.ROOT keeps the mapping locale-independent and matches mapSafeElement.
       return attributeName.toLowerCase(Locale.ROOT);
     }

     /**
      * Lowercases the element name, but returns null for &lt;BR&gt;,
      * which suppresses the start-element event for &lt;BR&gt; tags.
      * This also suppresses the &lt;BODY&gt; tags because those
      * are handled internally by Tika's XHTMLContentHandler.
      */
     @Override
     public String mapSafeElement(String name) {
       String lowerName = name.toLowerCase(Locale.ROOT);
       return (lowerName.equals("br") || lowerName.equals("body")) ? null : lowerName;
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.solr.handler.extraction;

	import java.io.IOException;
	import java.io.InputStream;
	import java.io.StringWriter;
	import java.lang.invoke.MethodHandles;
	import java.util.Locale;

	import org.apache.commons.io.IOUtils;
	import org.apache.solr.common.SolrException;
	import org.apache.solr.common.params.SolrParams;
	import org.apache.solr.common.params.UpdateParams;
	import org.apache.solr.common.util.ContentStream;
	import org.apache.solr.common.util.ContentStreamBase;
	import org.apache.solr.common.util.NamedList;
	import org.apache.solr.core.SolrCore;
	import org.apache.solr.handler.loader.ContentStreamLoader;
	import org.apache.solr.request.SolrQueryRequest;
	import org.apache.solr.response.SolrQueryResponse;
	import org.apache.solr.update.AddUpdateCommand;
	import org.apache.solr.update.processor.UpdateRequestProcessor;
	import org.apache.tika.config.TikaConfig;
	import org.apache.tika.exception.TikaException;
	import org.apache.tika.metadata.HttpHeaders;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.metadata.TikaMetadataKeys;
	import org.apache.tika.mime.MediaType;
	import org.apache.tika.parser.AutoDetectParser;
	import org.apache.tika.parser.DefaultParser;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.parser.Parser;
	import org.apache.tika.parser.PasswordProvider;
	import org.apache.tika.parser.html.HtmlMapper;
	import org.apache.tika.sax.XHTMLContentHandler;
	import org.apache.tika.sax.xpath.Matcher;
	import org.apache.tika.sax.xpath.MatchingContentHandler;
	import org.apache.tika.sax.xpath.XPathParser;
	import org.apache.xml.serialize.BaseMarkupSerializer;
	import org.apache.xml.serialize.OutputFormat;
	import org.apache.xml.serialize.TextSerializer;
	import org.apache.xml.serialize.XMLSerializer;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.xml.sax.ContentHandler;
	import org.xml.sax.SAXException;


	/**
	* The class responsible for loading extracted content into Solr.
	*
	**/
	public class ExtractingDocumentLoader extends ContentStreamLoader {

	private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

	/**
	* Extract Only supported format
	*/
	public static final String TEXT_FORMAT = "text";
	/**
	* Extract Only supported format. Default
	*/
	public static final String XML_FORMAT = "xml";
	/**
	* XHTML XPath parser.
	*/
	private static final XPathParser PARSER =
	new XPathParser("xhtml", XHTMLContentHandler.XHTML);

	final SolrCore core;
	final SolrParams params;
	final UpdateRequestProcessor processor;
	final boolean ignoreTikaException;
	protected AutoDetectParser autoDetectParser;

	private final AddUpdateCommand templateAdd;

	protected TikaConfig config;
	protected ParseContextConfig parseContextConfig;
	protected SolrContentHandlerFactory factory;

	public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
	TikaConfig config, ParseContextConfig parseContextConfig,
	SolrContentHandlerFactory factory) {
	this.params = req.getParams();
	this.core = req.getCore();
	this.config = config;
	this.parseContextConfig = parseContextConfig;
	this.processor = processor;

	templateAdd = new AddUpdateCommand(req);
	templateAdd.overwrite = params.getBool(UpdateParams.OVERWRITE, true);
	templateAdd.commitWithin = params.getInt(UpdateParams.COMMIT_WITHIN, -1);

	//this is lightweight
	autoDetectParser = new AutoDetectParser(config);
	this.factory = factory;

	ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
	}


	/**
	* this must be MT safe... may be called concurrently from multiple threads.
	*
	*/
	void doAdd(SolrContentHandler handler, AddUpdateCommand template)
	throws IOException {
	template.solrDoc = handler.newDocument();
	processor.processAdd(template);
	}

	void addDoc(SolrContentHandler handler) throws IOException {
	templateAdd.clear();
	doAdd(handler, templateAdd);
	}

	@Override
	@SuppressWarnings({"unchecked"})
	public void load(SolrQueryRequest req, SolrQueryResponse rsp,
	ContentStream stream, UpdateRequestProcessor processor) throws Exception {
	Parser parser = null;
	String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
	if (streamType != null) {
	//Cache? Parsers are lightweight to construct and thread-safe, so I'm told
	MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
	parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
	} else {
	parser = autoDetectParser;
	}
	if (parser != null) {
	Metadata metadata = new Metadata();

	// If you specify the resource name (the filename, roughly) with this parameter,
	// then Tika can make use of it in guessing the appropriate MIME type:
	String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
	if (resourceName != null) {
	metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
	}
	// Provide stream's content type as hint for auto detection
	if(stream.getContentType() != null) {
	metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
	}

	InputStream inputStream = null;
	try {
	inputStream = stream.getStream();
	metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
	metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
	metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
	metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
	// HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
	String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
	if(charset != null){
	metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
	}

	String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
	boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
	SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
	ContentHandler parsingHandler = handler;

	StringWriter writer = null;
	BaseMarkupSerializer serializer = null;
	if (extractOnly == true) {
	String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
	writer = new StringWriter();
	if (extractFormat.equals(TEXT_FORMAT)) {
	serializer = new TextSerializer();
	serializer.setOutputCharStream(writer);
	serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
	} else {
	serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
	}
	if (xpathExpr != null) {
	Matcher matcher =
	PARSER.parse(xpathExpr);
	serializer.startDocument();//The MatchingContentHandler does not invoke startDocument. See http://tika.markmail.org/message/kknu3hw7argwiqin
	parsingHandler = new MatchingContentHandler(serializer, matcher);
	} else {
	parsingHandler = serializer;
	}
	} else if (xpathExpr != null) {
	Matcher matcher =
	PARSER.parse(xpathExpr);
	parsingHandler = new MatchingContentHandler(handler, matcher);
	} //else leave it as is

	try{
	//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
	ParseContext context = parseContextConfig.create();


	context.set(Parser.class, parser);
	context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

	// Password handling
	RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
	String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
	if(pwMapFile != null && pwMapFile.length() > 0) {
	InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
	if(is != null) {
	log.debug("Password file supplied: {}", pwMapFile);
	epp.parse(is);
	}
	}
	context.set(PasswordProvider.class, epp);
	String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
	if(resourcePassword != null) {
	epp.setExplicitPassword(resourcePassword);
	log.debug("Literal password supplied for file {}", resourceName);
	}
	parser.parse(inputStream, parsingHandler, metadata, context);
	} catch (TikaException e) {
	if(ignoreTikaException)
	log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage())
	.append(". metadata=").append(metadata.toString()).toString()); // nowarn
	else
	throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
	}
	if (extractOnly == false) {
	addDoc(handler);
	} else {
	//serializer is not null, so we need to call endDoc on it if using xpath
	if (xpathExpr != null){
	serializer.endDocument();
	}
	rsp.add(stream.getName(), writer.toString());
	writer.close();
	String[] names = metadata.names();
	@SuppressWarnings({"rawtypes"})
	NamedList metadataNL = new NamedList();
	for (int i = 0; i < names.length; i++) {
	String[] vals = metadata.getValues(names[i]);
	metadataNL.add(names[i], vals);
	}
	rsp.add(stream.getName() + "_metadata", metadataNL);
	}
	} catch (SAXException e) {
	throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
	} finally {
	IOUtils.closeQuietly(inputStream);
	}
	} else {
	throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers. Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
	}
	}

	public static class MostlyPassthroughHtmlMapper implements HtmlMapper {
	public static final HtmlMapper INSTANCE = new MostlyPassthroughHtmlMapper();

	/**
	* Keep all elements and their content.
	*
	* Apparently <SCRIPT> and <STYLE> elements are blocked elsewhere
	*/
	@Override
	public boolean isDiscardElement(String name) {
	return false;
	}

	/** Lowercases the attribute name */
	@Override
	public String mapSafeAttribute(String elementName, String attributeName) {
	return attributeName.toLowerCase(Locale.ENGLISH);
	}

	/**
	* Lowercases the element name, but returns null for <BR>,
	* which suppresses the start-element event for lt;BR> tags.
	* This also suppresses the <BODY> tags because those
	* are handled internally by Tika's XHTMLContentHandler.
	*/
	@Override
	public String mapSafeElement(String name) {
	String lowerName = name.toLowerCase(Locale.ROOT);
	return (lowerName.equals("br") \|\| lowerName.equals("body")) ? null : lowerName;
	}
	}
	}