blob: b02c59195a7fea437e9a3bbee9c64ea40489ea27 [file] [log] [blame]
package org.apache.manifoldcf.agents.transformation.htmlextractor;
/* $Id$ */
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.crawler.system.Logging;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.apache.manifoldcf.agents.transformation.htmlextractor.exception.RegexException;
import org.apache.manifoldcf.agents.interfaces.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
public class HtmlExtractor extends org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
public static final String _rcsid = "@(#)$Id$";
/** Name of the activity recorded for every document passed through this connector. */
protected static final String ACTIVITY_PROCESS = "process";
/** The activities returned by getActivitiesList(). */
protected static final String[] activitiesList = new String[]{ACTIVITY_PROCESS};
/**
 * Forward to the javascript to check the specification parameters for the job
 */
// The names below are resource names passed to Messages.outputResourceWithVelocity().
private static final String EDIT_CONFIGURATION_JS = "editConfiguration.js";
private static final String VIEW_CONFIGURATION_HTML = "viewConfiguration.html";
private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";
private static final String EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML = "editSpecification_HTML_Extractor.html";
// NOTE(review): presumably "keep markup" vs. "strip all markup" modes — confirm against the templates.
protected static final int HTML_STRIP_NONE = 0;
protected static final int HTML_STRIP_ALL = 1;
// Exposed to the specification templates under the "HTMLTAGUSAGE" key.
// NOTE(review): mutable static state shared by every connector instance.
protected static int html_strip_usage = HTML_STRIP_ALL;
// Specification node type holding the strip-html flag. The constant name says
// "KEEPMETADATA" but the node type string is "striphtml".
public static final String NODE_KEEPMETADATA = "striphtml";
// NOTE(review): NODE_FILTEREMPTY, ATTRIBUTE_SOURCE and ATTRIBUTE_TARGET are not
// referenced in the visible part of this file.
public static final String NODE_FILTEREMPTY = "filterEmpty";
public static final String ATTRIBUTE_SOURCE = "source";
public static final String ATTRIBUTE_TARGET = "target";
// Attribute carrying the value of the NODE_KEEPMETADATA node.
public static final String ATTRIBUTE_VALUE = "value";
/** We handle up to 64K in memory; after that we go to disk. */
protected static final long inMemoryMaximumFile = 65536;
/** Return a list of activities that this connector generates.
* The connector does NOT need to be connected before this method is called.
*@return the set of activities.
public String[] getActivitiesList()
return activitiesList;
/** Add (or replace) a document in the output data store using the connector.
* This method presumes that the connector object has been configured, and it is thus able to communicate with the output data store should that be
* necessary.
* The OutputSpecification is *not* provided to this method, because the goal is consistency, and if output is done it must be consistent with the
* output description, since that was what was partly used to determine if output should be taking place. So it may be necessary for this method to decode
* an output description string in order to determine what should be done.
*@param documentURI is the URI of the document. The URI is presumed to be the unique identifier which the output data store will use to process
* and serve the document. This URI is constructed by the repository connector which fetches the document, and is thus universal across all output connectors.
*@param pipelineDescription is the description string that was constructed for this document by the getOutputDescription() method.
*@param document is the document data to be processed (handed to the output data store).
*@param authorityNameString is the name of the authority responsible for authorizing any access tokens passed in with the repository document. May be null.
*@param activities is the handle to an object that the implementer of a pipeline connector may use to perform operations, such as logging processing activity,
* or sending a modified document to the next stage in the pipeline.
*@return the document status (accepted or permanently rejected).
*@throws IOException only if there's a stream error reading the document data.
public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
throws ManifoldCFException, ServiceInterruption, IOException
long startTime = System.currentTimeMillis();
String resultCode = "OK";
String description = null;
Long length = null;
final SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
Logging.connectors.debug("Processing by HTML Extractor");
if (!(document.getMimeType().startsWith("text/html")) || (document.getMimeType().startsWith("application/xhtml+xml"))){
Logging.connectors.debug("no processing, mime type not html");
resultCode = "NO HTML";
else {
Logging.connectors.debug("Document recognized as HTML - processing");
long binaryLength = document.getBinaryLength();
length = new Long(binaryLength);
DestinationStorage ds;
if (document.getBinaryLength() <= inMemoryMaximumFile)
ds = new MemoryDestinationStorage((int)document.getBinaryLength());
ds = new FileDestinationStorage();
OutputStream os = ds.getOutputStream();
/* Add an option to keep HTML markup of the extracted text or not -
* in case for example of processing by Tika after this transformation connector
Hashtable<String,String> metadataExtracted = new Hashtable<String,String>();
metadataExtracted = JsoupProcessing.extractTextAndMetadataHtmlDocument(document.getBinaryStream(),sp.includeFilters.get(0), sp.excludeFilters, sp.striphtml);
InputStream newStream = new ByteArrayInputStream(metadataExtracted.get("extractedDoc").getBytes(StandardCharsets.UTF_8));
int lenghtNewStream = newStream.available();
document.setBinary(newStream, lenghtNewStream);
Iterator<Entry<String, String>> it;
Map.Entry<String,String> entry;
it = metadataExtracted.entrySet().iterator();
while (it.hasNext()) {
entry =;
if (entry.getKey()!="extractedDoc")
document.addField("jsoup_"+entry.getKey(), entry.getValue());
return activities.sendDocument(documentURI,document);
catch (ServiceInterruption e)
description = e.getMessage();
throw e;
catch (ManifoldCFException e)
resultCode = "EXCEPTION";
description = e.getMessage();
throw e;
catch (IOException e)
resultCode = "IOEXCEPTION";
description = e.getMessage();
throw e;
catch (Exception e)
resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
description = e.getMessage();
activities.recordActivity(new Long(startTime), ACTIVITY_PROCESS, length, documentURI,
resultCode, description);
return activities.sendDocument(documentURI,document);
protected static interface DestinationStorage
/** Get the output stream to write to. Caller should explicitly close this stream when done writing.
public OutputStream getOutputStream()
throws ManifoldCFException;
/** Get new binary length.
public long getBinaryLength()
throws ManifoldCFException;
/** Get the input stream to read from. Caller should explicitly close this stream when done reading.
public InputStream getInputStream()
throws ManifoldCFException;
/** Close the object and clean up everything.
* This should be called when the data is no longer needed.
public void close()
throws ManifoldCFException;
protected static class FileDestinationStorage implements DestinationStorage
protected final File outputFile;
protected final OutputStream outputStream;
public FileDestinationStorage()
throws ManifoldCFException
File outputFile;
OutputStream outputStream;
outputFile = File.createTempFile("mcftika","tmp");
outputStream = new FileOutputStream(outputFile);
catch (IOException e)
outputFile = null;
outputStream = null;
this.outputFile = outputFile;
this.outputStream = outputStream;
public OutputStream getOutputStream()
throws ManifoldCFException
return outputStream;
/** Get new binary length.
public long getBinaryLength()
throws ManifoldCFException
return outputFile.length();
/** Get the input stream to read from. Caller should explicitly close this stream when done reading.
public InputStream getInputStream()
throws ManifoldCFException
return new FileInputStream(outputFile);
catch (IOException e)
return null;
private void handleIOException(IOException e) {
// TODO Auto-generated method stub
/** Close the object and clean up everything.
* This should be called when the data is no longer needed.
public void close()
throws ManifoldCFException
protected static class MemoryDestinationStorage implements DestinationStorage
protected final ByteArrayOutputStream outputStream;
public MemoryDestinationStorage(int sizeHint)
outputStream = new ByteArrayOutputStream(sizeHint);
public OutputStream getOutputStream()
throws ManifoldCFException
return outputStream;
/** Get new binary length.
public long getBinaryLength()
throws ManifoldCFException
return outputStream.size();
/** Get the input stream to read from. Caller should explicitly close this stream when done reading.
public InputStream getInputStream()
throws ManifoldCFException
return new ByteArrayInputStream(outputStream.toByteArray());
/** Close the object and clean up everything.
* This should be called when the data is no longer needed.
public void close()
throws ManifoldCFException
protected static int handleIOException(IOException e)
throws ManifoldCFException
// IOException reading from our local storage...
if (e instanceof InterruptedIOException)
throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
throw new ManifoldCFException(e.getMessage(),e);
* Test if there is at least one regular expression that match with the
* provided sting
* @param regexList
* the list of regular expressions
* @param str
* the string to test
* @return the first matching regex found or null if no matching regex
private String matchingRegex(final List<String> regexList, final String str) throws RegexException {
for (final String regex : regexList) {
try {
final Pattern pattern = Pattern.compile(regex);
final Matcher matcher = pattern.matcher(str);
if (matcher.find()) {
return regex;
} catch (final PatternSyntaxException e) {
throw new RegexException(regex, "Invalid regular expression");
return null;
* Output the configuration header section. This method is called in the head
* section of the connector's configuration page. Its purpose is to add the
* required tabs to the list, and to output any javascript methods that might
* be needed by the configuration editing HTML.
* @param threadContext
* is the local thread context.
* @param out
* is the output to which any HTML should be sent.
* @param parameters
* are the configuration parameters, as they currently exist, for
* this connection being configured.
* @param tabsArray
* is an array of tab names. Add to this array any tab names that are
* specific to the connector.
public void outputConfigurationHeader(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale,
final ConfigParams parameters, final List<String> tabsArray) throws ManifoldCFException, IOException {
Messages.outputResourceWithVelocity(out, locale, EDIT_CONFIGURATION_JS, null);
* Output the configuration body section. This method is called in the body
* section of the connector's configuration page. Its purpose is to present
* the required form elements for editing. The coder can presume that the HTML
* that is output from this configuration will be within appropriate &lt;html&gt;,
* &lt;body&gt;, and &lt;form&gt; tags. The name of the form is "editconnection".
* @param threadContext
* is the local thread context.
* @param out
* is the output to which any HTML should be sent.
* @param parameters
* are the configuration parameters, as they currently exist, for
* this connection being configured.
* @param tabName
* is the current tab name.
public void outputConfigurationBody(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale,
final ConfigParams parameters, final String tabName) throws ManifoldCFException, IOException {
final Map<String, Object> velocityContext = new HashMap<>();
velocityContext.put("TabName", tabName);
* Process a configuration post. This method is called at the start of the
* connector's configuration page, whenever there is a possibility that form
* data for a connection has been posted. Its purpose is to gather form
* information and modify the configuration parameters accordingly. The name
* of the posted form is "editconnection".
* @param threadContext
* is the local thread context.
* @param variableContext
* is the set of variables available from the post, including binary
* file post information.
* @param parameters
* are the configuration parameters, as they currently exist, for
* this connection being configured.
* @return null if all is well, or a string error message if there is an error
* that should prevent saving of the connection (and cause a
* redirection to an error page).
public String processConfigurationPost(final IThreadContext threadContext, final IPostParameters variableContext,
final Locale locale, final ConfigParams parameters) throws ManifoldCFException {
return null;
* View configuration. This method is called in the body section of the
* connector's view configuration page. Its purpose is to present the
* connection information to the user. The coder can presume that the HTML
* that is output from this configuration will be within appropriate &lt;html&gt;
* and &lt;body&gt; tags.
* @param threadContext
* is the local thread context.
* @param out
* is the output to which any HTML should be sent.
* @param parameters
* are the configuration parameters, as they currently exist, for
* this connection being configured.
public void viewConfiguration(final IThreadContext threadContext, final IHTTPOutput out, final Locale locale,
final ConfigParams parameters) throws ManifoldCFException, IOException {
final Map<String, Object> velocityContext = new HashMap<>();
Messages.outputResourceWithVelocity(out, locale, VIEW_CONFIGURATION_HTML, velocityContext);
protected static void fillInHtmlExtractorSpecification(final Map<String, Object> paramMap, final Specification os) {
final List<String> includeFilters = new ArrayList<String>();
final List<String> excludeFilters = new ArrayList<String>();
String striphtmlValue = "true";
// Fill in context
for (int i = 0; i < os.getChildCount(); i++) {
final SpecificationNode sn = os.getChild(i);
if (sn.getType().equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
final String includeFilter = sn.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
if (includeFilter != null) {
} else if (sn.getType().equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
final String excludeFilter = sn.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
if (excludeFilter != null) {
} else if (sn.getType().equals(NODE_KEEPMETADATA))
striphtmlValue = sn.getAttributeValue(ATTRIBUTE_VALUE);
paramMap.put("INCLUDEFILTERS", includeFilters);
paramMap.put("EXCLUDEFILTERS", excludeFilters);
paramMap.put("HTMLTAGUSAGE", html_strip_usage);
* Output the specification header section. This method is called in the head
* section of a job page which has selected a pipeline connection of the
* current type. Its purpose is to add the required tabs to the list, and to
* output any javascript methods that might be needed by the job editing HTML.
* @param out
* is the output to which any HTML should be sent.
* @param locale
* @param os
* is the current pipeline specification for this connection.
* @param connectionSequenceNumber
* is the unique number of this connection within the job.
* @param tabsArray
* is an array of tab names. Add to this array any tab names that are
* specific to the connector.
public void outputSpecificationHeader(final IHTTPOutput out, final Locale locale, final Specification os,
final int connectionSequenceNumber, final List<String> tabsArray) throws ManifoldCFException, IOException {
final Map<String, Object> paramMap = new HashMap<>();
paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
tabsArray.add(Messages.getString(locale, "HtmlExtractorTransformationConnector.HtmlExtractorTabName"));
// Fill in the specification header map, using data from all tabs.
fillInHtmlExtractorSpecification(paramMap, os);
Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS, paramMap);
* Output the specification body section. This method is called in the body
* section of a job page which has selected a pipeline connection of the
* current type. Its purpose is to present the required form elements for
* editing. The coder can presume that the HTML that is output from this
* configuration will be within appropriate &lt;html&gt;, &lt;body&gt;, and &lt;form&gt; tags.
* The name of the form is "editjob".
* @param out
* is the output to which any HTML should be sent.
* @param locale
* is the preferred local of the output.
* @param os
* is the current pipeline specification for this job.
* @param connectionSequenceNumber
* is the unique number of this connection within the job.
* @param actualSequenceNumber
* is the connection within the job that has currently been selected.
* @param tabName
* is the current tab name.
public void outputSpecificationBody(final IHTTPOutput out, final Locale locale, final Specification os,
final int connectionSequenceNumber, final int actualSequenceNumber, final String tabName)
throws ManifoldCFException, IOException {
final Map<String, Object> paramMap = new HashMap<>();
// Set the tab name
paramMap.put("TABNAME", tabName);
paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
paramMap.put("SELECTEDNUM", Integer.toString(actualSequenceNumber));
// Fill in the field mapping tab data
fillInHtmlExtractorSpecification(paramMap, os);
Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_HTML_EXTRACTOR_HTML, paramMap);
* Process a specification post. This method is called at the start of job's
* edit or view page, whenever there is a possibility that form data for a
* connection has been posted. Its purpose is to gather form information and
* modify the transformation specification accordingly. The name of the posted
* form is "editjob".
* @param variableContext
* contains the post data, including binary file-upload information.
* @param locale
* is the preferred local of the output.
* @param os
* is the current pipeline specification for this job.
* @param connectionSequenceNumber
* is the unique number of this connection within the job.
* @return null if all is well, or a string error message if there is an error
* that should prevent saving of the job (and cause a redirection to
* an error page).
public String processSpecificationPost(final IPostParameters variableContext, final Locale locale,
final Specification os, final int connectionSequenceNumber) throws ManifoldCFException {
final String seqPrefix = "s" + connectionSequenceNumber + "_";
String x;
// Include filters
x = variableContext.getParameter(seqPrefix + "includefilter_count");
if (x != null && x.length() > 0) {
// About to gather the includefilter nodes, so get rid of the old ones.
int i = 0;
while (i < os.getChildCount()) {
final SpecificationNode node = os.getChild(i);
if (node.getType().equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
} else {
final int count = Integer.parseInt(x);
i = 0;
while (i < count) {
final String prefix = seqPrefix + "includefilter_";
final String suffix = "_" + Integer.toString(i);
final String op = variableContext.getParameter(prefix + "op" + suffix);
if (op == null || !op.equals("Delete")) {
// Gather the includefilters etc.
final String regex = variableContext.getParameter(prefix + HtmlExtractorConfig.ATTRIBUTE_REGEX + suffix);
final SpecificationNode node = new SpecificationNode(HtmlExtractorConfig.NODE_INCLUDEFILTER);
node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
os.addChild(os.getChildCount(), node);
final String addop = variableContext.getParameter(seqPrefix + "includefilter_op");
if (addop != null && addop.equals("Add")) {
final String regex = variableContext.getParameter(seqPrefix + "includefilter_regex");
final SpecificationNode node = new SpecificationNode(HtmlExtractorConfig.NODE_INCLUDEFILTER);
node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
os.addChild(os.getChildCount(), node);
// Exclude filters
x = variableContext.getParameter(seqPrefix + "excludefilter_count");
if (x != null && x.length() > 0) {
// About to gather the excludefilter nodes, so get rid of the old ones.
int i = 0;
while (i < os.getChildCount()) {
final SpecificationNode node = os.getChild(i);
if (node.getType().equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
} else {
final int count = Integer.parseInt(x);
i = 0;
while (i < count) {
final String prefix = seqPrefix + "excludefilter_";
final String suffix = "_" + Integer.toString(i);
final String op = variableContext.getParameter(prefix + "op" + suffix);
if (op == null || !op.equals("Delete")) {
// Gather the excludefilters etc.
final String regex = variableContext.getParameter(prefix + HtmlExtractorConfig.ATTRIBUTE_REGEX + suffix);
final SpecificationNode node = new SpecificationNode(HtmlExtractorConfig.NODE_EXCLUDEFILTER);
node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
os.addChild(os.getChildCount(), node);
final String addop = variableContext.getParameter(seqPrefix + "excludefilter_op");
if (addop != null && addop.equals("Add")) {
final String regex = variableContext.getParameter(seqPrefix + "excludefilter_regex");
final SpecificationNode node = new SpecificationNode(HtmlExtractorConfig.NODE_EXCLUDEFILTER);
node.setAttribute(HtmlExtractorConfig.ATTRIBUTE_REGEX, regex);
os.addChild(os.getChildCount(), node);
x = variableContext.getParameter(seqPrefix+"striphtml_present");
if (x != null && x.length() > 0)
String keepAll = variableContext.getParameter(seqPrefix+"striphtml");
if (keepAll == null)
keepAll = "false";
// About to gather the fieldmapping nodes, so get rid of the old ones.
int i = 0;
while (i < os.getChildCount())
SpecificationNode node = os.getChild(i);
if (node.getType().equals(NODE_KEEPMETADATA))
// Gather the keep all metadata parameter to be the last one
SpecificationNode node = new SpecificationNode(NODE_KEEPMETADATA);
node.setAttribute(ATTRIBUTE_VALUE, keepAll);
// Add the new striphtml config parameter
os.addChild(os.getChildCount(), node);
return null;
* View specification. This method is called in the body section of a job's
* view page. Its purpose is to present the pipeline specification information
* to the user. The coder can presume that the HTML that is output from this
* configuration will be within appropriate &lt;html&gt; and &lt;body&gt;tags.
* @param out
* is the output to which any HTML should be sent.
* @param locale
* is the preferred local of the output.
* @param connectionSequenceNumber
* is the unique number of this connection within the job.
* @param os
* is the current pipeline specification for this job.
public void viewSpecification(final IHTTPOutput out, final Locale locale, final Specification os,
final int connectionSequenceNumber) throws ManifoldCFException, IOException {
final Map<String, Object> paramMap = new HashMap<>();
paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
// Fill in the map with data from all tabs
fillInHtmlExtractorSpecification(paramMap, os);
Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML, paramMap);
protected static class SpecPacker {
private final List<String> includeFilters = new ArrayList<>();
private final List<String> excludeFilters = new ArrayList<>();
private final boolean striphtml;
public SpecPacker(final Specification os) {
boolean striphtml = true;
for (int i = 0; i < os.getChildCount(); i++) {
final SpecificationNode sn = os.getChild(i);
if (sn.getType().equals(HtmlExtractorConfig.NODE_INCLUDEFILTER)) {
final String regex = sn.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
if (sn.getType().equals(HtmlExtractorConfig.NODE_EXCLUDEFILTER)) {
final String regex = sn.getAttributeValue(HtmlExtractorConfig.ATTRIBUTE_REGEX);
if(sn.getType().equals(NODE_KEEPMETADATA)) {
String value = sn.getAttributeValue(ATTRIBUTE_VALUE);
striphtml = Boolean.parseBoolean(value);
if (includeFilters.isEmpty()) {
this.striphtml = striphtml;
public String toPackedString() {
final StringBuilder sb = new StringBuilder();
packList(sb, includeFilters, '+');
packList(sb, excludeFilters, '+');
if (striphtml)
return sb.toString();