blob: d02fed430f1525810942ae0c06d0c9d2f25f5c1a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.ExtensionPoint;
import org.apache.nutch.plugin.PluginRuntimeException;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.ObjectCache;
/**
 * Creates and caches {@link Parser} plugins.
 *
 * <p>
 * Parser instances, the parsed {@link ParsePluginList} and the per-content-type
 * extension lists are all stored in the {@link ObjectCache} obtained for the
 * {@link Configuration} this factory was created with.
 * </p>
 */
public final class ParserFactory {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  /** Wildcard for default plugins. */
  public static final String DEFAULT_PLUGIN = "*";

  /** Extension attribute naming the content type(s) a parser supports. */
  private static final String CONTENT_TYPE_ATTRIBUTE = "contentType";

  /**
   * Empty extension list for caching purposes: stored in the cache to remember
   * that no extension is known for a content type.
   */
  private static final List<Extension> EMPTY_EXTENSION_LIST = Collections
      .<Extension> emptyList();

  private final Configuration conf;
  private final ExtensionPoint extensionPoint;
  private final ParsePluginList parsePluginList;

  /**
   * Creates a factory bound to the given configuration.
   *
   * <p>
   * The parse plugin preferences are taken from the object cache if present,
   * otherwise they are read with a {@link ParsePluginsReader} and cached.
   * </p>
   *
   * @param conf
   *          the configuration used to look up the plugin repository and the
   *          object cache
   * @throws RuntimeException
   *           if the parser extension point is not found or the parse plugin
   *           preferences could not be loaded
   */
  public ParserFactory(Configuration conf) {
    this.conf = conf;
    ObjectCache objectCache = ObjectCache.get(conf);
    this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
        Parser.X_POINT_ID);

    String cacheKey = ParsePluginList.class.getName();
    ParsePluginList pluginList = (ParsePluginList) objectCache
        .getObject(cacheKey);
    if (pluginList == null) {
      pluginList = new ParsePluginsReader().parse(conf);
      objectCache.setObject(cacheKey, pluginList);
    }
    this.parsePluginList = pluginList;

    if (this.extensionPoint == null) {
      throw new RuntimeException("x point " + Parser.X_POINT_ID + " not found.");
    }
    if (this.parsePluginList == null) {
      throw new RuntimeException(
          "Parse Plugins preferences could not be loaded.");
    }
  }

  /**
   * Returns an array of {@link Parser}s for a given content type.
   *
   * The function consults the internal list of parse plugins for the
   * ParserFactory to determine the list of pluginIds, then gets the appropriate
   * extension points to instantiate as {@link Parser}s. Instantiated parsers
   * are cached by extension id. A parser that fails to instantiate is logged
   * and skipped, so the returned array may be shorter than the list of matching
   * extensions.
   *
   * @param contentType
   *          The contentType to return the <code>Array</code> of
   *          {@link Parser}s for.
   * @param url
   *          The url of the content; currently only used to build the
   *          {@link ParserNotFound} exception.
   * @return An <code>Array</code> of {@link Parser}s for the given contentType.
   *         If there were plugins mapped to a contentType via the
   *         <code>parse-plugins.xml</code> file, but never enabled via the
   *         <code>plugin.includes</code> Nutch conf, then those plugins won't
   *         be part of this array, i.e., they will be skipped. So, if the
   *         ordered list of parsing plugins for <code>text/plain</code> was
   *         <code>[parse-text, parse-html, parse-rtf]</code>, and only
   *         <code>parse-html</code> and <code>parse-rtf</code> were enabled via
   *         <code>plugin.includes</code>, then this ordered Array would consist
   *         of two {@link Parser} interfaces,
   *         <code>[parse-html, parse-rtf]</code>.
   * @throws ParserNotFound
   *           if no extension is mapped to, or claims to support, the given
   *           contentType.
   */
  public Parser[] getParsers(String contentType, String url)
      throws ParserNotFound {
    // TODO once the MimeTypes is available: first try the mapped content
    // type (MimeUtils.map(contentType)) and, as a last chance, guess the
    // content type from the file url (MimeUtils.getMimeType(url)).
    List<Extension> parserExts = getExtensions(contentType);
    if (parserExts == null) {
      throw new ParserNotFound(url, contentType);
    }

    ObjectCache objectCache = ObjectCache.get(conf);
    List<Parser> parsers = new ArrayList<>(parserExts.size());
    for (Extension ext : parserExts) {
      try {
        // check to see if we've cached this parser instance yet
        Parser parser = (Parser) objectCache.getObject(ext.getId());
        if (parser == null) {
          // go ahead and instantiate it and then cache it
          parser = (Parser) ext.getExtensionInstance();
          objectCache.setObject(ext.getId(), parser);
        }
        parsers.add(parser);
      } catch (PluginRuntimeException e) {
        // best effort: one broken plugin must not prevent the others
        LOG.warn(
            "ParserFactory:PluginRuntimeException when initializing parser "
                + "plugin {} instance because: {} - attempting to continue "
                + "instantiating parsers",
            ext.getDescriptor().getPluginId(), e.getMessage(), e);
      }
    }
    return parsers.toArray(new Parser[0]);
  }

  /**
   * Returns a {@link Parser} instance with the specified <code>id</code>,
   * representing its extension ID or an alias of it. If the Parser is already
   * in the object cache, the cached instance is returned. Otherwise the Parser
   * is instantiated and stored in the object cache under its extension ID.
   *
   * @param id
   *          The string extension ID (e.g.,
   *          "org.apache.nutch.parse.rss.RSSParser",
   *          "org.apache.nutch.parse.rtf.RTFParseFactory") of the
   *          {@link Parser} implementation to return, or an alias for it.
   * @return A {@link Parser} implementation specified by the parameter
   *         <code>id</code>.
   * @throws ParserNotFound
   *           If the Parser is not found (i.e., not registered with the
   *           extension point, including a <code>null</code> or unknown id),
   *           or if there is a {@link PluginRuntimeException} instantiating
   *           the {@link Parser}.
   */
  public Parser getParserById(String id) throws ParserNotFound {
    Extension[] extensions = this.extensionPoint.getExtensions();

    // Try the id as an extension id first, then as an alias.
    Extension parserExt = getExtension(extensions, id);
    if (parserExt == null) {
      parserExt = getExtensionFromAlias(extensions, id);
    }
    if (parserExt == null) {
      throw new ParserNotFound("No Parser Found for id [" + id + "]");
    }

    // first check the cache
    ObjectCache objectCache = ObjectCache.get(conf);
    Parser cached = (Parser) objectCache.getObject(parserExt.getId());
    if (cached != null) {
      return cached;
    }

    // if not found in cache, instantiate the Parser
    try {
      Parser parser = (Parser) parserExt.getExtensionInstance();
      objectCache.setObject(parserExt.getId(), parser);
      return parser;
    } catch (PluginRuntimeException e) {
      LOG.warn("Cannot initialize parser {} (cause: {})",
          parserExt.getDescriptor().getPluginId(), e.toString(), e);
      throw new ParserNotFound("Cannot init parser for id [" + id + "]");
    }
  }

  /**
   * Finds the best-suited parse plugin for a given contentType.
   *
   * <p>
   * The result is cached under the cleaned content type. A negative result is
   * cached as an empty list so the search is not repeated.
   * </p>
   *
   * @param contentType
   *          Content-Type for which we seek a parse plugin.
   * @return a list of extensions to be used for this contentType. If none,
   *         returns <code>null</code>.
   */
  @SuppressWarnings("unchecked")
  protected List<Extension> getExtensions(String contentType) {
    ObjectCache objectCache = ObjectCache.get(conf);
    // First of all, tries to clean the content-type
    String type = MimeUtil.cleanMimeType(contentType);

    List<Extension> extensions = (List<Extension>) objectCache.getObject(type);
    if (extensions == null) {
      extensions = findExtensions(type);
      // Put the empty extension list into cache to remember we don't know
      // any related extension.
      objectCache.setObject(type,
          (extensions != null) ? extensions : EMPTY_EXTENSION_LIST);
      return extensions;
    }
    // An empty cached list is the marker for "no extension known".
    return extensions.isEmpty() ? null : extensions;
  }

  /**
   * Searches a list of suitable parse plugins for the given contentType.
   * <p>
   * It first looks for a preferred plugin defined in the parse-plugin file. If
   * none is found, it returns a list of default plugins.
   *
   * @param contentType
   *          Content-Type for which we seek a parse plugin.
   * @return List - List of extensions to be used for this contentType. If none,
   *         returns null.
   */
  private List<Extension> findExtensions(String contentType) {
    Extension[] extensions = this.extensionPoint.getExtensions();

    // Look for a preferred plugin.
    List<String> preferredIds = this.parsePluginList.getPluginList(contentType);
    List<Extension> preferred = matchExtensions(preferredIds, extensions,
        contentType);
    if (preferred != null) {
      return preferred;
    }

    // If none found, look for a default plugin.
    List<String> defaultIds = this.parsePluginList
        .getPluginList(DEFAULT_PLUGIN);
    return matchExtensions(defaultIds, extensions, DEFAULT_PLUGIN);
  }

  /**
   * Tries to find a suitable parser for the given contentType.
   * <ol>
   * <li>If a <code>plugins</code> list is given, each listed plugin id is
   * looked up among the loaded extensions. A plugin that is listed but does
   * not claim the contentType is still used (with a warning); a plugin that is
   * not loaded at all is skipped (with a warning).</li>
   * <li>If no list is given, it collects the loaded extensions that claim to
   * support the contentType (wildcard extensions first) and informs the
   * user that they are not mapped.</li>
   * </ol>
   *
   * @param plugins
   *          List of candidate plugins, may be <code>null</code>.
   * @param extensions
   *          Array of loaded extensions.
   * @param contentType
   *          Content-Type for which we seek a parse plugin.
   * @return List - List of extensions to be used for this contentType. If none,
   *         returns null.
   */
  private List<Extension> matchExtensions(List<String> plugins,
      Extension[] extensions, String contentType) {
    List<Extension> extList = new ArrayList<>();

    if (plugins != null) {
      for (String parsePluginId : plugins) {
        Extension ext = getExtension(extensions, parsePluginId, contentType);
        if (ext == null) {
          // Either the plugin was mapped in parse-plugins.xml but not enabled
          // via plugin.includes, or it is enabled but its plugin.xml does not
          // claim the mime type. Retry by plugin id alone to tell them apart.
          ext = getExtension(extensions, parsePluginId);
          if (ext != null) {
            // plugin was enabled via plugin.includes, its plugin.xml just
            // doesn't claim to support that particular mimeType
            LOG.warn(
                "ParserFactory:Plugin: {} mapped to contentType {} via "
                    + "parse-plugins.xml, but its plugin.xml file does not "
                    + "claim to support contentType: {}",
                parsePluginId, contentType, contentType);
          } else {
            // plugin wasn't enabled via plugin.includes
            LOG.warn(
                "ParserFactory: Plugin: {} mapped to contentType {} via "
                    + "parse-plugins.xml, but not enabled via "
                    + "plugin.includes in nutch-default.xml",
                parsePluginId, contentType);
          }
        }
        if (ext != null) {
          extList.add(ext);
        }
      }
    } else {
      // No list of plugins is defined for this mimeType. However, there may
      // be plugins registered via the plugin.includes nutch conf property
      // that claim via their plugin.xml file to support this contentType:
      // collect them and report that they are not mapped.
      for (Extension extension : extensions) {
        String supported = extension.getAttribute(CONTENT_TYPE_ATTRIBUTE);
        if (DEFAULT_PLUGIN.equals(supported)) {
          // wildcard parsers go to the front of the list
          extList.add(0, extension);
        } else if (supported != null && contentType != null
            && contentType.matches(escapeContentType(supported))) {
          extList.add(extension);
        }
      }

      if (!extList.isEmpty()) {
        if (LOG.isInfoEnabled()) {
          StringBuilder extensionIds = new StringBuilder("[");
          String separator = "";
          for (Extension ext : extList) {
            extensionIds.append(separator).append(ext.getId());
            separator = " - ";
          }
          extensionIds.append("]");
          LOG.info(
              "The parsing plugins: {} are enabled via the plugin.includes "
                  + "system property, and all claim to support the content "
                  + "type {}, but they are not mapped to it in the "
                  + "parse-plugins.xml file",
              extensionIds, contentType);
        }
      } else {
        LOG.debug(
            "ParserFactory:No parse plugins mapped or enabled for contentType {}",
            contentType);
      }
    }

    return extList.isEmpty() ? null : extList;
  }

  /**
   * Escapes a <code>contentType</code> attribute in order to use it as a regex
   * (and keep backwards compatibility): only <code>+</code> and
   * <code>.</code> are escaped. This enables a single parser to accept
   * multiple types.
   */
  private String escapeContentType(String contentType) {
    return contentType.replace("+", "\\+").replace(".", "\\.");
  }

  /**
   * Checks whether an extension has the given id and supports the given type.
   *
   * @param extension
   *          the extension to test
   * @param id
   *          the extension id to look for, may be <code>null</code>
   * @param type
   *          the content type, or {@link #DEFAULT_PLUGIN}, may be
   *          <code>null</code>
   * @return <code>true</code> if the ids are equal and either the type is the
   *         wildcard, the extension declares the wildcard, or the type matches
   *         the extension's declared content type pattern
   */
  private boolean match(Extension extension, String id, String type) {
    if (id == null || !id.equals(extension.getId())) {
      return false;
    }
    if (DEFAULT_PLUGIN.equals(type)) {
      return true;
    }
    String supported = extension.getAttribute(CONTENT_TYPE_ATTRIBUTE);
    if (supported == null) {
      // extension declares no content type: it cannot match a concrete type
      return false;
    }
    return DEFAULT_PLUGIN.equals(supported)
        || (type != null && type.matches(escapeContentType(supported)));
  }

  /** Get an extension from its id and supported content-type. */
  private Extension getExtension(Extension[] list, String id, String type) {
    for (Extension extension : list) {
      if (match(extension, id, type)) {
        return extension;
      }
    }
    return null;
  }

  /**
   * Get an extension from its id only.
   *
   * @return the first extension with that id, or <code>null</code> if the id
   *         is <code>null</code> or no extension has it
   */
  private Extension getExtension(Extension[] list, String id) {
    if (id == null) {
      return null;
    }
    for (Extension extension : list) {
      if (id.equals(extension.getId())) {
        return extension;
      }
    }
    return null;
  }

  /**
   * Get an extension by resolving <code>id</code> through the alias table of
   * the parse plugin list.
   *
   * @return the aliased extension, or <code>null</code> if the id is
   *         <code>null</code>, has no alias, or the alias target is not loaded
   */
  private Extension getExtensionFromAlias(Extension[] list, String id) {
    if (id == null) {
      return null;
    }
    return getExtension(list, parsePluginList.getAliases().get(id));
  }
}