| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.parse.ext; |
| |
| import org.apache.nutch.protocol.Content; |
| import org.apache.nutch.parse.ParseResult; |
| import org.apache.nutch.parse.ParseStatus; |
| import org.apache.nutch.parse.Parser; |
| import org.apache.nutch.parse.ParseData; |
| import org.apache.nutch.parse.ParseImpl; |
| import org.apache.nutch.parse.Outlink; |
| import org.apache.nutch.parse.OutlinkExtractor; |
| |
| import org.apache.nutch.util.CommandRunner; |
| import org.apache.nutch.net.protocols.Response; |
| import org.apache.hadoop.conf.Configuration; |
| |
| import org.apache.nutch.plugin.Extension; |
| import org.apache.nutch.plugin.PluginRepository; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import java.util.Hashtable; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.io.ByteArrayInputStream; |
| import java.io.ByteArrayOutputStream; |
| import java.nio.charset.Charset; |
| |
| /** |
| * A wrapper that invokes external command to do real parsing job. |
| * |
| * @author John Xing |
| */ |
| |
| public class ExtParser implements Parser { |
| |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| static final int BUFFER_SIZE = 4096; |
| |
| static final int TIMEOUT_DEFAULT = 30; // in seconds |
| |
| // handy map from String contentType to String[] {command, timeoutString, |
| // encoding} |
| Hashtable<String, String[]> TYPE_PARAMS_MAP = new Hashtable<String, String[]>(); |
| |
| private Configuration conf; |
| |
| public ExtParser() { |
| } |
| |
| @Override |
| public ParseResult getParse(Content content) { |
| |
| String contentType = content.getContentType(); |
| |
| String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType); |
| if (params == null) |
| return new ParseStatus(ParseStatus.FAILED, |
| "No external command defined for contentType: " + contentType) |
| .getEmptyParseResult(content.getUrl(), getConf()); |
| |
| String command = params[0]; |
| int timeout = Integer.parseInt(params[1]); |
| String encoding = params[2]; |
| |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("Use " + command + " with timeout=" + timeout + "secs"); |
| } |
| |
| String text = null; |
| String title = null; |
| |
| try { |
| |
| byte[] raw = content.getContent(); |
| |
| String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); |
| if (contentLength != null |
| && raw.length != Integer.parseInt(contentLength)) { |
| return new ParseStatus(ParseStatus.FAILED, |
| ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length |
| + " bytes. Parser can't handle incomplete " + contentType |
| + " file.").getEmptyParseResult(content.getUrl(), getConf()); |
| } |
| |
| ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE); |
| ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4); |
| |
| CommandRunner cr = new CommandRunner(); |
| |
| cr.setCommand(command + " " + contentType); |
| cr.setInputStream(new ByteArrayInputStream(raw)); |
| cr.setStdOutputStream(os); |
| cr.setStdErrorStream(es); |
| |
| cr.setTimeout(timeout); |
| |
| cr.evaluate(); |
| |
| if (cr.getExitValue() != 0) |
| return new ParseStatus(ParseStatus.FAILED, "External command " |
| + command + " failed with error: " + es.toString()) |
| .getEmptyParseResult(content.getUrl(), getConf()); |
| |
| text = os.toString(encoding); |
| |
| } catch (Exception e) { // run time exception |
| return new ParseStatus(e) |
| .getEmptyParseResult(content.getUrl(), getConf()); |
| } |
| |
| if (text == null) |
| text = ""; |
| |
| if (title == null) |
| title = ""; |
| |
| // collect outlink |
| Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf()); |
| |
| ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, |
| outlinks, content.getMetadata()); |
| return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, |
| parseData)); |
| } |
| |
| @Override |
| public void setConf(Configuration conf) { |
| this.conf = conf; |
| Extension[] extensions = PluginRepository.get(conf) |
| .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions(); |
| |
| String contentType, command, timeoutString, encoding; |
| |
| for (int i = 0; i < extensions.length; i++) { |
| Extension extension = extensions[i]; |
| |
| // only look for extensions defined by plugin parse-ext |
| if (!extension.getDescriptor().getPluginId().equals("parse-ext")) |
| continue; |
| |
| contentType = extension.getAttribute("contentType"); |
| if (contentType == null || contentType.equals("")) |
| continue; |
| |
| command = extension.getAttribute("command"); |
| if (command == null || command.equals("")) |
| continue; |
| |
| // null encoding means default |
| encoding = extension.getAttribute("encoding"); |
| if (encoding == null) |
| encoding = Charset.defaultCharset().name(); |
| |
| timeoutString = extension.getAttribute("timeout"); |
| if (timeoutString == null || timeoutString.equals("")) |
| timeoutString = "" + TIMEOUT_DEFAULT; |
| |
| TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString, |
| encoding }); |
| } |
| } |
| |
| @Override |
| public Configuration getConf() { |
| return this.conf; |
| } |
| } |