| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.nutch.protocol.file; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.net.URL; |
| import java.util.List; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.io.Text; |
| |
| import org.apache.nutch.crawl.CrawlDatum; |
| import org.apache.nutch.net.protocols.Response; |
| import org.apache.nutch.protocol.Content; |
| import org.apache.nutch.protocol.Protocol; |
| import org.apache.nutch.protocol.ProtocolOutput; |
| import org.apache.nutch.protocol.ProtocolStatus; |
| import org.apache.nutch.protocol.RobotRulesParser; |
| import org.apache.nutch.util.NutchConfiguration; |
| |
| import crawlercommons.robots.BaseRobotRules; |
| |
| /** |
| * This class is a protocol plugin used for file: scheme. It creates |
| * {@link FileResponse} object and gets the content of the url from it. |
| * Configurable parameters are {@code file.content.limit} and |
| * {@code file.crawl.parent} in nutch-default.xml defined under |
| * "file properties" section. |
| * |
| * @author John Xing |
| */ |
| public class File implements Protocol { |
| |
| protected static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| static final int MAX_REDIRECTS = 5; |
| |
| int maxContentLength; |
| boolean crawlParents; |
| |
| /** |
| * if true return a redirect for symbolic links and do not resolve the links |
| * internally |
| */ |
| boolean symlinksAsRedirects = true; |
| |
| private Configuration conf; |
| |
| public File() { |
| } |
| |
| /** |
| * Set the {@link Configuration} object |
| */ |
| public void setConf(Configuration conf) { |
| this.conf = conf; |
| this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024); |
| this.crawlParents = conf.getBoolean("file.crawl.parent", true); |
| this.symlinksAsRedirects = conf.getBoolean( |
| "file.crawl.redirect_noncanonical", true); |
| } |
| |
| /** |
| * Get the {@link Configuration} object |
| */ |
| public Configuration getConf() { |
| return this.conf; |
| } |
| |
| /** |
| * Set the length after at which content is truncated. |
| */ |
| public void setMaxContentLength(int maxContentLength) { |
| this.maxContentLength = maxContentLength; |
| } |
| |
| /** |
| * Creates a {@link FileResponse} object corresponding to the url and return a |
| * {@link ProtocolOutput} object as per the content received |
| * |
| * @param url |
| * Text containing the url |
| * @param datum |
| * The CrawlDatum object corresponding to the url |
| * |
| * @return {@link ProtocolOutput} object for the content of the file indicated |
| * by url |
| */ |
| public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { |
| String urlString = url.toString(); |
| try { |
| URL u = new URL(urlString); |
| |
| int redirects = 0; |
| |
| while (true) { |
| FileResponse response; |
| response = new FileResponse(u, datum, this, getConf()); // make a |
| // request |
| |
| int code = response.getCode(); |
| |
| if (code == 200) { // got a good response |
| return new ProtocolOutput(response.toContent()); // return it |
| |
| } else if (code == 304) { // got not modified |
| return new ProtocolOutput(response.toContent(), |
| ProtocolStatus.STATUS_NOTMODIFIED); |
| |
| } else if (code == 401) { // access denied / no read permissions |
| return new ProtocolOutput(response.toContent(), new ProtocolStatus( |
| ProtocolStatus.ACCESS_DENIED)); |
| |
| } else if (code == 404) { // no such file |
| return new ProtocolOutput(response.toContent(), |
| ProtocolStatus.STATUS_NOTFOUND); |
| |
| } else if (code >= 300 && code < 400) { // handle redirect |
| u = new URL(response.getHeader("Location")); |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("redirect to " + u); |
| } |
| if (symlinksAsRedirects) { |
| return new ProtocolOutput(response.toContent(), new ProtocolStatus( |
| ProtocolStatus.MOVED, u)); |
| } else if (redirects == MAX_REDIRECTS) { |
| LOG.trace("Too many redirects: {}", url); |
| return new ProtocolOutput(response.toContent(), new ProtocolStatus( |
| ProtocolStatus.REDIR_EXCEEDED, u)); |
| } |
| redirects++; |
| |
| } else { // convert to exception |
| throw new FileError(code); |
| } |
| } |
| } catch (Exception e) { |
| e.printStackTrace(); |
| return new ProtocolOutput(null, new ProtocolStatus(e)); |
| } |
| } |
| |
| /** |
| * Quick way for running this class. Useful for debugging. |
| */ |
| public static void main(String[] args) throws Exception { |
| int maxContentLength = Integer.MIN_VALUE; |
| boolean dumpContent = false; |
| String urlString = null; |
| |
| String usage = "Usage: File [-maxContentLength L] [-dumpContent] url"; |
| |
| if (args.length == 0) { |
| System.err.println(usage); |
| System.exit(-1); |
| } |
| |
| for (int i = 0; i < args.length; i++) { |
| if (args[i].equals("-maxContentLength")) { |
| maxContentLength = Integer.parseInt(args[++i]); |
| } else if (args[i].equals("-dumpContent")) { |
| dumpContent = true; |
| } else if (i != args.length - 1) { |
| System.err.println(usage); |
| System.exit(-1); |
| } else |
| urlString = args[i]; |
| } |
| |
| File file = new File(); |
| file.setConf(NutchConfiguration.create()); |
| |
| if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength |
| file.setMaxContentLength(maxContentLength); |
| |
| // set log level |
| // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase())); |
| |
| ProtocolOutput output = file.getProtocolOutput(new Text(urlString), |
| new CrawlDatum()); |
| Content content = output.getContent(); |
| |
| System.err.println("URL: " + content.getUrl()); |
| System.err.println("Status: " + output.getStatus()); |
| System.err.println("Content-Type: " + content.getContentType()); |
| System.err.println("Content-Length: " |
| + content.getMetadata().get(Response.CONTENT_LENGTH)); |
| System.err.println("Last-Modified: " |
| + content.getMetadata().get(Response.LAST_MODIFIED)); |
| String redirectLocation = content.getMetadata().get("Location"); |
| if (redirectLocation != null) { |
| System.err.println("Location: " + redirectLocation); |
| } |
| |
| if (dumpContent) { |
| System.out.print(new String(content.getContent())); |
| } |
| |
| file = null; |
| } |
| |
| /** |
| * No robots parsing is done for file protocol. So this returns a set of empty |
| * rules which will allow every url. |
| */ |
| @Override |
| public BaseRobotRules getRobotRules(Text url, CrawlDatum datum, |
| List<Content> robotsTxtContent) { |
| return RobotRulesParser.EMPTY_RULES; |
| } |
| |
| } |