| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.parse.html; |
| |
| import java.net.MalformedURLException; |
| import java.net.URL; |
| import java.util.Locale; |
| |
| import org.apache.nutch.parse.HTMLMetaTags; |
| import org.w3c.dom.NamedNodeMap; |
| import org.w3c.dom.Node; |
| import org.w3c.dom.NodeList; |
| |
| /** |
| * Class for parsing META Directives from DOM trees. This class handles |
| * specifically Robots META directives (all, none, nofollow, noindex), finding |
| * BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are |
| * stored in a HTMLMetaTags instance. |
| */ |
| public class HTMLMetaProcessor { |
| |
| /** |
| * Utility class with indicators for the robots directives "noindex" and |
| * "nofollow", and HTTP-EQUIV/no-cache |
| */ |
| |
| /** |
| * Sets the indicators in <code>robotsMeta</code> to appropriate values, based |
| * on any META tags found under the given <code>node</code>. |
| */ |
| public static final void getMetaTags(HTMLMetaTags metaTags, Node node, |
| URL currURL) { |
| |
| metaTags.reset(); |
| getMetaTagsHelper(metaTags, node, currURL); |
| } |
| |
| private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node, |
| URL currURL) { |
| |
| if (node.getNodeType() == Node.ELEMENT_NODE) { |
| |
| if ("body".equalsIgnoreCase(node.getNodeName())) { |
| // META tags should not be under body |
| return; |
| } |
| |
| if ("meta".equalsIgnoreCase(node.getNodeName())) { |
| NamedNodeMap attrs = node.getAttributes(); |
| Node nameNode = null; |
| Node equivNode = null; |
| Node contentNode = null; |
| // Retrieves name, http-equiv and content attribues |
| for (int i = 0; i < attrs.getLength(); i++) { |
| Node attr = attrs.item(i); |
| String attrName = attr.getNodeName().toLowerCase(); |
| if (attrName.equals("name")) { |
| nameNode = attr; |
| } else if (attrName.equals("http-equiv")) { |
| equivNode = attr; |
| } else if (attrName.equals("content")) { |
| contentNode = attr; |
| } |
| } |
| |
| if (nameNode != null) { |
| if (contentNode != null) { |
| String name = nameNode.getNodeValue().toLowerCase(); |
| metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); |
| if ("robots".equals(name)) { |
| String directives = contentNode.getNodeValue().toLowerCase(); |
| int index = directives.indexOf("none"); |
| |
| if (index >= 0) { |
| metaTags.setNoIndex(); |
| metaTags.setNoFollow(); |
| } |
| |
| index = directives.indexOf("all"); |
| if (index >= 0) { |
| // do nothing... |
| } |
| |
| index = directives.indexOf("noindex"); |
| if (index >= 0) { |
| metaTags.setNoIndex(); |
| } |
| |
| index = directives.indexOf("nofollow"); |
| if (index >= 0) { |
| metaTags.setNoFollow(); |
| } |
| |
| index = directives.indexOf("noarchive"); |
| if (index >= 0) { |
| metaTags.setNoCache(); |
| } |
| |
| } // end if (name == robots) |
| } |
| } |
| |
| if (equivNode != null) { |
| if (contentNode != null) { |
| String name = equivNode.getNodeValue().toLowerCase(); |
| String content = contentNode.getNodeValue(); |
| metaTags.getHttpEquivTags().setProperty(name, content); |
| if ("pragma".equals(name)) { |
| content = content.toLowerCase(); |
| int index = content.indexOf("no-cache"); |
| if (index >= 0) |
| metaTags.setNoCache(); |
| } else if ("refresh".equals(name)) { |
| int idx = content.indexOf(';'); |
| String time = null; |
| if (idx == -1) { // just the refresh time |
| time = content; |
| } else |
| time = content.substring(0, idx); |
| try { |
| metaTags.setRefreshTime(Integer.parseInt(time)); |
| // skip this if we couldn't parse the time |
| metaTags.setRefresh(true); |
| } catch (Exception e) { |
| ; |
| } |
| URL refreshUrl = null; |
| if (metaTags.getRefresh() && idx != -1) { // set the URL |
| idx = content.toLowerCase().indexOf("url="); |
| if (idx == -1) { // assume a mis-formatted entry with just the |
| // url |
| idx = content.indexOf(';') + 1; |
| } else |
| idx += 4; |
| if (idx != -1) { |
| String url = content.substring(idx); |
| try { |
| refreshUrl = new URL(url); |
| } catch (Exception e) { |
| // XXX according to the spec, this has to be an absolute |
| // XXX url. However, many websites use relative URLs and |
| // XXX expect browsers to handle that. |
| // XXX Unfortunately, in some cases this may create a |
| // XXX infinitely recursive paths (a crawler trap)... |
| // if (!url.startsWith("/")) url = "/" + url; |
| try { |
| refreshUrl = new URL(currURL, url); |
| } catch (Exception e1) { |
| refreshUrl = null; |
| } |
| } |
| } |
| } |
| if (metaTags.getRefresh()) { |
| if (refreshUrl == null) { |
| // apparently only refresh time was present. set the URL |
| // to the same URL. |
| refreshUrl = currURL; |
| } |
| metaTags.setRefreshHref(refreshUrl); |
| } |
| } |
| } |
| } |
| |
| } else if ("base".equalsIgnoreCase(node.getNodeName())) { |
| NamedNodeMap attrs = node.getAttributes(); |
| Node hrefNode = attrs.getNamedItem("href"); |
| |
| if (hrefNode != null) { |
| String urlString = hrefNode.getNodeValue(); |
| |
| URL url = null; |
| try { |
| if (currURL == null) |
| url = new URL(urlString); |
| else |
| url = new URL(currURL, urlString); |
| } catch (Exception e) { |
| ; |
| } |
| |
| if (url != null) |
| metaTags.setBaseHref(url); |
| } |
| |
| } |
| |
| } |
| |
| NodeList children = node.getChildNodes(); |
| if (children != null) { |
| int len = children.getLength(); |
| for (int i = 0; i < len; i++) { |
| getMetaTagsHelper(metaTags, children.item(i), currURL); |
| } |
| } |
| } |
| |
| } |