blob: 4e7ef14113bbc740915e1198a2d743e1e8afc597 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.html;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Locale;
import org.apache.nutch.parse.HTMLMetaTags;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
* Class for parsing META Directives from DOM trees. This class handles
* specifically Robots META directives (all, none, nofollow, noindex), finding
* BASE HREF tags, and HTTP-EQUIV no-cache instructions. All meta directives are
* stored in a HTMLMetaTags instance.
*/
public class HTMLMetaProcessor {

  /**
   * Resets <code>metaTags</code> and then fills it in from the META and BASE
   * elements found in the DOM subtree rooted at <code>node</code>. Subtrees
   * rooted at a BODY element are not searched.
   *
   * @param metaTags
   *          holder that is reset and then populated
   * @param node
   *          root of the DOM (sub)tree to scan
   * @param currURL
   *          URL of the document being parsed; used to resolve relative BASE
   *          HREF and refresh targets, and used as the refresh target when a
   *          refresh directive carries no usable URL
   */
  public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
      URL currURL) {
    metaTags.reset();
    getMetaTagsHelper(metaTags, node, currURL);
  }

  /**
   * Recursive worker: handles <code>node</code> itself if it is a META or BASE
   * element, then descends into its children. Recursion stops at BODY.
   */
  private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
      URL currURL) {
    if (node.getNodeType() == Node.ELEMENT_NODE) {
      String tagName = node.getNodeName();
      if ("body".equalsIgnoreCase(tagName)) {
        // META tags should not be under body
        return;
      }
      if ("meta".equalsIgnoreCase(tagName)) {
        processMeta(metaTags, node, currURL);
      } else if ("base".equalsIgnoreCase(tagName)) {
        processBase(metaTags, node, currURL);
      }
    }
    NodeList children = node.getChildNodes();
    if (children != null) {
      int len = children.getLength();
      for (int i = 0; i < len; i++) {
        getMetaTagsHelper(metaTags, children.item(i), currURL);
      }
    }
  }

  /**
   * Reads the name, http-equiv and content attributes of a META element and
   * dispatches to the matching handler. A META without a content attribute is
   * ignored.
   */
  private static void processMeta(HTMLMetaTags metaTags, Node meta,
      URL currURL) {
    NamedNodeMap attrs = meta.getAttributes();
    Node nameNode = null;
    Node equivNode = null;
    Node contentNode = null;
    for (int i = 0; i < attrs.getLength(); i++) {
      Node attr = attrs.item(i);
      // Locale.ROOT: the default-locale toLowerCase() maps 'I' to a dotless
      // i under e.g. a Turkish locale, which would break "http-equiv".
      String attrName = attr.getNodeName().toLowerCase(Locale.ROOT);
      if (attrName.equals("name")) {
        nameNode = attr;
      } else if (attrName.equals("http-equiv")) {
        equivNode = attr;
      } else if (attrName.equals("content")) {
        contentNode = attr;
      }
    }
    if (contentNode == null) {
      return;
    }
    String content = contentNode.getNodeValue();
    if (nameNode != null) {
      processNamedMeta(metaTags,
          nameNode.getNodeValue().toLowerCase(Locale.ROOT), content);
    }
    if (equivNode != null) {
      processHttpEquiv(metaTags,
          equivNode.getNodeValue().toLowerCase(Locale.ROOT), content, currURL);
    }
  }

  /**
   * Records a <code>name</code>/<code>content</code> META pair in the general
   * tags and, for <code>name="robots"</code>, applies the directives found in
   * the content. Directives are detected by substring match.
   *
   * @param name
   *          lower-cased value of the name attribute
   * @param content
   *          raw value of the content attribute
   */
  private static void processNamedMeta(HTMLMetaTags metaTags, String name,
      String content) {
    metaTags.getGeneralTags().add(name, content);
    if (!"robots".equals(name)) {
      return;
    }
    String directives = content.toLowerCase(Locale.ROOT);
    if (directives.contains("none")) {
      metaTags.setNoIndex();
      metaTags.setNoFollow();
    }
    // "all" needs no handling: nothing is set for it.
    if (directives.contains("noindex")) {
      metaTags.setNoIndex();
    }
    if (directives.contains("nofollow")) {
      metaTags.setNoFollow();
    }
    if (directives.contains("noarchive")) {
      metaTags.setNoCache();
    }
  }

  /**
   * Records an <code>http-equiv</code>/<code>content</code> META pair and
   * interprets the <code>pragma</code> and <code>refresh</code> directives.
   *
   * @param name
   *          lower-cased value of the http-equiv attribute
   * @param content
   *          raw value of the content attribute
   */
  private static void processHttpEquiv(HTMLMetaTags metaTags, String name,
      String content, URL currURL) {
    metaTags.getHttpEquivTags().setProperty(name, content);
    if ("pragma".equals(name)) {
      if (content.toLowerCase(Locale.ROOT).contains("no-cache")) {
        metaTags.setNoCache();
      }
    } else if ("refresh".equals(name)) {
      processRefresh(metaTags, content, currURL);
    }
  }

  /**
   * Interprets the content of an <code>http-equiv="refresh"</code> META,
   * expected in the form <code>seconds[; url=target]</code>.
   */
  private static void processRefresh(HTMLMetaTags metaTags, String content,
      URL currURL) {
    int sep = content.indexOf(';');
    // Without a ';' the whole content is just the refresh time.
    String time = (sep == -1) ? content : content.substring(0, sep);
    try {
      // trim(): whitespace around the delay ("5 ; url=...") must not make
      // the number unparsable.
      int seconds = Integer.parseInt(time.trim());
      metaTags.setRefreshTime(seconds);
      metaTags.setRefresh(true);
    } catch (NumberFormatException ignored) {
      // Unparsable delay: the refresh flag is deliberately left untouched.
    }
    if (!metaTags.getRefresh()) {
      return;
    }
    URL refreshUrl = null;
    if (sep != -1) {
      // Search the original string case-insensitively. Lower-casing a copy
      // can change its length, which would misalign the index.
      int start = indexOfIgnoreCase(content, "url=");
      if (start == -1) {
        // assume a mis-formatted entry with just the url after the ';'
        start = sep + 1;
      } else {
        start += 4;
      }
      refreshUrl = resolveRefreshTarget(currURL, content.substring(start));
    }
    if (refreshUrl == null) {
      // apparently only refresh time was present. set the URL
      // to the same URL.
      refreshUrl = currURL;
    }
    metaTags.setRefreshHref(refreshUrl);
  }

  /**
   * Parses a refresh target, first as an absolute URL and then relative to
   * <code>currURL</code>.
   *
   * @return the resolved URL, or <code>null</code> if neither attempt yields
   *         a valid URL
   */
  private static URL resolveRefreshTarget(URL currURL, String target) {
    try {
      return new URL(target);
    } catch (MalformedURLException e) {
      // XXX according to the spec, this has to be an absolute
      // XXX url. However, many websites use relative URLs and
      // XXX expect browsers to handle that.
      // XXX Unfortunately, in some cases this may create a
      // XXX infinitely recursive paths (a crawler trap)...
      try {
        return new URL(currURL, target);
      } catch (MalformedURLException e1) {
        return null;
      }
    }
  }

  /**
   * Sets the base href from a BASE element's <code>href</code> attribute,
   * resolved against <code>currURL</code> when that is non-null. A missing or
   * malformed href leaves the base href unchanged.
   */
  private static void processBase(HTMLMetaTags metaTags, Node base,
      URL currURL) {
    Node hrefNode = base.getAttributes().getNamedItem("href");
    if (hrefNode == null) {
      return;
    }
    String urlString = hrefNode.getNodeValue();
    URL url = null;
    try {
      if (currURL == null) {
        url = new URL(urlString);
      } else {
        url = new URL(currURL, urlString);
      }
    } catch (MalformedURLException ignored) {
      // Best effort: an unusable BASE HREF is simply not recorded.
    }
    if (url != null) {
      metaTags.setBaseHref(url);
    }
  }

  /**
   * Returns the index of the first case-insensitive occurrence of
   * <code>needle</code> in <code>text</code>, or -1 if there is none. The
   * index refers to <code>text</code> itself, not to a case-converted copy.
   */
  private static int indexOfIgnoreCase(String text, String needle) {
    int last = text.length() - needle.length();
    for (int i = 0; i <= last; i++) {
      if (text.regionMatches(true, i, needle, 0, needle.length())) {
        return i;
      }
    }
    return -1;
  }
}