blob: 3d96a7b040c586f16020294a3e5c6880b3f0e245 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.microformats.reltag;
import java.lang.invoke.MethodHandles;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.StringUtil;
import org.apache.hadoop.conf.Configuration;
/**
* Adds microformat rel-tags of document if found.
*
* @see <a href="http://www.microformats.org/wiki/rel-tag">
* http://www.microformats.org/wiki/rel-tag</a>
*/
public class RelTagParser implements HtmlParseFilter {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
public final static String REL_TAG = "Rel-Tag";
private Configuration conf = null;
/**
* Scan the HTML document looking at possible rel-tags
*/
@Override
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
// get parse obj
Parse parse = parseResult.get(content.getUrl());
// Trying to find the document's rel-tags
Parser parser = new Parser(doc);
Set<?> tags = parser.getRelTags();
Iterator<?> iter = tags.iterator();
Metadata metadata = parse.getData().getParseMeta();
while (iter.hasNext())
metadata.add(REL_TAG, (String) iter.next());
return parseResult;
}
private static class Parser {
Set<String> tags = null;
Parser(Node node) {
tags = new TreeSet<String>();
parse(node);
}
Set<String> getRelTags() {
return tags;
}
void parse(Node node) {
if (node.getNodeType() == Node.ELEMENT_NODE) {
// Look for <a> tag
if ("a".equalsIgnoreCase(node.getNodeName())) {
NamedNodeMap attrs = node.getAttributes();
Node hrefNode = attrs.getNamedItem("href");
// Checks that it contains a href attribute
if (hrefNode != null) {
Node relNode = attrs.getNamedItem("rel");
// Checks that it contains a rel attribute too
if (relNode != null) {
// Finaly checks that rel=tag
if ("tag".equalsIgnoreCase(relNode.getNodeValue())) {
String tag = parseTag(hrefNode.getNodeValue());
if (!StringUtil.isEmpty(tag)) {
if (!tags.contains(tag)) {
tags.add(tag);
LOG.debug("Adding tag: " + tag + " to tag set.");
}
}
}
}
}
}
}
// Recurse
NodeList children = node.getChildNodes();
for (int i = 0; children != null && i < children.getLength(); i++)
parse(children.item(i));
}
private final static String parseTag(String url) {
String tag = null;
try {
URL u = new URL(url);
String path = u.getPath();
tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1),
"UTF-8");
} catch (Exception e) {
// Malformed tag...
tag = null;
}
return tag;
}
}
@Override
public void setConf(Configuration conf) {
this.conf = conf;
}
@Override
public Configuration getConf() {
return this.conf;
}
}