/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.feed;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.List;
import java.util.Map.Entry;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.metadata.Feed;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.ParserFactory;
import org.apache.nutch.parse.ParserNotFound;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.EncodingDetector;
import org.apache.nutch.util.NutchConfiguration;
import org.xml.sax.InputSource;
import com.rometools.rome.feed.synd.SyndCategory;
import com.rometools.rome.feed.synd.SyndContent;
import com.rometools.rome.feed.synd.SyndEntry;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.feed.synd.SyndPerson;
import com.rometools.rome.io.SyndFeedInput;

/**
 * <p>
 * An RSS/ATOM feed {@link Parser} that parses the feed itself as well as all
 * referenced links and content present in the feed.
 * </p>
 *
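 * <p>
 * A minimal usage sketch (assuming the feed bytes have already been fetched
 * into a {@link Content} object, as the {@code main} method below does):
 * </p>
 *
 * <pre>{@code
 * FeedParser parser = new FeedParser();
 * parser.setConf(NutchConfiguration.create());
 * // one Parse per feed item, keyed by item link, plus one for the feed page
 * ParseResult result = parser.getParse(content);
 * }</pre>
 *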
 * @author dogacan
 * @author mattmann
 * @since NUTCH-444
*/
public class FeedParser implements Parser {

  public static final String CHARSET_UTF8 = "charset=UTF-8";

  public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; "
      + CHARSET_UTF8;

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  private Configuration conf;

  private ParserFactory parserFactory;

  private URLNormalizers normalizers;

  private URLFilters filters;

  private String defaultEncoding;

  /**
   * Parses the given feed, then extracts and parses all linked items within
   * the feed, using the underlying ROME feed parsing library.
   *
   * @param content
   *          A {@link Content} object representing the feed that is being
   *          parsed by this {@link Parser}.
   *
   * @return A {@link ParseResult} containing a {@link Parse} for every item
   *         present in the feed file that this {@link Parser} dealt with.
   */
@Override
public ParseResult getParse(Content content) {
SyndFeed feed = null;
ParseResult parseResult = new ParseResult(content.getUrl());
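    // Detect the charset from protocol headers and byte-level clues so the
    // raw feed bytes are decoded correctly before being handed to ROME.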
EncodingDetector detector = new EncodingDetector(conf);
detector.autoDetectClues(content, true);
String encoding = detector.guessEncoding(content, defaultEncoding);
try {
InputSource input = new InputSource(new ByteArrayInputStream(
content.getContent()));
input.setEncoding(encoding);
SyndFeedInput feedInput = new SyndFeedInput();
feed = feedInput.build(input);
} catch (Exception e) {
// return empty parse
      LOG.warn("Parse failed: url: {}, exception: {}", content.getUrl(),
          StringUtils.stringifyException(e));
return new ParseStatus(e)
.getEmptyParseResult(content.getUrl(), getConf());
}
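    // Normalize and filter the feed's own link; if it survives, every item
    // will record it under the "feed" key of its parse metadata.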
String feedLink = feed.getLink();
try {
feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
if (feedLink != null)
feedLink = filters.filter(feedLink);
} catch (Exception e) {
feedLink = null;
}
    List<SyndEntry> entries = feed.getEntries();
    for (SyndEntry entry : entries) {
      addToMap(parseResult, feed, feedLink, entry, content);
    }
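    // Finally, register an entry for the feed page itself, keyed by the
    // feed URL and carrying the feed's title and description.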
String feedDesc = stripTags(feed.getDescriptionEx());
String feedTitle = stripTags(feed.getTitleEx());
parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
content.getMetadata()));
return parseResult;
}

  /**
*
* Sets the {@link Configuration} object for this {@link Parser}. This
* {@link Parser} expects the following configuration properties to be set:
*
* <ul>
* <li>URLNormalizers - properties in the configuration object to set up the
* default url normalizers.</li>
* <li>URLFilters - properties in the configuration object to set up the
* default url filters.</li>
* </ul>
*
* @param conf
* The Hadoop {@link Configuration} object to use to configure this
* {@link Parser}.
*
*/
@Override
public void setConf(Configuration conf) {
this.conf = conf;
this.parserFactory = new ParserFactory(conf);
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
this.filters = new URLFilters(conf);
this.defaultEncoding = conf.get("parser.character.encoding.default",
"windows-1252");
}

  /**
*
* @return The {@link Configuration} object used to configure this
* {@link Parser}.
*/
@Override
public Configuration getConf() {
return this.conf;
}

  /**
* Runs a command line version of this {@link Parser}.
*
   * @param args
   *          A single argument (expected at args[0]) representing a path on
   *          the local filesystem that points to a feed file.
*
* @throws Exception
* If any error occurs.
*/
public static void main(String[] args) throws Exception {
if (args.length != 1) {
System.err.println("Usage: FeedParser <feed>");
System.exit(1);
}
String name = args[0];
String url = "file:" + name;
Configuration conf = NutchConfiguration.create();
FeedParser parser = new FeedParser();
parser.setConf(conf);
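    // Read the whole feed file into memory; readFully needs a buffer sized
    // to the exact file length.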
    File file = new File(name);
    byte[] bytes = new byte[(int) file.length()];
    try (DataInputStream in = new DataInputStream(new FileInputStream(file))) {
      in.readFully(bytes);
    }
ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
"application/rss+xml", new Metadata(), conf));
for (Entry<Text, Parse> entry : parseResult) {
System.out.println("key: " + entry.getKey());
Parse parse = entry.getValue();
System.out.println("data: " + parse.getData());
System.out.println("text: " + parse.getText() + "\n");
}
}
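
  /**
   * Parses a single feed item and adds the resulting {@link Parse} to
   * {@code parseResult}, keyed by the item's normalized link.
   */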
private void addToMap(ParseResult parseResult, SyndFeed feed,
String feedLink, SyndEntry entry, Content content) {
String link = entry.getLink(), text = null, title = null;
Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
Parse parse = null;
SyndContent description = entry.getDescription();
try {
link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
if (link != null)
link = filters.filter(link);
    } catch (Exception e) {
      // A malformed link should not abort the whole feed: log and skip it.
      LOG.warn("Failed to normalize/filter link {}: {}", link,
          StringUtils.stringifyException(e));
      return;
    }
if (link == null)
return;
title = stripTags(entry.getTitleEx());
if (feedLink != null)
parseMeta.set("feed", feedLink);
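    // Copy author, tag, date, and content-type fields into the metadata.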
addFields(parseMeta, contentMeta, feed, entry);
// some item descriptions contain markup text in them,
// so we temporarily set their content-type to parse them
// with another plugin
String contentType = contentMeta.get(Response.CONTENT_TYPE);
if (description != null)
text = description.getValue();
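    // No usable description text: fall back to concatenating the item's
    // content blocks.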
    if (text == null) {
      List<SyndContent> contents = entry.getContents();
      StringBuilder buf = new StringBuilder();
      for (SyndContent syndContent : contents) {
        // skip content blocks whose value is null so "null" is never appended
        if (syndContent.getValue() != null) {
          buf.append(syndContent.getValue());
        }
      }
      text = buf.toString();
    }
    try {
      // Delegate to the parser registered for the (possibly rewritten)
      // content type, e.g. parse-html when the description is HTML.
      Parser parser = parserFactory.getParsers(contentType, link)[0];
      // The content type advertises UTF-8 (see addFields), so encode the
      // text explicitly instead of using the platform default charset.
      parse = parser.getParse(
          new Content(link, link, text.getBytes(StandardCharsets.UTF_8),
              contentType, contentMeta, conf)).get(link);
    } catch (ParserNotFound e) {
      // No parser for this content type: fall through and store raw text.
    }
if (parse != null) {
ParseData data = parse.getData();
data.getContentMeta().remove(Response.CONTENT_TYPE);
mergeMetadata(data.getParseMeta(), parseMeta);
parseResult.put(link, new ParseText(parse.getText()),
new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(),
data.getContentMeta(), data.getParseMeta()));
} else {
contentMeta.remove(Response.CONTENT_TYPE);
parseResult.put(link, new ParseText(text), new ParseData(
ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
parseMeta));
}
}

  /**
   * Removes markup tags from the value of the given {@link SyndContent} and
   * trims surrounding whitespace; returns the empty string for null input.
   */
  private static String stripTags(SyndContent c) {
    if (c == null || c.getValue() == null)
      return "";
    // Equivalent to splitting the value on tags and re-joining the fragments.
    return c.getValue().replaceAll("<[^>]*>", "").trim();
  }

  /**
   * Copies per-item fields (authors, tags, publication dates, content type)
   * from the feed entry into the parse and content metadata.
   */
  private void addFields(Metadata parseMeta, Metadata contentMeta,
      SyndFeed feed, SyndEntry entry) {
    List<SyndPerson> authors = entry.getAuthors();
    List<SyndCategory> categories = entry.getCategories();
    Date published = entry.getPublishedDate(), updated = entry.getUpdatedDate();
    String contentType = null;
    if (authors != null && !authors.isEmpty()) {
      for (SyndPerson author : authors) {
        String authorName = author.getName();
        if (checkString(authorName)) {
          parseMeta.add(Feed.FEED_AUTHOR, authorName);
        }
      }
    } else {
      // For non-Atom feeds ROME reports no per-entry authors (a null or
      // empty list); fall back to getAuthor(), which also consults the
      // Dublin Core module's creator.
      String authorName = entry.getAuthor();
      if (checkString(authorName)) {
        parseMeta.set(Feed.FEED_AUTHOR, authorName);
      }
    }
    for (SyndCategory category : categories) {
      parseMeta.add(Feed.FEED_TAGS, category.getName());
    }
if (published != null) {
parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime()));
}
if (updated != null) {
parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime()));
}
SyndContent description = entry.getDescription();
if (description != null) {
contentType = description.getType();
    } else {
      // No description: take the type of the first content block.
      // TODO: What to do if contents.size() > 1?
      List<SyndContent> contents = entry.getContents();
      if (!contents.isEmpty()) {
        contentType = contents.get(0).getType();
      }
    }
if (checkString(contentType)) {
      // ROME may report bare type tokens such as "html" or "xhtml";
      // map them to MIME types before storing the header.
      if (contentType.equals("html"))
        contentType = "text/html";
      else if (contentType.equals("xhtml"))
        contentType = "text/xhtml";
contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + CHARSET_UTF8);
} else {
contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE);
}
}
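
  /** Adds every value of every key in {@code second} to {@code first}. */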
private void mergeMetadata(Metadata first, Metadata second) {
for (String name : second.names()) {
String[] values = second.getValues(name);
for (String value : values) {
first.add(name, value);
}
}
}

  /** Returns true if {@code s} is non-null and non-empty. */
  private boolean checkString(String s) {
    return s != null && !s.isEmpty();
  }
}