src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.parse.feed;

 import java.lang.invoke.MethodHandles;
 import java.io.ByteArrayInputStream;
 import java.io.DataInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.util.Date;
 import java.util.List;
 import java.util.Map.Entry;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.metadata.Feed;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.ParseText;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.ParserFactory;
 import org.apache.nutch.parse.ParserNotFound;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.EncodingDetector;
 import org.apache.nutch.util.NutchConfiguration;
 import org.xml.sax.InputSource;

 import com.rometools.rome.feed.synd.SyndCategory;
 import com.rometools.rome.feed.synd.SyndContent;
 import com.rometools.rome.feed.synd.SyndEntry;
 import com.rometools.rome.feed.synd.SyndFeed;
 import com.rometools.rome.feed.synd.SyndPerson;
 import com.rometools.rome.io.SyndFeedInput;

 /**
  * RSS/Atom feed {@link Parser} built on the ROME feed parsing library.
  *
  * <p>
  * For every entry of a feed whose link survives URL normalization and
  * filtering, one {@link Parse} keyed by the entry link is added to the
  * {@link ParseResult}. The entry body is handed to the {@link Parser} that
  * the {@link ParserFactory} selects for the entry's declared content type.
  * One additional {@link Parse} describing the feed document itself is stored
  * under the feed URL.
  * </p>
  *
  * @author dogacan
  * @author mattmann
  * @since NUTCH-444
  */
 public class FeedParser implements Parser {

   /** Charset parameter appended to the content type set for every entry. */
   public static final String CHARSET_UTF8 = "charset=UTF-8";

   /** Content type assigned to entries that do not declare one themselves. */
   public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; "
       + CHARSET_UTF8;

   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());

   /** Matches anything that looks like a markup tag; compiled only once. */
   private static final java.util.regex.Pattern TAG_PATTERN =
       java.util.regex.Pattern.compile("<[^>]*>");

   private Configuration conf;

   private ParserFactory parserFactory;

   private URLNormalizers normalizers;

   private URLFilters filters;

   /** Encoding handed to the detector as fallback; set in setConf. */
   private String defaultEncoding;

   /**
    * Parses the given feed with ROME and adds one {@link Parse} per accepted
    * feed entry, followed by one {@link Parse} for the feed itself.
    *
    * @param content
    *          A {@link Content} object representing the feed that is being
    *          parsed by this {@link Parser}.
    *
    * @return A {@link ParseResult} holding the parses of all accepted entries
    *         and of the feed document, or an empty parse result carrying the
    *         failure status if ROME could not read the feed.
    */
   @Override
   public ParseResult getParse(Content content) {
     SyndFeed feed = null;
     ParseResult parseResult = new ParseResult(content.getUrl());

     EncodingDetector detector = new EncodingDetector(conf);
     detector.autoDetectClues(content, true);
     String encoding = detector.guessEncoding(content, defaultEncoding);
     try {
       InputSource input = new InputSource(new ByteArrayInputStream(
           content.getContent()));
       input.setEncoding(encoding);
       SyndFeedInput feedInput = new SyndFeedInput();
       feed = feedInput.build(input);
     } catch (Exception e) {
       // return empty parse
       LOG.warn("Parse failed: url: {}, exception: {}", content.getUrl(),
           StringUtils.stringifyException(e));
       return new ParseStatus(e)
           .getEmptyParseResult(content.getUrl(), getConf());
     }

     // The feed link is only recorded as metadata on each entry, so a link
     // that cannot be normalized or is filtered out is simply dropped.
     String feedLink = feed.getLink();
     try {
       feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
       if (feedLink != null) {
         feedLink = filters.filter(feedLink);
       }
     } catch (Exception e) {
       LOG.debug("Dropping feed link '{}' of {}: {}", feed.getLink(),
           content.getUrl(), e.toString());
       feedLink = null;
     }

     List<?> entries = feed.getEntries();
     for (Object entry : entries) {
       addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content);
     }

     String feedDesc = stripTags(feed.getDescriptionEx());
     String feedTitle = stripTags(feed.getTitleEx());

     parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
         new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
         content.getMetadata()));

     return parseResult;
   }

   /**
    * Sets the {@link Configuration} object for this {@link Parser} and builds
    * the helpers that depend on it. The configuration is used to set up:
    *
    * <ul>
    * <li>the {@link ParserFactory} used to parse entry bodies,</li>
    * <li>the {@link URLNormalizers} for the outlink scope,</li>
    * <li>the {@link URLFilters},</li>
    * <li>the fallback encoding, read from
    * <code>parser.character.encoding.default</code> (<code>windows-1252</code>
    * if unset).</li>
    * </ul>
    *
    * @param conf
    *          The Hadoop {@link Configuration} object to use to configure this
    *          {@link Parser}.
    */
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
     this.parserFactory = new ParserFactory(conf);
     this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
     this.filters = new URLFilters(conf);
     this.defaultEncoding = conf.get("parser.character.encoding.default",
         "windows-1252");
   }

   /**
    * @return The {@link Configuration} object used to configure this
    *         {@link Parser}.
    */
   @Override
   public Configuration getConf() {
     return this.conf;
   }

   /**
    * Runs a command line version of this {@link Parser}: parses the given
    * file as <code>application/rss+xml</code> and prints key, data and text
    * of every resulting parse to standard output.
    *
    * @param args
    *          A single argument (expected at arg[0]) representing a path on the
    *          local filesystem that points to a feed file.
    *
    * @throws Exception
    *           If any error occurs.
    */
   public static void main(String[] args) throws Exception {
     if (args.length != 1) {
       System.err.println("Usage: FeedParser <feed>");
       System.exit(1);
     }
     String name = args[0];
     String url = "file:" + name;
     Configuration conf = NutchConfiguration.create();
     FeedParser parser = new FeedParser();
     parser.setConf(conf);
     File file = new File(name);
     byte[] bytes = new byte[(int) file.length()];
     // try-with-resources closes the stream even when readFully fails
     try (DataInputStream in = new DataInputStream(
         new FileInputStream(file))) {
       in.readFully(bytes);
     }
     ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
         "application/rss+xml", new Metadata(), conf));
     for (Entry<Text, Parse> entry : parseResult) {
       System.out.println("key: " + entry.getKey());
       Parse parse = entry.getValue();
       System.out.println("data: " + parse.getData());
       System.out.println("text: " + parse.getText() + "\n");
     }
   }

   /**
    * Parses a single feed entry and stores the outcome in
    * <code>parseResult</code> under the entry's normalized link. Entries
    * without a link, or whose link is rejected by the normalizers/filters, are
    * skipped.
    *
    * @param parseResult
    *          result container the entry parse is added to
    * @param feed
    *          the feed the entry belongs to
    * @param feedLink
    *          normalized link of the feed, or null if it was dropped
    * @param entry
    *          the feed entry to process
    * @param content
    *          the fetched feed; its metadata is reused for the entry
    */
   private void addToMap(ParseResult parseResult, SyndFeed feed,
       String feedLink, SyndEntry entry, Content content) {
     String link = entry.getLink();
     if (link == null) {
       // nothing to key the parse on
       return;
     }

     try {
       link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
       if (link != null) {
         link = filters.filter(link);
       }
     } catch (Exception e) {
       LOG.warn("Skipping feed entry, cannot normalize/filter link {}",
           entry.getLink(), e);
       return;
     }

     if (link == null) {
       return;
     }

     String title = stripTags(entry.getTitleEx());

     Metadata parseMeta = new Metadata();
     if (feedLink != null) {
       parseMeta.set("feed", feedLink);
     }

     // NOTE(review): contentMeta is whatever Content#getMetadata() returns,
     // not a copy made here. If that is the live metadata of the feed, the
     // Content-Type written by addFields() and removed again below also
     // affects the feed-level parse and all entries - confirm this sharing
     // is intended.
     Metadata contentMeta = content.getMetadata();
     addFields(parseMeta, contentMeta, feed, entry);

     // some item descriptions contain markup text in them,
     // so we temporarily set their content-type to parse them
     // with another plugin
     String contentType = contentMeta.get(Response.CONTENT_TYPE);

     String text = null;
     SyndContent description = entry.getDescription();
     if (description != null) {
       text = description.getValue();
     }

     if (text == null) {
       // no description: concatenate all content elements instead
       StringBuilder buf = new StringBuilder();
       List<?> contents = entry.getContents();
       if (contents != null) {
         for (Object syndContent : contents) {
           String value = ((SyndContent) syndContent).getValue();
           if (value != null) {
             // a null value would otherwise end up as the text "null"
             buf.append(value);
           }
         }
       }
       text = buf.toString();
     }

     Parse parse = null;
     try {
       Parser parser = parserFactory.getParsers(contentType, link)[0];
       // addFields() always declares charset=UTF-8 for the entry, so the
       // bytes must be UTF-8 too and not the platform default encoding
       byte[] entryBytes = text
           .getBytes(java.nio.charset.StandardCharsets.UTF_8);
       parse = parser.getParse(
           new Content(link, link, entryBytes, contentType, contentMeta,
               conf)).get(link);
     } catch (ParserNotFound e) {
       // deliberate best effort: fall through and store the raw text
       LOG.debug("No parser found for content type {} of feed entry {}",
           contentType, link);
     }

     if (parse != null) {
       ParseData data = parse.getData();
       data.getContentMeta().remove(Response.CONTENT_TYPE);
       mergeMetadata(data.getParseMeta(), parseMeta);
       parseResult.put(link, new ParseText(parse.getText()),
           new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(),
               data.getContentMeta(), data.getParseMeta()));
     } else {
       contentMeta.remove(Response.CONTENT_TYPE);
       parseResult.put(link, new ParseText(text), new ParseData(
           ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
           parseMeta));
     }

   }

   /**
    * Removes everything that looks like a markup tag from the value of the
    * given content and trims the result.
    *
    * @param c
    *          content to clean, may be null
    * @return the tag-free, trimmed value; the empty string if the content or
    *         its value is null
    */
   private static String stripTags(SyndContent c) {
     if (c == null) {
       return "";
     }

     String value = c.getValue();
     if (value == null) {
       return "";
     }

     return TAG_PATTERN.matcher(value).replaceAll("").trim();
   }

   /**
    * Copies author, tag and date information of an entry into
    * <code>parseMeta</code> and writes the entry's content type (always with
    * a UTF-8 charset parameter) into <code>contentMeta</code>.
    *
    * @param parseMeta
    *          metadata receiving the feed fields of the entry
    * @param contentMeta
    *          metadata whose Content-Type is overwritten for the entry
    * @param feed
    *          the feed the entry belongs to (currently unused)
    * @param entry
    *          the entry to read the fields from
    */
   private void addFields(Metadata parseMeta, Metadata contentMeta,
       SyndFeed feed, SyndEntry entry) {
     List<?> authors = entry.getAuthors();
     if (authors != null && !authors.isEmpty()) {
       for (Object o : authors) {
         String authorName = ((SyndPerson) o).getName();
         if (checkString(authorName)) {
           parseMeta.add(Feed.FEED_AUTHOR, authorName);
         }
       }
     } else {
       // getAuthors may return null or an empty list if feed is non-atom
       // if so, call getAuthor to get Dublin Core module creator.
       String authorName = entry.getAuthor();
       if (checkString(authorName)) {
         parseMeta.set(Feed.FEED_AUTHOR, authorName);
       }
     }

     List<?> categories = entry.getCategories();
     if (categories != null) {
       for (Object o : categories) {
         String tag = ((SyndCategory) o).getName();
         if (checkString(tag)) {
           // never store null/empty tag names in the metadata
           parseMeta.add(Feed.FEED_TAGS, tag);
         }
       }
     }

     Date published = entry.getPublishedDate();
     if (published != null) {
       parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime()));
     }
     Date updated = entry.getUpdatedDate();
     if (updated != null) {
       parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime()));
     }

     String contentType = null;
     SyndContent description = entry.getDescription();
     if (description != null) {
       contentType = description.getType();
     } else {
       // TODO: What to do if contents.size() > 1?
       List<?> contents = entry.getContents();
       if (contents != null && !contents.isEmpty()) {
         contentType = ((SyndContent) contents.get(0)).getType();
       }
     }

     if (checkString(contentType)) {
       // ROME may return content-type as html
       if (contentType.equals("html")) {
         contentType = "text/html";
       } else if (contentType.equals("xhtml")) {
         contentType = "text/xhtml";
       }
       contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + CHARSET_UTF8);
     } else {
       contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE);
     }

   }

   /**
    * Adds every value of every name in <code>second</code> to
    * <code>first</code>; values already present in <code>first</code> are
    * kept.
    */
   private void mergeMetadata(Metadata first, Metadata second) {
     for (String name : second.names()) {
       for (String value : second.getValues(name)) {
         first.add(name, value);
       }
     }
   }

   /** @return true if the string is neither null nor empty */
   private boolean checkString(String s) {
     return s != null && !s.isEmpty();
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.parse.feed;

	import java.lang.invoke.MethodHandles;
	import java.io.ByteArrayInputStream;
	import java.io.DataInputStream;
	import java.io.File;
	import java.io.FileInputStream;
	import java.util.Date;
	import java.util.List;
	import java.util.Map.Entry;

	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.util.StringUtils;
	import org.apache.nutch.metadata.Feed;
	import org.apache.nutch.metadata.Metadata;
	import org.apache.nutch.net.URLFilters;
	import org.apache.nutch.net.URLNormalizers;
	import org.apache.nutch.net.protocols.Response;
	import org.apache.nutch.parse.Outlink;
	import org.apache.nutch.parse.Parse;
	import org.apache.nutch.parse.ParseData;
	import org.apache.nutch.parse.ParseResult;
	import org.apache.nutch.parse.ParseStatus;
	import org.apache.nutch.parse.ParseText;
	import org.apache.nutch.parse.Parser;
	import org.apache.nutch.parse.ParserFactory;
	import org.apache.nutch.parse.ParserNotFound;
	import org.apache.nutch.protocol.Content;
	import org.apache.nutch.util.EncodingDetector;
	import org.apache.nutch.util.NutchConfiguration;
	import org.xml.sax.InputSource;

	import com.rometools.rome.feed.synd.SyndCategory;
	import com.rometools.rome.feed.synd.SyndContent;
	import com.rometools.rome.feed.synd.SyndEntry;
	import com.rometools.rome.feed.synd.SyndFeed;
	import com.rometools.rome.feed.synd.SyndPerson;
	import com.rometools.rome.io.SyndFeedInput;

	/**
	 * RSS/Atom feed {@link Parser} built on the ROME feed parsing library.
	 *
	 * <p>
	 * For every entry of a feed whose link survives URL normalization and
	 * filtering, one {@link Parse} keyed by the entry link is added to the
	 * {@link ParseResult}. The entry body is handed to the {@link Parser} that
	 * the {@link ParserFactory} selects for the entry's declared content type.
	 * One additional {@link Parse} describing the feed document itself is stored
	 * under the feed URL.
	 * </p>
	 *
	 * @author dogacan
	 * @author mattmann
	 * @since NUTCH-444
	 */
	public class FeedParser implements Parser {

		/** Charset parameter appended to the content type set for every entry. */
		public static final String CHARSET_UTF8 = "charset=UTF-8";

		/** Content type assigned to entries that do not declare one themselves. */
		public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; "
				+ CHARSET_UTF8;

		private static final Logger LOG = LoggerFactory
				.getLogger(MethodHandles.lookup().lookupClass());

		/** Matches anything that looks like a markup tag; compiled only once. */
		private static final java.util.regex.Pattern TAG_PATTERN =
				java.util.regex.Pattern.compile("<[^>]*>");

		private Configuration conf;

		private ParserFactory parserFactory;

		private URLNormalizers normalizers;

		private URLFilters filters;

		/** Encoding handed to the detector as fallback; set in setConf. */
		private String defaultEncoding;

		/**
		 * Parses the given feed with ROME and adds one {@link Parse} per accepted
		 * feed entry, followed by one {@link Parse} for the feed itself.
		 *
		 * @param content
		 *          A {@link Content} object representing the feed that is being
		 *          parsed by this {@link Parser}.
		 *
		 * @return A {@link ParseResult} holding the parses of all accepted entries
		 *         and of the feed document, or an empty parse result carrying the
		 *         failure status if ROME could not read the feed.
		 */
		@Override
		public ParseResult getParse(Content content) {
			SyndFeed feed = null;
			ParseResult parseResult = new ParseResult(content.getUrl());

			EncodingDetector detector = new EncodingDetector(conf);
			detector.autoDetectClues(content, true);
			String encoding = detector.guessEncoding(content, defaultEncoding);
			try {
				InputSource input = new InputSource(new ByteArrayInputStream(
						content.getContent()));
				input.setEncoding(encoding);
				SyndFeedInput feedInput = new SyndFeedInput();
				feed = feedInput.build(input);
			} catch (Exception e) {
				// return empty parse
				LOG.warn("Parse failed: url: {}, exception: {}", content.getUrl(),
						StringUtils.stringifyException(e));
				return new ParseStatus(e)
						.getEmptyParseResult(content.getUrl(), getConf());
			}

			// The feed link is only recorded as metadata on each entry, so a link
			// that cannot be normalized or is filtered out is simply dropped.
			String feedLink = feed.getLink();
			try {
				feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
				if (feedLink != null) {
					feedLink = filters.filter(feedLink);
				}
			} catch (Exception e) {
				LOG.debug("Dropping feed link '{}' of {}: {}", feed.getLink(),
						content.getUrl(), e.toString());
				feedLink = null;
			}

			List<?> entries = feed.getEntries();
			for (Object entry : entries) {
				addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content);
			}

			String feedDesc = stripTags(feed.getDescriptionEx());
			String feedTitle = stripTags(feed.getTitleEx());

			parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
					new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
					content.getMetadata()));

			return parseResult;
		}

		/**
		 * Sets the {@link Configuration} object for this {@link Parser} and builds
		 * the helpers that depend on it. The configuration is used to set up:
		 *
		 * <ul>
		 * <li>the {@link ParserFactory} used to parse entry bodies,</li>
		 * <li>the {@link URLNormalizers} for the outlink scope,</li>
		 * <li>the {@link URLFilters},</li>
		 * <li>the fallback encoding, read from
		 * <code>parser.character.encoding.default</code> (<code>windows-1252</code>
		 * if unset).</li>
		 * </ul>
		 *
		 * @param conf
		 *          The Hadoop {@link Configuration} object to use to configure this
		 *          {@link Parser}.
		 */
		@Override
		public void setConf(Configuration conf) {
			this.conf = conf;
			this.parserFactory = new ParserFactory(conf);
			this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
			this.filters = new URLFilters(conf);
			this.defaultEncoding = conf.get("parser.character.encoding.default",
					"windows-1252");
		}

		/**
		 * @return The {@link Configuration} object used to configure this
		 *         {@link Parser}.
		 */
		@Override
		public Configuration getConf() {
			return this.conf;
		}

		/**
		 * Runs a command line version of this {@link Parser}: parses the given
		 * file as <code>application/rss+xml</code> and prints key, data and text
		 * of every resulting parse to standard output.
		 *
		 * @param args
		 *          A single argument (expected at arg[0]) representing a path on the
		 *          local filesystem that points to a feed file.
		 *
		 * @throws Exception
		 *           If any error occurs.
		 */
		public static void main(String[] args) throws Exception {
			if (args.length != 1) {
				System.err.println("Usage: FeedParser <feed>");
				System.exit(1);
			}
			String name = args[0];
			String url = "file:" + name;
			Configuration conf = NutchConfiguration.create();
			FeedParser parser = new FeedParser();
			parser.setConf(conf);
			File file = new File(name);
			byte[] bytes = new byte[(int) file.length()];
			// try-with-resources closes the stream even when readFully fails
			try (DataInputStream in = new DataInputStream(
					new FileInputStream(file))) {
				in.readFully(bytes);
			}
			ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
					"application/rss+xml", new Metadata(), conf));
			for (Entry<Text, Parse> entry : parseResult) {
				System.out.println("key: " + entry.getKey());
				Parse parse = entry.getValue();
				System.out.println("data: " + parse.getData());
				System.out.println("text: " + parse.getText() + "\n");
			}
		}

		/**
		 * Parses a single feed entry and stores the outcome in
		 * <code>parseResult</code> under the entry's normalized link. Entries
		 * without a link, or whose link is rejected by the normalizers/filters, are
		 * skipped.
		 *
		 * @param parseResult
		 *          result container the entry parse is added to
		 * @param feed
		 *          the feed the entry belongs to
		 * @param feedLink
		 *          normalized link of the feed, or null if it was dropped
		 * @param entry
		 *          the feed entry to process
		 * @param content
		 *          the fetched feed; its metadata is reused for the entry
		 */
		private void addToMap(ParseResult parseResult, SyndFeed feed,
				String feedLink, SyndEntry entry, Content content) {
			String link = entry.getLink();
			if (link == null) {
				// nothing to key the parse on
				return;
			}

			try {
				link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
				if (link != null) {
					link = filters.filter(link);
				}
			} catch (Exception e) {
				LOG.warn("Skipping feed entry, cannot normalize/filter link {}",
						entry.getLink(), e);
				return;
			}

			if (link == null) {
				return;
			}

			String title = stripTags(entry.getTitleEx());

			Metadata parseMeta = new Metadata();
			if (feedLink != null) {
				parseMeta.set("feed", feedLink);
			}

			// NOTE(review): contentMeta is whatever Content#getMetadata() returns,
			// not a copy made here. If that is the live metadata of the feed, the
			// Content-Type written by addFields() and removed again below also
			// affects the feed-level parse and all entries - confirm this sharing
			// is intended.
			Metadata contentMeta = content.getMetadata();
			addFields(parseMeta, contentMeta, feed, entry);

			// some item descriptions contain markup text in them,
			// so we temporarily set their content-type to parse them
			// with another plugin
			String contentType = contentMeta.get(Response.CONTENT_TYPE);

			String text = null;
			SyndContent description = entry.getDescription();
			if (description != null) {
				text = description.getValue();
			}

			if (text == null) {
				// no description: concatenate all content elements instead
				StringBuilder buf = new StringBuilder();
				List<?> contents = entry.getContents();
				if (contents != null) {
					for (Object syndContent : contents) {
						String value = ((SyndContent) syndContent).getValue();
						if (value != null) {
							// a null value would otherwise end up as the text "null"
							buf.append(value);
						}
					}
				}
				text = buf.toString();
			}

			Parse parse = null;
			try {
				Parser parser = parserFactory.getParsers(contentType, link)[0];
				// addFields() always declares charset=UTF-8 for the entry, so the
				// bytes must be UTF-8 too and not the platform default encoding
				byte[] entryBytes = text
						.getBytes(java.nio.charset.StandardCharsets.UTF_8);
				parse = parser.getParse(
						new Content(link, link, entryBytes, contentType, contentMeta,
								conf)).get(link);
			} catch (ParserNotFound e) {
				// deliberate best effort: fall through and store the raw text
				LOG.debug("No parser found for content type {} of feed entry {}",
						contentType, link);
			}

			if (parse != null) {
				ParseData data = parse.getData();
				data.getContentMeta().remove(Response.CONTENT_TYPE);
				mergeMetadata(data.getParseMeta(), parseMeta);
				parseResult.put(link, new ParseText(parse.getText()),
						new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(),
								data.getContentMeta(), data.getParseMeta()));
			} else {
				contentMeta.remove(Response.CONTENT_TYPE);
				parseResult.put(link, new ParseText(text), new ParseData(
						ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
						parseMeta));
			}

		}

		/**
		 * Removes everything that looks like a markup tag from the value of the
		 * given content and trims the result.
		 *
		 * @param c
		 *          content to clean, may be null
		 * @return the tag-free, trimmed value; the empty string if the content or
		 *         its value is null
		 */
		private static String stripTags(SyndContent c) {
			if (c == null) {
				return "";
			}

			String value = c.getValue();
			if (value == null) {
				return "";
			}

			return TAG_PATTERN.matcher(value).replaceAll("").trim();
		}

		/**
		 * Copies author, tag and date information of an entry into
		 * <code>parseMeta</code> and writes the entry's content type (always with
		 * a UTF-8 charset parameter) into <code>contentMeta</code>.
		 *
		 * @param parseMeta
		 *          metadata receiving the feed fields of the entry
		 * @param contentMeta
		 *          metadata whose Content-Type is overwritten for the entry
		 * @param feed
		 *          the feed the entry belongs to (currently unused)
		 * @param entry
		 *          the entry to read the fields from
		 */
		private void addFields(Metadata parseMeta, Metadata contentMeta,
				SyndFeed feed, SyndEntry entry) {
			List<?> authors = entry.getAuthors();
			if (authors != null && !authors.isEmpty()) {
				for (Object o : authors) {
					String authorName = ((SyndPerson) o).getName();
					if (checkString(authorName)) {
						parseMeta.add(Feed.FEED_AUTHOR, authorName);
					}
				}
			} else {
				// getAuthors may return null or an empty list if feed is non-atom
				// if so, call getAuthor to get Dublin Core module creator.
				String authorName = entry.getAuthor();
				if (checkString(authorName)) {
					parseMeta.set(Feed.FEED_AUTHOR, authorName);
				}
			}

			List<?> categories = entry.getCategories();
			if (categories != null) {
				for (Object o : categories) {
					String tag = ((SyndCategory) o).getName();
					if (checkString(tag)) {
						// never store null/empty tag names in the metadata
						parseMeta.add(Feed.FEED_TAGS, tag);
					}
				}
			}

			Date published = entry.getPublishedDate();
			if (published != null) {
				parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime()));
			}
			Date updated = entry.getUpdatedDate();
			if (updated != null) {
				parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime()));
			}

			String contentType = null;
			SyndContent description = entry.getDescription();
			if (description != null) {
				contentType = description.getType();
			} else {
				// TODO: What to do if contents.size() > 1?
				List<?> contents = entry.getContents();
				if (contents != null && !contents.isEmpty()) {
					contentType = ((SyndContent) contents.get(0)).getType();
				}
			}

			if (checkString(contentType)) {
				// ROME may return content-type as html
				if (contentType.equals("html")) {
					contentType = "text/html";
				} else if (contentType.equals("xhtml")) {
					contentType = "text/xhtml";
				}
				contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + CHARSET_UTF8);
			} else {
				contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE);
			}

		}

		/**
		 * Adds every value of every name in <code>second</code> to
		 * <code>first</code>; values already present in <code>first</code> are
		 * kept.
		 */
		private void mergeMetadata(Metadata first, Metadata second) {
			for (String name : second.names()) {
				for (String value : second.getValues(name)) {
					first.add(name, value);
				}
			}
		}

		/** @return true if the string is neither null nor empty */
		private boolean checkString(String s) {
			return s != null && !s.isEmpty();
		}

	}