app/src/main/java/org/apache/roller/weblogger/util/LinkbackExtractor.java - roller - Git at Google

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  The ASF licenses this file to You
 * under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.  For additional information regarding
 * copyright in this work, please see the NOTICE file in the top level
 * directory of this distribution.
 */
 package org.apache.roller.weblogger.util;

 import com.rometools.rome.feed.synd.SyndEntry;
 import com.rometools.rome.feed.synd.SyndFeed;
 import com.rometools.rome.io.FeedException;
 import com.rometools.rome.io.SyndFeedInput;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.StringReader;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.Arrays;
 import java.util.List;

 import javax.swing.text.MutableAttributeSet;
 import javax.swing.text.html.HTML;
 import javax.swing.text.html.HTMLEditorKit;
 import javax.swing.text.html.HTML.Tag;
 import javax.swing.text.html.HTMLEditorKit.Parser;
 import javax.swing.text.html.HTMLEditorKit.ParserCallback;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;


 /**
  * Extracts the title, excerpt and permalink of a page that links back to a
  * given request URL.
  *
  * <p>The referring page is downloaded and walked with Swing's HTML parser,
  * looking for an anchor whose href (ignoring any #fragment) equals the
  * request URL, with or without a leading "www." on the host. The markup
  * between the divider tags surrounding that anchor becomes the excerpt. If
  * the page carries an RSS autodiscovery {@code <link>} tag, the feed is
  * fetched too; the first entry whose description contains the request URL
  * then supplies the title, excerpt and permalink instead.</p>
  *
  * <p>Extraction is best effort: failures are logged at debug level and the
  * getters return whatever had been collected when the failure occurred.</p>
  *
  * @author David M Johnson
  */
 public class LinkbackExtractor
 {
     private static final Log mLogger = LogFactory.getFactory().getInstance(
             LinkbackExtractor.class);

     private static final int MAX_EXCERPT_CHARS = 500;
     private static final int DESIRED_TITLE_LENGTH = 50;

     /** Tags that delimit the block of markup surrounding the linkback anchor. */
     private static final List<Tag> DIVIDER_TAGS = Arrays.asList(
             Tag.TD, Tag.DIV, Tag.SPAN, Tag.BLOCKQUOTE, Tag.P, Tag.LI,
             Tag.BR, Tag.HR, Tag.PRE, Tag.H1, Tag.H2, Tag.H3, Tag.H4,
             Tag.H5, Tag.H6);

     private boolean    mFound         = false;
     private String     mTitle         = "";
     private String     mRssLink       = null;
     private String     mExcerpt       = null;
     private String     mPermalink     = null;
     private int        mStart         = 0;
     private int        mEnd           = 0;
     private String     mRequestURL    = null;
     private String     mRequestURLWWW = null;
     private String     mRefererURL;

     //------------------------------------------------------------------------
     /**
      * Extract referring page title, excerpt, and permalink.
      *
      * @param refererURL URL of the page that is expected to link to us
      * @param requestURL URL on this site that the referring page should link to
      * @throws IOException declared for compatibility with existing callers;
      *             failures during extraction are caught and logged instead
      */
     public LinkbackExtractor(String refererURL, String requestURL) throws IOException {
         try {
             extractByParsingHtml(refererURL, requestURL);
             if (mRssLink != null) {
                 extractByParsingRss(mRssLink, requestURL);
             }
         } catch (Exception e) {
             // Deliberately best effort: a broken or unreachable referrer must
             // not fail the caller, so the problem is only logged.
             if (mLogger.isDebugEnabled()) {
                 mLogger.debug("Extracting linkback", e);
             }
         }
     }

     //------------------------------------------------------------------------
     /**
      * Downloads the referring page and scans it for the title, the RSS
      * autodiscovery link and the excerpt surrounding the link to requestURL.
      *
      * @param refererURL URL of the page to download
      * @param requestURL URL the page is expected to link to
      * @throws IOException if the page cannot be fetched or parsed
      */
     private void extractByParsingHtml(String refererURL, String requestURL) throws IOException {
         mRefererURL = refererURL;
         initRequestUrlVariants(requestURL);

         String html = readPage(new URL(refererURL));

         // Trick gets Swing's HTML parser by making its protected getParser() method public
         // Ignore inaccurate Sonar complaint about useless overriding method:
         //    http://jira.codehaus.org/browse/SONARJAVA-287
         Parser parser = (new HTMLEditorKit() {
             @Override
             public Parser getParser() {
                 return super.getParser();
             }
         }).getParser();

         // Parse HTML string to find title and start and end position
         // of the referring excerpt. Positions index into the parsed string.
         parser.parse(new StringReader(html), new LinkbackCallback(), true);

         if (mStart != 0 && mEnd != 0 && mEnd > mStart) {
             mExcerpt = truncateExcerpt(
                     Utilities.removeHTML(html.substring(mStart, mEnd)));
         }

         if (mTitle.startsWith(">") && mTitle.length() > 1) {
             mTitle = mTitle.substring(1);
         }
     }

     /**
      * Derives the two spellings of the request URL that anchors are compared
      * against: one without and one with a "www." host prefix. Works for any
      * scheme; a value without "://" is used unchanged for both spellings.
      */
     private void initRequestUrlVariants(String requestURL) {
         int schemeEnd = requestURL.indexOf("://");
         if (schemeEnd == -1) {
             mRequestURL = requestURL;
             mRequestURLWWW = requestURL;
             return;
         }
         String scheme = requestURL.substring(0, schemeEnd + 3);
         String rest = requestURL.substring(schemeEnd + 3);
         if (rest.startsWith("www.")) {
             mRequestURLWWW = requestURL;
             mRequestURL = scheme + rest.substring(4);
         } else {
             mRequestURL = requestURL;
             mRequestURLWWW = scheme + "www." + rest;
         }
     }

     /**
      * Reads the whole resource behind url into a string, always closing the
      * underlying stream. Lines are joined with a single '\n' so that words on
      * adjacent lines do not run together.
      */
     private static String readPage(URL url) throws IOException {
         StringBuilder sb = new StringBuilder();
         InputStream is = url.openStream();
         try {
             BufferedReader br = new BufferedReader(new InputStreamReader(is));
             String line;
             while ((line = br.readLine()) != null) {
                 sb.append(line).append('\n');
             }
         } finally {
             is.close();
         }
         return sb.toString();
     }

     /** Cuts excerpt down to MAX_EXCERPT_CHARS characters, appending "..." when shortened. */
     private static String truncateExcerpt(String excerpt) {
         if (excerpt.length() > MAX_EXCERPT_CHARS) {
             return excerpt.substring(0, MAX_EXCERPT_CHARS) + "...";
         }
         return excerpt;
     }

     //------------------------------------------------------------------------
     /**
      * Fetches the feed and uses the first entry whose description contains
      * requestURL to set the title, permalink and excerpt.
      *
      * @param rssLink    absolute URL of the feed
      * @param requestURL URL to look for in the entry descriptions
      * @throws FeedException if the feed cannot be parsed
      * @throws IOException   if the feed cannot be fetched
      */
     private void extractByParsingRss(String rssLink, String requestURL)
             throws FeedException, IOException {
         SyndFeed feed;
         InputStream feedStream = new URL(rssLink).openStream();
         try {
             feed = new SyndFeedInput().build(new InputStreamReader(feedStream));
         } finally {
             feedStream.close();
         }
         String feedTitle = feed.getTitle();

         int count = 0;

         if (mLogger.isDebugEnabled()) {
             mLogger.debug("Feed parsed, title: " + feedTitle);
         }

         for (Object objItem : feed.getEntries()) {
             count++;
             SyndEntry item = (SyndEntry) objItem;

             // Entries without a description cannot match; skip them rather
             // than letting a NullPointerException end the scan.
             String description = (item.getDescription() == null)
                     ? null : item.getDescription().getValue();
             if (description == null || !description.contains(requestURL)) {
                 continue;
             }

             mFound = true;
             mPermalink = item.getLink();
             if (feedTitle != null && feedTitle.trim().length() > 0) {
                 mTitle = feedTitle + ": " + item.getTitle();
             } else {
                 mTitle = item.getTitle();
             }
             mExcerpt = truncateExcerpt(Utilities.removeHTML(description));
             break;
         }

         if (mLogger.isDebugEnabled()) {
             mLogger.debug("Parsed " + count + " articles, found linkback=" + mFound);
         }
     }

     /**
      * Turns the href of an RSS autodiscovery link into an absolute URL by
      * resolving it against the referring page's URL.
      *
      * @return the absolute URL, or null if it cannot be formed
      */
     private String qualifyRssLink(String href) {
         try {
             return new URL(new URL(mRefererURL), href).toExternalForm();
         } catch (MalformedURLException e) {
             if (mLogger.isDebugEnabled()) {
                 mLogger.debug("Determining RSS URL", e);
             }
             return null;
         }
     }

     //------------------------------------------------------------------------
     /**
      * Returns the excerpt.
      *
      * @return the excerpt, or null if none was found
      */
     public String getExcerpt() {
         return mExcerpt;
     }

     //------------------------------------------------------------------------
     /**
      * Returns the title.
      *
      * @return the title; empty if the page had none
      */
     public String getTitle() {
         return mTitle;
     }

     //------------------------------------------------------------------------
     /**
      * Returns the permalink.
      *
      * @return the permalink, or null if none was found or set
      */
     public String getPermalink() {
         return mPermalink;
     }

     //------------------------------------------------------------------------
     /**
      * Sets the permalink.
      *
      * @param permalink
      *            The permalink to set
      */
     public void setPermalink(String permalink)
     {
         mPermalink = permalink;
     }

     /////////////////////////////////////////////////////////////////////////

     /**
      * Parser callback that finds title and excerpt. As we walk through the HTML
      * tags, we keep track of the most recently encountered divider tag in the
      * mStart field. Once we find the referring permalink, we set the mFound
      * flag. After that, we look for the next divider tag and save its position
      * in the mEnd field.
      */
     private final class LinkbackCallback extends ParserCallback
     {
         /** Most recently opened tag; used to recognise text inside TITLE. */
         private Tag mCurrentTag = null;

         /**
          * Look for divider tags and for the permalink.
          *
          * @param tag
          *            HTML tag
          * @param atts
          *            Attributes of that tag
          * @param pos
          *            Tag's position in file
          */
         @Override
         public void handleStartTag(Tag tag, MutableAttributeSet atts, int pos)
         {
             boolean divider = DIVIDER_TAGS.contains(tag);
             if (divider && !mFound)
             {
                 mStart = pos;
             }
             else if (divider && mEnd == 0)
             {
                 mEnd = pos;
             }
             else if (tag.equals(Tag.A))
             {
                 String href = (String) atts.getAttribute(HTML.Attribute.HREF);
                 if (href == null) {
                     return;
                 }
                 // Compare without the #fragment part
                 int hashPos = href.lastIndexOf('#');
                 if (hashPos != -1)
                 {
                     href = href.substring(0, hashPos);
                 }
                 if (href.equals(mRequestURL) || href.equals(mRequestURLWWW))
                 {
                     mFound = true;
                 }
             }
             mCurrentTag = tag;
         }

         /**
          * Handles tags without a body: a divider after the permalink ends
          * the excerpt, and a LINK tag may announce the page's RSS feed.
          */
         @Override
         public void handleSimpleTag(Tag tag, MutableAttributeSet atts, int pos)
         {
             if (DIVIDER_TAGS.contains(tag) && mFound && mEnd == 0)
             {
                 mEnd = pos;
             }
             else if (tag.equals(Tag.LINK))
             {
                 // Look out for RSS autodiscovery link
                 String title = (String) atts.getAttribute(HTML.Attribute.TITLE);
                 String type = (String) atts.getAttribute(HTML.Attribute.TYPE);
                 if (!"application/rss+xml".equals(type) || !"RSS".equals(title))
                 {
                     return;
                 }

                 String href = (String) atts.getAttribute(HTML.Attribute.HREF);
                 if (mLogger.isDebugEnabled())
                 {
                     mLogger.debug("Found RSS link " + href);
                 }
                 if (href == null)
                 {
                     // Autodiscovery tag without a target: nothing to fetch
                     return;
                 }

                 mRssLink = qualifyRssLink(href);
                 if (mLogger.isDebugEnabled())
                 {
                     mLogger.debug("Qualified RSS link is " + mRssLink);
                 }
             }
         }

         /**
          * Stop at the very first divider tag after the permalink.
          *
          * @param tag
          *            End tag
          * @param pos
          *            Position in HTML file
          */
         @Override
         public void handleEndTag(Tag tag, int pos)
         {
             boolean divider = DIVIDER_TAGS.contains(tag);
             if (divider && mFound && mEnd == 0)
             {
                 mEnd = pos;
             }
             else if (divider && !mFound)
             {
                 mStart = pos;
             }
             else
             {
                 mCurrentTag = null;
             }
         }

         /**
          * Get the page title. Text is appended for as long as the title
          * collected so far is shorter than DESIRED_TITLE_LENGTH.
          */
         @Override
         public void handleText(char[] data, int pos)
         {
             if (mCurrentTag != null && mCurrentTag.equals(Tag.TITLE)
                     && mTitle.length() < DESIRED_TITLE_LENGTH)
             {
                 mTitle += new String(data);
             }
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. The ASF licenses this file to You
	* under the Apache License, Version 2.0 (the "License"); you may not
	* use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License. For additional information regarding
	* copyright in this work, please see the NOTICE file in the top level
	* directory of this distribution.
	*/
	package org.apache.roller.weblogger.util;

	import com.rometools.rome.feed.synd.SyndEntry;
	import com.rometools.rome.feed.synd.SyndFeed;
	import com.rometools.rome.io.FeedException;
	import com.rometools.rome.io.SyndFeedInput;
	import java.io.BufferedReader;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.InputStreamReader;
	import java.io.StringReader;
	import java.net.MalformedURLException;
	import java.net.URL;
	import java.util.Arrays;
	import java.util.List;

	import javax.swing.text.MutableAttributeSet;
	import javax.swing.text.html.HTML;
	import javax.swing.text.html.HTMLEditorKit;
	import javax.swing.text.html.HTML.Tag;
	import javax.swing.text.html.HTMLEditorKit.Parser;
	import javax.swing.text.html.HTMLEditorKit.ParserCallback;

	import org.apache.commons.logging.Log;
	import org.apache.commons.logging.LogFactory;


	/**
	* Parses HTML file for referring linkback title and excerpt.
	*
	* @author David M Johnson
	*/
	public class LinkbackExtractor
	{
	private static Log mLogger = LogFactory.getFactory().getInstance(
	LinkbackExtractor.class);
	private boolean mFound = false;
	private String mTitle = "";
	private String mRssLink = null;
	private String mExcerpt = null;
	private String mPermalink = null;
	private int mStart = 0;
	private int mEnd = 0;
	private String mRequestURL = null;
	private String mRequestURLWWW = null;
	private String mRefererURL;

	private static final int MAX_EXCERPT_CHARS = 500;
	private static final int DESIRED_TITLE_LENGTH = 50;

	//------------------------------------------------------------------------
	/**
	* Extract referring page title, excerpt, and permalink.
	*
	* @param refererURL
	* @param requestURL
	*/
	public LinkbackExtractor(String refererURL, String requestURL) throws IOException {
	try {
	extractByParsingHtml(refererURL, requestURL);
	if (mRssLink != null) {
	extractByParsingRss(mRssLink, requestURL);
	}
	} catch (Exception e) {
	if (mLogger.isDebugEnabled()) {
	mLogger.debug("Extracting linkback", e);
	}
	}
	}

	//------------------------------------------------------------------------
	private void extractByParsingHtml(String refererURL, String requestURL) throws IOException {
	URL url = new URL(refererURL);
	InputStream is = url.openStream();

	mRefererURL = refererURL;

	if (requestURL.startsWith("http://www.")) {
	mRequestURLWWW = requestURL;
	mRequestURL = "http://" + mRequestURLWWW.substring(11);
	} else {
	mRequestURL = requestURL;
	mRequestURLWWW = "http://www." + mRequestURL.substring(7);
	}

	// Trick gets Swing's HTML parser by making its protected getParser() method public
	// Ignore inaccurate Sonar complaint about useless overriding method:
	// http://jira.codehaus.org/browse/SONARJAVA-287
	Parser parser = (new HTMLEditorKit() {
	public Parser getParser() {
	return super.getParser();
	}
	}).getParser();

	// Read HTML file into string
	StringBuilder sb = new StringBuilder();
	InputStreamReader isr = new InputStreamReader(is);
	BufferedReader br = new BufferedReader(isr);
	try {
	String line;
	while ((line = br.readLine()) != null) {
	sb.append(line);
	}
	} finally {
	br.close();
	}

	// Parse HTML string to find title and start and end position
	// of the referring excerpt.
	StringReader sr = new StringReader(sb.toString());
	parser.parse(sr, new LinkbackCallback(), true);

	if (mStart != 0 && mEnd != 0 && mEnd > mStart) {
	mExcerpt = sb.toString().substring(mStart, mEnd);
	mExcerpt = Utilities.removeHTML(mExcerpt);

	if (mExcerpt.length() > MAX_EXCERPT_CHARS) {
	mExcerpt = mExcerpt.substring(0, MAX_EXCERPT_CHARS) + "...";
	}
	}

	if (mTitle.startsWith(">") && mTitle.length() > 1) {
	mTitle = mTitle.substring(1);
	}
	}

	//------------------------------------------------------------------------
	private void extractByParsingRss(String rssLink, String requestURL)
	throws FeedException, IOException {
	SyndFeedInput feedInput = new SyndFeedInput();
	SyndFeed feed = feedInput.build(
	new InputStreamReader(new URL(rssLink).openStream()));
	String feedTitle = feed.getTitle();

	int count = 0;

	if (mLogger.isDebugEnabled()) {
	mLogger.debug("Feed parsed, title: " + feedTitle);
	}

	for (Object objItem : feed.getEntries()) {
	count++;
	SyndEntry item = (SyndEntry) objItem;
	if (item.getDescription().getValue().contains(requestURL)) {
	mFound = true;
	mPermalink = item.getLink();
	if (feedTitle != null && feedTitle.trim().length() > 0) {
	mTitle = feedTitle + ": " + item.getTitle();
	} else {
	mTitle = item.getTitle();
	}
	mExcerpt = item.getDescription().getValue();
	mExcerpt = Utilities.removeHTML(mExcerpt);
	if (mExcerpt.length() > MAX_EXCERPT_CHARS) {
	mExcerpt = mExcerpt.substring(0, MAX_EXCERPT_CHARS) + "...";
	}
	break;
	}
	}

	if (mLogger.isDebugEnabled()) {
	mLogger.debug("Parsed " + count + " articles, found linkback=" + mFound);
	}
	}

	//------------------------------------------------------------------------
	/**
	* Returns the excerpt.
	*
	* @return String
	*/
	public String getExcerpt() {
	return mExcerpt;
	}

	//------------------------------------------------------------------------
	/**
	* Returns the title.
	*
	* @return String
	*/
	public String getTitle() {
	return mTitle;
	}

	//------------------------------------------------------------------------
	/**
	* Returns the permalink.
	*
	* @return String
	*/
	public String getPermalink() {
	return mPermalink;
	}

	//------------------------------------------------------------------------
	/**
	* Sets the permalink.
	*
	* @param permalink
	* The permalink to set
	*/
	public void setPermalink(String permalink)
	{
	mPermalink = permalink;
	}

	/////////////////////////////////////////////////////////////////////////

	/**
	* Parser callback that finds title and excerpt. As we walk through the HTML
	* tags, we keep track of the most recently encountered divider tag in the
	* mStart field. Once we find the referring permalink, we set the mFound
	* flag. After that, we look for the next divider tag and save it's position
	* in the mEnd field.
	*/
	private final class LinkbackCallback extends ParserCallback
	{
	// Dividers
	private Tag[] mDivTags = { Tag.TD, Tag.DIV, Tag.SPAN,
	Tag.BLOCKQUOTE, Tag.P, Tag.LI,
	Tag.BR, Tag.HR, Tag.PRE, Tag.H1,
	Tag.H2, Tag.H3, Tag.H4, Tag.H5,
	Tag.H6 };

	private List mList = Arrays.asList(mDivTags);

	private Tag mCurrentTag = null;

	/**
	* Look for divider tags and for the permalink.
	*
	* @param tag
	* HTML tag
	* @param atts
	* Attributes of that tag
	* @param pos
	* Tag's position in file
	*/
	public void handleStartTag(Tag tag, MutableAttributeSet atts, int pos)
	{
	if (mList.contains(tag) && !mFound)
	{
	mStart = pos;
	}
	else if (mList.contains(tag) && mFound && mEnd == 0)
	{
	mEnd = pos;
	}
	else if (tag.equals(Tag.A))
	{
	String href = (String) atts.getAttribute(HTML.Attribute.HREF);
	if (href == null) {
	return;
	}
	int hashPos = href.lastIndexOf('#');
	if (hashPos != -1)
	{
	href = href.substring(0, hashPos);
	}
	if (href != null
	&& (href.equals(mRequestURL) \|\| href
	.equals(mRequestURLWWW)))
	{
	mFound = true;
	}
	}
	mCurrentTag = tag;
	}

	/**
	* Needed to handle SPAN tag.
	*/
	public void handleSimpleTag(Tag tag, MutableAttributeSet atts, int pos)
	{
	if (mList.contains(tag) && mFound && mEnd == 0)
	{
	mEnd = pos;
	}
	else if (tag.equals(Tag.LINK))
	{
	// Look out for RSS autodiscovery link
	String title = (String) atts.getAttribute(HTML.Attribute.TITLE);
	String type = (String) atts.getAttribute(HTML.Attribute.TYPE);
	if (title != null && type != null
	&& type.equals("application/rss+xml")
	&& title.equals("RSS"))
	{
	mRssLink = (String) atts.getAttribute(HTML.Attribute.HREF);

	if (mLogger.isDebugEnabled())
	{
	mLogger.debug("Found RSS link " + mRssLink);
	}

	if (mRssLink.startsWith("/") && mRssLink.length() > 1)
	{
	try
	{
	URL url = new URL(mRefererURL);
	mRssLink = url.getProtocol() + "://"
	+ url.getHost() + ":" + url.getPort()
	+ mRssLink;
	}
	catch (MalformedURLException e)
	{
	mRssLink = null;
	if (mLogger.isDebugEnabled())
	{
	mLogger.debug("Determining RSS URL", e);
	}
	}
	}
	else if (!mRssLink.startsWith("http"))
	{
	int slash = mRefererURL.lastIndexOf('/');
	if (slash != -1)
	{
	mRssLink = mRefererURL.substring(0, slash) + "/"
	+ mRssLink;
	}
	}
	if (mLogger.isDebugEnabled())
	{
	mLogger.debug("Qualified RSS link is " + mRssLink);
	}
	}
	}
	}

	/**
	* Stop at the very first divider tag after the permalink.
	*
	* @param tag
	* End tag
	* @param pos
	* Position in HTML file
	*/
	public void handleEndTag(Tag tag, int pos)
	{
	if (mList.contains(tag) && mFound && mEnd == 0)
	{
	mEnd = pos;
	}
	else if (mList.contains(tag) && !mFound)
	{
	mStart = pos;
	}
	else
	{
	mCurrentTag = null;
	}
	}

	/**
	* Get the page title
	*/
	public void handleText(char[] data, int pos)
	{
	if (mCurrentTag != null && mCurrentTag.equals(Tag.TITLE))
	{
	String newText = new String(data);
	if (mTitle.length() < DESIRED_TITLE_LENGTH)
	{
	mTitle += newText;
	}
	}
	}
	}
	}