blob: 1b4c16c9ab31f7be503254cbc67004c5ac2d20dd [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. The ASF licenses this file to You
* under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. For additional information regarding
* copyright in this work, please see the NOTICE file in the top level
* directory of this distribution.
*/
package org.apache.roller.weblogger.util;
import com.rometools.rome.feed.synd.SyndEntry;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.io.FeedException;
import com.rometools.rome.io.SyndFeedInput;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.Parser;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* Parses HTML file for referring linkback title and excerpt.
*
* @author David M Johnson
*/
public class LinkbackExtractor
{
private static Log mLogger = LogFactory.getFactory().getInstance(
LinkbackExtractor.class);
private boolean mFound = false;
private String mTitle = "";
private String mRssLink = null;
private String mExcerpt = null;
private String mPermalink = null;
private int mStart = 0;
private int mEnd = 0;
private String mRequestURL = null;
private String mRequestURLWWW = null;
private String mRefererURL;
private static final int MAX_EXCERPT_CHARS = 500;
private static final int DESIRED_TITLE_LENGTH = 50;
//------------------------------------------------------------------------
/**
* Extract referring page title, excerpt, and permalink.
*
* @param refererURL
* @param requestURL
*/
public LinkbackExtractor(String refererURL, String requestURL) throws IOException {
try {
extractByParsingHtml(refererURL, requestURL);
if (mRssLink != null) {
extractByParsingRss(mRssLink, requestURL);
}
} catch (Exception e) {
if (mLogger.isDebugEnabled()) {
mLogger.debug("Extracting linkback", e);
}
}
}
//------------------------------------------------------------------------
private void extractByParsingHtml(String refererURL, String requestURL) throws IOException {
URL url = new URL(refererURL);
InputStream is = url.openStream();
mRefererURL = refererURL;
if (requestURL.startsWith("http://www.")) {
mRequestURLWWW = requestURL;
mRequestURL = "http://" + mRequestURLWWW.substring(11);
} else {
mRequestURL = requestURL;
mRequestURLWWW = "http://www." + mRequestURL.substring(7);
}
// Trick gets Swing's HTML parser by making its protected getParser() method public
// Ignore inaccurate Sonar complaint about useless overriding method:
// http://jira.codehaus.org/browse/SONARJAVA-287
Parser parser = (new HTMLEditorKit() {
public Parser getParser() {
return super.getParser();
}
}).getParser();
// Read HTML file into string
StringBuilder sb = new StringBuilder();
InputStreamReader isr = new InputStreamReader(is);
BufferedReader br = new BufferedReader(isr);
try {
String line;
while ((line = br.readLine()) != null) {
sb.append(line);
}
} finally {
br.close();
}
// Parse HTML string to find title and start and end position
// of the referring excerpt.
StringReader sr = new StringReader(sb.toString());
parser.parse(sr, new LinkbackCallback(), true);
if (mStart != 0 && mEnd != 0 && mEnd > mStart) {
mExcerpt = sb.toString().substring(mStart, mEnd);
mExcerpt = Utilities.removeHTML(mExcerpt);
if (mExcerpt.length() > MAX_EXCERPT_CHARS) {
mExcerpt = mExcerpt.substring(0, MAX_EXCERPT_CHARS) + "...";
}
}
if (mTitle.startsWith(">") && mTitle.length() > 1) {
mTitle = mTitle.substring(1);
}
}
//------------------------------------------------------------------------
private void extractByParsingRss(String rssLink, String requestURL)
throws FeedException, IOException {
SyndFeedInput feedInput = new SyndFeedInput();
SyndFeed feed = feedInput.build(
new InputStreamReader(new URL(rssLink).openStream()));
String feedTitle = feed.getTitle();
int count = 0;
if (mLogger.isDebugEnabled()) {
mLogger.debug("Feed parsed, title: " + feedTitle);
}
for (Object objItem : feed.getEntries()) {
count++;
SyndEntry item = (SyndEntry) objItem;
if (item.getDescription().getValue().contains(requestURL)) {
mFound = true;
mPermalink = item.getLink();
if (feedTitle != null && feedTitle.trim().length() > 0) {
mTitle = feedTitle + ": " + item.getTitle();
} else {
mTitle = item.getTitle();
}
mExcerpt = item.getDescription().getValue();
mExcerpt = Utilities.removeHTML(mExcerpt);
if (mExcerpt.length() > MAX_EXCERPT_CHARS) {
mExcerpt = mExcerpt.substring(0, MAX_EXCERPT_CHARS) + "...";
}
break;
}
}
if (mLogger.isDebugEnabled()) {
mLogger.debug("Parsed " + count + " articles, found linkback=" + mFound);
}
}
//------------------------------------------------------------------------
/**
* Returns the excerpt.
*
* @return String
*/
public String getExcerpt() {
return mExcerpt;
}
//------------------------------------------------------------------------
/**
* Returns the title.
*
* @return String
*/
public String getTitle() {
return mTitle;
}
//------------------------------------------------------------------------
/**
* Returns the permalink.
*
* @return String
*/
public String getPermalink() {
return mPermalink;
}
//------------------------------------------------------------------------
/**
* Sets the permalink.
*
* @param permalink
* The permalink to set
*/
public void setPermalink(String permalink)
{
mPermalink = permalink;
}
/////////////////////////////////////////////////////////////////////////
/**
* Parser callback that finds title and excerpt. As we walk through the HTML
* tags, we keep track of the most recently encountered divider tag in the
* mStart field. Once we find the referring permalink, we set the mFound
* flag. After that, we look for the next divider tag and save it's position
* in the mEnd field.
*/
private final class LinkbackCallback extends ParserCallback
{
// Dividers
private Tag[] mDivTags = { Tag.TD, Tag.DIV, Tag.SPAN,
Tag.BLOCKQUOTE, Tag.P, Tag.LI,
Tag.BR, Tag.HR, Tag.PRE, Tag.H1,
Tag.H2, Tag.H3, Tag.H4, Tag.H5,
Tag.H6 };
private List mList = Arrays.asList(mDivTags);
private Tag mCurrentTag = null;
/**
* Look for divider tags and for the permalink.
*
* @param tag
* HTML tag
* @param atts
* Attributes of that tag
* @param pos
* Tag's position in file
*/
public void handleStartTag(Tag tag, MutableAttributeSet atts, int pos)
{
if (mList.contains(tag) && !mFound)
{
mStart = pos;
}
else if (mList.contains(tag) && mFound && mEnd == 0)
{
mEnd = pos;
}
else if (tag.equals(Tag.A))
{
String href = (String) atts.getAttribute(HTML.Attribute.HREF);
if (href == null) {
return;
}
int hashPos = href.lastIndexOf('#');
if (hashPos != -1)
{
href = href.substring(0, hashPos);
}
if (href != null
&& (href.equals(mRequestURL) || href
.equals(mRequestURLWWW)))
{
mFound = true;
}
}
mCurrentTag = tag;
}
/**
* Needed to handle SPAN tag.
*/
public void handleSimpleTag(Tag tag, MutableAttributeSet atts, int pos)
{
if (mList.contains(tag) && mFound && mEnd == 0)
{
mEnd = pos;
}
else if (tag.equals(Tag.LINK))
{
// Look out for RSS autodiscovery link
String title = (String) atts.getAttribute(HTML.Attribute.TITLE);
String type = (String) atts.getAttribute(HTML.Attribute.TYPE);
if (title != null && type != null
&& type.equals("application/rss+xml")
&& title.equals("RSS"))
{
mRssLink = (String) atts.getAttribute(HTML.Attribute.HREF);
if (mLogger.isDebugEnabled())
{
mLogger.debug("Found RSS link " + mRssLink);
}
if (mRssLink.startsWith("/") && mRssLink.length() > 1)
{
try
{
URL url = new URL(mRefererURL);
mRssLink = url.getProtocol() + "://"
+ url.getHost() + ":" + url.getPort()
+ mRssLink;
}
catch (MalformedURLException e)
{
mRssLink = null;
if (mLogger.isDebugEnabled())
{
mLogger.debug("Determining RSS URL", e);
}
}
}
else if (!mRssLink.startsWith("http"))
{
int slash = mRefererURL.lastIndexOf('/');
if (slash != -1)
{
mRssLink = mRefererURL.substring(0, slash) + "/"
+ mRssLink;
}
}
if (mLogger.isDebugEnabled())
{
mLogger.debug("Qualified RSS link is " + mRssLink);
}
}
}
}
/**
* Stop at the very first divider tag after the permalink.
*
* @param tag
* End tag
* @param pos
* Position in HTML file
*/
public void handleEndTag(Tag tag, int pos)
{
if (mList.contains(tag) && mFound && mEnd == 0)
{
mEnd = pos;
}
else if (mList.contains(tag) && !mFound)
{
mStart = pos;
}
else
{
mCurrentTag = null;
}
}
/**
* Get the page title
*/
public void handleText(char[] data, int pos)
{
if (mCurrentTag != null && mCurrentTag.equals(Tag.TITLE))
{
String newText = new String(data);
if (mTitle.length() < DESIRED_TITLE_LENGTH)
{
mTitle += newText;
}
}
}
}
}