// blob: 567dae60fb84b48c11f0bbf45102a461fab367b9 [file] [log] [blame]
// LUCENENET TODO: Use HTML Agility pack instead of SAX ?
using J2N.Collections.Generic.Extensions;
using Sax;
using Sax.Helpers;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Benchmarks.ByTask.Feeds
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Simple HTML Parser extracting title, meta tags, and body text.
/// The original description names <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>
/// as its basis; this implementation parses with <c>TagSoup.Parser</c>.
/// </summary>
public class DemoHTMLParser : IHTMLParser
{
/// <summary>The actual parser to read HTML documents.</summary>
public sealed class Parser
{
/// <summary>
/// Meta tag values found in the document head, keyed by the lower-cased
/// <c>name</c> (or <c>http-equiv</c>) attribute of the tag.
/// </summary>
private readonly IDictionary<string, string> metaTags = new Dictionary<string, string>();

/// <summary>Title text collected during parsing, with surrounding whitespace trimmed.</summary>
private readonly string title;

/// <summary>Body text collected during parsing.</summary>
private readonly string body;

// LUCENENET specific - expose field through property
/// <summary>
/// Gets the meta tags collected while parsing. The returned dictionary is the
/// parser's own instance, not a copy.
/// </summary>
public IDictionary<string, string> MetaTags
{
    get
    {
        return this.metaTags;
    }
}

// LUCENENET specific - expose field through property
/// <summary>Gets the trimmed title text of the parsed document.</summary>
public string Title
{
    get
    {
        return this.title;
    }
}

// LUCENENET specific - expose field through property
/// <summary>Gets the body text of the parsed document.</summary>
public string Body
{
    get
    {
        return this.body;
    }
}
/// <summary>
/// Parses the HTML document supplied by <paramref name="reader"/>.
/// </summary>
/// <param name="reader">
/// Reader that is wrapped in an <see cref="InputSource"/> and passed on to
/// the <see cref="InputSource"/>-based constructor.
/// </param>
public Parser(TextReader reader)
    : this(new InputSource(reader))
{
}

/// <summary>
/// Runs a TagSoup parse over <paramref name="source"/> and keeps the title
/// and body text that the SAX handler collected along the way.
/// </summary>
/// <param name="source">The input handed to <c>TagSoup.Parser.Parse</c>.</param>
public Parser(InputSource source)
{
    // Buffers the SAX handler appends to while the document is being parsed.
    StringBuilder titleText = new StringBuilder();
    StringBuilder bodyText = new StringBuilder();
    DefaultHandler saxHandler = new DefaultHandlerAnonymousHelper(this, titleText, bodyText);

    TagSoup.Parser tagSoup = new TagSoup.Parser();
    tagSoup.SetFeature(TagSoup.Parser.NAMESPACES_FEATURE, true);
    // The same handler object receives both content and error callbacks.
    tagSoup.ContentHandler = saxHandler;
    tagSoup.ErrorHandler = saxHandler;
    tagSoup.Parse(source);

    // the javacc-based parser trimmed title (which should be done for HTML in all cases):
    this.title = titleText.ToString().Trim();
    // the body text is stored exactly as collected (no trimming)
    this.body = bodyText.ToString();
}
private class DefaultHandlerAnonymousHelper : DefaultHandler
{
// Nesting counters: each is incremented when the matching element starts
// and decremented when it ends.
private int inBODY = 0;
private int inHEAD = 0;
private int inTITLE = 0;

// Count of open suppressed elements inside the body; body characters are
// only collected while this is zero.
private int suppressed = 0;

// Parser whose meta tag dictionary receives the <meta> values.
private readonly Parser outerInstance;

// Destination buffers owned by the Parser constructor.
private readonly StringBuilder title;
private readonly StringBuilder body;

/// <summary>
/// Creates a handler that writes into the given buffers.
/// </summary>
/// <param name="outerInstance">Parser that receives the meta tags.</param>
/// <param name="title">Buffer that title characters are appended to.</param>
/// <param name="body">Buffer that body characters are appended to.</param>
public DefaultHandlerAnonymousHelper(Parser outerInstance, StringBuilder title, StringBuilder body)
{
    this.title = title;
    this.body = body;
    this.outerInstance = outerInstance;
}
/// <summary>
/// Handles the start of an element. Inside the head it counts title
/// elements and records meta tags; inside the body it counts suppressed
/// elements and appends image alt text; otherwise it tracks entry into
/// body and head.
/// </summary>
/// <param name="uri">Namespace URI of the element (not used here).</param>
/// <param name="localName">Local element name; this is what all checks look at.</param>
/// <param name="qName">Qualified element name (not used here).</param>
/// <param name="atts">Attributes of the element.</param>
/// <exception cref="SAXException">
/// A <c>frameset</c> element starts while neither head nor body is open.
/// </exception>
public override void StartElement(string uri, string localName, string qName, IAttributes atts)
{
    if (inHEAD > 0)
    {
        if (string.Equals(localName, "title", StringComparison.OrdinalIgnoreCase))
        {
            inTITLE++;
        }
        else if (string.Equals(localName, "meta", StringComparison.OrdinalIgnoreCase))
        {
            RecordMetaTag(atts);
        }
        return;
    }

    if (inBODY > 0)
    {
        if (SUPPRESS_ELEMENTS.Contains(localName))
        {
            suppressed++;
        }
        else if (string.Equals(localName, "img", StringComparison.OrdinalIgnoreCase))
        {
            AppendImageAlt(atts);
        }
        return;
    }

    // Neither head nor body is open yet.
    if (string.Equals(localName, "body", StringComparison.OrdinalIgnoreCase))
    {
        inBODY++;
        return;
    }

    if (string.Equals(localName, "head", StringComparison.OrdinalIgnoreCase))
    {
        inHEAD++;
        return;
    }

    if (string.Equals(localName, "frameset", StringComparison.OrdinalIgnoreCase))
    {
        throw new SAXException("This parser does not support HTML framesets.");
    }
}

/// <summary>
/// Stores the <c>content</c> of a meta element under its lower-cased
/// <c>name</c>, falling back to <c>http-equiv</c> when there is no name.
/// Nothing is stored when either the key or the content is missing.
/// </summary>
private void RecordMetaTag(IAttributes atts)
{
    string key = atts.GetValue("name") ?? atts.GetValue("http-equiv");
    string content = atts.GetValue("content");
    if (key == null || content == null)
    {
        return;
    }

    outerInstance.metaTags[key.ToLowerInvariant()] = content;
}

/// <summary>
/// Appends the <c>alt</c> attribute of an image to the body text, wrapped
/// in square brackets; does nothing when the attribute is absent.
/// </summary>
private void AppendImageAlt(IAttributes atts)
{
    // the original javacc-based parser preserved <IMG alt="..."/>
    // attribute as body text in [] parenthesis:
    string altText = atts.GetValue("alt");
    if (altText == null)
    {
        return;
    }

    body.Append('[');
    body.Append(altText);
    body.Append(']');
}
/// <summary>
/// Handles the end of an element. While the body is open it closes the
/// body, adds a line break for block elements, or leaves a suppressed
/// element; while only the head is open it closes the head or the title.
/// </summary>
/// <param name="uri">Namespace URI of the element (not used here).</param>
/// <param name="localName">Local element name; this is what all checks look at.</param>
/// <param name="qName">Qualified element name (not used here).</param>
public override void EndElement(string uri, string localName, string qName)
{
    if (inBODY > 0)
    {
        if (string.Equals(localName, "body", StringComparison.OrdinalIgnoreCase))
        {
            inBODY--;
            return;
        }

        if (ENDLINE_ELEMENTS.Contains(localName))
        {
            body.Append('\n');
            return;
        }

        if (SUPPRESS_ELEMENTS.Contains(localName))
        {
            suppressed--;
        }
        return;
    }

    if (inHEAD <= 0)
    {
        // outside of both body and head: nothing is tracked
        return;
    }

    if (string.Equals(localName, "head", StringComparison.OrdinalIgnoreCase))
    {
        inHEAD--;
        return;
    }

    // only leave a title that was actually entered
    bool closesTitle = inTITLE > 0
        && string.Equals(localName, "title", StringComparison.OrdinalIgnoreCase);
    if (closesTitle)
    {
        inTITLE--;
    }
}
/// <summary>
/// Collects character data: into the body buffer while the body is open
/// and no suppressed element is active, otherwise into the title buffer
/// while a title is open. All other character data is dropped.
/// </summary>
/// <param name="ch">Buffer holding the characters.</param>
/// <param name="start">Index of the first character to take from <paramref name="ch"/>.</param>
/// <param name="length">Number of characters to take.</param>
public override void Characters(char[] ch, int start, int length)
{
    bool collectingBody = inBODY > 0 && suppressed == 0;
    if (collectingBody)
    {
        body.Append(ch, start, length);
        return;
    }

    if (inTITLE > 0)
    {
        title.Append(ch, start, length);
    }
}
/// <summary>
/// Answers every entity lookup with an empty document.
/// </summary>
/// <param name="publicId">Public identifier of the entity (ignored).</param>
/// <param name="systemId">System identifier of the entity (ignored).</param>
/// <returns>An <see cref="InputSource"/> over an empty string.</returns>
public override InputSource ResolveEntity(string publicId, string systemId)
{
    // disable network access caused by DTDs
    StringReader emptyDocument = new StringReader("");
    return new InputSource(emptyDocument);
}
}
/// <summary>
/// Builds a read-only set of element names whose membership test ignores case.
/// </summary>
/// <remarks>
/// The handler compares individual element names ("body", "head", "title", ...)
/// with <see cref="StringComparison.OrdinalIgnoreCase"/>. The set uses the
/// matching comparer so that <c>Contains</c> treats names the same way.
/// Otherwise an upper-case element name would be recognised by the direct
/// comparisons but missed by the set lookups.
/// </remarks>
/// <param name="names">The element names to include.</param>
/// <returns>A read-only view over a case-insensitive set of <paramref name="names"/>.</returns>
private static ISet<string> CreateElementNameSet(params string[] names)
{
    return new JCG.HashSet<string>(names, StringComparer.OrdinalIgnoreCase).AsReadOnly();
}
/// <summary>
/// Names of the block-level HTML elements; when one of them ends inside the
/// body, a line break is added to the body text.
/// </summary>
internal static readonly ISet<string> ENDLINE_ELEMENTS = CreateElementNameSet(new string[]
{
    "p",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "div", "ul", "ol", "dl",
    "pre", "hr", "blockquote", "address", "fieldset", "table", "form",
    "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option"
});

/// <summary>
/// Names of the HTML elements whose character content is left out of the body text.
/// </summary>
internal static readonly ISet<string> SUPPRESS_ELEMENTS = CreateElementNameSet(new string[]
{
    "style",
    "script"
});
}
/// <summary>
/// Parses HTML read from <paramref name="reader"/> into <paramref name="docData"/>
/// by delegating to the <see cref="InputSource"/>-based overload.
/// </summary>
/// <param name="docData">Document data object passed through to the other overload.</param>
/// <param name="name">Name passed through to the other overload.</param>
/// <param name="date">Date passed through to the other overload.</param>
/// <param name="reader">Reader supplying the HTML text.</param>
/// <param name="trecSrc">Content source passed through to the other overload.</param>
/// <returns>Whatever the <see cref="InputSource"/>-based overload returns.</returns>
/// <exception cref="IOException">
/// A <see cref="SAXException"/> was raised while parsing; it is attached as the inner exception.
/// </exception>
public virtual DocData Parse(DocData docData, string name, DateTime? date, TextReader reader, TrecContentSource trecSrc)
{
    try
    {
        InputSource wrappedReader = new InputSource(reader);
        return Parse(docData, name, date, wrappedReader, trecSrc);
    }
    catch (SAXException parseFailure)
    {
        throw new IOException("SAX exception occurred while parsing HTML document.", parseFailure);
    }
}
/// <summary>
/// Parses the HTML in <paramref name="source"/> and fills <paramref name="docData"/>
/// with the resulting name, body, title, meta tags and date.
/// </summary>
/// <param name="docData">Document data object that is cleared and then populated.</param>
/// <param name="name">Name to assign to the document.</param>
/// <param name="date">
/// Date to assign unless the document carries a "date" meta tag that
/// <paramref name="trecSrc"/> parses to a non-null value.
/// </param>
/// <param name="source">The HTML input.</param>
/// <param name="trecSrc">Used to parse the value of a "date" meta tag, if present.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public virtual DocData Parse(DocData docData, string name, DateTime? date, InputSource source, TrecContentSource trecSrc)
{
    Parser parsedHtml = new Parser(source);

    // properties
    IDictionary<string, string> metaProps = parsedHtml.MetaTags;

    // A parsable "date" meta tag takes precedence over the date passed in.
    string rawDate;
    bool hasDateTag = metaProps.TryGetValue("date", out rawDate) && rawDate != null;
    if (hasDateTag)
    {
        DateTime? dateFromMeta = trecSrc.ParseDate(rawDate);
        date = dateFromMeta ?? date;
    }

    docData.Clear();
    docData.Name = name;
    docData.Body = parsedHtml.Body;
    docData.Title = parsedHtml.Title;
    docData.Props = metaProps;
    docData.SetDate(date);
    return docData;
}
}
}