/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.benchmark.byTask.feeds; |
| |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Date; |
| import java.util.HashSet; |
| import java.util.Locale; |
| import java.util.Properties; |
| import java.util.Set; |
| |
| import org.cyberneko.html.parsers.SAXParser; |
| |
| import org.xml.sax.Attributes; |
| import org.xml.sax.InputSource; |
| import org.xml.sax.SAXException; |
| import org.xml.sax.helpers.DefaultHandler; |
| |
| /** |
| * Simple HTML Parser extracting title, meta tags, and body text |
| * that is based on <a href="http://nekohtml.sourceforge.net/">NekoHTML</a>. |
| */ |
| public class DemoHTMLParser implements HTMLParser { |
| |
| /** The actual parser to read HTML documents */ |
| public static final class Parser { |
| |
| public final Properties metaTags = new Properties(); |
| public final String title, body; |
| |
| public Parser(Reader reader) throws IOException, SAXException { |
| this(new InputSource(reader)); |
| } |
| |
| public Parser(InputSource source) throws IOException, SAXException { |
| final SAXParser parser = new SAXParser(); |
| parser.setFeature("http://xml.org/sax/features/namespaces", true); |
| parser.setFeature("http://cyberneko.org/html/features/balance-tags", true); |
| parser.setFeature("http://cyberneko.org/html/features/report-errors", false); |
| parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); |
| parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower"); |
| |
| final StringBuilder title = new StringBuilder(), body = new StringBuilder(); |
| final DefaultHandler handler = new DefaultHandler() { |
| private int inBODY = 0, inHEAD = 0, inTITLE = 0, suppressed = 0; |
| |
| @Override |
| public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { |
| if (inHEAD > 0) { |
| if ("title".equals(localName)) { |
| inTITLE++; |
| } else { |
| if ("meta".equals(localName)) { |
| String name = atts.getValue("name"); |
| if (name == null) { |
| name = atts.getValue("http-equiv"); |
| } |
| final String val = atts.getValue("content"); |
| if (name != null && val != null) { |
| metaTags.setProperty(name.toLowerCase(Locale.ROOT), val); |
| } |
| } |
| } |
| } else if (inBODY > 0) { |
| if (SUPPRESS_ELEMENTS.contains(localName)) { |
| suppressed++; |
| } else if ("img".equals(localName)) { |
| // the original javacc-based parser preserved <IMG alt="..."/> |
| // attribute as body text in [] parenthesis: |
| final String alt = atts.getValue("alt"); |
| if (alt != null) { |
| body.append('[').append(alt).append(']'); |
| } |
| } |
| } else if ("body".equals(localName)) { |
| inBODY++; |
| } else if ("head".equals(localName)) { |
| inHEAD++; |
| } else if ("frameset".equals(localName)) { |
| throw new SAXException("This parser does not support HTML framesets."); |
| } |
| } |
| |
| @Override |
| public void endElement(String namespaceURI, String localName, String qName) throws SAXException { |
| if (inBODY > 0) { |
| if ("body".equals(localName)) { |
| inBODY--; |
| } else if (ENDLINE_ELEMENTS.contains(localName)) { |
| body.append('\n'); |
| } else if (SUPPRESS_ELEMENTS.contains(localName)) { |
| suppressed--; |
| } |
| } else if (inHEAD > 0) { |
| if ("head".equals(localName)) { |
| inHEAD--; |
| } else if (inTITLE > 0 && "title".equals(localName)) { |
| inTITLE--; |
| } |
| } |
| } |
| |
| @Override |
| public void characters(char[] ch, int start, int length) throws SAXException { |
| if (inBODY > 0 && suppressed == 0) { |
| body.append(ch, start, length); |
| } else if (inTITLE > 0) { |
| title.append(ch, start, length); |
| } |
| } |
| |
| @Override |
| public InputSource resolveEntity(String publicId, String systemId) { |
| // disable network access caused by DTDs |
| return new InputSource(new StringReader("")); |
| } |
| }; |
| |
| parser.setContentHandler(handler); |
| parser.setErrorHandler(handler); |
| parser.parse(source); |
| |
| // the javacc-based parser trimmed title (which should be done for HTML in all cases): |
| this.title = title.toString().trim(); |
| |
| // assign body text |
| this.body = body.toString(); |
| } |
| |
| private static final Set<String> createElementNameSet(String... names) { |
| return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(names))); |
| } |
| |
| /** HTML elements that cause a line break (they are block-elements) */ |
| static final Set<String> ENDLINE_ELEMENTS = createElementNameSet( |
| "p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl", |
| "pre", "hr", "blockquote", "address", "fieldset", "table", "form", |
| "noscript", "li", "dt", "dd", "noframes", "br", "tr", "select", "option" |
| ); |
| |
| /** HTML elements with contents that are ignored */ |
| static final Set<String> SUPPRESS_ELEMENTS = createElementNameSet( |
| "style", "script" |
| ); |
| } |
| |
| @Override |
| public DocData parse(DocData docData, String name, Date date, Reader reader, TrecContentSource trecSrc) throws IOException { |
| try { |
| return parse(docData, name, date, new InputSource(reader), trecSrc); |
| } catch (SAXException saxe) { |
| throw new IOException("SAX exception occurred while parsing HTML document.", saxe); |
| } |
| } |
| |
| public DocData parse(DocData docData, String name, Date date, InputSource source, TrecContentSource trecSrc) throws IOException, SAXException { |
| final Parser p = new Parser(source); |
| |
| // properties |
| final Properties props = p.metaTags; |
| String dateStr = props.getProperty("date"); |
| if (dateStr != null) { |
| final Date newDate = trecSrc.parseDate(dateStr); |
| if (newDate != null) { |
| date = newDate; |
| } |
| } |
| |
| docData.clear(); |
| docData.setName(name); |
| docData.setBody(p.body); |
| docData.setTitle(p.title); |
| docData.setProps(props); |
| docData.setDate(date); |
| return docData; |
| } |
| |
| } |