| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.parse.tika; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.io.ByteArrayInputStream; |
| import java.net.MalformedURLException; |
| import java.net.URL; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.html.dom.HTMLDocumentImpl; |
| import org.apache.nutch.metadata.Nutch; |
| import org.apache.nutch.parse.HTMLMetaTags; |
| import org.apache.nutch.parse.HtmlParseFilters; |
| import org.apache.nutch.parse.Outlink; |
| import org.apache.nutch.parse.OutlinkExtractor; |
| import org.apache.nutch.parse.Parse; |
| import org.apache.nutch.parse.ParseData; |
| import org.apache.nutch.parse.ParseImpl; |
| import org.apache.nutch.parse.ParseResult; |
| import org.apache.nutch.parse.ParseStatus; |
| import org.apache.nutch.protocol.Content; |
| import org.apache.tika.config.TikaConfig; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.mime.MediaType; |
| import org.apache.tika.parser.html.BoilerpipeContentHandler; |
| import org.apache.tika.parser.AutoDetectParser; |
| import org.apache.tika.parser.CompositeParser; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.parser.Parser; |
| import org.apache.tika.parser.html.HtmlMapper; |
| import org.apache.tika.sax.XHTMLContentHandler; |
| import org.apache.tika.sax.Link; |
| import org.apache.tika.sax.LinkContentHandler; |
| import org.apache.tika.sax.TeeContentHandler; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| import org.w3c.dom.DocumentFragment; |
| import org.xml.sax.ContentHandler; |
| |
| /** |
| * Wrapper for Tika parsers. Mimics the HTMLParser but using the XHTML |
| * representation returned by Tika as SAX events |
| */ |
| public class TikaParser implements org.apache.nutch.parse.Parser { |
| |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| private Configuration conf; |
| private TikaConfig tikaConfig = null; |
| private DOMContentUtils utils; |
| private HtmlParseFilters htmlParseFilters; |
| private String cachingPolicy; |
| private HtmlMapper HTMLMapper; |
| private boolean parseEmbedded = true; |
| private boolean upperCaseElementNames = true; |
| private boolean useBoilerpipe; |
| private String boilerpipeExtractorName; |
| private Set<String> boilerpipeMimeTypes; |
| |
| @Override |
| public ParseResult getParse(Content content) { |
| HTMLDocumentImpl doc = new HTMLDocumentImpl(); |
| doc.setErrorChecking(false); |
| DocumentFragment root = doc.createDocumentFragment(); |
| |
| return getParse(content, doc, root); |
| } |
| |
| @SuppressWarnings("deprecation") |
| ParseResult getParse(Content content, HTMLDocumentImpl doc, |
| DocumentFragment root) { |
| String mimeType = content.getContentType(); |
| |
| URL base; |
| try { |
| base = new URL(content.getBaseUrl()); |
| } catch (MalformedURLException e) { |
| return new ParseStatus(e).getEmptyParseResult(content.getUrl(), |
| getConf()); |
| } |
| |
| // get the right parser using the mime type as a clue |
| CompositeParser compositeParser = (CompositeParser) tikaConfig.getParser(); |
| Parser parser = compositeParser.getParsers().get(MediaType.parse(mimeType)); |
| if (parser == null) { |
| String message = "Can't retrieve Tika parser for mime-type " + mimeType; |
| LOG.error(message); |
| return new ParseStatus(ParseStatus.FAILED, message) |
| .getEmptyParseResult(content.getUrl(), getConf()); |
| } |
| |
| LOG.debug("Using Tika parser {} for mime-type {}.", |
| parser.getClass().getName(), mimeType); |
| |
| byte[] raw = content.getContent(); |
| Metadata tikamd = new Metadata(); |
| |
| ContentHandler domHandler; |
| |
| // Check whether to use Tika's BoilerplateContentHandler |
| if (useBoilerpipe && boilerpipeMimeTypes.contains(mimeType)) { |
| BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler( |
| (ContentHandler) new DOMBuilder(doc, root), |
| BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName)); |
| bpHandler.setIncludeMarkup(true); |
| domHandler = (ContentHandler) bpHandler; |
| } else { |
| DOMBuilder domBuilder = new DOMBuilder(doc, root); |
| domBuilder.setUpperCaseElementNames(upperCaseElementNames); |
| domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML); |
| domHandler = (ContentHandler) domBuilder; |
| } |
| |
| LinkContentHandler linkContentHandler = new LinkContentHandler(); |
| |
| ParseContext context = new ParseContext(); |
| if (parseEmbedded) { |
| context.set(Parser.class, new AutoDetectParser(tikaConfig)); |
| } |
| |
| TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, |
| linkContentHandler); |
| |
| if (HTMLMapper != null) |
| context.set(HtmlMapper.class, HTMLMapper); |
| tikamd.set(Metadata.CONTENT_TYPE, mimeType); |
| try { |
| parser.parse(new ByteArrayInputStream(raw), |
| (ContentHandler) teeContentHandler, tikamd, context); |
| } catch (Exception e) { |
| LOG.error("Error parsing " + content.getUrl(), e); |
| return new ParseStatus(ParseStatus.FAILED, e.getMessage()) |
| .getEmptyParseResult(content.getUrl(), getConf()); |
| } |
| |
| HTMLMetaTags metaTags = new HTMLMetaTags(); |
| String text = ""; |
| String title = ""; |
| Outlink[] outlinks = new Outlink[0]; |
| org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata(); |
| |
| // we have converted the sax events generated by Tika into a DOM object |
| // so we can now use the usual HTML resources from Nutch |
| // get meta directives |
| HTMLMetaProcessor.getMetaTags(metaTags, root, base); |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); |
| } |
| |
| // check meta directives |
| if (!metaTags.getNoIndex()) { // okay to index |
| StringBuffer sb = new StringBuffer(); |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("Getting text..."); |
| } |
| utils.getText(sb, root); // extract text |
| text = sb.toString(); |
| sb.setLength(0); |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("Getting title..."); |
| } |
| utils.getTitle(sb, root); // extract title |
| title = sb.toString().trim(); |
| } |
| |
| if (!metaTags.getNoFollow()) { // okay to follow links |
| ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks |
| URL baseTag = base; |
| String baseTagHref = tikamd.get("Content-Location"); |
| if (baseTagHref != null) { |
| try { |
| baseTag = new URL(base, baseTagHref); |
| } catch (MalformedURLException e) { |
| LOG.trace("Invalid <base href=\"{}\">", baseTagHref); |
| } |
| } |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("Getting links (base URL = {}) ...", baseTag); |
| } |
| |
| // pre-1233 outlink extraction |
| // utils.getOutlinks(baseTag != null ? baseTag : base, l, root); |
| // Get outlinks from Tika |
| List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks(); |
| utils.getOutlinks(baseTag, l, tikaExtractedOutlinks); |
| outlinks = l.toArray(new Outlink[l.size()]); |
| if (LOG.isTraceEnabled()) { |
| LOG.trace( |
| "found " + outlinks.length + " outlinks in " + content.getUrl()); |
| } |
| } |
| |
| // populate Nutch metadata with Tika metadata |
| String[] TikaMDNames = tikamd.names(); |
| for (String tikaMDName : TikaMDNames) { |
| if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) |
| continue; |
| String[] values = tikamd.getValues(tikaMDName); |
| for (String v : values) { |
| nutchMetadata.add(tikaMDName, v); |
| if (tikaMDName.equalsIgnoreCase(Nutch.ROBOTS_METATAG) |
| && nutchMetadata.get(Nutch.ROBOTS_METATAG) == null) { |
| // NUTCH-2720 force lowercase robots directive |
| nutchMetadata.add(Nutch.ROBOTS_METATAG, v); |
| } |
| } |
| } |
| |
| // no outlinks? try OutlinkExtractor e.g works for mime types where no |
| // explicit markup for anchors |
| |
| if (outlinks.length == 0) { |
| outlinks = OutlinkExtractor.getOutlinks(text, getConf()); |
| } |
| |
| ParseStatus status = new ParseStatus(ParseStatus.SUCCESS); |
| if (metaTags.getRefresh()) { |
| status.setMinorCode(ParseStatus.SUCCESS_REDIRECT); |
| status.setArgs(new String[] { metaTags.getRefreshHref().toString(), |
| Integer.toString(metaTags.getRefreshTime()) }); |
| } |
| ParseData parseData = new ParseData(status, title, outlinks, |
| content.getMetadata(), nutchMetadata); |
| ParseResult parseResult = ParseResult.createParseResult(content.getUrl(), |
| new ParseImpl(text, parseData)); |
| |
| // run filters on parse |
| ParseResult filteredParse = this.htmlParseFilters.filter(content, |
| parseResult, metaTags, root); |
| if (metaTags.getNoCache()) { // not okay to cache |
| for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) |
| entry.getValue().getData().getParseMeta() |
| .set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy); |
| } |
| return filteredParse; |
| } |
| |
| @Override |
| public void setConf(Configuration conf) { |
| this.conf = conf; |
| this.tikaConfig = null; |
| |
| // do we want a custom Tika configuration file |
| // deprecated since Tika 0.7 which is based on |
| // a service provider based configuration |
| String customConfFile = conf.get("tika.config.file"); |
| if (customConfFile != null) { |
| try { |
| // see if a Tika config file can be found in the job file |
| URL customTikaConfig = conf.getResource(customConfFile); |
| if (customTikaConfig != null) { |
| tikaConfig = new TikaConfig(customTikaConfig, |
| this.getClass().getClassLoader()); |
| } |
| } catch (Exception e1) { |
| String message = "Problem loading custom Tika configuration from " |
| + customConfFile; |
| LOG.error(message, e1); |
| } |
| } |
| if (tikaConfig == null) { |
| try { |
| tikaConfig = new TikaConfig(this.getClass().getClassLoader()); |
| } catch (Exception e2) { |
| String message = "Problem loading default Tika configuration"; |
| LOG.error(message, e2); |
| } |
| } |
| |
| // use a custom htmlmapper |
| String htmlmapperClassName = conf.get("tika.htmlmapper.classname"); |
| if (StringUtils.isNotBlank(htmlmapperClassName)) { |
| try { |
| Class<?> HTMLMapperClass = Class.forName(htmlmapperClassName); |
| boolean interfaceOK = HtmlMapper.class |
| .isAssignableFrom(HTMLMapperClass); |
| if (!interfaceOK) { |
| throw new RuntimeException("Class " + htmlmapperClassName |
| + " does not implement HtmlMapper"); |
| } |
| HTMLMapper = (HtmlMapper) HTMLMapperClass.getConstructor() |
| .newInstance(); |
| } catch (Exception e) { |
| String message = "Can't generate instance for class " |
| + htmlmapperClassName; |
| LOG.error(message); |
| throw new RuntimeException(message); |
| } |
| } |
| |
| htmlParseFilters = new HtmlParseFilters(conf); |
| utils = new DOMContentUtils(conf); |
| cachingPolicy = conf.get("parser.caching.forbidden.policy", |
| Nutch.CACHING_FORBIDDEN_CONTENT); |
| upperCaseElementNames = conf.getBoolean("tika.uppercase.element.names", |
| true); |
| useBoilerpipe = conf.get("tika.extractor", "none").equals("boilerpipe"); |
| boilerpipeExtractorName = conf.get("tika.extractor.boilerpipe.algorithm", |
| "ArticleExtractor"); |
| boilerpipeMimeTypes = new HashSet<>(Arrays |
| .asList(conf.getTrimmedStrings("tika.extractor.boilerpipe.mime.types", |
| "text/html", "application/xhtml+xml"))); |
| parseEmbedded = conf.getBoolean("tika.parse.embedded", true); |
| } |
| |
| @Override |
| public Configuration getConf() { |
| return this.conf; |
| } |
| |
| } |