/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.analysis.lang;

import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NodeWalker;
import org.apache.tika.language.LanguageIdentifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
| |
| public class HTMLLanguageParser implements HtmlParseFilter { |
| |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| private int detect = -1, identify = -1; |
| |
| private int contentMaxlength = -1; |
| |
| private boolean onlyCertain = false; |
| |
| /* A static Map of ISO-639 language codes */ |
| private static Map<String, String> LANGUAGES_MAP = new HashMap<String, String>(); |
| static { |
| try { |
| Properties p = new Properties(); |
| p.load(HTMLLanguageParser.class |
| .getResourceAsStream("langmappings.properties")); |
| Enumeration<?> keys = p.keys(); |
| while (keys.hasMoreElements()) { |
| String key = (String) keys.nextElement(); |
| String[] values = p.getProperty(key).split(",", -1); |
| LANGUAGES_MAP.put(key, key); |
| for (int i = 0; i < values.length; i++) { |
| LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key); |
| } |
| } |
| } catch (Exception e) { |
| if (LOG.isErrorEnabled()) { |
| LOG.error(e.toString()); |
| } |
| } |
| } |
| |
| private Configuration conf; |
| |
| /** |
| * Scan the HTML document looking at possible indications of content language<br> |
| * <ul> |
| * <li>1. html lang attribute |
| * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) <li>2. meta |
| * dc.language |
| * (http://dublincore.org/documents/2000/07/16/usageguide/qualified |
| * -html.shtml#language) <li>3. meta http-equiv (content-language) |
| * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br></ul> |
| */ |
| public ParseResult filter(Content content, ParseResult parseResult, |
| HTMLMetaTags metaTags, DocumentFragment doc) { |
| String lang = null; |
| |
| Parse parse = parseResult.get(content.getUrl()); |
| |
| if (detect >= 0 && identify < 0) { |
| lang = detectLanguage(parse, doc); |
| } else if (detect < 0 && identify >= 0) { |
| lang = identifyLanguage(parse); |
| } else if (detect < identify) { |
| lang = detectLanguage(parse, doc); |
| if (lang == null) { |
| lang = identifyLanguage(parse); |
| } |
| } else if (identify < detect) { |
| lang = identifyLanguage(parse); |
| if (lang == null) { |
| lang = detectLanguage(parse, doc); |
| } |
| } else { |
| LOG.warn("No configuration for language extraction policy is provided"); |
| return parseResult; |
| } |
| |
| if (lang != null) { |
| parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang); |
| return parseResult; |
| } |
| |
| return parseResult; |
| } |
| |
| /** Try to find the document's language from page headers and metadata */ |
| private String detectLanguage(Parse page, DocumentFragment doc) { |
| String lang = getLanguageFromMetadata(page.getData().getParseMeta()); |
| if (lang == null) { |
| LanguageParser parser = new LanguageParser(doc); |
| lang = parser.getLanguage(); |
| } |
| |
| if (lang != null) { |
| return lang; |
| } |
| |
| lang = page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); |
| |
| return lang; |
| } |
| |
| /** Use statistical language identification to extract page language */ |
| private String identifyLanguage(Parse parse) { |
| StringBuilder text = new StringBuilder(); |
| if (parse == null) |
| return null; |
| |
| String title = parse.getData().getTitle(); |
| if (title != null) { |
| text.append(title.toString()); |
| } |
| |
| String content = parse.getText(); |
| if (content != null) { |
| text.append(" ").append(content.toString()); |
| } |
| |
| // trim content? |
| String titleandcontent = text.toString(); |
| |
| if (this.contentMaxlength != -1 |
| && titleandcontent.length() > this.contentMaxlength) |
| titleandcontent = titleandcontent.substring(0, contentMaxlength); |
| |
| LanguageIdentifier identifier = new LanguageIdentifier(titleandcontent); |
| |
| if (onlyCertain) { |
| if (identifier.isReasonablyCertain()) |
| return identifier.getLanguage(); |
| else |
| return null; |
| } |
| return identifier.getLanguage(); |
| } |
| |
| // Check in the metadata whether the language has already been stored there |
| // by Tika |
| private static String getLanguageFromMetadata(Metadata meta) { |
| if (meta == null) |
| return null; |
| // dublin core |
| String lang = meta.get("dc.language"); |
| if (lang != null) |
| return lang; |
| // meta content-language |
| lang = meta.get("content-language"); |
| if (lang != null) |
| return lang; |
| // lang attribute |
| return meta.get("lang"); |
| } |
| |
| static class LanguageParser { |
| |
| private String dublinCore = null; |
| private String htmlAttribute = null; |
| private String httpEquiv = null; |
| private String language = null; |
| |
| LanguageParser(Node node) { |
| parse(node); |
| if (htmlAttribute != null) { |
| language = htmlAttribute; |
| } else if (dublinCore != null) { |
| language = dublinCore; |
| } else { |
| language = httpEquiv; |
| } |
| } |
| |
| String getLanguage() { |
| return language; |
| } |
| |
| void parse(Node node) { |
| |
| NodeWalker walker = new NodeWalker(node); |
| while (walker.hasNext()) { |
| |
| Node currentNode = walker.nextNode(); |
| String nodeName = currentNode.getNodeName(); |
| short nodeType = currentNode.getNodeType(); |
| |
| if (nodeType == Node.ELEMENT_NODE) { |
| |
| // Check for the lang HTML attribute |
| if (htmlAttribute == null) { |
| htmlAttribute = parseLanguage(((Element) currentNode) |
| .getAttribute("lang")); |
| } |
| |
| // Check for Meta |
| if ("meta".equalsIgnoreCase(nodeName)) { |
| NamedNodeMap attrs = currentNode.getAttributes(); |
| |
| // Check for the dc.language Meta |
| if (dublinCore == null) { |
| for (int i = 0; i < attrs.getLength(); i++) { |
| Node attrnode = attrs.item(i); |
| if ("name".equalsIgnoreCase(attrnode.getNodeName())) { |
| if ("dc.language".equalsIgnoreCase(attrnode.getNodeValue())) { |
| Node valueattr = attrs.getNamedItem("content"); |
| if (valueattr != null) { |
| dublinCore = parseLanguage(valueattr.getNodeValue()); |
| } |
| } |
| } |
| } |
| } |
| |
| // Check for the http-equiv content-language |
| if (httpEquiv == null) { |
| for (int i = 0; i < attrs.getLength(); i++) { |
| Node attrnode = attrs.item(i); |
| if ("http-equiv".equalsIgnoreCase(attrnode.getNodeName())) { |
| if ("content-language".equals(attrnode.getNodeValue() |
| .toLowerCase())) { |
| Node valueattr = attrs.getNamedItem("content"); |
| if (valueattr != null) { |
| httpEquiv = parseLanguage(valueattr.getNodeValue()); |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| if ((dublinCore != null) && (htmlAttribute != null) |
| && (httpEquiv != null)) { |
| return; |
| } |
| } |
| } |
| |
| /** |
| * Parse a language string and return an ISO 639 primary code, or |
| * <code>null</code> if something wrong occurs, or if no language is found. |
| */ |
| final static String parseLanguage(String lang) { |
| |
| if (lang == null) { |
| return null; |
| } |
| |
| String code = null; |
| String language = null; |
| |
| // First, split multi-valued values |
| String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1); |
| |
| int i = 0; |
| while ((language == null) && (i < langs.length)) { |
| // Then, get the primary code |
| code = langs[i].split("-")[0]; |
| code = code.split("_")[0]; |
| // Find the ISO 639 code |
| language = (String) LANGUAGES_MAP.get(code.toLowerCase()); |
| i++; |
| } |
| |
| return language; |
| } |
| |
| } |
| |
| public void setConf(Configuration conf) { |
| this.conf = conf; |
| contentMaxlength = conf.getInt("lang.analyze.max.length", -1); |
| onlyCertain = conf.getBoolean("lang.identification.only.certain", false); |
| String[] policy = conf.getStrings("lang.extraction.policy"); |
| for (int i = 0; i < policy.length; i++) { |
| if (policy[i].equals("detect")) { |
| detect = i; |
| } else if (policy[i].equals("identify")) { |
| identify = i; |
| } |
| } |
| } |
| |
| public Configuration getConf() { |
| return this.conf; |
| } |
| |
| } |