| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.any23.encoding; |
| |
| import org.apache.tika.detect.TextStatistics; |
| import org.apache.tika.parser.txt.CharsetDetector; |
| import org.apache.tika.parser.txt.CharsetMatch; |
| import org.jsoup.nodes.Comment; |
| import org.jsoup.nodes.DataNode; |
| import org.jsoup.nodes.Document; |
| import org.jsoup.nodes.DocumentType; |
| import org.jsoup.nodes.Element; |
| import org.jsoup.nodes.Node; |
| import org.jsoup.nodes.PseudoTextElement; |
| import org.jsoup.nodes.TextNode; |
| import org.jsoup.parser.ParseError; |
| import org.jsoup.parser.ParseErrorList; |
| import org.jsoup.parser.Parser; |
| import org.jsoup.select.NodeTraversor; |
| import org.jsoup.select.NodeVisitor; |
| |
| import java.io.BufferedInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.nio.charset.Charset; |
| |
| import static java.nio.charset.StandardCharsets.UTF_8; |
| import static java.nio.charset.StandardCharsets.ISO_8859_1; |
| |
| /** |
| * An implementation of {@link EncodingDetector} based on |
| * <a href="http://tika.apache.org/">Apache Tika</a>. |
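 * <p>
 * A minimal usage sketch (the sample input and the detected name are illustrative, not normative):
 * <pre>{@code
 * try (InputStream in = new ByteArrayInputStream("<p>café</p>".getBytes(StandardCharsets.UTF_8))) {
 *     String charset = new TikaEncodingDetector().guessEncoding(in); // "UTF-8" for this input
 * }
 * }</pre>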
| * |
 * @author Michele Mostarda (michele.mostarda@gmail.com)
 * @author Davide Palmisano (dpalmisano@gmail.com)
 * @author Hans Brende (hansbrende@apache.org)
| * @version $Id$ |
| */ |
| public class TikaEncodingDetector implements EncodingDetector { |
| |
| @Override |
| public String guessEncoding(InputStream input) throws IOException { |
        return guessEncoding(input, (String) null);
| } |
| |
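    // "< />" encoded as UTF-8: decoding these bytes with a candidate charset yields the same
    // string again only if the charset is ASCII-compatible.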
| private static final String TAG_CHARS = "< />"; |
| private static final byte[] TAG_BYTES = TAG_CHARS.getBytes(UTF_8); |
| private static final Node[] EMPTY_NODES = new Node[0]; |
| |
| private static Charset guessEncoding(InputStream is, Charset declared) throws IOException { |
| if (!is.markSupported()) { |
| is = new BufferedInputStream(is); |
| } |
| |
| TextStatistics stats = computeAndReset(is, EncodingUtils::stats); |
| |
        // We've overridden the looksLikeUTF8() method to be 100% precise, as in jchardet.
| if (stats.looksLikeUTF8()) { |
| // > 92% of the web is UTF-8. Do not risk false positives from other charsets. |
| // See https://issues.apache.org/jira/browse/TIKA-2771 |
| // and https://issues.apache.org/jira/browse/TIKA-539 |
| return UTF_8; |
| } |
| |
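        // A charset declared upstream (e.g. in a Content-Type header) is honored once it has been
        // reconciled with the observed byte statistics.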
| declared = EncodingUtils.correctVariant(stats, declared); |
| if (declared != null) { |
| return declared; |
| } |
| |
        // ISO-8859-1 is the only one of Java's "standard charsets" that maps 1-to-1 onto the first 256 Unicode
        // code points; use it to round-trip the bytes after stripping HTML/XML tags from the input.
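        // (e.g. new String(bytes, ISO_8859_1).getBytes(ISO_8859_1) always reproduces the original bytes)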
| String iso_8859_1 = computeAndReset(is, EncodingUtils::iso_8859_1); |
| |
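        // Check for an encoding declared in the document itself, e.g. an XML declaration.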
| Charset xmlCharset = EncodingUtils.xmlCharset(stats, iso_8859_1); |
| if (xmlCharset != null) { |
| return xmlCharset; |
| } |
| |
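        // Parse leniently with jsoup, tracking every parse error; the error count feeds the
        // tag-quality heuristic below.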
| ParseErrorList htmlErrors = ParseErrorList.tracking(Integer.MAX_VALUE); |
| Document doc = parseFragment(iso_8859_1, htmlErrors); |
| |
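        // Likewise check for a charset declared in the HTML, e.g. via a <meta charset> element.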
| Charset htmlCharset = EncodingUtils.htmlCharset(stats, doc); |
| |
| if (htmlCharset != null) { |
| return htmlCharset; |
| } |
| |
| if (stats.countEightBit() == 0) { |
| // All characters are plain ASCII, so it doesn't matter what we choose. |
| return UTF_8; |
| } |
| |
        // HTML & XML tag-stripping is vital for accurate n-gram detection, so use jsoup instead of icu4j's
        // "quick and dirty, not 100% accurate" tag-stripping implementation for more accurate results.
        // Cf. https://issues.apache.org/jira/browse/TIKA-2038
| long openTags = countTags(doc); |
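        // jsoup quotes the offending character in its parse error messages, so errors mentioning
        // '<', '/', or '>' indicate malformed tags.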
| long badTags = htmlErrors.stream().map(ParseError::getErrorMessage) |
| .filter(err -> err != null && err.matches(".*'[</>]'.*")).count(); |
| |
        // Condition for filtering input adapted from icu4j's CharsetDetector#MungeInput(): only strip
        // tags when the input looks like genuine markup, i.e. at least 5 open tags and no more than
        // one bad tag per five open tags.
        boolean filterInput = true;
        if (openTags < 5 || openTags / 5 < badTags) {
            filterInput = false;
        } else {
            String wholeText = wholeText(doc);
            if (wholeText.length() < 100 && iso_8859_1.length() > 600) {
                // Tag-stripping left almost nothing of a sizable input; fall back to the unstripped text.
                filterInput = false;
            } else {
                iso_8859_1 = wholeText;
            }
        }
| byte[] text = iso_8859_1.getBytes(ISO_8859_1); |
| |
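        // The constructor argument is assumed to size the detection buffer so the whole filtered
        // text is examined, rather than icu4j's default-sized prefix.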
| CharsetDetector icu4j = new CharsetDetector(text.length); |
| icu4j.setText(text); |
| |
| for (CharsetMatch match : icu4j.detectAll()) { |
| try { |
| Charset charset = EncodingUtils.forName(match.getName()); |
| |
                // If we successfully filtered the input based on the ASCII bytes 0x3C and 0x3E, then the match
                // must be an ASCII-compatible charset.
| // See https://issues.apache.org/jira/browse/TIKA-2771 |
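                // (e.g. this rejects UTF-16 and EBCDIC matches, which do not encode "< />" as those bytes)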
| if (filterInput && !TAG_CHARS.equals(new String(TAG_BYTES, charset))) { |
| continue; |
| } |
| |
| charset = EncodingUtils.correctVariant(stats, charset); |
| if (charset != null) { |
| return charset; |
| } |
| } catch (Exception e) { |
                // Ignore: if this charset isn't supported by this platform, it's probably not correct anyway.
| } |
| } |
| |
        // No byte sequence is invalid in ISO-8859-1, so it always remains possible when no other candidate is
        // left. It is also the second most popular charset on the web, behind UTF-8.
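        // (correctVariant is assumed to return a closely related variant, e.g. windows-1252, when the
        // byte statistics favor it)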
| return EncodingUtils.correctVariant(stats, ISO_8859_1); |
| } |
| |
| @Override |
| public String guessEncoding(InputStream is, String contentType) throws IOException { |
| Charset charset = EncodingUtils.contentTypeCharset(contentType); |
| return guessEncoding(is, charset).name(); |
| } |
| |
| //////////////////// |
| // STATIC HELPERS // |
| //////////////////// |
| |
| @FunctionalInterface |
| private interface InputStreamFunction<E> { |
| E compute(InputStream is) throws IOException; |
| } |
| |
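    // Marks the stream, applies the function, then resets the stream so it can be re-read from the start.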
| private static <E> E computeAndReset(InputStream is, InputStreamFunction<E> function) throws IOException { |
| is.mark(Integer.MAX_VALUE); |
| try { |
| return function.compute(is); |
| } finally { |
| is.reset(); |
| } |
| } |
| |
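    // Parses the input as a fragment so parse errors can be tracked, then re-parents the resulting
    // nodes into a fresh Document (parseFragment may leave them attached to an internal shell document).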
| private static Document parseFragment(String html, ParseErrorList errors) { |
| Document doc = new Document(""); |
| Node[] childNodes = Parser.parseFragment(html, null, "", errors).toArray(EMPTY_NODES); |
| for (Node node : childNodes) { |
| if (node.parentNode() != null) { |
| node.remove(); |
| } |
| doc.appendChild(node); |
| } |
| return doc; |
| } |
| |
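    // Counts the start/end tags the parsed tree represents: one for childless nodes, two otherwise.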
| private static long countTags(Node node) { |
| long[] ret = {0}; |
| NodeTraversor.traverse(new NodeVisitor() { |
| @Override |
| public void head(Node node, int depth) { |
                if (node instanceof Document || node instanceof PseudoTextElement) {
                    // Subclasses of Element that don't render start/end tags.
                    return;
                }
                if (node instanceof Element || node instanceof DocumentType || node instanceof Comment) {
                    // Nodes without children count as a single tag; all others as a start and an end tag.
                    ret[0] += node.childNodeSize() == 0 ? 1 : 2;
                }
| } |
| @Override |
| public void tail(Node node, int depth) { |
| } |
| }, node); |
| return ret[0]; |
| } |
| |
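    // Extracts the text a reader would "see": visible text, JSON-LD script data, plain comments, and
    // microdata "content" attribute values; CSS and JavaScript are excluded.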
| private static String wholeText(Node node) { |
| StringBuilder sb = new StringBuilder(); |
| NodeTraversor.traverse(new NodeVisitor() { |
| @Override |
| public void head(Node node, int depth) { |
| if (node instanceof TextNode) { |
| sb.append(((TextNode) node).getWholeText()); |
| } else if (node instanceof DataNode) { |
| String data = ((DataNode) node).getWholeData(); |
                    // Walk up the ancestor chain to find the enclosing element: include JSON-LD
                    // script data in the text stats, but ignore CSS & JavaScript.
                    do {
| if ("script".equalsIgnoreCase(node.nodeName())) { |
| if (node.attr("type").toLowerCase(java.util.Locale.ROOT).contains("json")) { |
| sb.append(data); |
| } |
| break; |
| } else if ("style".equalsIgnoreCase(node.nodeName())) { |
| break; |
| } |
| node = node.parentNode(); |
| } while (node != null); |
| } else if (node instanceof Comment) { |
| String data = ((Comment) node).getData(); |
                    // Avoid comments that are actually processing instructions or XML declarations.
| if (!data.contains("<!") && !data.contains("<?")) { |
| sb.append(data); |
| } |
| } else if (node instanceof Element) { |
                    // Make sure all microdata itemprop "content" values are taken into consideration.
| sb.append(node.attr("content")); |
| } |
| } |
| @Override |
| public void tail(Node node, int depth) { |
| } |
| }, node); |
| return sb.toString(); |
| } |
| |
| } |