| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.util; |
| |
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;

import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.storage.WebPage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
| |
| /** |
| * A simple class for detecting character encodings. |
| * |
| * <p> |
| * Broadly this encompasses two functions, which are distinctly separate: |
| * |
| * <ol> |
| * <li>Auto detecting a set of "clues" from input text.</li> |
| * <li>Taking a set of clues and making a "best guess" as to the |
| * "real" encoding.</li> |
| * </ol> |
| * </p> |
| * |
| * <p> |
| * A caller will often have some extra information about what the |
| * encoding might be (e.g. from the HTTP header or HTML meta-tags, often |
| * wrong but still potentially useful clues). The types of clues may differ |
| * from caller to caller. Thus a typical calling sequence is: |
| * <ul> |
| * <li>Run step (1) to generate a set of auto-detected clues;</li> |
| * <li>Combine these clues with the caller-dependent "extra clues" |
| * available;</li> |
| * <li>Run step (2) to guess what the most probable answer is.</li> |
 * </ul>
 * </p>
| */ |
| public class EncodingDetector { |
| |
| public final static Utf8 CONTENT_TYPE_UTF8 = new Utf8(Response.CONTENT_TYPE); |
| |
| private class EncodingClue { |
| private final String value; |
| private final String source; |
| private final int confidence; |
| |
| // Constructor for clues with no confidence values (ignore thresholds) |
| public EncodingClue(String value, String source) { |
| this(value, source, NO_THRESHOLD); |
| } |
| |
| public EncodingClue(String value, String source, int confidence) { |
| this.value = value.toLowerCase(); |
| this.source = source; |
| this.confidence = confidence; |
| } |
| |
| @SuppressWarnings("unused") |
| public String getSource() { |
| return source; |
| } |
| |
| @SuppressWarnings("unused") |
| public String getValue() { |
| return value; |
| } |
| |
| @Override |
| public String toString() { |
| return value + " (" + source + |
| ((confidence >= 0) ? ", " + confidence + "% confidence" : "") + ")"; |
| } |
| |
| public boolean isEmpty() { |
| return (value==null || "".equals(value)); |
| } |
| |
| public boolean meetsThreshold() { |
| return (confidence < 0 || |
| (minConfidence >= 0 && confidence >= minConfidence)); |
| } |
| } |
| |
| public static final Logger LOG = LoggerFactory.getLogger(EncodingDetector.class); |
| |
| public static final int NO_THRESHOLD = -1; |
| |
| public static final String MIN_CONFIDENCE_KEY = |
| "encodingdetector.charset.min.confidence"; |
| |
| private static final HashMap<String, String> ALIASES = |
| new HashMap<String, String>(); |
| |
| private static final HashSet<String> DETECTABLES = new HashSet<String>(); |
| |
| // CharsetDetector will die without a minimum amount of data. |
| private static final int MIN_LENGTH=4; |
| |
| static { |
| DETECTABLES.add("text/html"); |
| DETECTABLES.add("text/plain"); |
| DETECTABLES.add("text/richtext"); |
| DETECTABLES.add("text/rtf"); |
| DETECTABLES.add("text/sgml"); |
| DETECTABLES.add("text/tab-separated-values"); |
| DETECTABLES.add("text/xml"); |
| DETECTABLES.add("application/rss+xml"); |
| DETECTABLES.add("application/xhtml+xml"); |
| /* |
| * the following map is not an alias mapping table, but |
| * maps character encodings which are often used in mislabelled |
| * documents to their correct encodings. For instance, |
| * there are a lot of documents labelled 'ISO-8859-1' which contain |
| * characters not covered by ISO-8859-1 but covered by windows-1252. |
| * Because windows-1252 is a superset of ISO-8859-1 (sharing code points |
| * for the common part), it's better to treat ISO-8859-1 as |
| * synonymous with windows-1252 than to reject, as invalid, documents |
| * labelled as ISO-8859-1 that have characters outside ISO-8859-1. |
| */ |
| ALIASES.put("ISO-8859-1", "windows-1252"); |
| ALIASES.put("EUC-KR", "x-windows-949"); |
| ALIASES.put("x-EUC-CN", "GB18030"); |
| ALIASES.put("GBK", "GB18030"); |
| //ALIASES.put("Big5", "Big5HKSCS"); |
| //ALIASES.put("TIS620", "Cp874"); |
| //ALIASES.put("ISO-8859-11", "Cp874"); |
| |
| } |
| |
| private final int minConfidence; |
| |
| private final CharsetDetector detector; |
| |
| private final List<EncodingClue> clues; |
| |
| public EncodingDetector(Configuration conf) { |
| minConfidence = conf.getInt(MIN_CONFIDENCE_KEY, -1); |
| detector = new CharsetDetector(); |
| clues = new ArrayList<EncodingClue>(); |
| } |
| |
| public void autoDetectClues(WebPage page, boolean filter) { |
| autoDetectClues(page.getContent(), page.getContentType(), |
| parseCharacterEncoding(page.getFromHeaders(CONTENT_TYPE_UTF8)), filter); |
| } |
| |
| private void autoDetectClues(ByteBuffer dataBuffer, Utf8 typeUtf8, |
| String encoding, boolean filter) { |
| byte[] data = dataBuffer.array(); |
| String type = TableUtil.toString(typeUtf8); |
| |
| if (minConfidence >= 0 && DETECTABLES.contains(type) |
| && data.length > MIN_LENGTH) { |
| CharsetMatch[] matches = null; |
| |
| // do all these in a try/catch; setText and detect/detectAll |
| // will sometimes throw exceptions |
| try { |
| detector.enableInputFilter(filter); |
| if (data.length > MIN_LENGTH) { |
| detector.setText(data); |
| matches = detector.detectAll(); |
| } |
| } catch (Exception e) { |
| LOG.debug("Exception from ICU4J (ignoring): ", e); |
| } |
| |
| if (matches != null) { |
| for (CharsetMatch match : matches) { |
| addClue(match.getName(), "detect", match.getConfidence()); |
| } |
| } |
| } |
| |
| // add character encoding coming from HTTP response header |
| addClue(encoding, "header"); |
| } |
| |
| public void addClue(String value, String source, int confidence) { |
| if (value == null || "".equals(value)) { |
| return; |
| } |
| value = resolveEncodingAlias(value); |
| if (value != null) { |
| clues.add(new EncodingClue(value, source, confidence)); |
| } |
| } |
| |
| public void addClue(String value, String source) { |
| addClue(value, source, NO_THRESHOLD); |
| } |
| |
| /** |
| * Guess the encoding with the previously specified list of clues. |
| * |
| * @param row URL's row |
| * @param defaultValue Default encoding to return if no encoding can be |
| * detected with enough confidence. Note that this will <b>not</b> be |
| * normalized with {@link EncodingDetector#resolveEncodingAlias} |
| * |
| * @return Guessed encoding or defaultValue |
| */ |
| public String guessEncoding(WebPage page, String defaultValue) { |
| Utf8 baseUrlUtf8 = page.getBaseUrl(); |
| String baseUrl = TableUtil.toString(baseUrlUtf8); |
| return guessEncoding(baseUrl, defaultValue); |
| } |
| |
| /** |
| * Guess the encoding with the previously specified list of clues. |
| * |
| * @param baseUrl Base URL |
| * @param defaultValue Default encoding to return if no encoding can be |
| * detected with enough confidence. Note that this will <b>not</b> be |
| * normalized with {@link EncodingDetector#resolveEncodingAlias} |
| * |
| * @return Guessed encoding or defaultValue |
| */ |
| private String guessEncoding(String baseUrl, String defaultValue) { |
| /* |
| * This algorithm could be replaced by something more sophisticated; |
| * ideally we would gather a bunch of data on where various clues |
| * (autodetect, HTTP headers, HTML meta tags, etc.) disagree, tag each with |
| * the correct answer, and use machine learning/some statistical method |
| * to generate a better heuristic. |
| */ |
| |
| |
| if (LOG.isTraceEnabled()) { |
| findDisagreements(baseUrl, clues); |
| } |
| |
| /* |
| * Go down the list of encoding "clues". Use a clue if: |
| * 1. Has a confidence value which meets our confidence threshold, OR |
| * 2. Doesn't meet the threshold, but is the best try, |
| * since nothing else is available. |
| */ |
| EncodingClue defaultClue = new EncodingClue(defaultValue, "default"); |
| EncodingClue bestClue = defaultClue; |
| |
| for (EncodingClue clue : clues) { |
| if (LOG.isTraceEnabled()) { |
| LOG.trace(baseUrl + ": charset " + clue); |
| } |
| String charset = clue.value; |
| if (minConfidence >= 0 && clue.confidence >= minConfidence) { |
| if (LOG.isTraceEnabled()) { |
| LOG.trace(baseUrl + ": Choosing encoding: " + charset + |
| " with confidence " + clue.confidence); |
| } |
| return resolveEncodingAlias(charset).toLowerCase(); |
| } else if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) { |
| bestClue = clue; |
| } |
| } |
| |
| if (LOG.isTraceEnabled()) { |
| LOG.trace(baseUrl + ": Choosing encoding: " + bestClue); |
| } |
| return bestClue.value.toLowerCase(); |
| } |
| |
| /** Clears all clues. */ |
| public void clearClues() { |
| clues.clear(); |
| } |
| |
| /* |
| * Strictly for analysis, look for "disagreements." The top guess from |
| * each source is examined; if these meet the threshold and disagree, then |
| * we log the information -- useful for testing or generating training data |
| * for a better heuristic. |
| */ |
| private void findDisagreements(String url, List<EncodingClue> newClues) { |
| HashSet<String> valsSeen = new HashSet<String>(); |
| HashSet<String> sourcesSeen = new HashSet<String>(); |
| boolean disagreement = false; |
| for (int i = 0; i < newClues.size(); i++) { |
| EncodingClue clue = newClues.get(i); |
| if (!clue.isEmpty() && !sourcesSeen.contains(clue.source)) { |
| if (valsSeen.size() > 0 && !valsSeen.contains(clue.value) |
| && clue.meetsThreshold()) { |
| disagreement = true; |
| } |
| if (clue.meetsThreshold()) { |
| valsSeen.add(clue.value); |
| } |
| sourcesSeen.add(clue.source); |
| } |
| } |
| if (disagreement) { |
| // dump all values in case of disagreement |
| StringBuffer sb = new StringBuffer(); |
| sb.append("Disagreement: "+url+"; "); |
| for (int i = 0; i < newClues.size(); i++) { |
| if (i>0) { |
| sb.append(", "); |
| } |
| sb.append(newClues.get(i)); |
| } |
| LOG.trace(sb.toString()); |
| } |
| } |
| |
| public static String resolveEncodingAlias(String encoding) { |
| try { |
| if (encoding == null || !Charset.isSupported(encoding)) |
| return null; |
| String canonicalName = new String(Charset.forName(encoding).name()); |
| return ALIASES.containsKey(canonicalName) ? ALIASES.get(canonicalName) |
| : canonicalName; |
| } catch (Exception e) { |
| LOG.warn("Invalid encoding " + encoding + " detected, using default."); |
| return null; |
| } |
| } |
| |
| /** |
| * Parse the character encoding from the specified content type header. |
| * If the content type is null, or there is no explicit character encoding, |
| * <code>null</code> is returned. |
| * <br /> |
| * This method was copied from org.apache.catalina.util.RequestUtil, |
| * which is licensed under the Apache License, Version 2.0 (the "License"). |
| * |
| * @param contentType a content type header |
| */ |
| public static String parseCharacterEncoding(Utf8 contentTypeUtf8) { |
| if (contentTypeUtf8 == null) |
| return (null); |
| String contentType = contentTypeUtf8.toString(); |
| int start = contentType.indexOf("charset="); |
| if (start < 0) |
| return (null); |
| String encoding = contentType.substring(start + 8); |
| int end = encoding.indexOf(';'); |
| if (end >= 0) |
| encoding = encoding.substring(0, end); |
| encoding = encoding.trim(); |
| if ((encoding.length() > 2) && (encoding.startsWith("\"")) |
| && (encoding.endsWith("\""))) |
| encoding = encoding.substring(1, encoding.length() - 1); |
| return (encoding.trim()); |
| |
| } |
| |
| /*public static void main(String[] args) throws IOException { |
| if (args.length != 1) { |
| System.err.println("Usage: EncodingDetector <file>"); |
| System.exit(1); |
| } |
| |
| Configuration conf = NutchConfiguration.create(); |
| EncodingDetector detector = |
| new EncodingDetector(NutchConfiguration.create()); |
| |
| // do everything as bytes; don't want any conversion |
| BufferedInputStream istr = |
| new BufferedInputStream(new FileInputStream(args[0])); |
| ByteArrayOutputStream ostr = new ByteArrayOutputStream(); |
| byte[] bytes = new byte[1000]; |
| boolean more = true; |
| while (more) { |
| int len = istr.read(bytes); |
| if (len < bytes.length) { |
| more = false; |
| if (len > 0) { |
| ostr.write(bytes, 0, len); |
| } |
| } else { |
| ostr.write(bytes); |
| } |
| } |
| |
| byte[] data = ostr.toByteArray(); |
| MimeUtil mimeTypes = new MimeUtil(conf); |
| |
| // make a fake Content |
| Content content = |
| new Content("", "", data, "text/html", new Metadata(), mimeTypes); |
| |
| detector.autoDetectClues(content, true); |
| String encoding = detector.guessEncoding(content, |
| conf.get("parser.character.encoding.default")); |
| System.out.println("Guessed encoding: " + encoding); |
| }*/ |
| |
| } |