tika-parsers/src/main/java/org/apache/tika/parser/txt/UniversalEncodingListener.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.txt;

 import java.nio.charset.Charset;

 import org.apache.tika.detect.TextStatistics;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.utils.CharsetUtils;
 import org.mozilla.universalchardet.CharsetListener;
 import org.mozilla.universalchardet.Constants;
 import org.mozilla.universalchardet.UniversalDetector;

 /**
  * Listener that connects {@link UniversalEncodingDetector} to the
  * <code>juniversalchardet</code> detection logic.
  * <p>
  * Every chunk of input is recorded in a {@link TextStatistics} instance
  * and passed on to a {@link UniversalDetector}. A windows-1252 report is
  * not taken at face value: it is replaced by the metadata charset hint
  * when one exists, or otherwise refined using the collected statistics.
  */
 class UniversalEncodingListener implements CharsetListener {

     private static final String CHARSET_ISO_8859_1 = "ISO-8859-1";

     private static final String CHARSET_ISO_8859_15 = "ISO-8859-15";

     /** Byte statistics of all data seen so far. */
     private final TextStatistics statistics = new TextStatistics();

     /** Underlying detector; it is given this object as its listener. */
     private final UniversalDetector detector = new UniversalDetector(this);

     /** Charset name taken from the metadata, or <code>null</code>. */
     private final String hint;

     /** Last charset successfully resolved by {@link #report(String)}. */
     private Charset charset = null;

     /**
      * Creates a listener and extracts an optional charset hint from the
      * given metadata. The <code>charset</code> parameter of the content
      * type is preferred; the content encoding entry is the fallback.
      *
      * @param metadata metadata of the document being examined
      */
     public UniversalEncodingListener(Metadata metadata) {
         String charsetHint = null;
         MediaType contentType =
                 MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
         if (contentType != null) {
             charsetHint = contentType.getParameters().get("charset");
         }
         if (charsetHint == null) {
             charsetHint = metadata.get(Metadata.CONTENT_ENCODING);
         }
         this.hint = charsetHint;
     }

     /**
      * Records the reported charset. A windows-1252 report is first
      * refined (see {@link #refineWindows1252(String)}). If the resulting
      * name cannot be turned into a {@link Charset}, the previously
      * stored value is left untouched.
      *
      * @param name name of the reported charset
      */
     public void report(String name) {
         String resolved = name;
         if (Constants.CHARSET_WINDOWS_1252.equals(name)) {
             resolved = refineWindows1252(name);
         }
         try {
             this.charset = CharsetUtils.forName(resolved);
         } catch (Exception e) {
             // Best effort only: an unusable name leaves the charset as is
         }
     }

     /**
      * Picks the charset name to use in place of a windows-1252 report.
      *
      * @param detected the name that was reported (windows-1252)
      * @return the hint if present; otherwise ISO-8859-15 or ISO-8859-1
      *         when the data has no carriage returns; otherwise the
      *         reported name unchanged
      */
     private String refineWindows1252(String detected) {
         if (hint != null) {
             // An explicit encoding hint always wins
             return hint;
         }
         if (statistics.count('\r') != 0) {
             // CR(LF) line ends are present, keep windows-1252
             return detected;
         }
         // Without CR(LF)s an ISO-8859 variant is the more likely match.
         // Byte 0xa4 is the general currency sign in ISO-8859-1, where it
         // is hardly ever used, but the much more common euro sign in
         // ISO-8859-15, so its presence tips the choice to the latter.
         if (statistics.count(0xa4) > 0) {
             return CHARSET_ISO_8859_15;
         }
         return CHARSET_ISO_8859_1;
     }

     /**
      * @return whatever the underlying detector reports as its done state
      */
     public boolean isDone() {
         return detector.isDone();
     }

     /**
      * Feeds a chunk of input to the statistics and then to the detector.
      *
      * @param buf    buffer holding the data
      * @param offset index of the first byte to use
      * @param length number of bytes to use
      */
     public void handleData(byte[] buf, int offset, int length) {
         // Statistics are updated before the detector runs; report() reads
         // them, in case the detector calls back while handling the data.
         statistics.addData(buf, offset, length);
         detector.handleData(buf, offset, length);
     }

     /**
      * Signals the end of input to the detector and returns the outcome.
      * When no charset has been stored by then and the statistics say the
      * data is mostly ASCII, a windows-1252 report is simulated so that
      * the usual hint and ISO-8859 refinement is applied.
      *
      * @return the resolved charset, or <code>null</code> if none was set
      */
     public Charset dataEnd() {
         detector.dataEnd();
         if (charset != null) {
             return charset;
         }
         if (statistics.isMostlyAscii()) {
             report(Constants.CHARSET_WINDOWS_1252);
         }
         return charset;
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser.txt;

	import java.nio.charset.Charset;

	import org.apache.tika.detect.TextStatistics;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.mime.MediaType;
	import org.apache.tika.utils.CharsetUtils;
	import org.mozilla.universalchardet.CharsetListener;
	import org.mozilla.universalchardet.Constants;
	import org.mozilla.universalchardet.UniversalDetector;

	/**
	 * Glue between {@link UniversalEncodingDetector} and the
	 * <code>juniversalchardet</code> detection logic.
	 * <p>
	 * Incoming bytes are both counted in a {@link TextStatistics} object and
	 * handed to a {@link UniversalDetector}. When windows-1252 is reported,
	 * the name is swapped for the charset hint found in the metadata or, if
	 * there is no hint, possibly for an ISO-8859 variant chosen from the
	 * byte counts.
	 */
	class UniversalEncodingListener implements CharsetListener {

		private static final String CHARSET_ISO_8859_1 = "ISO-8859-1";

		private static final String CHARSET_ISO_8859_15 = "ISO-8859-15";

		/** Counts of the bytes passed to {@link #handleData}. */
		private final TextStatistics statistics = new TextStatistics();

		/** Detector that is constructed with this object as listener. */
		private final UniversalDetector detector = new UniversalDetector(this);

		/** Charset name found in the metadata, <code>null</code> if absent. */
		private String hint = null;

		/** Most recent charset that could be resolved, initially unset. */
		private Charset charset = null;

		/**
		 * Reads the charset hint from the metadata: first the
		 * <code>charset</code> parameter of the content type, and if that
		 * yields nothing, the content encoding entry.
		 *
		 * @param metadata metadata describing the input
		 */
		public UniversalEncodingListener(Metadata metadata) {
			MediaType mediaType =
					MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
			String fromContentType = (mediaType == null)
					? null
					: mediaType.getParameters().get("charset");
			hint = (fromContentType != null)
					? fromContentType
					: metadata.get(Metadata.CONTENT_ENCODING);
		}

		/**
		 * Stores the charset for the given name. Names other than
		 * windows-1252 are used as they are; windows-1252 is replaced by
		 * the hint, or by ISO-8859-15 / ISO-8859-1 when the data contains
		 * no carriage returns. A name that cannot be resolved is ignored
		 * and the stored charset stays as it was.
		 *
		 * @param name reported charset name
		 */
		public void report(String name) {
			String effective;
			if (!Constants.CHARSET_WINDOWS_1252.equals(name)) {
				effective = name;
			} else if (hint != null) {
				// The encoding hint takes precedence when available
				effective = hint;
			} else if (statistics.count('\r') != 0) {
				// CR(LF)s were seen, so windows-1252 is kept
				effective = name;
			} else if (statistics.count(0xa4) > 0) { // currency/euro sign
				// No CR(LF)s makes ISO-8859-1(5) more likely than
				// windows-1252. The general currency sign is hardly ever
				// used in ISO-8859-1, whereas in ISO-8859-15 the same byte
				// is the far more common euro symbol.
				effective = CHARSET_ISO_8859_15;
			} else {
				effective = CHARSET_ISO_8859_1;
			}

			try {
				this.charset = CharsetUtils.forName(effective);
			} catch (Exception e) {
				// deliberately ignored; the stored charset is not changed
			}
		}

		/**
		 * @return the done state as reported by the wrapped detector
		 */
		public boolean isDone() {
			return detector.isDone();
		}

		/**
		 * Adds the given bytes to the statistics and then passes them to
		 * the detector.
		 *
		 * @param buf    array containing the bytes
		 * @param offset position of the first byte
		 * @param length how many bytes to process
		 */
		public void handleData(byte[] buf, int offset, int length) {
			// Keep this order: report() consults the statistics, and the
			// detector may presumably invoke it while handling the data.
			statistics.addData(buf, offset, length);
			detector.handleData(buf, offset, length);
		}

		/**
		 * Tells the detector that the input has ended. If no charset is
		 * stored afterwards and the statistics classify the data as mostly
		 * ASCII, windows-1252 is reported so that the hint / ISO-8859
		 * selection in {@link #report(String)} decides the result.
		 *
		 * @return the stored charset, or <code>null</code> when none was set
		 */
		public Charset dataEnd() {
			detector.dataEnd();
			boolean unresolved = (charset == null);
			if (unresolved && statistics.isMostlyAscii()) {
				report(Constants.CHARSET_WINDOWS_1252);
			}
			return charset;
		}

	}