// blob: 5e215a99f7ebd6a8e5e6fab67a3bc5d2f10e13a3 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.txt;
import java.nio.charset.Charset;
import org.apache.tika.detect.TextStatistics;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.utils.CharsetUtils;
import org.mozilla.universalchardet.CharsetListener;
import org.mozilla.universalchardet.Constants;
import org.mozilla.universalchardet.UniversalDetector;
/**
 * Helper class used by {@link UniversalEncodingDetector} to access the
 * <code>juniversalchardet</code> detection logic.
 * <p>
 * Every chunk of input is handed both to a {@link UniversalDetector} and to
 * a {@link TextStatistics} instance. When windows-1252 is reported (by the
 * detector or by the fallback in {@link #dataEnd()}), the result is refined
 * using the charset hint found in the metadata or, failing that, the
 * collected byte statistics.
 */
class UniversalEncodingListener implements CharsetListener {

    private static final String CHARSET_ISO_8859_1 = "ISO-8859-1";

    private static final String CHARSET_ISO_8859_15 = "ISO-8859-15";

    /** Byte statistics of all data seen so far. */
    private final TextStatistics statistics = new TextStatistics();

    /** Underlying detector; it calls back into {@link #report(String)}. */
    private final UniversalDetector detector = new UniversalDetector(this);

    /** Charset name suggested by the metadata, or null if none was given. */
    private String hint = null;

    /** Charset selected by the latest successful report, or null. */
    private Charset charset = null;

    /**
     * Creates a listener and extracts the charset hint from the metadata.
     * The <code>charset</code> parameter of the content type is preferred;
     * if it is missing, the content encoding entry is used instead.
     *
     * @param metadata document metadata to read the hint from
     */
    public UniversalEncodingListener(Metadata metadata) {
        MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
        if (type != null) {
            hint = type.getParameters().get("charset");
        }
        if (hint == null) {
            hint = metadata.get(Metadata.CONTENT_ENCODING);
        }
    }

    /**
     * Receives a detected charset name. A windows-1252 report is refined
     * first: a usable metadata hint wins, otherwise the byte statistics
     * decide between windows-1252, ISO-8859-1 and ISO-8859-15. Names that
     * cannot be resolved leave the previously selected charset untouched.
     *
     * @param name charset name reported by the detector
     */
    public void report(String name) {
        if (Constants.CHARSET_WINDOWS_1252.equals(name)) {
            Charset hinted = resolveHint();
            if (hinted != null) {
                // Use the encoding hint when available
                this.charset = hinted;
                return;
            }
            // No hint, or a hint that is not a resolvable charset name:
            // don't throw the detection away, refine it statistically.
            name = refineWindows1252();
        }
        try {
            this.charset = CharsetUtils.forName(name);
        } catch (Exception ignored) {
            // Unresolvable name: keep whatever was selected before
        }
    }

    /**
     * Resolves the metadata hint to a charset.
     *
     * @return the hinted charset, or null if there is no hint or the hint
     *         cannot be resolved
     */
    private Charset resolveHint() {
        if (hint == null) {
            return null;
        }
        try {
            return CharsetUtils.forName(hint);
        } catch (Exception ignored) {
            // The hint is not a usable charset name
            return null;
        }
    }

    /**
     * Picks the most plausible charset name for data that was reported as
     * windows-1252, based on the collected byte statistics.
     *
     * @return windows-1252, ISO-8859-1 or ISO-8859-15
     */
    private String refineWindows1252() {
        if (statistics.count('\r') == 0) {
            // If there are no CR(LF)s, then the encoding is more
            // likely to be ISO-8859-1(5) than windows-1252
            if (statistics.count(0xa4) > 0) { // currency/euro sign
                // The general currency sign is hardly ever used in
                // ISO-8859-1, so it's more likely that we're dealing
                // with ISO-8859-15, where the character is used for
                // the euro symbol, which is more commonly used.
                return CHARSET_ISO_8859_15;
            }
            return CHARSET_ISO_8859_1;
        }
        return Constants.CHARSET_WINDOWS_1252;
    }

    /**
     * @return whether the underlying detector considers itself done
     */
    public boolean isDone() {
        return detector.isDone();
    }

    /**
     * Feeds a chunk of input to both the statistics and the detector.
     *
     * @param buf    buffer holding the data
     * @param offset index of the first byte to use
     * @param length number of bytes to use
     */
    public void handleData(byte[] buf, int offset, int length) {
        statistics.addData(buf, offset, length);
        detector.handleData(buf, offset, length);
    }

    /**
     * Signals the end of input to the detector and returns the result. If
     * nothing has been selected but the statistics say the data is mostly
     * ASCII, windows-1252 is reported so that the usual refinement in
     * {@link #report(String)} applies.
     *
     * @return the selected charset, or null if none could be determined
     */
    public Charset dataEnd() {
        detector.dataEnd();
        if (charset == null && statistics.isMostlyAscii()) {
            report(Constants.CHARSET_WINDOWS_1252);
        }
        return charset;
    }
}