blob: b9d12ff5c09e19644f856113e78097c23a64975d [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.utils;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@SuppressWarnings("serial")
public class CharsetUtils {
private static final Pattern CHARSET_NAME_PATTERN = Pattern.compile("[ \\\"]*([^ >,;\\\"]+).*");
private static final Pattern ISO_NAME_PATTERN = Pattern.compile("(?i).*8859-([\\d]+)");
private static final Pattern CP_NAME_PATTERN = Pattern.compile("(?i)cp-([\\d]+)");
private static final Pattern WIN_NAME_PATTERN = Pattern.compile("(?i)win(|-)([\\d]+)");
// List of common invalid charset names that we can't fix using
// pattern matching + heuristic
private static final Map<String, String> CHARSET_ALIASES = new HashMap<String, String>() {{
put("none", null);
put("no", null);
put("iso-8851-1", "iso-8859-1");
put("windows", "windows-1252");
put("koi8r", "KOI8-R");
}};
/**
* Safely return whether <charsetName> is supported, without throwing exceptions
*
* @param charsetName Name of charset (can be null)
* @return true if the character set is supported
*/
public static boolean isSupported(String charsetName) {
try {
return Charset.isSupported(charsetName);
} catch (IllegalCharsetNameException e) {
return false;
} catch (IllegalArgumentException e) {
// null, for example
return false;
} catch (Exception e) {
// Unexpected exception, what to do?
return false;
}
}
/**
* Handle various common charset name errors, and return something
* that will be considered valid (and is normalized)
*
* @param charsetName name of charset to process
* @return potentially remapped/cleaned up version of charset name
*/
public static String clean(String charsetName) {
if (charsetName == null) {
return null;
}
// Get rid of cruft around names, like <>, trailing commas, etc.
Matcher m = CHARSET_NAME_PATTERN.matcher(charsetName);
if (!m.matches()) {
return null;
}
String result = m.group(1);
if (CHARSET_ALIASES.containsKey(result.toLowerCase())) {
// Handle common erroneous charset names.
result = CHARSET_ALIASES.get(result.toLowerCase());
} else if (ISO_NAME_PATTERN.matcher(result).matches()) {
// Handle "iso 8859-x" error
m = ISO_NAME_PATTERN.matcher(result);
m.matches();
result = "iso-8859-" + m.group(1);
} else if (CP_NAME_PATTERN.matcher(result).matches()) {
// Handle "cp-xxx" error
m = CP_NAME_PATTERN.matcher(result);
m.matches();
result = "cp" + m.group(1);
} else if (WIN_NAME_PATTERN.matcher(result).matches()) {
// Handle "winxxx" and "win-xxx" errors
m = WIN_NAME_PATTERN.matcher(result);
m.matches();
result = "windows-" + m.group(2);
}
try {
Charset cs = Charset.forName(result);
return cs.name();
} catch (Exception e) {
return null;
}
}
}