| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.stanbol.enhancer.engines.metaxa.core.html; |
| |
| import java.io.BufferedInputStream; |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import com.ibm.icu.text.CharsetDetector; |
| import com.ibm.icu.text.CharsetMatch; |
| |
| /** |
| * EncodingDetector.java |
| * |
| * @author <a href="mailto:kasper@dfki.de">Walter Kasper</a> |
| * |
| */ |
| public class CharsetRecognizer { |
| |
| /** |
| * This contains the logger. |
| */ |
| private static final Logger LOG = |
| LoggerFactory.getLogger(CharsetRecognizer.class); |
| |
| |
| private static String checkPattern(String str, String pattern, int group) { |
| |
| Pattern pat = Pattern.compile(pattern); |
| Matcher m = pat.matcher(str); |
| if (m.find()) { |
| return m.group(group); |
| } |
| return null; |
| } |
| |
| |
| private static String checkFormat(String format, InputStream in) |
| throws IOException { |
| |
| String result = null; |
| String defaultValue = null; |
| byte[] bytes; |
| String decl; |
| in.mark(4096); |
| int read; |
| if (format.equalsIgnoreCase("xml")) { |
| defaultValue = "UTF-8"; |
| bytes = new byte[80]; |
| read = in.read(bytes); |
| in.reset(); |
| decl = new String(bytes, 0, read, "US-ASCII"); |
| result = checkPattern(decl, "encoding=\"(\\w[-\\w]+)\"", 1); |
| } |
| else if (format.equalsIgnoreCase("html")) { |
| bytes = new byte[2048]; |
| read = in.read(bytes); |
| in.reset(); |
| decl = new String(bytes, 0, read, "US-ASCII"); |
| result = |
| checkPattern(decl, |
| "<meta .*?content=\".*charset=(\\w[-\\w]+).*?/>", 1); |
| } |
| if (result != null) { |
| result = result.toUpperCase(); |
| LOG.debug(format.toUpperCase() + " encoding: " + result); |
| } |
| else { |
| return defaultValue; |
| } |
| return result; |
| } |
| |
| |
| public static String detect(InputStream in) |
| throws IOException { |
| |
| return detect(in, null, null); |
| } |
| |
| |
| public static String detect(InputStream in, String format, String encoding) |
| throws IOException { |
| |
| // the input stream must support marks |
| if (!in.markSupported()) { |
| throw new IOException("Mark not supported by input stream"); |
| } |
| String result = null; |
| if (format != null) { |
| result = checkFormat(format, in); |
| if (result != null) { |
| return result; |
| } |
| } |
| // in case of HTML or XML check whether there is a charset |
| // specification; might be too fragile |
| CharsetDetector detector = new CharsetDetector(); |
| if (encoding != null) { |
| detector.setDeclaredEncoding(encoding); |
| } |
| detector.setText(in); |
| CharsetMatch found = detector.detect(); |
| result = found.getName(); |
| LOG.debug("Encoding: " + result); |
| return result; |
| } |
| |
| public static void main(String[] args) { |
| |
| String format = null; |
| String encoding = null; |
| int argv = 0; |
| while (argv < args.length && args[argv].startsWith("-")) { |
| String option = args[argv].substring(1); |
| if (option.startsWith("f")) { |
| format = args[++argv]; |
| } |
| else if (option.startsWith("e")) { |
| encoding = args[++argv]; |
| } |
| else { |
| System.err.println("illegal option: " + option); |
| System.exit(1); |
| } |
| ++argv; |
| } |
| for (int i = argv; i < args.length; ++i) { |
| try { |
| BufferedInputStream fstream = |
| new BufferedInputStream(new FileInputStream(args[i])); |
| String found = |
| CharsetRecognizer.detect(fstream, format, encoding); |
| System.out.println("Encoding: " + found + ": " + args[i]); |
| /* |
| * check whether the stream is reset correctly byte[] bytes = |
| * new byte[50]; int read = fstream.read(bytes); |
| * System.out.println(new String(bytes)); |
| */ |
| fstream.close(); |
| } catch (IOException e) { |
| LOG.error(e.getMessage()); |
| } |
| } |
| } |
| |
| } |