| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. The ASF licenses this file to You |
| * under the Apache License, Version 2.0 (the "License"); you may not |
| * use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. For additional information regarding |
| * copyright in this work, please see the NOTICE file in the top level |
| * directory of this distribution. |
| */ |
| package org.apache.abdera2.common.io; |
| |
| import java.io.FilterInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| |
| /** |
| * Will attempt to autodetect the character encoding from the stream By default, this will preserve the BOM if it exists |
| */ |
| public class CharsetSniffingInputStream extends FilterInputStream { |
| |
| public static enum Encoding { |
| UTF32be("UTF-32", true, new byte[] {0x00, 0x00, 0xFFFFFFFE, 0xFFFFFFFF}), UTF32le("UTF-32", true, |
| new byte[] {0xFFFFFFFF, 0xFFFFFFFE, 0x00, 0x00}), INVALID(null, true, new byte[] {0xFFFFFFFE, 0xFFFFFFFF, |
| 0x00, 0x00}, |
| new byte[] {0x00, 0x00, 0xFFFFFFFF, 0xFFFFFFFE}), UTF16be("UTF-16", true, new byte[] {0xFFFFFFFE, |
| 0xFFFFFFFF}), UTF16le( |
| "UTF-16", true, new byte[] {0xFFFFFFFF, 0xFFFFFFFE}), UTF8("UTF-8", true, new byte[] {0xFFFFFFEF, |
| 0xFFFFFFBB, |
| 0xFFFFFFBF}), UTF32be2( |
| "UTF-32be", false, new byte[] {0x00, 0x00, 0x00, 0x3C}), UTF32le2("UTF-32le", false, |
| new byte[] {0x3C, 0x00, 0x00, 0x00}), UTF16be2("UTF-16be", false, new byte[] {0x00, 0x3C, 0x00, 0x3F}), UTF16le2( |
| "UTF-16le", false, new byte[] {0x3C, 0x00, 0x3F, 0x00}); |
| |
| private final String enc; |
| private final byte[][] checks; |
| private final boolean bom; |
| |
| Encoding(String name, boolean bom, byte[]... checks) { |
| this.enc = name; |
| this.checks = checks; |
| this.bom = bom; |
| } |
| |
| public String getEncoding() { |
| return enc; |
| } |
| |
| public boolean getBom() { |
| return bom; |
| } |
| |
| public int equals(byte[] bom) { |
| for (byte[] check : checks) { |
| if (CharsetSniffingInputStream.equals(bom, check.length, check)) |
| return check.length; |
| } |
| return 0; |
| } |
| } |
| |
| protected String encoding; |
| protected boolean bomset = false; |
| protected final boolean preserve; |
| |
| public CharsetSniffingInputStream(InputStream in) { |
| this(in, true); |
| } |
| |
| public CharsetSniffingInputStream(InputStream in, boolean preserveBom) { |
| super(!(in instanceof PeekAheadInputStream) ? new PeekAheadInputStream(in, 4) : in); |
| this.preserve = preserveBom; |
| try { |
| encoding = detectEncoding(); |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| |
| public boolean isBomSet() { |
| return bomset; |
| } |
| |
| public String getEncoding() { |
| return encoding; |
| } |
| |
| protected PeekAheadInputStream getInternal() { |
| return (PeekAheadInputStream)in; |
| } |
| |
| private static boolean equals(byte[] a1, int len, byte[] a2) { |
| for (int n = 0, i = 0; n < len; n++, i++) { |
| if (a1[n] != a2[i]) |
| return false; |
| } |
| return true; |
| } |
| |
| protected String detectEncoding() throws IOException { |
| PeekAheadInputStream pin = (PeekAheadInputStream)this.in; |
| byte[] bom = new byte[4]; |
| pin.peek(bom); |
| bomset = false; |
| for (Encoding enc : Encoding.values()) { |
| int bomlen = enc.equals(bom); |
| if (bomlen > 0) { |
| bomset = enc.getBom(); |
| if (bomset && !preserve) // consume the bom |
| pin.read(new byte[bomlen]); |
| return enc.getEncoding(); |
| } |
| } |
| return null; |
| } |
| |
| } |