blob: ceff1a9229d7e092590afc339e5ded76fa6e5d30 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.johnzon.core;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PushbackInputStream;
import java.nio.charset.Charset;
import javax.json.JsonException;
/**
 * {@link InputStreamReader} that selects the charset of a JSON byte stream by inspecting
 * its first (up to) four bytes before any character is decoded.
 *
 * <p>Detection follows RFC 4627, section 3: JSON text is encoded in Unicode, UTF-8 by
 * default, and because the first two characters of a JSON text are always ASCII the
 * encoding can be derived from the pattern of null bytes in the first four octets:</p>
 *
 * <pre>
 * 00 00 00 xx  UTF-32BE
 * 00 xx 00 xx  UTF-16BE
 * xx 00 00 00  UTF-32LE
 * xx 00 xx 00  UTF-16LE
 * xx xx xx xx  UTF-8
 * </pre>
 *
 * <p>In addition a leading byte order mark is recognized and removed from the stream:</p>
 *
 * <pre>
 * UTF-8        EF BB BF
 * UTF-16 (BE)  FE FF
 * UTF-16 (LE)  FF FE
 * UTF-32 (BE)  00 00 FE FF
 * UTF-32 (LE)  FF FE 00 00
 * </pre>
 */
final class RFC4627AwareInputStreamReader extends InputStreamReader {

    /** Number of leading bytes inspected; also the capacity of the pushback buffer. */
    private static final int PREFIX_LENGTH = 4;

    /**
     * Creates a reader whose charset is detected from the beginning of the stream.
     *
     * @param in the raw JSON byte stream
     * @throws JsonException if fewer than two bytes are available or the stream cannot be read
     */
    RFC4627AwareInputStreamReader(final InputStream in) {
        this(new PushbackInputStream(in, PREFIX_LENGTH));
    }

    private RFC4627AwareInputStreamReader(final PushbackInputStream in) {
        // getCharset() consumes the prefix and pushes the non-BOM bytes back before decoding starts.
        super(in, getCharset(in).newDecoder());
    }

    /**
     * Detects the charset from the first bytes of the stream and pushes every inspected
     * byte that is not part of a byte order mark back onto the stream.
     *
     * @param inputStream stream with a pushback capacity of at least {@link #PREFIX_LENGTH} bytes
     * @return the detected charset, UTF-8 when no other pattern matches
     * @throws JsonException if fewer than two bytes are available or an I/O error occurs
     */
    private static Charset getCharset(final PushbackInputStream inputStream) {
        Charset charset = Charset.forName("UTF-8");
        final byte[] utfBytes = new byte[PREFIX_LENGTH];
        int bomLength = 0;
        try {
            final int read = readPrefix(inputStream, utfBytes);
            if (read < 2) {
                throw new JsonException("Invalid Json. Valid Json has at least 2 bytes");
            }

            final int first = utfBytes[0] & 0xFF;
            final int second = utfBytes[1] & 0xFF;
            // -1 marks "byte not available" and can never equal a real unsigned byte value.
            final int third = (read > 2) ? (utfBytes[2] & 0xFF) : -1;
            final int fourth = (read > 3) ? (utfBytes[3] & 0xFF) : -1;

            if (first == 0x00) {
                if (second == 0x00) {
                    charset = Charset.forName("UTF-32BE");
                    // The UTF-32BE BOM starts with zero bytes, so it has to be tested here:
                    // the null-pattern rule would otherwise claim these bytes as content.
                    if (third == 0xFE && fourth == 0xFF) {
                        bomLength = 4;
                    }
                } else {
                    charset = Charset.forName("UTF-16BE");
                }
            } else if (read > 2 && second == 0x00) {
                charset = (third == 0x00) ? Charset.forName("UTF-32LE") : Charset.forName("UTF-16LE");
            } else if (first == 0xFE && second == 0xFF) {
                charset = Charset.forName("UTF-16BE");
                bomLength = 2;
            } else if (first == 0xFF && second == 0xFE) {
                // FF FE is a prefix of the UTF-32LE BOM, so the next two bytes decide.
                if (third == 0x00 && fourth == 0x00) {
                    charset = Charset.forName("UTF-32LE");
                    bomLength = 4;
                } else {
                    charset = Charset.forName("UTF-16LE");
                    bomLength = 2;
                }
            } else if (first == 0xEF && second == 0xBB && third == 0xBF) {
                // UTF-8 BOM: charset stays UTF-8, but the mark must not reach the parser.
                bomLength = 3;
            }
            // otherwise assume UTF-8 without BOM

            // Hand back everything after the BOM so the decoder starts on real content.
            if (read > bomLength) {
                inputStream.unread(utfBytes, bomLength, read - bomLength);
            }
        } catch (final IOException e) {
            throw new JsonException("Unable to detect charset due to "+e.getMessage(), e);
        }
        return charset;
    }

    /**
     * Fills {@code buffer} from the stream, looping because a single {@code read} call
     * may return fewer bytes than requested even though more data follows.
     *
     * @param in     stream to read from
     * @param buffer destination for the leading bytes
     * @return the number of bytes stored, less than {@code buffer.length} only at end of stream
     * @throws IOException if reading fails
     */
    private static int readPrefix(final InputStream in, final byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            final int count = in.read(buffer, total, buffer.length - total);
            if (count < 0) {
                break;
            }
            total += count;
        }
        return total;
    }
}