src/main/java/org/apache/johnzon/core/RFC4627AwareInputStreamReader.java - sling-org-apache-sling-commons-johnzon - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements. See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership. The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License. You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied. See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.johnzon.core;

 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.PushbackInputStream;
 import java.nio.charset.Charset;

 import javax.json.JsonException;

 /**
  * Reader that chooses the character encoding of a JSON byte stream by looking at its
  * leading octets: first the NUL-octet patterns described in RFC 4627, then the usual
  * byte order marks. If nothing matches, UTF-8 is used.
  */
 final class RFC4627AwareInputStreamReader extends InputStreamReader {

     /** How many leading octets are inspected, and therefore the pushback capacity needed. */
     private static final int SNIFF_LIMIT = 4;

     /**
      * Wraps {@code in} so that the inspected prefix can be pushed back before decoding starts.
      *
      * @param in raw JSON bytes
      * @throws JsonException if fewer than two bytes are available or the prefix cannot be read
      */
     RFC4627AwareInputStreamReader(final InputStream in) {
         this(new PushbackInputStream(in, SNIFF_LIMIT));
     }

     /**
      * Detects the charset on the pushback stream and hands a decoder for it to the superclass.
      */
     private RFC4627AwareInputStreamReader(final PushbackInputStream in) {
         super(in, getCharset(in).newDecoder());
     }

     /**
      * Pulls up to four octets from the stream, one {@code read()} call per octet, so that a
      * short bulk read can never truncate the prefix.
      *
      * @return an array of length 2, 3 or 4 holding the octets that were available
      * @throws JsonException if the stream ends before two octets were read
      * @throws IOException if the underlying stream fails
      */
     private static byte[] readAllBytes(final PushbackInputStream inputStream) throws IOException {
         final int[] octets = new int[SNIFF_LIMIT];
         octets[0] = inputStream.read();
         octets[1] = inputStream.read();
         if (octets[0] == -1 || octets[1] == -1) {
             throw new JsonException("Invalid Json. Valid Json has at least 2 bytes");
         }
         octets[2] = inputStream.read();
         octets[3] = inputStream.read();

         // The first end-of-stream marker among the last two reads decides the prefix length.
         final int count;
         if (octets[2] == -1) {
             count = 2;
         } else if (octets[3] == -1) {
             count = 3;
         } else {
             count = 4;
         }

         final byte[] head = new byte[count];
         for (int i = 0; i < count; i++) {
             head[i] = (byte) octets[i];
         }
         return head;
     }

     /*
      * RFC 4627, section 3:
      *
      *   JSON text SHALL be encoded in Unicode. The default encoding is UTF-8.
      *
      *   Since the first two characters of a JSON text will always be ASCII
      *   characters [RFC0020], it is possible to determine whether an octet
      *   stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
      *   at the pattern of nulls in the first four octets.
      *
      *     00 00 00 xx  UTF-32BE
      *     00 xx 00 xx  UTF-16BE
      *     xx 00 00 00  UTF-32LE
      *     xx 00 xx 00  UTF-16LE
      *     xx xx xx xx  UTF-8
      *
      * Byte order marks checked when no NUL pattern matches:
      *
      *     UTF-8        EF BB BF
      *     UTF-16 (BE)  FE FF
      *     UTF-16 (LE)  FF FE
      *     UTF-32 (BE)  00 00 FE FF   (already caught by the leading-NUL test)
      *     UTF-32 (LE)  FF FE 00 00
      */

     /**
      * Inspects the stream prefix, decides on a charset and pushes the inspected octets back.
      * A two- or three-byte byte order mark is dropped; everything else is pushed back whole.
      *
      * @param inputStream stream with a pushback capacity of at least four bytes
      * @return the detected charset, UTF-8 when nothing more specific was recognised
      * @throws JsonException if reading or pushing back the prefix fails
      */
     private static Charset getCharset(final PushbackInputStream inputStream) {
         Charset detected = Charset.forName("UTF-8");
         int markLength = 0;
         try {
             final byte[] head = readAllBytes(inputStream);
             final int b0 = head[0] & 0xFF;
             final int b1 = head[1] & 0xFF;
             final boolean hasThird = head.length > 2;

             if (b0 == 0x00) {
                 // Leading NUL: big-endian; a second NUL means four-byte code units.
                 detected = Charset.forName(b1 == 0x00 ? "UTF-32BE" : "UTF-16BE");
             } else if (hasThird && b1 == 0x00) {
                 // ASCII octet followed by NUL: little-endian; the third octet picks the width.
                 detected = Charset.forName((head[2] & 0xFF) == 0x00 ? "UTF-32LE" : "UTF-16LE");
             } else if (b0 == 0xFE && b1 == 0xFF) {
                 detected = Charset.forName("UTF-16BE");
                 markLength = 2;
             } else if (b0 == 0xFF && b1 == 0xFE) {
                 final boolean fourByteMark = head.length > 3 && head[2] == 0 && head[3] == 0;
                 detected = Charset.forName(fourByteMark ? "UTF-32LE" : "UTF-16LE");
                 markLength = fourByteMark ? 4 : 2;
             } else if (hasThird && b0 == 0xEF && b1 == 0xBB && (head[2] & 0xFF) == 0xBF) {
                 // UTF-8 with a byte order mark; the charset stays UTF-8.
                 markLength = 3;
             }

             // Only marks shorter than four bytes are stripped here. A four-byte UTF-32LE mark
             // goes back onto the stream with the rest of the prefix.
             // NOTE(review): presumably the UTF-32 decoders consume their own mark - confirm.
             final int from = (markLength > 0 && markLength < 4) ? markLength : 0;
             inputStream.unread(head, from, head.length - from);
         } catch (final IOException e) {
             throw new JsonException("Unable to detect charset due to "+e.getMessage(), e);
         }

         return detected;
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.johnzon.core;

	import java.io.IOException;
	import java.io.InputStream;
	import java.io.InputStreamReader;
	import java.io.PushbackInputStream;
	import java.nio.charset.Charset;

	import javax.json.JsonException;

	final class RFC4627AwareInputStreamReader extends InputStreamReader {

	RFC4627AwareInputStreamReader(final InputStream in) {
	this(new PushbackInputStream(in,4));
	}

	private RFC4627AwareInputStreamReader(final PushbackInputStream in) {
	super(in, getCharset(in).newDecoder());

	}

	/**
	* According to the Java API "An attempt is made to read as many as len bytes, but a smaller number may be read".
	* [http://docs.oracle.com/javase/7/docs/api/java/io/InputStream.html#read(byte[],%20int,%20int)]
	* For this reason we need to ensure that we've read all the bytes that we need out of this stream.
	*/
	private static byte[] readAllBytes(final PushbackInputStream inputStream) throws IOException {
	final int first = inputStream.read();
	final int second = inputStream.read();
	if(first == -1\|\| second == -1) {
	throw new JsonException("Invalid Json. Valid Json has at least 2 bytes");
	}
	final int third = inputStream.read();
	final int fourth = inputStream.read();
	if(third == -1) {
	return new byte[] { (byte) first, (byte) second };
	} else if(fourth == -1) {
	return new byte[] { (byte) first, (byte) second, (byte) third };
	} else {
	return new byte[] { (byte) first, (byte) second, (byte) third, (byte) fourth };
	}
	}

	/*
	* RFC 4627

	JSON text SHALL be encoded in Unicode. The default encoding is
	UTF-8.

	Since the first two characters of a JSON text will always be ASCII
	characters [RFC0020], it is possible to determine whether an octet
	stream is UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE) by looking
	at the pattern of nulls in the first four octets.

	00 00 00 xx UTF-32BE
	00 xx 00 xx UTF-16BE
	xx 00 00 00 UTF-32LE
	xx 00 xx 00 UTF-16LE
	xx xx xx xx UTF-8

	*/

	private static Charset getCharset(final PushbackInputStream inputStream) {
	Charset charset = Charset.forName("UTF-8");
	int bomLength=0;
	try {
	final byte[] utfBytes = readAllBytes(inputStream);
	int first = (utfBytes[0] & 0xFF);
	int second = (utfBytes[1] & 0xFF);
	if (first == 0x00) {
	charset = (second == 0x00) ? Charset.forName("UTF-32BE") : Charset.forName("UTF-16BE");
	} else if (utfBytes.length > 2 && second == 0x00) {
	int third = (utfBytes[2] & 0xFF);
	charset = (third == 0x00) ? Charset.forName("UTF-32LE") : Charset.forName("UTF-16LE");
	} else {

	/*check BOM

	Encoding hex byte order mark
	UTF-8 EF BB BF
	UTF-16 (BE) FE FF
	UTF-16 (LE) FF FE
	UTF-32 (BE) 00 00 FE FF
	UTF-32 (LE) FF FE 00 00
	*/

	//We do not check for UTF-32BE because that is already covered above and we
	//do not to unread anything.

	if(first == 0xFE && second == 0xFF) {
	charset = Charset.forName("UTF-16BE");
	bomLength=2;
	} else if(first == 0xFF && second == 0xFE) {
	if(utfBytes.length > 3 && (utfBytes[2]&0xff) == 0x00 && (utfBytes[3]&0xff) == 0x00) {
	charset = Charset.forName("UTF-32LE");
	bomLength=4;
	}else {
	charset = Charset.forName("UTF-16LE");
	bomLength=2;
	}
	} else if (utfBytes.length > 2 && first == 0xEF && second == 0xBB && (utfBytes[2]&0xff) == 0xBF) {
	//UTF-8 with BOM
	bomLength=3;
	}
	}
	//assume UTF8
	if(bomLength > 0 && bomLength < 4) {
	//do not unread BOM, only bytes after BOM
	inputStream.unread(utfBytes,bomLength,utfBytes.length - bomLength);
	} else {
	//no BOM, unread all read bytes
	inputStream.unread(utfBytes);
	}


	} catch (final IOException e) {
	throw new JsonException("Unable to detect charset due to "+e.getMessage(), e);
	}

	return charset;
	}

	}