groovy-core/src/main/groovy/util/CharsetToolkit.java - groovy - Git at Google

 /*
  * Copyright 2003-2007 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package groovy.util;

 import java.io.*;
 import java.nio.charset.Charset;
 import java.util.*;

 /**
  * <p>Utility class to guess the encoding of a given text file.</p>
  *
  * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
  * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
  * is wide enough, the charset should also be discovered.</p>
  *
  * <p>Only the first 4KB of the file are inspected, which is usually sufficient to be able
  * to guess the encoding.</p>
  *
  * <p>Usage:</p>
  * <pre>
  * CharsetToolkit toolkit = new CharsetToolkit(file);
  *
  * // guess the encoding
  * Charset guessedCharset = toolkit.getCharset();
  *
  * // create a reader with the correct charset
  * BufferedReader reader = toolkit.getReader();
  *
  * // read the file content
  * String line;
  * while ((line = reader.readLine()) != null)
  * {
  *     System.out.println(line);
  * }
  * </pre>
  *
  * @author Guillaume Laforge
  */
 public class CharsetToolkit {
     /** Number of leading bytes of the file that are sampled to guess the encoding. */
     private static final int BUFFER_SIZE = 4096;

     private byte[] buffer;
     private Charset defaultCharset;
     private Charset charset;
     private boolean enforce8Bit = true;
     private final File file;

     /**
      * Constructor of the <code>CharsetToolkit</code> utility class.
      * Reads up to the first 4096 bytes of the file into an internal buffer and closes the file again.
      *
      * @param file of which we want to know the encoding.
      * @throws IOException if the file cannot be opened or read.
      */
     public CharsetToolkit(File file) throws IOException {
         this.file = file;
         this.defaultCharset = getDefaultSystemCharset();
         this.charset = null;
         InputStream input = new FileInputStream(file);
         try {
             byte[] bytes = new byte[BUFFER_SIZE];
             int total = 0;
             // a single read() may return fewer bytes than are available,
             // so keep reading until the buffer is full or the end of the file is reached
             while (total < BUFFER_SIZE) {
                 int bytesRead = input.read(bytes, total, BUFFER_SIZE - total);
                 if (bytesRead == -1) {
                     break;
                 }
                 total += bytesRead;
             }
             if (total == BUFFER_SIZE) {
                 this.buffer = bytes;
             } else {
                 // shrink the buffer to the bytes actually read (possibly none)
                 byte[] bytesToGuess = new byte[total];
                 System.arraycopy(bytes, 0, bytesToGuess, 0, total);
                 this.buffer = bytesToGuess;
             }
         } finally {
             try {
                 input.close();
             } catch (IOException ignored) {
                 // the stream was only read from; a failure to close it loses nothing
             }
         }
     }

     /**
      * Defines the default <code>Charset</code> used in case the buffer represents
      * an 8-bit <code>Charset</code>.
      *
      * @param defaultCharset the default <code>Charset</code> to be returned by <code>guessEncoding()</code>
      * if an 8-bit <code>Charset</code> is encountered; <code>null</code> restores the system default.
      */
     public void setDefaultCharset(Charset defaultCharset) {
         if (defaultCharset != null) {
             this.defaultCharset = defaultCharset;
         } else {
             this.defaultCharset = getDefaultSystemCharset();
         }
     }

     /**
      * Returns the guessed charset of the file. The guess is computed on the first call and cached,
      * so later changes made through the setters do not affect an already computed result.
      *
      * @return the <code>Charset</code> guessed for the file.
      */
     public Charset getCharset() {
         if (this.charset == null) {
             this.charset = guessEncoding();
         }
         return charset;
     }

     /**
      * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
      * It might be a file without any special character in the range 128-255, but that may be or become
      * a file encoded with the default <code>charset</code> rather than US-ASCII.
      *
      * @param enforce a boolean specifying the use or not of US-ASCII.
      */
     public void setEnforce8Bit(boolean enforce) {
         this.enforce8Bit = enforce;
     }

     /**
      * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
      *
      * @return a boolean representing the flag of use of US-ASCII.
      */
     public boolean getEnforce8Bit() {
         return this.enforce8Bit;
     }

     /**
      * Retrieves the default Charset
      *
      * @return the <code>Charset</code> returned when the file looks like an 8-bit encoded file.
      */
     public Charset getDefaultCharset() {
         return defaultCharset;
     }

     /**
      * <p>Guess the encoding of the provided buffer.</p>
      * <p>If Byte Order Markers are encountered at the beginning of the buffer, we immediately
      * return the charset implied by this BOM. Otherwise, the file would not be a human
      * readable text file.</p>
      *
      * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
      * If it is not UTF-8, we assume the encoding is the default system encoding
      * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
      *
      * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
      * <pre>
      * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
      * 0000 0000-0000 007F       0xxxxxxx
      * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
      * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
      * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
      * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
      * </pre>
      * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
      *
      * <p>The whole buffer is scanned. A multi-byte sequence cut off by the end of the buffer is
      * tolerated only when the buffer is full, i.e. when the file may continue beyond the sampled
      * bytes; in a completely read file such a sequence is not valid UTF-8.</p>
      *
      * @return the Charset recognized.
      */
     private Charset guessEncoding() {
         // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
         // otherwise, the file would not be human readable
         if (hasUTF8Bom()) {
             return Charset.forName("UTF-8");
         }
         if (hasUTF16LEBom()) {
             return Charset.forName("UTF-16LE");
         }
         if (hasUTF16BEBom()) {
             return Charset.forName("UTF-16BE");
         }

         // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
         // otherwise, the file is in US-ASCII
         boolean highOrderBit = false;

         // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
         // if it's not the case, we can assume the encoding is the default encoding of the system
         boolean validU8Char = true;

         int length = buffer.length;
         // a full buffer is only a prefix of the file, so its last sequence may legitimately be incomplete
         boolean mayBeTruncated = length == BUFFER_SIZE;

         int i = 0;
         while (i < length) {
             byte lead = buffer[i];
             if (lead >= 0) {
                 // plain 7-bit character
                 i++;
                 continue;
             }
             // a high order bit was encountered, thus the encoding is not US-ASCII
             // it may be either an 8-bit encoding or UTF-8
             highOrderBit = true;

             int continuationCount = expectedContinuationBytes(lead);
             if (continuationCount == 0) {
                 // stray continuation byte, 0xFE or 0xFF: never a valid UTF-8 lead byte
                 validU8Char = false;
                 break;
             }

             // index of the last continuation byte this sequence requires
             int last = i + continuationCount;
             if (last >= length && !mayBeTruncated) {
                 // the file ends in the middle of a multi-byte sequence
                 validU8Char = false;
                 break;
             }

             // every continuation byte present in the buffer must be of the form 10xxxxxx
             int lastAvailable = Math.min(last, length - 1);
             for (int j = i + 1; j <= lastAvailable; j++) {
                 if (!isContinuationChar(buffer[j])) {
                     validU8Char = false;
                     break;
                 }
             }
             if (!validU8Char) {
                 break;
             }
             i = last + 1;
         }

         // if no byte with an high order bit set, the encoding is US-ASCII
         // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
         if (!highOrderBit) {
             // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
             if (this.enforce8Bit) {
                 return this.defaultCharset;
             }
             return Charset.forName("US-ASCII");
         }
         // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
         // otherwise the file would not be human readable
         if (validU8Char) {
             return Charset.forName("UTF-8");
         }
         // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
         return this.defaultCharset;
     }

     /**
      * Tells how many continuation bytes must follow the given lead byte of a UTF-8 sequence.
      *
      * @param lead a byte with its high order bit set.
      * @return the number of continuation bytes (1 to 5), or 0 if the byte cannot start a multi-byte sequence.
      */
     private static int expectedContinuationBytes(byte lead) {
         if (isTwoBytesSequence(lead)) {
             return 1;
         }
         if (isThreeBytesSequence(lead)) {
             return 2;
         }
         if (isFourBytesSequence(lead)) {
             return 3;
         }
         if (isFiveBytesSequence(lead)) {
             return 4;
         }
         if (isSixBytesSequence(lead)) {
             return 5;
         }
         return 0;
     }

     /**
      * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character;
      *
      * @param b a byte.
      * @return true if it's a continuation char.
      */
     private static boolean isContinuationChar(byte b) {
         return -128 <= b && b <= -65;
     }

     /**
      * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
      *
      * @param b a byte.
      * @return true if it's the first byte of a two-bytes sequence.
      */
     private static boolean isTwoBytesSequence(byte b) {
         return -64 <= b && b <= -33;
     }

     /**
      * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
      *
      * @param b a byte.
      * @return true if it's the first byte of a three-bytes sequence.
      */
     private static boolean isThreeBytesSequence(byte b) {
         return -32 <= b && b <= -17;
     }

     /**
      * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
      *
      * @param b a byte.
      * @return true if it's the first byte of a four-bytes sequence.
      */
     private static boolean isFourBytesSequence(byte b) {
         return -16 <= b && b <= -9;
     }

     /**
      * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
      *
      * @param b a byte.
      * @return true if it's the first byte of a five-bytes sequence.
      */
     private static boolean isFiveBytesSequence(byte b) {
         return -8 <= b && b <= -5;
     }

     /**
      * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
      *
      * @param b a byte.
      * @return true if it's the first byte of a six-bytes sequence.
      */
     private static boolean isSixBytesSequence(byte b) {
         return -4 <= b && b <= -3;
     }

     /**
      * Retrieve the default charset of the system, as named by the <code>file.encoding</code> system property.
      *
      * @return the default <code>Charset</code>.
      */
     public static Charset getDefaultSystemCharset() {
         return Charset.forName(System.getProperty("file.encoding"));
     }

     /**
      * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
      *
      * @return true if the buffer has a BOM for UTF8.
      */
     public boolean hasUTF8Bom() {
         // EF BB BF
         return buffer.length >= 3
             && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65;
     }

     /**
      * Has a Byte Order Marker for UTF-16 Little Endian
      * (ucs-2le, ucs-4le, and ucs-16le).
      *
      * @return true if the buffer has a BOM for UTF-16 Little Endian.
      */
     public boolean hasUTF16LEBom() {
         // FF FE
         return buffer.length >= 2 && buffer[0] == -1 && buffer[1] == -2;
     }

     /**
      * Has a Byte Order Marker for UTF-16 Big Endian
      * (utf-16 and ucs-2).
      *
      * @return true if the buffer has a BOM for UTF-16 Big Endian.
      */
     public boolean hasUTF16BEBom() {
         // FE FF
         return buffer.length >= 2 && buffer[0] == -2 && buffer[1] == -1;
     }

     /**
      * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
      * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the
      * method <code>guessEncoding()</code>. If the file starts with a Byte Order Marker, the marker
      * is skipped so that the first character returned is the first character of the text.
      *
      * @return a <code>BufferedReader</code>
      * @throws FileNotFoundException if the file is not found.
      */
     public BufferedReader getReader() throws FileNotFoundException {
         LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset()));
         if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
             try {
                 // the BOM decodes to a single character; consume it
                 reader.read();
             } catch (IOException ignored) {
                 // should never happen, as a file with no content
                 // but with a BOM has at least one char
             }
         }
         return reader;
     }

     /**
      * Retrieves all the available <code>Charset</code>s on the platform,
      * among which the default <code>charset</code>.
      *
      * @return an array of <code>Charset</code>s.
      */
     public static Charset[] getAvailableCharsets() {
         Collection collection = Charset.availableCharsets().values();
         return (Charset[]) collection.toArray(new Charset[collection.size()]);
     }
 }
	/*
	* Copyright 2003-2007 the original author or authors.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package groovy.util;

	import java.io.*;
	import java.nio.charset.Charset;
	import java.util.*;

	/**
	 * <p>Utility class to guess the encoding of a given text file.</p>
	 *
	 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
	 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
	 * is wide enough, the charset should also be discovered.</p>
	 *
	 * <p>Only the first 4KB of the file are inspected, which is usually sufficient to be able
	 * to guess the encoding.</p>
	 *
	 * <p>Usage:</p>
	 * <pre>
	 * CharsetToolkit toolkit = new CharsetToolkit(file);
	 *
	 * // guess the encoding
	 * Charset guessedCharset = toolkit.getCharset();
	 *
	 * // create a reader with the correct charset
	 * BufferedReader reader = toolkit.getReader();
	 *
	 * // read the file content
	 * String line;
	 * while ((line = reader.readLine()) != null)
	 * {
	 *     System.out.println(line);
	 * }
	 * </pre>
	 *
	 * @author Guillaume Laforge
	 */
	public class CharsetToolkit {
	    /** Number of leading bytes of the file that are sampled to guess the encoding. */
	    private static final int BUFFER_SIZE = 4096;

	    private byte[] buffer;
	    private Charset defaultCharset;
	    private Charset charset;
	    private boolean enforce8Bit = true;
	    private final File file;

	    /**
	     * Constructor of the <code>CharsetToolkit</code> utility class.
	     * Reads up to the first 4096 bytes of the file into an internal buffer and closes the file again.
	     *
	     * @param file of which we want to know the encoding.
	     * @throws IOException if the file cannot be opened or read.
	     */
	    public CharsetToolkit(File file) throws IOException {
	        this.file = file;
	        this.defaultCharset = getDefaultSystemCharset();
	        this.charset = null;
	        InputStream input = new FileInputStream(file);
	        try {
	            byte[] bytes = new byte[BUFFER_SIZE];
	            int total = 0;
	            // a single read() may return fewer bytes than are available,
	            // so keep reading until the buffer is full or the end of the file is reached
	            while (total < BUFFER_SIZE) {
	                int bytesRead = input.read(bytes, total, BUFFER_SIZE - total);
	                if (bytesRead == -1) {
	                    break;
	                }
	                total += bytesRead;
	            }
	            if (total == BUFFER_SIZE) {
	                this.buffer = bytes;
	            } else {
	                // shrink the buffer to the bytes actually read (possibly none)
	                byte[] bytesToGuess = new byte[total];
	                System.arraycopy(bytes, 0, bytesToGuess, 0, total);
	                this.buffer = bytesToGuess;
	            }
	        } finally {
	            try {
	                input.close();
	            } catch (IOException ignored) {
	                // the stream was only read from; a failure to close it loses nothing
	            }
	        }
	    }

	    /**
	     * Defines the default <code>Charset</code> used in case the buffer represents
	     * an 8-bit <code>Charset</code>.
	     *
	     * @param defaultCharset the default <code>Charset</code> to be returned by <code>guessEncoding()</code>
	     * if an 8-bit <code>Charset</code> is encountered; <code>null</code> restores the system default.
	     */
	    public void setDefaultCharset(Charset defaultCharset) {
	        if (defaultCharset != null) {
	            this.defaultCharset = defaultCharset;
	        } else {
	            this.defaultCharset = getDefaultSystemCharset();
	        }
	    }

	    /**
	     * Returns the guessed charset of the file. The guess is computed on the first call and cached,
	     * so later changes made through the setters do not affect an already computed result.
	     *
	     * @return the <code>Charset</code> guessed for the file.
	     */
	    public Charset getCharset() {
	        if (this.charset == null) {
	            this.charset = guessEncoding();
	        }
	        return charset;
	    }

	    /**
	     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
	     * It might be a file without any special character in the range 128-255, but that may be or become
	     * a file encoded with the default <code>charset</code> rather than US-ASCII.
	     *
	     * @param enforce a boolean specifying the use or not of US-ASCII.
	     */
	    public void setEnforce8Bit(boolean enforce) {
	        this.enforce8Bit = enforce;
	    }

	    /**
	     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
	     *
	     * @return a boolean representing the flag of use of US-ASCII.
	     */
	    public boolean getEnforce8Bit() {
	        return this.enforce8Bit;
	    }

	    /**
	     * Retrieves the default Charset
	     *
	     * @return the <code>Charset</code> returned when the file looks like an 8-bit encoded file.
	     */
	    public Charset getDefaultCharset() {
	        return defaultCharset;
	    }

	    /**
	     * <p>Guess the encoding of the provided buffer.</p>
	     * <p>If Byte Order Markers are encountered at the beginning of the buffer, we immediately
	     * return the charset implied by this BOM. Otherwise, the file would not be a human
	     * readable text file.</p>
	     *
	     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
	     * If it is not UTF-8, we assume the encoding is the default system encoding
	     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
	     *
	     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
	     * <pre>
	     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
	     * 0000 0000-0000 007F       0xxxxxxx
	     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
	     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
	     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	     * </pre>
	     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
	     *
	     * <p>The whole buffer is scanned. A multi-byte sequence cut off by the end of the buffer is
	     * tolerated only when the buffer is full, i.e. when the file may continue beyond the sampled
	     * bytes; in a completely read file such a sequence is not valid UTF-8.</p>
	     *
	     * @return the Charset recognized.
	     */
	    private Charset guessEncoding() {
	        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
	        // otherwise, the file would not be human readable
	        if (hasUTF8Bom()) {
	            return Charset.forName("UTF-8");
	        }
	        if (hasUTF16LEBom()) {
	            return Charset.forName("UTF-16LE");
	        }
	        if (hasUTF16BEBom()) {
	            return Charset.forName("UTF-16BE");
	        }

	        // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
	        // otherwise, the file is in US-ASCII
	        boolean highOrderBit = false;

	        // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
	        // if it's not the case, we can assume the encoding is the default encoding of the system
	        boolean validU8Char = true;

	        int length = buffer.length;
	        // a full buffer is only a prefix of the file, so its last sequence may legitimately be incomplete
	        boolean mayBeTruncated = length == BUFFER_SIZE;

	        int i = 0;
	        while (i < length) {
	            byte lead = buffer[i];
	            if (lead >= 0) {
	                // plain 7-bit character
	                i++;
	                continue;
	            }
	            // a high order bit was encountered, thus the encoding is not US-ASCII
	            // it may be either an 8-bit encoding or UTF-8
	            highOrderBit = true;

	            int continuationCount = expectedContinuationBytes(lead);
	            if (continuationCount == 0) {
	                // stray continuation byte, 0xFE or 0xFF: never a valid UTF-8 lead byte
	                validU8Char = false;
	                break;
	            }

	            // index of the last continuation byte this sequence requires
	            int last = i + continuationCount;
	            if (last >= length && !mayBeTruncated) {
	                // the file ends in the middle of a multi-byte sequence
	                validU8Char = false;
	                break;
	            }

	            // every continuation byte present in the buffer must be of the form 10xxxxxx
	            int lastAvailable = Math.min(last, length - 1);
	            for (int j = i + 1; j <= lastAvailable; j++) {
	                if (!isContinuationChar(buffer[j])) {
	                    validU8Char = false;
	                    break;
	                }
	            }
	            if (!validU8Char) {
	                break;
	            }
	            i = last + 1;
	        }

	        // if no byte with an high order bit set, the encoding is US-ASCII
	        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
	        if (!highOrderBit) {
	            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
	            if (this.enforce8Bit) {
	                return this.defaultCharset;
	            }
	            return Charset.forName("US-ASCII");
	        }
	        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
	        // otherwise the file would not be human readable
	        if (validU8Char) {
	            return Charset.forName("UTF-8");
	        }
	        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
	        return this.defaultCharset;
	    }

	    /**
	     * Tells how many continuation bytes must follow the given lead byte of a UTF-8 sequence.
	     *
	     * @param lead a byte with its high order bit set.
	     * @return the number of continuation bytes (1 to 5), or 0 if the byte cannot start a multi-byte sequence.
	     */
	    private static int expectedContinuationBytes(byte lead) {
	        if (isTwoBytesSequence(lead)) {
	            return 1;
	        }
	        if (isThreeBytesSequence(lead)) {
	            return 2;
	        }
	        if (isFourBytesSequence(lead)) {
	            return 3;
	        }
	        if (isFiveBytesSequence(lead)) {
	            return 4;
	        }
	        if (isSixBytesSequence(lead)) {
	            return 5;
	        }
	        return 0;
	    }

	    /**
	     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character;
	     *
	     * @param b a byte.
	     * @return true if it's a continuation char.
	     */
	    private static boolean isContinuationChar(byte b) {
	        return -128 <= b && b <= -65;
	    }

	    /**
	     * If the byte has the form 110xxxxx, then it's the first byte of a two-bytes sequence character.
	     *
	     * @param b a byte.
	     * @return true if it's the first byte of a two-bytes sequence.
	     */
	    private static boolean isTwoBytesSequence(byte b) {
	        return -64 <= b && b <= -33;
	    }

	    /**
	     * If the byte has the form 1110xxxx, then it's the first byte of a three-bytes sequence character.
	     *
	     * @param b a byte.
	     * @return true if it's the first byte of a three-bytes sequence.
	     */
	    private static boolean isThreeBytesSequence(byte b) {
	        return -32 <= b && b <= -17;
	    }

	    /**
	     * If the byte has the form 11110xxx, then it's the first byte of a four-bytes sequence character.
	     *
	     * @param b a byte.
	     * @return true if it's the first byte of a four-bytes sequence.
	     */
	    private static boolean isFourBytesSequence(byte b) {
	        return -16 <= b && b <= -9;
	    }

	    /**
	     * If the byte has the form 111110xx, then it's the first byte of a five-bytes sequence character.
	     *
	     * @param b a byte.
	     * @return true if it's the first byte of a five-bytes sequence.
	     */
	    private static boolean isFiveBytesSequence(byte b) {
	        return -8 <= b && b <= -5;
	    }

	    /**
	     * If the byte has the form 1111110x, then it's the first byte of a six-bytes sequence character.
	     *
	     * @param b a byte.
	     * @return true if it's the first byte of a six-bytes sequence.
	     */
	    private static boolean isSixBytesSequence(byte b) {
	        return -4 <= b && b <= -3;
	    }

	    /**
	     * Retrieve the default charset of the system, as named by the <code>file.encoding</code> system property.
	     *
	     * @return the default <code>Charset</code>.
	     */
	    public static Charset getDefaultSystemCharset() {
	        return Charset.forName(System.getProperty("file.encoding"));
	    }

	    /**
	     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
	     *
	     * @return true if the buffer has a BOM for UTF8.
	     */
	    public boolean hasUTF8Bom() {
	        // EF BB BF
	        return buffer.length >= 3
	            && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65;
	    }

	    /**
	     * Has a Byte Order Marker for UTF-16 Little Endian
	     * (ucs-2le, ucs-4le, and ucs-16le).
	     *
	     * @return true if the buffer has a BOM for UTF-16 Little Endian.
	     */
	    public boolean hasUTF16LEBom() {
	        // FF FE
	        return buffer.length >= 2 && buffer[0] == -1 && buffer[1] == -2;
	    }

	    /**
	     * Has a Byte Order Marker for UTF-16 Big Endian
	     * (utf-16 and ucs-2).
	     *
	     * @return true if the buffer has a BOM for UTF-16 Big Endian.
	     */
	    public boolean hasUTF16BEBom() {
	        // FE FF
	        return buffer.length >= 2 && buffer[0] == -2 && buffer[1] == -1;
	    }

	    /**
	     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
	     * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the
	     * method <code>guessEncoding()</code>. If the file starts with a Byte Order Marker, the marker
	     * is skipped so that the first character returned is the first character of the text.
	     *
	     * @return a <code>BufferedReader</code>
	     * @throws FileNotFoundException if the file is not found.
	     */
	    public BufferedReader getReader() throws FileNotFoundException {
	        LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset()));
	        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
	            try {
	                // the BOM decodes to a single character; consume it
	                reader.read();
	            } catch (IOException ignored) {
	                // should never happen, as a file with no content
	                // but with a BOM has at least one char
	            }
	        }
	        return reader;
	    }

	    /**
	     * Retrieves all the available <code>Charset</code>s on the platform,
	     * among which the default <code>charset</code>.
	     *
	     * @return an array of <code>Charset</code>s.
	     */
	    public static Charset[] getAvailableCharsets() {
	        Collection collection = Charset.availableCharsets().values();
	        return (Charset[]) collection.toArray(new Charset[collection.size()]);
	    }
	}