/* Source: DocFormats/platform/3rdparty/w3c-tidy-html5/src/utf8.c - incubator-retired-corinthia - Git at Google */

 /* utf8.c -- convert characters to/from UTF-8

   (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
   See tidy.h for the copyright notice.

   Uses public interfaces to abstract input source and output
   sink, which may be user supplied or either FILE* or memory
   based Tidy implementations.  Encoding support is uniform
   regardless of I/O mechanism.

   Note, UTF-8 encoding, by itself, does not affect the actual
   "codepoints" of the underlying character encoding.  In the
   cases of ASCII, Latin1, Unicode (16-bit, BMP), these all
   refer to ISO-10646 "codepoints".  For anything else, they
   refer to some other "codepoint" set.

   Put another way, UTF-8 is a variable length method to
   represent any non-negative integer value.  The glyph
   that a integer value represents is unchanged and defined
   externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
   Latin2-9, and so on).

   Put still another way, UTF-8 is more of a _transfer_ encoding
   than a _character_ encoding, per se.
 */

 #include "tidy.h"
 #include "forward.h"
 #include "utf8.h"

 /*
 UTF-8 encoding/decoding functions
 Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence

 Also see below for UTF-16 encoding/decoding functions

 References :

 1) UCS Transformation Format 8 (UTF-8):
 ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
 <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
 <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>

 Table 4 - Mapping from UCS-4 to UTF-8

 2) Unicode standards:
 <http://www.unicode.org/unicode/standard/standard.html>

 3) Legal UTF-8 byte sequences:
 <http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>

 Code point          1st byte    2nd byte    3rd byte    4th byte
 ----------          --------    --------    --------    --------
 U+0000..U+007F      00..7F
 U+0080..U+07FF      C2..DF      80..BF
 U+0800..U+0FFF      E0          A0..BF      80..BF
 U+1000..U+FFFF      E1..EF      80..BF      80..BF
 U+10000..U+3FFFF    F0          90..BF      80..BF      80..BF
 U+40000..U+FFFFF    F1..F3      80..BF      80..BF      80..BF
 U+100000..U+10FFFF  F4          80..8F      80..BF      80..BF

 The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
 allows for the use of five- and six-byte sequences to encode
 characters that are outside the range of the Unicode character
 set; those five- and six-byte sequences are illegal for the use
 of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
 does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
 (but it does allow other noncharacters).

 4) RFC 2279: UTF-8, a transformation format of ISO 10646:
 <http://www.ietf.org/rfc/rfc2279.txt>

 5) UTF-8 and Unicode FAQ:
 <http://www.cl.cam.ac.uk/~mgk25/unicode.html>

 6) Markus Kuhn's UTF-8 decoder stress test file:
 <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>

 7) UTF-8 Demo:
 <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>

 8) UTF-8 Sampler:
 <http://www.columbia.edu/kermit/utf8.html>

 9) Transformation Format for 16 Planes of Group 00 (UTF-16):
 ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
 <http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
 <http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>

 10) RFC 2781: UTF-16, an encoding of ISO 10646:
 <http://www.ietf.org/rfc/rfc2781.txt>

 11) UTF-16 invalid surrogate pairs:
 <http://www.unicode.org/unicode/faq/utf_bom.html#16>

 UTF-16       UTF-8          UCS-4
 D83F DFF*    F0 9F BF B*    0001FFF*
 D87F DFF*    F0 AF BF B*    0002FFF*
 D8BF DFF*    F0 BF BF B*    0003FFF*
 D8FF DFF*    F1 8F BF B*    0004FFF*
 D93F DFF*    F1 9F BF B*    0005FFF*
 D97F DFF*    F1 AF BF B*    0006FFF*
                 ...
 DBBF DFF*    F3 BF BF B*    000FFFF*
 DBFF DFF*    F4 8F BF B*    0010FFF*

 * = E or F

 1010  A
 1011  B
 1100  C
 1101  D
 1110  E
 1111  F

 */

 /* Number of rows in the validUTF8 table below. */
 #define kNumUTF8Sequences        7
 /* Longest sequence length covered by the validity tables; sizes offsetUTF8Sequences. */
 #define kMaxUTF8Bytes            4

 /* Values rejected by both the decoder and the encoder in this file. */
 #define kUTF8ByteSwapNotAChar    0xFFFE
 #define kUTF8NotAChar            0xFFFF

 /* Largest value accepted for UTF-8 decoding/encoding. */
 #define kMaxUTF8FromUCS4         0x10FFFF

 /* Values at or above kUTF16SurrogatesBegin are treated as "combined"
  * (surrogate pair) characters; kMaxUTF16FromUCS4 is the upper limit used
  * by IsValidUTF16FromUCS4.
  */
 #define kUTF16SurrogatesBegin    0x10000
 #define kMaxUTF16FromUCS4        0x10FFFF

 /* UTF-16 surrogate pair areas.
  * NOTE: in this file "low" names 0xD800..0xDBFF (the unit carrying the
  * upper bits, see CombineSurrogatePair) and "high" names 0xDC00..0xDFFF.
  * Unicode / RFC 2781 terminology uses the opposite wording for these two
  * ranges, so read the names as local identifiers only.
  */
 #define kUTF16LowSurrogateBegin  0xD800
 #define kUTF16LowSurrogateEnd    0xDBFF
 #define kUTF16HighSurrogateBegin 0xDC00
 #define kUTF16HighSurrogateEnd   0xDFFF


 /* offsets into validUTF8 table below
  *
  * Indexed by (sequence length - 1): entry k is the first validUTF8 row for
  * sequences of k+1 bytes, and (entry k+1) - 1 is the last such row.  The
  * final entry equals the row count so the 4-byte group can be closed the
  * same way as the others.
  */
 static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
 {
     0, /* 1 byte */
     1, /* 2 bytes */
     2, /* 3 bytes */
     4, /* 4 bytes */
     kNumUTF8Sequences /* must be last */
 };

 /* Table of acceptable UTF-8 sequences, one row per code point range.
  *
  * lowChar/highChar  inclusive range of decoded values covered by the row
  * numBytes          sequence length for the row (the decoder in this file
  *                   selects rows through offsetUTF8Sequences instead)
  * validBytes        inclusive (min, max) pairs for byte 1..4, stored as
  *                   validBytes[2*k] and validBytes[2*k + 1]; pairs beyond
  *                   numBytes are zero
  */
 static const struct validUTF8Sequence
 {
      uint lowChar;
      uint highChar;
      int  numBytes;
      byte validBytes[8];
 } validUTF8[kNumUTF8Sequences] =
 {
 /*   low       high   #bytes  byte 1      byte 2      byte 3      byte 4 */
     {0x0000,   0x007F,   1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
     {0x0080,   0x07FF,   2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
     {0x0800,   0x0FFF,   3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
     {0x1000,   0xFFFF,   3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
     {0x10000,  0x3FFFF,  4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
     {0x40000,  0xFFFFF,  4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
     {0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}}
 };

 /*
  * Decode a single UTF-8 sequence into a code point.
  *
  * c              receives the decoded value; it is also written on error,
  *                where it holds whatever was accumulated before the failure.
  * firstByte      lead byte of the sequence.  EndOfStream is passed through
  *                unchanged with *count == 1 and a 0 return.
  * successorBytes optional buffer holding the continuation bytes; takes
  *                precedence over inp when non-NULL.
  * inp            optional input source the continuation bytes are pulled
  *                from when successorBytes is NULL.
  * count          receives the number of bytes in the sequence; when a
  *                continuation byte is bad or missing, the number of bytes
  *                accepted before decoding stopped.
  *
  * Returns 0 on success, -1 when the sequence is rejected (bad lead byte,
  * five/six byte form, bad or missing continuation byte, overlong form,
  * U+FFFE/U+FFFF, or a value above kMaxUTF8FromUCS4).
  */
 int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
                                 TidyInputSource* inp, int* count )
 {
     /* zeroed so the debug dump never prints indeterminate bytes */
     byte tempbuf[10] = {0};
     byte *buf = &tempbuf[0];
     uint ch = 0, n = 0;
     int i, bytes = 0;
     Bool hasError = no;

     if ( successorBytes )
         buf = (byte*) successorBytes;

     /* special check if we have been passed an EOF char */
     if ( firstByte == EndOfStream )
     {
         /* at present */
         *c = firstByte;
         *count = 1;
         return 0;
     }

     ch = firstByte; /* first byte is passed in separately */

     if (ch <= 0x7F) /* 0XXX XXXX one byte */
     {
         n = ch;
         bytes = 1;
     }
     else if ((ch & 0xE0) == 0xC0)  /* 110X XXXX  two bytes */
     {
         n = ch & 31;
         bytes = 2;
     }
     else if ((ch & 0xF0) == 0xE0)  /* 1110 XXXX  three bytes */
     {
         n = ch & 15;
         bytes = 3;
     }
     else if ((ch & 0xF8) == 0xF0)  /* 1111 0XXX  four bytes */
     {
         n = ch & 7;
         bytes = 4;
     }
     else if ((ch & 0xFC) == 0xF8)  /* 1111 10XX  five bytes */
     {
         /* still consumed below, but never accepted */
         n = ch & 3;
         bytes = 5;
         hasError = yes;
     }
     else if ((ch & 0xFE) == 0xFC)  /* 1111 110X  six bytes */
     {
         n = ch & 1;
         bytes = 6;
         hasError = yes;
     }
     else
     {
         /* not a valid first byte of a UTF-8 sequence */
         n = ch;
         bytes = 1;
         hasError = yes;
     }

     /* successor bytes should have the form 10XX XXXX */

     /* If caller supplied buffer, use it.  Else see if caller
     ** supplied an input source, use that.
     */
     if ( successorBytes )
     {
         for ( i=0; i < bytes-1; ++i )
         {
             if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
             {
                 hasError = yes;
                 bytes = i+1;
                 break;
             }
             n = (n << 6) | (buf[i] & 0x3F);
         }
     }
     else if ( inp )
     {
         for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
         {
             int b = inp->getByte( inp->sourceData );
             buf[i] = (tmbchar) b;

             /* End of data or illegal successor byte value */
             if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
             {
                 hasError = yes;
                 bytes = i+1;
                 /* leave the offending byte for the next read */
                 if ( b != EOF )
                     inp->ungetByte( inp->sourceData, buf[i] );
                 break;
             }
             n = (n << 6) | (buf[i] & 0x3F);
         }

         /* eof() ended the loop before every continuation byte was read:
         ** the sequence is truncated, so flag it and report only the
         ** bytes actually consumed.
         */
         if ( i < bytes-1 )
         {
             hasError = yes;
             bytes = i+1;
         }
     }
     else if ( bytes > 1 )
     {
         /* no source for the continuation bytes at all */
         hasError = yes;
         bytes = 1;
     }

     if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
         hasError = yes;

     if (!hasError && (n > kMaxUTF8FromUCS4))
         hasError = yes;

 #if 0 /* Breaks Big5 D8 - DF */
     if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd))
         /* unpaired surrogates not allowed */
         hasError = yes;
 #endif

     if (!hasError)
     {
         int lo, hi;

         /* rows of validUTF8 that describe sequences of this length */
         lo = offsetUTF8Sequences[bytes - 1];
         hi = offsetUTF8Sequences[bytes] - 1;

         /* check for overlong sequences */
         if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
             hasError = yes;
         else
         {
             /* The sequence is valid only if a single row accepts every
             ** one of its bytes.
             */
             Bool matched = no;

             for (i = lo; i <= hi && !matched; i++)
             {
                 int pos;

                 matched = yes;
                 for (pos = 0; pos < bytes; pos++)
                 {
                     byte theByte = pos ? buf[pos - 1] : (byte) firstByte;

                     if ( theByte < validUTF8[i].validBytes[(pos * 2)] ||
                          theByte > validUTF8[i].validBytes[(pos * 2) + 1] )
                     {
                         matched = no;
                         break;
                     }
                 }
             }

             if (!matched)
                 hasError = yes;
         }
     }

 #if 1 && defined(_DEBUG)
     if ( hasError )
     {
        /* debug */
        fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
        fprintf( stderr, "0x%02x ", firstByte );
        for (i = 1; i < bytes; i++)
            fprintf( stderr, "0x%02x ", buf[i - 1] );
        fprintf( stderr, " = U+%04x\n", n );
     }
 #endif

     *count = bytes;
     *c = n;
     if ( hasError )
         return -1;
     return 0;
 }

 /*
  * Encode c as a UTF-8 byte sequence.
  *
  * c         value to encode.
  * encodebuf optional destination buffer.  Bytes are stored here even when
  *           the value is rejected (up to six of them), so it must have
  *           room for the longest form.  When NULL a local scratch buffer
  *           is used.
  * outp      optional output sink; receives the bytes only when the
  *           sequence is valid.
  * count     receives the number of bytes produced (0 when c is above
  *           0x7FFFFFFF and nothing could be produced).
  *
  * Returns 0 on success, -1 when c is U+FFFE, U+FFFF or greater than
  * kMaxUTF8FromUCS4.
  */
 int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
                                 TidyOutputSink* outp, int* count )
 {
     byte tempbuf[10] = {0};
     byte* buf = &tempbuf[0];
     int bytes = 0;
     Bool hasError = no;

     if ( encodebuf )
         buf = (byte*) encodebuf;

     if (c <= 0x7F)  /* 0XXX XXXX one byte */
     {
         buf[0] = (tmbchar) c;
         bytes = 1;
     }
     else if (c <= 0x7FF)  /* 110X XXXX  two bytes */
     {
         buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
         buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
         bytes = 2;
     }
     else if (c <= 0xFFFF)  /* 1110 XXXX  three bytes */
     {
         buf[0] = (tmbchar) (0xE0 | (c >> 12));
         buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
         buf[2] = (tmbchar) (0x80 | (c & 0x3F));
         bytes = 3;
         if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
             hasError = yes;
 #if 0 /* Breaks Big5 D8 - DF */
         else if ( c >= kUTF16LowSurrogateBegin && c <= kUTF16HighSurrogateEnd )
             /* unpaired surrogates not allowed */
             hasError = yes;
 #endif
     }
     else if (c <= 0x1FFFFF)  /* 1111 0XXX  four bytes */
     {
         buf[0] = (tmbchar) (0xF0 | (c >> 18));
         buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
         buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
         buf[3] = (tmbchar) (0x80 | (c & 0x3F));
         bytes = 4;
         if (c > kMaxUTF8FromUCS4)
             hasError = yes;
     }
     else if (c <= 0x3FFFFFF)  /* 1111 10XX  five bytes */
     {
         /* always an error; the bytes are still formed for diagnostics */
         buf[0] = (tmbchar) (0xF8 | (c >> 24));
         /* keep only six payload bits, like every other continuation byte */
         buf[1] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
         buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
         buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
         buf[4] = (tmbchar) (0x80 | (c & 0x3F));
         bytes = 5;
         hasError = yes;
     }
     else if (c <= 0x7FFFFFFF)  /* 1111 110X  six bytes */
     {
         buf[0] = (tmbchar) (0xFC | (c >> 30));
         buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
         buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
         buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
         buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
         buf[5] = (tmbchar) (0x80 | (c & 0x3F));
         bytes = 6;
         hasError = yes;
     }
     else
         hasError = yes;

     /* don't output invalid UTF-8 byte sequence to a stream */
     if ( !hasError && outp != NULL )
     {
         int ix;
         for ( ix=0; ix < bytes; ++ix )
           outp->putByte( outp->sinkData, buf[ix] );
     }

 #if 1 && defined(_DEBUG)
     if ( hasError )
     {
         int i;
         fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
         for (i = 0; i < bytes; i++)
             fprintf( stderr, "0x%02x ", buf[i] );
         fprintf( stderr, "\n" );
     }
 #endif

     *count = bytes;
     if (hasError)
         return -1;
     return 0;
 }


 /*
  * Decode the UTF-8 sequence that starts at str.
  *
  * str  points to the UTF-8 byte sequence.
  * ch   receives the decoded character, or U+FFFD (replacement character)
  *      when DecodeUTF8BytesToChar rejects the sequence.
  *
  * Returns one less than the number of bytes reported for the sequence.
  */
 uint TY_(GetUTF8)( ctmbstr str, uint *ch )
 {
     uint n;
     int bytes = 0;
     int err;

     /* The first byte "str[0]" is passed in separately from the rest of
      * the sequence starting at "str[1]".  It is narrowed to `byte` first:
      * if the string element type is a signed char (NOTE(review): typedef
      * not visible here), a value >= 0x80 would otherwise be sign-extended
      * to uint, and 0xFF could collide with the EndOfStream sentinel
      * checked by the decoder.
      */
     err = TY_(DecodeUTF8BytesToChar)( &n, (byte) str[0], str+1, NULL, &bytes );
     if (err)
     {
 #if 1 && defined(_DEBUG)
         fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
 #endif
         n = 0xFFFD; /* replacement char */
     }

     *ch = n;
     return bytes - 1;
 }

 /*
  * Write c to buf as a UTF-8 byte sequence and return the position just
  * past the bytes written.  When EncodeCharToUTF8Bytes reports an error,
  * the three bytes of U+FFFD (replacement character) are stored instead.
  */
 tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
 {
     int written = 0;

     if ( TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &written ) != 0 )
     {
 #if 1 && defined(_DEBUG)
         fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
 #endif
         /* U+FFFD, already encoded as UTF-8 */
         buf[0] = (byte) 0xEF;
         buf[1] = (byte) 0xBF;
         buf[2] = (byte) 0xBD;
         written = 3;
     }

     return buf + written;
 }

 /* True when ucs4 does not exceed kMaxUTF16FromUCS4. */
 Bool    TY_(IsValidUTF16FromUCS4)( tchar ucs4 )
 {
     return !( ucs4 > kMaxUTF16FromUCS4 );
 }

 /* True when ch lies in kUTF16HighSurrogateBegin..kUTF16HighSurrogateEnd
  * (0xDC00..0xDFFF as defined in this file).
  */
 Bool    TY_(IsHighSurrogate)( tchar ch )
 {
     return !( ch < kUTF16HighSurrogateBegin || ch > kUTF16HighSurrogateEnd );
 }
 /* True when ch lies in kUTF16LowSurrogateBegin..kUTF16LowSurrogateEnd
  * (0xD800..0xDBFF as defined in this file).
  */
 Bool    TY_(IsLowSurrogate)( tchar ch )
 {
     return !( ch < kUTF16LowSurrogateBegin || ch > kUTF16LowSurrogateEnd );
 }

 /*
  * Rebuild the character encoded by a surrogate pair.  `low` (per this
  * file's naming, the 0xD800..0xDBFF unit) supplies the upper ten bits and
  * `high` (0xDC00..0xDFFF) the lower ten; 0x10000 is added to the result.
  * Both arguments are only checked by assert().
  */
 tchar   TY_(CombineSurrogatePair)( tchar high, tchar low )
 {
     tchar upperPart;

     assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) );

     upperPart = (low - kUTF16LowSurrogateBegin) * 0x400;
     return ( upperPart + high - kUTF16HighSurrogateBegin + 0x10000 );
 }

 /*
  * Split utf16 into a surrogate pair.  Succeeds only when
  * IsValidCombinedChar accepts utf16 and both output pointers are non-NULL;
  * otherwise nothing is written.  *low receives the unit based at
  * kUTF16LowSurrogateBegin (quotient by 0x400), *high the unit based at
  * kUTF16HighSurrogateBegin (remainder).  Returns the success flag.
  */
 Bool   TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high )
 {
     tchar offset;
     Bool status = ( TY_(IsValidCombinedChar)( utf16 ) && high && low );

     if ( !status )
         return status;

     offset = utf16 - kUTF16SurrogatesBegin;
     *low  = offset / 0x400 + kUTF16LowSurrogateBegin;
     *high = offset % 0x400 + kUTF16HighSurrogateBegin;
     return status;
 }

 /*
  * True when ch is at or above kUTF16SurrogatesBegin and its low sixteen
  * bits are neither 0xFFFE nor 0xFFFF.  No upper bound is applied here.
  */
 Bool    TY_(IsValidCombinedChar)( tchar ch )
 {
     const tchar low16 = ch & 0x0000FFFF;

     return ( !(ch < kUTF16SurrogatesBegin) &&
              low16 != 0x0000FFFE &&
              low16 != 0x0000FFFF );
 }

 /* True when ch is at or above kUTF16SurrogatesBegin (0x10000). */
 Bool    TY_(IsCombinedChar)( tchar ch )
 {
     return !( ch < kUTF16SurrogatesBegin );
 }

 /*
  * local variables:
  * mode: c
  * indent-tabs-mode: nil
  * c-basic-offset: 4
  * eval: (c-set-offset 'substatement-open 0)
  * end:
  */
	/* utf8.c -- convert characters to/from UTF-8

	(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
	See tidy.h for the copyright notice.

	Uses public interfaces to abstract input source and output
	sink, which may be user supplied or either FILE* or memory
	based Tidy implementations. Encoding support is uniform
	regardless of I/O mechanism.

	Note, UTF-8 encoding, by itself, does not affect the actual
	"codepoints" of the underlying character encoding. In the
	cases of ASCII, Latin1, Unicode (16-bit, BMP), these all
	refer to ISO-10646 "codepoints". For anything else, they
	refer to some other "codepoint" set.

	Put another way, UTF-8 is a variable length method to
	represent any non-negative integer value. The glyph
	that a integer value represents is unchanged and defined
	externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
	Latin2-9, and so on).

	Put still another way, UTF-8 is more of a _transfer_ encoding
	than a _character_ encoding, per se.
	*/

	#include "tidy.h"
	#include "forward.h"
	#include "utf8.h"

	/*
	UTF-8 encoding/decoding functions
	Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence

	Also see below for UTF-16 encoding/decoding functions

	References :

	1) UCS Transformation Format 8 (UTF-8):
	ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
	<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
	<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>

	Table 4 - Mapping from UCS-4 to UTF-8

	2) Unicode standards:
	<http://www.unicode.org/unicode/standard/standard.html>

	3) Legal UTF-8 byte sequences:
	<http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>

	Code point 1st byte 2nd byte 3rd byte 4th byte
	---------- -------- -------- -------- --------
	U+0000..U+007F 00..7F
	U+0080..U+07FF C2..DF 80..BF
	U+0800..U+0FFF E0 A0..BF 80..BF
	U+1000..U+FFFF E1..EF 80..BF 80..BF
	U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
	U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
	U+100000..U+10FFFF F4 80..8F 80..BF 80..BF

	The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
	allows for the use of five- and six-byte sequences to encode
	characters that are outside the range of the Unicode character
	set; those five- and six-byte sequences are illegal for the use
	of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
	does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
	(but it does allow other noncharacters).

	4) RFC 2279: UTF-8, a transformation format of ISO 10646:
	<http://www.ietf.org/rfc/rfc2279.txt>

	5) UTF-8 and Unicode FAQ:
	<http://www.cl.cam.ac.uk/~mgk25/unicode.html>

	6) Markus Kuhn's UTF-8 decoder stress test file:
	<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>

	7) UTF-8 Demo:
	<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>

	8) UTF-8 Sampler:
	<http://www.columbia.edu/kermit/utf8.html>

	9) Transformation Format for 16 Planes of Group 00 (UTF-16):
	ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
	<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
	<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>

	10) RFC 2781: UTF-16, an encoding of ISO 10646:
	<http://www.ietf.org/rfc/rfc2781.txt>

	11) UTF-16 invalid surrogate pairs:
	<http://www.unicode.org/unicode/faq/utf_bom.html#16>

	UTF-16 UTF-8 UCS-4
	D83F DFF* F0 9F BF B* 0001FFF*
	D87F DFF* F0 AF BF B* 0002FFF*
	D8BF DFF* F0 BF BF B* 0003FFF*
	D8FF DFF* F1 8F BF B* 0004FFF*
	D93F DFF* F1 9F BF B* 0005FFF*
	D97F DFF* F1 AF BF B* 0006FFF*
	...
	DBBF DFF* F3 BF BF B* 000FFFF*
	DBFF DFF* F4 8F BF B* 0010FFF*

	* = E or F

	1010 A
	1011 B
	1100 C
	1101 D
	1110 E
	1111 F

	*/

	/* Number of rows in the validUTF8 table below. */
	#define kNumUTF8Sequences 7
	/* Longest sequence length covered by the validity tables; sizes offsetUTF8Sequences. */
	#define kMaxUTF8Bytes 4

	/* Values rejected by both the decoder and the encoder in this file. */
	#define kUTF8ByteSwapNotAChar 0xFFFE
	#define kUTF8NotAChar 0xFFFF

	/* Largest value accepted for UTF-8 decoding/encoding. */
	#define kMaxUTF8FromUCS4 0x10FFFF

	/* Values at or above kUTF16SurrogatesBegin are treated as "combined"
	 * (surrogate pair) characters; kMaxUTF16FromUCS4 is the upper limit used
	 * by IsValidUTF16FromUCS4.
	 */
	#define kUTF16SurrogatesBegin 0x10000
	#define kMaxUTF16FromUCS4 0x10FFFF

	/* UTF-16 surrogate pair areas.
	 * NOTE: in this file "low" names 0xD800..0xDBFF (the unit carrying the
	 * upper bits, see CombineSurrogatePair) and "high" names 0xDC00..0xDFFF.
	 * Unicode / RFC 2781 terminology uses the opposite wording for these two
	 * ranges, so read the names as local identifiers only.
	 */
	#define kUTF16LowSurrogateBegin 0xD800
	#define kUTF16LowSurrogateEnd 0xDBFF
	#define kUTF16HighSurrogateBegin 0xDC00
	#define kUTF16HighSurrogateEnd 0xDFFF


	/* offsets into validUTF8 table below
	 *
	 * Indexed by (sequence length - 1): entry k is the first validUTF8 row for
	 * sequences of k+1 bytes, and (entry k+1) - 1 is the last such row.  The
	 * final entry equals the row count so the 4-byte group can be closed the
	 * same way as the others.
	 */
	static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
	{
	0, /* 1 byte */
	1, /* 2 bytes */
	2, /* 3 bytes */
	4, /* 4 bytes */
	kNumUTF8Sequences /* must be last */
	};

	/* Table of acceptable UTF-8 sequences, one row per code point range.
	 *
	 * lowChar/highChar  inclusive range of decoded values covered by the row
	 * numBytes          sequence length for the row (the decoder in this file
	 *                   selects rows through offsetUTF8Sequences instead)
	 * validBytes        inclusive (min, max) pairs for byte 1..4, stored as
	 *                   validBytes[2*k] and validBytes[2*k + 1]; pairs beyond
	 *                   numBytes are zero
	 */
	static const struct validUTF8Sequence
	{
	uint lowChar;
	uint highChar;
	int numBytes;
	byte validBytes[8];
	} validUTF8[kNumUTF8Sequences] =
	{
	/* low high #bytes byte 1 byte 2 byte 3 byte 4 */
	{0x0000, 0x007F, 1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
	{0x0080, 0x07FF, 2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
	{0x0800, 0x0FFF, 3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
	{0x1000, 0xFFFF, 3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
	{0x10000, 0x3FFFF, 4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
	{0x40000, 0xFFFFF, 4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
	{0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}}
	};

	/*
	 * Decode a single UTF-8 sequence into a code point.
	 *
	 * c              receives the decoded value; it is also written on error,
	 *                where it holds whatever was accumulated before the failure.
	 * firstByte      lead byte of the sequence.  EndOfStream is passed through
	 *                unchanged with *count == 1 and a 0 return.
	 * successorBytes optional buffer holding the continuation bytes; takes
	 *                precedence over inp when non-NULL.
	 * inp            optional input source the continuation bytes are pulled
	 *                from when successorBytes is NULL.
	 * count          receives the number of bytes in the sequence; when a
	 *                continuation byte is bad or missing, the number of bytes
	 *                accepted before decoding stopped.
	 *
	 * Returns 0 on success, -1 when the sequence is rejected (bad lead byte,
	 * five/six byte form, bad or missing continuation byte, overlong form,
	 * U+FFFE/U+FFFF, or a value above kMaxUTF8FromUCS4).
	 */
	int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
	                                TidyInputSource* inp, int* count )
	{
	    /* zeroed so the debug dump never prints indeterminate bytes */
	    byte tempbuf[10] = {0};
	    byte *buf = &tempbuf[0];
	    uint ch = 0, n = 0;
	    int i, bytes = 0;
	    Bool hasError = no;

	    if ( successorBytes )
	        buf = (byte*) successorBytes;

	    /* special check if we have been passed an EOF char */
	    if ( firstByte == EndOfStream )
	    {
	        /* at present */
	        *c = firstByte;
	        *count = 1;
	        return 0;
	    }

	    ch = firstByte; /* first byte is passed in separately */

	    if (ch <= 0x7F) /* 0XXX XXXX one byte */
	    {
	        n = ch;
	        bytes = 1;
	    }
	    else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */
	    {
	        n = ch & 31;
	        bytes = 2;
	    }
	    else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */
	    {
	        n = ch & 15;
	        bytes = 3;
	    }
	    else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
	    {
	        n = ch & 7;
	        bytes = 4;
	    }
	    else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */
	    {
	        /* still consumed below, but never accepted */
	        n = ch & 3;
	        bytes = 5;
	        hasError = yes;
	    }
	    else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */
	    {
	        n = ch & 1;
	        bytes = 6;
	        hasError = yes;
	    }
	    else
	    {
	        /* not a valid first byte of a UTF-8 sequence */
	        n = ch;
	        bytes = 1;
	        hasError = yes;
	    }

	    /* successor bytes should have the form 10XX XXXX */

	    /* If caller supplied buffer, use it. Else see if caller
	    ** supplied an input source, use that.
	    */
	    if ( successorBytes )
	    {
	        for ( i=0; i < bytes-1; ++i )
	        {
	            if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
	            {
	                hasError = yes;
	                bytes = i+1;
	                break;
	            }
	            n = (n << 6) | (buf[i] & 0x3F);
	        }
	    }
	    else if ( inp )
	    {
	        for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
	        {
	            int b = inp->getByte( inp->sourceData );
	            buf[i] = (tmbchar) b;

	            /* End of data or illegal successor byte value */
	            if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
	            {
	                hasError = yes;
	                bytes = i+1;
	                /* leave the offending byte for the next read */
	                if ( b != EOF )
	                    inp->ungetByte( inp->sourceData, buf[i] );
	                break;
	            }
	            n = (n << 6) | (buf[i] & 0x3F);
	        }

	        /* eof() ended the loop before every continuation byte was read:
	        ** the sequence is truncated, so flag it and report only the
	        ** bytes actually consumed.
	        */
	        if ( i < bytes-1 )
	        {
	            hasError = yes;
	            bytes = i+1;
	        }
	    }
	    else if ( bytes > 1 )
	    {
	        /* no source for the continuation bytes at all */
	        hasError = yes;
	        bytes = 1;
	    }

	    if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
	        hasError = yes;

	    if (!hasError && (n > kMaxUTF8FromUCS4))
	        hasError = yes;

	#if 0 /* Breaks Big5 D8 - DF */
	    if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd))
	        /* unpaired surrogates not allowed */
	        hasError = yes;
	#endif

	    if (!hasError)
	    {
	        int lo, hi;

	        /* rows of validUTF8 that describe sequences of this length */
	        lo = offsetUTF8Sequences[bytes - 1];
	        hi = offsetUTF8Sequences[bytes] - 1;

	        /* check for overlong sequences */
	        if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
	            hasError = yes;
	        else
	        {
	            /* The sequence is valid only if a single row accepts every
	            ** one of its bytes.
	            */
	            Bool matched = no;

	            for (i = lo; i <= hi && !matched; i++)
	            {
	                int pos;

	                matched = yes;
	                for (pos = 0; pos < bytes; pos++)
	                {
	                    byte theByte = pos ? buf[pos - 1] : (byte) firstByte;

	                    if ( theByte < validUTF8[i].validBytes[(pos * 2)] ||
	                         theByte > validUTF8[i].validBytes[(pos * 2) + 1] )
	                    {
	                        matched = no;
	                        break;
	                    }
	                }
	            }

	            if (!matched)
	                hasError = yes;
	        }
	    }

	#if 1 && defined(_DEBUG)
	    if ( hasError )
	    {
	        /* debug */
	        fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
	        fprintf( stderr, "0x%02x ", firstByte );
	        for (i = 1; i < bytes; i++)
	            fprintf( stderr, "0x%02x ", buf[i - 1] );
	        fprintf( stderr, " = U+%04x\n", n );
	    }
	#endif

	    *count = bytes;
	    *c = n;
	    if ( hasError )
	        return -1;
	    return 0;
	}

	/*
	 * Encode c as a UTF-8 byte sequence.
	 *
	 * c         value to encode.
	 * encodebuf optional destination buffer.  Bytes are stored here even when
	 *           the value is rejected (up to six of them), so it must have
	 *           room for the longest form.  When NULL a local scratch buffer
	 *           is used.
	 * outp      optional output sink; receives the bytes only when the
	 *           sequence is valid.
	 * count     receives the number of bytes produced (0 when c is above
	 *           0x7FFFFFFF and nothing could be produced).
	 *
	 * Returns 0 on success, -1 when c is U+FFFE, U+FFFF or greater than
	 * kMaxUTF8FromUCS4.
	 */
	int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
	                                TidyOutputSink* outp, int* count )
	{
	    byte tempbuf[10] = {0};
	    byte* buf = &tempbuf[0];
	    int bytes = 0;
	    Bool hasError = no;

	    if ( encodebuf )
	        buf = (byte*) encodebuf;

	    if (c <= 0x7F) /* 0XXX XXXX one byte */
	    {
	        buf[0] = (tmbchar) c;
	        bytes = 1;
	    }
	    else if (c <= 0x7FF) /* 110X XXXX two bytes */
	    {
	        buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
	        buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
	        bytes = 2;
	    }
	    else if (c <= 0xFFFF) /* 1110 XXXX three bytes */
	    {
	        buf[0] = (tmbchar) (0xE0 | (c >> 12));
	        buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
	        buf[2] = (tmbchar) (0x80 | (c & 0x3F));
	        bytes = 3;
	        if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
	            hasError = yes;
	#if 0 /* Breaks Big5 D8 - DF */
	        else if ( c >= kUTF16LowSurrogateBegin && c <= kUTF16HighSurrogateEnd )
	            /* unpaired surrogates not allowed */
	            hasError = yes;
	#endif
	    }
	    else if (c <= 0x1FFFFF) /* 1111 0XXX four bytes */
	    {
	        buf[0] = (tmbchar) (0xF0 | (c >> 18));
	        buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
	        buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
	        buf[3] = (tmbchar) (0x80 | (c & 0x3F));
	        bytes = 4;
	        if (c > kMaxUTF8FromUCS4)
	            hasError = yes;
	    }
	    else if (c <= 0x3FFFFFF) /* 1111 10XX five bytes */
	    {
	        /* always an error; the bytes are still formed for diagnostics */
	        buf[0] = (tmbchar) (0xF8 | (c >> 24));
	        /* keep only six payload bits, like every other continuation byte */
	        buf[1] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
	        buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
	        buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
	        buf[4] = (tmbchar) (0x80 | (c & 0x3F));
	        bytes = 5;
	        hasError = yes;
	    }
	    else if (c <= 0x7FFFFFFF) /* 1111 110X six bytes */
	    {
	        buf[0] = (tmbchar) (0xFC | (c >> 30));
	        buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
	        buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
	        buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
	        buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
	        buf[5] = (tmbchar) (0x80 | (c & 0x3F));
	        bytes = 6;
	        hasError = yes;
	    }
	    else
	        hasError = yes;

	    /* don't output invalid UTF-8 byte sequence to a stream */
	    if ( !hasError && outp != NULL )
	    {
	        int ix;
	        for ( ix=0; ix < bytes; ++ix )
	            outp->putByte( outp->sinkData, buf[ix] );
	    }

	#if 1 && defined(_DEBUG)
	    if ( hasError )
	    {
	        int i;
	        fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
	        for (i = 0; i < bytes; i++)
	            fprintf( stderr, "0x%02x ", buf[i] );
	        fprintf( stderr, "\n" );
	    }
	#endif

	    *count = bytes;
	    if (hasError)
	        return -1;
	    return 0;
	}


	/* return one less than the number of bytes used by the UTF-8 byte sequence */
	/* str points to the UTF-8 byte sequence */
	/* the Unicode char is returned in ch /
	uint TY_(GetUTF8)( ctmbstr str, uint *ch )
	{
	uint n;
	int bytes;

	int err;

	bytes = 0;

	/* first byte "str[0]" is passed in separately from the */
	/* rest of the UTF-8 byte sequence starting at "str[1]" */
	err = TY_(DecodeUTF8BytesToChar)( &n, str[0], str+1, NULL, &bytes );
	if (err)
	{
	#if 1 && defined(_DEBUG)
	fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
	#endif
	n = 0xFFFD; /* replacement char */
	}

	*ch = n;
	return bytes - 1;
	}

	/*
	 * Write c to buf as a UTF-8 byte sequence and return the position just
	 * past the bytes written.  When EncodeCharToUTF8Bytes reports an error,
	 * the three bytes of U+FFFD (replacement character) are stored instead.
	 */
	tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
	{
	    int written = 0;

	    if ( TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &written ) != 0 )
	    {
	#if 1 && defined(_DEBUG)
	        fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
	#endif
	        /* U+FFFD, already encoded as UTF-8 */
	        buf[0] = (byte) 0xEF;
	        buf[1] = (byte) 0xBF;
	        buf[2] = (byte) 0xBD;
	        written = 3;
	    }

	    return buf + written;
	}

	/* True when ucs4 does not exceed kMaxUTF16FromUCS4. */
	Bool TY_(IsValidUTF16FromUCS4)( tchar ucs4 )
	{
	    return !( ucs4 > kMaxUTF16FromUCS4 );
	}

	/* True when ch lies in kUTF16HighSurrogateBegin..kUTF16HighSurrogateEnd
	 * (0xDC00..0xDFFF as defined in this file).
	 */
	Bool TY_(IsHighSurrogate)( tchar ch )
	{
	    return !( ch < kUTF16HighSurrogateBegin || ch > kUTF16HighSurrogateEnd );
	}
	/* True when ch lies in kUTF16LowSurrogateBegin..kUTF16LowSurrogateEnd
	 * (0xD800..0xDBFF as defined in this file).
	 */
	Bool TY_(IsLowSurrogate)( tchar ch )
	{
	    return !( ch < kUTF16LowSurrogateBegin || ch > kUTF16LowSurrogateEnd );
	}

	/*
	 * Rebuild the character encoded by a surrogate pair.  `low` (per this
	 * file's naming, the 0xD800..0xDBFF unit) supplies the upper ten bits and
	 * `high` (0xDC00..0xDFFF) the lower ten; 0x10000 is added to the result.
	 * Both arguments are only checked by assert().
	 */
	tchar TY_(CombineSurrogatePair)( tchar high, tchar low )
	{
	    tchar upperPart;

	    assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) );

	    upperPart = (low - kUTF16LowSurrogateBegin) * 0x400;
	    return ( upperPart + high - kUTF16HighSurrogateBegin + 0x10000 );
	}

	/*
	 * Split utf16 into a surrogate pair.  Succeeds only when
	 * IsValidCombinedChar accepts utf16 and both output pointers are non-NULL;
	 * otherwise nothing is written.  *low receives the unit based at
	 * kUTF16LowSurrogateBegin (quotient by 0x400), *high the unit based at
	 * kUTF16HighSurrogateBegin (remainder).  Returns the success flag.
	 */
	Bool TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high )
	{
	    tchar offset;
	    Bool status = ( TY_(IsValidCombinedChar)( utf16 ) && high && low );

	    if ( !status )
	        return status;

	    offset = utf16 - kUTF16SurrogatesBegin;
	    *low = offset / 0x400 + kUTF16LowSurrogateBegin;
	    *high = offset % 0x400 + kUTF16HighSurrogateBegin;
	    return status;
	}

	/*
	 * True when ch is at or above kUTF16SurrogatesBegin and its low sixteen
	 * bits are neither 0xFFFE nor 0xFFFF.  No upper bound is applied here.
	 */
	Bool TY_(IsValidCombinedChar)( tchar ch )
	{
	    const tchar low16 = ch & 0x0000FFFF;

	    return ( !(ch < kUTF16SurrogatesBegin) &&
	             low16 != 0x0000FFFE &&
	             low16 != 0x0000FFFF );
	}

	/* True when ch is at or above kUTF16SurrogatesBegin (0x10000). */
	Bool TY_(IsCombinedChar)( tchar ch )
	{
	    return !( ch < kUTF16SurrogatesBegin );
	}

	/*
	* local variables:
	* mode: c
	* indent-tabs-mode: nil
	* c-basic-offset: 4
	* eval: (c-set-offset 'substatement-open 0)
	* end:
	*/