| /* streamio.c -- handles character stream I/O |
| |
| (c) 1998-2008 (W3C) MIT, ERCIM, Keio University |
| See tidy.h for the copyright notice. |
| |
| Wrapper around Tidy input source and output sink |
| that calls appropriate interfaces, and applies |
| necessary char encoding transformations: to/from |
| ISO-10646 and/or UTF-8. |
| |
| */ |
| |
| #include <stdio.h> |
| #include <errno.h> |
| |
| #include "streamio.h" |
| #include "tidy-int.h" |
| #include "lexer.h" |
| #include "message.h" |
| #include "utf8.h" |
| #include "tmbstr.h" |
| |
| #ifdef TIDY_WIN32_MLANG_SUPPORT |
| #include "win32tc.h" |
| #endif |
| |
| /************************ |
| ** Forward Declarations |
| ************************/ |
| |
| static uint ReadCharFromStream( StreamIn* in ); |
| |
| static uint ReadByte( StreamIn* in ); |
| static void UngetByte( StreamIn* in, uint byteValue ); |
| |
| static void PutByte( uint byteValue, StreamOut* out ); |
| |
| static void EncodeWin1252( uint c, StreamOut* out ); |
| static void EncodeMacRoman( uint c, StreamOut* out ); |
| static void EncodeIbm858( uint c, StreamOut* out ); |
| static void EncodeLatin0( uint c, StreamOut* out ); |
| |
| static uint DecodeIbm850(uint c); |
| static uint DecodeLatin0(uint c); |
| |
| static uint PopChar( StreamIn *in ); |
| |
| /****************************** |
| ** Static (duration) Globals |
| ******************************/ |
| |
| static StreamOut stderrStreamOut = |
| { |
| ASCII, |
| FSM_ASCII, |
| DEFAULT_NL_CONFIG, |
| #ifdef TIDY_WIN32_MLANG_SUPPORT |
| NULL, |
| #endif |
| FileIO, |
| { 0, TY_(filesink_putByte) } |
| }; |
| |
| static StreamOut stdoutStreamOut = |
| { |
| ASCII, |
| FSM_ASCII, |
| DEFAULT_NL_CONFIG, |
| #ifdef TIDY_WIN32_MLANG_SUPPORT |
| NULL, |
| #endif |
| FileIO, |
| { 0, TY_(filesink_putByte) } |
| }; |
| |
| StreamOut* TY_(StdErrOutput)(void) |
| { |
| if ( stderrStreamOut.sink.sinkData == 0 ) |
| stderrStreamOut.sink.sinkData = stderr; |
| return &stderrStreamOut; |
| } |
| |
| #if 0 |
| StreamOut* TY_(StdOutOutput)(void) |
| { |
| if ( stdoutStreamOut.sink.sinkData == 0 ) |
| stdoutStreamOut.sink.sinkData = stdout; |
| return &stdoutStreamOut; |
| } |
| #endif |
| |
| void TY_(ReleaseStreamOut)( TidyDocImpl *doc, StreamOut* out ) |
| { |
| if ( out && out != &stderrStreamOut && out != &stdoutStreamOut ) |
| { |
| if ( out->iotype == FileIO ) |
| fclose( (FILE*) out->sink.sinkData ); |
| TidyDocFree( doc, out ); |
| } |
| } |
| |
| /************************ |
| ** Source |
| ************************/ |
| |
| static void InitLastPos( StreamIn *in ); |
| |
| StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding ) |
| { |
| StreamIn *in = (StreamIn*) TidyDocAlloc( doc, sizeof(StreamIn) ); |
| |
| TidyClearMemory( in, sizeof(StreamIn) ); |
| in->curline = 1; |
| in->curcol = 1; |
| in->encoding = encoding; |
| in->state = FSM_ASCII; |
| in->doc = doc; |
| in->bufsize = CHARBUF_SIZE; |
| in->allocator = doc->allocator; |
| in->charbuf = (tchar*)TidyDocAlloc(doc, sizeof(tchar) * in->bufsize); |
| InitLastPos( in ); |
| #ifdef TIDY_STORE_ORIGINAL_TEXT |
| in->otextbuf = NULL; |
| in->otextlen = 0; |
| in->otextsize = 0; |
| #endif |
| return in; |
| } |
| |
| void TY_(freeStreamIn)(StreamIn* in) |
| { |
| #ifdef TIDY_STORE_ORIGINAL_TEXT |
| if (in->otextbuf) |
| TidyFree(in->allocator, in->otextbuf); |
| #endif |
| TidyFree(in->allocator, in->charbuf); |
| TidyFree(in->allocator, in); |
| } |
| |
| StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE *fp, int encoding ) |
| { |
| StreamIn *in = TY_(initStreamIn)( doc, encoding ); |
| if ( TY_(initFileSource)( doc->allocator, &in->source, fp ) != 0 ) |
| { |
| TY_(freeStreamIn)( in ); |
| return NULL; |
| } |
| in->iotype = FileIO; |
| return in; |
| } |
| |
| StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* buf, int encoding ) |
| { |
| StreamIn *in = TY_(initStreamIn)( doc, encoding ); |
| tidyInitInputBuffer( &in->source, buf ); |
| in->iotype = BufferIO; |
| return in; |
| } |
| |
| StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding ) |
| { |
| StreamIn *in = TY_(initStreamIn)( doc, encoding ); |
| memcpy( &in->source, source, sizeof(TidyInputSource) ); |
| in->iotype = UserIO; |
| return in; |
| } |
| |
| int TY_(ReadBOMEncoding)(StreamIn *in) |
| { |
| uint c, c1; |
| #if SUPPORT_UTF16_ENCODINGS |
| uint bom; |
| #endif |
| |
| c = ReadByte(in); |
| if (c == EndOfStream) |
| return -1; |
| |
| c1 = ReadByte( in ); |
| if (c1 == EndOfStream) |
| { |
| UngetByte(in, c); |
| return -1; |
| } |
| |
| /* todo: dont warn about mismatch for auto input encoding */ |
| /* todo: let the user override the encoding found here */ |
| |
| #if SUPPORT_UTF16_ENCODINGS |
| bom = (c << 8) + c1; |
| |
| if ( bom == UNICODE_BOM_BE ) |
| { |
| /* big-endian UTF-16 */ |
| if ( in->encoding != UTF16 && in->encoding != UTF16BE ) |
| TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16BE); |
| |
| return UTF16BE; /* return decoded BOM */ |
| } |
| else if (bom == UNICODE_BOM_LE) |
| { |
| /* little-endian UTF-16 */ |
| if (in->encoding != UTF16 && in->encoding != UTF16LE) |
| TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF16LE); |
| |
| return UTF16LE; /* return decoded BOM */ |
| } |
| else |
| #endif /* SUPPORT_UTF16_ENCODINGS */ |
| { |
| uint c2 = ReadByte(in); |
| |
| if (c2 == EndOfStream) |
| { |
| UngetByte(in, c1); |
| UngetByte(in, c); |
| return -1; |
| } |
| |
| if (((c << 16) + (c1 << 8) + c2) == UNICODE_BOM_UTF8) |
| { |
| /* UTF-8 */ |
| if (in->encoding != UTF8) |
| TY_(ReportEncodingWarning)(in->doc, ENCODING_MISMATCH, UTF8); |
| |
| return UTF8; |
| } |
| else |
| UngetByte( in, c2 ); |
| } |
| |
| UngetByte(in, c1); |
| UngetByte(in, c); |
| |
| return -1; |
| } |
| |
| #ifdef TIDY_STORE_ORIGINAL_TEXT |
| void TY_(AddByteToOriginalText)(StreamIn *in, tmbchar c) |
| { |
| if (in->otextlen + 1 >= in->otextsize) |
| { |
| size_t size = in->otextsize ? 1 : 2; |
| in->otextbuf = TidyRealloc(in->allocator, in->otextbuf, in->otextsize + size); |
| in->otextsize += size; |
| } |
| in->otextbuf[in->otextlen++] = c; |
| in->otextbuf[in->otextlen ] = 0; |
| } |
| |
| void TY_(AddCharToOriginalText)(StreamIn *in, tchar c) |
| { |
| int i, err, count = 0; |
| tmbchar buf[10] = {0}; |
| |
| err = TY_(EncodeCharToUTF8Bytes)(c, buf, NULL, &count); |
| |
| if (err) |
| { |
| /* replacement character 0xFFFD encoded as UTF-8 */ |
| buf[0] = (byte) 0xEF; |
| buf[1] = (byte) 0xBF; |
| buf[2] = (byte) 0xBD; |
| count = 3; |
| } |
| |
| for (i = 0; i < count; ++i) |
| TY_(AddByteToOriginalText)(in, buf[i]); |
| } |
| #endif |
| |
| static void InitLastPos( StreamIn *in ) |
| { |
| in->curlastpos = 0; |
| in->firstlastpos = 0; |
| } |
| |
| static void PopLastPos( StreamIn *in ) |
| { |
| in->curlastpos = (in->curlastpos+1)%LASTPOS_SIZE; |
| if ( in->curlastpos == in->firstlastpos ) |
| in->firstlastpos = (in->firstlastpos+1)%LASTPOS_SIZE; |
| } |
| |
| static void SaveLastPos( StreamIn *in ) |
| { |
| PopLastPos( in ); |
| in->lastcols[in->curlastpos] = in->curcol; |
| } |
| |
| static void RestoreLastPos( StreamIn *in ) |
| { |
| if ( in->firstlastpos == in->curlastpos ) |
| in->curcol = 0; |
| else |
| { |
| in->curcol = in->lastcols[in->curlastpos]; |
| if ( in->curlastpos == 0 ) |
| in->curlastpos = LASTPOS_SIZE; |
| in->curlastpos--; |
| } |
| } |
| |
| uint TY_(ReadChar)( StreamIn *in ) |
| { |
| uint c = EndOfStream; |
| uint tabsize = cfg( in->doc, TidyTabSize ); |
| #ifdef TIDY_STORE_ORIGINAL_TEXT |
| Bool added = no; |
| #endif |
| |
| if ( in->pushed ) |
| return PopChar( in ); |
| |
| SaveLastPos( in ); |
| |
| if ( in->tabs > 0 ) |
| { |
| in->curcol++; |
| in->tabs--; |
| return ' '; |
| } |
| |
| for (;;) |
| { |
| c = ReadCharFromStream(in); |
| |
| if ( EndOfStream == c ) |
| return EndOfStream; |
| |
| if (c == '\n') |
| { |
| #ifdef TIDY_STORE_ORIGINAL_TEXT |
| added = yes; |
| TY_(AddCharToOriginalText)(in, (tchar)c); |
| #endif |
| in->curcol = 1; |
| in->curline++; |
| break; |
| } |
| |
| if (c == '\t') |
| { |
| #ifdef TIDY_STORE_ORIGINAL_TEXT |
| added = yes; |
| TY_(AddCharToOriginalText)(in, (tchar)c); |
| #endif |
| in->tabs = tabsize > 0 ? |
| tabsize - ((in->curcol - 1) % tabsize) - 1 |
| : 0; |
| in->curcol++; |
| c = ' '; |
| break; |
| } |
| |
| /* #427663 - map '\r' to '\n' - Andy Quick 11 Aug 00 */ |
| if (c == '\r') |
| { |
| #ifdef TIDY_STORE_ORIGINAL_TEXT |
| added = yes; |
| TY_(AddCharToOriginalText)(in, (tchar)c); |
| #endif |
| c = ReadCharFromStream(in); |
| if (c != '\n') |
| { |
| TY_(UngetChar)( c, in ); |
| c = '\n'; |
| } |
| else |
| { |
| #ifdef TIDY_STORE_ORIGINAL_TEXT |
| TY_(AddCharToOriginalText)(in, (tchar)c); |
| #endif |
| } |
| in->curcol = 1; |
| in->curline++; |
| break; |
| } |
| |
| #ifndef NO_NATIVE_ISO2022_SUPPORT |
| /* strip control characters, except for Esc */ |
| if (c == '\033') |
| break; |
| #endif |
| |
| /* Form Feed is allowed in HTML */ |
| if ( c == '\015' && !cfgBool(in->doc, TidyXmlTags) ) |
| break; |
| |
| if ( c < 32 ) |
| continue; /* discard control char */ |
| |
| /* watch out for chars that have already been decoded such as */ |
| /* IS02022, UTF-8 etc, that don't require further decoding */ |
| |
| if ( |
| in->encoding == RAW |
| #ifndef NO_NATIVE_ISO2022_SUPPORT |
| || in->encoding == ISO2022 |
| #endif |
| || in->encoding == UTF8 |
| |
| #if SUPPORT_ASIAN_ENCODINGS |
| || in->encoding == SHIFTJIS /* #431953 - RJ */ |
| || in->encoding == BIG5 /* #431953 - RJ */ |
| #endif |
| ) |
| { |
| in->curcol++; |
| break; |
| } |
| |
| #if SUPPORT_UTF16_ENCODINGS |
| /* handle surrogate pairs */ |
| if ( in->encoding == UTF16LE || |
| in->encoding == UTF16 || |
| in->encoding == UTF16BE ) |
| { |
| if ( !TY_(IsValidUTF16FromUCS4)(c) ) |
| { |
| /* invalid UTF-16 value */ |
| TY_(ReportEncodingError)(in->doc, INVALID_UTF16, c, yes); |
| c = 0; |
| } |
| else if ( TY_(IsLowSurrogate)(c) ) |
| { |
| uint n = c; |
| uint m = ReadCharFromStream( in ); |
| if ( m == EndOfStream ) |
| return EndOfStream; |
| |
| c = 0; |
| if ( TY_(IsHighSurrogate)(m) ) |
| { |
| n = TY_(CombineSurrogatePair)( m, n ); |
| if ( TY_(IsValidCombinedChar)(n) ) |
| c = n; |
| } |
| /* not a valid pair */ |
| if ( 0 == c ) |
| TY_(ReportEncodingError)( in->doc, INVALID_UTF16, c, yes ); |
| } |
| } |
| #endif |
| |
| /* Do first: acts on range 128 - 255 */ |
| switch ( in->encoding ) |
| { |
| case MACROMAN: |
| c = TY_(DecodeMacRoman)( c ); |
| break; |
| case IBM858: |
| c = DecodeIbm850( c ); |
| break; |
| case LATIN0: |
| c = DecodeLatin0( c ); |
| break; |
| } |
| |
| /* produced e.g. as a side-effect of smart quotes in Word */ |
| /* but can't happen if using MACROMAN encoding */ |
| if ( 127 < c && c < 160 ) |
| { |
| uint c1 = 0, replMode = DISCARDED_CHAR; |
| Bool isVendorChar = ( in->encoding == WIN1252 || |
| in->encoding == MACROMAN ); |
| Bool isWinChar = ( in->encoding == WIN1252 || |
| TY_(ReplacementCharEncoding) == WIN1252 ); |
| Bool isMacChar = ( in->encoding == MACROMAN || |
| TY_(ReplacementCharEncoding) == MACROMAN ); |
| |
| /* set error position just before offending character */ |
| if (in->doc->lexer) |
| { |
| in->doc->lexer->lines = in->curline; |
| in->doc->lexer->columns = in->curcol; |
| } |
| |
| if ( isWinChar ) |
| c1 = TY_(DecodeWin1252)( c ); |
| else if ( isMacChar ) |
| c1 = TY_(DecodeMacRoman)( c ); |
| if ( c1 ) |
| replMode = REPLACED_CHAR; |
| |
| if ( c1 == 0 && isVendorChar ) |
| TY_(ReportEncodingError)(in->doc, VENDOR_SPECIFIC_CHARS, c, replMode == DISCARDED_CHAR); |
| else if ( ! isVendorChar ) |
| TY_(ReportEncodingError)(in->doc, INVALID_SGML_CHARS, c, replMode == DISCARDED_CHAR); |
| |
| c = c1; |
| } |
| |
| if ( c == 0 ) |
| continue; /* illegal char is discarded */ |
| |
| in->curcol++; |
| break; |
| } |
| |
| #ifdef TIDY_STORE_ORIGINAL_TEXT |
| if (!added) |
| TY_(AddCharToOriginalText)(in, (tchar)c); |
| #endif |
| |
| return c; |
| } |
| |
| static uint PopChar( StreamIn *in ) |
| { |
| uint c = EndOfStream; |
| if ( in->pushed ) |
| { |
| assert( in->bufpos > 0 ); |
| c = in->charbuf[ --in->bufpos ]; |
| if ( in->bufpos == 0 ) |
| in->pushed = no; |
| |
| if ( c == '\n' ) |
| { |
| in->curcol = 1; |
| in->curline++; |
| PopLastPos( in ); |
| return c; |
| } |
| in->curcol++; |
| PopLastPos( in ); |
| } |
| return c; |
| } |
| |
| void TY_(UngetChar)( uint c, StreamIn *in ) |
| { |
| if (c == EndOfStream) |
| { |
| /* fprintf(stderr, "Attempt to UngetChar EOF\n"); */ |
| return; |
| } |
| |
| in->pushed = yes; |
| |
| if (in->bufpos + 1 >= in->bufsize) |
| in->charbuf = (tchar*)TidyRealloc(in->allocator, in->charbuf, sizeof(tchar) * ++(in->bufsize)); |
| |
| in->charbuf[(in->bufpos)++] = c; |
| |
| if (c == '\n') |
| --(in->curline); |
| |
| RestoreLastPos( in ); |
| } |
| |
| |
| |
| /************************ |
| ** Sink |
| ************************/ |
| |
| static StreamOut* initStreamOut( TidyDocImpl* doc, int encoding, uint nl ) |
| { |
| StreamOut* out = (StreamOut*) TidyDocAlloc( doc, sizeof(StreamOut) ); |
| TidyClearMemory( out, sizeof(StreamOut) ); |
| out->encoding = encoding; |
| out->state = FSM_ASCII; |
| out->nl = nl; |
| return out; |
| } |
| |
| StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint nl ) |
| { |
| StreamOut* out = initStreamOut( doc, encoding, nl ); |
| TY_(initFileSink)( &out->sink, fp ); |
| out->iotype = FileIO; |
| return out; |
| } |
| StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint nl ) |
| { |
| StreamOut* out = initStreamOut( doc, encoding, nl ); |
| tidyInitOutputBuffer( &out->sink, buf ); |
| out->iotype = BufferIO; |
| return out; |
| } |
| StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint nl ) |
| { |
| StreamOut* out = initStreamOut( doc, encoding, nl ); |
| memcpy( &out->sink, sink, sizeof(TidyOutputSink) ); |
| out->iotype = UserIO; |
| return out; |
| } |
| |
| void TY_(WriteChar)( uint c, StreamOut* out ) |
| { |
| /* Translate outgoing newlines */ |
| if ( LF == c ) |
| { |
| if ( out->nl == TidyCRLF ) |
| TY_(WriteChar)( CR, out ); |
| else if ( out->nl == TidyCR ) |
| c = CR; |
| } |
| |
| if (out->encoding == MACROMAN) |
| { |
| EncodeMacRoman( c, out ); |
| } |
| else if (out->encoding == WIN1252) |
| { |
| EncodeWin1252( c, out ); |
| } |
| else if (out->encoding == IBM858) |
| { |
| EncodeIbm858( c, out ); |
| } |
| else if (out->encoding == LATIN0) |
| { |
| EncodeLatin0( c, out ); |
| } |
| |
| else if (out->encoding == UTF8) |
| { |
| int count = 0; |
| |
| TY_(EncodeCharToUTF8Bytes)( c, NULL, &out->sink, &count ); |
| if (count <= 0) |
| { |
| /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF8 | REPLACED_CHAR, c); */ |
| /* replacement char 0xFFFD encoded as UTF-8 */ |
| PutByte(0xEF, out); PutByte(0xBF, out); PutByte(0xBF, out); |
| } |
| } |
| #ifndef NO_NATIVE_ISO2022_SUPPORT |
| else if (out->encoding == ISO2022) |
| { |
| if (c == 0x1b) /* ESC */ |
| out->state = FSM_ESC; |
| else |
| { |
| switch (out->state) |
| { |
| case FSM_ESC: |
| if (c == '$') |
| out->state = FSM_ESCD; |
| else if (c == '(') |
| out->state = FSM_ESCP; |
| else |
| out->state = FSM_ASCII; |
| break; |
| |
| case FSM_ESCD: |
| if (c == '(') |
| out->state = FSM_ESCDP; |
| else |
| out->state = FSM_NONASCII; |
| break; |
| |
| case FSM_ESCDP: |
| out->state = FSM_NONASCII; |
| break; |
| |
| case FSM_ESCP: |
| out->state = FSM_ASCII; |
| break; |
| |
| case FSM_NONASCII: |
| c &= 0x7F; |
| break; |
| |
| case FSM_ASCII: |
| break; |
| } |
| } |
| |
| PutByte(c, out); |
| } |
| #endif /* NO_NATIVE_ISO2022_SUPPORT */ |
| |
| #if SUPPORT_UTF16_ENCODINGS |
| else if ( out->encoding == UTF16LE || |
| out->encoding == UTF16BE || |
| out->encoding == UTF16 ) |
| { |
| int i, numChars = 1; |
| uint theChars[2]; |
| |
| if ( !TY_(IsValidUTF16FromUCS4)(c) ) |
| { |
| /* invalid UTF-16 value */ |
| /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */ |
| c = 0; |
| numChars = 0; |
| } |
| else if ( TY_(IsCombinedChar)(c) ) |
| { |
| /* output both, unless something goes wrong */ |
| numChars = 2; |
| if ( !TY_(SplitSurrogatePair)(c, &theChars[0], &theChars[1]) ) |
| { |
| /* TY_(ReportEncodingError)(in->lexer, INVALID_UTF16 | DISCARDED_CHAR, c); */ |
| c = 0; |
| numChars = 0; |
| } |
| } |
| else |
| { |
| /* just put the char out */ |
| theChars[0] = c; |
| } |
| |
| for (i = 0; i < numChars; i++) |
| { |
| c = theChars[i]; |
| |
| if (out->encoding == UTF16LE) |
| { |
| uint ch = c & 0xFF; PutByte(ch, out); |
| ch = (c >> 8) & 0xFF; PutByte(ch, out); |
| } |
| |
| else if (out->encoding == UTF16BE || out->encoding == UTF16) |
| { |
| uint ch = (c >> 8) & 0xFF; PutByte(ch, out); |
| ch = c & 0xFF; PutByte(ch, out); |
| } |
| } |
| } |
| #endif |
| |
| #if SUPPORT_ASIAN_ENCODINGS |
| else if (out->encoding == BIG5 || out->encoding == SHIFTJIS) |
| { |
| if (c < 128) |
| PutByte(c, out); |
| else |
| { |
| uint ch = (c >> 8) & 0xFF; PutByte(ch, out); |
| ch = c & 0xFF; PutByte(ch, out); |
| } |
| } |
| #endif |
| |
| else |
| PutByte( c, out ); |
| } |
| |
| |
| |
| /**************************** |
| ** Miscellaneous / Helpers |
| ****************************/ |
| |
| /* char encoding used when replacing illegal SGML chars, |
| ** regardless of specified encoding. Set at compile time |
| ** to either Windows or Mac. |
| */ |
| const int TY_(ReplacementCharEncoding) = DFLT_REPL_CHARENC; |
| |
| |
| /* Mapping for Windows Western character set CP 1252 |
| ** (chars 128-159/U+0080-U+009F) to Unicode. |
| */ |
| static const uint Win2Unicode[32] = |
| { |
| 0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, |
| 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000, |
| 0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, |
| 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178 |
| }; |
| |
| /* Function for conversion from Windows-1252 to Unicode */ |
| uint TY_(DecodeWin1252)(uint c) |
| { |
| if (127 < c && c < 160) |
| c = Win2Unicode[c - 128]; |
| |
| return c; |
| } |
| |
| static void EncodeWin1252( uint c, StreamOut* out ) |
| { |
| if (c < 128 || (c > 159 && c < 256)) |
| PutByte(c, out); |
| else |
| { |
| int i; |
| |
| for (i = 128; i < 160; i++) |
| if (Win2Unicode[i - 128] == c) |
| { |
| PutByte(i, out); |
| break; |
| } |
| } |
| } |
| |
| /* |
| John Love-Jensen contributed this table for mapping MacRoman |
| character set to Unicode |
| */ |
| |
| /* modified to only need chars 128-255/U+0080-U+00FF - Terry Teague 19 Aug 01 */ |
| static const uint Mac2Unicode[128] = |
| { |
| /* x7F = DEL */ |
| |
| 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, |
| 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8, |
| |
| 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, |
| 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC, |
| |
| 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, |
| 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8, |
| |
| 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, |
| /* =BD U+2126 OHM SIGN */ |
| 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8, |
| |
| 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, |
| 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153, |
| |
| 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, |
| /* =DB U+00A4 CURRENCY SIGN */ |
| 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02, |
| |
| 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, |
| 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4, |
| /* xF0 = Apple Logo */ |
| /* =F0 U+2665 BLACK HEART SUIT */ |
| 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, |
| 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7 |
| }; |
| |
| /* Function to convert from MacRoman to Unicode */ |
| uint TY_(DecodeMacRoman)(uint c) |
| { |
| if (127 < c) |
| c = Mac2Unicode[c - 128]; |
| return c; |
| } |
| |
| static void EncodeMacRoman( uint c, StreamOut* out ) |
| { |
| if (c < 128) |
| PutByte(c, out); |
| else |
| { |
| /* For mac users, map Unicode back to MacRoman. */ |
| int i; |
| for (i = 128; i < 256; i++) |
| { |
| if (Mac2Unicode[i - 128] == c) |
| { |
| PutByte(i, out); |
| break; |
| } |
| } |
| } |
| } |
| |
| /* Mapping for OS/2 Western character set CP 850 |
| ** (chars 128-255) to Unicode. |
| */ |
| static const uint IBM2Unicode[128] = |
| { |
| 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7, |
| 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5, |
| 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9, |
| 0x00FF, 0x00D6, 0x00DC, 0x00F8, 0x00A3, 0x00D8, 0x00D7, 0x0192, |
| 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA, |
| 0x00BF, 0x00AE, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB, |
| 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00C1, 0x00C2, 0x00C0, |
| 0x00A9, 0x2563, 0x2551, 0x2557, 0x255D, 0x00A2, 0x00A5, 0x2510, |
| 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x00E3, 0x00C3, |
| 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, |
| 0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x20AC, 0x00cd, 0x00ce, |
| 0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580, |
| 0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe, |
| 0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4, |
| 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8, |
| 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0 |
| }; |
| |
| /* Function for conversion from OS/2-850 to Unicode */ |
| static uint DecodeIbm850(uint c) |
| { |
| if (127 < c && c < 256) |
| c = IBM2Unicode[c - 128]; |
| |
| return c; |
| } |
| |
| /* For OS/2,Java users, map Unicode back to IBM858 (IBM850+Euro). */ |
| static void EncodeIbm858( uint c, StreamOut* out ) |
| { |
| if (c < 128) |
| PutByte(c, out); |
| else |
| { |
| int i; |
| for (i = 128; i < 256; i++) |
| { |
| if (IBM2Unicode[i - 128] == c) |
| { |
| PutByte(i, out); |
| break; |
| } |
| } |
| } |
| } |
| |
| |
| /* Convert from Latin0 (aka Latin9, ISO-8859-15) to Unicode */ |
| static uint DecodeLatin0(uint c) |
| { |
| if (159 < c && c < 191) |
| { |
| switch (c) |
| { |
| case 0xA4: c = 0x20AC; break; |
| case 0xA6: c = 0x0160; break; |
| case 0xA8: c = 0x0161; break; |
| case 0xB4: c = 0x017D; break; |
| case 0xB8: c = 0x017E; break; |
| case 0xBC: c = 0x0152; break; |
| case 0xBD: c = 0x0153; break; |
| case 0xBE: c = 0x0178; break; |
| } |
| } |
| return c; |
| } |
| |
| /* Map Unicode back to ISO-8859-15. */ |
| static void EncodeLatin0( uint c, StreamOut* out ) |
| { |
| switch (c) |
| { |
| case 0x20AC: c = 0xA4; break; |
| case 0x0160: c = 0xA6; break; |
| case 0x0161: c = 0xA8; break; |
| case 0x017D: c = 0xB4; break; |
| case 0x017E: c = 0xB8; break; |
| case 0x0152: c = 0xBC; break; |
| case 0x0153: c = 0xBD; break; |
| case 0x0178: c = 0xBE; break; |
| } |
| PutByte(c, out); |
| } |
| |
| /* |
| Table to map symbol font characters to Unicode; undefined |
| characters are mapped to 0x0000 and characters without any |
| Unicode equivalent are mapped to '?'. Is this appropriate? |
| */ |
| |
| #if 0 |
| static const uint Symbol2Unicode[] = |
| { |
| 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, |
| 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, |
| |
| 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, |
| 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, |
| |
| 0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220D, |
| 0x0028, 0x0029, 0x2217, 0x002B, 0x002C, 0x2212, 0x002E, 0x002F, |
| |
| 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, |
| 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, |
| |
| 0x2245, 0x0391, 0x0392, 0x03A7, 0x0394, 0x0395, 0x03A6, 0x0393, |
| 0x0397, 0x0399, 0x03D1, 0x039A, 0x039B, 0x039C, 0x039D, 0x039F, |
| |
| 0x03A0, 0x0398, 0x03A1, 0x03A3, 0x03A4, 0x03A5, 0x03C2, 0x03A9, |
| 0x039E, 0x03A8, 0x0396, 0x005B, 0x2234, 0x005D, 0x22A5, 0x005F, |
| |
| 0x00AF, 0x03B1, 0x03B2, 0x03C7, 0x03B4, 0x03B5, 0x03C6, 0x03B3, |
| 0x03B7, 0x03B9, 0x03D5, 0x03BA, 0x03BB, 0x03BC, 0x03BD, 0x03BF, |
| |
| 0x03C0, 0x03B8, 0x03C1, 0x03C3, 0x03C4, 0x03C5, 0x03D6, 0x03C9, |
| 0x03BE, 0x03C8, 0x03B6, 0x007B, 0x007C, 0x007D, 0x223C, 0x003F, |
| |
| 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, |
| 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, |
| |
| 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, |
| 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, |
| |
| 0x00A0, 0x03D2, 0x2032, 0x2264, 0x2044, 0x221E, 0x0192, 0x2663, |
| 0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193, |
| |
| 0x00B0, 0x00B1, 0x2033, 0x2265, 0x00D7, 0x221D, 0x2202, 0x00B7, |
| 0x00F7, 0x2260, 0x2261, 0x2248, 0x2026, 0x003F, 0x003F, 0x21B5, |
| |
| 0x2135, 0x2111, 0x211C, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229, |
| 0x222A, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209, |
| |
| 0x2220, 0x2207, 0x00AE, 0x00A9, 0x2122, 0x220F, 0x221A, 0x22C5, |
| 0x00AC, 0x2227, 0x2228, 0x21D4, 0x21D0, 0x21D1, 0x21D2, 0x21D3, |
| |
| 0x25CA, 0x2329, 0x00AE, 0x00A9, 0x2122, 0x2211, 0x003F, 0x003F, |
| 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, |
| |
| 0x20AC, 0x232A, 0x222B, 0x2320, 0x003F, 0x2321, 0x003F, 0x003F, |
| 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F, 0x003F |
| }; |
| |
| /* Function to convert from Symbol Font chars to Unicode */ |
| uint DecodeSymbolFont(uint c) |
| { |
| if (c > 255) |
| return c; |
| |
| /* todo: add some error message */ |
| |
| return Symbol2Unicode[c]; |
| } |
| #endif |
| |
| |
| /* Facilitates user defined source by providing |
| ** an entry point to marshal pointers-to-functions. |
| ** Needed by .NET and possibly other language bindings. |
| */ |
| Bool TIDY_CALL tidyInitSource( TidyInputSource* source, |
| void* srcData, |
| TidyGetByteFunc gbFunc, |
| TidyUngetByteFunc ugbFunc, |
| TidyEOFFunc endFunc ) |
| { |
| Bool status = ( source && srcData && gbFunc && ugbFunc && endFunc ); |
| |
| if ( status ) |
| { |
| source->sourceData = srcData; |
| source->getByte = gbFunc; |
| source->ungetByte = ugbFunc; |
| source->eof = endFunc; |
| } |
| |
| return status; |
| } |
| |
| Bool TIDY_CALL tidyInitSink( TidyOutputSink* sink, |
| void* snkData, |
| TidyPutByteFunc pbFunc ) |
| { |
| Bool status = ( sink && snkData && pbFunc ); |
| if ( status ) |
| { |
| sink->sinkData = snkData; |
| sink->putByte = pbFunc; |
| } |
| return status; |
| } |
| |
| /* GetByte must return a byte value in a signed |
| ** integer so that a negative value can signal EOF |
| ** without interfering w/ 0-255 legitimate byte values. |
| */ |
| uint TIDY_CALL tidyGetByte( TidyInputSource* source ) |
| { |
| int bv = source->getByte( source->sourceData ); |
| return (uint) bv; |
| } |
| Bool TIDY_CALL tidyIsEOF( TidyInputSource* source ) |
| { |
| return source->eof( source->sourceData ); |
| } |
| void TIDY_CALL tidyUngetByte( TidyInputSource* source, uint ch ) |
| { |
| source->ungetByte( source->sourceData, (byte) ch ); |
| } |
| void TIDY_CALL tidyPutByte( TidyOutputSink* sink, uint ch ) |
| { |
| sink->putByte( sink->sinkData, (byte) ch ); |
| } |
| |
| static uint ReadByte( StreamIn* in ) |
| { |
| return tidyGetByte( &in->source ); |
| } |
| Bool TY_(IsEOF)( StreamIn* in ) |
| { |
| return tidyIsEOF( &in->source ); |
| } |
| static void UngetByte( StreamIn* in, uint byteValue ) |
| { |
| tidyUngetByte( &in->source, byteValue ); |
| } |
| static void PutByte( uint byteValue, StreamOut* out ) |
| { |
| tidyPutByte( &out->sink, byteValue ); |
| } |
| |
| #if 0 |
| static void UngetRawBytesToStream( StreamIn *in, byte* buf, int *count ) |
| { |
| int i; |
| |
| for (i = 0; i < *count; i++) |
| { |
| /* should never get here; testing for 0xFF, a valid char, is not a good idea */ |
| if ( in && TY_(IsEOF)(in) ) |
| { |
| /* fprintf(stderr,"Attempt to unget EOF in UngetRawBytesToStream\n"); */ |
| *count = -i; |
| return; |
| } |
| |
| in->source.ungetByte( in->source.sourceData, buf[i] ); |
| } |
| } |
| |
| /* |
| Read raw bytes from stream, return <= 0 if EOF; or if |
| "unget" is true, Unget the bytes to re-synchronize the input stream |
| Normally UTF-8 successor bytes are read using this routine. |
| */ |
| static void ReadRawBytesFromStream( StreamIn *in, byte* buf, int *count ) |
| { |
| int ix; |
| for ( ix=0; ix < *count; ++ix ) |
| { |
| if ( in->rawPushed ) |
| { |
| buf[ix] = in->rawBytebuf[ --in->rawBufpos ]; |
| if ( in->rawBufpos == 0 ) |
| in->rawPushed = no; |
| } |
| else |
| { |
| if ( in->source.eof(in->source.sourceData) ) |
| { |
| *count = -i; |
| break; |
| } |
| buf[ix] = in->source.getByte( in->source.sourceData ); |
| } |
| } |
| } |
| #endif /* 0 */ |
| |
| /* read char from stream */ |
| static uint ReadCharFromStream( StreamIn* in ) |
| { |
| uint c, n; |
| #ifdef TIDY_WIN32_MLANG_SUPPORT |
| uint bytesRead = 0; |
| #endif |
| |
| if ( TY_(IsEOF)(in) ) |
| return EndOfStream; |
| |
| c = ReadByte( in ); |
| |
| if (c == EndOfStream) |
| return c; |
| |
| #ifndef NO_NATIVE_ISO2022_SUPPORT |
| /* |
| A document in ISO-2022 based encoding uses some ESC sequences |
| called "designator" to switch character sets. The designators |
| defined and used in ISO-2022-JP are: |
| |
| "ESC" + "(" + ? for ISO646 variants |
| |
| "ESC" + "$" + ? and |
| "ESC" + "$" + "(" + ? for multibyte character sets |
| |
| Where ? stands for a single character used to indicate the |
| character set for multibyte characters. |
| |
| Tidy handles this by preserving the escape sequence and |
| setting the top bit of each byte for non-ascii chars. This |
| bit is then cleared on output. The input stream keeps track |
| of the state to determine when to set/clear the bit. |
| */ |
| |
| if (in->encoding == ISO2022) |
| { |
| if (c == 0x1b) /* ESC */ |
| { |
| in->state = FSM_ESC; |
| return c; |
| } |
| |
| switch (in->state) |
| { |
| case FSM_ESC: |
| if (c == '$') |
| in->state = FSM_ESCD; |
| else if (c == '(') |
| in->state = FSM_ESCP; |
| else |
| in->state = FSM_ASCII; |
| break; |
| |
| case FSM_ESCD: |
| if (c == '(') |
| in->state = FSM_ESCDP; |
| else |
| in->state = FSM_NONASCII; |
| break; |
| |
| case FSM_ESCDP: |
| in->state = FSM_NONASCII; |
| break; |
| |
| case FSM_ESCP: |
| in->state = FSM_ASCII; |
| break; |
| |
| case FSM_NONASCII: |
| c |= 0x80; |
| break; |
| |
| case FSM_ASCII: |
| break; |
| } |
| |
| return c; |
| } |
| #endif /* #ifndef NO_NATIVE_ISO2022_SUPPORT */ |
| |
| #if SUPPORT_UTF16_ENCODINGS |
| if ( in->encoding == UTF16LE ) |
| { |
| uint c1 = ReadByte( in ); |
| if ( EndOfStream == c1 ) |
| return EndOfStream; |
| n = (c1 << 8) + c; |
| return n; |
| } |
| |
| if ((in->encoding == UTF16) || (in->encoding == UTF16BE)) /* UTF-16 is big-endian by default */ |
| { |
| uint c1 = ReadByte( in ); |
| if ( EndOfStream == c1 ) |
| return EndOfStream; |
| n = (c << 8) + c1; |
| return n; |
| } |
| #endif |
| |
| if ( in->encoding == UTF8 ) |
| { |
| /* deal with UTF-8 encoded char */ |
| |
| int err, count = 0; |
| |
| /* first byte "c" is passed in separately */ |
| err = TY_(DecodeUTF8BytesToChar)( &n, c, NULL, &in->source, &count ); |
| if (!err && (n == (uint)EndOfStream) && (count == 1)) /* EOF */ |
| return EndOfStream; |
| else if (err) |
| { |
| /* set error position just before offending character */ |
| in->doc->lexer->lines = in->curline; |
| in->doc->lexer->columns = in->curcol; |
| |
| TY_(ReportEncodingError)(in->doc, INVALID_UTF8, n, no); |
| n = 0xFFFD; /* replacement char */ |
| } |
| |
| return n; |
| } |
| |
| #if SUPPORT_ASIAN_ENCODINGS |
| /* |
| This section is suitable for any "multibyte" variable-width |
| character encoding in which a one-byte code is less than |
| 128, and the first byte of a two-byte code is greater or |
| equal to 128. Note that Big5 and ShiftJIS fit into this |
| kind, even though their second byte may be less than 128 |
| */ |
| if ((in->encoding == BIG5) || (in->encoding == SHIFTJIS)) |
| { |
| if (c < 128) |
| return c; |
| else if ((in->encoding == SHIFTJIS) && (c >= 0xa1 && c <= 0xdf)) /* 461643 - fix suggested by Rick Cameron 14 Sep 01 */ |
| { |
| /* |
| Rick Cameron pointed out that for Shift_JIS, the values from |
| 0xa1 through 0xdf represent singe-byte characters |
| (U+FF61 to U+FF9F - half-shift Katakana) |
| */ |
| return c; |
| } |
| else |
| { |
| uint c1 = ReadByte( in ); |
| if ( EndOfStream == c1 ) |
| return EndOfStream; |
| n = (c << 8) + c1; |
| return n; |
| } |
| } |
| #endif |
| |
| #ifdef TIDY_WIN32_MLANG_SUPPORT |
| else if (in->encoding > WIN32MLANG) |
| { |
| assert( in->mlang != NULL ); |
| return TY_(Win32MLangGetChar)((byte)c, in, &bytesRead); |
| } |
| #endif |
| |
| else |
| n = c; |
| |
| return n; |
| } |
| |
| /* Output a Byte Order Mark if required */ |
| void TY_(outBOM)( StreamOut *out ) |
| { |
| if ( out->encoding == UTF8 |
| #if SUPPORT_UTF16_ENCODINGS |
| || out->encoding == UTF16LE |
| || out->encoding == UTF16BE |
| || out->encoding == UTF16 |
| #endif |
| ) |
| { |
| /* this will take care of encoding the BOM correctly */ |
| TY_(WriteChar)( UNICODE_BOM, out ); |
| } |
| } |
| |
| /* this is in intermediate fix for various problems in the */ |
| /* long term code and data in charsets.c should be used */ |
| static struct _enc2iana |
| { |
| uint id; |
| ctmbstr name; |
| ctmbstr tidyOptName; |
| } const enc2iana[] = |
| { |
| { ASCII, "us-ascii", "ascii" }, |
| { LATIN0, "iso-8859-15", "latin0" }, |
| { LATIN1, "iso-8859-1", "latin1" }, |
| { UTF8, "utf-8", "utf8" }, |
| { MACROMAN, "macintosh", "mac" }, |
| { WIN1252, "windows-1252", "win1252" }, |
| { IBM858, "ibm00858", "ibm858" }, |
| #if SUPPORT_UTF16_ENCODINGS |
| { UTF16LE, "utf-16", "utf16le" }, |
| { UTF16BE, "utf-16", "utf16be" }, |
| { UTF16, "utf-16", "utf16" }, |
| #endif |
| #if SUPPORT_ASIAN_ENCODINGS |
| { BIG5, "big5", "big5" }, |
| { SHIFTJIS, "shift_jis", "shiftjis"}, |
| #endif |
| #ifndef NO_NATIVE_ISO2022_SUPPORT |
| { ISO2022, NULL, "iso2022" }, |
| #endif |
| { RAW, NULL, "raw" } |
| }; |
| |
| ctmbstr TY_(GetEncodingNameFromTidyId)(uint id) |
| { |
| uint i; |
| |
| for (i = 0; enc2iana[i].name; ++i) |
| if (enc2iana[i].id == id) |
| return enc2iana[i].name; |
| |
| return NULL; |
| } |
| |
| ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id) |
| { |
| uint i; |
| |
| for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i) |
| if (enc2iana[i].id == id) |
| return enc2iana[i].tidyOptName; |
| |
| return NULL; |
| } |
| |
| int TY_(GetCharEncodingFromOptName)( ctmbstr charenc ) |
| { |
| uint i; |
| |
| for (i = 0; i < sizeof(enc2iana)/sizeof(enc2iana[0]); ++i) |
| if (TY_(tmbstrcasecmp)(charenc, enc2iana[i].tidyOptName) == 0 ) |
| return enc2iana[i].id; |
| |
| return -1; |
| } |
| |
| /* |
| * local variables: |
| * mode: c |
| * indent-tabs-mode: nil |
| * c-basic-offset: 4 |
| * eval: (c-set-offset 'substatement-open 0) |
| * end: |
| */ |