| /*************************************************************************** |
| * |
| * charmap.cpp |
| * |
| * $Id$ |
| * |
| *************************************************************************** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed |
| * with this work for additional information regarding copyright |
| * ownership. The ASF licenses this file to you under the Apache |
| * License, Version 2.0 (the "License"); you may not use this file |
| * except in compliance with the License. You may obtain a copy of |
| * the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
| * implied. See the License for the specific language governing |
| * permissions and limitations under the License. |
| * |
| * Copyright 2001-2008 Rogue Wave Software, Inc. |
| * |
| **************************************************************************/ |
| |
| #include <rw/_defs.h> |
| |
| // On Compaq Tru64 UNIX if included after assert.h, the definition of |
| // _XOPEN_SOURCE macro in assert.h selects a different declaration for |
| // iconv than the one used in comp test. |
| #ifndef _WIN32 |
| # ifndef _RWSTD_NO_ICONV |
| # include <iconv.h> |
| # endif |
| # include _RWSTD_CERRNO |
| #else |
| # include <windows.h> |
| #endif // _WIN32 |
| |
| #include <cassert> |
| #include <cctype> |
| #include <cerrno> // for errno |
| #include <climits> |
| #include <clocale> // for LC_CTYPE, setlocale() |
| #include <cstdio> |
| #include <cstdlib> |
| #include <cstring> // for strrchr(), strerror() |
| |
| #include <map> |
| #include <string> |
| |
| #include <vector> |
| #include <iostream> |
| #include <fstream> |
| |
| #include "aliases.h" |
| #include "scanner.h" |
| #include "charmap.h" |
| #include "loc_exception.h" |
| #include "diagnostic.h" |
| |
| // This value specifies the largest allowed symbolic name length |
| // If necessary this can be increased, but it is very doubtful that |
| // that would ever be necessary |
| #define MAX_SYM_NAME_LEN 256 |
| |
// this is the maximum size of a single byte of a character in the
// charmap file. According to POSIX this cannot be larger than 5
// because all bytes are in the format "\x%x", "\d%d" or "\%o" and
// the numeric values cannot be greater than 3 digits long
| #define MAX_BYTE_LEN 5 |
| |
| #ifndef _RWSTD_NO_ICONV |
| |
| static iconv_t |
| my_iconv_open (const char *to_codeset, const char *from_codeset) |
| { |
| typedef std::vector<std::string> StrVec; |
| |
| StrVec aliases [2]; |
| |
| const bool to_utf8 = !std::strcmp (to_codeset, "UTF-8"); |
| const bool from_utf8 = !to_utf8; |
| |
| // aliases [to_utf8].push_back (to_codeset); |
| // aliases [from_utf8].push_back (from_codeset); |
| |
| get_cname_aliases (to_codeset, aliases [to_utf8]); |
| get_cname_aliases (from_codeset, aliases [from_utf8]); |
| |
| typedef StrVec::iterator VecIter; |
| |
| std::string tried_names [2]; |
| |
| for (VecIter i = aliases [to_utf8].begin (); i != aliases [to_utf8].end (); |
| ++i) { |
| |
| for (VecIter j = aliases [from_utf8].begin (); |
| j != aliases [from_utf8].end (); ++j) { |
| |
| const char* const to_code = (*i).c_str (); |
| const char* const from_code = (*j).c_str (); |
| |
| const iconv_t ret = iconv_open (to_code, from_code); |
| |
| if (ret != iconv_t (-1)) |
| return ret; |
| |
| if (i == aliases [to_utf8].begin ()) { |
| |
| if (tried_names [from_utf8].size ()) { |
| tried_names [from_utf8] += ','; |
| tried_names [from_utf8] += ' '; |
| } |
| |
| tried_names [from_utf8] += '"'; |
| tried_names [from_utf8] += *j; |
| tried_names [from_utf8] += '"'; |
| } |
| } |
| |
| if (tried_names [to_utf8].size ()) { |
| tried_names [to_utf8] += ','; |
| tried_names [to_utf8] += ' '; |
| } |
| |
| tried_names [to_utf8] += '"'; |
| tried_names [to_utf8] += *i; |
| tried_names [to_utf8] += '"'; |
| } |
| |
| assert (0 != aliases [0].size ()); |
| assert (0 != aliases [1].size ()); |
| |
| issue_diag (W_ICONV, false, 0, |
| "iconv_open(\"%s\", \"%s\") failed; " |
| "tried { %s } and { %s }\n", |
| aliases [to_utf8][0].c_str (), |
| aliases [from_utf8][0].c_str (), |
| tried_names [to_utf8].c_str (), |
| tried_names [from_utf8].c_str ()); |
| |
| return iconv_t (-1); |
| } |
| |
| // open an iconv file descriptor to convert from the codeset to utf8 |
| iconv_t Charmap::open_iconv_to_utf8 () const |
| { |
| if (in_utf8_) |
| return 0; |
| |
| return my_iconv_open ("UTF-8", code_set_name_.c_str ()); |
| } |
| |
| # ifndef _RWSTD_NO_ISO_10646_WCHAR_T |
| |
| iconv_t Charmap::open_iconv_to_ext () |
| { |
| return my_iconv_open (code_set_name_.c_str (), "UTF-8"); |
| } |
| |
| # endif // _RWSTD_NO_ISO_10646_WCHAR_T |
| #endif // _RWSTD_NO_ICONV |
| |
| |
| // utf8_decode translates the UTF-8 encoded character (specified |
| // by the range [from, to) into an object of type wchar_t |
| // algorithm derived from RFC2279 |
| static wchar_t utf8_decode (const char* from, const char* to) |
| { |
| assert (from <= to); |
| |
| const unsigned char* const ch = |
| _RWSTD_REINTERPRET_CAST (const unsigned char*, from); |
| |
| const unsigned char* const ch_end = |
| _RWSTD_REINTERPRET_CAST (const unsigned char*, to); |
| |
| size_t num_bytes = 0; |
| |
| wchar_t ret = 0; |
| |
| // if the first character is below 0x80 then the value of *ch is the |
| // actual value of the character so return that value as a wchar_t |
| if (*ch < 0x80) |
| return wchar_t (*ch); |
| |
| // if *ch is between 0xc2 and 0xe0 there are 2 bytes in the multi-byte |
| // character |
| if (*ch >= 0xc2 && *ch < 0xe0) { |
| ret = (*ch & 0x1f); |
| num_bytes = 2; |
| } |
| |
| // if *ch is between 0xe0 and 0xf0 there are 3 bytes in the multi-byte |
| // character |
| else if (*ch >= 0xe0 && *ch < 0xf0) { |
| ret = *ch & 0x0f; |
| num_bytes = 3; |
| } |
| else if (*ch >= 0xf0 && *ch < 0xf8) { |
| ret = *ch & 0x07; |
| num_bytes = 4; |
| } |
| else if (*ch >= 0xf8 && *ch < 0xfc) { |
| ret = *ch & 0x03; |
| num_bytes = 5; |
| } |
| else if (*ch >= 0xfc && *ch < 0xfe) { |
| ret = *ch & 0x01; |
| num_bytes = 6; |
| } |
| else { |
| issue_diag (E_MBCHAR, true, 0, |
| "illegal multibyte prefix '\\x%02x' in character " |
| "map file\n", *ch); |
| } |
| |
| if (ch_end < ch + num_bytes - 1) { |
| // the input doesn't have enough characters |
| issue_diag (E_MBCHAR, true, 0, |
| "incomplete multibyte character in character " |
| "map file: expecting %u bytes, found %u\n", |
| num_bytes, ch_end - ch); |
| } |
| |
| // for each byte in the character extract the useful data by shifting |
| // and bit or it into the wchar_t |
| for (size_t i = 1; i < num_bytes; ++i) |
| ret = (ret << 6) | (ch [i] & 0x3f); |
| |
| return ret; |
| } |
| |
| |
| // count the number of bytes in a multibyte sequence denoted |
| // by the argument by counting the number of escape characters |
| std::size_t Charmap::mbcharlen (const std::string &str) const |
| { |
| std::size_t count = 1; |
| |
| const char escape = scanner_.escape_char (); |
| |
| for (std::size_t idx = 0; ; ++idx, ++count) { |
| idx = str.find (escape, idx); |
| |
| if (std::string::npos == idx) |
| break; |
| } |
| |
| return count; |
| } |
| |
| |
| /**************************************************************************/ |
| |
// symbolic names of the members of the portable character set;
// the position of each entry corresponds to the character code
// given in the comment next to it, and a null entry denotes a code
// for which this table provides no symbolic name (null entries
// are skipped by verify_portable_charset())
const char* const Charmap::
portable_charset[] = {
    /* 0x00     */ "<NUL>",
    /* 0x01 SOH */ 0,
    /* 0x02 STX */ 0,
    /* 0x03 ETX */ 0,
    /* 0x04 EOT */ 0,
    /* 0x05 ENQ */ 0,
    /* 0x06 ACK */ 0,
    /* 0x07 BEL */ "<alert>",
    /* 0x08     */ "<backspace>",
    /* 0x09 TAB */ "<tab>",
    /* 0x0a     */ "<newline>",
    /* 0x0b     */ "<vertical-tab>",
    /* 0x0c     */ "<form-feed>",
    /* 0x0d     */ "<carriage-return>",
    /* 0x0e SO  */ 0,
    /* 0x0f SI  */ 0,
    /* 0x10 DLE */ 0,
    /* 0x11 DC1 */ 0,
    /* 0x12 DC2 */ 0,
    /* 0x13 DC3 */ 0,
    /* 0x14 DC4 */ 0,
    /* 0x15 NAK */ 0,
    /* 0x16 SYN */ 0,
    /* 0x17 ETB */ 0,
    /* 0x18 CAN */ 0,
    /* 0x19 EM  */ 0,
    /* 0x1a SUB */ 0,
    /* 0x1b ESC */ 0,
    /* 0x1c IS4 */ 0,
    /* 0x1d IS3 */ 0,
    /* 0x1e IS2 */ 0,
    /* 0x1f IS1 */ 0,
    /* 0x20 SPC */ "<space>",
    /* 0x21 !   */ "<exclamation-mark>",
    /* 0x22 "   */ "<quotation-mark>",
    /* 0x23 #   */ "<number-sign>",
    /* 0x24 $   */ "<dollar-sign>",
    /* 0x25 %   */ "<percent-sign>",
    /* 0x26 &   */ "<ampersand>",
    /* 0x27 '   */ "<apostrophe>",
    /* 0x28 (   */ "<left-parenthesis>",
    /* 0x29 )   */ "<right-parenthesis>",
    /* 0x2a *   */ "<asterisk>",
    /* 0x2b +   */ "<plus-sign>",
    /* 0x2c ,   */ "<comma>",
    /* 0x2d -   */ "<hyphen>",   // "<hyphen-minus>",
    /* 0x2e .   */ "<period>",   // "<full-stop>",
    /* 0x2f /   */ "<slash>",    // "<solidus>",
    /* 0x30 0   */ "<zero>",
    /* 0x31 1   */ "<one>",
    /* 0x32 2   */ "<two>",
    /* 0x33 3   */ "<three>",
    /* 0x34 4   */ "<four>",
    /* 0x35 5   */ "<five>",
    /* 0x36 6   */ "<six>",
    /* 0x37 7   */ "<seven>",
    /* 0x38 8   */ "<eight>",
    /* 0x39 9   */ "<nine>",
    /* 0x3a :   */ "<colon>",
    /* 0x3b ;   */ "<semicolon>",
    /* 0x3c <   */ "<less-than-sign>",
    /* 0x3d =   */ "<equals-sign>",
    /* 0x3e >   */ "<greater-than-sign>",
    /* 0x3f ?   */ "<question-mark>",
    /* 0x40 @   */ "<commercial-at>",
    /* 0x41 A   */ "<A>",
    /* 0x42 B   */ "<B>",
    /* 0x43 C   */ "<C>",
    /* 0x44 D   */ "<D>",
    /* 0x45 E   */ "<E>",
    /* 0x46 F   */ "<F>",
    /* 0x47 G   */ "<G>",
    /* 0x48 H   */ "<H>",
    /* 0x49 I   */ "<I>",
    /* 0x4a J   */ "<J>",
    /* 0x4b K   */ "<K>",
    /* 0x4c L   */ "<L>",
    /* 0x4d M   */ "<M>",
    /* 0x4e N   */ "<N>",
    /* 0x4f O   */ "<O>",
    /* 0x50 P   */ "<P>",
    /* 0x51 Q   */ "<Q>",
    /* 0x52 R   */ "<R>",
    /* 0x53 S   */ "<S>",
    /* 0x54 T   */ "<T>",
    /* 0x55 U   */ "<U>",
    /* 0x56 V   */ "<V>",
    /* 0x57 W   */ "<W>",
    /* 0x58 X   */ "<X>",
    /* 0x59 Y   */ "<Y>",
    /* 0x5a Z   */ "<Z>",
    /* 0x5b [   */ "<left-square-bracket>",
    /* 0x5c \   */ "<backslash>",   // "<reverse-solidus>",
    /* 0x5d ]   */ "<right-square-bracket>",
    /* 0x5e ^   */ "<circumflex>",   // "<circumflex-accent>",
    /* 0x5f _   */ "<underscore>",   // "<low-line>",
    /* 0x60 `   */ "<grave-accent>",
    /* 0x61 a   */ "<a>",
    /* 0x62 b   */ "<b>",
    /* 0x63 c   */ "<c>",
    /* 0x64 d   */ "<d>",
    /* 0x65 e   */ "<e>",
    /* 0x66 f   */ "<f>",
    /* 0x67 g   */ "<g>",
    /* 0x68 h   */ "<h>",
    /* 0x69 i   */ "<i>",
    /* 0x6a j   */ "<j>",
    /* 0x6b k   */ "<k>",
    /* 0x6c l   */ "<l>",
    /* 0x6d m   */ "<m>",
    /* 0x6e n   */ "<n>",
    /* 0x6f o   */ "<o>",
    /* 0x70 p   */ "<p>",
    /* 0x71 q   */ "<q>",
    /* 0x72 r   */ "<r>",
    /* 0x73 s   */ "<s>",
    /* 0x74 t   */ "<t>",
    /* 0x75 u   */ "<u>",
    /* 0x76 v   */ "<v>",
    /* 0x77 w   */ "<w>",
    /* 0x78 x   */ "<x>",
    /* 0x79 y   */ "<y>",
    /* 0x7a z   */ "<z>",
    /* 0x7b {   */ "<left-brace>",   // "<left-curly-bracket>",
    /* 0x7c |   */ "<vertical-line>",
    /* 0x7d }   */ "<right-brace>",   // "<right-curly-bracket>",
    /* 0x7e ~   */ "<tilde>",
    /* 0x7f     */ 0
};
| |
| |
| // convert a string of narrow character into a wchar_t |
| bool Charmap::convert_to_wc (const std::string& sym_name, |
| const std::string& ext_enc, wchar_t& wc) |
| { |
| #ifndef _RWSTD_NO_ISO_10646_WCHAR_T |
| |
| // the internal wchar_t representation for all characters |
| // in all locales is always ISO-10646 (UCS) on this system |
| return convert_to_ucs (sym_name, ext_enc, wc); |
| |
| #else // if defined _RWSTD_NO_ISO_10646_WCHAR_T |
| |
| if (UCS4_internal_ || Clocale_.empty ()) { |
| |
| // when using UCS as the internal encoding or for a locale |
| // that has no corresponding C library locale convert the |
| // character to ISO-10646 (UCS) |
| return convert_to_ucs (sym_name, ext_enc, wc); |
| } |
| |
| // otherwise use libc to convert the multi-byte character |
| // to its wchar_t value |
| if (-1 == std::mbtowc (&wc, ext_enc.c_str (), ext_enc.size ())) { |
| |
| const char* const locname = std::setlocale (LC_CTYPE, 0); |
| const char* const errtext = std::strerror (errno); |
| |
| // diagnose the failure to convert the character as just |
| // a warning and (try to) convert it to ISO-10646 (UCS) |
| issue_diag (W_CALL, true, &next, |
| "mbtowc failed to convert character in locale " |
| "\"%s\": %s\n", locname, errtext); |
| |
| return convert_to_ucs (sym_name, ext_enc, wc); |
| } |
| |
| return true; |
| |
| #endif // _RWSTD_NO_ISO_10646_WCHAR_T |
| |
| } |
| |
| |
| char* Charmap::convert_to_utf8 (const char *inbuf, size_t inbuf_s, |
| char *outbuf, size_t outbuf_s) const |
| { |
| #ifndef _RWSTD_NO_ICONV |
| |
| if (ic_to_utf8_ == iconv_t (-1)) |
| return 0; |
| |
| char* outbufp = outbuf; |
| |
| # ifndef _RWSTD_NO_ICONV_CONST_CHAR |
| const char* inbufp = inbuf; |
| # else |
| char* inbufp = _RWSTD_CONST_CAST(char*, inbuf); |
| # endif // _RWSTD_NO_ICONV_CONST_CHAR |
| |
| if (std::size_t (-1) == |
| iconv (ic_to_utf8_, &inbufp, &inbuf_s, &outbufp, &outbuf_s)) { |
| const char* const errtext = std::strerror (errno); |
| |
| issue_diag (W_ICONV, false, &next, |
| "iconv failed to convert \"%s\" " |
| "to UTF-8: %s\n", inbuf, errtext); |
| |
| return 0; |
| } |
| |
| return outbufp; |
| |
| #else // if defined (_RWSTD_NO_ICONV) |
| |
| return 0; |
| |
| #endif // _RWSTD_NO_ICONV |
| |
| } |
| |
| |
| |
| std::string Charmap::get_charmap_name () const |
| { |
| const std::string::size_type idx = charmap_name_.rfind (_RWSTD_PATH_SEP); |
| |
| if (idx != std::string::npos) |
| return charmap_name_.substr (idx + 1); |
| |
| return charmap_name_; |
| } |
| |
| |
| wchar_t Charmap::increment_wchar (wchar_t val) const |
| { |
| #ifndef _RWSTD_NO_ISO_10646_WCHAR_T |
| |
| // to increment a wchar_t value and keep the encoding all we have |
| // to do is increment the val because the internal encoding is UCS |
| return val + 1; |
| |
| #else |
| // to increment a wchar_t value and keep the encoding we have to |
| // convert the wchar_t to the external encoding, increment that |
| // string value, and convert back to the internal representation |
| const rmb_cmap_iter it = rmb_cmap_.find (val); |
| |
| if (it != rmb_cmap_.end ()) { |
| |
| mb_cmap_iter ret; |
| |
| // multibyte character corresponding to the wchar_t value |
| std::string encoding = it->second; |
| |
| // continue incrementing the multi-byte value until we get a valid |
| // character. NOTE: this must be done for encodings such as SJIS where |
| // \x7f in the last byte of a multibyte string is not a valid character |
| // NOTE: this will not detect errors in the sequence, since the program |
| // will continue until it finds a valid character |
| do { |
| int last_elm = int (encoding.size ()) - 1; |
| |
| while (last_elm >= 0) { |
| |
| typedef unsigned char UChar; |
| |
| const unsigned ic = UChar (encoding [last_elm]) + 1; |
| |
| // if incrementing the last element caused it to exceed |
| // UCHAR_MAX increment the next higher byte if there is |
| // one |
| if (UCHAR_MAX < ic) |
| encoding [last_elm--] = '\0'; |
| else { |
| encoding [last_elm] = char (ic); |
| break; |
| } |
| } |
| |
| if (last_elm < 0) |
| return -1; // error |
| |
| } while ((ret = mb_cmap_.find (encoding)) == mb_cmap_.end ()); |
| |
| return ret->second; |
| } |
| |
| return -1; // error |
| |
| #endif // _RWSTD_NO_ISO_10646_WCHAR_T |
| |
| } |
| |
| |
| bool Charmap:: |
| increment_encoding (std::string &encoding) |
| { |
| // find the last escape character in the human readable representation |
| // of the encoding (i.e., in the multibyte character such as "/xf0/x80") |
| const std::string::size_type pos = |
| encoding.rfind (scanner_.escape_char ()); |
| |
| // the escape character must be there (guaranteed by the scanner) |
| assert (pos < encoding.size ()); |
| |
| const char* end = 0; |
| |
| // convert the last character in the multibyte character to a numeric |
| // value representing the last byte of the sequence |
| unsigned last_byte = |
| unsigned (scanner_.convert_escape (encoding.c_str () + pos, &end)); |
| |
| // POSIX requires that the incremented value be non-NUL |
| if (UCHAR_MAX <= last_byte || *end) |
| return false; |
| |
| // increment the last byte |
| ++last_byte; |
| |
| // format the last byte in the same notation (octal, decimal, |
| // or hexadecimal escape sequence) |
| static const char xdigits[] = "0123456789ABCDEF"; |
| |
| char byte_str [5]; |
| char *pdig = byte_str; |
| |
| switch (encoding [pos + 1]) { |
| case 'd': { // decimal escape |
| const unsigned hundreds = last_byte / 100; |
| const unsigned tens = (last_byte - hundreds) / 10; |
| const unsigned units = last_byte % 10; |
| |
| *pdig++ = 'd'; |
| |
| if (hundreds) |
| *pdig++ = xdigits [hundreds]; |
| |
| *pdig++ = xdigits [tens]; |
| *pdig++ = xdigits [units]; |
| *pdig = '\0'; |
| break; |
| } |
| |
| case 'x': { // hex escape |
| const unsigned hi = last_byte >> 4; |
| const unsigned lo = last_byte & 0xfU; |
| |
| *pdig++ = 'x'; |
| *pdig++ = xdigits [hi]; |
| *pdig++ = xdigits [lo]; |
| *pdig = '\0'; |
| break; |
| } |
| default: { // octal escape |
| const unsigned hi = last_byte >> 6; |
| const unsigned mid = (last_byte >> 3) & 07U; |
| const unsigned lo = last_byte & 07U; |
| |
| if (hi) |
| *pdig++ = xdigits [hi]; |
| |
| *pdig++ = xdigits [mid]; |
| *pdig++ = xdigits [lo]; |
| *pdig = '\0'; |
| } |
| } // switch |
| |
| // replace the last escape sequence with the new one |
| encoding.replace (pos + 1, std::string::npos, byte_str); |
| |
| return true; |
| } |
| |
| |
| std::string Charmap:: |
| encoding_to_mbchar (const std::string &encoding) const |
| { |
| std::string mbchar; |
| |
| for (const char *pbyte = encoding.c_str (); pbyte && *pbyte; ) |
| mbchar += char (scanner_.convert_escape (pbyte, &pbyte)); |
| |
| return mbchar; |
| } |
| |
| |
| // convert the locale's encoded character to UCS4 wchar_t |
| wchar_t Charmap:: |
| convert_sym_to_ucs (const std::string &sym) const |
| { |
| std::string::const_iterator it (sym.begin ()); |
| |
| if ( sym.size () < 4 || *it != '<' || *++it != 'U' |
| || !(std::isxdigit)(*++it)) { |
| issue_diag (E_UCS, true, 0, |
| "Unable to convert symbolic name %s to UCS.\n", |
| sym.c_str ()); |
| } |
| |
| const unsigned long val = std::strtoul (&*it, (char**)0, 16); |
| |
| if (_RWSTD_WCHAR_MAX <= val) |
| issue_diag (E_UCS, true, 0, |
| "UCS value %lu of symbolic character %s out of range.\n", |
| val, sym.c_str ()); |
| |
| return wchar_t (val); |
| } |
| |
| |
| // convert the locale's encoded character to UCS4/UCS2 wchar_t |
| bool Charmap::convert_to_ucs (const std::string &sym_name, |
| const std::string &encoding, wchar_t& wc) |
| { |
| #ifndef _WIN32 |
| |
| if (in_utf8_) { |
| wc = utf8_decode (encoding.c_str (), &*(encoding.end () - 1)); |
| return true; |
| } |
| |
| // allocate enough space for the longest possible UTF-8 character |
| char utf8_enc [8 + 1 /* NUL */]; |
| |
| const char* const ch_end = |
| convert_to_utf8 (encoding.c_str (), encoding.size (), |
| utf8_enc, sizeof utf8_enc); |
| if (ch_end) |
| // only if conversion to utf8 succeeded |
| wc = utf8_decode (utf8_enc, ch_end); |
| else |
| // if not, try to convert the symbolic name directly |
| wc = convert_sym_to_ucs (sym_name); |
| |
| return true; |
| |
| #else |
| |
| if (0 != codepage_) { |
| wchar_t ret[2] = {0}; |
| const int res = MultiByteToWideChar (codepage_, 0, |
| encoding.c_str(), -1, |
| ret, 2); |
| if (!res && ERROR_INVALID_PARAMETER == GetLastError ()) { |
| // the required codepage conversion table is not installed |
| wc = convert_sym_to_ucs (sym_name); |
| return true; |
| } |
| |
| if (!res || ret[1] != 0) |
| return false; |
| |
| wc = ret[0]; |
| return true; |
| } |
| |
| wc = convert_sym_to_ucs (sym_name); |
| return true; |
| |
| #endif // _WIN32 |
| } |
| |
| |
| void Charmap::add_to_cmaps (const std::string &sym_name, |
| const std::string &encoding, |
| bool is_mbchar /* = false */) |
| { |
| // compute the external (multibyte) encoding of the character |
| // if necessary (i.e., unless already done by the caller) |
| const std::string mbchar = |
| is_mbchar ? encoding : encoding_to_mbchar (encoding); |
| |
| symnames_list_.push_back (sym_name); |
| |
| if (1 == mbchar.size ()) { |
| // strval is a single-byte character |
| |
| const unsigned char ch = mbchar [0]; |
| |
| // add the wide character and its symbolic name to the narrow |
| // character maps |
| if (forward_maps) { |
| // the locale utility doesn't need reverse maps |
| n_cmap_.insert (std::make_pair (sym_name, ch)); |
| } |
| |
| if (reverse_maps) |
| rn_cmap_.insert (std::make_pair (ch, sym_name)); |
| |
| if (ch > largest_nchar_) |
| largest_nchar_ = ch; |
| } |
| |
| // (try to) compute the wide character value of the character |
| wchar_t wch; |
| |
| if (convert_to_wc (sym_name, mbchar, wch)) { |
| |
| // add the wide character and its symbolic name to the wide |
| // character maps |
| if (forward_maps) { |
| // the locale utility doesn't need forward maps |
| w_cmap_.insert (std::make_pair (sym_name, wch)); |
| } |
| |
| if (reverse_maps) |
| rw_cmap_.insert (std::make_pair (wch, sym_name)); |
| |
| // add the corresponding multibyte character to the multibyte |
| // character maps |
| mb_cmap_.insert (std::make_pair (mbchar, wch)); |
| rmb_cmap_.insert (std::make_pair (wch, mbchar)); |
| } |
| |
| // compute the UCS value of the character |
| wchar_t uch; |
| |
| if (convert_to_ucs (sym_name, mbchar, uch)) { |
| |
| // add UCS character and its symbolic name to the UCS |
| // character maps |
| ucs4_cmap_.insert (std::make_pair (sym_name, uch)); |
| rucs4_cmap_.insert (std::make_pair (uch, sym_name)); |
| } |
| } |
| |
| |
| // process the characters implicitly defined by using ellipsis between |
| // two explicitly defined characters |
| std::size_t Charmap:: |
| process_ellipsis (const Scanner::token_t &beg_tok, int num_ellipsis) |
| { |
| // get the upper end of the range denoted by the ellipsis |
| const Scanner::token_t end_tok = scanner_.next_token (); |
| |
| // get the human readabale encoding of the character |
| // denoted by the lower end of the ellipsis |
| const std::string encoding = scanner_.next_token ().name; |
| |
| // convert the encoding to a multibyte character |
| std::string mbchar = encoding_to_mbchar (encoding); |
| |
| // add the beg_tok symbol name to the maps |
| add_to_cmaps (beg_tok.name, mbchar, true); |
| |
| // extract the numeric portion of the symbolic character name |
| // denoted by the lower end of the ellipsis |
| std::size_t idx = 0; |
| |
| int base; // numeric base |
| const char *fmat; // sprintf() format specifier |
| |
| const std::size_t beg_len = beg_tok.name.size (); |
| |
| // determine the value of the beginning of the range |
| // denoted by the ellipsis |
| if (2 == num_ellipsis) { |
| base = 16; |
| fmat = "%.*s%0*lX>"; |
| |
| // advance to the first hex digit |
| while (idx < beg_len && !(std::isxdigit)(beg_tok.name [idx])) |
| ++idx; |
| } |
| else { |
| base = 10; |
| fmat = "%.*s%0*ld>"; |
| |
| // advance to the first decimal digit |
| while (idx < beg_len && !(std::isdigit)(beg_tok.name [idx])) |
| ++idx; |
| } |
| |
| // length of non-numeric prefix of the symbolic character name |
| const std::size_t pfx_len = idx; |
| |
| // get the character value plus one (since the first value |
| // has already been added to the map earlier) |
| char *num_end; |
| const unsigned long beg_val = |
| 1 + std::strtoul (beg_tok.name.c_str () + pfx_len, &num_end, base); |
| |
| // the length of the numeric portion |
| const std::size_t num_size = |
| num_end - (beg_tok.name.c_str () + pfx_len); |
| |
| // find the end of the range denoted by the ellipsis |
| idx = 0; |
| |
| const std::size_t end_len = end_tok.name.size (); |
| |
| if (2 == num_ellipsis) { |
| // advance to the next hex digit |
| while (idx < end_len && !(std::isxdigit)(end_tok.name [idx])) |
| ++idx; |
| } |
| else { |
| // advance to the next dec digit |
| while (idx < end_len && !(std::isdigit)(end_tok.name [idx])) |
| ++idx; |
| } |
| |
| const unsigned long end_val = |
| std::strtoul (end_tok.name.c_str () + idx, (char**)0, base); |
| |
| // the ending numeric value must be greater than or equal |
| // to the beginning numeric value |
| if (end_val < beg_val) |
| issue_diag (E_RANGE, true, &end_tok, |
| "invalid range found in character map file\n"); |
| |
| char next_name [MAX_SYM_NAME_LEN]; |
| |
| std::size_t nchars = 0; |
| |
| const char* const pfx = beg_tok.name.c_str (); |
| |
| for (unsigned long val = beg_val; val <= end_val; ++val, ++nchars) { |
| |
| std::sprintf (next_name, fmat, pfx_len, pfx, num_size, val); |
| |
| // increment the last byte of the multibyte character |
| // and if the result is valid (i.e., doesn't contain |
| // an embedded NUL) add the generated name and the |
| // multibyte character to the maps |
| const unsigned char last_byte = mbchar [mbchar.size () - 1]; |
| if (last_byte < UCHAR_MAX) { |
| mbchar [mbchar.size () - 1] = last_byte + 1; |
| add_to_cmaps (next_name, mbchar, true); |
| } |
| else { |
| // an ellipsis must not specify a range that includes |
| // an encoding with an embedded NUL |
| issue_diag (E_RANGE, true, &beg_tok, |
| "encoding of an element in range contains NUL\n"); |
| } |
| } |
| |
| // return the number of characters denoted by the ellipsis |
| return nchars; |
| } |
| |
| |
| // process all the characters in the character map file. |
| void Charmap::process_chars() |
| { |
| issue_diag (I_STAGE, false, 0, "processing CHARMAP section\n"); |
| |
| std::size_t ntokens = 0; |
| std::size_t nellips = 0; |
| std::size_t nchars = 0; |
| |
| next = scanner_.next_token(); |
| Scanner::token_t nextnext; |
| |
| // loop until we find the closing charmap token |
| for ( ; next.token != Scanner::tok_charmap; ++ntokens) { |
| |
| switch (next.token) { |
| |
| case Scanner::tok_nl: |
| case Scanner::tok_end: |
| break; |
| |
| case Scanner::tok_sym_name: |
| // the next token may be either ellipsis if this line |
| // of the charmap is in the form: |
| // "%s...%s %s\n", <sym_name>, <sym_name>, <encoding> |
| // or an encoding if this line is in the format: |
| // "%s %s\n", <sym_name>, <encoding> |
| nextnext = scanner_.next_token (); |
| ntokens += 3; |
| |
| switch (nextnext.token) { |
| |
| case Scanner::tok_abs_ellipsis: |
| // absolute ellipsis (see ISO/IEC TR 14652) |
| nchars += process_ellipsis (next, 3); |
| ++nellips; |
| break; |
| |
| case Scanner::tok_hex_ellipsis: |
| // hexadecimal symbolic ellipsis (see ISO/IEC TR 14652) |
| nchars += process_ellipsis (next, 2); |
| ++nellips; |
| break; |
| |
| case Scanner::tok_char_value: |
| // character represented as a numeric constant |
| add_to_cmaps (next.name, nextnext.name); |
| ++nchars; |
| break; |
| |
| default: |
| issue_diag (E_SYNTAX, true, &next, |
| "byte value expected following symbolic " |
| "name in character map file\n"); |
| } |
| |
| scanner_.ignore_line (); |
| break; |
| |
| default: |
| issue_diag (E_SYNTAX, true, &next, |
| "symbolic name expected in character map file\n"); |
| break; |
| } |
| |
| next = scanner_.next_token(); |
| } |
| |
| issue_diag (I_STAGE, false, 0, |
| "done processing CHARMAP section (%lu tokens, " |
| "%lu ellipses, %lu characters)\n", |
| ntokens, nellips, nchars); |
| |
| // make sure that all characters in the portable character set |
| // are in the charmap |
| if (forward_maps) |
| verify_portable_charset(); |
| } |
| |
| |
| void Charmap::verify_portable_charset () const |
| { |
| const std::size_t nchars = |
| sizeof portable_charset / sizeof *portable_charset; |
| |
| for (std::size_t i = 0; i < nchars; ++i) { |
| if (0 == portable_charset [i]) |
| continue; |
| |
| if (n_cmap_.find (portable_charset [i]) == n_cmap_.end ()) |
| issue_diag (W_NOPCS, false, 0, |
| "member of portable character set %s not found " |
| "in the character map\n", portable_charset [i]); |
| } |
| } |
| |
| |
| Charmap::Charmap(const char* Clocale, |
| const char* fname, |
| bool in_utf8, bool create_forward_maps, |
| bool create_reverse_maps, bool use_UCS4) |
| : mb_cur_max_(1), |
| charmap_name_ (fname), |
| Clocale_ (Clocale), |
| largest_nchar_(0), |
| in_utf8_(in_utf8), |
| forward_maps (create_forward_maps), |
| reverse_maps (create_reverse_maps), |
| UCS4_internal_ (use_UCS4) |
| { |
| #ifndef _RWSTD_NO_ICONV |
| ic_to_utf8_ = 0; |
| ic_to_ext_ = 0; |
| #endif // _RWSTD_NO_ICONV |
| |
| scanner_.open (fname, '#', '\\'); |
| |
| // set code_set_name to the name of the character set description |
| // file by default, in case it's not explicitly specified |
| const char* const slash = std::strrchr (fname, _RWSTD_PATH_SEP); |
| code_set_name_ = slash ? slash + 1 : fname; |
| |
| // loop until we reach the end of the file |
| while ((next = scanner_.next_token()).token != Scanner::tok_end_tokens) { |
| |
| switch (next.token) { |
| |
| case Scanner::tok_code_set_name: |
| next = scanner_.next_token (); |
| |
| if (next.token == Scanner::tok_string) { |
| code_set_name_ = next.name.substr (1, next.name.size () - 2); |
| } |
| else if (next.token == Scanner::tok_ndef) { |
| code_set_name_ = next.name; |
| } |
| else |
| issue_diag (E_SYNTAX, true, &next, |
| "string expected following <code_set_name>\n"); |
| |
| // we always need a iconv to utf8 so that we can create |
| // the utf8_charmap unless we are on windows |
| #ifndef _RWSTD_NO_ICONV |
| if (!in_utf8_) { |
| ic_to_utf8_ = open_iconv_to_utf8 (); |
| # if !defined (_RWSTD_NO_ISO_10646_WCHAR_T) |
| ic_to_ext_ = open_iconv_to_ext (); |
| # endif // _RWSTD_NO_ISO_10646_WCHAR_T |
| } |
| |
| #else // if defined (_RWSTD_NO_ICONV) |
| |
| # ifdef _WIN32 |
| codepage_ = get_codepage (code_set_name_); |
| if (codepage_ == 0) { |
| issue_diag (W_ICONV, false, 0, |
| "iconv_open (%s to UTF-8) failed\n", |
| code_set_name_.c_str()); |
| } |
| |
| # endif // _WIN32 |
| #endif // _RWSTD_NO_ICONV |
| |
| scanner_.ignore_line (); |
| break; |
| |
| case Scanner::tok_mb_cur_max: |
| mb_cur_max_ = std::atoi (scanner_.next_token ().name.c_str ()); |
| scanner_.ignore_line (); |
| break; |
| |
| case Scanner::tok_mb_cur_min: |
| scanner_.ignore_line (); |
| break; |
| |
| case Scanner::tok_charmap: |
| scanner_.ignore_line (); |
| process_chars(); |
| break; |
| case Scanner::tok_width: |
| // ignore the width section of the character map |
| while ((next = scanner_.next_token ()).token != Scanner::tok_width); |
| break; |
| |
| case Scanner::tok_nl: |
| break; |
| |
| default: |
| issue_diag (E_SYNTAX, false, &next, |
| "unknown token %s in character map file\n", |
| next.name.c_str ()); |
| } |
| } |
| } |