blob: c24038eb836a645ed7c80698adbe8710482a259a [file] [log] [blame]
/***************************************************************************
*
* charmap.cpp
*
* $Id$
*
***************************************************************************
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright
* ownership. The ASF licenses this file to you under the Apache
* License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* Copyright 2001-2008 Rogue Wave Software, Inc.
*
**************************************************************************/
#include <rw/_defs.h>
// On Compaq Tru64 UNIX if included after assert.h, the definition of
// _XOPEN_SOURCE macro in assert.h selects a different declaration for
// iconv than the one used in comp test.
#ifndef _WIN32
# ifndef _RWSTD_NO_ICONV
# include <iconv.h>
# endif
# include _RWSTD_CERRNO
#else
# include <windows.h>
#endif // _WIN32
#include <cassert>
#include <cctype>
#include <cerrno> // for errno
#include <climits>
#include <clocale> // for LC_CTYPE, setlocale()
#include <cstdio>
#include <cstdlib>
#include <cstring> // for strrchr(), strerror()
#include <map>
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include "aliases.h"
#include "scanner.h"
#include "charmap.h"
#include "loc_exception.h"
#include "diagnostic.h"
// This value specifies the largest allowed symbolic name length.
// If necessary this can be increased, but it is very doubtful
// that that would ever be necessary.
#define MAX_SYM_NAME_LEN 256
// This is the maximum size of a single byte of a character in the
// charmap file. According to POSIX this cannot be larger than 5
// because all bytes are in the format "\x%x", "\d%d" or "\%o" and
// the numeric values cannot be more than 3 digits long.
#define MAX_BYTE_LEN 5
#ifndef _RWSTD_NO_ICONV
// append NAME, enclosed in double quotes, to LIST; when LIST already
// holds one or more names separate the new one with a comma and a space
static void
append_quoted_name (std::string &list, const std::string &name)
{
    if (!list.empty ())
        list += ", ";

    list += '"';
    list += name;
    list += '"';
}

// try to open an iconv descriptor converting from from_codeset to
// to_codeset, trying every combination of the aliases that
// get_cname_aliases() reports for the two names
// returns the first descriptor iconv_open() succeeds in opening;
// otherwise issues a W_ICONV diagnostic listing the names that
// were tried and returns iconv_t (-1)
static iconv_t
my_iconv_open (const char *to_codeset, const char *from_codeset)
{
    typedef std::vector<std::string> NameList;
    typedef NameList::iterator       NameIter;

    NameList aliases [2];

    // the aliases of the destination codeset are stored at index 1
    // when the destination is UTF-8 and at index 0 otherwise; the
    // aliases of the source codeset are stored at the other index
    const bool to_is_utf8 = 0 == std::strcmp (to_codeset, "UTF-8");

    NameList &to_names   = aliases [to_is_utf8];
    NameList &from_names = aliases [!to_is_utf8];

    // aliases [to_utf8].push_back (to_codeset);
    // aliases [from_utf8].push_back (from_codeset);

    get_cname_aliases (to_codeset, to_names);
    get_cname_aliases (from_codeset, from_names);

    // quoted, comma-separated lists of the names passed to iconv_open()
    std::string tried_to;
    std::string tried_from;

    const NameIter first_to = to_names.begin ();

    for (NameIter to_it = first_to; to_it != to_names.end (); ++to_it) {

        for (NameIter from_it = from_names.begin ();
             from_it != from_names.end (); ++from_it) {

            const iconv_t cd =
                iconv_open (to_it->c_str (), from_it->c_str ());

            if (iconv_t (-1) != cd)
                return cd;

            // every source name is tried once per destination name;
            // record it only during the first pass
            if (to_it == first_to)
                append_quoted_name (tried_from, *from_it);
        }

        append_quoted_name (tried_to, *to_it);
    }

    assert (0 != aliases [0].size ());
    assert (0 != aliases [1].size ());

    issue_diag (W_ICONV, false, 0,
                "iconv_open(\"%s\", \"%s\") failed; "
                "tried { %s } and { %s }\n",
                to_names [0].c_str (),
                from_names [0].c_str (),
                tried_to.c_str (),
                tried_from.c_str ());

    return iconv_t (-1);
}
// open an iconv descriptor to convert from the character map's
// codeset to UTF-8; when the character map is already in UTF-8
// no descriptor is opened and 0 is returned instead
iconv_t Charmap::open_iconv_to_utf8 () const
{
    if (!in_utf8_)
        return my_iconv_open ("UTF-8", code_set_name_.c_str ());

    return 0;
}
# ifndef _RWSTD_NO_ISO_10646_WCHAR_T
// open an iconv descriptor to convert from UTF-8 to the character
// map's codeset
iconv_t Charmap::open_iconv_to_ext ()
{
    const char* const ext_codeset = code_set_name_.c_str ();

    return my_iconv_open (ext_codeset, "UTF-8");
}
# endif // _RWSTD_NO_ISO_10646_WCHAR_T
#endif // _RWSTD_NO_ICONV
// utf8_decode translates the UTF-8 encoded character that starts at
// from into an object of type wchar_t
// algorithm derived from RFC 2279
//
//   from   points to the first (lead) byte of the sequence
//   to     marks the end of the available input; the sequence is
//          diagnosed as incomplete only when to precedes
//          from + N - 1, where N is the length implied by the lead
//          byte, so both a pointer to the last byte and a pointer
//          just past it are accepted
//
// an illegal lead byte or an incomplete sequence is reported through
// issue_diag() as E_MBCHAR
static wchar_t utf8_decode (const char* from, const char* to)
{
    assert (from <= to);

    const unsigned char* const ch =
        _RWSTD_REINTERPRET_CAST (const unsigned char*, from);
    const unsigned char* const ch_end =
        _RWSTD_REINTERPRET_CAST (const unsigned char*, to);

    size_t  num_bytes = 0;
    wchar_t ret       = 0;

    // if the first character is below 0x80 then the value of *ch is the
    // actual value of the character so return that value as a wchar_t
    if (*ch < 0x80)
        return wchar_t (*ch);

    // the lead byte determines the length of the sequence and holds
    // the most significant bits of the result
    if (*ch >= 0xc2 && *ch < 0xe0) {
        // 2-byte sequence
        ret       = (*ch & 0x1f);
        num_bytes = 2;
    }
    else if (*ch >= 0xe0 && *ch < 0xf0) {
        // 3-byte sequence
        ret       = *ch & 0x0f;
        num_bytes = 3;
    }
    else if (*ch >= 0xf0 && *ch < 0xf8) {
        // 4-byte sequence
        ret       = *ch & 0x07;
        num_bytes = 4;
    }
    else if (*ch >= 0xf8 && *ch < 0xfc) {
        // 5-byte sequence
        ret       = *ch & 0x03;
        num_bytes = 5;
    }
    else if (*ch >= 0xfc && *ch < 0xfe) {
        // 6-byte sequence
        ret       = *ch & 0x01;
        num_bytes = 6;
    }
    else {
        issue_diag (E_MBCHAR, true, 0,
                    "illegal multibyte prefix '\\x%02x' in character "
                    "map file\n", *ch);
    }

    if (ch_end < ch + num_bytes - 1) {
        // the input doesn't have enough characters
        // (the arguments are converted to unsigned to match the %u
        // directives: size_t and ptrdiff_t may be wider than int)
        issue_diag (E_MBCHAR, true, 0,
                    "incomplete multibyte character in character "
                    "map file: expecting %u bytes, found %u\n",
                    unsigned (num_bytes), unsigned (ch_end - ch));
    }

    // for each trailing byte in the character extract the useful data
    // (the low 6 bits) by shifting and bit or it into the wchar_t
    for (size_t i = 1; i < num_bytes; ++i)
        ret = (ret << 6) | (ch [i] & 0x3f);

    return ret;
}
// scan the human readable representation of a multibyte sequence
// for the scanner's escape character and return one more than
// the number of its occurrences in str
std::size_t Charmap::mbcharlen (const std::string &str) const
{
    const char esc = scanner_.escape_char ();

    std::size_t result = 1;

    std::string::size_type pos = str.find (esc, 0);

    while (std::string::npos != pos) {
        ++result;

        // resume the search just past the escape character found
        pos = str.find (esc, pos + 1);
    }

    return result;
}
/**************************************************************************/
// symbolic names of the members of the portable character set; the
// position of each entry corresponds to the character code given in
// the comment on the same line (0x00 through 0x7f)
// a null entry denotes a code for which the table has no symbolic
// name (such entries are skipped by verify_portable_charset());
// alternative names are noted in the trailing comments
const char* const Charmap::
portable_charset[] = {
/* 0x00 */ "<NUL>",
/* 0x01 SOH */ 0,
/* 0x02 STX */ 0,
/* 0x03 ETX */ 0,
/* 0x04 EOT */ 0,
/* 0x05 ENQ */ 0,
/* 0x06 ACK */ 0,
/* 0x07 BEL */ "<alert>",
/* 0x08 */ "<backspace>",
/* 0x09 TAB */ "<tab>",
/* 0x0a */ "<newline>",
/* 0x0b */ "<vertical-tab>",
/* 0x0c */ "<form-feed>",
/* 0x0d */ "<carriage-return>",
/* 0x0e SO */ 0,
/* 0x0f SI */ 0,
/* 0x10 DLE */ 0,
/* 0x11 DC1 */ 0,
/* 0x12 DC2 */ 0,
/* 0x13 DC3 */ 0,
/* 0x14 DC4 */ 0,
/* 0x15 NAK */ 0,
/* 0x16 SYN */ 0,
/* 0x17 ETB */ 0,
/* 0x18 CAN */ 0,
/* 0x19 EM */ 0,
/* 0x1a SUB */ 0,
/* 0x1b ESC */ 0,
/* 0x1c IS4 */ 0,
/* 0x1d IS3 */ 0,
/* 0x1e IS2 */ 0,
/* 0x1f IS1 */ 0,
/* 0x20 SPC */ "<space>",
/* 0x21 ! */ "<exclamation-mark>",
/* 0x22 " */ "<quotation-mark>",
/* 0x23 # */ "<number-sign>",
/* 0x24 $ */ "<dollar-sign>",
/* 0x25 % */ "<percent-sign>",
/* 0x26 & */ "<ampersand>",
/* 0x27 ' */ "<apostrophe>",
/* 0x28 ( */ "<left-parenthesis>",
/* 0x29 ) */ "<right-parenthesis>",
/* 0x2a * */ "<asterisk>",
/* 0x2b + */ "<plus-sign>",
/* 0x2c , */ "<comma>",
/* 0x2d - */ "<hyphen>", // "<hyphen-minus>",
/* 0x2e . */ "<period>", // "<full-stop>",
/* 0x2f / */ "<slash>", // "<solidus>",
/* 0x30 0 */ "<zero>",
/* 0x31 1 */ "<one>",
/* 0x32 2 */ "<two>",
/* 0x33 3 */ "<three>",
/* 0x34 4 */ "<four>",
/* 0x35 5 */ "<five>",
/* 0x36 6 */ "<six>",
/* 0x37 7 */ "<seven>",
/* 0x38 8 */ "<eight>",
/* 0x39 9 */ "<nine>",
/* 0x3a : */ "<colon>",
/* 0x3b ; */ "<semicolon>",
/* 0x3c < */ "<less-than-sign>",
/* 0x3d = */ "<equals-sign>",
/* 0x3e > */ "<greater-than-sign>",
/* 0x3f ? */ "<question-mark>",
/* 0x40 @ */ "<commercial-at>",
/* 0x41 A */ "<A>",
/* 0x42 B */ "<B>",
/* 0x43 C */ "<C>",
/* 0x44 D */ "<D>",
/* 0x45 E */ "<E>",
/* 0x46 F */ "<F>",
/* 0x47 G */ "<G>",
/* 0x48 H */ "<H>",
/* 0x49 I */ "<I>",
/* 0x4a J */ "<J>",
/* 0x4b K */ "<K>",
/* 0x4c L */ "<L>",
/* 0x4d M */ "<M>",
/* 0x4e N */ "<N>",
/* 0x4f O */ "<O>",
/* 0x50 P */ "<P>",
/* 0x51 Q */ "<Q>",
/* 0x52 R */ "<R>",
/* 0x53 S */ "<S>",
/* 0x54 T */ "<T>",
/* 0x55 U */ "<U>",
/* 0x56 V */ "<V>",
/* 0x57 W */ "<W>",
/* 0x58 X */ "<X>",
/* 0x59 Y */ "<Y>",
/* 0x5a Z */ "<Z>",
/* 0x5b [ */ "<left-square-bracket>",
/* 0x5c \ */ "<backslash>", // "<reverse-solidus>",
/* 0x5d ] */ "<right-square-bracket>",
/* 0x5e ^ */ "<circumflex>", // "<circumflex-accent>",
/* 0x5f _ */ "<underscore>", // "<low-line>",
/* 0x60 ` */ "<grave-accent>",
/* 0x61 a */ "<a>",
/* 0x62 b */ "<b>",
/* 0x63 c */ "<c>",
/* 0x64 d */ "<d>",
/* 0x65 e */ "<e>",
/* 0x66 f */ "<f>",
/* 0x67 g */ "<g>",
/* 0x68 h */ "<h>",
/* 0x69 i */ "<i>",
/* 0x6a j */ "<j>",
/* 0x6b k */ "<k>",
/* 0x6c l */ "<l>",
/* 0x6d m */ "<m>",
/* 0x6e n */ "<n>",
/* 0x6f o */ "<o>",
/* 0x70 p */ "<p>",
/* 0x71 q */ "<q>",
/* 0x72 r */ "<r>",
/* 0x73 s */ "<s>",
/* 0x74 t */ "<t>",
/* 0x75 u */ "<u>",
/* 0x76 v */ "<v>",
/* 0x77 w */ "<w>",
/* 0x78 x */ "<x>",
/* 0x79 y */ "<y>",
/* 0x7a z */ "<z>",
/* 0x7b { */ "<left-brace>", // "<left-curly-bracket>",
/* 0x7c | */ "<vertical-line>",
/* 0x7d } */ "<right-brace>", // "<right-curly-bracket>",
/* 0x7e ~ */ "<tilde>",
/* 0x7f */ 0
};
// compute the wchar_t value of the character whose symbolic name is
// sym_name and whose external (multibyte) encoding is ext_enc, and
// store it in wc; returns true when wc has been set by mbtowc() and
// the result of convert_to_ucs() in all other cases
bool Charmap::convert_to_wc (const std::string& sym_name,
                             const std::string& ext_enc, wchar_t& wc)
{
#ifndef _RWSTD_NO_ISO_10646_WCHAR_T

    // the internal wchar_t representation for all characters
    // in all locales is always ISO-10646 (UCS) on this system
    return convert_to_ucs (sym_name, ext_enc, wc);

#else   // if defined _RWSTD_NO_ISO_10646_WCHAR_T

    // libc is consulted only when UCS is not used as the internal
    // encoding and the locale has a corresponding C library locale
    const bool use_libc = !(UCS4_internal_ || Clocale_.empty ());

    if (use_libc) {

        const int nbytes =
            std::mbtowc (&wc, ext_enc.c_str (), ext_enc.size ());

        if (-1 != nbytes)
            return true;

        const char* const locname = std::setlocale (LC_CTYPE, 0);
        const char* const errtext = std::strerror (errno);

        // diagnose the failure to convert the character and fall
        // through to (try to) convert it to ISO-10646 (UCS)
        issue_diag (W_CALL, true, &next,
                    "mbtowc failed to convert character in locale "
                    "\"%s\": %s\n", locname, errtext);
    }

    return convert_to_ucs (sym_name, ext_enc, wc);

#endif   // _RWSTD_NO_ISO_10646_WCHAR_T
}
// convert the inbuf_s bytes starting at inbuf from the character
// map's codeset to UTF-8, storing at most outbuf_s bytes in outbuf
//
// returns a pointer just past the last byte stored in outbuf on
// success; returns 0 when no conversion descriptor is available,
// when iconv() fails (after issuing a W_ICONV diagnostic), or when
// the utility is built without iconv support
// the function itself does not NUL-terminate the output
char* Charmap::convert_to_utf8 (const char *inbuf, size_t inbuf_s,
                                char *outbuf, size_t outbuf_s) const
{
#ifndef _RWSTD_NO_ICONV

    // iconv_t (-1) denotes a failed iconv_open(); 0 is the value
    // the descriptor is initialized to in the constructor and the
    // value open_iconv_to_utf8() returns for UTF-8 character maps,
    // i.e., no descriptor has been opened: never hand it to iconv()
    if (ic_to_utf8_ == iconv_t (-1) || ic_to_utf8_ == iconv_t (0))
        return 0;

    char* outbufp = outbuf;

#  ifndef _RWSTD_NO_ICONV_CONST_CHAR
    const char* inbufp = inbuf;
#  else
    char* inbufp = _RWSTD_CONST_CAST(char*, inbuf);
#  endif   // _RWSTD_NO_ICONV_CONST_CHAR

    if (std::size_t (-1) ==
        iconv (ic_to_utf8_, &inbufp, &inbuf_s, &outbufp, &outbuf_s)) {

        // save the error text before anything else can modify errno
        const char* const errtext = std::strerror (errno);

        issue_diag (W_ICONV, false, &next,
                    "iconv failed to convert \"%s\" "
                    "to UTF-8: %s\n", inbuf, errtext);

        return 0;
    }

    // iconv() has advanced outbufp past the converted bytes
    return outbufp;

#else   // if defined (_RWSTD_NO_ICONV)

    return 0;

#endif   // _RWSTD_NO_ICONV
}
// return the name of the character map with any leading directory
// components (everything up to and including the last path
// separator) removed
std::string Charmap::get_charmap_name () const
{
    const std::string::size_type sep = charmap_name_.rfind (_RWSTD_PATH_SEP);

    // without a separator the whole name is returned
    const std::string::size_type start =
        std::string::npos == sep ? 0 : sep + 1;

    return charmap_name_.substr (start);
}
// return the wchar_t value of the character that follows val in
// the character map's encoding, or -1 on error
wchar_t Charmap::increment_wchar (wchar_t val) const
{
#ifndef _RWSTD_NO_ISO_10646_WCHAR_T

    // the internal encoding is UCS so the next character is simply
    // the next wchar_t value
    return val + 1;

#else

    // the internal encoding is not UCS: look up the external
    // (multibyte) encoding of val, increment it, and map the
    // result back to the internal representation
    const rmb_cmap_iter found = rmb_cmap_.find (val);

    if (found == rmb_cmap_.end ())
        return -1;   // error

    // multibyte character corresponding to the wchar_t value
    std::string mbseq = found->second;

    mb_cmap_iter next_char;

    // keep incrementing the multibyte value until it denotes a valid
    // character. NOTE: this must be done for encodings such as SJIS
    // where \x7f in the last byte of a multibyte string is not a valid
    // character
    // NOTE: this will not detect errors in the sequence, since the
    // loop continues until it finds a valid character
    for ( ; ; ) {

        int pos = int (mbseq.size ()) - 1;

        for ( ; pos >= 0; --pos) {

            typedef unsigned char UChar;

            const unsigned bumped = UChar (mbseq [pos]) + 1;

            if (bumped <= UCHAR_MAX) {
                mbseq [pos] = char (bumped);
                break;
            }

            // the byte wrapped around: reset it and carry over
            // into the next higher byte, if there is one
            mbseq [pos] = '\0';
        }

        if (pos < 0)
            return -1;   // error

        next_char = mb_cmap_.find (mbseq);

        if (next_char != mb_cmap_.end ())
            break;
    }

    return next_char->second;

#endif   // _RWSTD_NO_ISO_10646_WCHAR_T
}
// increment the value of the last byte of the human readable
// representation of a multibyte character (a series of escape
// sequences such as "/xf0/x80") in place, keeping the notation
// (decimal, hexadecimal, or octal) of the last escape sequence
//
// returns true on success; returns false, leaving encoding
// unchanged, when the last byte is already UCHAR_MAX or when the
// last escape sequence is followed by unconverted characters
bool Charmap::
increment_encoding (std::string &encoding)
{
    // find the last escape character in the human readable representation
    // of the encoding (i.e., in the multibyte character such as "/xf0/x80")
    const std::string::size_type pos =
        encoding.rfind (scanner_.escape_char ());

    // the escape character must be there (guaranteed by the scanner)
    assert (pos < encoding.size ());

    const char* end = 0;

    // convert the last character in the multibyte character to a numeric
    // value representing the last byte of the sequence
    unsigned last_byte =
        unsigned (scanner_.convert_escape (encoding.c_str () + pos, &end));

    // POSIX requires that the incremented value be non-NUL
    if (UCHAR_MAX <= last_byte || *end)
        return false;

    // increment the last byte (the result is at most UCHAR_MAX)
    ++last_byte;

    // format the last byte in the same notation (octal, decimal,
    // or hexadecimal escape sequence)
    static const char xdigits[] = "0123456789ABCDEF";

    // large enough for the longest form: 'd' + 3 digits + NUL
    char byte_str [5];

    char *pdig = byte_str;

    switch (encoding [pos + 1]) {
    case 'd': {   // decimal escape
        const unsigned hundreds = last_byte / 100;
        // the tens digit must be reduced modulo 10; computing it
        // as (last_byte - hundreds) / 10 yields values above 9
        // for any last_byte of 100 or more
        const unsigned tens     = last_byte / 10 % 10;
        const unsigned units    = last_byte % 10;

        *pdig++ = 'd';

        if (hundreds)
            *pdig++ = xdigits [hundreds];

        *pdig++ = xdigits [tens];
        *pdig++ = xdigits [units];
        *pdig   = '\0';
        break;
    }

    case 'x': {   // hex escape
        const unsigned hi = last_byte >> 4;
        const unsigned lo = last_byte & 0xfU;

        *pdig++ = 'x';
        *pdig++ = xdigits [hi];
        *pdig++ = xdigits [lo];
        *pdig   = '\0';
        break;
    }

    default: {   // octal escape
        const unsigned hi  = last_byte >> 6;
        const unsigned mid = (last_byte >> 3) & 07U;
        const unsigned lo  = last_byte & 07U;

        if (hi)
            *pdig++ = xdigits [hi];

        *pdig++ = xdigits [mid];
        *pdig++ = xdigits [lo];
        *pdig   = '\0';
    }

    }   // switch

    // replace the last escape sequence (everything that follows
    // the last escape character) with the new one
    encoding.replace (pos + 1, std::string::npos, byte_str);

    return true;
}
// convert the human readable representation of a multibyte character
// (a series of escape sequences) to the multibyte character itself,
// one byte per escape sequence as converted by the scanner
std::string Charmap::
encoding_to_mbchar (const std::string &encoding) const
{
    std::string result;

    const char *cursor = encoding.c_str ();

    // convert_escape() is handed the address of the cursor so that
    // it can move it past the escape sequence it has converted
    while (cursor && *cursor) {
        const char byte = char (scanner_.convert_escape (cursor, &cursor));

        result += byte;
    }

    return result;
}
// convert a symbolic character name of the form "<Uxxxx...>", where
// each x is a hexadecimal digit, to the wchar_t with that value
//
// a name that is shorter than 4 characters or that doesn't start
// with "<U" followed by a hexadecimal digit, and a value that is
// not less than _RWSTD_WCHAR_MAX, are reported through issue_diag()
// as E_UCS
wchar_t Charmap::
convert_sym_to_ucs (const std::string &sym) const
{
    typedef unsigned char UChar;

    std::string::const_iterator it (sym.begin ());

    // the argument of isxdigit() is converted to unsigned char
    // since the behavior of the function is undefined for negative
    // values other than EOF
    if (   sym.size () < 4 || *it != '<' || *++it != 'U'
        || !(std::isxdigit)(UChar (*++it))) {
        issue_diag (E_UCS, true, 0,
                    "Unable to convert symbolic name %s to UCS.\n",
                    sym.c_str ());
    }

    // parse the digits from the NUL-terminated array returned
    // by c_str(), starting at the position the iterator refers to
    const char* const digits = sym.c_str () + (it - sym.begin ());

    const unsigned long val = std::strtoul (digits, (char**)0, 16);

    if (_RWSTD_WCHAR_MAX <= val)
        issue_diag (E_UCS, true, 0,
                    "UCS value %lu of symbolic character %s out of range.\n",
                    val, sym.c_str ());

    return wchar_t (val);
}
// convert the locale's encoded character to UCS4/UCS2 wchar_t
bool Charmap::convert_to_ucs (const std::string &sym_name,
const std::string &encoding, wchar_t& wc)
{
#ifndef _WIN32
if (in_utf8_) {
wc = utf8_decode (encoding.c_str (), &*(encoding.end () - 1));
return true;
}
// allocate enough space for the longest possible UTF-8 character
char utf8_enc [8 + 1 /* NUL */];
const char* const ch_end =
convert_to_utf8 (encoding.c_str (), encoding.size (),
utf8_enc, sizeof utf8_enc);
if (ch_end)
// only if conversion to utf8 succeeded
wc = utf8_decode (utf8_enc, ch_end);
else
// if not, try to convert the symbolic name directly
wc = convert_sym_to_ucs (sym_name);
return true;
#else
if (0 != codepage_) {
wchar_t ret[2] = {0};
const int res = MultiByteToWideChar (codepage_, 0,
encoding.c_str(), -1,
ret, 2);
if (!res && ERROR_INVALID_PARAMETER == GetLastError ()) {
// the required codepage conversion table is not installed
wc = convert_sym_to_ucs (sym_name);
return true;
}
if (!res || ret[1] != 0)
return false;
wc = ret[0];
return true;
}
wc = convert_sym_to_ucs (sym_name);
return true;
#endif // _WIN32
}
// record the character with the symbolic name sym_name in the
// character maps
// encoding is either the human readable representation of the
// character (a series of escape sequences) or, when is_mbchar
// is true, the multibyte character itself
void Charmap::add_to_cmaps (const std::string &sym_name,
                            const std::string &encoding,
                            bool is_mbchar /* = false */)
{
    // compute the external (multibyte) encoding of the character
    // unless that has already been done by the caller
    std::string mbchar;

    if (is_mbchar)
        mbchar = encoding;
    else
        mbchar = encoding_to_mbchar (encoding);

    symnames_list_.push_back (sym_name);

    if (1 == mbchar.size ()) {
        // single-byte character: enter it and its symbolic name
        // into the requested narrow character maps and keep track
        // of the largest narrow character value seen
        const unsigned char ch = mbchar [0];

        if (forward_maps)
            n_cmap_.insert (std::make_pair (sym_name, ch));

        if (reverse_maps)
            rn_cmap_.insert (std::make_pair (ch, sym_name));

        if (largest_nchar_ < ch)
            largest_nchar_ = ch;
    }

    // (try to) compute the wide character value of the character
    wchar_t wch;

    const bool have_wc = convert_to_wc (sym_name, mbchar, wch);

    if (have_wc) {
        // enter the wide character and its symbolic name into
        // the requested wide character maps
        if (forward_maps)
            w_cmap_.insert (std::make_pair (sym_name, wch));

        if (reverse_maps)
            rw_cmap_.insert (std::make_pair (wch, sym_name));

        // the multibyte character maps are populated regardless
        // of which of the other maps have been requested
        mb_cmap_.insert (std::make_pair (mbchar, wch));
        rmb_cmap_.insert (std::make_pair (wch, mbchar));
    }

    // compute the UCS value of the character
    wchar_t uch;

    const bool have_ucs = convert_to_ucs (sym_name, mbchar, uch);

    if (have_ucs) {
        // enter the UCS character and its symbolic name into
        // the UCS character maps
        ucs4_cmap_.insert (std::make_pair (sym_name, uch));
        rucs4_cmap_.insert (std::make_pair (uch, sym_name));
    }
}
// process the characters implicitly defined by using ellipsis between
// two explicitly defined characters
//
//   beg_tok        token holding the symbolic name at the lower end
//                  of the range; the upper end and the encoding of
//                  the lower end are read from the scanner
//   num_ellipsis   2 for a hexadecimal symbolic ellipsis; any other
//                  value is treated as a decimal ellipsis
//
// the character at the lower end of the range and every generated
// character are added to the character maps; the name of each
// generated character consists of the non-numeric prefix of the
// lower name, the number zero-padded to the width of the number
// in the lower name, and '>'; its encoding is obtained by
// incrementing the last byte of the preceding encoding
//
// returns the number of generated characters (the one at the lower
// end of the range is not counted)
std::size_t Charmap::
process_ellipsis (const Scanner::token_t &beg_tok, int num_ellipsis)
{
    // the <cctype> classification functions must only be passed
    // values representable as unsigned char (or EOF)
    typedef unsigned char UChar;

    // get the upper end of the range denoted by the ellipsis
    const Scanner::token_t end_tok = scanner_.next_token ();

    // get the human readable encoding of the character
    // denoted by the lower end of the ellipsis
    const std::string encoding = scanner_.next_token ().name;

    // convert the encoding to a multibyte character
    std::string mbchar = encoding_to_mbchar (encoding);

    // add the beg_tok symbol name to the maps
    add_to_cmaps (beg_tok.name, mbchar, true);

    const bool hex  = 2 == num_ellipsis;
    const int  base = hex ? 16 : 10;   // numeric base

    // extract the numeric portion of the symbolic character name
    // denoted by the lower end of the ellipsis
    std::size_t idx = 0;

    const std::size_t beg_len = beg_tok.name.size ();

    if (hex) {
        // advance to the first hex digit
        while (idx < beg_len && !(std::isxdigit)(UChar (beg_tok.name [idx])))
            ++idx;
    }
    else {
        // advance to the first decimal digit
        while (idx < beg_len && !(std::isdigit)(UChar (beg_tok.name [idx])))
            ++idx;
    }

    // length of non-numeric prefix of the symbolic character name
    const std::size_t pfx_len = idx;

    // get the character value plus one (since the first value
    // has already been added to the map earlier)
    char *num_end;

    const unsigned long beg_val =
        1 + std::strtoul (beg_tok.name.c_str () + pfx_len, &num_end, base);

    // the length of the numeric portion
    const std::size_t num_size =
        num_end - (beg_tok.name.c_str () + pfx_len);

    // find the end of the range denoted by the ellipsis
    idx = 0;

    const std::size_t end_len = end_tok.name.size ();

    if (hex) {
        // advance to the first hex digit
        while (idx < end_len && !(std::isxdigit)(UChar (end_tok.name [idx])))
            ++idx;
    }
    else {
        // advance to the first decimal digit
        while (idx < end_len && !(std::isdigit)(UChar (end_tok.name [idx])))
            ++idx;
    }

    const unsigned long end_val =
        std::strtoul (end_tok.name.c_str () + idx, (char**)0, base);

    // the ending numeric value must not be less than beg_val (which
    // is already one past the first character, so a range whose two
    // ends name the same character is diagnosed as well)
    if (end_val < beg_val)
        issue_diag (E_RANGE, true, &end_tok,
                    "invalid range found in character map file\n");

    // the names are assembled in a std::string rather than formatted
    // into a fixed-size array so that neither a long prefix nor a wide
    // numeric field can overflow it; only the number itself is
    // formatted with sprintf() into a buffer that is large enough
    // for any unsigned long in decimal or hexadecimal notation
    const std::string prefix (beg_tok.name, 0, pfx_len);

    char digits [3 * sizeof (unsigned long) + 1];

    std::size_t nchars = 0;

    for (unsigned long val = beg_val; val <= end_val; ++val, ++nchars) {

        const int ndigits = std::sprintf (digits, hex ? "%lX" : "%lu", val);

        std::string next_name (prefix);

        // pad the number with zeros on the left to the same width
        // as the number in the name at the lower end of the range
        if (0 <= ndigits && std::size_t (ndigits) < num_size)
            next_name.append (num_size - ndigits, '0');

        next_name += digits;
        next_name += '>';

        // increment the last byte of the multibyte character
        // and if the result is valid (i.e., doesn't contain
        // an embedded NUL) add the generated name and the
        // multibyte character to the maps
        const unsigned char last_byte = mbchar [mbchar.size () - 1];

        if (last_byte < UCHAR_MAX) {
            mbchar [mbchar.size () - 1] = last_byte + 1;
            add_to_cmaps (next_name, mbchar, true);
        }
        else {
            // an ellipsis must not specify a range that includes
            // an encoding with an embedded NUL
            issue_diag (E_RANGE, true, &beg_tok,
                        "encoding of an element in range contains NUL\n");
        }
    }

    // return the number of characters denoted by the ellipsis
    return nchars;
}
// process all the characters in the character map file, i.e., every
// line from the current position of the scanner up to the token
// that closes the CHARMAP section, adding the characters defined
// on each line to the character maps; when forward maps are being
// created also check that the portable character set is defined
void Charmap::process_chars()
{
    issue_diag (I_STAGE, false, 0, "processing CHARMAP section\n");

    // counters reported in the informational message below
    std::size_t ntokens = 0;
    std::size_t nellips = 0;
    std::size_t nchars  = 0;

    next = scanner_.next_token();

    Scanner::token_t nextnext;

    // loop until we find the closing charmap token
    for ( ; next.token != Scanner::tok_charmap; ++ntokens) {

        switch (next.token) {

        case Scanner::tok_nl:
        case Scanner::tok_end:
            break;

        case Scanner::tok_sym_name:
            // the next token may be either ellipsis if this line
            // of the charmap is in the form:
            // "%s...%s %s\n", <sym_name>, <sym_name>, <encoding>
            // or an encoding if this line is in the format:
            // "%s %s\n", <sym_name>, <encoding>
            nextnext = scanner_.next_token ();

            ntokens += 3;

            switch (nextnext.token) {

            case Scanner::tok_abs_ellipsis:
                // absolute ellipsis (see ISO/IEC TR 14652)
                nchars += process_ellipsis (next, 3);
                ++nellips;
                break;

            case Scanner::tok_hex_ellipsis:
                // hexadecimal symbolic ellipsis (see ISO/IEC TR 14652)
                nchars += process_ellipsis (next, 2);
                ++nellips;
                break;

            case Scanner::tok_char_value:
                // character represented as a numeric constant
                add_to_cmaps (next.name, nextnext.name);
                ++nchars;
                break;

            default:
                issue_diag (E_SYNTAX, true, &next,
                            "byte value expected following symbolic "
                            "name in character map file\n");
            }

            // skip the rest of the line
            scanner_.ignore_line ();
            break;

        default:
            issue_diag (E_SYNTAX, true, &next,
                        "symbolic name expected in character map file\n");
            break;
        }

        next = scanner_.next_token();
    }

    // the counters are converted to unsigned long to match the %lu
    // directives (size_t need not be the same type as unsigned long)
    issue_diag (I_STAGE, false, 0,
                "done processing CHARMAP section (%lu tokens, "
                "%lu ellipses, %lu characters)\n",
                static_cast<unsigned long>(ntokens),
                static_cast<unsigned long>(nellips),
                static_cast<unsigned long>(nchars));

    // make sure that all characters in the portable character set
    // are in the charmap
    if (forward_maps)
        verify_portable_charset();
}
// issue a W_NOPCS diagnostic for every named member of the portable
// character set whose symbolic name is missing from the narrow
// character map
void Charmap::verify_portable_charset () const
{
    const char* const* const pcs_end =
          portable_charset
        + sizeof portable_charset / sizeof *portable_charset;

    for (const char* const* pname = portable_charset;
         pname != pcs_end; ++pname) {

        // null entries denote characters with no symbolic name
        if (0 != *pname && n_cmap_.end () == n_cmap_.find (*pname))
            issue_diag (W_NOPCS, false, 0,
                        "member of portable character set %s not found "
                        "in the character map\n", *pname);
    }
}
// construct a Charmap by reading the character map file fname
//
//   Clocale               name of the corresponding C library locale
//                         (may be empty)
//   fname                 pathname of the character map file
//   in_utf8               true when the character map is in UTF-8
//   create_forward_maps   populate the symbolic name to character maps
//   create_reverse_maps   populate the character to symbolic name maps
//   use_UCS4              use UCS as the internal wchar_t encoding
//
// the whole file is processed here: the <code_set_name> and
// <mb_cur_max> declarations, the CHARMAP section (see process_chars()),
// and the WIDTH section, which is skipped
Charmap::Charmap(const char* Clocale,
                 const char* fname,
                 bool in_utf8, bool create_forward_maps,
                 bool create_reverse_maps, bool use_UCS4)
    : mb_cur_max_(1),
      charmap_name_ (fname),
      Clocale_ (Clocale),
      largest_nchar_(0),
      in_utf8_(in_utf8),
      forward_maps (create_forward_maps),
      reverse_maps (create_reverse_maps),
      UCS4_internal_ (use_UCS4)
{
#ifndef _RWSTD_NO_ICONV
    ic_to_utf8_ = 0;
    ic_to_ext_  = 0;
#endif   // _RWSTD_NO_ICONV

#ifdef _WIN32
    // convert_to_ucs() reads the codepage even when the file has
    // no <code_set_name> declaration to set it from
    codepage_ = 0;
#endif   // _WIN32

    scanner_.open (fname, '#', '\\');

    // set code_set_name to the name of the character set description
    // file by default, in case it's not explicitly specified
    const char* const slash = std::strrchr (fname, _RWSTD_PATH_SEP);
    code_set_name_ = slash ? slash + 1 : fname;

    // loop until we reach the end of the file
    while ((next = scanner_.next_token()).token != Scanner::tok_end_tokens) {

        switch (next.token) {

        case Scanner::tok_code_set_name:
            next = scanner_.next_token ();

            if (next.token == Scanner::tok_string) {
                // drop the first and the last character of the token
                // (presumably the quotes enclosing the string)
                code_set_name_ = next.name.substr (1, next.name.size () - 2);
            }
            else if (next.token == Scanner::tok_ndef) {
                code_set_name_ = next.name;
            }
            else
                issue_diag (E_SYNTAX, true, &next,
                            "string expected following <code_set_name>\n");

            // we always need a iconv to utf8 so that we can create
            // the utf8_charmap unless we are on windows
#ifndef _RWSTD_NO_ICONV
            if (!in_utf8_) {
                ic_to_utf8_ = open_iconv_to_utf8 ();
#  if !defined (_RWSTD_NO_ISO_10646_WCHAR_T)
                ic_to_ext_ = open_iconv_to_ext ();
#  endif   // _RWSTD_NO_ISO_10646_WCHAR_T
            }
#else   // if defined (_RWSTD_NO_ICONV)
#  ifdef _WIN32
            codepage_ = get_codepage (code_set_name_);
            if (codepage_ == 0) {
                issue_diag (W_ICONV, false, 0,
                            "iconv_open (%s to UTF-8) failed\n",
                            code_set_name_.c_str());
            }
#  endif   // _WIN32
#endif   // _RWSTD_NO_ICONV

            scanner_.ignore_line ();
            break;

        case Scanner::tok_mb_cur_max:
            mb_cur_max_ = std::atoi (scanner_.next_token ().name.c_str ());
            scanner_.ignore_line ();
            break;

        case Scanner::tok_mb_cur_min:
            scanner_.ignore_line ();
            break;

        case Scanner::tok_charmap:
            scanner_.ignore_line ();
            process_chars();
            break;

        case Scanner::tok_width:
            // ignore the width section of the character map; also
            // stop if the scanner reports the end of the input so
            // that a section with no closing token cannot keep the
            // loop from terminating
            do {
                next = scanner_.next_token ();
            } while (   next.token != Scanner::tok_width
                     && next.token != Scanner::tok_end_tokens);

            // nothing is left to process at the end of the input
            if (next.token == Scanner::tok_end_tokens)
                return;

            break;

        case Scanner::tok_nl:
            break;

        default:
            issue_diag (E_SYNTAX, false, &next,
                        "unknown token %s in character map file\n",
                        next.name.c_str ());
        }
    }
}