blob: 9930d95947902b45f809cc2fc3e2870342658ea3 [file] [log] [blame]
/***************************************************************************
*
* def.cpp
*
* $Id$
*
***************************************************************************
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright
* ownership. The ASF licenses this file to you under the Apache
* License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* Copyright 2001-2006 Rogue Wave Software.
*
**************************************************************************/
// #ifndef _RWSTD_NO_PURE_C_HEADERS
// # define _RWSTD_NO_PURE_C_HEADERS
// #endif // _RWSTD_NO_PURE_C_HEADERS
// #ifndef _RWSTD_NO_DEPRECATED_C_HEADERS
// # define _RWSTD_NO_DEPRECATED_C_HEADERS
// #endif // _RWSTD_NO_DEPRECATED_C_HEADERS
#ifdef __DECCXX
# undef __PURE_CNAME
#endif // __DECCXX
#include <algorithm>
#include <fstream>
#include <iostream>
#include <locale>
#include <map>
#include <string>
#include <vector>
#include <cassert>
#include <cctype>
#include <cerrno>
#include <climits>
#include <clocale>
#include <cstdio>
#include <cstdlib>
#include <cstring> // for memset()
#include "aliases.h"
#include "def.h"
#include "diagnostic.h"
#include "loc_exception.h"
#include "path.h"
#define UTF8_MAX_SIZE 6
// convert_to_ext converts a wchar_t value with some encoding into
// a narrow character string in the current locale's encoding
std::string Def::convert_to_ext (wchar_t val)
{
rmb_cmap_iter it;
if ((it = charmap_.get_rmb_cmap().find(val))
!= charmap_.get_rmb_cmap().end()){
return it->second;
}
issue_diag (E_CVT2EXT, true, 0,
"unable to convert character %d to external "
"representation\n", val);
return std::string("");
}
// convert the wchar_t value into a utf8 string
std::string Def::utf8_encode (wchar_t wc)
{
unsigned int wc_int = _RWSTD_STATIC_CAST (unsigned int, wc);
std::string ret;
std::size_t size = 0;
char buf[UTF8_MAX_SIZE + 1];
char* bufp = buf;
if (wc_int < 0x80)
{
size = 1;
*bufp++ = wc_int;
}
else
{
int b;
for (b = 2; b < UTF8_MAX_SIZE; b++)
if ((wc_int & (~(wchar_t)0 << (5 * b + 1))) == 0)
break;
size = b;
*bufp = (unsigned char) (~0xff >> b);
--b;
do
{
bufp[b] = 0x80 | (wc_int & 0x3f);
wc_int >>= 6;
}
while (--b > 0);
*bufp |= wc_int;
}
buf[size] = (char)0;
for (unsigned int i = 0; i < size; i++)
ret += buf[i];
return ret;
}
// copies the contents of the file `name' to the file `outname'
//
// Both files are opened in binary mode. An E_OPENRD diagnostic is issued
// when the source cannot be opened and an E_OPENWR diagnostic when the
// destination cannot be created. Once open, the source stream throws on
// badbit and the destination stream on failbit or badbit.
void Def::copy_file (const std::string& name, const std::string& outname)
{
    assert (!name.empty ());
    assert (!outname.empty ());

    // open the source
    std::ifstream src (name.c_str (), std::ios::binary);

    if (src.fail ()) {
        issue_diag (E_OPENRD, true,
                    &next, "unable to open locale database %s\n",
                    name.c_str ());
    }

    src.exceptions (std::ios::badbit);

    // create the destination
    std::ofstream dst (outname.c_str (), std::ios::binary);

    if (dst.fail ()) {
        issue_diag (E_OPENWR, true,
                    &next, "unable to create locale database %s\n",
                    outname.c_str ());
    }

    dst.exceptions (std::ios::failbit | std::ios::badbit);

    // transfer the whole stream buffer in one go
    dst << src.rdbuf ();
}
void Def::copy_category(int category, std::string name)
{
assert (name.size() > 0);
// create the name of the file to copy to and call copy_file
std::string outname (output_name_);
makedir (outname.c_str ());
switch (category) {
// append the category name to both 'name' and 'outname'
// and call the copy_file routine
// the xxx_written variable is set to true so that write_xxx
// does not overwrite the file that is written here
case LC_CTYPE:
(name += _RWSTD_PATH_SEP) += "LC_CTYPE";
(outname += _RWSTD_PATH_SEP) += "LC_CTYPE";
copy_file (name, outname);
ctype_written_ = true;
break;
case LC_COLLATE:
(name += _RWSTD_PATH_SEP) += "LC_COLLATE";
(outname += _RWSTD_PATH_SEP) += "LC_COLLATE";
copy_file(name, outname);
collate_written_ = true;
break;
case LC_MONETARY:
(name += _RWSTD_PATH_SEP) += "LC_MONETARY";
(outname += _RWSTD_PATH_SEP) += "LC_MONETARY";
copy_file(name, outname);
mon_written_ = true;
break;
case LC_NUMERIC:
(name += _RWSTD_PATH_SEP) += "LC_NUMERIC";
(outname += _RWSTD_PATH_SEP) += "LC_NUMERIC";
copy_file(name, outname);
num_written_ = true;
break;
case LC_TIME:
(name += _RWSTD_PATH_SEP) += "LC_TIME";
(outname += _RWSTD_PATH_SEP) += "LC_TIME";
copy_file(name, outname);
time_written_ = true;
break;
#ifdef LC_MESSAGES
case LC_MESSAGES:
(name += _RWSTD_PATH_SEP) += "LC_MESSAGES";
(outname += _RWSTD_PATH_SEP) += "LC_MESSAGES";
copy_file(name, outname);
messages_written_ = true;
break;
#endif // LC_MESSAGES
default:
break;
}
}
// strip a pair, which should be in the form '(<sym>,<sym2>)'
//
// The first element of the pair, including the enclosing angle
// brackets, is appended to sym, the second one to sym2. Nothing is
// done unless tok starts with '('. An E_PAIR diagnostic is issued
// when the pair is malformed; tok is never read past its end, even
// if issue_diag() returns.
void Def::strip_pair (const std::string &tok, std::string &sym,
                      std::string &sym2)
{
    std::size_t i = 0;

    if (tok[i] != '(')
        return;

    // first element: escape characters are dropped and the character
    // that follows each of them is copied verbatim
    if (tok[++i] == '<') {
        while (tok[i] != '>') {
            if (tok[i] == scanner_.escape_char ())
                i++;

            // the end of the token was reached before the closing '>'
            if (i >= tok.size ()) {
                issue_diag (E_PAIR, true, &next,
                            "invalid pair %s\n", tok.c_str ());
                return;
            }

            sym.push_back (tok[i++]);
        }
    }

    // append the closing '>' or, when the first element does not
    // start with '<', the character that follows the '('
    // (i <= tok.size () is guaranteed at this point)
    sym.push_back (tok[i++]);

    // there must be something left for the separator
    if (i >= tok.size ()) {
        issue_diag (E_PAIR, true, &next,
                    "invalid pair %s\n", tok.c_str ());
        return;
    }

    if (tok[i++] != ',')
        issue_diag (E_PAIR, true, &next,
                    "invalid pair %s\n", tok.c_str ());

    // second element
    // NOTE(review): unlike for the first element, escape characters
    // are copied to sym2 rather than dropped -- confirm which of the
    // two is intended; the existing behavior is preserved here
    if (tok[i] == '<') {
        while (tok[i] != '>') {
            if (tok[i] == scanner_.escape_char ())
                sym2.push_back (tok[i++]);

            // the end of the token was reached before the closing '>'
            if (i >= tok.size () || '\0' == tok[i]) {
                issue_diag (E_PAIR, true, &next,
                            "invalid pair %s\n", tok.c_str ());
                return;
            }

            sym2.push_back (tok[i++]);
        }
    }

    // append the closing '>' or, when the second element does not
    // start with '<', the character that follows the ','
    sym2.push_back (tok[i++]);
}
// converts str1, which is a string in the following format
// "[<sym_name>][char]" including the quotes, to a string of characters
//
// Symbolic names are looked up in the wide character map and converted
// with convert_to_ext(); all other characters are copied as they are.
// An empty string is returned when a symbolic name is not found in the
// map. If the end of str1 is reached before the closing quote the string
// is assumed to span multiple lines: the next token is read from the
// scanner into the member `next' and, unless it is itself a string
// token, scanning continues in the name of that token.
std::string Def::convert_string (const std::string &str1)
{
    assert (str1[0] == '\"');

    std::string ret;
    std::string sym;

    // the index starts at 1 so that we ignore the initial '"'
    int idx = 1;
    const char* str = str1.c_str();

    while (str[idx] != '\"') {

        sym.clear();

        // if we reach the null-terminator before we see an end-quote
        // then we must have a multi-line string, so get the next token
        if (str[idx] == '\0') {
            if((next = scanner_.next_token()).token == Scanner::tok_string)
                break;

            // NOTE(review): the code below assumes that the name of
            // the token is not empty -- TODO confirm with the scanner
            str = next.name.c_str();
            idx = 0;
        }

        // '<' marks the beginning of a symbolic name
        // construct the name and look up its value in the cmap
        if (str[idx] == '<') {
            while (str [idx] && str [idx] != '>') {
                if (str[idx] == scanner_.escape_char ())
                    idx++;

                // an escape character at the very end of the token
                // has nothing to escape; do not step over the
                // terminating null character
                if ('\0' == str [idx])
                    break;

                sym += str[idx++];
            }

            // append the closing '>' unless the name is unterminated
            if (str [idx])
                sym += str [idx++];

            w_cmap_iter w_pos = charmap_.get_w_cmap().find (sym);
            if (w_pos != charmap_.get_w_cmap().end()) {
                ret += convert_to_ext(w_pos->second);
            }
            else {
                return std::string();
            }
        }
        // the definition file contains a string with non-symbol names.
        // process each character as its actual character value.
        // Locale definitions that use this may not be portable.
        else {
            ret += (char)str[idx++];
        }
    }

    return ret;
}
#ifndef _RWSTD_NO_WCHAR_T
// converts a collating element definition to an array of wide characters
// (the wide characters the collating element is composed of).
// this overload deals with collating elements defined through
// a sequence of symbolic names, NOT enclosed within quotes.
std::wstring
Def::convert_wstring (const StringVector& sym_array)
{
std::wstring ret;
StringVector::const_iterator it = sym_array.begin ();
while (it != sym_array.end ()) {
// lookup the symbol we just constructed
w_cmap_iter w_pos = charmap_.get_w_cmap().find (*it);
if (w_pos != charmap_.get_w_cmap().end()) {
ret += w_pos->second;
it++;
}
else {
// we return an empty string if we couldn't find any character
// in the character map
ret.clear();
return ret;
}
}
return ret;
}
// this overload deals with collating elements defined through
// a sequence of characters or symbolic names, enclosed within quotes.
//
// The name of the token is scanned up to the closing quote if it starts
// with a quote, otherwise up to its end. Symbolic names are looked up in
// the wide character map, all other characters are converted to wchar_t
// one by one. An empty string is returned when a symbolic name is not
// found in the map. An E_SYMEND diagnostic is issued for a symbolic name
// with no closing '>'.
std::wstring
Def::convert_wstring (const token_t& t)
{
    std::wstring ret;
    std::string sym;

    const std::string str1 (t.name);
    const char* const str = str1.c_str();

    int idx = 0;
    char term = 0;

    // skip first character if quote
    if (str[idx] == '\"') {
        term = '\"';
        idx++;
    }

    // stop at the terminator and, so as not to run past the end of a
    // quoted string with no closing quote, at the null character
    while (str[idx] != term && str[idx] != '\0') {

        // '<' marks the beginning of a symbolic name
        // construct the name and look up its value in the cmap
        if (str[idx] == '<') {

            sym.clear();

            while (str[idx] != '>') {
                // drop the escape character, keep what follows it
                if (str[idx] == scanner_.escape_char ())
                    idx++;

                if ('\0' == str[idx]) {
                    issue_diag (E_SYMEND, true, &t,
                                "end of symbolic name not found\n");

                    // only reached if issue_diag() returns: give up
                    // instead of spinning at the end of the string
                    ret.clear();
                    return ret;
                }

                sym += str[idx++];
            }

            // this is safe because the while loop ended with *str == '>'
            sym += str[idx++];

            // lookup the symbol we just constructed
            w_cmap_iter w_pos = charmap_.get_w_cmap().find (sym);

            if (w_pos == charmap_.get_w_cmap().end()) {
                // if we can't find a symbol then return an empty string,
                // most likely this will happen if inside a collating-element
                // the user uses a character that is not in the current
                // codeset, in this case the collating element will be ignored
                ret.clear();
                return ret;
            }

            ret += w_pos->second;
        }
        // the definition file contains a string with non-symbol names.
        // process each character as its actual character value.
        // Locale definitions that use this may not be portable.
        // NOTE(review): where char is signed, characters with the high
        // bit set are sign-extended by this conversion -- confirm that
        // this is intended
        else
            ret += (wchar_t)str[idx++];
    }

    return ret;
}
#endif // _RWSTD_NO_WCHAR_T
// automatically fill any categories that depend on other categories
void Def::auto_fill ()
{
mask_iter mask_pos;
for (std::size_t i = 0; i <= UCHAR_MAX; i++) {
if ( ctype_out_.mask_tab[i] & std::ctype_base::upper
|| ctype_out_.mask_tab[i] & std::ctype_base::lower
|| ctype_out_.mask_tab[i] & std::ctype_base::alpha
|| ctype_out_.mask_tab[i] & std::ctype_base::digit
|| ctype_out_.mask_tab[i] & std::ctype_base::xdigit
|| ctype_out_.mask_tab[i] & std::ctype_base::punct)
ctype_out_.mask_tab[i] |= std::ctype_base::print;
if ( ctype_out_.mask_tab[i] & std::ctype_base::upper
|| ctype_out_.mask_tab[i] & std::ctype_base::lower)
ctype_out_.mask_tab[i] |= std::ctype_base::alpha;
if ( ctype_out_.mask_tab[i] & std::ctype_base::upper
|| ctype_out_.mask_tab[i] & std::ctype_base::lower
|| ctype_out_.mask_tab[i] & std::ctype_base::alpha
|| ctype_out_.mask_tab[i] & std::ctype_base::digit
|| ctype_out_.mask_tab[i] & std::ctype_base::xdigit
|| ctype_out_.mask_tab[i] & std::ctype_base::punct)
ctype_out_.mask_tab[i] |= std::ctype_base::graph;
}
for (mask_pos = mask_.begin(); mask_pos != mask_.end(); mask_pos++) {
// all lower, alpha, digit, xdigit, and punct, and space
// characters are automatically print
if ( mask_pos->second & std::ctype_base::upper
|| mask_pos->second & std::ctype_base::lower
|| mask_pos->second & std::ctype_base::alpha
|| mask_pos->second & std::ctype_base::digit
|| mask_pos->second & std::ctype_base::xdigit
|| mask_pos->second & std::ctype_base::punct)
// || mask_pos->second & std::ctype_base::space)
mask_pos->second |= std::ctype_base::print;
// all upper and lower characters are alpha
if ( mask_pos->second & std::ctype_base::upper
|| mask_pos->second & std::ctype_base::lower)
mask_pos->second |= std::ctype_base::alpha;
// all upper, lower, alpha, digit, xdigit, and punct characters
// are graph characters
if ( mask_pos->second & std::ctype_base::upper
|| mask_pos->second & std::ctype_base::lower
|| mask_pos->second & std::ctype_base::alpha
|| mask_pos->second & std::ctype_base::digit
|| mask_pos->second & std::ctype_base::xdigit
|| mask_pos->second & std::ctype_base::punct)
mask_pos->second |= std::ctype_base::graph;
}
}
// reads tokens from the scanner until the tok_end_tokens token, passing
// control to the function that processes the category each section
// keyword introduces; newline tokens are skipped and the rest of the
// line is ignored after a comment or any other token. Finishes by
// calling auto_fill().
void Def::process_input ()
{
    for (; ; ) {

        next = scanner_.next_token ();

        if (Scanner::tok_end_tokens == next.token)
            break;

        switch (next.token) {

        case Scanner::tok_ctype:    process_ctype ();    break;
        case Scanner::tok_collate:  process_collate ();  break;
        case Scanner::tok_monetary: process_monetary (); break;
        case Scanner::tok_numeric:  process_numeric ();  break;
        case Scanner::tok_time:     process_time ();     break;
        case Scanner::tok_messages: process_messages (); break;

        case Scanner::tok_nl:
            // nothing to do for an empty line
            break;

        case Scanner::tok_comment:
        default:
            // comments and anything unexpected: skip the rest of the line
            scanner_.ignore_line ();
            break;
        }
    }

    auto_fill ();
}
// initializes the object so that no category is marked as found or
// written, resets the output structures, and opens `filename' with
// the scanner; the definition is not processed here (see
// process_input())
Def::Def (const char* filename, const char* out_name, Charmap& char_map,
          bool no_position)
    : warnings_occurred_ (false),
      scan_ahead_ (false),
      next_offset_ (0),
      output_name_ (out_name),
      charmap_ (char_map),
      ctype_written_ (false),
      codecvt_written_ (false),
      collate_written_ (false),
      time_written_ (false),
      num_written_ (false),
      mon_written_ (false),
      messages_written_ (false),
      ctype_def_found_ (false),
      collate_def_found_ (false),
      time_def_found_ (false),
      num_def_found_ (false),
      mon_def_found_ (false),
      messages_def_found_ (false),
      undefined_keyword_found_ (false),
      no_position_ (no_position)
{
    // zero out the ctype and time output structures
    std::memset (&ctype_out_, 0, sizeof ctype_out_);
    std::memset (&time_out_, 0, sizeof time_out_);

    // invalidate format characters by setting each to CHAR_MAX
    // as specified by the C function localeconv(); element [1]
    // of each array holds the int'l format
    for (int fmt = 0; fmt != 2; ++fmt) {
        mon_out_.frac_digits    [fmt] = CHAR_MAX;
        mon_out_.p_cs_precedes  [fmt] = CHAR_MAX;
        mon_out_.p_sep_by_space [fmt] = CHAR_MAX;
        mon_out_.n_cs_precedes  [fmt] = CHAR_MAX;
        mon_out_.n_sep_by_space [fmt] = CHAR_MAX;
        mon_out_.p_sign_posn    [fmt] = CHAR_MAX;
        mon_out_.n_sign_posn    [fmt] = CHAR_MAX;
    }

    // the groupings start out with a single CHAR_MAX element
    mon_st_.mon_grouping += CHAR_MAX;
    num_st_.grouping += CHAR_MAX;

    // collation defaults
    collate_out_.largest_ce     = 1;
    collate_out_.longest_weight = 1;
    collate_out_.num_wchars     = 0;

    std::memset (collate_out_.weight_type, 0,
                 sizeof collate_out_.weight_type);

    // initialize all extensions to 0
    ctype_out_.ctype_ext_off     = 0;
    num_out_.numeric_ext_off     = 0;
    collate_out_.collate_ext_off = 0;
    mon_out_.monetary_ext_off    = 0;
    time_out_.time_ext_off       = 0;

    // open the locale definition file for scanning
    scanner_.open (filename);
}
// deletes the weights array of every element of the collation map
Def::~Def ()
{
    coll_map_iter entry = coll_map_.begin ();

    while (entry != coll_map_.end ()) {
        delete[] entry->second.weights;
        ++entry;
    }
}