| /*************************************************************************** |
| * |
| * def.cpp |
| * |
| * $Id$ |
| * |
| *************************************************************************** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed |
| * with this work for additional information regarding copyright |
| * ownership. The ASF licenses this file to you under the Apache |
| * License, Version 2.0 (the "License"); you may not use this file |
| * except in compliance with the License. You may obtain a copy of |
| * the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
| * implied. See the License for the specific language governing |
| * permissions and limitations under the License. |
| * |
| * Copyright 2001-2006 Rogue Wave Software. |
| * |
| **************************************************************************/ |
| |
| // #ifndef _RWSTD_NO_PURE_C_HEADERS |
| // # define _RWSTD_NO_PURE_C_HEADERS |
| // #endif // _RWSTD_NO_PURE_C_HEADERS |
| |
| // #ifndef _RWSTD_NO_DEPRECATED_C_HEADERS |
| // # define _RWSTD_NO_DEPRECATED_C_HEADERS |
| // #endif // _RWSTD_NO_DEPRECATED_C_HEADERS |
| |
| #ifdef __DECCXX |
| # undef __PURE_CNAME |
| #endif // __DECCXX |
| |
| #include <algorithm> |
| #include <fstream> |
| #include <iostream> |
| #include <locale> |
| #include <map> |
| #include <string> |
| #include <vector> |
| |
| #include <cassert> |
| #include <cctype> |
| #include <cerrno> |
| #include <climits> |
| #include <clocale> |
| #include <cstdio> |
| #include <cstdlib> |
| #include <cstring> // for memset() |
| |
| #include "aliases.h" |
| #include "def.h" |
| #include "diagnostic.h" |
| #include "loc_exception.h" |
| #include "path.h" |
| |
| |
| #define UTF8_MAX_SIZE 6 |
| |
| |
| // convert_to_ext converts a wchar_t value with some encoding into |
| // a narrow character string in the current locale's encoding |
| std::string Def::convert_to_ext (wchar_t val) |
| { |
| rmb_cmap_iter it; |
| if ((it = charmap_.get_rmb_cmap().find(val)) |
| != charmap_.get_rmb_cmap().end()){ |
| return it->second; |
| |
| } |
| |
| issue_diag (E_CVT2EXT, true, 0, |
| "unable to convert character %d to external " |
| "representation\n", val); |
| |
| return std::string(""); |
| } |
| |
| |
| // convert the wchar_t value into a utf8 string |
| std::string Def::utf8_encode (wchar_t wc) |
| { |
| unsigned int wc_int = _RWSTD_STATIC_CAST (unsigned int, wc); |
| |
| std::string ret; |
| std::size_t size = 0; |
| char buf[UTF8_MAX_SIZE + 1]; |
| char* bufp = buf; |
| |
| if (wc_int < 0x80) |
| { |
| size = 1; |
| *bufp++ = wc_int; |
| } |
| else |
| { |
| int b; |
| |
| for (b = 2; b < UTF8_MAX_SIZE; b++) |
| if ((wc_int & (~(wchar_t)0 << (5 * b + 1))) == 0) |
| break; |
| size = b; |
| |
| *bufp = (unsigned char) (~0xff >> b); |
| --b; |
| do |
| { |
| bufp[b] = 0x80 | (wc_int & 0x3f); |
| wc_int >>= 6; |
| } |
| while (--b > 0); |
| *bufp |= wc_int; |
| } |
| buf[size] = (char)0; |
| for (unsigned int i = 0; i < size; i++) |
| ret += buf[i]; |
| return ret; |
| |
| } |
| |
| void Def::copy_file (const std::string& name, const std::string& outname) |
| { |
| assert (name.size() > 0); |
| assert (outname.size() > 0); |
| |
| std::ifstream from (name.c_str(), std::ios::binary); |
| if (!from) { |
| issue_diag (E_OPENRD, true, |
| &next, "unable to open locale database %s\n", |
| name.c_str()); |
| } |
| from.exceptions (std::ios::badbit); |
| |
| std::ofstream to (outname.c_str(), std::ios::binary); |
| if (!to) { |
| issue_diag (E_OPENWR, true, |
| &next, "unable to create locale database %s\n", |
| outname.c_str()); |
| } |
| to.exceptions (std::ios::failbit | std::ios::badbit); |
| |
| // copy the file |
| to << from.rdbuf (); |
| } |
| |
| |
| void Def::copy_category(int category, std::string name) |
| { |
| assert (name.size() > 0); |
| |
| // create the name of the file to copy to and call copy_file |
| std::string outname (output_name_); |
| |
| makedir (outname.c_str ()); |
| |
| switch (category) { |
| // append the category name to both 'name' and 'outname' |
| // and call the copy_file routine |
| // the xxx_written variable is set to true so that write_xxx |
| // does not overwrite the file that is written here |
| case LC_CTYPE: |
| (name += _RWSTD_PATH_SEP) += "LC_CTYPE"; |
| (outname += _RWSTD_PATH_SEP) += "LC_CTYPE"; |
| copy_file (name, outname); |
| ctype_written_ = true; |
| |
| break; |
| case LC_COLLATE: |
| (name += _RWSTD_PATH_SEP) += "LC_COLLATE"; |
| (outname += _RWSTD_PATH_SEP) += "LC_COLLATE"; |
| copy_file(name, outname); |
| collate_written_ = true; |
| |
| break; |
| case LC_MONETARY: |
| (name += _RWSTD_PATH_SEP) += "LC_MONETARY"; |
| (outname += _RWSTD_PATH_SEP) += "LC_MONETARY"; |
| copy_file(name, outname); |
| mon_written_ = true; |
| |
| break; |
| |
| case LC_NUMERIC: |
| (name += _RWSTD_PATH_SEP) += "LC_NUMERIC"; |
| (outname += _RWSTD_PATH_SEP) += "LC_NUMERIC"; |
| copy_file(name, outname); |
| num_written_ = true; |
| |
| break; |
| case LC_TIME: |
| (name += _RWSTD_PATH_SEP) += "LC_TIME"; |
| (outname += _RWSTD_PATH_SEP) += "LC_TIME"; |
| copy_file(name, outname); |
| time_written_ = true; |
| |
| break; |
| |
| #ifdef LC_MESSAGES |
| case LC_MESSAGES: |
| (name += _RWSTD_PATH_SEP) += "LC_MESSAGES"; |
| (outname += _RWSTD_PATH_SEP) += "LC_MESSAGES"; |
| copy_file(name, outname); |
| messages_written_ = true; |
| break; |
| #endif // LC_MESSAGES |
| |
| default: |
| break; |
| } |
| } |
| |
| |
| // strip a pair, which should be in the form '(<sym>,<sym2>)' |
| void Def::strip_pair (const std::string &tok, std::string &sym, |
| std::string &sym2) |
| { |
| std::size_t i = 0; |
| |
| if(tok[i] == '(') { |
| if(tok[++i] == '<') |
| while (tok[i] != '>'){ |
| if (tok[i] == scanner_.escape_char ()) |
| i++; |
| sym.push_back(tok[i++]); |
| } |
| // this push_back is safe because the while loop above ends when |
| // tok[i] == '>' |
| sym.push_back(tok[i++]); |
| if (tok[i++] != ',') |
| issue_diag (E_PAIR, true, &next, |
| "invalid pair %s\n", tok.c_str()); |
| if (tok[i] == '<') |
| while (tok[i] != '>'){ |
| if (tok[i] == scanner_.escape_char ()) |
| sym2.push_back(tok[i++]); |
| if ('\0' != tok[i]) |
| sym2.push_back(tok[i++]); |
| else |
| issue_diag (E_PAIR, true, &next, |
| "invalid pair %s\n", tok.c_str()); |
| } |
| |
| // this push_back is safe because the while loop above ends when |
| // tok[i] == '>' |
| sym2.push_back(tok[i++]); |
| |
| } |
| } |
| |
| // converts str, which is a string in the following format |
| // "[<sym_name>][char]" including the quotes to a string of characters |
| // str is not a const reference because if the string spans multiple lines |
| // str is modified |
| std::string Def::convert_string (const std::string &str1) |
| { |
| assert (str1[0] == '\"'); |
| |
| std::string ret; |
| |
| std::string sym; |
| // the index starts at 1 so that we ignore the initial '"' |
| int idx = 1; |
| |
| const char* str = str1.c_str(); |
| while (str[idx] != '\"') { |
| sym.clear(); |
| // if we reach the null-terminator before we see an end-quote |
| // then we must have a multi-line string, so get the next token |
| if (str[idx] == '\0') { |
| if((next = scanner_.next_token()).token == Scanner::tok_string) |
| break; |
| str = next.name.c_str(); |
| idx = 0; |
| } |
| |
| // '<' marks the beginning of a symbolic name |
| // construct the name and look up its value in the cmap |
| if (str[idx] == '<') { |
| while (str [idx] && str [idx] != '>') { |
| if (str[idx] == scanner_.escape_char ()) |
| idx++; |
| sym += str[idx++]; |
| } |
| |
| // this is safe because the while loop ended with *str == '>' |
| if (str [idx]) |
| sym += str [idx++]; |
| |
| w_cmap_iter w_pos = charmap_.get_w_cmap().find (sym); |
| if (w_pos != charmap_.get_w_cmap().end()) { |
| ret += convert_to_ext(w_pos->second); |
| } |
| else { |
| return std::string(); |
| } |
| } |
| |
| // the definition file contains a sting with non-symbol names. |
| // process each character as it's actual character value. |
| // Locale definitions that use this may not be portable. |
| else { |
| ret += (char)str[idx++]; |
| |
| } |
| } |
| |
| return ret; |
| |
| |
| } |
| |
| #ifndef _RWSTD_NO_WCHAR_T |
| // converts a collating element definition to an array of wide characters |
| // (the wide characters the collating element is composed of). |
| |
| // this overload deals with collating elements defined through |
| // a sequence of symbolic names, NOT enclosed within quotes. |
| std::wstring |
| Def::convert_wstring (const StringVector& sym_array) |
| { |
| std::wstring ret; |
| StringVector::const_iterator it = sym_array.begin (); |
| while (it != sym_array.end ()) { |
| // lookup the symbol we just constructed |
| w_cmap_iter w_pos = charmap_.get_w_cmap().find (*it); |
| if (w_pos != charmap_.get_w_cmap().end()) { |
| ret += w_pos->second; |
| it++; |
| } |
| else { |
| // we return an empty string if we couldn't find any character |
| // in the character map |
| ret.clear(); |
| return ret; |
| } |
| } |
| |
| return ret; |
| } |
| |
| // this overload deals with collating elements defined through |
| // a sequence of characters or symbolic names, enclosed within quotes. |
| std::wstring |
| Def::convert_wstring (const token_t& t) |
| { |
| std::wstring ret; |
| std::string sym; |
| |
| std::string str1 (t.name); |
| |
| int idx = 0; |
| char term = 0; |
| const char* str = str1.c_str(); |
| |
| // skip first character if quote |
| if (str[idx] == '\"') { |
| term = '\"', idx++; |
| } |
| |
| while (str[idx] != term) { |
| sym.clear(); |
| |
| // '<' marks the beginning of a symbolic name |
| // construct the name and look up its value in the cmap |
| if (str[idx] == '<') { |
| while (str[idx] != '>') { |
| if (str[idx] == scanner_.escape_char ()) { |
| // sym += str[idx++]; |
| idx++; |
| } |
| |
| if ('\0' != str[idx]) |
| sym += str[idx++]; |
| else |
| issue_diag (E_SYMEND, true, &t, |
| "end of symbolic name not found\n"); |
| } |
| |
| // this is safe because the while loop ended with *str == '>' |
| sym += str[idx++]; |
| |
| // lookup the symbol we just constructed |
| w_cmap_iter w_pos = charmap_.get_w_cmap().find (sym); |
| if (w_pos != charmap_.get_w_cmap().end()) { |
| ret += w_pos->second; |
| } |
| else { |
| // if we can't find a symbol then return an empty string, |
| // most likely this will happen if inside a collating-element |
| // the user uses a character that is not in the current |
| // codeset, in this case the collating element will be ignored |
| ret.clear(); |
| return ret; |
| } |
| } |
| // the definition file contains a string with non-symbol names. |
| // process each character as it's actual character value. |
| // Locale definitions that use this may not be portable. |
| else |
| ret += (wchar_t)str[idx++]; |
| } |
| |
| return ret; |
| |
| } |
| |
| #endif // _RWSTD_NO_WCHAR_T |
| |
| |
| // automatically fill any categories that depend on other categories |
| void Def::auto_fill () |
| { |
| |
| mask_iter mask_pos; |
| |
| for (std::size_t i = 0; i <= UCHAR_MAX; i++) { |
| if ( ctype_out_.mask_tab[i] & std::ctype_base::upper |
| || ctype_out_.mask_tab[i] & std::ctype_base::lower |
| || ctype_out_.mask_tab[i] & std::ctype_base::alpha |
| || ctype_out_.mask_tab[i] & std::ctype_base::digit |
| || ctype_out_.mask_tab[i] & std::ctype_base::xdigit |
| || ctype_out_.mask_tab[i] & std::ctype_base::punct) |
| |
| ctype_out_.mask_tab[i] |= std::ctype_base::print; |
| |
| if ( ctype_out_.mask_tab[i] & std::ctype_base::upper |
| || ctype_out_.mask_tab[i] & std::ctype_base::lower) |
| |
| ctype_out_.mask_tab[i] |= std::ctype_base::alpha; |
| |
| if ( ctype_out_.mask_tab[i] & std::ctype_base::upper |
| || ctype_out_.mask_tab[i] & std::ctype_base::lower |
| || ctype_out_.mask_tab[i] & std::ctype_base::alpha |
| || ctype_out_.mask_tab[i] & std::ctype_base::digit |
| || ctype_out_.mask_tab[i] & std::ctype_base::xdigit |
| || ctype_out_.mask_tab[i] & std::ctype_base::punct) |
| |
| ctype_out_.mask_tab[i] |= std::ctype_base::graph; |
| } |
| |
| for (mask_pos = mask_.begin(); mask_pos != mask_.end(); mask_pos++) { |
| // all lower, alpha, digit, xdigit, and punct, and space |
| // characters are automatically print |
| |
| if ( mask_pos->second & std::ctype_base::upper |
| || mask_pos->second & std::ctype_base::lower |
| || mask_pos->second & std::ctype_base::alpha |
| || mask_pos->second & std::ctype_base::digit |
| || mask_pos->second & std::ctype_base::xdigit |
| || mask_pos->second & std::ctype_base::punct) |
| // || mask_pos->second & std::ctype_base::space) |
| |
| mask_pos->second |= std::ctype_base::print; |
| |
| // all upper and lower characters are alpha |
| if ( mask_pos->second & std::ctype_base::upper |
| || mask_pos->second & std::ctype_base::lower) |
| |
| mask_pos->second |= std::ctype_base::alpha; |
| |
| // all upper, lower, alpha, digit, xdigit, and punct characters |
| // are graph characters |
| if ( mask_pos->second & std::ctype_base::upper |
| || mask_pos->second & std::ctype_base::lower |
| || mask_pos->second & std::ctype_base::alpha |
| || mask_pos->second & std::ctype_base::digit |
| || mask_pos->second & std::ctype_base::xdigit |
| || mask_pos->second & std::ctype_base::punct) |
| |
| mask_pos->second |= std::ctype_base::graph; |
| |
| |
| } |
| } |
| |
| |
| void Def::process_input () |
| { |
| while ((next = scanner_.next_token ()).token != Scanner::tok_end_tokens) { |
| |
| switch (next.token) { |
| |
| case Scanner::tok_comment: |
| scanner_.ignore_line (); |
| break; |
| |
| case Scanner::tok_ctype: |
| process_ctype (); |
| break; |
| |
| case Scanner::tok_collate: |
| process_collate (); |
| break; |
| |
| case Scanner::tok_monetary: |
| process_monetary (); |
| break; |
| |
| case Scanner::tok_numeric: |
| process_numeric (); |
| break; |
| |
| case Scanner::tok_time: |
| process_time (); |
| break; |
| |
| case Scanner::tok_messages: |
| process_messages (); |
| break; |
| |
| case Scanner::tok_nl: |
| break; |
| |
| default: |
| scanner_.ignore_line (); |
| break; |
| } |
| } |
| |
| auto_fill (); |
| } |
| |
| |
| Def::Def (const char* filename, const char* out_name, Charmap& char_map, |
| bool no_position) |
| : warnings_occurred_ (false), |
| scan_ahead_ (false), |
| next_offset_ (0), |
| output_name_ (out_name), |
| charmap_ (char_map), |
| ctype_written_ (false), |
| codecvt_written_ (false), |
| collate_written_ (false), |
| time_written_ (false), |
| num_written_ (false), |
| mon_written_ (false), |
| messages_written_ (false), |
| ctype_def_found_ (false), |
| collate_def_found_ (false), |
| time_def_found_ (false), |
| num_def_found_ (false), |
| mon_def_found_ (false), |
| messages_def_found_ (false), |
| undefined_keyword_found_ (false), |
| no_position_ (no_position) |
| |
| { |
| // make sure ctype_out object is cleared |
| std::memset (&ctype_out_, 0, sizeof (ctype_out_)); |
| std::memset (&time_out_, 0, sizeof (time_out_)); |
| |
| // invalidate format characters by setting each to CHAR_MAX |
| // as specified by the C function localeconv() |
| mon_out_.frac_digits [0] = CHAR_MAX; |
| mon_out_.frac_digits [1] = CHAR_MAX; |
| mon_out_.p_cs_precedes [0] = CHAR_MAX; |
| mon_out_.p_sep_by_space [0] = CHAR_MAX; |
| mon_out_.n_cs_precedes [0] = CHAR_MAX; |
| mon_out_.n_sep_by_space [0] = CHAR_MAX; |
| mon_out_.p_sign_posn [0] = CHAR_MAX; |
| mon_out_.n_sign_posn [0] = CHAR_MAX; |
| |
| mon_st_.mon_grouping += CHAR_MAX; |
| |
| // invalidate int'l formats |
| mon_out_.p_cs_precedes [1] = CHAR_MAX; |
| mon_out_.p_sep_by_space [1] = CHAR_MAX; |
| mon_out_.n_cs_precedes [1] = CHAR_MAX; |
| mon_out_.n_sep_by_space [1] = CHAR_MAX; |
| mon_out_.p_sign_posn [1] = CHAR_MAX; |
| mon_out_.n_sign_posn [1] = CHAR_MAX; |
| |
| num_st_.grouping += CHAR_MAX; |
| |
| collate_out_.largest_ce = 1; |
| collate_out_.longest_weight = 1; |
| collate_out_.num_wchars = 0; |
| std::memset (collate_out_.weight_type, 0, |
| sizeof (collate_out_.weight_type)); |
| |
| // initialize all extensions to 0 |
| ctype_out_.ctype_ext_off = 0; |
| num_out_.numeric_ext_off = 0; |
| collate_out_.collate_ext_off = 0; |
| mon_out_.monetary_ext_off = 0; |
| time_out_.time_ext_off = 0; |
| |
| // actual processing |
| scanner_.open (filename); |
| } |
| |
| |
| Def::~Def () |
| { |
| // free up the memory that was allocated |
| |
| coll_map_iter coll_map_pos; |
| for (coll_map_pos = coll_map_.begin(); |
| coll_map_pos != coll_map_.end(); coll_map_pos ++) { |
| delete[] (coll_map_pos->second.weights); |
| } |
| |
| } |