blob: b81854b43b36444c730bfb6eac641bbdd7898300 [file] [log] [blame]
/***************************************************************************
*
* collate.cpp
*
* $Id$
*
***************************************************************************
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright
* ownership. The ASF licenses this file to you under the Apache
* License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* Copyright 2001-2006 Rogue Wave Software.
*
**************************************************************************/
#include "def.h" // for Def
#include "diagnostic.h" // for issue_diag()
#include "path.h" // for get_pathname()
#include "scanner.h" // for scanner
#include <fstream> // for ofstream
#include <iostream> // for cout
#include <string> // for string
#include <vector> // for vector
#include <cassert> // for assert()
#include <cctype> // for toupper()
#include <cstdio> // for sprintf()
#include <cstdlib> // for strtoul()
#include <cstring> // for strchr()
// Fill wchar_utf8_to_ext_ from the charmap: for every entry of
// charmap_.get_mb_cmap() the string returned by utf8_encode() for the
// entry's wide value becomes the key and the entry's own key becomes
// the mapped value.  The table is built only once; a non-empty table
// is left alone.
void Def::
create_wchar_utf8_table ()
{
    if (wchar_utf8_to_ext_.empty ()) {

        typedef std::map<std::string, wchar_t>::const_iterator mb_citer;

        const mb_citer last = charmap_.get_mb_cmap ().end ();
        mb_citer       cur  = charmap_.get_mb_cmap ().begin ();

        while (cur != last) {
            const std::string utf8_key = utf8_encode (cur->second);

            // insert() keeps the first mapping if the key already exists
            wchar_utf8_to_ext_.insert (std::make_pair (utf8_key, cur->first));

            ++cur;
        }
    }
}
// Build the array of weights for the collation entry `entry' and attach
// it to the record of the symbol that starts the line.
//
//   entry.first   token naming the symbol being defined
//   entry.second  list of the weight tokens given for that symbol
//
// At most collate_out_.num_weights tokens are converted with
// get_weight().  Every level for which no token was given receives a
// single weight equal to the collation value of the symbol itself.
//
// The symbol is looked up first in the wide character map and, failing
// that, among the collating elements.  When it is in neither a W_COLSYM
// warning is issued (once per defaulted level and once more at the end,
// as before) and the array is released rather than leaked.
void Def::process_weights (collate_entry_t& entry)
{
    Weights_t* const weights = new Weights_t[collate_out_.num_weights];

    // convert the weights that were explicitly given
    std::size_t i = 0;
    token_list_t::iterator w_it = entry.second.begin ();
    for (; i < collate_out_.num_weights && w_it != entry.second.end ();
         ++i, ++w_it) {
        get_weight (*w_it, weights, i);
    }

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    // locate the record of the symbol once instead of once per level
    coll_map_iter coll_map_it = coll_map_.end ();
    ce_map_iter   ce_map_it   = ce_map_.end ();

    const w_cmap_iter wcmap_it = w_cmap.find (entry.first.name);
    const bool is_char = wcmap_it != w_cmap.end ();

    if (is_char)
        coll_map_it = coll_map_.find (wcmap_it->second);
    else
        ce_map_it = ce_map_.find (entry.first.name);

    // never dereference an end iterator, even if the character
    // should turn out to be missing from the collation map
    const bool have_char = is_char && coll_map_it != coll_map_.end ();
    const bool have_ce   = !is_char && ce_map_it != ce_map_.end ();

    // the remaining weights - see above - are given as weight the
    // collating value of the symbol at start of line
    for (; i < collate_out_.num_weights; ++i) {

        weights[i].size = 1;

        if (have_char) {
            weights[i].weight[0] = coll_map_it->second.coll_val;
        }
        else if (have_ce) {
            weights[i].weight[0] = ce_map_it->second.coll_val;
        }
        else if (!is_char) {
            warnings_occurred_ =
                issue_diag (W_COLSYM, false, &entry.first,
                            "symbolic name %s not defined as character or "
                            "collating-element, ignoring\n",
                            entry.first.name.c_str()) || warnings_occurred_;
        }
    }

    if (have_char) {
        coll_map_it->second.weights = weights;
    }
    else if (have_ce) {
        ce_map_it->second.weights = weights;
    }
    else {
        if (!is_char) {
            warnings_occurred_ =
                issue_diag (W_COLSYM, false, &entry.first,
                            "symbolic name %s not defined as character or "
                            "collating-element, ignoring\n",
                            entry.first.name.c_str()) || warnings_occurred_;
        }

        // nobody took ownership of the array
        delete[] weights;
    }
}
// Translate the ordering directives of `section' (section.order) into
// collate_out_.weight_type[] and collate_out_.num_weights, and return
// the weight types packed two bits per level; the first level ends up
// in the most significant of the occupied bit pairs.
//
// Values stored per level:
//   0  forward
//   1  backward
//   2  position, or forward combined with position (0 is stored
//      instead when no_position_ is set)
//   3  backward combined with position
//
// A section without any directive gets a single forward level.  A
// two-part directive is accepted with its parts in either order, so
// "position,forward" is treated like "forward,position".  Everything
// else, including "<anything else>,position" which used to be taken
// for backward,position, is reported as E_COLORD.
unsigned int Def::process_order_stmt (collate_section_t& section)
{
    collate_out_.num_weights = 0;

    if (section.order.empty ()) {
        collate_out_.num_weights = 1;
        collate_out_.weight_type[0] = 0;
    }
    else {
        token_list_t::iterator ord_it = section.order.begin ();

        for (; ord_it != section.order.end ();
             ++ord_it, ++collate_out_.num_weights) {

            // value used whenever "position" applies to a forward level
            const int fwd_position = no_position_ ? 0 : 2;

            if (ord_it->token == Scanner::tok_forward) {
                collate_out_.weight_type[collate_out_.num_weights] = 0;
            }
            else if (ord_it->token == Scanner::tok_backward) {
                collate_out_.weight_type[collate_out_.num_weights] = 1;
            }
            else if (ord_it->token == Scanner::tok_position) {
                // this is equivalent to "forward,position"
                collate_out_.weight_type[collate_out_.num_weights] =
                    fwd_position;
            }
            else {
                const std::string::size_type comma_pos =
                    ord_it->name.find (',');

                if (comma_pos == std::string::npos) {
                    issue_diag (E_COLORD, true,
                                &*ord_it, "unrecognized collating order\n");
                }
                else {
                    const std::string first =
                        ord_it->name.substr (0, comma_pos);
                    const std::string second =
                        ord_it->name.substr (comma_pos + 1);

                    // exactly one part must be "position"; the other
                    // one gives the direction
                    const bool first_is_pos  = first == "position";
                    const bool second_is_pos = second == "position";
                    const std::string& direction =
                        first_is_pos ? second : first;

                    if (   (first_is_pos || second_is_pos)
                        && direction == "forward") {
                        collate_out_.weight_type[collate_out_.num_weights] =
                            fwd_position;
                    }
                    else if (   (first_is_pos || second_is_pos)
                             && direction == "backward") {
                        collate_out_.weight_type[collate_out_.num_weights] =
                            3;
                    }
                    else {
                        issue_diag (E_COLORD, true,
                                    &*ord_it,
                                    "unrecognized collating order\n");
                        break;
                    }
                }
            }
        }
    }

    // build the order value;
    unsigned int order = 0;
    for (unsigned int i = 0; i < collate_out_.num_weights; ++i) {
        order <<= 2;
        order |= collate_out_.weight_type[i];
    }

    return order;
}
// decimally increment the symbolic name, turning something like
// <U1234> into <U1235>, or <jis234> into <jis235>
//
// The first run of decimal digits in `sym' is parsed, incremented by
// one and written back zero-padded to at least its original width, so
// that <j0109> becomes <j0110> rather than <j110>.  The text before and
// after the digits is copied unchanged.  A name that contains no digit
// at all is returned as is.
static std::string
dec_increment (const std::string &sym)
{
    const char* const beg = sym.c_str ();

    // look for the first digit but stop at the terminating NUL; the
    // cast keeps isdigit() well-defined for negative char values
    const char *pdig = beg;
    while (*pdig && !std::isdigit (static_cast<unsigned char>(*pdig)))
        ++pdig;

    if ('\0' == *pdig)
        return sym;

    char *end = 0;
    const unsigned long sym_val = std::strtoul (pdig, &end, 10);

    // number of characters the original number occupied
    const std::string::size_type width = end - pdig;

    char numstr [64];
    const int len = std::sprintf (numstr, "%lu", sym_val + 1);

    std::string next_sym (sym, 0, pdig - beg);

    // restore the leading zeros the conversion has dropped
    if (0 < len && std::string::size_type (len) < width)
        next_sym.append (width - std::string::size_type (len), '0');

    next_sym.append (numstr);
    next_sym.append (end);

    return next_sym;
}
// hexadecimally increment the symbolic name, turning something like
// <U00AB> into <U00AC>
//
// The first run of hexadecimal digits in `sym' is parsed, incremented
// by one and written back zero-padded to at least its original width.
// Upper case letters are produced when the original number contained
// an upper case letter, lower case ones otherwise.  The text before and
// after the number is copied unchanged.  A name that contains no
// hexadecimal digit at all is returned as is.
static std::string
hex_increment (const std::string& sym)
{
    const char* const beg = sym.c_str ();

    // look for the first hex digit but stop at the terminating NUL; the
    // cast keeps isxdigit() well-defined for negative char values
    const char *pdig = beg;
    while (*pdig && !std::isxdigit (static_cast<unsigned char>(*pdig)))
        ++pdig;

    if ('\0' == *pdig)
        return sym;

    char *end = 0;
    const unsigned long sym_val = std::strtoul (pdig, &end, 16);

    // number of characters the original number occupied
    const std::string::size_type width = end - pdig;

    // keep the letter case the name was written in
    bool upper = false;
    for (const char *p = pdig; p != end; ++p) {
        if (std::isupper (static_cast<unsigned char>(*p)))
            upper = true;
    }

    char numstr [64];
    const int len = upper ? std::sprintf (numstr, "%lX", sym_val + 1)
                          : std::sprintf (numstr, "%lx", sym_val + 1);

    std::string next_sym (sym, 0, pdig - beg);

    // restore the leading zeros the conversion has dropped
    if (0 < len && std::string::size_type (len) < width)
        next_sym.append (width - std::string::size_type (len), '0');

    next_sym.append (numstr);
    next_sym.append (end);

    return next_sym;
}
// scanning ahead and adding all symbols to the collating symbols map
void Def::preprocess_collation_definitions()
{
// start with collation values from 2
unsigned int coll_value = 2;
// previous_elm and next_elm are used for processing ellipsis.
std::string prev_elm;
std::string next_elm;
static unsigned int max_orders = 0;
token_list_t::iterator cs_it = sym_list_.begin ();
for (; cs_it != sym_list_.end (); ++cs_it) {
cs_map_iter csm_it = cs_map_.find (cs_it->name);
if (csm_it != cs_map_.end ())
csm_it->second = coll_value++;
}
std::list<collate_section_t>::iterator s_it = section_list_.begin ();
if (s_it != section_list_.end()) {
// get maximum weights count
max_orders = s_it->order.size ();
}
for (; s_it != section_list_.end (); ++s_it) {
if (max_orders != s_it->order.size ())
issue_diag (E_COLNUM, true, &*s_it->order.begin (),
"number of collation orders "
"different for this section\n");
}
const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();
// preprocess sections' collation definitions
for (s_it = section_list_.begin ();
s_it != section_list_.end (); ++s_it) {
// process the order statement and get the weight
unsigned int order = process_order_stmt (*s_it);
std::list<collate_entry_t>::iterator e_it =
s_it->entries.begin ();
for (; e_it != s_it->entries.end (); ++e_it) {
switch (e_it->first.token) {
case Scanner::tok_sym_name:
// store the previous name
prev_elm = e_it->first.name;
// process each of the collation definitions
process_collation_definition (false, *e_it,
coll_value++, order);
break;
case Scanner::tok_abs_ellipsis: // "<FOO>...<BAR>"
case Scanner::tok_hex_ellipsis: // "<FOO>..<BAR>"
case Scanner::tok_dec_ellipsis: // "<FOO>....<BAR>"
{
if (++e_it != s_it->entries.end ())
next_elm = e_it->first.name;
else {
issue_diag (E_SYNTAX, true, &e_it->first,
"unexpected end of collation section while"
" processing ellipsis during scan_ahead\n");
}
if (e_it->first.token == Scanner::tok_abs_ellipsis) {
wchar_t w_cmap_cur_val
= w_cmap.find(prev_elm)->second;
wchar_t w_cmap_end_val
= w_cmap.find(next_elm)->second;
// the first value has already been
// added so don't add it again
w_cmap_cur_val = charmap_.increment_wchar (w_cmap_cur_val);
while (w_cmap_cur_val != w_cmap_end_val) {
// process iteration
collate_info_t ci = {UINT_MAX, UINT_MAX, 0, 0};
ci.coll_val = coll_value++;
ci.order = order;
coll_map_.insert (std::make_pair (w_cmap_cur_val, ci));
w_cmap_cur_val =
charmap_.increment_wchar (w_cmap_cur_val);
}
// add last element "next_elm" to array
collate_info_t ci = {UINT_MAX, UINT_MAX, 0, 0};
ci.coll_val = coll_value++;
ci.order = order;
coll_map_.insert (std::make_pair (w_cmap_cur_val, ci));
}
else {
// we are incrementing the symbolic names
std::string next_name = prev_elm;
do {
if (e_it->first.token == Scanner::tok_hex_ellipsis)
next_name = hex_increment (next_name);
else
next_name = dec_increment (next_name);
w_cmap_iter it = w_cmap.find (next_name);
if (it != w_cmap.end()) {
// process iteration
collate_info_t ci = {UINT_MAX, UINT_MAX, 0, 0};
ci.coll_val = coll_value++;
ci.order = order;
coll_map_.insert (
std::make_pair (it->second, ci));
}
} while (next_name != next_elm);
}
prev_elm = next_elm;
break;
}
case Scanner::tok_dbl_ellipsis:
issue_diag (W_NOTSUP, true, &e_it->first,
"ellipsis not supported"
" processing ellipsis during scan_ahead\n");
default:
break;
}
}
}
}
void Def::process_collation_definition ( bool do_weights,
collate_entry_t& entry,
unsigned int coll_value,
unsigned int order)
{
const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();
// iterators
w_cmap_iter w_cmap_pos =
w_cmap.find(entry.first.name);
ce_map_iter ce_map_pos =
ce_map_.find(entry.first.name);
// look up the symbolic name in the wide character map
if (w_cmap_pos != w_cmap.end()) {
wchar_t wval = w_cmap_pos->second;
coll_map_iter coll_map_pos = coll_map_.find (wval);
coll_map_pos->second.coll_val = coll_value;
coll_map_pos->second.order = order;
// process the weights
if (do_weights) {
unsigned int offset = next_offset_++;
coll_map_pos->second.offset = offset;
off_mapr_.insert(std::make_pair(offset, entry.first.name));
process_weights (entry);
}
}
else if (ce_map_pos != ce_map_.end()) {
ce_map_pos->second.coll_val = coll_value;
ce_map_pos->second.order = order;
// process the weights
if (do_weights) {
unsigned int offset = next_offset_++;
ce_map_pos->second.offset = offset;
off_mapr_.insert(std::make_pair(offset, entry.first.name));
process_weights (entry);
}
}
else if (cs_map_.find(entry.first.name) != cs_map_.end()) {
cs_map_.find(entry.first.name)->second = coll_value;
}
else {
// the symbol is missing from all three maps;
// add it in cs map as "sym <-> collating_value" pair
cs_map_.insert (std::make_pair(entry.first.name, coll_value));
warnings_occurred_ =
issue_diag (W_COLSYM, false, &entry.first,
"unknown symbol name %s found in "
"LC_COLLATE definition\n",
entry.first.name.c_str()) || warnings_occurred_;
}
}
// Process all entries of one collation section.
//
//   section     the section whose order statement and entries are
//               processed
//   coll_value  next collation value to hand out; advanced once for
//               every element that is given a value
//
// Symbolic names are passed to process_collation_definition() with
// weights enabled.  For an ellipsis the weights given on the ellipsis
// line serve as a template for every generated character, which is
// added through add_to_coll().  The UNDEFINED keyword records the
// template for all characters without a definition via
// add_missing_values().
//
// An ellipsis that is the last entry of the section is reported as
// E_SYNTAX against the ellipsis token itself and ends the processing of
// the section.
void Def::process_order(collate_section_t& section,
                        unsigned int& coll_value)
{
    // get the orders
    unsigned int section_order = process_order_stmt (section);

    // previous_elm and next_elm are used for processing ellipsis.
    std::string prev_elm;
    std::string next_elm;

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    // iterate thru section's entries
    std::list<collate_entry_t>::iterator e_it = section.entries.begin ();
    for (; e_it != section.entries.end (); ++e_it) {

        if (e_it->first.token == Scanner::tok_sym_name) {
            prev_elm = e_it->first.name;
            process_collation_definition (true, *e_it,
                                          coll_value++, section_order);
        }
        else if (   e_it->first.token == Scanner::tok_abs_ellipsis
                 || e_it->first.token == Scanner::tok_dec_ellipsis
                 || e_it->first.token == Scanner::tok_hex_ellipsis) {

            // process line with ellipsis
            Weights_t* weights_template =
                new Weights_t[collate_out_.num_weights];

            // one flag per level as returned by get_weight(); levels
            // without a token are flagged true
            std::vector<bool> ordinal_weights;

            token_list_t::iterator w_it = e_it->second.begin ();
            for (int i = 0; i < collate_out_.num_weights
                     && w_it != e_it->second.end (); ++i, ++w_it) {
                ordinal_weights.push_back (
                    get_weight (*w_it, weights_template, i));
            }

            while (ordinal_weights.size() < collate_out_.num_weights)
                ordinal_weights.push_back (true);

            // remember the ellipsis entry, e_it is about to move on
            const std::list<collate_entry_t>::iterator ellipsis_it = e_it;

            // next line
            if (++e_it == section.entries.end ()) {
                // release the template first in case the diagnostic
                // does not return
                delete[] weights_template;

                issue_diag (E_SYNTAX, true, &ellipsis_it->first,
                            "unexpected end of collation section while"
                            " processing ellipsis\n");

                // step back so that the increment of the loop reaches
                // end() and terminates it cleanly
                e_it = ellipsis_it;
                continue;
            }

            next_elm = e_it->first.name;

            // NOTE(review): e_it now designates the entry FOLLOWING the
            // ellipsis, so the token tests below look at that entry
            // rather than at the ellipsis.
            // preprocess_collation_definitions() performs the same
            // tests; presumably both passes have to generate the same
            // number of elements, so confirm and change the two
            // together.
            if (e_it->first.token == Scanner::tok_abs_ellipsis) {

                wchar_t w_cmap_cur_val
                    = w_cmap.find(prev_elm)->second;
                wchar_t w_cmap_end_val
                    = w_cmap.find(next_elm)->second;

                // the first value has already been added so don't add
                // it again
                w_cmap_cur_val = charmap_.increment_wchar (w_cmap_cur_val);

                while (w_cmap_cur_val != w_cmap_end_val) {
                    add_to_coll (w_cmap_cur_val, weights_template,
                                 coll_value++, ordinal_weights, false);

                    w_cmap_cur_val =
                        charmap_.increment_wchar (w_cmap_cur_val);
                }

                // add the end element to the collation array.
                add_to_coll (w_cmap_end_val, weights_template, coll_value++,
                             ordinal_weights, false);
            }
            else {
                // we are incrementing the symbolic names
                std::string next_name = prev_elm;

                do {
                    if (e_it->first.token == Scanner::tok_hex_ellipsis)
                        next_name = hex_increment (next_name);
                    else
                        next_name = dec_increment (next_name);

                    // names missing from the charmap are skipped
                    w_cmap_iter it = w_cmap.find (next_name);
                    if (it != w_cmap.end()) {
                        add_to_coll (it->second, weights_template,
                                     coll_value++, ordinal_weights, false);
                    }
                } while (next_name != next_elm);
            }

            // the right-hand symbol is the left-hand one of an ellipsis
            // that may follow, exactly as in the scan-ahead pass
            prev_elm = next_elm;

            delete[] weights_template;
        }
        else if (e_it->first.token == Scanner::tok_undefined) {

            // add all characters that were not explicitly given a
            // collation value in increasing order based on their
            // encoded values
            undefined_keyword_found_ = true;
            collate_out_.undefined_optimization = true;
            undef_char_info_.order = section_order;

            Weights_t* weights_template
                = new Weights_t[collate_out_.num_weights];

            std::vector<bool> ordinal_weights;

            token_list_t::iterator w_it = e_it->second.begin ();
            for (int i = 0; i < collate_out_.num_weights
                     && w_it != e_it->second.end(); ++i, ++w_it) {

                // an ellipsis among the weights turns the
                // optimization off
                if (w_it->token == Scanner::tok_abs_ellipsis)
                    collate_out_.undefined_optimization = false;

                ordinal_weights.push_back (
                    !get_weight (*w_it, weights_template, i));
            }

            while (ordinal_weights.size() < collate_out_.num_weights)
                ordinal_weights.push_back (false);

            add_missing_values (ordinal_weights, weights_template,
                                coll_value, false);

            delete[] weights_template;
        }
    }
}
void Def::add_missing_values (const std::vector<bool> &ordinal_weights,
const Weights_t* weights_template,
unsigned int &coll_value, bool give_warning)
{
const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();
// we want to print a warning message once if there are characters that
// were not given collation values.
bool warning_issued = false;
symnames_list_iter symnames_it;
for (symnames_it = charmap_.get_symnames_list ().begin ();
symnames_it != charmap_.get_symnames_list ().end ();
++symnames_it) {
std::map<std::string, wchar_t>::const_iterator w_cmap_it;
if ((w_cmap_it = w_cmap.find(*symnames_it)) != w_cmap.end()) {
wchar_t wchar_val = (*w_cmap_it).second;
coll_map_iter coll_map_it;
if ((coll_map_it = coll_map_.find(wchar_val)) != coll_map_.end()) {
if (coll_map_it->second.offset == UINT_MAX) {
if (give_warning && !warning_issued) {
warning_issued = true;
warnings_occurred_ =
issue_diag (W_MISSING, false,
0, "some characters in the codeset "
"were not explicitly given a "
"collation value\n")
|| warnings_occurred_;
}
if (!collate_out_.undefined_optimization) {
add_to_coll (wchar_val, weights_template,
coll_value++, ordinal_weights, true);
}
}
}
}
}
if (collate_out_.undefined_optimization){
collate_out_.undefined_weight_idx = next_offset_++;
undef_char_info_.offset = collate_out_.undefined_weight_idx;
undef_char_info_.coll_val = coll_value++;
// initialize the weight template with the undefined collation info
Weights_t* weights = new Weights_t[collate_out_.num_weights];
if (0 != weights_template) {
for (int k = 0; k < collate_out_.num_weights; ++k) {
weights[k].size = weights_template[k].size;
for (int j = 0; j < 256; ++j)
weights[k].weight[j] = weights_template[k].weight[j];
}
}
std::vector<bool>::const_iterator it;
// now go through the vector that tells us if a particular
// weight is based on the ordinal position of the element
// and place the correct ordinal value in the weight vector
// if it is.
int i = 0;
for (it = ordinal_weights.begin();
it != ordinal_weights.end(); ++it, ++i)
{
// FIXME: this may not be correct
if (!*it){
weights[i].size = 1;
weights[i].weight[0] = undef_char_info_.coll_val;
}
}
undef_char_info_.weights = weights;
}
}
void Def::add_to_coll (const wchar_t w_cmap_cur_val,
const Weights_t* weights_template,
const unsigned int coll_value,
const std::vector<bool>& ordinal_weights,
bool undefined_value)
{
Weights_t* weights = new Weights_t[collate_out_.num_weights];
if (0 != weights_template) {
for (int k = 0; k < collate_out_.num_weights; ++k) {
weights[k].size = weights_template[k].size;
for (int j = 0; j < 256; ++j)
weights[k].weight[j] = weights_template[k].weight[j];
}
}
// get the symbolic name from the value
std::string name
= charmap_.get_rw_cmap().find (w_cmap_cur_val)->second;
coll_map_.find (w_cmap_cur_val)->second.coll_val = coll_value;
unsigned int offset;
if (undefined_value && collate_out_.undefined_optimization)
offset = next_offset_;
else
offset = next_offset_++;
const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();
w_cmap_iter w_cmap_pos = w_cmap.find(name);
coll_map_iter coll_it = coll_map_.find (w_cmap_pos->second);
coll_it->second.offset = offset;
off_mapr_.insert(std::make_pair(offset, name));
std::vector<bool>::const_iterator it;
// now go through the vector that tells us if a particular
// weight is based on the ordinal position of the element
// and place the correct ordinal value in the weight vector
// if it is.
int i = 0;
for (it = ordinal_weights.begin();
it != ordinal_weights.end(); ++it, ++i)
{
if (*it){
weights[i].size = 1;
weights[i].weight[0] = coll_value;
}
}
coll_it->second.weights = weights;
}
// if undefined optimization is on then only those characters that have
// been defined should go into the valid set, otherwise all characters go
// into the set.
void Def::gen_valid_coll_mb_set ()
{
if (!valid_coll_mb_set_.empty())
return;
for (coll_map_iter coll_it = coll_map_.begin();
coll_it != coll_map_.end(); ++coll_it) {
if (coll_it->second.offset != UINT_MAX
|| !collate_out_.undefined_optimization){
rmb_cmap_iter rn_cmap_it
= charmap_.get_rmb_cmap().find (coll_it->first);
std::string valid = rn_cmap_it->second.substr
(0, rn_cmap_it->second.size() - 1);
while (valid.size() > 0){
valid_coll_mb_set_.insert (valid);
valid = valid.substr(0, valid.size() - 1);
}
}
}
}
// In processing the collate section of the locale definition file
// we will filter out the characters that are not present in the
// charmap file; these characters when encountered on the left-hand side
// of a weight definition for a symbolic name will be left in place
// and used as a mere collating symbol for the rest of the parsing -
// i.e. its weights ignored and assigned the weight corresponding to
// its position in the collate section.
//
// Steps performed, in this order:
//   1. the collating symbols of cs_list_ are checked against the
//      charmap and the collating elements and entered into cs_map_;
//   2. the collating elements of ce_list_ are converted to wide
//      strings and entered into ce_map_, ce_sym_map_ and ce_wsym_map_;
//   3. preprocess_collation_definitions() scans ahead;
//   4. collation values are assigned again, starting at 2, and each
//      section is processed with process_order();
//   5. unless the UNDEFINED keyword was seen, the characters left
//      without a definition are handled by add_missing_values().
void Def::process_collate()
{
    issue_diag (I_STAGE, false, 0, "processing LC_COLLATE section\n");

    // update flags
    collate_def_found_ = true;

    // initialization
    init_coll_map ();

    // first preprocess the collate section
    preprocess_collate ();

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    // process the collating symbols list
    token_list_t::iterator cs_it = cs_list_.begin ();
    for (; cs_it != cs_list_.end (); ++cs_it) {

        if (w_cmap.find (cs_it->name) != w_cmap.end())
            issue_diag (E_SYNTAX, true, &*cs_it,
                        "collating-symbol %s found in character map\n",
                        cs_it->name.c_str ());
        else if (ce_map_.find (cs_it->name) != ce_map_.end())
            issue_diag (E_SYNTAX, true, &*cs_it,
                        "collating-symbol %s has already been defined as "
                        "a collating-element\n", cs_it->name.c_str ());

        cs_map_.insert (std::make_pair (cs_it->name, 0));
    }

    // process the collating elements list
    std::list<collate_elem_t>::iterator ce_it = ce_list_.begin ();
    for (; ce_it != ce_list_.end (); ++ce_it) {

        if (w_cmap.find (ce_it->first.name) != w_cmap.end()) {
            issue_diag (E_SYNTAX, true, &ce_it->first,
                        "collating element %s found in charmap\n",
                        ce_it->first.name.c_str());
        }

        // collating element info to be stored
        ce_info_t ce_tmp;
        ce_tmp.offset   = UINT_MAX;
        ce_tmp.coll_val = UINT_MAX;
        ce_tmp.order    = 0;
        ce_tmp.weights  = 0;

        // array of symbolic names
        std::vector<std::string> ce_sym_array;

        // get the definition of this collating element
        token_list_t& ce_def_list = ce_it->second;

        // an element without any definition token has nothing that
        // could be examined below
        if (ce_def_list.empty ()) {
            issue_diag (E_SYNTAX, true, &ce_it->first,
                        "illegal collating-element expression: %s\n",
                        ce_it->first.name.c_str ());
            continue;
        }

        token_list_t::iterator ce_def_it = ce_def_list.begin ();

        bool invalid = false;

        if (ce_def_it->token == Scanner::tok_string) {
            if ((ce_tmp.ce_wstr = convert_wstring (*ce_def_it)).empty())
                invalid = true;
        }
        else if (ce_def_it->token == Scanner::tok_sym_name) {
            for (; ce_def_it != ce_def_list.end (); ++ce_def_it)
                ce_sym_array.push_back (ce_def_it->name);

            if ((ce_tmp.ce_wstr = convert_wstring (ce_sym_array)).empty())
                invalid = true;
        }
        else {
            issue_diag (E_SYNTAX, true, &*ce_def_it,
                        "illegal collating-element expression: %s\n",
                        ce_def_it->name.c_str ());
        }

        if (invalid) {
            // ce_def_it may have been advanced to end() by the loop
            // above; report against the first token of the definition
            issue_diag (W_SYNTAX, false, &*ce_def_list.begin (),
                        "character in collating element definition "
                        "not found in character map\n");
        }
        else {
            ce_map_.insert (std::make_pair(ce_it->first.name, ce_tmp));

            // narrow and UTF-8 forms of the element's wide string
            std::string ce_str;
            std::string ce_utf8_wstr;

            for (unsigned int i = 0; i < ce_tmp.ce_wstr.size(); ++i){
                ce_str       += convert_to_ext(ce_tmp.ce_wstr[i]);
                ce_utf8_wstr += utf8_encode (ce_tmp.ce_wstr[i]);
            }

            ce_sym_map_.insert (std::make_pair(ce_str, ce_it->first.name));
            ce_wsym_map_.insert (std::make_pair (ce_utf8_wstr,
                                                 ce_it->first.name));
        }
    }

    // preprocess the collation definitions and make known all
    // the symbolic names available
    preprocess_collation_definitions();

    // collation values 0 and 1 are reserved
    unsigned int coll_value = 2;

    // FIXME: assign values to the collating-elements AGAIN
    cs_it = sym_list_.begin ();
    for (; cs_it != sym_list_.end (); ++cs_it) {
        cs_map_iter csm_it = cs_map_.find (cs_it->name);
        if (csm_it != cs_map_.end ())
            csm_it->second = coll_value++;
    }

    // process the sections
    std::list<collate_section_t>::iterator sect_it =
        section_list_.begin ();
    for (; sect_it != section_list_.end (); ++sect_it)
        process_order (*sect_it, coll_value);

    // check to make sure that all the elements in the codeset were added
    if (!undefined_keyword_found_) {
        std::vector<bool> ordinal_weights;
        for (int i = 0; i < collate_out_.num_weights; ++i)
            ordinal_weights.push_back (false);

        collate_out_.undefined_optimization = true;
        add_missing_values (ordinal_weights, 0, coll_value, true);
    }
}
void Def::gen_n_to_w_coll_tables (const std::string &charp,
unsigned int tab_num)
{
gen_valid_coll_mb_set();
offset_tab_t tab;
for (unsigned int k = 0; k <= UCHAR_MAX; ++k) {
tab.off[k] = UINT_MAX;
}
tab.first_offset = -1;
mb_cmap_iter n_cmap_it;
for (unsigned int i = 0; i <= UCHAR_MAX; ++i){
unsigned char cur_char = (unsigned char)i;
std::string mb_char = charp;
mb_char += (char)cur_char;
if ((n_cmap_it = charmap_.get_mb_cmap().find (mb_char))
!= charmap_.get_mb_cmap().end()) {
coll_map_iter coll_map_it = coll_map_.find (n_cmap_it->second);
if (coll_map_it->second.offset != UINT_MAX
|| !collate_out_.undefined_optimization) {
tab.off[cur_char] = coll_map_it->second.offset;
if (tab.first_offset == -1)
tab.first_offset = cur_char;
}
}
else {
if (valid_coll_mb_set_.find (mb_char) != valid_coll_mb_set_.end()){
++next_tab_num_;
tab.off[cur_char] = next_tab_num_ | 0x80000000;
gen_n_to_w_coll_tables (mb_char, next_tab_num_);
if (tab.first_offset == -1)
tab.first_offset = cur_char;
}
else {
tab.off[cur_char] = UINT_MAX;
}
}
}
char_offs_.insert (std::make_pair (tab_num, tab));
}
void Def::gen_w_to_n_coll_tables (const std::string &charp,
unsigned int tab_num)
{
// initialize a table used in ctype and collate
create_wchar_utf8_table();
gen_valid_coll_wchar_set();
offset_tab_t tab;
for (unsigned int k = 0; k <= UCHAR_MAX; ++k)
tab.off[k] = UINT_MAX;
tab.first_offset = -1;
wchar_utf8_iter wu_it;
mb_cmap_iter n_cmap_it;
for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {
unsigned char cur_char = (unsigned char)i;
std::string mb_char = (charp);
mb_char += (char)cur_char;
wu_it = wchar_utf8_to_ext_.find (mb_char);
if (wu_it != wchar_utf8_to_ext_.end()) {
n_cmap_it = charmap_.get_mb_cmap().find (wu_it->second);
if (n_cmap_it != charmap_.get_mb_cmap().end ()) {
coll_map_iter coll_map_it = coll_map_.find (n_cmap_it->second);
if (coll_map_it->second.offset != UINT_MAX ||
!collate_out_.undefined_optimization) {
tab.off[cur_char] = coll_map_it->second.offset;
if (tab.first_offset == -1)
tab.first_offset = cur_char;
}
}
}
else
if (valid_coll_wchar_set_.find (mb_char) !=
valid_coll_wchar_set_.end()) {
++next_wchar_coll_tab_num_;
tab.off[cur_char] = next_wchar_coll_tab_num_ | 0x80000000;
gen_w_to_n_coll_tables (mb_char,
next_wchar_coll_tab_num_);
if (tab.first_offset == -1)
tab.first_offset = cur_char;
}
else
tab.off[cur_char] = UINT_MAX;
}
w_to_n_coll_.insert (std::make_pair (tab_num, tab));
}
void Def::gen_n_ce_tables (const std::set<std::string>ce_elms,
unsigned int idx, unsigned int tab_num)
{
if (ce_elms.size() > 0) {
ce_offset_tab_t tab;
for (unsigned int k = 0; k <= UCHAR_MAX; ++k) {
tab.off[k] = UINT_MAX;
}
tab.first_offset = -1;
tab.last_offset = 0;
ce_sym_map_iter ce_sym_map_it;
ce_map_iter ce_map_it;
std::set<std::string>::const_iterator ce_elms_it;
std::set<std::string> next_elms;
for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {
next_elms.clear();
for (ce_elms_it = ce_elms.begin(); ce_elms_it != ce_elms.end();
ce_elms_it ++) {
if ((unsigned char)i == (unsigned char)((*ce_elms_it)[idx])) {
if ((*ce_elms_it).size() == idx + 1) {
if ((ce_sym_map_it = ce_sym_map_.find (*ce_elms_it))
!= ce_sym_map_.end()) {
ce_map_it = ce_map_.find(ce_sym_map_it->second);
tab.off[i] = ce_map_it->second.offset;
if (tab.first_offset == -1)
tab.first_offset = i;
if ((unsigned int)tab.last_offset < i)
tab.last_offset = i;
}
}
else {
next_elms.insert (*ce_elms_it);
if (tab.off[i] == UINT_MAX) {
++ next_n_ce_tab_num_;
tab.off[i] = next_n_ce_tab_num_ | 0x80000000;
}
if (tab.first_offset == -1)
tab.first_offset = i;
if ((unsigned int)tab.last_offset < i)
tab.last_offset = i;
}
}
}
// now recursively call gen_n_ce_tables if any collating
// elements with this character were found;
if (next_elms.size() != 0)
gen_n_ce_tables (next_elms, idx + 1, next_n_ce_tab_num_);
}
n_ce_offs_.insert(std::make_pair(tab_num, tab));
}
}
void Def::gen_w_ce_tables (const std::set<std::string>ce_elms,
unsigned int idx, unsigned int tab_num)
{
if (ce_elms.size() > 0) {
ce_offset_tab_t tab;
for (unsigned int k = 0; k <= UCHAR_MAX; ++k) {
tab.off[k] = UINT_MAX;
}
tab.first_offset = -1;
tab.last_offset = 0;
ce_sym_map_iter ce_sym_map_it;
ce_map_iter ce_map_it;
std::set<std::string>::const_iterator ce_elms_it;
std::set<std::string> next_elms;
for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {
next_elms.clear();
for (ce_elms_it = ce_elms.begin(); ce_elms_it != ce_elms.end();
ce_elms_it ++) {
if ((unsigned char)i == (unsigned char)(*ce_elms_it)[idx]) {
if ((*ce_elms_it).size() == idx + 1) {
if ((ce_sym_map_it = ce_wsym_map_.find (*ce_elms_it))
!= ce_wsym_map_.end()) {
ce_map_it = ce_map_.find(ce_sym_map_it->second);
tab.off[i] = ce_map_it->second.offset;
if (tab.first_offset == -1)
tab.first_offset = i;
if ((unsigned int)tab.last_offset < i)
tab.last_offset = i;
}
}
else {
next_elms.insert (*ce_elms_it);
if (tab.off[i] == UINT_MAX) {
++ next_w_ce_tab_num_;
tab.off[i] = next_w_ce_tab_num_ | 0x80000000;
}
if (tab.first_offset == -1)
tab.first_offset = i;
if ((unsigned int)tab.last_offset < i)
tab.last_offset = i;
}
}
}
// now recursively call gen_w_ce_tables if any collating
// elements with this character were found;
if (next_elms.size() != 0)
gen_w_ce_tables (next_elms, idx + 1, next_w_ce_tab_num_);
}
w_ce_offs_.insert(std::make_pair(tab_num, tab));
}
}
void Def::dump_collate ()
{
std::cout << "LC_COLLATE\n";
const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();
token_list_t::iterator sl_it = sym_list_.begin ();
for (; sl_it != sym_list_.end (); ++sl_it) {
std::cout << sl_it->name << " % ";
cs_map_iter cs_it = cs_map_.find (sl_it->name);
if (cs_it != cs_map_.end ()) {
std::cout << cs_it->second << '\n';
continue;
}
ce_map_iter ce_it = ce_map_.find (sl_it->name);
if (ce_it != ce_map_.end ()) {
std::cout << ce_it->second.coll_val << '\n';
continue;
}
std::map<std::string, wchar_t >::const_iterator cw_it =
w_cmap.find (sl_it->name);
if (cw_it != w_cmap.end ()) {
coll_map_iter cm_it =
coll_map_.find (cw_it->second);
if (cm_it != coll_map_.end ()) {
std::cout << cm_it->second.coll_val << '\n';
continue;
}
}
std::cout << '\n';
}
std::cout << "\n\n";
std::list<collate_section_t>::iterator sect_it =
section_list_.begin ();
while (sect_it != section_list_.end ()) {
// dump the orders
std::cout << "order_start " << sect_it->name;
token_list_t::iterator o_it = sect_it->order.begin ();
for (; o_it != sect_it->order.end (); ++o_it)
std::cout << ";" << o_it->name;
std::cout << '\n';
// for each entry in the entries list
collate_entry_list_t::iterator e_it = sect_it->entries.begin ();
for (; e_it != sect_it->entries.end (); ++e_it) {
// dump the collation definition (weights included)
std::cout << e_it->first.name << " ";
if (!e_it->second.empty ()) {
token_list_t::iterator w_it = e_it->second.begin ();
std::cout << (w_it++)->name;
for (; w_it != e_it->second.end (); ++w_it)
std::cout << ";" << w_it->name;
}
// lookup the value associated with this collation symbol
cs_map_iter cs_it = cs_map_.find (e_it->first.name);
if (cs_it != cs_map_.end ()) {
std::cout << " % " << cs_it->second << '\n';
continue;
}
ce_map_iter ce_it = ce_map_.find (e_it->first.name);
if (ce_it != ce_map_.end ()) {
std::cout << " % " << ce_it->second.coll_val << '\n';
continue;
}
std::map<std::string, wchar_t >::const_iterator cw_it =
w_cmap.find (e_it->first.name);
if (cw_it != w_cmap.end ()) {
coll_map_iter cm_it =
coll_map_.find (cw_it->second);
if (cm_it != coll_map_.end ()) {
std::cout << " % " << cm_it->second.coll_val << '\n';
continue;
}
}
std::cout << '\n';
}
std::cout << "order_end\n";
++sect_it;
}
std::cout << "END LC_COLLATE\n";
}
// Writes one element of the weight table to `out`: the raw bytes of
// `order` followed, for each of the hdr.num_weights weights, by that
// weight's values padded with UINT_MAX up to hdr.longest_weight values.
// All arguments are template parameters so that the helper does not
// have to name any of the types declared for the Def class.
template <class HeaderT, class OrderT, class WeightsT>
static void
write_weight_elm (std::ostream &out, const HeaderT &hdr,
                  const OrderT &order, const WeightsT *weights)
{
    // filler value used to pad short weights
    const unsigned int pad = UINT_MAX;

    out.write ((const char*)&order, sizeof (order));

    for (int j = 0; j < hdr.num_weights; ++j) {
        for (int k = 0; k < weights[j].size; ++k)
            out.write ((const char*)&weights[j].weight[k],
                       sizeof (weights[j].weight[k]));

        for (int c = weights[j].size; c < hdr.longest_weight; ++c)
            out.write ((const char*)&pad, sizeof (pad));
    }
}

// Generates the collating element and character lookup tables and
// writes the binary LC_COLLATE database to the file
// dir_name + _RWSTD_PATH_SEP + "LC_COLLATE".
//
// Does nothing when the database has already been written, and only
// issues an informational diagnostic when no LC_COLLATE section was
// found. The stream is put in throwing mode (failbit | badbit), so I/O
// failures are reported as exceptions from the stream.
//
// The file consists of, in this order: the collate_out_ header, the
// narrow and wide collating element tables, the weight table, the
// narrow and wide character tables, the four tables of table offsets,
// the first/last character bytes of the tables, and finally the
// NUL-terminated codeset and charmap names.
void Def::write_collate (std::string dir_name)
{
    assert (!dir_name.empty());

    static const char lc_name[] = "LC_COLLATE";

    if (collate_written_)
        return;

    if (!collate_def_found_) {
        issue_diag (I_SKIP, false, 0,
                    "%s section not found, skipping\n", lc_name);
        return;
    }

    next_tab_num_            = 0;
    next_wchar_coll_tab_num_ = 0;

    std::set<std::string> ce_strs;
    ce_sym_map_iter it;
    ce_map_iter ce_mit;

    // collect the narrow collating elements that have been assigned
    // a collation value
    for (it = ce_sym_map_.begin (); it != ce_sym_map_.end (); ++it) {
        ce_mit = ce_map_.find (it->second);

        // skip names without an entry in ce_map_ instead of
        // dereferencing the end iterator
        if (   ce_mit == ce_map_.end ()
            || ce_mit->second.coll_val == UINT_MAX)
            continue;

        // check to see if the largest_ce needs to be changed
        if (ce_mit->second.ce_wstr.size() + 1
            > collate_out_.largest_ce)
            collate_out_.largest_ce = ce_mit->second.ce_wstr.size();

        ce_strs.insert (it->first);
    }

    next_n_ce_tab_num_ = 0;
    gen_n_ce_tables(ce_strs, 0, 0);

    // same for the wide collating elements
    ce_strs.clear();
    for (it = ce_wsym_map_.begin (); it != ce_wsym_map_.end (); ++it) {
        ce_mit = ce_map_.find (it->second);

        if (   ce_mit != ce_map_.end ()
            && ce_mit->second.coll_val != UINT_MAX)
            ce_strs.insert (it->first);
    }

    next_w_ce_tab_num_ = 0;
    gen_w_ce_tables (ce_strs, 0, 0);

    gen_n_to_w_coll_tables ("", next_tab_num_);
    gen_w_to_n_coll_tables ("", next_wchar_coll_tab_num_);

    (dir_name += _RWSTD_PATH_SEP) += lc_name;

    issue_diag (I_OPENWR, false, 0, "writing %s\n", dir_name.c_str ());

    std::ofstream out (dir_name.c_str(), std::ios::binary);
    out.exceptions (std::ios::failbit | std::ios::badbit);

    unsigned int i;

    // calculate the size of an individual weight element
    collate_out_.elm_size = collate_out_.num_weights
        * collate_out_.longest_weight * sizeof (unsigned int) +
        sizeof (unsigned int);

    // the first section of the collate database is the collating
    // element information
    collate_out_.n_ce_tab_off = 0;
    collate_out_.w_ce_tab_off = collate_out_.n_ce_tab_off;

    n_ce_offs_iter n_ce_offs_it;
    for (n_ce_offs_it = n_ce_offs_.begin();
         n_ce_offs_it != n_ce_offs_.end (); ++n_ce_offs_it) {
        collate_out_.w_ce_tab_off += (n_ce_offs_it->second.last_offset
            - n_ce_offs_it->second.first_offset + 1)* sizeof (int);
    }

    // next comes the weight information
    collate_out_.weight_tab_off = collate_out_.w_ce_tab_off;

    w_ce_offs_iter w_ce_offs_it;
    for (w_ce_offs_it = w_ce_offs_.begin();
         w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
        collate_out_.weight_tab_off += (w_ce_offs_it->second.last_offset
            - w_ce_offs_it->second.first_offset + 1)* sizeof (int);
    }

    coll_map_iter coll_map_pos;

    collate_out_.num_elms = off_mapr_.size();
    if (collate_out_.undefined_optimization)
        ++collate_out_.num_elms;

    // now calculate the offset for the first narrow character table
    collate_out_.n_char_tab_off = collate_out_.weight_tab_off
        + collate_out_.num_elms * collate_out_.elm_size;

    // now calculate the offset of the first wide character table
    // but first we need the size of the narrow tables
    char_offs_iter char_offs_it;
    unsigned int char_offs_size = 0;
    for (char_offs_it = char_offs_.begin();
         char_offs_it != char_offs_.end(); ++char_offs_it) {
        char_offs_size += (UCHAR_MAX + 1
                           - char_offs_it->second.first_offset)
            * sizeof (unsigned int);
    }

    collate_out_.w_char_tab_off = collate_out_.n_char_tab_off
        + char_offs_size;

    // now calculate the offset for the narrow character offset table
    // but first we need the size of the wide tables
    w_to_n_coll_iter w_to_n_coll_it;
    unsigned int w_to_n_size = 0;
    for (w_to_n_coll_it = w_to_n_coll_.begin();
         w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) {
        w_to_n_size += (UCHAR_MAX + 1
                        - w_to_n_coll_it->second.first_offset)
            * sizeof (unsigned int);
    }

    collate_out_.n_char_off_tab_off = collate_out_.w_char_tab_off
        + w_to_n_size;

    // now calculate the offset for the wide character offset table
    collate_out_.w_char_off_tab_off = collate_out_.n_char_off_tab_off
        + char_offs_.size() * sizeof (unsigned int);

    // calculate the offset for the narrow collating element offset table
    collate_out_.n_ce_off_tab_off = collate_out_.w_char_off_tab_off
        + w_to_n_coll_.size() * sizeof (unsigned int);

    // calculate the offset for the wide collating element offset table
    collate_out_.w_ce_off_tab_off = collate_out_.n_ce_off_tab_off
        + n_ce_offs_.size() * sizeof (unsigned int);

    // now calculate the offset of the first character information
    collate_out_.n_char_first_char_off = collate_out_.w_ce_off_tab_off
        + w_ce_offs_.size() * sizeof (unsigned int);

    // now calculate the offset of the wide table first char info
    collate_out_.w_char_first_char_off = collate_out_.n_char_first_char_off
        + char_offs_.size() * sizeof (unsigned char);

    // now calculate the offset of the narrow ce first character info
    collate_out_.n_ce_first_char_off = collate_out_.w_char_first_char_off
        + w_to_n_coll_.size() * sizeof (unsigned char);

    // now calculate the offset of the wide ce first character info
    collate_out_.w_ce_first_char_off = collate_out_.n_ce_first_char_off
        + n_ce_offs_.size() * sizeof (unsigned char);

    // now calculate the offset of the narrow ce last character info
    collate_out_.n_ce_last_char_off = collate_out_.w_ce_first_char_off
        + w_ce_offs_.size() * sizeof (unsigned char);

    // now calculate the offset of the wide ce last character info
    collate_out_.w_ce_last_char_off = collate_out_.n_ce_last_char_off
        + n_ce_offs_.size() * sizeof (unsigned char);

    // now calculate the offset of the codeset name
    collate_out_.codeset_off = collate_out_.w_ce_last_char_off
        + w_ce_offs_.size() * sizeof (unsigned char);

    // finally calculate the offset of the charmap name
    collate_out_.charmap_off = collate_out_.codeset_off
        + charmap_.get_code_set_name().size() + 1;

    // print out the collate struct
    out.write ((char*)&collate_out_, sizeof(collate_out_));

    // print out the narrow collating element tables
    for (n_ce_offs_it = n_ce_offs_.begin();
         n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) {
        for (i = (unsigned int)n_ce_offs_it->second.first_offset;
             i <= (unsigned int)n_ce_offs_it->second.last_offset; ++i)
            out.write ((char*)&n_ce_offs_it->second.off[i],
                       sizeof (n_ce_offs_it->second.off[i]));
    }

    // print out the wide collating element tables
    for (w_ce_offs_it = w_ce_offs_.begin();
         w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
        for (i = (unsigned int)w_ce_offs_it->second.first_offset;
             i <= (unsigned int)w_ce_offs_it->second.last_offset; ++i)
            out.write ((char*)&w_ce_offs_it->second.off[i],
                       sizeof (w_ce_offs_it->second.off[i]));
    }

    // now print out the weight array
    bool undefined_written = false;

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    std::size_t off_idx = 0;
    off_mapr_iter current_off = off_mapr_.begin();

    // off_idx counts the slots of the weight table; current_off only
    // advances when the slot belongs to an entry of off_mapr_ (the slot
    // of the UNDEFINED element does not consume an entry)
    for (; current_off != off_mapr_.end(); ++off_idx) {

        // check the current offset
        if (current_off->first != off_idx
            && off_idx != undef_char_info_.offset)
            issue_diag (E_RANGE, true, 0,
                        "current_off : %d, index : %d, giving up\n",
                        current_off->first, off_idx);

        if (collate_out_.undefined_optimization
            && off_idx == undef_char_info_.offset) {
            undefined_written = true;
            write_weight_elm (out, collate_out_,
                              undef_char_info_.order,
                              undef_char_info_.weights);
            continue;
        }

        w_cmap_iter w_cmap_pos = w_cmap.find
            (current_off->second);

        if (w_cmap_pos != w_cmap.end()) {
            // the name denotes a character from the character map
            coll_map_pos = coll_map_.find (w_cmap_pos->second);
            write_weight_elm (out, collate_out_,
                              coll_map_pos->second.order,
                              coll_map_pos->second.weights);
            ++current_off;
        }
        else {
            // otherwise the name may denote a collating element
            ce_map_iter ce_map_it = ce_map_.find (current_off->second);
            if (ce_map_it != ce_map_.end()
                && ce_map_it->second.coll_val != UINT_MAX) {
                write_weight_elm (out, collate_out_,
                                  ce_map_it->second.order,
                                  ce_map_it->second.weights);
            }
            ++current_off;
        }
    }

    // make sure that we got to the undefined value
    if (!undefined_written && collate_out_.undefined_optimization) {
        if (off_idx != undef_char_info_.offset)
            issue_diag (E_RANGE, true, 0,
                        "current_off : %d, index : %d, giving up\n",
                        undef_char_info_.offset, off_idx);

        write_weight_elm (out, collate_out_,
                          undef_char_info_.order,
                          undef_char_info_.weights);
    }

    // now print out the narrow character tables
    for (char_offs_it = char_offs_.begin();
         char_offs_it != char_offs_.end(); ++char_offs_it) {
        for (unsigned int c = char_offs_it->second.first_offset;
             c <= UCHAR_MAX; ++c) {
            out.write ((const char*)&char_offs_it->second.off[c],
                       sizeof (char_offs_it->second.off[c]));
        }
    }

    // now print out the wide character tables
    for (w_to_n_coll_it = w_to_n_coll_.begin();
         w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) {
        for (unsigned int c = w_to_n_coll_it->second.first_offset;
             c <= UCHAR_MAX; ++c) {
            out.write ((const char*)&w_to_n_coll_it->second.off[c],
                       sizeof (w_to_n_coll_it->second.off[c]));
        }
    }

    // now print the narrow character table offsets
    unsigned int current_offset = 0;
    for (char_offs_it = char_offs_.begin();
         char_offs_it != char_offs_.end(); ++char_offs_it) {
        out.write ((const char*)&current_offset, sizeof (current_offset));
        current_offset += (UCHAR_MAX + 1
                           - char_offs_it->second.first_offset);
    }

    // now print the wide character table offsets
    current_offset = 0;
    for (w_to_n_coll_it = w_to_n_coll_.begin();
         w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) {
        out.write ((const char*)&current_offset, sizeof (current_offset));
        current_offset += (UCHAR_MAX + 1
                           - w_to_n_coll_it->second.first_offset);
    }

    // now print the narrow ce table offsets
    current_offset = 0;
    for (n_ce_offs_it = n_ce_offs_.begin();
         n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) {
        out.write ((const char*)&current_offset, sizeof (current_offset));
        current_offset += (n_ce_offs_it->second.last_offset
                           - n_ce_offs_it->second.first_offset + 1);
    }

    // now print the wide ce table offsets
    current_offset = 0;
    for (w_ce_offs_it = w_ce_offs_.begin();
         w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
        out.write ((const char*)&current_offset, sizeof (current_offset));
        current_offset += (w_ce_offs_it->second.last_offset
                           - w_ce_offs_it->second.first_offset + 1);
    }

    // now print out the narrow character tables starting character
    for (char_offs_it = char_offs_.begin();
         char_offs_it != char_offs_.end(); ++char_offs_it) {
        const char off = char ((char_offs_it->second).first_offset);
        out << off;
    }

    // now print out the wide character tables starting character
    for (w_to_n_coll_it = w_to_n_coll_.begin();
         w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) {
        const char off = char ((w_to_n_coll_it->second).first_offset);
        out << off;
    }

    // now print out the narrow ce tables starting character
    for (n_ce_offs_it = n_ce_offs_.begin();
         n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) {
        const char off = char ((n_ce_offs_it->second).first_offset);
        out << off;
    }

    // now print out the wide ce tables starting character
    for (w_ce_offs_it = w_ce_offs_.begin();
         w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
        const char off = char ((w_ce_offs_it->second).first_offset);
        out << off;
    }

    // now print out the narrow ce tables ending character
    for (n_ce_offs_it = n_ce_offs_.begin();
         n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) {
        const char off = char ((n_ce_offs_it->second).last_offset);
        out << off;
    }

    // now print out the wide ce tables ending character
    for (w_ce_offs_it = w_ce_offs_.begin();
         w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
        const char off = char ((w_ce_offs_it->second).last_offset);
        out << off;
    }

    // finally write the codeset and charmap names
    out << charmap_.get_code_set_name() << std::ends
        << charmap_.get_charmap_name() << std::ends;
}
void Def::init_coll_map() {
rw_cmap_iter rw_cmap_pos;
collate_info_t tmp = {UINT_MAX, UINT_MAX, 0, 0};
for (rw_cmap_pos = charmap_.get_rw_cmap().begin();
rw_cmap_pos != charmap_.get_rw_cmap().end();
++rw_cmap_pos) {
coll_map_.insert (std::make_pair (rw_cmap_pos->first, tmp));
}
}
// In the preprocess_collate member function the collate section is
// preprocessed and the result of the preprocessing is stored in memory
// as a number of lists; the content of these lists is then rearranged
// according to the reorder statements; the result is then passed
// on to process_collate.
void Def::preprocess_collate ()
{
int nesting_level = 0;
while (true) {
// fetch next token
next = scanner_.next_token();
switch (next.token) {
case Scanner::tok_end:
next = scanner_.next_token();
if (next.token == Scanner::tok_collate) {
// end of collation block
if (nesting_level == 0)
return;
nesting_level--;
scanner_.close ();
} else
issue_diag (E_SYNTAX, true, &next,
"wrong section name in END directive\n");
break;
case Scanner::tok_copy: {
next = scanner_.next_token();
if (next.token != Scanner::tok_string)
issue_diag (E_SYNTAX, true, &next,
"expected string following \"copy\" directive\n");
// bump up the nesting level
++nesting_level;
issue_diag (I_STAGE, false, 0, "processing copy directive\n");
// open the file
scanner_.open (get_pathname (strip_quotes (next.name), next.file));
// get comment char and escape char;
// these informations are stored by the scanner
while ((next = scanner_.next_token ()).token
!= Scanner::tok_collate ){
// the LC_IDENTIFICATION section may also have a
// LC_COLLATE token that will mess up the parsing
if (next.token == Scanner::tok_ident) {
while ((next = scanner_.next_token()).token
!= Scanner::tok_end );
next = scanner_.next_token();
}
}
break;
}
// a collating element definition defines a symbolic name that
// represents symbolically the congregation of two other symbolic
// names which have to be present in the character map;
// the form of the phrase is:
// collating-element sym from ("string" | (sym sym+))
case Scanner::tok_coll_elem: {
next = scanner_.next_token();
// we expect a symbolic name
if (next.token != Scanner::tok_sym_name)
issue_diag (E_SYNTAX, true, &next,
"symbolic name expected following "
"collating-element\n");
// symbolic name
token_t sym (next);
next = scanner_.next_token ();
if (next.token != Scanner::tok_from)
issue_diag (E_SYNTAX, true, &next,
"illegal collating-element expression\n");
token_list_t ce_def_list;
next = scanner_.next_token ();
if (next.token == Scanner::tok_string) {
ce_def_list.push_back (next);
ce_list_.push_back (std::make_pair(sym,ce_def_list));
} else if (next.token == Scanner::tok_sym_name) {
do {
ce_def_list.push_back (next);
next = scanner_.next_token();
} while (next.token != Scanner::tok_nl);
ce_list_.push_back (std::make_pair(sym,ce_def_list));
} else
issue_diag (E_SYNTAX, true, &next,
"illegal collating-element expression\n");
break;
}
case Scanner::tok_coll_sym:
next = scanner_.next_token();
if (next.token != Scanner::tok_sym_name)
issue_diag (E_SYNTAX, true, &next,
"symbolic name expected following "
"collating-symbol\n");
cs_list_.push_back (next);
break;
case Scanner::tok_script:
next = scanner_.next_token ();
if (next.token != Scanner::tok_sym_name)
issue_diag (W_SYNTAX, false, &next,
"expecting script name, got %s\n",
next.name.c_str ());
script_list_.push_back(next);
break;
case Scanner::tok_sym_name:
sym_list_.push_back (next);
break;
case Scanner::tok_order_start:
preprocess_order();
break;
case Scanner::tok_reorder:
preprocess_reorder ();
break;
case Scanner::tok_reorder_section:
preprocess_reorder_section ();
break;
default:
break;
}
}
}
// The task of preprocess_order is to parse and model the content of the
// order sections in the input files
void Def::preprocess_order ()
{
// one order-start keyword has been encountered;
// push a collate_section down the list and use it
while (next.token != Scanner::tok_order_end) {
section_list_.push_back (collate_section_t());
collate_section_t& section = section_list_.back ();
next = scanner_.next_token ();
// store the orders
while (next.token != Scanner::tok_nl) {
if (next.token == Scanner::tok_sym_name) {
// symbolic name, has to be a section name
section.name = next.name;
} else {
// might be a combination of order and position
// do the same until a better way is found
section.order.push_back (next);
}
// fetch next token
next = scanner_.next_token ();
}
// unnamed sections
if (section.name == "")
section.name = "unnamed";
issue_diag (I_STAGE, false, 0,
"processing %s order\n", section.name.c_str ());
// store the collation statements
while (!(next.token == Scanner::tok_order_end ||
next.token == Scanner::tok_order_start )) {
next = scanner_.next_token ();
switch (next.token) {
case Scanner::tok_end:
issue_diag (E_SYNTAX, true, &next,
"unexpected END directive while "
"parsing collate section\n");
case Scanner::tok_nl:
continue;
case Scanner::tok_comment:
scanner_.ignore_line ();
break;
case Scanner::tok_undefined:
case Scanner::tok_sym_name:
{
token_t sym (next);
section.entries.push_back (
std::make_pair(sym,token_list_t()));
collate_entry_t& entry = section.entries.back ();
next = scanner_.next_token ();
while (next.token != Scanner::tok_nl) {
entry.second.push_back (next);
next = scanner_.next_token ();
}
break;
}
default:
break;
}
}
}
}
// Handles the reorder statements in the form:
// reorder-after <sym>
// <sym> <sym><sym>...
// OR
// reorder-after <sym>
// .. <sym><sym>.....
// followed by reorder-end statement or another reorder-after
void Def::preprocess_reorder ()
{
while (true) {
// process one or more statements grouped
// under a reorder section
// retrieve the symbol
next = scanner_.next_token ();
if (next.token != Scanner::tok_sym_name)
issue_diag (E_SYNTAX, true, &next,
"expecting symbolic name, got %s\n",
next.name.c_str ());
// store the symbolic name
token_t sym (next);
collate_entry_list_t entries_list;
while (true) {
// process the statements
next = scanner_.next_token ();
if (next.token == Scanner::tok_nl )
continue;
else if ( next.token == Scanner::tok_sym_name
|| next.token == Scanner::tok_hex_ellipsis) {
// the line will contain one single symbolic name
// or a complete collation statement
collate_entry_t entry;
entry.first = next;
next = scanner_.next_token ();
while (next.token != Scanner::tok_nl) {
entry.second.push_back (next);
next = scanner_.next_token ();
}
// add the entry to the list
entries_list.push_back (entry);
} else if (next.token == Scanner::tok_reorder) {
// call insert_entry
if (!entries_list.empty ())
insert_entries (sym, entries_list);
entries_list.clear ();
break;
} else if (next.token == Scanner::tok_reorder_end ) {
// call insert_entry
// call insert_entry
if (!entries_list.empty ())
insert_entries (sym, entries_list);
return;
} else {
issue_diag (E_SYNTAX, true, &next,
"unexpected token : %s in a reorder block\n",
next.name.c_str ());
}
}
}
}
void Def::preprocess_reorder_section ()
{
while (next.token != Scanner::tok_reorder_section_end) {
// process one or more statements grouped
// under a reorder section
// retrieve the symbol
next = scanner_.next_token ();
token_t sym (next);
// process the statements
while (!(next.token == Scanner::tok_reorder_section ||
next.token == Scanner::tok_reorder_section_end)) {
next = scanner_.next_token ();
switch (next.token) {
case Scanner::tok_nl:
continue;
case Scanner::tok_sym_name: {
// the line will contain one single symbolic name
// which is the name of a section
next = scanner_.next_token ();
if (next.token != Scanner::tok_sym_name) {
issue_diag (E_SYNTAX, true, &next,
"expecting section name, got %s\n",
next.name.c_str ());
}
std::list<collate_section_t>::iterator beg =
section_list_.begin ();
std::list<collate_section_t>::iterator end =
section_list_.end ();
std::list<collate_section_t>::iterator ref_it, mov_it;
for (ref_it = beg;
ref_it->name != sym.name && ref_it != end;
++ref_it);
for (mov_it = beg;
mov_it->name != next.name && mov_it != end;
++mov_it);
if (ref_it == end || mov_it == end)
issue_diag (E_SYNTAX, true, &next,
"missing sections %s %s requested\n",
sym.name.c_str (), next.name.c_str ());
// replace it
section_list_.splice (++ref_it, section_list_,
mov_it, mov_it);
// replace the insertion point
sym = next;
break;
}
default:
issue_diag (E_SYNTAX, true, &next,
"unexpected token : %s in a "
"reorder section block\n",
next.name.c_str ());
break;
}
}
}
}
// Prints the preprocessed content of the collate section to standard
// output: the scripts, the collating symbols, the collating elements,
// the symbols found outside sections, and finally every section with
// its order tokens and entries.
void Def::list_collate ()
{
    // print the script list
    token_list_t::iterator script_it = script_list_.begin ();
    for (; script_it != script_list_.end (); ++script_it)
        std::cout << "script " << script_it->name << '\n';

    std::cout << '\n';

    // print the collating symbols' list
    token_list_t::iterator cs_it = cs_list_.begin ();
    for (; cs_it != cs_list_.end (); ++cs_it)
        std::cout << "collating-symbol " << cs_it->name << '\n';

    std::cout << '\n';

    // print the collating elements list; the iterator has to be
    // advanced here or the loop never ends for a non-empty list
    std::list<collate_elem_t>::iterator ce_it = ce_list_.begin ();
    for (; ce_it != ce_list_.end (); ++ce_it) {
        std::cout << "collating-element " << ce_it->first.name
                  << " from ";

        token_list_t::iterator ce_def_it = ce_it->second.begin ();
        for (; ce_def_it != ce_it->second.end (); ++ce_def_it)
            std::cout << ce_def_it->name << ";";

        std::cout << '\n';
    }

    std::cout << '\n';

    // print out general/global symbols' list
    token_list_t::iterator sym_it = sym_list_.begin ();
    for (; sym_it != sym_list_.end (); ++sym_it)
        std::cout << sym_it->name << '\n';

    std::cout << '\n';

    // print out sections
    std::list<collate_section_t>::iterator sc_it =
        section_list_.begin ();
    for (; sc_it != section_list_.end (); ++sc_it) {

        // print prolog: the section name is omitted for sections
        // carrying the placeholder name "unnamed"
        std::cout << "order_start ";
        if (sc_it->name != "unnamed")
            std::cout << sc_it->name << ";";

        token_list_t::iterator it = sc_it->order.begin ();
        for (; it != sc_it->order.end (); ++it)
            std::cout << it->name << ";";

        std::cout << '\n';

        // print the entries, one per line, followed by their weights
        std::list<collate_entry_t>::iterator e_it =
            sc_it->entries.begin ();
        for (; e_it != sc_it->entries.end (); ++e_it) {
            std::cout << e_it->first.name << " ";

            token_list_t::iterator w_it = e_it->second.begin ();
            for (; w_it != e_it->second.end (); ++w_it)
                std::cout << w_it->name << ";";

            std::cout << '\n';
        }
    }

    // a single order_end closes all the sections
    std::cout << '\n' << "order_end\n";
}
// FIXME - modify the algorithm to get log(N) complexity
// Inserts an entry inside the section information;
// searching for the appropriate entry in the list is
// done in linear time
// Moves the entries in `e` right after the symbol named by `s`.
// The entries are first removed from wherever they currently are;
// the reference symbol is then looked up in sym_list_ and, failing
// that, among the entries of each section. Returns true when the
// reference symbol was found; otherwise issues a warning and returns
// false. Callers are expected to pass a non-empty list.
bool Def::insert_entries (token_t& s, collate_entry_list_t& e)
{
    // first remove these entries if found
    for (collate_entry_list_t::iterator old = e.begin ();
         old != e.end (); ++old)
        remove_entry (*old);

    // first search through the symbols list
    token_list_t::iterator sym_pos = sym_list_.begin ();
    while (sym_pos != sym_list_.end () && sym_pos->name != s.name)
        ++sym_pos;

    if (sym_pos != sym_list_.end ()) {
        // FIXME - all statements have to have no weights if they are
        // to be inserted after collation symbols outside section
        // boundaries; only the first entry is checked
        if (!e.begin ()->second.empty ()) {
            issue_diag (E_REORD, true, &s,
                        "requested reorder-after: reference "
                        "symbol %s was found outside section boundaries "
                        "and the objects to be reordered are collation "
                        "definitions\n", s.name.c_str ());
        }

        // insert the symbolic names one after another, each one
        // behind the previously inserted one
        for (collate_entry_list_t::iterator cur = e.begin ();
             cur != e.end (); ++cur)
            sym_pos = sym_list_.insert (++sym_pos, cur->first);

        return true;
    }

    // if not successful then search in each section
    for (std::list<collate_section_t>::iterator sect =
             section_list_.begin ();
         sect != section_list_.end (); ++sect) {

        collate_entry_list_t::iterator entry_pos = sect->entries.begin ();
        while (   entry_pos != sect->entries.end ()
               && entry_pos->first.name != s.name)
            ++entry_pos;

        if (entry_pos != sect->entries.end ()) {
            // found it; insert the entries behind it
            sect->entries.insert (++entry_pos, e.begin (), e.end ());
            return true;
        }
    }

    issue_diag (W_REORD, false, &s,
                "requested reorder-after: reference "
                "symbol %s was not found \n", s.name.c_str ());
    return false;
}
// FIXME - modify the algorithm to get log(N) complexity
// Removes the first occurrence of the entry named e.first.name.
// An entry without weights is first looked for in sym_list_; if it is
// not there, or if the entry has weights, the entries of each section
// are searched instead. Nothing happens when the name is not found.
void Def::remove_entry (collate_entry_t& e)
{
    if (e.second.empty ()) {
        // no weights: the name may be one of the plain symbols
        token_list_t::iterator sym_pos = sym_list_.begin ();
        while (sym_pos != sym_list_.end ()
               && sym_pos->name != e.first.name)
            ++sym_pos;

        if (sym_pos != sym_list_.end ()) {
            sym_list_.erase (sym_pos);
            return;
        }
    }

    // search in the sections
    for (std::list<collate_section_t>::iterator sect =
             section_list_.begin ();
         sect != section_list_.end (); ++sect) {

        std::list<collate_entry_t>::iterator entry_pos =
            sect->entries.begin ();
        while (   entry_pos != sect->entries.end ()
               && entry_pos->first.name != e.first.name)
            ++entry_pos;

        if (entry_pos != sect->entries.end ()) {
            sect->entries.erase (entry_pos);
            return;
        }
    }
}
// Converts the weight token `w` into weights[weight_num].
//
//   w           the token to convert: a symbolic name, a numeric
//               character value, IGNORE, a quoted string, or an
//               absolute ellipsis
//   weights     array of weights being filled in
//   weight_num  index of the weight to store; a numeric character
//               value may fill this and all the following weights
//
// Returns true only when the token is an absolute ellipsis, false
// otherwise. Unknown or not yet defined symbols are reported through
// issue_diag(); any other kind of token only produces a warning.
bool Def::get_weight ( token_t& w,
                       Weights_t* weights,
                       int weight_num)
{
    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    bool ret = false;

    weights[weight_num].size = 1;

    ce_map_iter ce_map_it;

    if(w.token == Scanner::tok_sym_name) {

        w_cmap_iter w_cmap_pos = w_cmap.find (w.name);

        if (w_cmap_pos != w_cmap.end()){
            coll_map_iter coll_map_pos = coll_map_.find(w_cmap_pos->second);

            // same check as for names found inside quoted strings:
            // the character may have no entry in coll_map_ at all
            if (coll_map_pos == coll_map_.end())
                issue_diag (E_SYMUSED, true, &w,
                            "weight %s not defined\n",
                            w.name.c_str());
            else {
                if (coll_map_pos->second.coll_val == UINT_MAX)
                    issue_diag (E_SYMUSED, true,
                                &w, "symbolic name %s "
                                "used as weight before being assigned a "
                                "collation value\n", w.name.c_str());

                // store weight
                weights[weight_num].weight[0] =
                    coll_map_pos->second.coll_val;
            }
        }
        else if ((ce_map_it = ce_map_.find (w.name))
                 != ce_map_.end())
            weights[weight_num].weight[0] = ce_map_it->second.coll_val;
        else {
            cs_map_iter cs_it = cs_map_.find (w.name);

            if (cs_it == cs_map_.end ())
                issue_diag (E_SYNTAX, true,
                            &w, "symbolic name %s not found\n",
                            w.name.c_str());
            else
                weights[weight_num].weight[0] = cs_it->second;
        }
    }
    else if (w.token == Scanner::tok_char_value) {

        // the weight is given in numerical form
        const char* next_val =
            std::strchr (w.name.c_str (), scanner_.escape_char ());

        assert (0 != next_val);

        // position of the ';' that ends the current weight, if any
        const char* next_wt = std::strchr (w.name.c_str (), ';');

        // NOTE(review): next_val is left wherever convert_escape()
        // stopped; if that is the ';' itself the strchr() call at the
        // bottom of the loop finds the same position again and the
        // remaining weights come out empty - confirm against
        // Scanner::convert_escape()
        while (weight_num < collate_out_.num_weights) {

            std::size_t c;

            for (c = 0; *next_val && (!next_wt || next_val < next_wt); ++c) {

                const char* end = 0;

                weights [weight_num].weight [c] =
                    scanner_.convert_escape (next_val, &end, true);

                assert (0 != end);

                next_val = end;
            }

            weights [weight_num++].size = c;

            if (next_wt)
                next_wt = std::strchr (next_val, ';');
        }
    }
    else if(w.token == Scanner::tok_ignore) {
        // use the special weight 0 for IGNORE weights
        weights[weight_num].size = 1;
        weights[weight_num].weight[0] = 0;
    }
    else if (w.token == Scanner::tok_string) {

        // the weights are given either in symbolic name form (e.g.,
        // "<symbolic-name>" or in the form of a quoted multibyte
        // character string (e.g., "\001\d010\x16\")

        // the string content without the enclosing quotes
        const std::string tmp (w.name.substr (1, w.name.size() - 2));

        // keeps track of the length of the weight
        unsigned char k = 0;

        // iterate thru the string content and retrieve the symbols
        std::string::const_iterator it = tmp.begin ();
        const std::string::const_iterator tmp_end = tmp.end ();

        const char escape = scanner_.escape_char ();

        while (it != tmp_end) {

            // weight in string form
            std::string wsym;

            // next comes a symbolic name
            if (*it == '<') {

                // copy the name up to the closing '>' without ever
                // reading past the end of the string; an escape
                // character is dropped and the character that follows
                // it is copied verbatim
                while (it != tmp_end && *it != '>') {
                    if (*it == escape) {
                        ++it;
                        if (it == tmp_end)
                            break;
                    }
                    wsym += *it++;
                }

                if (it == tmp_end) {
                    // the closing '>' is missing
                    issue_diag (E_SYNTAX, true, &w,
                                "unterminated symbolic name %s "
                                "in weight\n", wsym.c_str());
                    break;
                }

                wsym += *it++;

                // wsym has the symbolic name, lookup for it in
                // the character map, collating-symbol map,
                // and collating-element map
                w_cmap_iter w_cm_pos = w_cmap.find(wsym);
                cs_map_iter cs_it = cs_map_.find (wsym);
                ce_map_iter ce_it = ce_map_.find (wsym);

                if (w_cm_pos != w_cmap.end()) {
                    // is in the character map, check its associated
                    // collation value
                    coll_map_iter coll_it = coll_map_.find(w_cm_pos->second);

                    if (coll_it == coll_map_.end())
                        issue_diag (E_SYMUSED, true, &w,
                                    "weight %s not defined\n",
                                    wsym.c_str());
                    else {
                        if (coll_it->second.coll_val == UINT_MAX)
                            issue_diag (E_SYMUSED, true,
                                        &w, "symbolic name %s "
                                        "used as weight before being "
                                        "assigned a "
                                        "collation value\n", wsym.c_str());

                        weights[weight_num].weight[k++] =
                            coll_it->second.coll_val;
                    }
                }
                else if (ce_it != ce_map_.end()) {
                    // it is in the collating-element map
                    weights[weight_num].weight[k++] = ce_it->second.coll_val;
                }
                else if (cs_it != cs_map_.end()) {
                    // it is in the collating-symbol map
                    weights[weight_num].weight[k++] = cs_it->second;
                }
                else {
                    // it is not in any of the maps, that's an error
                    issue_diag (E_SYNTAX, true, &w,
                                "symbolic name %s not found\n",
                                wsym.c_str());
                }
            }
            else if (*it == escape) {
                // weight is given in a quoted escape form
                const char* const beg = tmp.c_str () + (it - tmp.begin ());

                const char* end = 0;

                weights [weight_num].weight [k++] =
                    scanner_.convert_escape (beg, &end, true);

                assert (0 != end);

                it += end - beg;

                // the escape form is converted in one go
                break;
            }
            else {
                issue_diag (E_SYNTAX, true, &w,
                            "illegal string content as a weight");

                // stop scanning: `it` cannot be advanced past
                // the offending character
                break;
            }
        }

        weights[weight_num].size = k;

        if (k > collate_out_.longest_weight)
            collate_out_.longest_weight = k;
    }
    else if (w.token == Scanner::tok_abs_ellipsis)
        // return true if ellipsis are embedded in the weight
        ret = true;
    else
        warnings_occurred_ =
            issue_diag (W_SYNTAX, false, &w,
                        "illegal token %s found in collation definition\n",
                        w.name.c_str()) || warnings_occurred_;

    return ret;
}
// if undefined optimization is on then only those characters that have
// been defined should go into the valid set, otherwise all characters go
// into the set.
void Def::gen_valid_coll_wchar_set () {
if (!valid_coll_wchar_set_.empty())
return;
create_wchar_utf8_table();
for (coll_map_iter coll_it = coll_map_.begin();
coll_it != coll_map_.end(); ++coll_it) {
if (coll_it->second.offset != UINT_MAX
|| !collate_out_.undefined_optimization){
std::string valid = utf8_encode (coll_it->first);
valid = valid.substr (0, valid.size() - 1);
while (valid.size() > 0){
valid_coll_wchar_set_.insert (valid);
valid = valid.substr(0, valid.size() - 1);
}
}
}
}