/***************************************************************************
 *
 * collate.cpp
 *
 * $Id$
 *
 ***************************************************************************
 *
 * Licensed to the Apache Software  Foundation (ASF) under one or more
 * contributor  license agreements.  See  the NOTICE  file distributed
 * with  this  work  for  additional information  regarding  copyright
 * ownership.   The ASF  licenses this  file to  you under  the Apache
 * License, Version  2.0 (the  "License"); you may  not use  this file
 * except in  compliance with the License.   You may obtain  a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the  License is distributed on an  "AS IS" BASIS,
 * WITHOUT  WARRANTIES OR CONDITIONS  OF ANY  KIND, either  express or
 * implied.   See  the License  for  the  specific language  governing
 * permissions and limitations under the License.
 *
 * Copyright 2001-2006 Rogue Wave Software.
 * 
 **************************************************************************/

#include "def.h"          // for Def
#include "diagnostic.h"   // for issue_diag()
#include "path.h"         // for get_pathname()
#include "scanner.h"      // for scanner

#include <fstream>        // for ofstream
#include <iostream>       // for cout
#include <string>         // for string
#include <vector>         // for vector

#include <cassert>        // for assert()
#include <cctype>         // for toupper()
#include <cstdio>         // for sprintf()
#include <cstdlib>        // for strtoul()
#include <cstring>        // for strchr()


void Def::
create_wchar_utf8_table ()
{
    if (!wchar_utf8_to_ext_.empty())
        return;

    typedef std::map<std::string, wchar_t>::const_iterator n_cmap_citer2;

    const n_cmap_citer2 n_cmap_end = charmap_.get_mb_cmap ().end ();

    for (n_cmap_citer2 n_cmap_it = charmap_.get_mb_cmap ().begin ();
         n_cmap_it != n_cmap_end; ++n_cmap_it) {

        const std::string wchar_utf8 = utf8_encode (n_cmap_it->second);

        wchar_utf8_to_ext_.insert (std::make_pair (wchar_utf8,
                                                   n_cmap_it->first));
    }
}


void Def::process_weights (collate_entry_t& entry)
{
    // iterator of weights tokens
    token_list_t::iterator w_it = entry.second.begin ();

    w_cmap_iter   wcmap_it;
    ce_map_iter   ce_map_it;
    coll_map_iter coll_map_it;

    Weights_t* weights = new Weights_t[collate_out_.num_weights];

    std::size_t i = 0;
    for (i = 0; i < collate_out_.num_weights && w_it != entry.second.end (); 
         ++i, ++w_it){
        get_weight (*w_it, weights, i);
    }

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    // the remaining weights - see above - are given as weight the
    // collating value of the symbol at start of line
    while (i < collate_out_.num_weights) {
        weights[i].size = 1;
        wcmap_it = w_cmap.find (entry.first.name);
        if (wcmap_it != w_cmap.end()) {
            coll_map_it = coll_map_.find (wcmap_it->second);
            weights[i].weight[0] = coll_map_it->second.coll_val;
        }
        else if ((ce_map_it = ce_map_.find (entry.first.name)) != 
                 ce_map_.end()) {
           weights[i].weight[0] = ce_map_it->second.coll_val;
        }
        else {
            warnings_occurred_ = 
                issue_diag (W_COLSYM, false, &entry.first,
                            "symbolic name %s not defined as character or "
                            "collating-element, ignoring\n", 
                            entry.first.name.c_str()) || warnings_occurred_;
        }
        i++;
    }

    wcmap_it = w_cmap.find (entry.first.name);
    if (wcmap_it != w_cmap.end()) {
        coll_map_it = coll_map_.find (wcmap_it->second);
        coll_map_it->second.weights = weights;
    } else if ((ce_map_it = ce_map_.find (entry.first.name)) != ce_map_.end()) {
        ce_map_it->second.weights = weights;
    } else {
        warnings_occurred_ =  
            issue_diag (W_COLSYM, false, &entry.first,
                        "symbolic name %s not defined as character or "
                        "collating-element, ignoring\n", 
                        entry.first.name.c_str()) || warnings_occurred_;
    }
}


unsigned int Def::process_order_stmt (collate_section_t& section)
{
    // number of orders in the section
    collate_out_.num_weights = section.order.size ();
    if (collate_out_.num_weights == 0) {
        collate_out_.num_weights = 1;
        collate_out_.weight_type[0] = 0;
    } else {
        collate_out_.num_weights = 0;
        token_list_t::iterator ord_it = section.order.begin ();
        for (; ord_it != section.order.end (); 
             ++ord_it, ++collate_out_.num_weights) {
            if (ord_it->token == Scanner::tok_forward) {
                collate_out_.weight_type[collate_out_.num_weights] = 0;
            } else if (ord_it->token == Scanner::tok_backward) {
                collate_out_.weight_type[collate_out_.num_weights] = 1;
            } else if (ord_it->token == Scanner::tok_position) {
                // this is equivalent to "forward,position"
                if (no_position_)
                    collate_out_.weight_type[collate_out_.num_weights] = 0;
                else
                    collate_out_.weight_type[collate_out_.num_weights] = 2;
            } else {
                std::string::size_type comma_pos = ord_it->name.find (',');
                if (comma_pos != std::string::npos) {
                    std::string first  = ord_it->name.substr (0, comma_pos);
                    std::string second = ord_it->name.substr (comma_pos + 1);

                    // verify a valid weight ordering
                    if ((first == "forward" && second != "position") || 
                        first == "backward" && second != "position") {
                        issue_diag (E_COLORD, true,
                                    &*ord_it, "unrecognized collating order\n");
                        break;
                    } else if (first == "forward") {
                        if (no_position_)
                            collate_out_.weight_type
                                [collate_out_.num_weights] = 0;
                        else
                            collate_out_.weight_type
                                [collate_out_.num_weights] = 2;
                    }
                    else
                        collate_out_.weight_type[collate_out_.num_weights] = 3;
                }
                else {
                    issue_diag (E_COLORD, true,
                                &*ord_it, "unrecognized collating order\n");
                }
            }
        }
    }

    // build the order value;
    unsigned int order = 0;
    for (int i = 0; i < collate_out_.num_weights; ++i) {
        order <<= 2;
        order |= collate_out_.weight_type[i];
    }

    return order;
}


// decimally increment the symbolic name, turning something like
// <U1234> into <U1245>, or <jis234> to <jis235>
static std::string
dec_increment (const std::string &sym)
{
    const char *pdig = sym.c_str ();

    for (; !(std::isdigit (*pdig)); ++pdig);

    char *end;

    const unsigned long sym_val = std::strtoul (pdig, &end, 10);

    char numstr [64];
    std::sprintf (numstr, "%lu", sym_val + 1);

    std::string next_sym = sym.substr (0, pdig - sym.c_str ());
    next_sym.append (numstr);
    next_sym.append (end);

    return next_sym;
}


// hexadecimally increment the symbolic name
static std::string
hex_increment (const std::string& sym)
{
    const char *pdig = sym.c_str ();

    for (; !(std::isxdigit (*pdig)); ++pdig);

    char *end;

    const unsigned long sym_val = std::strtoul (pdig, &end, 16);

    char numstr [64];
    std::sprintf (numstr, "%lx", sym_val + 1);

    std::string next_sym = sym.substr (0, pdig - sym.c_str ());
    next_sym.append (numstr);
    next_sym.append (end);

    return next_sym;
}


// scanning ahead and adding all symbols to the collating symbols map
void Def::preprocess_collation_definitions()
{
    // start with collation values from 2
    unsigned int coll_value = 2;

    // previous_elm and next_elm are used for processing ellipsis.
    std::string prev_elm;
    std::string next_elm;

    static unsigned int max_orders = 0;

    token_list_t::iterator cs_it = sym_list_.begin ();
    for (; cs_it != sym_list_.end (); ++cs_it) {
        cs_map_iter csm_it = cs_map_.find (cs_it->name);
        if (csm_it != cs_map_.end ())
            csm_it->second = coll_value++;
    }

    std::list<collate_section_t>::iterator s_it = section_list_.begin ();

    if (s_it != section_list_.end()) {
        // get maximum weights count
        max_orders = s_it->order.size ();
    }

    for (; s_it != section_list_.end (); ++s_it) {
        if (max_orders != s_it->order.size ())
            issue_diag (E_COLNUM, true, &*s_it->order.begin (),
                        "number of collation orders "
                        "different for this section\n");
    }

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    // preprocess sections' collation definitions
    for (s_it = section_list_.begin (); 
         s_it != section_list_.end (); ++s_it) {

        // process the order statement and get the weight
        unsigned int order = process_order_stmt (*s_it);

        std::list<collate_entry_t>::iterator e_it = 
            s_it->entries.begin ();

        for (; e_it != s_it->entries.end (); ++e_it) {
            switch (e_it->first.token) {
            case Scanner::tok_sym_name:
                // store the previous name
                prev_elm = e_it->first.name;

                // process each of the collation definitions
                process_collation_definition (false, *e_it, 
                                              coll_value++, order);

                break;

            case Scanner::tok_abs_ellipsis:   // "<FOO>...<BAR>"
            case Scanner::tok_hex_ellipsis:   // "<FOO>..<BAR>"
            case Scanner::tok_dec_ellipsis:   // "<FOO>....<BAR>"
            {

                if (++e_it != s_it->entries.end ())
                    next_elm = e_it->first.name;
                else {
                    issue_diag (E_SYNTAX, true, &e_it->first,
                                "unexpected end of collation section while"
                                " processing ellipsis during scan_ahead\n");
                }

                if (e_it->first.token == Scanner::tok_abs_ellipsis) {
                    wchar_t w_cmap_cur_val 
                        = w_cmap.find(prev_elm)->second;
                    wchar_t w_cmap_end_val
                        = w_cmap.find(next_elm)->second;

                    // the first value has already been 
                    // added so don't add it again
                    w_cmap_cur_val = charmap_.increment_wchar (w_cmap_cur_val);
                    while (w_cmap_cur_val != w_cmap_end_val) {
                        // process iteration
                        collate_info_t ci = {UINT_MAX, UINT_MAX, 0, 0};
                        ci.coll_val = coll_value++;
                        ci.order = order;

                        coll_map_.insert (std::make_pair (w_cmap_cur_val, ci));
                        w_cmap_cur_val = 
                            charmap_.increment_wchar (w_cmap_cur_val);
                    }

                    // add last element "next_elm" to array
                    collate_info_t ci = {UINT_MAX, UINT_MAX, 0, 0};
                    ci.coll_val = coll_value++;
                    ci.order = order;
                    coll_map_.insert (std::make_pair (w_cmap_cur_val, ci));

                }
                else {
                    // we are incrementing the symbolic names
                    std::string next_name = prev_elm;
                    do {
                        if (e_it->first.token == Scanner::tok_hex_ellipsis) 
                            next_name = hex_increment (next_name);
                        else 
                            next_name = dec_increment (next_name);

                        w_cmap_iter it = w_cmap.find (next_name);
                        if (it != w_cmap.end()) {
                            // process iteration
                            collate_info_t ci = {UINT_MAX, UINT_MAX, 0, 0};
                            ci.coll_val = coll_value++;
                            ci.order = order;
                            coll_map_.insert (
                                std::make_pair (it->second, ci));
                        }
                    } while (next_name != next_elm);
                }

                prev_elm = next_elm;
                break;
            }

            case Scanner::tok_dbl_ellipsis:
                    issue_diag (W_NOTSUP, true, &e_it->first,
                                "ellipsis not supported"
                                " processing ellipsis during scan_ahead\n");
            default:
                break;
            }
        }
    }
}


void Def::process_collation_definition ( bool do_weights,
                                         collate_entry_t& entry,
                                         unsigned int coll_value,
                                         unsigned int order)
{
    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    // iterators
    w_cmap_iter w_cmap_pos = 
        w_cmap.find(entry.first.name);
    ce_map_iter ce_map_pos = 
        ce_map_.find(entry.first.name);

    // look up the symbolic name in the wide character map
    if (w_cmap_pos != w_cmap.end()) {

        wchar_t wval = w_cmap_pos->second;
        coll_map_iter coll_map_pos = coll_map_.find (wval);

        coll_map_pos->second.coll_val = coll_value;
        coll_map_pos->second.order = order;

        // process the weights
        if (do_weights) {
            unsigned int offset = next_offset_++;
            coll_map_pos->second.offset = offset;
            off_mapr_.insert(std::make_pair(offset, entry.first.name));

            process_weights (entry); 
        }

    }
    else if (ce_map_pos != ce_map_.end()) {

        ce_map_pos->second.coll_val = coll_value;
        ce_map_pos->second.order    = order;
                
        // process the weights
        if (do_weights) {
            unsigned int offset = next_offset_++;
            ce_map_pos->second.offset = offset;
            off_mapr_.insert(std::make_pair(offset, entry.first.name));     

            process_weights (entry);
        }

    }
    else if (cs_map_.find(entry.first.name) != cs_map_.end()) {

        cs_map_.find(entry.first.name)->second = coll_value;

    }
    else {
        // the symbol is missing from all three maps; 
        // add it in cs map as "sym <-> collating_value" pair
        cs_map_.insert (std::make_pair(entry.first.name, coll_value));

        warnings_occurred_ =  
            issue_diag (W_COLSYM, false, &entry.first, 
                        "unknown symbol name %s found in "
                        "LC_COLLATE definition\n", 
                        entry.first.name.c_str()) || warnings_occurred_;
    }
}

void Def::process_order(collate_section_t& section, 
                        unsigned int& coll_value)
{
    std::string sym;

    // get the orders
    unsigned int section_order = process_order_stmt (section);

    // current element
    std::string curr_elm;

    // previous_elm and next_elm are used for processing ellipsis.
    std::string prev_elm;
    std::string next_elm;

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    // iterate thru section's entries
    std::list<collate_entry_t>::iterator e_it = section.entries.begin ();
    for (; e_it != section.entries.end (); ++e_it) {
        if (e_it->first.token == Scanner::tok_sym_name) {
            prev_elm = e_it->first.name;
            process_collation_definition (true, *e_it, 
                                          coll_value++, section_order);

        }
        else if (   e_it->first.token == Scanner::tok_abs_ellipsis
                 || e_it->first.token == Scanner::tok_dec_ellipsis
                 || e_it->first.token == Scanner::tok_hex_ellipsis) {

            // process line with ellipsis
            Weights_t* weights_template = 
                new Weights_t[collate_out_.num_weights];
            std::vector<bool> ordinal_weights;

            token_list_t::iterator w_it = e_it->second.begin ();
            for (int i = 0; i < collate_out_.num_weights 
                     && w_it != e_it->second.end (); ++i, ++w_it) {
                ordinal_weights.push_back (
                    get_weight (*w_it,weights_template, i));
            }
            while (ordinal_weights.size() < collate_out_.num_weights)
                ordinal_weights.push_back (true);

            // next line
            if (++e_it != section.entries.end ())
                next_elm = e_it->first.name;
            else {
                issue_diag (E_SYNTAX, true, &e_it->first,
                            "unexpected end of collation section while"
                            " processing ellipsis\n");
            }

            if (e_it->first.token == Scanner::tok_abs_ellipsis) {
                wchar_t w_cmap_cur_val 
                    = w_cmap.find(prev_elm)->second;
                wchar_t w_cmap_end_val
                    = w_cmap.find(next_elm)->second;
            
                // the first value has already been added so don't add it again
                w_cmap_cur_val = charmap_.increment_wchar (w_cmap_cur_val);
            
                while (w_cmap_cur_val != w_cmap_end_val) {

                    add_to_coll (w_cmap_cur_val, weights_template,
                                 coll_value++, ordinal_weights, false);
   
                    w_cmap_cur_val = charmap_.increment_wchar (w_cmap_cur_val);
                }

                // add the end element to the collation array.
                add_to_coll (w_cmap_end_val, weights_template, coll_value++,
                             ordinal_weights, false);

            } else { 

                // we are incrementing the symbolic names
                std::string next_name = prev_elm;
                do {
                    if (e_it->first.token == Scanner::tok_hex_ellipsis) 
                        next_name = hex_increment (next_name);
                    else 
                        next_name = dec_increment (next_name);

                    w_cmap_iter it = w_cmap.find (next_name);
                    if (it != w_cmap.end()) {
                        add_to_coll (it->second, weights_template, 
                                     coll_value++, ordinal_weights, false);
                    }
                } while (next_name != next_elm);
            }

            // store current left-hand symbol
            next_elm = prev_elm;
        
            delete[] weights_template;

        } else if (e_it->first.token == Scanner::tok_undefined) {

            // add all characters that were not explicitely given a collation
            // value in increasing order based on their encoded values
            undefined_keyword_found_ = true;
            collate_out_.undefined_optimization = true;
            undef_char_info_.order = section_order;
            Weights_t* weights_template 
                = new Weights_t[collate_out_.num_weights];
            std::vector<bool> ordinal_weights;

            token_list_t::iterator w_it = e_it->second.begin ();
            for (int i = 0; i < collate_out_.num_weights 
                     && w_it != e_it->second.end(); ++i, ++w_it) {

                if (w_it->token == Scanner::tok_abs_ellipsis) 
                    collate_out_.undefined_optimization = false;

                ordinal_weights.push_back (
                    !get_weight (*w_it, weights_template, i));
            }

            while (ordinal_weights.size() < collate_out_.num_weights)
                ordinal_weights.push_back (false);

            add_missing_values (ordinal_weights, weights_template,
                                coll_value, false);

            delete[] weights_template;
        } 
    }
}

void Def::add_missing_values (const std::vector<bool> &ordinal_weights,
                              const Weights_t* weights_template,
                              unsigned int &coll_value, bool give_warning)
{
    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    // we want to print a warning message once if there are characters that 
    // were not given collation values.
    bool warning_issued = false;

    symnames_list_iter symnames_it;
    for (symnames_it = charmap_.get_symnames_list ().begin ();
         symnames_it != charmap_.get_symnames_list ().end ();
         ++symnames_it) {

        std::map<std::string, wchar_t>::const_iterator w_cmap_it;
        if ((w_cmap_it = w_cmap.find(*symnames_it)) != w_cmap.end()) {
            wchar_t wchar_val = (*w_cmap_it).second;

            coll_map_iter coll_map_it;
            if ((coll_map_it = coll_map_.find(wchar_val)) != coll_map_.end()) {
                if (coll_map_it->second.offset == UINT_MAX) {
                    if (give_warning && !warning_issued) {
                        warning_issued = true;
                        warnings_occurred_ = 
                            issue_diag (W_MISSING, false,
                                        0, "some characters in the codeset "
                                        "were not explicitly given a "
                                        "collation value\n")
                            || warnings_occurred_;
                    }
                    if (!collate_out_.undefined_optimization) {
                        add_to_coll (wchar_val, weights_template, 
                                     coll_value++, ordinal_weights, true);
                    }
                }
            }
        }
    }
    if (collate_out_.undefined_optimization){
        collate_out_.undefined_weight_idx = next_offset_++;
        undef_char_info_.offset = collate_out_.undefined_weight_idx;
        undef_char_info_.coll_val = coll_value++;
        // initialize the weight template with the undefined collation info
        Weights_t* weights = new Weights_t[collate_out_.num_weights];

        if (0 != weights_template) {
            for (int k = 0; k < collate_out_.num_weights; ++k) {
                weights[k].size = weights_template[k].size; 
                for (int j = 0; j < 256; ++j)
                    weights[k].weight[j] = weights_template[k].weight[j];
            }
        }


        std::vector<bool>::const_iterator it;
                
        // now go through the vector that tells us if a particular 
        // weight is based on the ordinal position of the element
        // and place the correct ordinal value in the weight vector
        // if it is.
        int i = 0;
        for (it = ordinal_weights.begin();
             it != ordinal_weights.end(); ++it, ++i)
        {
            // FIXME: this may not be correct
            if (!*it){
                weights[i].size = 1;
                weights[i].weight[0] = undef_char_info_.coll_val;
            }
        }
        undef_char_info_.weights = weights;
        
        
    }
}

void Def::add_to_coll (const wchar_t            w_cmap_cur_val, 
                       const Weights_t*         weights_template,
                       const unsigned int       coll_value,
                       const std::vector<bool>& ordinal_weights,
                       bool                     undefined_value)
{
    Weights_t* weights = new Weights_t[collate_out_.num_weights];

    if (0 != weights_template) {
        for (int k = 0; k < collate_out_.num_weights; ++k) {
            weights[k].size = weights_template[k].size; 
            for (int j = 0; j < 256; ++j)
                weights[k].weight[j] = weights_template[k].weight[j];
        }
    }

    // get the symbolic name from the value
    std::string name 
        = charmap_.get_rw_cmap().find (w_cmap_cur_val)->second;
    coll_map_.find (w_cmap_cur_val)->second.coll_val = coll_value;

    unsigned int offset;
    if (undefined_value && collate_out_.undefined_optimization)
        offset = next_offset_;
    else
        offset = next_offset_++;

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    w_cmap_iter w_cmap_pos = w_cmap.find(name);
    coll_map_iter coll_it = coll_map_.find (w_cmap_pos->second);
    coll_it->second.offset = offset;

    off_mapr_.insert(std::make_pair(offset, name));

    std::vector<bool>::const_iterator it;
                
    // now go through the vector that tells us if a particular 
    // weight is based on the ordinal position of the element
    // and place the correct ordinal value in the weight vector
    // if it is.
    int i = 0;
    for (it = ordinal_weights.begin();
         it != ordinal_weights.end(); ++it, ++i)
    {
        if (*it){
            weights[i].size = 1;
            weights[i].weight[0] = coll_value;
        }
    }
    coll_it->second.weights = weights;
}

// if undefined optimization is on then only those characters that have 
// been defined should go into the valid set, otherwise all characters go
// into the set.
void Def::gen_valid_coll_mb_set ()
{
    if (!valid_coll_mb_set_.empty())
        return;
    
    for (coll_map_iter coll_it = coll_map_.begin(); 
         coll_it != coll_map_.end(); ++coll_it) {
        if (coll_it->second.offset != UINT_MAX 
            || !collate_out_.undefined_optimization){
            rmb_cmap_iter rn_cmap_it 
                = charmap_.get_rmb_cmap().find (coll_it->first);
            std::string valid = rn_cmap_it->second.substr 
                (0, rn_cmap_it->second.size() - 1);
            
            while (valid.size() > 0){
                valid_coll_mb_set_.insert (valid);
                valid = valid.substr(0, valid.size() - 1); 
            }
        }
    }
}


// In processing the collate section of the locale definition file
// we will filter out the characters that are not present in the 
// charmap file; these characters when encountered on the left-hand side
// of a weight definition for a symbolic name will be left in place
// and used as a mere collating symbol for the rest of the parsing - 
// i.e. its weights ignored and assigned the weight corresponding to
// its position in the collate section.
void Def::process_collate()
{
    issue_diag (I_STAGE, false, 0, "processing LC_COLLATE section\n");

    // update flags
    collate_def_found_ = true;
    // initialization
    init_coll_map ();

    // first preprocess the collate section
    preprocess_collate ();

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    // process the collating symbols list
    token_list_t::iterator cs_it = cs_list_.begin ();
    for (;cs_it != cs_list_.end (); ++cs_it) {
        if (w_cmap.find (cs_it->name) != 
            w_cmap.end())
            issue_diag (E_SYNTAX, true, &*cs_it,
                        "collating-symbol %s found in character map\n",
                        cs_it->name.c_str ());
        else if (ce_map_.find (cs_it->name) != ce_map_.end())
            issue_diag (E_SYNTAX, true, &*cs_it,
                        "collating-symbol %s has already been defined as "
                        "a collating-element\n", cs_it->name.c_str ());

        cs_map_.insert (std::make_pair (cs_it->name, 0));
    }

    // process the collating elements list
    std::list<collate_elem_t>::iterator ce_it = 
        ce_list_.begin ();
    for (; ce_it != ce_list_.end (); ++ce_it) {
        if (w_cmap.find (ce_it->first.name) != 
            w_cmap.end()) {
            issue_diag (E_SYNTAX, true, &ce_it->first,
                        "collating element %s found in charmap\n", 
                        ce_it->first.name.c_str());
        }

        // collating element info to be stored
        ce_info_t ce_tmp;
        ce_tmp.offset   = UINT_MAX;
        ce_tmp.coll_val = UINT_MAX;
        ce_tmp.order    = 0;
        ce_tmp.weights  = 0;

        // array of symbolic names
        std::vector<std::string> ce_sym_array;

        // get the definition of this collating element
        token_list_t& ce_def_list = ce_it->second;
        token_list_t::iterator ce_def_it = ce_def_list.begin ();
        bool invalid = false;

        if (ce_def_it->token == Scanner::tok_string) {
            if ((ce_tmp.ce_wstr = convert_wstring (*ce_def_it)).empty())
                invalid = true;
        } else if (ce_def_it->token == Scanner::tok_sym_name) {
            for (; ce_def_it != ce_def_list.end (); ++ce_def_it) 
                ce_sym_array.push_back (ce_def_it->name);
            if ((ce_tmp.ce_wstr = convert_wstring (ce_sym_array)).empty())
                invalid = true;
        } else {
            issue_diag (E_SYNTAX, true, &*ce_def_it, 
                        "illegal collating-element expression: %s\n",
                        ce_def_it->name.c_str ());
        }
        if (invalid)
            issue_diag (W_SYNTAX, false, &*ce_def_it,
                        "character in collating element definition "
                        "not found in character map\n");

        if (!invalid) {
            ce_map_.insert (std::make_pair(ce_it->first.name, ce_tmp));

            std::string ce_str;
            std::string ce_utf8_wstr;
            for (unsigned int i = 0; i < ce_tmp.ce_wstr.size(); ++i){
                ce_str += convert_to_ext(ce_tmp.ce_wstr[i]);
                ce_utf8_wstr += utf8_encode (ce_tmp.ce_wstr[i]); 
            }
            ce_sym_map_.insert (std::make_pair(ce_str, ce_it->first.name));
            ce_wsym_map_.insert (std::make_pair (ce_utf8_wstr, 
                                                 ce_it->first.name));
        }
    }

    // preprocess the collation definitions and make known all
    // the symbolic names available
    preprocess_collation_definitions();

    // collation values 0 and 1 are reserved
    unsigned int coll_value = 2;

    // FIXME: assign values to the collating-elements AGAIN
    cs_it = sym_list_.begin ();
    for (; cs_it != sym_list_.end (); ++cs_it) {
        cs_map_iter csm_it = cs_map_.find (cs_it->name);
        if (csm_it != cs_map_.end ())
            csm_it->second = coll_value++;
    }

    // process the sections
    std::list<collate_section_t>::iterator sect_it = 
        section_list_.begin ();
    for (; sect_it != section_list_.end (); ++sect_it)
        process_order (*sect_it, coll_value);

    // check to make sure that all the elements in the codeset were added
    if (!undefined_keyword_found_) {
        std::vector<bool> ordinal_weights;
        for (int i = 0; i < collate_out_.num_weights; ++i) 
            ordinal_weights.push_back (false);
        collate_out_.undefined_optimization = true;
        add_missing_values (ordinal_weights, 0, coll_value, true);
    }

}


void Def::gen_n_to_w_coll_tables (const std::string &charp, 
                                  unsigned int tab_num)
{
    gen_valid_coll_mb_set();
    offset_tab_t tab;
    for (unsigned int k = 0; k <= UCHAR_MAX; ++k) {
        tab.off[k] = UINT_MAX;
    }

    tab.first_offset = -1;
    mb_cmap_iter n_cmap_it;

    for (unsigned int i = 0; i <= UCHAR_MAX; ++i){
        unsigned char cur_char = (unsigned char)i;
        std::string mb_char = charp;
        mb_char += (char)cur_char;
        if ((n_cmap_it = charmap_.get_mb_cmap().find (mb_char)) 
            != charmap_.get_mb_cmap().end()) {
            coll_map_iter coll_map_it = coll_map_.find (n_cmap_it->second);
            if (coll_map_it->second.offset != UINT_MAX 
                || !collate_out_.undefined_optimization) {
                tab.off[cur_char] = coll_map_it->second.offset;
                if (tab.first_offset == -1)
                    tab.first_offset = cur_char;
            }
        }
        else {
            if (valid_coll_mb_set_.find (mb_char) != valid_coll_mb_set_.end()){
                ++next_tab_num_;
                tab.off[cur_char] = next_tab_num_ | 0x80000000;
                gen_n_to_w_coll_tables (mb_char, next_tab_num_);             
                if (tab.first_offset == -1)
                    tab.first_offset = cur_char;

            }
            else {
                tab.off[cur_char] = UINT_MAX;
            }
        }
    }
    char_offs_.insert (std::make_pair (tab_num, tab));
}


void Def::gen_w_to_n_coll_tables (const std::string &charp, 
                                  unsigned int tab_num)
{
    // initialize a table used in ctype and collate
    create_wchar_utf8_table();
    gen_valid_coll_wchar_set();

    offset_tab_t tab;
    for (unsigned int k = 0; k <= UCHAR_MAX; ++k) 
        tab.off[k] = UINT_MAX;

    tab.first_offset = -1;
    wchar_utf8_iter wu_it;
    mb_cmap_iter n_cmap_it;

    for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {
        unsigned char cur_char = (unsigned char)i;
        std::string   mb_char = (charp);

        mb_char += (char)cur_char;

        wu_it = wchar_utf8_to_ext_.find (mb_char);
        if (wu_it != wchar_utf8_to_ext_.end()) {
            n_cmap_it = charmap_.get_mb_cmap().find (wu_it->second);
            if (n_cmap_it != charmap_.get_mb_cmap().end ()) {
                
                coll_map_iter coll_map_it = coll_map_.find (n_cmap_it->second);
                if (coll_map_it->second.offset != UINT_MAX || 
                    !collate_out_.undefined_optimization) {            
                    tab.off[cur_char] = coll_map_it->second.offset;
                    if (tab.first_offset == -1)
                        tab.first_offset = cur_char;
                }
            }
        }
        else 
            if (valid_coll_wchar_set_.find (mb_char) != 
                valid_coll_wchar_set_.end()) {

                ++next_wchar_coll_tab_num_;
                tab.off[cur_char] = next_wchar_coll_tab_num_ | 0x80000000;
                gen_w_to_n_coll_tables (mb_char, 
                                        next_wchar_coll_tab_num_);
                if (tab.first_offset == -1)
                    tab.first_offset = cur_char;
            } 
            else
                tab.off[cur_char] = UINT_MAX;
    }
    
    w_to_n_coll_.insert (std::make_pair (tab_num, tab));
}


void Def::gen_n_ce_tables (const std::set<std::string>ce_elms,
                           unsigned int idx, unsigned int tab_num)
{
    if (ce_elms.size() > 0) {
        ce_offset_tab_t tab;
        for (unsigned int k = 0; k <= UCHAR_MAX; ++k) {
            tab.off[k] = UINT_MAX;
        }
        tab.first_offset = -1;
        tab.last_offset = 0;
        ce_sym_map_iter ce_sym_map_it;
        ce_map_iter ce_map_it;

        std::set<std::string>::const_iterator ce_elms_it;
        std::set<std::string> next_elms;

        for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {
            next_elms.clear();
            for (ce_elms_it = ce_elms.begin(); ce_elms_it != ce_elms.end(); 
                 ce_elms_it ++) {

                if ((unsigned char)i == (unsigned char)((*ce_elms_it)[idx])) {
                    if ((*ce_elms_it).size() == idx + 1) {
                        if ((ce_sym_map_it = ce_sym_map_.find (*ce_elms_it))
                            != ce_sym_map_.end()) {
                            ce_map_it = ce_map_.find(ce_sym_map_it->second);
                            tab.off[i] = ce_map_it->second.offset;
                            if (tab.first_offset == -1)
                                tab.first_offset = i;
                            if ((unsigned int)tab.last_offset < i)
                                tab.last_offset = i;
                        }
                    }
                    else {
                    
                        next_elms.insert (*ce_elms_it);
                        if (tab.off[i] == UINT_MAX) {
                            ++ next_n_ce_tab_num_;
                            tab.off[i] = next_n_ce_tab_num_ | 0x80000000;
                        }
                        if (tab.first_offset == -1)
                            tab.first_offset = i;
                        if ((unsigned int)tab.last_offset < i)
                            tab.last_offset = i;
                    }
                }
            }
            // now recursively call gen_n_ce_tables if any collating
            // elements with this character were found;
            if (next_elms.size() != 0)
                gen_n_ce_tables (next_elms, idx + 1, next_n_ce_tab_num_);
        }
        n_ce_offs_.insert(std::make_pair(tab_num, tab));

    }
}


void Def::gen_w_ce_tables (const std::set<std::string>ce_elms,
                           unsigned int idx, unsigned int tab_num)
{
    if (ce_elms.size() > 0) {
        ce_offset_tab_t tab;
        for (unsigned int k = 0; k <= UCHAR_MAX; ++k) {
            tab.off[k] = UINT_MAX;
        }
        tab.first_offset = -1;
        tab.last_offset = 0;
        ce_sym_map_iter ce_sym_map_it;
        ce_map_iter ce_map_it;

        std::set<std::string>::const_iterator ce_elms_it;
        std::set<std::string> next_elms;

        for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {
            next_elms.clear();
            for (ce_elms_it = ce_elms.begin(); ce_elms_it != ce_elms.end(); 
                 ce_elms_it ++) {

                if ((unsigned char)i == (unsigned char)(*ce_elms_it)[idx]) {
                    if ((*ce_elms_it).size() == idx + 1) {
                        if ((ce_sym_map_it = ce_wsym_map_.find (*ce_elms_it))
                            != ce_wsym_map_.end()) {
                            ce_map_it = ce_map_.find(ce_sym_map_it->second);
                            tab.off[i] = ce_map_it->second.offset;
                            if (tab.first_offset == -1)
                                tab.first_offset = i;
                            if ((unsigned int)tab.last_offset < i)
                                tab.last_offset = i;
                        }
                    }
                    else {
                        next_elms.insert (*ce_elms_it);
                        if (tab.off[i] == UINT_MAX) {
                            ++ next_w_ce_tab_num_;
                            tab.off[i] = next_w_ce_tab_num_ | 0x80000000;
                        }
                        if (tab.first_offset == -1)
                            tab.first_offset = i;
                        if ((unsigned int)tab.last_offset < i)
                            tab.last_offset = i;
                    }
                }
            }
            // now recursively call gen_w_ce_tables if any collating
            // elements with this character were found;
            if (next_elms.size() != 0)
                gen_w_ce_tables (next_elms, idx + 1, next_w_ce_tab_num_);
        }
        w_ce_offs_.insert(std::make_pair(tab_num, tab));
    }
}


void Def::dump_collate ()
{
    std::cout << "LC_COLLATE\n";

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    token_list_t::iterator sl_it = sym_list_.begin ();
    for (; sl_it != sym_list_.end (); ++sl_it) {
        std::cout << sl_it->name << " % "; 
        cs_map_iter cs_it = cs_map_.find (sl_it->name);
        if (cs_it != cs_map_.end ()) {
            std::cout << cs_it->second << '\n';
            continue;
        }

        ce_map_iter ce_it = ce_map_.find (sl_it->name);
        if (ce_it != ce_map_.end ()) {
            std::cout << ce_it->second.coll_val << '\n';
            continue;
        }
        
        std::map<std::string, wchar_t >::const_iterator cw_it = 
            w_cmap.find (sl_it->name);
        if (cw_it != w_cmap.end ()) {
            coll_map_iter cm_it = 
                coll_map_.find (cw_it->second);
            if (cm_it != coll_map_.end ()) {
                std::cout << cm_it->second.coll_val << '\n';
                continue;
            }
        }
        std::cout << '\n';
    }

    std::cout << "\n\n";

    std::list<collate_section_t>::iterator sect_it = 
        section_list_.begin ();
    while (sect_it != section_list_.end ()) {

        // dump the orders
        std::cout << "order_start " << sect_it->name;
        token_list_t::iterator o_it = sect_it->order.begin ();
        for (; o_it != sect_it->order.end (); ++o_it)
            std::cout << ";" << o_it->name;
        std::cout << '\n';

        // for each entry in the entries list
        collate_entry_list_t::iterator e_it = sect_it->entries.begin ();
        for (; e_it != sect_it->entries.end (); ++e_it) {

            // dump the collation definition (weights included)
            std::cout << e_it->first.name << " ";
            if (!e_it->second.empty ()) {
                token_list_t::iterator w_it = e_it->second.begin ();
                std::cout << (w_it++)->name;
                for (; w_it != e_it->second.end (); ++w_it) 
                    std::cout << ";" << w_it->name;
            }

            // lookup the value associated with this collation symbol
            cs_map_iter cs_it = cs_map_.find (e_it->first.name);
            if (cs_it != cs_map_.end ()) {
                std::cout << " % " << cs_it->second << '\n';
                continue;
            }

            ce_map_iter ce_it = ce_map_.find (e_it->first.name);
            if (ce_it != ce_map_.end ()) {
                std::cout << " % " << ce_it->second.coll_val << '\n';
                continue;
            }
        
            std::map<std::string, wchar_t >::const_iterator cw_it = 
                w_cmap.find (e_it->first.name);
            if (cw_it != w_cmap.end ()) {
                coll_map_iter cm_it = 
                    coll_map_.find (cw_it->second);
                if (cm_it != coll_map_.end ()) {
                    std::cout << " % " << cm_it->second.coll_val << '\n';
                    continue;
                }

            }
            std::cout << '\n';
        }

        std::cout << "order_end\n";
        ++sect_it;
    }
    std::cout << "END LC_COLLATE\n";
}


void Def::write_collate (std::string dir_name)
{
    assert (!dir_name.empty());

    static const char lc_name[] = "LC_COLLATE";

    if (collate_written_)
        return;

    if (!collate_def_found_) {
        issue_diag (I_SKIP, false, 0,
                    "%s section not found, skipping\n", lc_name);
        return;
    }

    next_tab_num_ = 0;
    next_wchar_coll_tab_num_ = 0;
    std::set<std::string> ce_strs;
    ce_sym_map_iter it;
    ce_map_iter ce_mit;
    for (it = ce_sym_map_.begin (); it != ce_sym_map_.end (); ++it) {
        if ((ce_mit = ce_map_.find (it->second))->second.coll_val 
            != UINT_MAX) {
            // check to see of the largest_ce needs to be changed
            if (ce_mit->second.ce_wstr.size() + 1 
                > collate_out_.largest_ce)
                collate_out_.largest_ce = ce_mit->second.ce_wstr.size();
            
            ce_strs.insert (it->first);
        }
    }
    next_n_ce_tab_num_ = 0;
    gen_n_ce_tables(ce_strs, 0, 0);
    ce_strs.clear();
    for (it = ce_wsym_map_.begin (); it != ce_wsym_map_.end (); ++it) {
        if (ce_map_.find (it->second)->second.coll_val != UINT_MAX)
            ce_strs.insert (it->first);
    }
    next_w_ce_tab_num_ = 0;
    gen_w_ce_tables (ce_strs, 0, 0);

    gen_n_to_w_coll_tables ("", next_tab_num_);
    gen_w_to_n_coll_tables ("", next_wchar_coll_tab_num_);

    (dir_name += _RWSTD_PATH_SEP) += lc_name;

    issue_diag (I_OPENWR, false, 0, "writing %s\n", dir_name.c_str ());

    std::ofstream out (dir_name.c_str(), std::ios::binary);
    out.exceptions (std::ios::failbit | std::ios::badbit);

    unsigned int i;

    // calculate the size of an individual weight element
    collate_out_.elm_size = collate_out_.num_weights 
        * collate_out_.longest_weight * sizeof (unsigned int) + 
        sizeof (unsigned int);

    // the first section of the collate database is the collating
    // element information
    collate_out_.n_ce_tab_off = 0;
    collate_out_.w_ce_tab_off = collate_out_.n_ce_tab_off;
    n_ce_offs_iter n_ce_offs_it;
    for (n_ce_offs_it = n_ce_offs_.begin(); 
         n_ce_offs_it != n_ce_offs_.end (); ++n_ce_offs_it) {
        collate_out_.w_ce_tab_off += (n_ce_offs_it->second.last_offset 
                                      - n_ce_offs_it->second.first_offset + 1)* sizeof (int);
    }

    // next comes the weight information
    collate_out_.weight_tab_off = collate_out_.w_ce_tab_off;
    w_ce_offs_iter w_ce_offs_it;
    for (w_ce_offs_it = w_ce_offs_.begin(); 
         w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
        collate_out_.weight_tab_off += (w_ce_offs_it->second.last_offset 
                                        - w_ce_offs_it->second.first_offset + 1)* sizeof (int);
    }

    coll_map_iter coll_map_pos;

    collate_out_.num_elms = off_mapr_.size();
    if (collate_out_.undefined_optimization)
        ++collate_out_.num_elms;

    // now calculate the offset for the first narrow character table
    collate_out_.n_char_tab_off = collate_out_.weight_tab_off
        + collate_out_.num_elms * collate_out_.elm_size;

    // now calculate the offset fo the fist wide character table
    // but first we need the size of the narrow tables
    char_offs_iter char_offs_it;
    unsigned int char_offs_size = 0;
    for (char_offs_it = char_offs_.begin(); 
         char_offs_it != char_offs_.end(); ++char_offs_it) {
        char_offs_size += (UCHAR_MAX + 1 
                           - char_offs_it->second.first_offset)
            * sizeof (unsigned int);
    }
        
    collate_out_.w_char_tab_off = collate_out_.n_char_tab_off
        + char_offs_size;

    // now calculate the offset for the narrow character offset table
    // but first we need the size of the wide tables
    w_to_n_coll_iter w_to_n_coll_it;
    unsigned int w_to_n_size = 0;
    for (w_to_n_coll_it = w_to_n_coll_.begin();
         w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) {
        w_to_n_size += (UCHAR_MAX + 1 
                        - w_to_n_coll_it->second.first_offset)
            * sizeof (unsigned int);
    }

    collate_out_.n_char_off_tab_off = collate_out_.w_char_tab_off
        + w_to_n_size;

    // now calculate the offset for the wide character offset table
    collate_out_.w_char_off_tab_off = collate_out_.n_char_off_tab_off
        + char_offs_.size() * sizeof (unsigned int);

    // calculate the offset for the narrow collating element offset table
    collate_out_.n_ce_off_tab_off = collate_out_.w_char_off_tab_off
        + w_to_n_coll_.size() * sizeof (unsigned int);

    // calculate the offset for the wide collating element offset table
    collate_out_.w_ce_off_tab_off = collate_out_.n_ce_off_tab_off
        + n_ce_offs_.size() * sizeof (unsigned int);

    // now calculate the offset of the first character information
    collate_out_.n_char_first_char_off = collate_out_.w_ce_off_tab_off
        + w_ce_offs_.size() * sizeof (unsigned int);
        
    // now calculate the offset of the wide table first char info
    collate_out_.w_char_first_char_off = collate_out_.n_char_first_char_off
        + char_offs_.size() * sizeof (unsigned char);
        
    // now calculate the offset of the narrow ce first character info
    collate_out_.n_ce_first_char_off = collate_out_.w_char_first_char_off
        + w_to_n_coll_.size() * sizeof (unsigned char);

    // now calculate the offset of the wide ce first character info
    collate_out_.w_ce_first_char_off = collate_out_.n_ce_first_char_off
        + n_ce_offs_.size() * sizeof (unsigned char);

    // now calculate the offset of the narrow ce last character info
    collate_out_.n_ce_last_char_off = collate_out_.w_ce_first_char_off
        + w_ce_offs_.size() * sizeof (unsigned char);

    // now calculate the offset of the wide ce last character info
    collate_out_.w_ce_last_char_off = collate_out_.n_ce_last_char_off
        + n_ce_offs_.size() * sizeof (unsigned char);

    // now calculate the offset of the codeset name
    collate_out_.codeset_off = collate_out_.w_ce_last_char_off
        + w_ce_offs_.size() * sizeof (unsigned char);

    // finally calculate the offset of the charmap name
    collate_out_.charmap_off = collate_out_.codeset_off 
        + charmap_.get_code_set_name().size() + 1;
        


    // print out the collate struct
    out.write ((char*)&collate_out_, sizeof(collate_out_));

    for (n_ce_offs_it = n_ce_offs_.begin(); 
         n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) {
        for (i = (unsigned int)n_ce_offs_it->second.first_offset; 
             i <= (unsigned int)n_ce_offs_it->second.last_offset; ++i)
            out.write ((char*)&n_ce_offs_it->second.off[i], 
                       sizeof (n_ce_offs_it->second.off[i]));
    }

    for (w_ce_offs_it = w_ce_offs_.begin(); 
         w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
        for (i = (unsigned int)w_ce_offs_it->second.first_offset; 
             i <= (unsigned int)w_ce_offs_it->second.last_offset; ++i)
            out.write ((char*)&w_ce_offs_it->second.off[i], 
                       sizeof (w_ce_offs_it->second.off[i]));
    }

        
    // now print out the weight array
    unsigned int maxw = UINT_MAX;
    Weights_t* weights;
    bool undefined_written = false;

    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    std::size_t off_idx = 0;
    off_mapr_iter current_off = off_mapr_.begin();
    for (; current_off != off_mapr_.end(); ++off_idx) {

        // check the current offset
        if (current_off->first != off_idx 
            && off_idx != undef_char_info_.offset)
            issue_diag (E_RANGE, true, 0, 
                        "current_off : %d, index : %d, giving up\n",
                        current_off->first, off_idx);

        if (collate_out_.undefined_optimization 
            && off_idx == undef_char_info_.offset) {
            undefined_written = true;
            weights = undef_char_info_.weights;
            out.write ((char*)&undef_char_info_.order, 
                       sizeof (undef_char_info_.order));
            for (int j = 0; j < collate_out_.num_weights; ++j) {
                for (int k = 0; k < weights[j].size; ++k) {
                    out.write ((char*)&weights[j].weight[k],
                               sizeof (weights[j].weight[k]));
                }
                for (int c = weights[j].size; 
                     c < collate_out_.longest_weight; ++c)
                    out.write ((char*)&maxw, sizeof (maxw)); 
            }
            continue;
        }

        w_cmap_iter w_cmap_pos = w_cmap.find 
            (current_off->second);
        if (w_cmap_pos != w_cmap.end()) {
            coll_map_pos = coll_map_.find (w_cmap_pos->second);
            out.write ((char*)&coll_map_pos->second.order, 
                       sizeof (coll_map_pos->second.order));
            weights = coll_map_pos->second.weights;
            for (int j = 0; j < collate_out_.num_weights; ++j) {
                for (int k = 0; k < weights[j].size; ++k) {
                    out.write ((char*)&weights[j].weight[k],
                               sizeof (weights[j].weight[k]));
                }
                for (int c = weights[j].size; 
                     c < collate_out_.longest_weight; ++c)
                    out.write ((char*)&maxw, sizeof (maxw)); 
            }
            ++current_off;
        }

        else {
            ce_map_iter ce_map_it = ce_map_.find (current_off->second);
            if (ce_map_it != ce_map_.end() 
                && ce_map_it->second.coll_val != UINT_MAX) {
                out.write ((char*)&ce_map_it->second.order, 
                           sizeof (ce_map_it->second.order));
                weights = ce_map_it->second.weights;
                for (int j = 0; j < collate_out_.num_weights; ++j) {
                    for (int k = 0; k < weights[j].size; ++k) {
                        out.write ((char*)&weights[j].weight[k],
                                   sizeof (weights[j].weight[k]));
                    }
                    for (int c = weights[j].size; 
                         c < collate_out_.longest_weight; ++c)
                        out.write ((char*)&maxw, sizeof (maxw)); 
                }

            }
            ++current_off;
        }
    }            

    // maske sure that we got to the undefined value
    if (!undefined_written && collate_out_.undefined_optimization) {
        if (off_idx != undef_char_info_.offset)
            issue_diag (E_RANGE, true, 0, 
                        "current_off : %d, index : %d, giving up\n",
                        undef_char_info_.offset, off_idx);

        weights = undef_char_info_.weights;
        out.write ((char*)&undef_char_info_.order, 
                   sizeof (undef_char_info_.order));
        for (int j = 0; j < collate_out_.num_weights; ++j) {
            for (int k = 0; k < weights[j].size; ++k) {
                out.write ((char*)&weights[j].weight[k],
                           sizeof (weights[j].weight[k]));
            }
            for (int c = weights[j].size; 
                 c < collate_out_.longest_weight; ++c)
                out.write ((char*)&maxw, sizeof (maxw)); 
        }
    }

    // now print out the narrow character tables
    for (char_offs_it = char_offs_.begin(); 
         char_offs_it != char_offs_.end(); ++char_offs_it) {
        for (unsigned int c = char_offs_it->second.first_offset; 
             c <= UCHAR_MAX; ++c) {
            out.write ((const char*)&char_offs_it->second.off[c],
                       sizeof (char_offs_it->second.off[c]));
        }
    }

    // now print out the wide character tables
    for (w_to_n_coll_it = w_to_n_coll_.begin(); 
         w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) {
        for (unsigned int c = w_to_n_coll_it->second.first_offset; 
             c <= UCHAR_MAX; ++c) {
            out.write ((const char*)&w_to_n_coll_it->second.off[c],
                       sizeof (w_to_n_coll_it->second.off[c]));
        }
    }
        
    // now print the narrow character table offsets
    unsigned int current_offset = 0;
    for (char_offs_it = char_offs_.begin();
         char_offs_it != char_offs_.end(); ++char_offs_it) {
        out.write ((const char*)&current_offset, sizeof (current_offset));
        current_offset += (UCHAR_MAX + 1 
                           - char_offs_it->second.first_offset);
    }

    // now print the wide character table offsets
    current_offset = 0;
    for (w_to_n_coll_it = w_to_n_coll_.begin();
         w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) {
        out.write ((const char*)&current_offset, sizeof (current_offset));
        current_offset += (UCHAR_MAX + 1 
                           - w_to_n_coll_it->second.first_offset);
    }

    // now print the narrow ce table offsets
    current_offset = 0;
    for (n_ce_offs_it = n_ce_offs_.begin();
         n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) {
        out.write ((const char*)&current_offset, sizeof (current_offset));
        current_offset += (n_ce_offs_it->second.last_offset 
                           - n_ce_offs_it->second.first_offset + 1);
    }

    // now print the wide ce table offsets
    current_offset = 0;
    for (w_ce_offs_it = w_ce_offs_.begin();
         w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
        out.write ((const char*)&current_offset, sizeof (current_offset));
        current_offset += (w_ce_offs_it->second.last_offset 
                           - w_ce_offs_it->second.first_offset + 1);
    }


    // now print out the narrow character tables starting character
    for (char_offs_it = char_offs_.begin();
         char_offs_it != char_offs_.end(); ++char_offs_it) {
        const char off = char ((char_offs_it->second).first_offset);
        out << off;
    }        
        
    // now print out the wide character tables starting character
    for (w_to_n_coll_it = w_to_n_coll_.begin();
         w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) {
        const char off = char ((w_to_n_coll_it->second).first_offset);
        out << off;
    }        
        
    // now print out the narrow ce tables starting character
    for (n_ce_offs_it = n_ce_offs_.begin();
         n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) {
        const char off = char ((n_ce_offs_it->second).first_offset);
        out << off;
    }        

    // now print out the wide ce tables starting character
    for (w_ce_offs_it = w_ce_offs_.begin();
         w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
        const char off = char ((w_ce_offs_it->second).first_offset);
        out << off;
    }        

    // now print out the narrow ce tables ending character
    for (n_ce_offs_it = n_ce_offs_.begin();
         n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) {
        const char off = char ((n_ce_offs_it->second).last_offset);
        out << off;
    }        

    // now print out the wide ce tables ending character
    for (w_ce_offs_it = w_ce_offs_.begin();
         w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) {
        const char off = char ((w_ce_offs_it->second).last_offset);
        out << off;
    }        
        
    // finally write the codeset and charmap names
    out << charmap_.get_code_set_name() << std::ends
        << charmap_.get_charmap_name() << std::ends;
}


void Def::init_coll_map() {

    rw_cmap_iter rw_cmap_pos;
    collate_info_t tmp = {UINT_MAX, UINT_MAX, 0, 0};

    for (rw_cmap_pos = charmap_.get_rw_cmap().begin(); 
         rw_cmap_pos != charmap_.get_rw_cmap().end();
         ++rw_cmap_pos) {
        coll_map_.insert (std::make_pair (rw_cmap_pos->first, tmp));
    }
}


// In the preprocess_collate member function the collate section is 
// preprocessed and the result of the preprocessing is stored in-memory
// as a number of lists; the content of these lists is then preprocessed 
// acoording to the reorder statements; the result is then passed
// on to process_collate.
void Def::preprocess_collate ()
{
    int nesting_level = 0;

    while (true) {
        // fetch next token
        next = scanner_.next_token();
        
        switch (next.token) {
        case Scanner::tok_end:
            next = scanner_.next_token();
            if (next.token == Scanner::tok_collate) {
                // end of collation block
                if (nesting_level == 0) 
                    return;

                nesting_level--;
                scanner_.close ();
            } else
                issue_diag (E_SYNTAX, true, &next,
                            "wrong section name in END directive\n");

            break;
        case Scanner::tok_copy: {
            next = scanner_.next_token();
            if (next.token != Scanner::tok_string)
                issue_diag (E_SYNTAX, true, &next,
                            "expected string following \"copy\" directive\n");

            // bump up the nesting level
            ++nesting_level;

            issue_diag (I_STAGE, false, 0, "processing copy directive\n");

            // open the file
            scanner_.open (get_pathname (strip_quotes (next.name), next.file));

            // get comment char and escape char; 
            // these informations are stored by the scanner
            while ((next = scanner_.next_token ()).token 
                   != Scanner::tok_collate ){
                // the LC_IDENTIFICATION section may also have a 
                // LC_COLLATE token that will mess up the parsing
                if (next.token == Scanner::tok_ident) {
                    while ((next = scanner_.next_token()).token
                           != Scanner::tok_end );
                    next = scanner_.next_token();
                }
            }
            
            break;
        }

        // a collating element definition defines a symbolic name that
        // represents symbolically the congregation of two other symbolic
        // names which have to be present in the character map;
        // the form of the phrase is:
        // collating-element sym from ("string" | (sym sym+))
        case Scanner::tok_coll_elem: {
            next = scanner_.next_token();
            // we expect a symbolic name
            if (next.token != Scanner::tok_sym_name) 
                issue_diag (E_SYNTAX, true, &next,
                            "symbolic name expected following "
                            "collating-element\n");
            // symbolic name
            token_t sym (next);

            next = scanner_.next_token ();
            if (next.token != Scanner::tok_from)
                issue_diag (E_SYNTAX, true, &next,
                            "illegal collating-element expression\n");

            token_list_t ce_def_list;
            next = scanner_.next_token ();
            if (next.token == Scanner::tok_string) {
                ce_def_list.push_back (next);
                ce_list_.push_back (std::make_pair(sym,ce_def_list));
            } else if (next.token == Scanner::tok_sym_name) {
                do {
                    ce_def_list.push_back (next);
                    next = scanner_.next_token();
                } while (next.token != Scanner::tok_nl);
                ce_list_.push_back (std::make_pair(sym,ce_def_list));
            } else 
                issue_diag (E_SYNTAX, true, &next,
                            "illegal collating-element expression\n");
            break;
        }
        case Scanner::tok_coll_sym:
            next = scanner_.next_token();
            if (next.token != Scanner::tok_sym_name)
                issue_diag (E_SYNTAX, true, &next,
                            "symbolic name expected following "
                            "collating-symbol\n");
            cs_list_.push_back (next);
            break;
        case Scanner::tok_script:
            next = scanner_.next_token ();
            if (next.token != Scanner::tok_sym_name)
                issue_diag (W_SYNTAX, false, &next,
                            "expecting script name, got %s\n", 
                            next.name.c_str ());
            script_list_.push_back(next);
            break;
        case Scanner::tok_sym_name:
            sym_list_.push_back (next);
            break;
        case Scanner::tok_order_start:
            preprocess_order();
            break;

        case Scanner::tok_reorder:
            preprocess_reorder ();
            break;

        case Scanner::tok_reorder_section:
            preprocess_reorder_section ();
            break;

        default:
            break;
        }
    }

}

// The task of preprocess_order is to parse and model the content of the 
// order sections in the input files
void Def::preprocess_order ()
{
    // one order-start keyword has been encountered; 
    // push a collate_section down the list and use it
    while (next.token != Scanner::tok_order_end) {
        section_list_.push_back (collate_section_t());
        collate_section_t& section = section_list_.back ();

        next = scanner_.next_token ();
        
        // store the orders
        while (next.token != Scanner::tok_nl) {
            if (next.token == Scanner::tok_sym_name) {
                // symbolic name, has to be a section name
                section.name = next.name;
            } else {
                // might be a combination of order and position
                // do the same until a better way is found
                section.order.push_back (next);
            }

            // fetch next token
            next = scanner_.next_token ();
        }

        // unnamed sections
        if (section.name == "")
            section.name = "unnamed";

        issue_diag (I_STAGE, false, 0,
                    "processing %s order\n", section.name.c_str ());

        // store the collation statements
        while (!(next.token == Scanner::tok_order_end || 
                 next.token == Scanner::tok_order_start )) {

            next = scanner_.next_token ();
            switch (next.token) {
            case Scanner::tok_end:
                issue_diag (E_SYNTAX, true, &next,
                            "unexpected END directive while "
                            "parsing collate section\n");
            case  Scanner::tok_nl:
                continue;
            case Scanner::tok_comment:
                scanner_.ignore_line ();
                break;
            case Scanner::tok_undefined:
            case Scanner::tok_sym_name: 
            {
                token_t sym (next);
                section.entries.push_back (
                    std::make_pair(sym,token_list_t()));
                collate_entry_t& entry = section.entries.back ();
                
                next = scanner_.next_token ();
                while (next.token != Scanner::tok_nl) {
                    entry.second.push_back (next);
                    next = scanner_.next_token ();
                }
                break;
            }
                        default:
                                break;
            }
        }
    }
}


// Handles the reorder statements in the form:
//    reorder-after <sym>
//    <sym> <sym><sym>...
// OR
//    reorder-after <sym>
//    ..  <sym><sym>.....
// followed by reorder-end statement or another reorder-after
void Def::preprocess_reorder ()
{
    while (true) {
        // process one or more statements grouped 
        // under a reorder section
        
        // retrieve the symbol
        next = scanner_.next_token ();
        if (next.token != Scanner::tok_sym_name)
            issue_diag (E_SYNTAX, true, &next,
                       "expecting symbolic name, got %s\n",
                       next.name.c_str ());
            
        // store the symbolic name
        token_t sym (next);
        collate_entry_list_t entries_list;
        while (true) {

            // process the statements
            next = scanner_.next_token ();

            if (next.token == Scanner::tok_nl )
                continue;
            else if (   next.token == Scanner::tok_sym_name
                     || next.token == Scanner::tok_hex_ellipsis) {
                // the line will contain one single symbolic name 
                // or a complete collation statement
                collate_entry_t entry;
                entry.first = next;

                next = scanner_.next_token ();
                while (next.token != Scanner::tok_nl) { 
                    entry.second.push_back (next);
                    next = scanner_.next_token ();
                }

                // add the entry to the list
                entries_list.push_back (entry);
            } else if (next.token == Scanner::tok_reorder) {
                // call insert_entry
                if (!entries_list.empty ())
                    insert_entries (sym, entries_list);
                entries_list.clear ();
                break;
            } else if (next.token == Scanner::tok_reorder_end ) {
                // call insert_entry
                // call insert_entry
                if (!entries_list.empty ())
                    insert_entries (sym, entries_list);
                return;
            } else {
                issue_diag (E_SYNTAX, true, &next,
                            "unexpected token : %s in a reorder block\n",
                            next.name.c_str ());
            }
        }
    }
}


void Def::preprocess_reorder_section ()
{
    while (next.token != Scanner::tok_reorder_section_end) {
        // process one or more statements grouped 
        // under a reorder section
        
        // retrieve the symbol
        next = scanner_.next_token ();
        token_t sym (next);

        // process the statements
        while (!(next.token == Scanner::tok_reorder_section || 
                 next.token == Scanner::tok_reorder_section_end)) {

            next = scanner_.next_token ();
            switch (next.token) {
            case Scanner::tok_nl:
                continue;
            case Scanner::tok_sym_name: {
                // the line will contain one single symbolic name 
                // which is the name of a section
                next = scanner_.next_token ();
                if (next.token != Scanner::tok_sym_name) {
                    issue_diag (E_SYNTAX, true, &next,
                                "expecting section name, got %s\n",
                                next.name.c_str ());
                }

                std::list<collate_section_t>::iterator beg = 
                    section_list_.begin ();
                std::list<collate_section_t>::iterator end = 
                    section_list_.end ();
                std::list<collate_section_t>::iterator ref_it, mov_it;
                for (ref_it = beg; 
                     ref_it->name != sym.name && ref_it != end; 
                     ++ref_it);
                for (mov_it = beg; 
                     mov_it->name != next.name && mov_it != end; 
                     ++mov_it);

                if (ref_it == end || mov_it == end)
                    issue_diag (E_SYNTAX, true, &next,
                                "missing sections %s %s requested\n",
                                sym.name.c_str (), next.name.c_str ());

                // replace it
                section_list_.splice (++ref_it, section_list_,
                                      mov_it, mov_it);

                // replace the insertion point
                sym = next;
                break;
            }
            default:
                issue_diag (E_SYNTAX, true, &next,
                            "unexpected token : %s in a "
                            "reorder section block\n",
                            next.name.c_str ());
                break;
            }

        }
    }
}


void Def::list_collate ()
{
    // print the script list
    token_list_t::iterator script_it = script_list_.begin ();
    while (script_it != script_list_.end ()) {
        std::cout << "script  " << (script_it++)->name << '\n';
    }
    std::cout << '\n';

    // print the collating symbols' list
    token_list_t::iterator cs_it = cs_list_.begin ();
    while (cs_it != cs_list_.end ()) {
        std::cout << "collating-symbol     " << (cs_it++)->name << '\n';
    }
    std::cout << '\n';

    // print the collating elements list
    std::list<collate_elem_t>::iterator ce_it = ce_list_.begin ();
    while (ce_it != ce_list_.end ()) {
        std::cout << "collating-element " << ce_it->first.name
                  << " from ";
        token_list_t::iterator ce_def_it = ce_it->second.begin ();
        for (;ce_def_it != ce_it->second.end (); ce_def_it++) {
            std::cout << ce_def_it->name << ";";
        }
        std::cout << '\n';
    }    
    std::cout << '\n';

    // print out general/global symbols' list
    token_list_t::iterator sym_it = sym_list_.begin ();
    while (sym_it != sym_list_.end ()) {
        std::cout << (sym_it++)->name << '\n';
    }
    std::cout << '\n';

    // print out sections
    std::list<collate_section_t>::iterator sc_it = 
        section_list_.begin ();
    while (sc_it != section_list_.end ()) {
        // print prolog
        std::cout << "order_start ";
        if (sc_it->name != "unnamed")
            std::cout << sc_it->name << ";";
        
        token_list_t::iterator it = sc_it->order.begin ();
        while (it != sc_it->order.end ())
            std::cout << (it++)->name << ";";
        std::cout << '\n';

        std::list<collate_entry_t>::iterator e_it = 
            sc_it->entries.begin ();
        while (e_it != sc_it->entries.end ()) {
            std::cout << e_it->first.name << "   ";
            token_list_t::iterator w_it =  e_it->second.begin ();
            while (w_it != e_it->second.end ())
                std::cout << (w_it++)->name << ";";
            std::cout << '\n';
            ++e_it;
        }

        ++sc_it;
    }
    std::cout << '\n' << "order_end\n";
}


// FIXME - modify the algorithm to get log(N) complexity
// Inserts an entry inside the section information;
// searching for the appropriate entry in the list is
// done in linear time 
bool Def::insert_entries (token_t& s, collate_entry_list_t& e)
{
    // first remove these entries if found
    collate_entry_list_t::iterator r_it = e.begin ();
    for (; r_it != e.end (); ++r_it) 
        remove_entry (*r_it);

    // first search through the symbols list; if found check
    // the collate_entry object; it should not have weights
    token_list_t::iterator it = sym_list_.begin ();
    for (; it != sym_list_.end (); ++it) {
        if (it->name != s.name) 
            continue;

        // FIXME - all statements have to have no weights if they are
        // to be inserted after collation symbols outside section boundaries;
        // found it;  check one collate_entry_t object
        if (!e.begin ()->second.empty ()) {
            issue_diag (E_REORD, true, &s,
                        "requested reorder-after: reference "
                        "symbol %s was found outside section boundaries "
                        "and the objects to be reordered are collation "
                        "definitions\n", s.name.c_str ());
        }

        // insert the symbolic name there
        collate_entry_list_t::iterator e_it = e.begin ();
        for (; e_it != e.end (); ++e_it) 
            it = sym_list_.insert (++it, e_it->first);

        return true;
    }

    // if not successful then search in each section
    std::list<collate_section_t>::iterator sect_it = section_list_.begin ();
    while (sect_it != section_list_.end ()) {
        collate_entry_list_t::iterator e_it = sect_it->entries.begin ();
        for (; e_it != sect_it->entries.end (); ++e_it) {
            if (e_it->first.name != s.name)
                continue;

            // found it; insert entry
            sect_it->entries.insert (++e_it, e.begin (), e.end ());
            return true;
        }
        ++sect_it;
    }

    issue_diag (W_REORD, false, &s,
                "requested reorder-after: reference "
                "symbol %s was not found \n", s.name.c_str ());

    return false;
}


// FIXME - modify the algorithm to get log(N) complexity
void Def::remove_entry (collate_entry_t& e)
{
    // search in the sym_list_ and in the sections
    if (e.second.empty ()) {
        token_list_t::iterator it = sym_list_.begin ();
        for (; it != sym_list_.end (); ++it) {
            if (it->name != e.first.name) 
                continue;

            sym_list_.erase (it);
            return;
        }
    }

    // search in the sections
    std::list<collate_section_t>::iterator sect_it = section_list_.begin ();
    while (sect_it != section_list_.end ()) {
        std::list<collate_entry_t>::iterator e_it = sect_it->entries.begin ();
        for (; e_it != sect_it->entries.end (); ++e_it) {
            if (e_it->first.name != e.first.name)
                continue;

            sect_it->entries.erase (e_it);
            return;
        }
        ++sect_it;
    }
    return ;
}



bool Def::get_weight ( token_t&     w,
                       Weights_t*   weights, 
                       int          weight_num)
{
    const std::map<std::string, wchar_t>& w_cmap = charmap_.get_w_cmap ();

    bool ret = false;
    weights[weight_num].size = 1;
    ce_map_iter ce_map_it;
    if(w.token == Scanner::tok_sym_name) {
        w_cmap_iter w_cmap_pos = w_cmap.find (w.name);
        if (w_cmap_pos != w_cmap.end()){
            coll_map_iter coll_map_pos = coll_map_.find(w_cmap_pos->second);

            if (coll_map_pos->second.coll_val == UINT_MAX) 
                issue_diag (E_SYMUSED, true, 
                            &w, "symbolic name %s "
                            "used as weight before being assigned a "
                            "collation value\n", w.name.c_str());

            // store weight
            weights[weight_num].weight[0] = coll_map_pos->second.coll_val;
        }
        else if ((ce_map_it = ce_map_.find (w.name)) 
                 != ce_map_.end())
            weights[weight_num].weight[0] = ce_map_it->second.coll_val;
        else {
            cs_map_iter cs_it = cs_map_.find (w.name);
            if (cs_it == cs_map_.end ()) 
                issue_diag (E_SYNTAX, true, 
                            &w, "symbolic name %s not found\n", 
                            w.name.c_str());
              
            weights[weight_num].weight[0] = cs_it->second;
        }
    }
    else if (w.token == Scanner::tok_char_value) {
        // the weight is given in numerical form
        const char* next_val =
            std::strchr (w.name.c_str (), scanner_.escape_char ());

        assert (0 != next_val);

        const char* next_wt = std::strchr (w.name.c_str (), ';');

        while (weight_num < collate_out_.num_weights) {

            std::size_t c;

            for (c = 0; *next_val && (!next_wt || next_val < next_wt); ++c) {

                const char* end = 0;

                weights [weight_num].weight [c] =
                    scanner_.convert_escape (next_val, &end, true);

                assert (0 != end);

                next_val = end;
            }

            weights [weight_num++].size = c;

            if (next_wt)
                next_wt = std::strchr (next_val, ';');
                
        }

    }
    else if(w.token == Scanner::tok_ignore) {
        // use the special weight 0 for IGNORE weights
        weights[weight_num].size = 1;
        weights[weight_num].weight[0] = 0;
    }
    else if (w.token == Scanner::tok_string) {
        // the weights are given either in symbolic name form (e.g.,
        // "<symbolic-name>" or in the form of a quoted multibyte
        // character string (e.g., "\001\d010\x16\")
        const std::string tmp (w.name.substr (1, w.name.size() - 2));

        // keeps track of the length of the weight
        unsigned char k = 0;

        // iterate thru the string content and retrieve the symbols
        std::string::const_iterator it = tmp.begin ();

        const char escape = scanner_.escape_char ();

        while (it != tmp.end ()) {
            // weight in string form
            std::string wsym;

            // next comes a symbolic name
            if (*it == '<') {
                while (*it != '>') {
                    if (*it == escape) 
                        ++it;

                    wsym += *it++;
                }
                wsym += *it++;

                // wsym has the symbolic name, lookup for it in 
                // the character map, collating-symbol map, 
                // and collating-element map
                w_cmap_iter w_cm_pos = w_cmap.find(wsym);
                cs_map_iter cs_it = cs_map_.find (wsym);
                ce_map_iter ce_it = ce_map_.find (wsym);

                if (w_cm_pos != w_cmap.end()) {
                    // is in the character map, check its associated
                    // collation value
                    coll_map_iter coll_it = coll_map_.find(w_cm_pos->second);
                    if (coll_it == coll_map_.end())
                        issue_diag (E_SYMUSED, true, &w, 
                                    "weight %s not defined\n", 
                                    wsym.c_str());
                    if (coll_it->second.coll_val == UINT_MAX) 
                        issue_diag (E_SYMUSED, true, 
                                    &w, "symbolic name %s "
                                    "used as weight before being assigned a "
                                    "collation value\n", wsym.c_str());

                    weights[weight_num].weight[k++] = coll_it->second.coll_val;
                }
                else if (ce_it != ce_map_.end()) {
                    // it is in the collating-element map
                    weights[weight_num].weight[k++] = ce_it->second.coll_val;
                }
                else if (cs_it != cs_map_.end()) {
                    // it is in the collating-symbol map
                    weights[weight_num].weight[k++] = cs_it->second;
                }
                else {
                    // it is not in any of the maps, that's an error
                    issue_diag (E_SYNTAX, true, &w, 
                                "symbolic name %s not found\n", 
                                wsym.c_str());
                }
            }
            else if (*it == escape) {

                // weight is given in a quoted escape form
                const char* const beg = tmp.c_str () + (it - tmp.begin ());
                const char*       end = 0;

                weights [weight_num].weight [k++] =
                    scanner_.convert_escape (beg, &end, true);

                assert (0 != end);

                it += end - beg;

                break;
            }
            else 
                issue_diag (E_SYNTAX, true, &w,
                            "illegal string content as a weight");
        }

        weights[weight_num].size = k;
        if (k > collate_out_.longest_weight)
            collate_out_.longest_weight = k;

    }
    else if (w.token == Scanner::tok_abs_ellipsis)
        // return true if ellipsis are embedded in the weight
        ret = true;
    else
        warnings_occurred_ = 
            issue_diag (W_SYNTAX, false, &w,
                        "illegal token %s found in collation definition\n",
                        w.name.c_str()) || warnings_occurred_;

    return ret;
}


// if undefined optimization is on then only those characters that have 
// been defined should go into the valid set, otherwise all characters go
// into the set.
void Def::gen_valid_coll_wchar_set () {

    if (!valid_coll_wchar_set_.empty())
        return;

    create_wchar_utf8_table();

    for (coll_map_iter coll_it = coll_map_.begin(); 
         coll_it != coll_map_.end(); ++coll_it) {
        if (coll_it->second.offset != UINT_MAX 
            || !collate_out_.undefined_optimization){

            std::string valid = utf8_encode (coll_it->first);
            valid = valid.substr (0, valid.size() - 1);
            
            while (valid.size() > 0){
                valid_coll_wchar_set_.insert (valid);
                valid = valid.substr(0, valid.size() - 1); 
            }
        }
    }
}
