util/codecvt.cpp - stdcxx - Git at Google

 /***************************************************************************
  *
  * codecvt.cpp
  *
  * $Id$
  *
  ***************************************************************************
  *
  * Licensed to the Apache Software  Foundation (ASF) under one or more
  * contributor  license agreements.  See  the NOTICE  file distributed
  * with  this  work  for  additional information  regarding  copyright
  * ownership.   The ASF  licenses this  file to  you under  the Apache
  * License, Version  2.0 (the  "License"); you may  not use  this file
  * except in  compliance with the License.   You may obtain  a copy of
  * the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the  License is distributed on an  "AS IS" BASIS,
  * WITHOUT  WARRANTIES OR CONDITIONS  OF ANY  KIND, either  express or
  * implied.   See  the License  for  the  specific language  governing
  * permissions and limitations under the License.
  *
  * Copyright 2001-2006 Rogue Wave Software.
  *
  **************************************************************************/

 #include "diagnostic.h"   // for issue_diag()
 #include "def.h"          // for Def

 #include "path.h"         // for get_pathname()
 #include "scanner.h"      // for scanner

 #include <cassert>        // for assert()
 #include <climits>        // for UCHAR_MAX
 #include <cstring>        // for memset()
 #include <fstream>        // for ifstream, ofstream


 typedef std::map<std::string, wchar_t>::const_iterator n_cmap_citer2;


 std::size_t Def::
 gen_mbchar_tables (codecvt_offsets_map_t           &tab,
                    std::map<std::string, unsigned> &off_map,
                    const std::string               &charp /* = "" */,
                    unsigned                         tabno /* = 0 */)
 {
     // upon the first call (but not during subsequent recursive calls)
     // generate a set of multibyte prefixes from the set of all known
     // multibyte characters
     static unsigned               ntabs   = 0;
     static std::set<std::string>* pfx_set = 0;

     const n_cmap_citer2 mb_map_end = charmap_.get_mb_cmap ().end ();

     if (0 == pfx_set) {
         pfx_set = new std::set<std::string>;

         // iterate over the range of valid multibyte characters
         // obtained from the charmap and generate a complete
         // subset of non-empty multibyte prefixes from each
         unsigned off = 0;

         const n_cmap_citer2 mb_map_begin = charmap_.get_mb_cmap ().begin ();

         for (n_cmap_citer2 it = mb_map_begin; it != mb_map_end; ++it, ++off) {

             // insert the ordinal number of each multibyte character
             // into a map for fast lookup later
             off_map.insert (std::make_pair (it->first, off));

             // generate non-empty prefixes up to one byte less
             // in length than the complete multibyte character
             for (std::string prefix = it->first; 1 < prefix.size (); ) {
                 prefix = prefix.substr (0, prefix.size () - 1);
                 pfx_set->insert (prefix);
             }
         }
     }

     // number of valid characters inserted into the tables
     std::size_t nchars = 0;

     // an array of offsets to the multibyte character or to the next
     // array containing such offsets (defined recursively for up to
     // MB_CUR_MAX levels of nesting)
     codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t;

     std::string mb_char (charp + '\0');

     for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

         unsigned char cur_char = (unsigned char)i;

         mb_char [mb_char.size () - 1] = char (cur_char);

         if (mb_map_end == charmap_.get_mb_cmap ().find (mb_char)) {
             // mb_char is not a complete, valid multibyte character
             // check to see if it's a prefix of one

             if (pfx_set->find (mb_char) == pfx_set->end ()) {
                 // mb_char is not a prefix of a valid multibyte
                 // character, mark it invalide
                 offsets->off [cur_char] = UINT_MAX;
             }
             else {
                 // mb_char is a prefix of a valid multibyte character,
                 // set the MSB to denote that it "continues" in the
                 // table at the next higher offset
                 offsets->off [cur_char] = ++ntabs | 0x80000000;

                 // generate that table
                 nchars += gen_mbchar_tables (tab, off_map, mb_char, ntabs);
             }
         }
         else {
             // mb_char is a complete, valid miltibyte character
             // insert its ordinal number (offset) into the array
             offsets->off [cur_char] = off_map.find (mb_char)->second;
             ++nchars;
         }
     }

     // insert the completely populated table into the map
     tab.insert (std::make_pair (tabno, offsets));

     if (0 == ntabs) {
         // clean up on return from the topmost (non-recursive) call
         delete pfx_set;
         pfx_set = 0;
     }

     return nchars;
 }


 std::size_t Def::
 gen_wchar_tables (codecvt_offsets_map_t &tab,
                   const std::string     &charp /* = "" */,
                   unsigned int           tabno /* = 0 */)
 {
     // upon the first call (but not during subsequent recursive calls)
     // generate a set of multibyte prefixes from the set of all known
     // multibyte characters
     static unsigned                            ntabs   = 0;
     static std::set<std::string>              *pfx_set = 0;
     static std::map<std::string, unsigned>    *off_map = 0;
     static std::map<std::string, std::string> *utf_map = 0;

     if (0 == utf_map) {
         pfx_set = new std::set<std::string>;
         off_map = new std::map<std::string, unsigned>;
         utf_map = new std::map<std::string, std::string>;

         const n_cmap_citer2 first = charmap_.get_mb_cmap ().begin ();
         const n_cmap_citer2 last  = charmap_.get_mb_cmap ().end ();

         unsigned off = 0;

         for (n_cmap_citer2 it = first; it != last; ++it) {

             off_map->insert (std::make_pair (it->first, off));

             off += it->first.size () + 1;

             std::string utf = utf8_encode (it->second);

             utf_map->insert (std::make_pair (utf, it->first));

             while (1 < utf.size ()) {
                 utf = utf.substr (0, utf.size () - 1);
                 pfx_set->insert (utf);
             }
         }
     }

     codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t;

     // number of valid characters inserted into the tables
     std::size_t nchars = 0;

     std::string mb_char (charp + '\0');

     for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

         unsigned char cur_char = (unsigned char)i;

         mb_char [mb_char.size () - 1] = char (cur_char);

         const wchar_utf8_iter it = utf_map->find (mb_char);
         if (it == utf_map->end ()) {
             if (pfx_set->find (mb_char) == pfx_set->end ()) {
                 offsets->off [cur_char] = UINT_MAX;
             }
             else {
                 offsets->off [cur_char] = ++ntabs | 0x80000000;

                 nchars += gen_wchar_tables (tab, mb_char, ntabs);
             }
         }
         else {
             offsets->off [cur_char] = off_map->find (it->second)->second;

             ++nchars;
         }
     }

     tab.insert (std::make_pair (tabno, offsets));

     if (0 == ntabs) {
         // clean up
         delete pfx_set;
         delete utf_map;

         pfx_set = 0;
         utf_map = 0;
     }

     return nchars;
 }


 std::size_t Def::
 gen_utf8_tables (codecvt_offsets_map_t           &tab,
                  std::map<std::string, unsigned> &off_map,
                  const std::string               &charp /* = "" */,
                  unsigned                         tabno /* = 0 */)
 {
     static unsigned                        ntabs   = 0;
     static std::set<std::string>          *pfx_set = 0;
     static std::map<std::string, wchar_t> *utf_map = 0;

     if (0 == pfx_set) {
         pfx_set = new std::set<std::string>;

         const ucs4_cmap_iter first = charmap_.get_ucs4_cmap ().begin ();
         const ucs4_cmap_iter last  = charmap_.get_ucs4_cmap ().end ();

         for (ucs4_cmap_iter it = first; it != last; ++it) {

             for (std::string prefix = utf8_encode (it->second);
                  1 < prefix.size (); ) {
                 prefix = prefix.substr (0, prefix.size () - 1);
                 pfx_set->insert (prefix);
             }
         }
     }

     // the set of complete utf8 strings in the current character map
     typedef std::map<std::string, wchar_t>::iterator utf8_map_iter;

     if (0 == utf_map) {
         utf_map = new std::map<std::string, wchar_t>;

         const ucs4_cmap_iter first = charmap_.get_ucs4_cmap ().begin ();
         const ucs4_cmap_iter last  = charmap_.get_ucs4_cmap ().end ();

         for (ucs4_cmap_iter it = first; it != last; ++it) {
             const std::string utf = utf8_encode (it->second);
             utf_map->insert (std::make_pair (utf, it->second));
         }
     }

     codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t;

     // number of valid characters inserted into the tables
     std::size_t nchars = 0;

     std::string mb_char = charp + '\0';

     for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {

         unsigned char cur_char = (unsigned char)i;

         mb_char [mb_char.size () - 1] = char (cur_char);

         const utf8_map_iter where = utf_map->find (mb_char);

         if (where == utf_map->end ()) {
             if (pfx_set->find (mb_char) == pfx_set->end ()) {
                 offsets->off [cur_char] = UINT_MAX;
             }
             else {
                 offsets->off [cur_char] = ++ntabs | 0x80000000;
                 nchars += gen_utf8_tables (tab, off_map, mb_char, ntabs);
             }
         }
         else {
             // first get the symbolic name
             std::string str
                 = charmap_.get_rucs4_cmap ().find (where->second)->second;

             // then get the internal encoding of the character
             const wchar_t int_enc = charmap_.get_w_cmap().find (str)->second;

             // then get the external encoding to use in a lookup in
             // mb_char_off_map
             str = charmap_.get_rmb_cmap ().find (int_enc)->second;

             offsets->off [cur_char] = off_map.find (str)->second;

             ++nchars;
         }
     }

     tab.insert (std::make_pair (tabno, offsets));

     if (0 == ntabs) {
         // clean up
         delete pfx_set;
         delete utf_map;

         pfx_set = 0;
         utf_map = 0;
     }
     return nchars;
 }


 void Def::
 gen_xlit_data ()
 {
     // data offset points to the beginning of the data containing
     // the narrow strings character encodings
     unsigned int data_offset = 0;

     // traverse the map and construct the map of offsets
     xlit_map_t::const_iterator it = xlit_map_.begin ();
     for (; it != xlit_map_.end (); ++it) {
         // insert pair(wchar_t value, offset of first string in data block)
         xlit_data_offset_map_.insert (
             std::make_pair (it->first,data_offset));

         // advance the data_offset value to the next "first" string
         std::list<std::string>::const_iterator sit =
             it->second.begin ();
         for (; sit != it->second.end (); ++sit) {
             data_offset += sit->size () + 1;
         }
         ++data_offset;
     }

     // create a new table (first), populate it with default values
     // and insert it in the tables map
     xlit_offset_table_t table0;
     unsigned int k;
     for (k = 0; k < UCHAR_MAX + 1; ++k)
         table0.offset_table [k] = UINT_MAX;

     // insert it into the map
     xlit_table_map_.insert (std::make_pair(0, table0));

     const xlit_map_t::const_iterator xlit_map_end = xlit_map_.end ();

     // traverse the map again and build the tables
     for (it = xlit_map_.begin (); it != xlit_map_end; ++it) {

         // encode the wchar_t value to UTF-8
         const std::string utf8_rep (utf8_encode (it->first));
         data_offset = xlit_data_offset_map_.find (it->first)->second;

         // traverse the utf8 representation string and create the
         // necessary tables and populate the indexes
         unsigned int table_idx = 0;

         const std::string::const_iterator utf8_rep_end = utf8_rep.end ();
         std::string::const_iterator       string_it    = utf8_rep.begin ();

         for (; string_it != utf8_rep_end; ++string_it) {
             // get the table corresponding to the current index and locate
             // the value at that index
             const xlit_table_map_t::iterator res =
                 xlit_table_map_.find (table_idx);

             assert (res != xlit_table_map_.end ());

             // offset in table
             unsigned char off_idx = (unsigned char)*string_it;

             // res is the iterator pointing to the correct table in the map
             // check the index and if not populated, create a new table
             if (res->second.offset_table [off_idx] == UINT_MAX) {

                 // if this is the last position in the string, then
                 // fill the table position with the offset of the string data
                 if ((string_it + 1) == utf8_rep.end ()) {
                     xlit_data_offset_map_t::const_iterator data_it =
                         xlit_data_offset_map_.find (it->first);
                     assert (data_it != xlit_data_offset_map_.end ());

                     // fill the table position with the found offset
                     res->second.offset_table [off_idx] = data_it->second;
                     continue;
                 }

                 // create a new table and append it to the map
                 xlit_offset_table_t table;
                 for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i)
                     table.offset_table [i] = UINT_MAX;

                 // insert it into the map
                 unsigned int tmp = xlit_table_map_.size ();
                 xlit_table_map_.insert (std::make_pair(tmp, table));

                 // store its index at correct position in current table
                 res->second.offset_table [off_idx] = tmp | 0x80000000;
                 table_idx = tmp;
             } else {
                 table_idx =
                     res->second.offset_table [off_idx] & 0x7FFFFFFF;
             }
         }
     }
 }


 void Def::
 write_codecvt (std::string dir_name)
 {
     // if it has been already written
     if (codecvt_written_)
         return;

     // compose the directory name
     ((dir_name += _RWSTD_PATH_SEP) += "..") += _RWSTD_PATH_SEP;
     dir_name += charmap_.get_code_set_name ();

     // check to see if the codecvt database already exists and
     // avoid recreating it if it does (as an optimization)
     if (std::ifstream (dir_name.c_str ())) {
         issue_diag (I_OPENWR, false, 0,
                     "%s exists, skipping\n", dir_name.c_str ());
         return;
     }

     //////////////////////////////////////////////////////////////////
     // generate multibyte conversion tables
     issue_diag (I_STAGE, false, 0, "generating multibyte tables\n");

     codecvt_offsets_map_t           mbchar_offs;
     std::map<std::string, unsigned> off_map;
     const std::size_t n_mbchars = gen_mbchar_tables (mbchar_offs, off_map);

     // generate wchar_t conversion tables
     issue_diag (I_STAGE, false, 0, "generating wchar_t tables\n");

     codecvt_offsets_map_t wchar_offs;
     const std::size_t n_wchars = gen_wchar_tables (wchar_offs);

     // generate UTF-8 conversion conversion tables
     issue_diag (I_STAGE, false, 0, "generating UTF-8 tables\n");

     codecvt_offsets_map_t uchar_offs;
     const std::size_t n_uchars = gen_utf8_tables (uchar_offs, off_map);

     // not needed beyond this point, clear it out
     off_map.clear ();

     // generate the transliteration tables and the transliteration data
     issue_diag (I_STAGE, false, 0, "generating transliteration tables\n");
     gen_xlit_data ();

     //////////////////////////////////////////////////////////////////
     // populate the codecvt structure before writing it out
     // in binary form to the file (the codecvt database)
     _RW::__rw_codecvt_t codecvt_out;
     std::memset (&codecvt_out, 0, sizeof codecvt_out);

     // calculate byte offsets within the structure
     codecvt_out.n_to_w_tab_off = 0;
     codecvt_out.w_to_n_tab_off = codecvt_out.n_to_w_tab_off
         + mbchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned);

     codecvt_out.utf8_to_ext_tab_off = codecvt_out.w_to_n_tab_off
         + wchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned);

     // insert the transliteration tables here
     codecvt_out.xliteration_off = codecvt_out.utf8_to_ext_tab_off
         + uchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned);

     codecvt_out.wchar_off = codecvt_out.xliteration_off
         + xlit_table_map_.size () * (UCHAR_MAX + 1) * sizeof (unsigned);

     codecvt_out.codeset_off = codecvt_out.wchar_off
         + charmap_.get_mb_cmap ().size () * 2 * sizeof (wchar_t);

     codecvt_out.charmap_off = codecvt_out.codeset_off
         + charmap_.get_code_set_name ().size () + 1 /* NUL */;

     const std::size_t mb_offset = codecvt_out.charmap_off
         + charmap_.get_charmap_name ().size () + 1 /* NUL */;

     // compute the size of narrow strings map which added to
     // mb_offset will give the start of the transliteration data
     std::size_t xlit_data_offset = mb_offset;

     mb_cmap_iter iter;

     for (iter = charmap_.get_mb_cmap ().begin();
          iter != charmap_.get_mb_cmap().end(); ++iter) {
         xlit_data_offset += iter->first.size() + 1;
     }

     // now traverse again the utf8 tables for transliteration data
     // and recompute the offsets:
     const xlit_table_map_t::const_iterator xlit_table_map_end =
         xlit_table_map_.end ();

     xlit_table_map_t::iterator xit = xlit_table_map_.begin ();
     for (; xit != xlit_table_map_end; ++xit) {
         for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i) {
             if (xit->second.offset_table [i] & 0x80000000)
                 continue;
             // add the offset for xliteration data
             xit->second.offset_table [i] += xlit_data_offset;
         }
     }

     codecvt_out.mb_cur_max = charmap_.get_mb_cur_max();

     issue_diag (I_OPENWR, false, 0, "writing %s\n", dir_name.c_str ());

     // create the stream with exceptions enabled
     std::ofstream out (dir_name.c_str(), std::ios::binary);
     out.exceptions (std::ios::failbit | std::ios::badbit);

     // write the codecvt_out structure
     out.write ((char*)&codecvt_out, sizeof codecvt_out);

     typedef codecvt_offsets_map_t::iterator off_iter_t;

     //////////////////////////////////////////////////////////////////
     // write out the multibyte to wchar_t tables
     issue_diag (I_WRITE, false, 0,
                 "writing %lu multibyte tables (%lu characters)\n",
                 mbchar_offs.size (), n_mbchars);

     for (off_iter_t it = mbchar_offs.begin (); it != mbchar_offs.end (); ++it) {
         for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

             const unsigned off = it->second->off [i];

             out.write ((const char*)&off, sizeof off);
         }

         delete it->second;
     }

     // not needed beyond this point, clear it out
     mbchar_offs.clear ();

     //////////////////////////////////////////////////////////////////
     // write out the wchar_t to multibyte conversion tables
     issue_diag (I_WRITE, false, 0,
                 "writing %lu wchar_t tables (%lu characters)\n",
                 wchar_offs.size (), n_wchars);

     for (off_iter_t it = wchar_offs.begin (); it != wchar_offs.end (); ++it) {
         for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

             // adjust offsets to multibyte characters (but not those
             // to other tables or invalid encodings)
             unsigned off = it->second->off [i];

             if (!(off & 0x80000000))
                 off += mb_offset;

             out.write ((const char*)&off, sizeof off);
         }

         delete it->second;
     }

     // not needed beyond this point, clear it out
     wchar_offs.clear ();

     //////////////////////////////////////////////////////////////////
     // write out the UTF-8 to (libc) multibyte tables
     issue_diag (I_WRITE, false, 0,
                 "writing %lu UTF-8 tables (%lu characters)\n",
                 uchar_offs.size (), n_uchars);

     for (off_iter_t it = uchar_offs.begin (); it != uchar_offs.end (); ++it) {
         for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

             // adjust offsets to multibyte characters (but not those
             // to other tables or invalid encodings)
             unsigned off = it->second->off [i];

             if (!(off & 0x80000000))
                 off += mb_offset;

             out.write ((const char*)&off, sizeof off);
         }

         delete it->second;
     }

     // not needed beyond this point, clear it out
     uchar_offs.clear ();

     //////////////////////////////////////////////////////////////////
     // write out the transliteration UTF-8 lookup tables
     issue_diag (I_WRITE, false, 0,
                 "writing transliteration table (size %lu)\n",
                 xlit_table_map_.size ());

     xit = xlit_table_map_.begin ();
     for (; xit != xlit_table_map_end; ++xit) {
         const unsigned int* ptable = &xit->second.offset_table [0];
         for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i, ++ptable)
             out.write ((const char*)ptable, sizeof (unsigned int));
     }

     issue_diag (I_WRITE, false, 0,
                 "writing the UCS table (%lu characters)\n",
                 charmap_.get_mb_cmap ().size ());

     const mb_cmap_iter n_cmap2_end = charmap_.get_mb_cmap ().end ();

     // write the locale-encoded wchar_t and the UCS4 wchar_t
     for (iter = charmap_.get_mb_cmap ().begin();
          iter != n_cmap2_end; ++iter) {
         out.write ((const char*)&iter->second, sizeof (iter->second));
         out.write ((const char*)& (charmap_.get_ucs4_cmap().find
                                    (charmap_.get_rw_cmap().find
                                     (iter->second)->second))->second,
                    sizeof (wchar_t));
     }

     // write the code_set_name string and charmap string
     out << charmap_.get_code_set_name() << std::ends
         << charmap_.get_charmap_name() << std::ends;


     // write out the narrow character strings
     for (iter = charmap_.get_mb_cmap().begin();
          iter != n_cmap2_end; ++iter) {
         out.write (iter->first.c_str(), iter->first.size() + 1);
     }

     issue_diag (I_WRITE, false, 0,
                 "writing transliteration data (size %lu)\n",
                 xlit_map_.size ());

     // write out the transliteration data
     xlit_map_t::const_iterator xlit_data_it = xlit_map_.begin ();
     for (; xlit_data_it != xlit_map_.end (); ++xlit_data_it) {
         std::list<std::string>::const_iterator sit =
             xlit_data_it->second.begin ();
         for (; sit != xlit_data_it->second.end (); ++sit) {
             out.write (sit->c_str (), sit->size () + 1);
         }
         out.write ("\0", 1);
     }
 }
	/***************************************************************************
	*
	* codecvt.cpp
	*
	* $Id$
	*
	***************************************************************************
	*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed
	* with this work for additional information regarding copyright
	* ownership. The ASF licenses this file to you under the Apache
	* License, Version 2.0 (the "License"); you may not use this file
	* except in compliance with the License. You may obtain a copy of
	* the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
	* implied. See the License for the specific language governing
	* permissions and limitations under the License.
	*
	* Copyright 2001-2006 Rogue Wave Software.
	*
	**************************************************************************/

	#include "diagnostic.h" // for issue_diag()
	#include "def.h" // for Def

	#include "path.h" // for get_pathname()
	#include "scanner.h" // for scanner

	#include <cassert> // for assert()
	#include <climits> // for UCHAR_MAX
	#include <cstring> // for memset()
	#include <fstream> // for ifstream, ofstream


	typedef std::map<std::string, wchar_t>::const_iterator n_cmap_citer2;


	std::size_t Def::
	gen_mbchar_tables (codecvt_offsets_map_t &tab,
	std::map<std::string, unsigned> &off_map,
	const std::string &charp /* = "" */,
	unsigned tabno /* = 0 */)
	{
	// upon the first call (but not during subsequent recursive calls)
	// generate a set of multibyte prefixes from the set of all known
	// multibyte characters
	static unsigned ntabs = 0;
	static std::set<std::string>* pfx_set = 0;

	const n_cmap_citer2 mb_map_end = charmap_.get_mb_cmap ().end ();

	if (0 == pfx_set) {
	pfx_set = new std::set<std::string>;

	// iterate over the range of valid multibyte characters
	// obtained from the charmap and generate a complete
	// subset of non-empty multibyte prefixes from each
	unsigned off = 0;

	const n_cmap_citer2 mb_map_begin = charmap_.get_mb_cmap ().begin ();

	for (n_cmap_citer2 it = mb_map_begin; it != mb_map_end; ++it, ++off) {

	// insert the ordinal number of each multibyte character
	// into a map for fast lookup later
	off_map.insert (std::make_pair (it->first, off));

	// generate non-empty prefixes up to one byte less
	// in length than the complete multibyte character
	for (std::string prefix = it->first; 1 < prefix.size (); ) {
	prefix = prefix.substr (0, prefix.size () - 1);
	pfx_set->insert (prefix);
	}
	}
	}

	// number of valid characters inserted into the tables
	std::size_t nchars = 0;

	// an array of offsets to the multibyte character or to the next
	// array containing such offsets (defined recursively for up to
	// MB_CUR_MAX levels of nesting)
	codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t;

	std::string mb_char (charp + '\0');

	for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

	unsigned char cur_char = (unsigned char)i;

	mb_char [mb_char.size () - 1] = char (cur_char);

	if (mb_map_end == charmap_.get_mb_cmap ().find (mb_char)) {
	// mb_char is not a complete, valid multibyte character
	// check to see if it's a prefix of one

	if (pfx_set->find (mb_char) == pfx_set->end ()) {
	// mb_char is not a prefix of a valid multibyte
	// character, mark it invalide
	offsets->off [cur_char] = UINT_MAX;
	}
	else {
	// mb_char is a prefix of a valid multibyte character,
	// set the MSB to denote that it "continues" in the
	// table at the next higher offset
	offsets->off [cur_char] = ++ntabs \| 0x80000000;

	// generate that table
	nchars += gen_mbchar_tables (tab, off_map, mb_char, ntabs);
	}
	}
	else {
	// mb_char is a complete, valid miltibyte character
	// insert its ordinal number (offset) into the array
	offsets->off [cur_char] = off_map.find (mb_char)->second;
	++nchars;
	}
	}

	// insert the completely populated table into the map
	tab.insert (std::make_pair (tabno, offsets));

	if (0 == ntabs) {
	// clean up on return from the topmost (non-recursive) call
	delete pfx_set;
	pfx_set = 0;
	}

	return nchars;
	}


	std::size_t Def::
	gen_wchar_tables (codecvt_offsets_map_t &tab,
	const std::string &charp /* = "" */,
	unsigned int tabno /* = 0 */)
	{
	// upon the first call (but not during subsequent recursive calls)
	// generate a set of multibyte prefixes from the set of all known
	// multibyte characters
	static unsigned ntabs = 0;
	static std::set<std::string> *pfx_set = 0;
	static std::map<std::string, unsigned> *off_map = 0;
	static std::map<std::string, std::string> *utf_map = 0;

	if (0 == utf_map) {
	pfx_set = new std::set<std::string>;
	off_map = new std::map<std::string, unsigned>;
	utf_map = new std::map<std::string, std::string>;

	const n_cmap_citer2 first = charmap_.get_mb_cmap ().begin ();
	const n_cmap_citer2 last = charmap_.get_mb_cmap ().end ();

	unsigned off = 0;

	for (n_cmap_citer2 it = first; it != last; ++it) {

	off_map->insert (std::make_pair (it->first, off));

	off += it->first.size () + 1;

	std::string utf = utf8_encode (it->second);

	utf_map->insert (std::make_pair (utf, it->first));

	while (1 < utf.size ()) {
	utf = utf.substr (0, utf.size () - 1);
	pfx_set->insert (utf);
	}
	}
	}

	codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t;

	// number of valid characters inserted into the tables
	std::size_t nchars = 0;

	std::string mb_char (charp + '\0');

	for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

	unsigned char cur_char = (unsigned char)i;

	mb_char [mb_char.size () - 1] = char (cur_char);

	const wchar_utf8_iter it = utf_map->find (mb_char);
	if (it == utf_map->end ()) {
	if (pfx_set->find (mb_char) == pfx_set->end ()) {
	offsets->off [cur_char] = UINT_MAX;
	}
	else {
	offsets->off [cur_char] = ++ntabs \| 0x80000000;

	nchars += gen_wchar_tables (tab, mb_char, ntabs);
	}
	}
	else {
	offsets->off [cur_char] = off_map->find (it->second)->second;

	++nchars;
	}
	}

	tab.insert (std::make_pair (tabno, offsets));

	if (0 == ntabs) {
	// clean up
	delete pfx_set;
	delete utf_map;

	pfx_set = 0;
	utf_map = 0;
	}

	return nchars;
	}


	std::size_t Def::
	gen_utf8_tables (codecvt_offsets_map_t &tab,
	std::map<std::string, unsigned> &off_map,
	const std::string &charp /* = "" */,
	unsigned tabno /* = 0 */)
	{
	static unsigned ntabs = 0;
	static std::set<std::string> *pfx_set = 0;
	static std::map<std::string, wchar_t> *utf_map = 0;

	if (0 == pfx_set) {
	pfx_set = new std::set<std::string>;

	const ucs4_cmap_iter first = charmap_.get_ucs4_cmap ().begin ();
	const ucs4_cmap_iter last = charmap_.get_ucs4_cmap ().end ();

	for (ucs4_cmap_iter it = first; it != last; ++it) {

	for (std::string prefix = utf8_encode (it->second);
	1 < prefix.size (); ) {
	prefix = prefix.substr (0, prefix.size () - 1);
	pfx_set->insert (prefix);
	}
	}
	}

	// the set of complete utf8 strings in the current character map
	typedef std::map<std::string, wchar_t>::iterator utf8_map_iter;

	if (0 == utf_map) {
	utf_map = new std::map<std::string, wchar_t>;

	const ucs4_cmap_iter first = charmap_.get_ucs4_cmap ().begin ();
	const ucs4_cmap_iter last = charmap_.get_ucs4_cmap ().end ();

	for (ucs4_cmap_iter it = first; it != last; ++it) {
	const std::string utf = utf8_encode (it->second);
	utf_map->insert (std::make_pair (utf, it->second));
	}
	}

	codecvt_offset_tab_t* const offsets = new codecvt_offset_tab_t;

	// number of valid characters inserted into the tables
	std::size_t nchars = 0;

	std::string mb_char = charp + '\0';

	for (unsigned int i = 0; i <= UCHAR_MAX; ++i) {

	unsigned char cur_char = (unsigned char)i;

	mb_char [mb_char.size () - 1] = char (cur_char);

	const utf8_map_iter where = utf_map->find (mb_char);

	if (where == utf_map->end ()) {
	if (pfx_set->find (mb_char) == pfx_set->end ()) {
	offsets->off [cur_char] = UINT_MAX;
	}
	else {
	offsets->off [cur_char] = ++ntabs \| 0x80000000;
	nchars += gen_utf8_tables (tab, off_map, mb_char, ntabs);
	}
	}
	else {
	// first get the symbolic name
	std::string str
	= charmap_.get_rucs4_cmap ().find (where->second)->second;

	// then get the internal encoding of the character
	const wchar_t int_enc = charmap_.get_w_cmap().find (str)->second;

	// then get the external encoding to use in a lookup in
	// mb_char_off_map
	str = charmap_.get_rmb_cmap ().find (int_enc)->second;

	offsets->off [cur_char] = off_map.find (str)->second;

	++nchars;
	}
	}

	tab.insert (std::make_pair (tabno, offsets));

	if (0 == ntabs) {
	// clean up
	delete pfx_set;
	delete utf_map;

	pfx_set = 0;
	utf_map = 0;
	}
	return nchars;
	}


	void Def::
	gen_xlit_data ()
	{
	// data offset points to the beginning of the data containing
	// the narrow strings character encodings
	unsigned int data_offset = 0;

	// traverse the map and construct the map of offsets
	xlit_map_t::const_iterator it = xlit_map_.begin ();
	for (; it != xlit_map_.end (); ++it) {
	// insert pair(wchar_t value, offset of first string in data block)
	xlit_data_offset_map_.insert (
	std::make_pair (it->first,data_offset));

	// advance the data_offset value to the next "first" string
	std::list<std::string>::const_iterator sit =
	it->second.begin ();
	for (; sit != it->second.end (); ++sit) {
	data_offset += sit->size () + 1;
	}
	++data_offset;
	}

	// create a new table (first), populate it with default values
	// and insert it in the tables map
	xlit_offset_table_t table0;
	unsigned int k;
	for (k = 0; k < UCHAR_MAX + 1; ++k)
	table0.offset_table [k] = UINT_MAX;

	// insert it into the map
	xlit_table_map_.insert (std::make_pair(0, table0));

	const xlit_map_t::const_iterator xlit_map_end = xlit_map_.end ();

	// traverse the map again and build the tables
	for (it = xlit_map_.begin (); it != xlit_map_end; ++it) {

	// encode the wchar_t value to UTF-8
	const std::string utf8_rep (utf8_encode (it->first));
	data_offset = xlit_data_offset_map_.find (it->first)->second;

	// traverse the utf8 representation string and create the
	// necessary tables and populate the indexes
	unsigned int table_idx = 0;

	const std::string::const_iterator utf8_rep_end = utf8_rep.end ();
	std::string::const_iterator string_it = utf8_rep.begin ();

	for (; string_it != utf8_rep_end; ++string_it) {
	// get the table corresponding to the current index and locate
	// the value at that index
	const xlit_table_map_t::iterator res =
	xlit_table_map_.find (table_idx);

	assert (res != xlit_table_map_.end ());

	// offset in table
	unsigned char off_idx = (unsigned char)*string_it;

	// res is the iterator pointing to the correct table in the map
	// check the index and if not populated, create a new table
	if (res->second.offset_table [off_idx] == UINT_MAX) {

	// if this is the last position in the string, then
	// fill the table position with the offset of the string data
	if ((string_it + 1) == utf8_rep.end ()) {
	xlit_data_offset_map_t::const_iterator data_it =
	xlit_data_offset_map_.find (it->first);
	assert (data_it != xlit_data_offset_map_.end ());

	// fill the table position with the found offset
	res->second.offset_table [off_idx] = data_it->second;
	continue;
	}

	// create a new table and append it to the map
	xlit_offset_table_t table;
	for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i)
	table.offset_table [i] = UINT_MAX;

	// insert it into the map
	unsigned int tmp = xlit_table_map_.size ();
	xlit_table_map_.insert (std::make_pair(tmp, table));

	// store its index at correct position in current table
	res->second.offset_table [off_idx] = tmp \| 0x80000000;
	table_idx = tmp;
	} else {
	table_idx =
	res->second.offset_table [off_idx] & 0x7FFFFFFF;
	}
	}
	}
	}


	void Def::
	write_codecvt (std::string dir_name)
	{
	// if it has been already written
	if (codecvt_written_)
	return;

	// compose the directory name
	((dir_name += _RWSTD_PATH_SEP) += "..") += _RWSTD_PATH_SEP;
	dir_name += charmap_.get_code_set_name ();

	// check to see if the codecvt database already exists and
	// avoid recreating it if it does (as an optimization)
	if (std::ifstream (dir_name.c_str ())) {
	issue_diag (I_OPENWR, false, 0,
	"%s exists, skipping\n", dir_name.c_str ());
	return;
	}

	//////////////////////////////////////////////////////////////////
	// generate multibyte conversion tables
	issue_diag (I_STAGE, false, 0, "generating multibyte tables\n");

	codecvt_offsets_map_t mbchar_offs;
	std::map<std::string, unsigned> off_map;
	const std::size_t n_mbchars = gen_mbchar_tables (mbchar_offs, off_map);

	// generate wchar_t conversion tables
	issue_diag (I_STAGE, false, 0, "generating wchar_t tables\n");

	codecvt_offsets_map_t wchar_offs;
	const std::size_t n_wchars = gen_wchar_tables (wchar_offs);

	// generate UTF-8 conversion conversion tables
	issue_diag (I_STAGE, false, 0, "generating UTF-8 tables\n");

	codecvt_offsets_map_t uchar_offs;
	const std::size_t n_uchars = gen_utf8_tables (uchar_offs, off_map);

	// not needed beyond this point, clear it out
	off_map.clear ();

	// generate the transliteration tables and the transliteration data
	issue_diag (I_STAGE, false, 0, "generating transliteration tables\n");
	gen_xlit_data ();

	//////////////////////////////////////////////////////////////////
	// populate the codecvt structure before writing it out
	// in binary form to the file (the codecvt database)
	_RW::__rw_codecvt_t codecvt_out;
	std::memset (&codecvt_out, 0, sizeof codecvt_out);

	// calculate byte offsets within the structure
	codecvt_out.n_to_w_tab_off = 0;
	codecvt_out.w_to_n_tab_off = codecvt_out.n_to_w_tab_off
	+ mbchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned);

	codecvt_out.utf8_to_ext_tab_off = codecvt_out.w_to_n_tab_off
	+ wchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned);

	// insert the transliteration tables here
	codecvt_out.xliteration_off = codecvt_out.utf8_to_ext_tab_off
	+ uchar_offs.size () * (UCHAR_MAX + 1) * sizeof (unsigned);

	codecvt_out.wchar_off = codecvt_out.xliteration_off
	+ xlit_table_map_.size () * (UCHAR_MAX + 1) * sizeof (unsigned);

	codecvt_out.codeset_off = codecvt_out.wchar_off
	+ charmap_.get_mb_cmap ().size () * 2 * sizeof (wchar_t);

	codecvt_out.charmap_off = codecvt_out.codeset_off
	+ charmap_.get_code_set_name ().size () + 1 /* NUL */;

	const std::size_t mb_offset = codecvt_out.charmap_off
	+ charmap_.get_charmap_name ().size () + 1 /* NUL */;

	// compute the size of narrow strings map which added to
	// mb_offset will give the start of the transliteration data
	std::size_t xlit_data_offset = mb_offset;

	mb_cmap_iter iter;

	for (iter = charmap_.get_mb_cmap ().begin();
	iter != charmap_.get_mb_cmap().end(); ++iter) {
	xlit_data_offset += iter->first.size() + 1;
	}

	// now traverse again the utf8 tables for transliteration data
	// and recompute the offsets:
	const xlit_table_map_t::const_iterator xlit_table_map_end =
	xlit_table_map_.end ();

	xlit_table_map_t::iterator xit = xlit_table_map_.begin ();
	for (; xit != xlit_table_map_end; ++xit) {
	for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i) {
	if (xit->second.offset_table [i] & 0x80000000)
	continue;
	// add the offset for xliteration data
	xit->second.offset_table [i] += xlit_data_offset;
	}
	}

	codecvt_out.mb_cur_max = charmap_.get_mb_cur_max();

	issue_diag (I_OPENWR, false, 0, "writing %s\n", dir_name.c_str ());

	// create the stream with exceptions enabled
	std::ofstream out (dir_name.c_str(), std::ios::binary);
	out.exceptions (std::ios::failbit \| std::ios::badbit);

	// write the codecvt_out structure
	out.write ((char*)&codecvt_out, sizeof codecvt_out);

	typedef codecvt_offsets_map_t::iterator off_iter_t;

	//////////////////////////////////////////////////////////////////
	// write out the multibyte to wchar_t tables
	issue_diag (I_WRITE, false, 0,
	"writing %lu multibyte tables (%lu characters)\n",
	mbchar_offs.size (), n_mbchars);

	for (off_iter_t it = mbchar_offs.begin (); it != mbchar_offs.end (); ++it) {
	for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

	const unsigned off = it->second->off [i];

	out.write ((const char*)&off, sizeof off);
	}

	delete it->second;
	}

	// not needed beyond this point, clear it out
	mbchar_offs.clear ();

	//////////////////////////////////////////////////////////////////
	// write out the wchar_t to multibyte conversion tables
	issue_diag (I_WRITE, false, 0,
	"writing %lu wchar_t tables (%lu characters)\n",
	wchar_offs.size (), n_wchars);

	for (off_iter_t it = wchar_offs.begin (); it != wchar_offs.end (); ++it) {
	for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

	// adjust offsets to multibyte characters (but not those
	// to other tables or invalid encodings)
	unsigned off = it->second->off [i];

	if (!(off & 0x80000000))
	off += mb_offset;

	out.write ((const char*)&off, sizeof off);
	}

	delete it->second;
	}

	// not needed beyond this point, clear it out
	wchar_offs.clear ();

	//////////////////////////////////////////////////////////////////
	// write out the UTF-8 to (libc) multibyte tables
	issue_diag (I_WRITE, false, 0,
	"writing %lu UTF-8 tables (%lu characters)\n",
	uchar_offs.size (), n_uchars);

	for (off_iter_t it = uchar_offs.begin (); it != uchar_offs.end (); ++it) {
	for (unsigned i = 0; i <= UCHAR_MAX; ++i) {

	// adjust offsets to multibyte characters (but not those
	// to other tables or invalid encodings)
	unsigned off = it->second->off [i];

	if (!(off & 0x80000000))
	off += mb_offset;

	out.write ((const char*)&off, sizeof off);
	}

	delete it->second;
	}

	// not needed beyond this point, clear it out
	uchar_offs.clear ();

	//////////////////////////////////////////////////////////////////
	// write out the transliteration UTF-8 lookup tables
	issue_diag (I_WRITE, false, 0,
	"writing transliteration table (size %lu)\n",
	xlit_table_map_.size ());

	xit = xlit_table_map_.begin ();
	for (; xit != xlit_table_map_end; ++xit) {
	const unsigned int* ptable = &xit->second.offset_table [0];
	for (unsigned int i = 0; i < UCHAR_MAX + 1; ++i, ++ptable)
	out.write ((const char*)ptable, sizeof (unsigned int));
	}

	issue_diag (I_WRITE, false, 0,
	"writing the UCS table (%lu characters)\n",
	charmap_.get_mb_cmap ().size ());

	const mb_cmap_iter n_cmap2_end = charmap_.get_mb_cmap ().end ();

	// write the locale-encoded wchar_t and the UCS4 wchar_t
	for (iter = charmap_.get_mb_cmap ().begin();
	iter != n_cmap2_end; ++iter) {
	out.write ((const char*)&iter->second, sizeof (iter->second));
	out.write ((const char*)& (charmap_.get_ucs4_cmap().find
	(charmap_.get_rw_cmap().find
	(iter->second)->second))->second,
	sizeof (wchar_t));
	}

	// write the code_set_name string and charmap string
	out << charmap_.get_code_set_name() << std::ends
	<< charmap_.get_charmap_name() << std::ends;


	// write out the narrow character strings
	for (iter = charmap_.get_mb_cmap().begin();
	iter != n_cmap2_end; ++iter) {
	out.write (iter->first.c_str(), iter->first.size() + 1);
	}

	issue_diag (I_WRITE, false, 0,
	"writing transliteration data (size %lu)\n",
	xlit_map_.size ());

	// write out the transliteration data
	xlit_map_t::const_iterator xlit_data_it = xlit_map_.begin ();
	for (; xlit_data_it != xlit_map_.end (); ++xlit_data_it) {
	std::list<std::string>::const_iterator sit =
	xlit_data_it->second.begin ();
	for (; sit != xlit_data_it->second.end (); ++sit) {
	out.write (sit->c_str (), sit->size () + 1);
	}
	out.write ("\0", 1);
	}
	}