// tools/ICUData/ICUData.cpp - xerces-c - Git at Google

 /*
  * The Apache Software License, Version 1.1
  *
  * Copyright (c) 1999-2000 The Apache Software Foundation.  All rights
  * reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * 3. The end-user documentation included with the redistribution,
  *    if any, must include the following acknowledgment:
  *       "This product includes software developed by the
  *        Apache Software Foundation (http://www.apache.org/)."
  *    Alternately, this acknowledgment may appear in the software itself,
  *    if and wherever such third-party acknowledgments normally appear.
  *
  * 4. The names "Xerces" and "Apache Software Foundation" must
  *    not be used to endorse or promote products derived from this
  *    software without prior written permission. For written
  *    permission, please contact apache\@apache.org.
  *
  * 5. Products derived from this software may not be called "Apache",
  *    nor may "Apache" appear in their name, without prior written
  *    permission of the Apache Software Foundation.
  *
  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  * ====================================================================
  *
  * This software consists of voluntary contributions made by many
  * individuals on behalf of the Apache Software Foundation, and was
  * originally based on software copyright (c) 1999, International
  * Business Machines, Inc., http://www.ibm.com .  For more information
  * on the Apache Software Foundation, please see
  * <http://www.apache.org/>.
  */


 /*
  * $Log$
  * Revision 1.1  2000/03/17 23:58:00  roddey
  * New utility for munging ICU UCM files and spitting out tables for
  * our intrinsic encoders.
  *
  */

 // ---------------------------------------------------------------------------
 //  This program is designed to parse a standard ICU .UCM file and spit out
 //  a C++ code fragment that represents the tables required by the intrinsic
 //  XML parser transcoders.
 //
 //  The file format is pretty simple and this program is not intended to be
 //  industrial strength by any means. Its use by anyone but the author is
 //  at the user's own risk.
 //
 //  The code looks for the min/max bytes per character to know what kind of
 //  table to spit out, but for now it only handles single-byte character sets.
 // ---------------------------------------------------------------------------


 // ---------------------------------------------------------------------------
 //  Includes
 // ---------------------------------------------------------------------------
 #include    <ctype.h>
 #include    <stdio.h>
 #include    <stdlib.h>
 #include    <iostream.h>
 #include    <string.h>


 // ---------------------------------------------------------------------------
 //  Const data
 // ---------------------------------------------------------------------------
 //  Number of XlatRec slots in the gMainTable array.
 static const unsigned int   gMaxInRecs = 1024;


 // ---------------------------------------------------------------------------
 //  Local data types
 // ---------------------------------------------------------------------------
 //  One translation record read from the CHARMAP section of the input file:
 //  a Unicode value paired with the code page value it maps to.
 struct XlatRec
 {
     //  Unicode value; loadTable() rejects anything above 0xFFFF before storing
     unsigned short  uniVal;
     //  Code page value; loadTable() rejects anything above 0xFF before storing
     unsigned char   cpVal;
 };


 // ---------------------------------------------------------------------------
 //  Local data
 //
 //  gInFile
 //  gOutFile
 //      These are the file stream for the input UCM file and the output file
 //      that we write the C++ code to.
 //
 //  fLineNum
 //      Used to track the current line number in the source file, for error
 //      reporting.
 //
 //  gMainTable
 //  gMainTableSz
 //      This is the table that is filled in from the original source document.
 //      We don't know how big it will be, but it's not likely to be much more
 //      than 300 entries or so (256 output code points with some multiply
 //      mapped Unicode code points.) So we make it extra large and watch for
 //      possible overflow.
 //
 //      The size value is bumped up as we load entries into it during the
 //      parse of the file.
 //
 //  gMaxChar
 //  gMinChar
 //      The maximum and minimum number of bytes used to represent a
 //      character. These are read from the <mb_cur_max> and <mb_cur_min>
 //      values in the header of the input file.
 //
 //  gRepChar
 //      The replacement character to be used. This is read from the header of
 //      the input file.
 // ---------------------------------------------------------------------------
 //  Input stream, opened by main() from argV[1]
 static FILE*            gInFile;
 //  Output stream, opened by main() from argV[2]
 static FILE*            gOutFile;
 //  Incremented by getLine() after every line it reads
 static unsigned int     fLineNum;
 //  Records loaded by loadTable(); re-sorted in place by formatSBTables()
 static XlatRec          gMainTable[gMaxInRecs];
 //  Number of entries of gMainTable currently in use
 static unsigned int     gMainTableSz = 0;
 //  Set by loadTable() from the <mb_cur_max> header value
 static unsigned int     gMaxChar;
 //  Set by loadTable() from the <mb_cur_min> header value
 static unsigned int     gMinChar;
 //  Starts at 1; overwritten by loadTable() from the <subchar> header value
 static unsigned char    gRepChar = 1;


 // ---------------------------------------------------------------------------
 //  Local functions
 // ---------------------------------------------------------------------------
 //
 //  Reads the next line of gInFile that does not start with '#', removes any
 //  trailing '#' comment and any trailing whitespace (including the newline),
 //  and returns the length of what is left. The result can legitimately be
 //  zero for a blank line.
 //
 //  toFill      Buffer that receives the line; must hold maxChars chars.
 //  maxChars    Size of toFill, as passed on to fgets().
 //  eofOk       If true, hitting end of input returns ~0 instead of being
 //              treated as a fatal error.
 //
 //  Any read error, or an end of input when eofOk is false, is reported and
 //  terminates the program.
 //
 static unsigned int getLine(        char* const     toFill
                             , const unsigned int    maxChars
                             , const bool            eofOk = false)
 {
     while (true)
     {
         if (!fgets(toFill, maxChars, gInFile))
         {
             if (feof(gInFile))
             {
                 if (eofOk)
                     return (unsigned int)~0UL;

                 cout << "Unexpected end of input at line: " << fLineNum << endl;
             }
              else
             {
                 cout << "Error processing input at line: " << fLineNum << endl;
             }

             //
             //  Both failures are fatal. Falling through would re-process
             //  whatever stale text is still sitting in the buffer.
             //
             exit(1);
         }
         fLineNum++;

         // If its not a comment line, then we have our line
         if (toFill[0] != '#')
             break;
     }

     //
     //  There could be a trailing comment on this line, so cut the line at
     //  the first # char, if there is one.
     //
     char* endPtr = toFill;
     while (*endPtr && (*endPtr != '#'))
         endPtr++;
     *endPtr = 0;

     //
     //  Strip trailing whitespace. We stop at the start of the buffer so
     //  that an all-whitespace line never makes us look at toFill[-1].
     //
     while ((endPtr > toFill) && isspace((unsigned char)*(endPtr - 1)))
         endPtr--;
     *endPtr = 0;

     // And return the count of chars we got
     return (unsigned int)(endPtr - toFill);
 }


 //
 //  Converts the text of a header value into a number. Leading whitespace
 //  is skipped. A value of the form \xXX is read as hex, anything else is
 //  read as decimal. The number must run to the end of the string.
 //
 //  srcStr      The null terminated text that follows the header keyword.
 //
 //  Returns the converted value. A missing, empty, or malformed value is
 //  reported with the current line number and terminates the program.
 //
 static unsigned int extractVal(char* const srcStr)
 {
     char* srcPtr = srcStr;

     // Run forward to the first non-space
     while (isspace((unsigned char)*srcPtr))
         srcPtr++;

     if (!*srcPtr)
     {
         cout << "Invalid numeric value on line: " << fLineNum << endl;
         exit(1);
     }

     //
     //  If it starts with \, then it has to be a hex value in the form \xXX.
     //  Check for the x before skipping it, so that a lone \ at the end of
     //  the string cannot carry us past the terminating null.
     //
     int radix = 10;
     if (*srcPtr == '\\')
     {
         if ((srcPtr[1] != 'x') && (srcPtr[1] != 'X'))
         {
             cout << "Invalid numeric value on line: " << fLineNum << endl;
             exit(1);
         }
         srcPtr += 2;
         radix = 16;
     }

     char* endPtr;
     const unsigned int retVal = (unsigned int)strtoul(srcPtr, &endPtr, radix);

     //
     //  We should have translated at least one digit, and all the way up to
     //  the end of the string.
     //
     if ((endPtr == srcPtr) || *endPtr)
     {
         cout << "Invalid numeric value on line: " << fLineNum << endl;
         exit(1);
     }

     return retVal;
 }


 static void loadTable()
 {
     //
     //  Just loop, reading lines at a time, until we either find the start
     //  of the character table or hit the end of the file. Along the way, we
     //  should see a few header values that we store away.
     //
     const unsigned int  tmpBufSz = 2048;
     char                tmpBuf[tmpBufSz - 1];
     while (getLine(tmpBuf, tmpBufSz))
     {
         //
         //  Check for one of the special values we are intersted int. If
         //  its CHARMAP, then we fall out of this loop.
         //
         if (!strcmp(tmpBuf, "CHARMAP"))
             break;

         if (!strncmp(tmpBuf, "<mb_cur_max>", 12))
         {
             gMaxChar = extractVal(&tmpBuf[12]);
         }
          else if (!strncmp(tmpBuf, "<mb_cur_min>", 12))
         {
             gMinChar = extractVal(&tmpBuf[12]);
         }
          else if (!strncmp(tmpBuf, "<subchar>", 9))
         {
             gRepChar = (char)extractVal(&tmpBuf[9]);
         }
     }

     //
     //  Ok, now we just run till we hit the "END CHARMAP" line. Each entry
     //  will be in the form:
     //
     //      <UXXXX>     \xXX
     //
     //  Where X is a hex number.
     //
     char* endPtr;
     while (getLine(tmpBuf, tmpBufSz))
     {
         // Watch for the end of table
         if (!strcmp(tmpBuf, "END CHARMAP"))
             break;

         // The absolute minium it could be is 12 chars
         if (strlen(tmpBuf) < 12)
         {
             cout << "Line " << fLineNum << " is too short to hold a valid entry"
                  << endl;
             exit(1);
         }

         // Make sure the first token meets the criteria
         if ((tmpBuf[0] != '<')
         ||  (tmpBuf[1] != 'U')
         ||  (tmpBuf[6] != '>'))
         {
             cout << "Line " << fLineNum << " has a badly formed Unicode value"
                  << endl;
             exit(1);
         }

         //
         //  Looks reasonable so lets try to convert it. We can play tricks
         //  with this buffer, so put a null over the > char.
         //
         tmpBuf[6] = 0;
         const unsigned int uniVal = strtoul(&tmpBuf[2], &endPtr, 16);
         if (*endPtr)
         {
             cout << "Invalid Unicode value on line " << fLineNum << endl;
             exit(1);
         }

         //
         //  Ok, lets search over to the second token. We have to find a \\
         //  character.
         //
         char* srcPtr = &tmpBuf[7];
         while (*srcPtr && (*srcPtr != '\\'))
             srcPtr++;

         // If we never found it, its in error
         if (!*srcPtr)
         {
             cout << "Never found second token on line " << fLineNum << endl;
             exit(1);
         }

         // Try to translate it
         srcPtr += 2;
         const unsigned int cpVal = strtoul(srcPtr, &endPtr, 16);
         if (*endPtr)
         {
             cout << "Invalid code page value on line " << fLineNum << endl;
             exit(1);
         }

         // Make sure that the values are within range
         if (uniVal > 0xFFFF)
         {
             cout << "Unicode value is too big on line " << fLineNum << endl;
             exit(1);
         }

         if (cpVal > 0xFF)
         {
             cout << "Code page value is too big on line " << fLineNum << endl;
             exit(1);
         }

         // Looks reasonable, so add a new entry to the global table
         gMainTable[gMainTableSz].uniVal = (unsigned short)uniVal;
         gMainTable[gMainTableSz].cpVal = (unsigned char)cpVal;
         gMainTableSz++;
     }
 }


 int compFuncTo(const void* p1, const void* p2)
 {
     const XlatRec* rec1 = (const XlatRec*)p1;
     const XlatRec* rec2 = (const XlatRec*)p2;

     return (int)rec1->uniVal - (int)rec2->uniVal;
 }


 int compFuncFrom(const void* p1, const void* p2)
 {
     const XlatRec* rec1 = (const XlatRec*)p1;
     const XlatRec* rec2 = (const XlatRec*)p2;

     //
     //  Since there can be multiple Unicode chars that map to a single
     //  code page char, we have to handle the situationw here they are
     //  equal specially. If the code page vals are equal, then the one
     //  with the smaller Unicode code point is considered smaller.
     //
     if (rec1->cpVal == rec2->cpVal)
         return (int)rec1->uniVal - (int)rec2->uniVal;

     // Else use the code page value for sorting
     return (int)rec1->cpVal - (int)rec2->cpVal;
 }


 //
 //  Writes the two single byte tables to gOutFile as C++ source:
 //
 //  gFromTable  256 Unicode values indexed by code page value. A code page
 //              value with no mapping gets 0xFFFF. If several Unicode values
 //              map to one code page value, the smallest one is used.
 //  gToTable    One { Unicode, code page } pair per loaded record, sorted by
 //              Unicode value, followed by the gToTableSz constant.
 //
 //  gMainTable is re-sorted in place. If the header did not describe a
 //  single byte encoding, an error is reported and the program terminates.
 //
 static void formatSBTables()
 {
     // For now, only handle single byte char sets
     if ((gMinChar != 1) || (gMaxChar != 1))
     {
         cout << "formatSBTables can only handle single byte encodings"
              << endl;
         exit(1);
     }

     //
     //  First, we want to sort the table by the code page value field. This
     //  is the order required for the 'from' table to convert from the code
     //  page to the internal Unicode format.
     //
     qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncFrom);

     //
     //  Now spit out the header for the table. This is the same for all
     //  of them, since they are static to the file and can just all have
     //  the same name.
     //
     fprintf
     (
         gOutFile
         , "static const XMLCh gFromTable[256] =\n{\n    "
     );

     //
     //  Walk every possible code page value, so that the table always gets
     //  all 256 slots. The record index only moves forward when the current
     //  record has actually been used, so a gap in the code page values
     //  cannot swallow the record that follows it.
     //
     unsigned int index = 0;
     unsigned int curValue;
     for (curValue = 0; curValue < 256; curValue++)
     {
         if (curValue)
         {
             if (!(curValue % 8))
                 fprintf(gOutFile, "\n  , ");
             else
                 fprintf(gOutFile, ", ");
         }

         if ((index < gMainTableSz) && (gMainTable[index].cpVal == curValue))
         {
             //
             //  The sort put the smallest Unicode value first, so that is
             //  the one we output. Then skip it and any dups of it.
             //
             fprintf(gOutFile, "0x%04X", (unsigned int)gMainTable[index].uniVal);
             while ((index < gMainTableSz)
             &&     (gMainTable[index].cpVal == curValue))
             {
                 index++;
             }
         }
          else
         {
             // Nothing maps to this code page value
             fprintf(gOutFile, "0xFFFF");
         }
     }

     // And print the trailer for this table
     fprintf(gOutFile, "\n};\n\n");


     //
     //  Now lets sort by the Unicode value field. This sort is used for
     //  the 'to' table. The Unicode value is found by binary search and
     //  used to map to the right output encoding value.
     //
     qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncTo);

     // Output the table header for this one
     fprintf
     (
         gOutFile
         , "static const XMLTransService::TransRec gToTable[] =\n{\n    "
     );

     for (index = 0; index < gMainTableSz; index++)
     {
         if (index)
         {
             if (!(index % 4))
                 fprintf(gOutFile, "\n  , ");
             else
                 fprintf(gOutFile, ", ");
         }

         fprintf
         (
             gOutFile
             , "{ 0x%04X, 0x%02X }"
             , (unsigned int)gMainTable[index].uniVal
             , (unsigned int)gMainTable[index].cpVal
         );
     }

     // Print the trailer for this table
     fprintf(gOutFile, "\n};\n");

     // And print out the table size constant (it is unsigned, so use %u)
     fprintf(gOutFile, "static const unsigned int gToTableSz = %u;\n", gMainTableSz);
 }

 //  Prints the command line syntax to the standard output.
 static void showUsage()
 {
     static const char* const usageText = "ICUData inputUCMfile outputfile\n";
     cout << usageText << endl;
 }


 // ---------------------------------------------------------------------------
 //  The parameters are:
 //
 //  argV[1] = The source UCM file
 //  argV[2] = The path to the output file
 // ---------------------------------------------------------------------------
 //
 //  Program entry point. Opens the input and output files named on the
 //  command line, loads the table from the input file, and writes the
 //  formatted tables to the output file.
 //
 //  Returns 0 on success, or 1 if the parameters are wrong, a file cannot
 //  be opened, no table entries were found, or the output cannot be closed.
 //
 int main(int argC, char** argV)
 {
     // We have to have 3 parameters
     if (argC != 3)
     {
         showUsage();
         return 1;
     }

     // Try to open the first file for input
     gInFile = fopen(argV[1], "rt");
     if (!gInFile)
     {
         cout << "Could not find input file: " << argV[1] << endl;
         return 1;
     }

     // Try to open the second file for output (truncated)
     gOutFile = fopen(argV[2], "wt+");
     if (!gOutFile)
     {
         // Its the output file that failed, so that is the name to report
         cout << "Could not create output file: " << argV[2] << endl;
         fclose(gInFile);
         return 1;
     }

     //
     //  This will parse the file and load the table. It will also look for
     //  a couple of key fields in the file header and store that data into
     //  globals.
     //
     loadTable();

     // If we didn't get any table entries, then give up
     if (!gMainTableSz)
     {
         cout << "No translation table entries were found in the file" << endl;
         fclose(gInFile);
         fclose(gOutFile);
         return 1;
     }

     //
     //  Ok, we got the data loaded. Now lets output the tables. This method
     //  spits out both tables to the output file, in a format ready to be
     //  incorporated directly into the source code.
     //
     formatSBTables();

     //
     //  Close our files. The close of the output file is what flushes the
     //  last of the data, so a failure there means the output is incomplete.
     //
     fclose(gInFile);
     if (fclose(gOutFile))
     {
         cout << "Could not finish writing output file: " << argV[2] << endl;
         return 1;
     }

     return 0;
 }
	/*
	* The Apache Software License, Version 1.1
	*
	* Copyright (c) 1999-2000 The Apache Software Foundation. All rights
	* reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	*
	* 1. Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	*
	* 2. Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* 3. The end-user documentation included with the redistribution,
	* if any, must include the following acknowledgment:
	* "This product includes software developed by the
	* Apache Software Foundation (http://www.apache.org/)."
	* Alternately, this acknowledgment may appear in the software itself,
	* if and wherever such third-party acknowledgments normally appear.
	*
	* 4. The names "Xerces" and "Apache Software Foundation" must
	* not be used to endorse or promote products derived from this
	* software without prior written permission. For written
	* permission, please contact apache\@apache.org.
	*
	* 5. Products derived from this software may not be called "Apache",
	* nor may "Apache" appear in their name, without prior written
	* permission of the Apache Software Foundation.
	*
	* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
	* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
	* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
	* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
	* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	* ====================================================================
	*
	* This software consists of voluntary contributions made by many
	* individuals on behalf of the Apache Software Foundation, and was
	* originally based on software copyright (c) 1999, International
	* Business Machines, Inc., http://www.ibm.com . For more information
	* on the Apache Software Foundation, please see
	* <http://www.apache.org/>.
	*/


	/*
	* $Log$
	* Revision 1.1 2000/03/17 23:58:00 roddey
	* New utility for munging ICU UCM files and spitting out tables for
	* our intrinsic encoders.
	*
	*/

	// ---------------------------------------------------------------------------
	// This program is designed to parse a standard ICU .UCM file and spit out
	// a C++ code fragment that represents the tables required by the intrinsic
	// XML parser transcoders.
	//
	// The file format is pretty simple and this program is not intended to be
	// industrial strength by any means. Its use by anyone but the author is
	// at the user's own risk.
	//
	// The code looks for the min/max bytes per character to know what kind of
	// table to spit out, but for now it only handles single-byte character sets.
	// ---------------------------------------------------------------------------


	// ---------------------------------------------------------------------------
	// Includes
	// ---------------------------------------------------------------------------
	#include <ctype.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <iostream.h>
	#include <string.h>


	// ---------------------------------------------------------------------------
	// Const data
	// ---------------------------------------------------------------------------
	// Number of XlatRec slots in the gMainTable array.
	static const unsigned int gMaxInRecs = 1024;


	// ---------------------------------------------------------------------------
	// Local data types
	// ---------------------------------------------------------------------------
	// One translation record read from the CHARMAP section of the input file:
	// a Unicode value paired with the code page value it maps to.
	struct XlatRec
	{
	// Unicode value; loadTable() rejects anything above 0xFFFF before storing
	unsigned short uniVal;
	// Code page value; loadTable() rejects anything above 0xFF before storing
	unsigned char cpVal;
	};


	// ---------------------------------------------------------------------------
	// Local data
	//
	// gInFile
	// gOutFile
	// These are the file stream for the input UCM file and the output file
	// that we write the C++ code to.
	//
	// fLineNum
	// Used to track the current line number in the source file, for error
	// reporting.
	//
	// gMainTable
	// gMainTableSz
	// This is the table that is filled in from the original source document.
	// We don't know how big it will be, but its not likely to be much more
	// than 300 entries or so (256 output code points with some multiply
	// mapped Unicode code points.) So we make it extra large and watch for
	// possible overflow.
	//
	// The size value is bumped up as we load entries into it during the
	// parse of the file.
	//
	// gMaxChar
	// gMinChar
	// The maximum and minimum number of bytes used to represent a
	// character. These are read from the <mb_cur_max> and <mb_cur_min>
	// values in the header of the input file.
	//
	// gRepChar
	// The replacement character to be used. This is read from the header of
	// the input file.
	// ---------------------------------------------------------------------------
	// Input stream, opened by main() from argV[1]
	static FILE* gInFile;
	// Output stream, opened by main() from argV[2]
	static FILE* gOutFile;
	// Incremented by getLine() after every line it reads
	static unsigned int fLineNum;
	// Records loaded by loadTable(); re-sorted in place by formatSBTables()
	static XlatRec gMainTable[gMaxInRecs];
	// Number of entries of gMainTable currently in use
	static unsigned int gMainTableSz = 0;
	// Set by loadTable() from the <mb_cur_max> header value
	static unsigned int gMaxChar;
	// Set by loadTable() from the <mb_cur_min> header value
	static unsigned int gMinChar;
	// Starts at 1; overwritten by loadTable() from the <subchar> header value
	static unsigned char gRepChar = 1;


	// ---------------------------------------------------------------------------
	// Local functions
	// ---------------------------------------------------------------------------
	//
	// Reads the next line of gInFile that does not start with '#', removes any
	// trailing '#' comment and any trailing whitespace (including the newline),
	// and returns the length of what is left. The result can legitimately be
	// zero for a blank line.
	//
	// toFill      Buffer that receives the line; must hold maxChars chars.
	// maxChars    Size of toFill, as passed on to fgets().
	// eofOk       If true, hitting end of input returns ~0 instead of being
	//             treated as a fatal error.
	//
	// Any read error, or an end of input when eofOk is false, is reported and
	// terminates the program.
	//
	static unsigned int getLine( char* const toFill
	    , const unsigned int maxChars
	    , const bool eofOk = false)
	{
	    while (true)
	    {
	        if (!fgets(toFill, maxChars, gInFile))
	        {
	            if (feof(gInFile))
	            {
	                if (eofOk)
	                    return (unsigned int)~0UL;

	                cout << "Unexpected end of input at line: " << fLineNum << endl;
	            }
	            else
	            {
	                cout << "Error processing input at line: " << fLineNum << endl;
	            }

	            //
	            // Both failures are fatal. Falling through would re-process
	            // whatever stale text is still sitting in the buffer.
	            //
	            exit(1);
	        }
	        fLineNum++;

	        // If its not a comment line, then we have our line
	        if (toFill[0] != '#')
	            break;
	    }

	    //
	    // There could be a trailing comment on this line, so cut the line at
	    // the first # char, if there is one. The chars must be compared, not
	    // the pointer itself.
	    //
	    char* endPtr = toFill;
	    while (*endPtr && (*endPtr != '#'))
	        endPtr++;
	    *endPtr = 0;

	    //
	    // Strip trailing whitespace. We stop at the start of the buffer so
	    // that an all-whitespace line never makes us look at toFill[-1].
	    //
	    while ((endPtr > toFill) && isspace((unsigned char)*(endPtr - 1)))
	        endPtr--;
	    *endPtr = 0;

	    // And return the count of chars we got
	    return (unsigned int)(endPtr - toFill);
	}


	//
	// Converts the text of a header value into a number. Leading whitespace
	// is skipped. A value of the form \xXX is read as hex, anything else is
	// read as decimal. The number must run to the end of the string.
	//
	// srcStr      The null terminated text that follows the header keyword.
	//
	// Returns the converted value. A missing, empty, or malformed value is
	// reported with the current line number and terminates the program.
	//
	static unsigned int extractVal(char* const srcStr)
	{
	    char* srcPtr = srcStr;

	    // Run forward to the first non-space
	    while (isspace((unsigned char)*srcPtr))
	        srcPtr++;

	    if (!*srcPtr)
	    {
	        cout << "Invalid numeric value on line: " << fLineNum << endl;
	        exit(1);
	    }

	    //
	    // If it starts with \, then it has to be a hex value in the form \xXX.
	    // Check for the x before skipping it, so that a lone \ at the end of
	    // the string cannot carry us past the terminating null.
	    //
	    int radix = 10;
	    if (*srcPtr == '\\')
	    {
	        if ((srcPtr[1] != 'x') && (srcPtr[1] != 'X'))
	        {
	            cout << "Invalid numeric value on line: " << fLineNum << endl;
	            exit(1);
	        }
	        srcPtr += 2;
	        radix = 16;
	    }

	    char* endPtr;
	    const unsigned int retVal = (unsigned int)strtoul(srcPtr, &endPtr, radix);

	    //
	    // We should have translated at least one digit, and all the way up to
	    // the end of the string.
	    //
	    if ((endPtr == srcPtr) || *endPtr)
	    {
	        cout << "Invalid numeric value on line: " << fLineNum << endl;
	        exit(1);
	    }

	    return retVal;
	}


	static void loadTable()
	{
	//
	// Just loop, reading lines at a time, until we either find the start
	// of the character table or hit the end of the file. Along the way, we
	// should see a few header values that we store away.
	//
	const unsigned int tmpBufSz = 2048;
	char tmpBuf[tmpBufSz - 1];
	while (getLine(tmpBuf, tmpBufSz))
	{
	//
	// Check for one of the special values we are intersted int. If
	// its CHARMAP, then we fall out of this loop.
	//
	if (!strcmp(tmpBuf, "CHARMAP"))
	break;

	if (!strncmp(tmpBuf, "<mb_cur_max>", 12))
	{
	gMaxChar = extractVal(&tmpBuf[12]);
	}
	else if (!strncmp(tmpBuf, "<mb_cur_min>", 12))
	{
	gMinChar = extractVal(&tmpBuf[12]);
	}
	else if (!strncmp(tmpBuf, "<subchar>", 9))
	{
	gRepChar = (char)extractVal(&tmpBuf[9]);
	}
	}

	//
	// Ok, now we just run till we hit the "END CHARMAP" line. Each entry
	// will be in the form:
	//
	// <UXXXX> \xXX
	//
	// Where X is a hex number.
	//
	char* endPtr;
	while (getLine(tmpBuf, tmpBufSz))
	{
	// Watch for the end of table
	if (!strcmp(tmpBuf, "END CHARMAP"))
	break;

	// The absolute minium it could be is 12 chars
	if (strlen(tmpBuf) < 12)
	{
	cout << "Line " << fLineNum << " is too short to hold a valid entry"
	<< endl;
	exit(1);
	}

	// Make sure the first token meets the criteria
	if ((tmpBuf[0] != '<')
	\|\| (tmpBuf[1] != 'U')
	\|\| (tmpBuf[6] != '>'))
	{
	cout << "Line " << fLineNum << " has a badly formed Unicode value"
	<< endl;
	exit(1);
	}

	//
	// Looks reasonable so lets try to convert it. We can play tricks
	// with this buffer, so put a null over the > char.
	//
	tmpBuf[6] = 0;
	const unsigned int uniVal = strtoul(&tmpBuf[2], &endPtr, 16);
	if (*endPtr)
	{
	cout << "Invalid Unicode value on line " << fLineNum << endl;
	exit(1);
	}

	//
	// Ok, lets search over to the second token. We have to find a \\
	// character.
	//
	char* srcPtr = &tmpBuf[7];
	while (srcPtr && (srcPtr != '\\'))
	srcPtr++;

	// If we never found it, its in error
	if (!*srcPtr)
	{
	cout << "Never found second token on line " << fLineNum << endl;
	exit(1);
	}

	// Try to translate it
	srcPtr += 2;
	const unsigned int cpVal = strtoul(srcPtr, &endPtr, 16);
	if (*endPtr)
	{
	cout << "Invalid code page value on line " << fLineNum << endl;
	exit(1);
	}

	// Make sure that the values are within range
	if (uniVal > 0xFFFF)
	{
	cout << "Unicode value is too big on line " << fLineNum << endl;
	exit(1);
	}

	if (cpVal > 0xFF)
	{
	cout << "Code page value is too big on line " << fLineNum << endl;
	exit(1);
	}

	// Looks reasonable, so add a new entry to the global table
	gMainTable[gMainTableSz].uniVal = (unsigned short)uniVal;
	gMainTable[gMainTableSz].cpVal = (unsigned char)cpVal;
	gMainTableSz++;
	}
	}



	int compFuncTo(const void* p1, const void* p2)
	{
	const XlatRec* rec1 = (const XlatRec*)p1;
	const XlatRec* rec2 = (const XlatRec*)p2;

	return (int)rec1->uniVal - (int)rec2->uniVal;
	}


	int compFuncFrom(const void* p1, const void* p2)
	{
	const XlatRec* rec1 = (const XlatRec*)p1;
	const XlatRec* rec2 = (const XlatRec*)p2;

	//
	// Since there can be multiple Unicode chars that map to a single
	// code page char, we have to handle the situationw here they are
	// equal specially. If the code page vals are equal, then the one
	// with the smaller Unicode code point is considered smaller.
	//
	if (rec1->cpVal == rec2->cpVal)
	return (int)rec1->uniVal - (int)rec2->uniVal;

	// Else use the code page value for sorting
	return (int)rec1->cpVal - (int)rec2->cpVal;
	}


	//
	// Writes the two single byte tables to gOutFile as C++ source:
	//
	// gFromTable  256 Unicode values indexed by code page value. A code page
	//             value with no mapping gets 0xFFFF. If several Unicode values
	//             map to one code page value, the smallest one is used.
	// gToTable    One { Unicode, code page } pair per loaded record, sorted by
	//             Unicode value, followed by the gToTableSz constant.
	//
	// gMainTable is re-sorted in place. If the header did not describe a
	// single byte encoding, an error is reported and the program terminates.
	//
	static void formatSBTables()
	{
	    // For now, only handle single byte char sets
	    if ((gMinChar != 1) || (gMaxChar != 1))
	    {
	        cout << "formatSBTables can only handle single byte encodings"
	             << endl;
	        exit(1);
	    }

	    //
	    // First, we want to sort the table by the code page value field. This
	    // is the order required for the 'from' table to convert from the code
	    // page to the internal Unicode format.
	    //
	    qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncFrom);

	    //
	    // Now spit out the header for the table. This is the same for all
	    // of them, since they are static to the file and can just all have
	    // the same name.
	    //
	    fprintf
	    (
	        gOutFile
	        , "static const XMLCh gFromTable[256] =\n{\n "
	    );

	    //
	    // Walk every possible code page value, so that the table always gets
	    // all 256 slots. The record index only moves forward when the current
	    // record has actually been used, so a gap in the code page values
	    // cannot swallow the record that follows it.
	    //
	    unsigned int index = 0;
	    unsigned int curValue;
	    for (curValue = 0; curValue < 256; curValue++)
	    {
	        if (curValue)
	        {
	            if (!(curValue % 8))
	                fprintf(gOutFile, "\n , ");
	            else
	                fprintf(gOutFile, ", ");
	        }

	        if ((index < gMainTableSz) && (gMainTable[index].cpVal == curValue))
	        {
	            //
	            // The sort put the smallest Unicode value first, so that is
	            // the one we output. Then skip it and any dups of it.
	            //
	            fprintf(gOutFile, "0x%04X", (unsigned int)gMainTable[index].uniVal);
	            while ((index < gMainTableSz)
	            &&     (gMainTable[index].cpVal == curValue))
	            {
	                index++;
	            }
	        }
	        else
	        {
	            // Nothing maps to this code page value
	            fprintf(gOutFile, "0xFFFF");
	        }
	    }

	    // And print the trailer for this table
	    fprintf(gOutFile, "\n};\n\n");


	    //
	    // Now lets sort by the Unicode value field. This sort is used for
	    // the 'to' table. The Unicode value is found by binary search and
	    // used to map to the right output encoding value.
	    //
	    qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncTo);

	    // Output the table header for this one
	    fprintf
	    (
	        gOutFile
	        , "static const XMLTransService::TransRec gToTable[] =\n{\n "
	    );

	    for (index = 0; index < gMainTableSz; index++)
	    {
	        if (index)
	        {
	            if (!(index % 4))
	                fprintf(gOutFile, "\n , ");
	            else
	                fprintf(gOutFile, ", ");
	        }

	        fprintf
	        (
	            gOutFile
	            , "{ 0x%04X, 0x%02X }"
	            , (unsigned int)gMainTable[index].uniVal
	            , (unsigned int)gMainTable[index].cpVal
	        );
	    }

	    // Print the trailer for this table
	    fprintf(gOutFile, "\n};\n");

	    // And print out the table size constant (it is unsigned, so use %u)
	    fprintf(gOutFile, "static const unsigned int gToTableSz = %u;\n", gMainTableSz);
	}

	// Prints the command line syntax to the standard output.
	static void showUsage()
	{
	    static const char* const usageText = "ICUData inputUCMfile outputfile\n";
	    cout << usageText << endl;
	}



	// ---------------------------------------------------------------------------
	// The parameters are:
	//
	// argV[1] = The source UCM file
	// argV[2] = The path to the output file
	// ---------------------------------------------------------------------------
	//
	// Program entry point. Opens the input and output files named on the
	// command line, loads the table from the input file, and writes the
	// formatted tables to the output file.
	//
	// Returns 0 on success, or 1 if the parameters are wrong, a file cannot
	// be opened, no table entries were found, or the output cannot be closed.
	//
	int main(int argC, char** argV)
	{
	    // We have to have 3 parameters
	    if (argC != 3)
	    {
	        showUsage();
	        return 1;
	    }

	    // Try to open the first file for input
	    gInFile = fopen(argV[1], "rt");
	    if (!gInFile)
	    {
	        cout << "Could not find input file: " << argV[1] << endl;
	        return 1;
	    }

	    // Try to open the second file for output (truncated)
	    gOutFile = fopen(argV[2], "wt+");
	    if (!gOutFile)
	    {
	        // Its the output file that failed, so that is the name to report
	        cout << "Could not create output file: " << argV[2] << endl;
	        fclose(gInFile);
	        return 1;
	    }

	    //
	    // This will parse the file and load the table. It will also look for
	    // a couple of key fields in the file header and store that data into
	    // globals.
	    //
	    loadTable();

	    // If we didn't get any table entries, then give up
	    if (!gMainTableSz)
	    {
	        cout << "No translation table entries were found in the file" << endl;
	        fclose(gInFile);
	        fclose(gOutFile);
	        return 1;
	    }

	    //
	    // Ok, we got the data loaded. Now lets output the tables. This method
	    // spits out both tables to the output file, in a format ready to be
	    // incorporated directly into the source code.
	    //
	    formatSBTables();

	    //
	    // Close our files. The close of the output file is what flushes the
	    // last of the data, so a failure there means the output is incomplete.
	    //
	    fclose(gInFile);
	    if (fclose(gOutFile))
	    {
	        cout << "Could not finish writing output file: " << argV[2] << endl;
	        return 1;
	    }

	    return 0;
	}