blob: 19e08a4fc19564d36b7589cec1d767ea3913424e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Id$
*/
// ---------------------------------------------------------------------------
// This program is designed to parse a standard ICU .UCM file and spit out
// a C++ code fragment that represents the tables required by the intrinsic
// XML parser transcoders.
//
// The file format is pretty simple and this program is not intended to be
// industrial strength by any means. Its use by anyone but the author is
// at the user's own risk.
//
// The code reads the min/max bytes per character from the file header to
// decide what kind of table to emit, but for now it only handles single-byte
// character sets.
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream.h>
#include <string.h>
// ---------------------------------------------------------------------------
// Const data
// ---------------------------------------------------------------------------
// Capacity, in records, of the in-memory translation table (gMainTable).
static const unsigned int gMaxInRecs = 1024U;
// ---------------------------------------------------------------------------
// Local data types
// ---------------------------------------------------------------------------
// One mapping read from the CHARMAP section of the input file: the value
// taken from the <UXXXX> token paired with the value taken from the \xXX
// token on the same line.
struct XlatRec
{
    unsigned short  uniVal;     // Value parsed from the <UXXXX> token
    unsigned char   cpVal;      // Value parsed from the \xXX token
};
// ---------------------------------------------------------------------------
// Local data
//
// gInFile
// gOutFile
// These are the file stream for the input UCM file and the output file
// that we write the C++ code to.
//
// fLineNum
// Used to track the current line number in the source file, for error
// reporting.
//
// gMainTable
// gMainTableSz
// This is the table that is filled in from the original source document.
// We don't know how big it will be, but it's not likely to be much more
// than 300 entries or so (256 output code points, with some Unicode code
// points mapped to the same output value). So we make it extra large and
// watch for possible overflow.
//
// The size value is bumped up as we load entries into it during the
// parse of the file.
//
// gMaxChar
// gMinChar
// The min/max chars that are used to represent a character. These are
// read from the header of the input file.
//
// gRepChar
// The replacement character to be used. This is read from the header of
// the input file.
// ---------------------------------------------------------------------------
// Input and output streams; both are opened in main()
static FILE*            gInFile = 0;
static FILE*            gOutFile = 0;

// Number of input lines read so far (used in error messages)
static unsigned int     fLineNum = 0;

// The translation table loaded from the input file, and its fill count
static XlatRec          gMainTable[gMaxInRecs];
static unsigned int     gMainTableSz = 0;

// Header values read from the input file
static unsigned int     gMaxChar = 0;
static unsigned int     gMinChar = 0;
static unsigned char    gRepChar = 1;
// ---------------------------------------------------------------------------
// Local functions
// ---------------------------------------------------------------------------
//
//  Reads the next line from gInFile that is not a full-line comment (a line
//  whose first character is '#'), bumping fLineNum for every line read. Any
//  trailing '#' comment and any trailing whitespace (including the newline)
//  are removed from the buffer before returning.
//
//  toFill      The buffer to read into. Must hold at least maxChars chars.
//  maxChars    The size passed to fgets(), i.e. the most chars (including
//              the terminating null) that will be stored into toFill.
//  eofOk       If true, hitting end of input returns ~0U. If false, end of
//              input is reported as an error and the program exits.
//
//  Returns the length of the cleaned-up line, which is zero for a blank
//  line, or ~0U on end of input when eofOk is true. Read errors always
//  print a message and exit the program.
//
static unsigned int getLine(        char* const     toFill
                            , const unsigned int    maxChars
                            , const bool            eofOk = false)
{
    while (true)
    {
        if (!fgets(toFill, (int)maxChars, gInFile))
        {
            if (!feof(gInFile))
            {
                cout << "Error processing input at line: " << fLineNum << endl;
                exit(1);
            }

            if (eofOk)
                return ~0U;

            //
            //  The caller cannot cope with end of input, so stop here. Just
            //  falling through would hand back whatever stale text is still
            //  in the buffer.
            //
            cout << "Unexpected end of input at line: " << fLineNum << endl;
            exit(1);
        }
        fLineNum++;

        // If it's not a full-line comment, then we have our line
        if (toFill[0] != '#')
            break;
    }

    // There could be a trailing comment on this line, so cut it off
    char* const hashPos = strchr(toFill, '#');
    if (hashPos)
        *hashPos = 0;

    //
    //  Strip trailing whitespace. We work with a length rather than a
    //  pointer so that an empty or all-whitespace line can never make us
    //  step back past the start of the buffer.
    //
    size_t curLen = strlen(toFill);
    while (curLen && isspace((unsigned char)toFill[curLen - 1]))
        curLen--;
    toFill[curLen] = 0;

    // And return the count of chars we got
    return (unsigned int)curLen;
}
//
//  Converts the text of a header value into a number. Leading whitespace is
//  skipped. If the value starts with a backslash it must be in the form
//  \xXX and is read as hex; otherwise it is read as decimal. The number
//  must run to the end of the string.
//
//  srcStr      The null terminated text to convert. It is not modified.
//
//  Returns the converted value. Any malformed value prints an error that
//  includes the current line number and exits the program.
//
static unsigned int extractVal(char* const srcStr)
{
    const char* srcPtr = srcStr;

    // Run forward to the first non-space
    while (isspace((unsigned char)*srcPtr))
        srcPtr++;

    bool isValid = (*srcPtr != 0);
    unsigned long retVal = 0;
    if (isValid)
    {
        int radix = 10;
        if (*srcPtr == '\\')
        {
            //
            //  Make sure the x is really there before skipping over it, or
            //  a lone trailing backslash would carry us past the null.
            //
            if ((srcPtr[1] == 'x') || (srcPtr[1] == 'X'))
            {
                srcPtr += 2;
                radix = 16;
            }
            else
            {
                isValid = false;
            }
        }

        if (isValid)
        {
            char* endPtr;
            retVal = strtoul(srcPtr, &endPtr, radix);

            //
            //  We must have consumed at least one digit and must have
            //  translated up to the end of the string.
            //
            if ((endPtr == srcPtr) || *endPtr)
                isValid = false;
        }
    }

    if (!isValid)
    {
        cout << "Invalid numeric value on line: " << fLineNum << endl;
        exit(1);
    }
    return (unsigned int)retVal;
}
//
//  Parses the input file into the global data. The header is scanned for
//  the <mb_cur_max>, <mb_cur_min> and <subchar> values, which are stored in
//  gMaxChar, gMinChar and gRepChar. Then every entry between the CHARMAP
//  and END CHARMAP lines is appended to gMainTable, bumping gMainTableSz.
//
//  Blank lines are skipped in both sections. Any malformed entry, a table
//  with more than gMaxInRecs entries, or running out of input before the
//  END CHARMAP line prints an error and exits the program.
//
static void loadTable()
{
    //
    //  The buffer is the full size that we tell getLine() about, since
    //  fgets() can store that many chars including the null.
    //
    const unsigned int tmpBufSz = 2048;
    char tmpBuf[tmpBufSz];

    // What getLine() returns for end of input when asked to allow it
    const unsigned int eofMarker = ~0U;

    //
    //  Just loop, reading lines at a time, until we find the start of the
    //  character table. Along the way, we should see a few header values
    //  that we store away.
    //
    unsigned int lineLen;
    while (true)
    {
        lineLen = getLine(tmpBuf, tmpBufSz, true);
        if (lineLen == eofMarker)
        {
            cout << "Unexpected end of input at line: " << fLineNum << endl;
            exit(1);
        }

        // Blank lines carry no information, so just skip them
        if (!lineLen)
            continue;

        //
        //  Check for one of the special values we are interested in. If
        //  it's CHARMAP, then we fall out of this loop.
        //
        if (!strcmp(tmpBuf, "CHARMAP"))
            break;

        if (!strncmp(tmpBuf, "<mb_cur_max>", 12))
            gMaxChar = extractVal(&tmpBuf[12]);
        else if (!strncmp(tmpBuf, "<mb_cur_min>", 12))
            gMinChar = extractVal(&tmpBuf[12]);
        else if (!strncmp(tmpBuf, "<subchar>", 9))
            gRepChar = (unsigned char)extractVal(&tmpBuf[9]);
    }

    //
    //  Ok, now we just run till we hit the "END CHARMAP" line. Each entry
    //  will be in the form:
    //
    //      <UXXXX> \xXX
    //
    //  Where X is a hex digit.
    //
    char* endPtr;
    while (true)
    {
        lineLen = getLine(tmpBuf, tmpBufSz, true);
        if (lineLen == eofMarker)
        {
            cout << "Unexpected end of input at line: " << fLineNum << endl;
            exit(1);
        }

        if (!lineLen)
            continue;

        // Watch for the end of table
        if (!strcmp(tmpBuf, "END CHARMAP"))
            break;

        // The absolute minimum it could be is 12 chars
        if (lineLen < 12)
        {
            cout << "Line " << fLineNum << " is too short to hold a valid entry"
                 << endl;
            exit(1);
        }

        // Make sure the first token meets the criteria
        if ((tmpBuf[0] != '<')
        ||  (tmpBuf[1] != 'U')
        ||  (tmpBuf[6] != '>'))
        {
            cout << "Line " << fLineNum << " has a badly formed Unicode value"
                 << endl;
            exit(1);
        }

        //
        //  Looks reasonable so lets try to convert it. We can play tricks
        //  with this buffer, so put a null over the > char.
        //
        tmpBuf[6] = 0;
        const unsigned long uniVal = strtoul(&tmpBuf[2], &endPtr, 16);
        if ((endPtr == &tmpBuf[2]) || *endPtr)
        {
            cout << "Invalid Unicode value on line " << fLineNum << endl;
            exit(1);
        }

        //
        //  Ok, lets search over to the second token. We have to find a
        //  backslash character.
        //
        char* srcPtr = &tmpBuf[7];
        while (*srcPtr && (*srcPtr != '\\'))
            srcPtr++;

        // If we never found it, it's in error
        if (!*srcPtr)
        {
            cout << "Never found second token on line " << fLineNum << endl;
            exit(1);
        }

        //
        //  Make sure the x is there before we skip over it, so that a
        //  backslash at the very end of the line cannot carry us past the
        //  null terminator.
        //
        if ((srcPtr[1] != 'x') && (srcPtr[1] != 'X'))
        {
            cout << "Invalid code page value on line " << fLineNum << endl;
            exit(1);
        }
        srcPtr += 2;

        // Try to translate it. There must be at least one digit
        const unsigned long cpVal = strtoul(srcPtr, &endPtr, 16);
        if ((endPtr == srcPtr) || *endPtr)
        {
            cout << "Invalid code page value on line " << fLineNum << endl;
            exit(1);
        }

        // Make sure that the values are within range
        if (uniVal > 0xFFFF)
        {
            cout << "Unicode value is too big on line " << fLineNum << endl;
            exit(1);
        }

        if (cpVal > 0xFF)
        {
            cout << "Code page value is too big on line " << fLineNum << endl;
            exit(1);
        }

        // Don't run off the end of the global table
        if (gMainTableSz >= gMaxInRecs)
        {
            cout << "Too many table entries, overflowed at line " << fLineNum
                 << endl;
            exit(1);
        }

        // Looks reasonable, so add a new entry to the global table
        gMainTable[gMainTableSz].uniVal = (unsigned short)uniVal;
        gMainTable[gMainTableSz].cpVal = (unsigned char)cpVal;
        gMainTableSz++;
    }
}
int compFuncTo(const void* p1, const void* p2)
{
const XlatRec* rec1 = (const XlatRec*)p1;
const XlatRec* rec2 = (const XlatRec*)p2;
return (int)rec1->uniVal - (int)rec2->uniVal;
}
int compFuncFrom(const void* p1, const void* p2)
{
const XlatRec* rec1 = (const XlatRec*)p1;
const XlatRec* rec2 = (const XlatRec*)p2;
//
// Since there can be multiple Unicode chars that map to a single
// code page char, we have to handle the situationw here they are
// equal specially. If the code page vals are equal, then the one
// with the smaller Unicode code point is considered smaller.
//
if (rec1->cpVal == rec2->cpVal)
return (int)rec1->uniVal - (int)rec2->uniVal;
// Else use the code page value for sorting
return (int)rec1->cpVal - (int)rec2->cpVal;
}
//
//  Writes the two single byte tables to gOutFile as C++ source text:
//
//  gFromTable
//      Always 256 entries, indexed by code page value. Each slot holds the
//      Unicode value for that code page value, or 0xFFFF if the input had
//      no mapping for it. If more than one Unicode value maps to a code
//      page value, the smallest one is used.
//
//  gToTable / gToTableSz
//      Every loaded entry as a { Unicode, code page } pair, sorted by the
//      Unicode value, followed by a constant holding the entry count.
//
//  This re-sorts gMainTable in place. If the header values say this is not
//  a single byte encoding, an error is printed and the program exits.
//
static void formatSBTables()
{
    // For now, only handle single byte char sets
    if ((gMinChar != 1) || (gMaxChar != 1))
    {
        cout << "formatSBTables can only handle single byte encodings"
             << endl;
        exit(1);
    }

    //
    //  First, we want to sort the table by the code page value field. This
    //  is the order required for the 'from' table to convert from the code
    //  page to the internal Unicode format.
    //
    qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncFrom);

    //
    //  Now spit out the header for the table. This is the same for all
    //  of them, since they are static to the file and can just all have
    //  the same name.
    //
    fprintf
    (
        gOutFile
        , "static const XMLCh gFromTable[256] =\n{\n "
    );

    //
    //  Walk every possible code page value, so that we always put out
    //  exactly 256 slots. The index only moves forward when the entry it
    //  points at has been used, so a gap in the code page values never
    //  swallows a real entry. Since we sorted such that dups have the one
    //  with the smaller Unicode value first, we put out the first one and
    //  then skip however many dups follow it.
    //
    unsigned int index = 0;
    unsigned int curValue;
    for (curValue = 0; curValue < 256; curValue++)
    {
        if (curValue)
        {
            if (!(curValue % 8))
                fprintf(gOutFile, "\n , ");
            else
                fprintf(gOutFile, ", ");
        }

        if ((index < gMainTableSz) && (gMainTable[index].cpVal == curValue))
        {
            fprintf(gOutFile, "0x%04X", (unsigned int)gMainTable[index].uniVal);

            // Move past this entry and any dups of it
            while ((index < gMainTableSz)
            &&     (gMainTable[index].cpVal == curValue))
            {
                index++;
            }
        }
        else
        {
            // Nothing maps to this code page value
            fprintf(gOutFile, "0xFFFF");
        }
    }

    // And print the trailer for this table
    fprintf(gOutFile, "\n};\n\n");

    //
    //  Now lets sort by the Unicode value field. This sort is used for
    //  the 'to' table. The Unicode value is found by binary search and
    //  used to map to the right output encoding value.
    //
    qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncTo);

    // Output the table header for this one
    fprintf
    (
        gOutFile
        , "static const XMLTransService::TransRec gToTable[] =\n{\n "
    );

    for (index = 0; index < gMainTableSz; index++)
    {
        if (index)
        {
            if (!(index % 4))
                fprintf(gOutFile, "\n , ");
            else
                fprintf(gOutFile, ", ");
        }

        fprintf
        (
            gOutFile
            , "{ 0x%04X, 0x%02X }"
            , (unsigned int)gMainTable[index].uniVal
            , (unsigned int)gMainTable[index].cpVal
        );
    }

    // Print the trailer for this table
    fprintf(gOutFile, "\n};\n");

    // And print out the table size constant. The size is unsigned, so %u
    fprintf(gOutFile, "static const unsigned int gToTableSz = %u;\n", gMainTableSz);
}
//
//  Prints the command line usage summary to standard output.
//
static void showUsage()
{
    static const char* const usageText = "ICUData inputUCMfile outputfile\n";
    cout << usageText << endl;
}
// ---------------------------------------------------------------------------
// The parameters are:
//
// argV[1] = The source UCM file
// argV[2] = The path to the output file
// ---------------------------------------------------------------------------
//
//  Program entry point. Opens the input and output files named on the
//  command line, loads the translation table from the input, and writes
//  the generated tables to the output.
//
//  Returns 0 on success, or 1 if the parameters are wrong, either file
//  cannot be opened, no table entries were found, or the output file could
//  not be completely written. Parse errors exit the program from within
//  the functions called here.
//
int main(int argC, char** argV)
{
    // We have to have 3 parameters (program name plus the two files)
    if (argC != 3)
    {
        showUsage();
        return 1;
    }

    // Try to open the first file for input
    gInFile = fopen(argV[1], "rt");
    if (!gInFile)
    {
        cout << "Could not find input file: " << argV[1] << endl;
        return 1;
    }

    // Try to open the second file for output (truncated)
    gOutFile = fopen(argV[2], "wt+");
    if (!gOutFile)
    {
        // Report the output file's name here, not the input file's
        cout << "Could not create output file: " << argV[2] << endl;
        fclose(gInFile);
        return 1;
    }

    //
    //  This will parse the file and load the table. It will also look for
    //  a couple of key fields in the file header and store that data into
    //  globals.
    //
    loadTable();

    // If we didn't get any table entries, then give up
    if (!gMainTableSz)
    {
        cout << "No translation table entries were found in the file" << endl;
        fclose(gInFile);
        fclose(gOutFile);
        return 1;
    }

    //
    //  Ok, we got the data loaded. Now lets output the tables. This method
    //  spits out both tables to the output file, in a format ready to be
    //  incorporated directly into the source code.
    //
    formatSBTables();

    //
    //  Close our files. Closing flushes whatever is still buffered for the
    //  output file, so a failure there means the tables did not all make
    //  it out.
    //
    fclose(gInFile);
    if (fclose(gOutFile))
    {
        cout << "Could not write output file: " << argV[2] << endl;
        return 1;
    }
    return 0;
}