blob: f4f7ba2997e25a0ff0d101e253bff54452658ad4 [file] [log] [blame]
/**********************************************************************
// @@@ START COPYRIGHT @@@
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//
// @@@ END COPYRIGHT @@@
**********************************************************************/
/* -*-C++-*-
*****************************************************************************
*
* File: conversionSJIS.cpp
* RCS: $Id:
* Description: The implementation of SJIS related conversion routines
*
*
* Created: 7/8/98
* Modified: $ $Date: 1998/08/10 16:00:50 $ (GMT)
* Language: C++
* Status: $State: Exp $
*
*
*
*
*****************************************************************************
*/
// define MODULE_DEBUG when the module is to be debugged separately.
//#define MODULE_DEBUG
#include "NLSConversion.h"
#include "nawstring.h"
#ifdef MODULE_DEBUG
#include "stringBuf.cpp"
#endif
///////////////////////////////////////////////////////////////////////
// Standard SJIS code point ranges based on the book "Understanding
// Japanese Information Processing" Table 4-9. pp. 73 and the massaged
// SJIS Unicode mapping table. The original table is available from
// unicode.org.
///////////////////////////////////////////////////////////////////////
// Returns 1 when x lies inside the closed interval [lower, upper],
// and 0 otherwise.
inline Int32 in_range(Int32 x, Int32 lower, Int32 upper)
{
  if ( x < lower || upper < x )
    return 0;
  return 1;
}
// Single-byte SJIS characters, first block: byte value in [0x00, 0x7F].
#define isSingleByteSJIS1stBlock(x) in_range((x), 0x00, 0x7F)
// Single-byte SJIS characters, second block: byte value in [0xA1, 0xDF].
#define isSingleByteSJIS2ndBlock(x) in_range((x), 0xA1, 0xDF)
// A pair of byte values delimiting a range of SJIS lead or trail bytes.
// Both ends are inclusive: the pair is always tested with in_range().
struct SJISCodeBound {
  unsigned char lower;   // smallest byte value in the range
  unsigned char upper;   // largest byte value in the range
};
typedef struct SJISCodeBound SJISCodeBoundT;
// Releases the output buffer of sjisToUnicode() after a failed conversion.
//
// The buffer is freed only when the caller did not supply one
// (callerBuffer is NULL), i.e. when it was allocated on the caller's behalf.
// It is returned to the given heap when heap is not NULL, or deleted
// otherwise. This is the same clean-up unicodeToSjis() performs on its
// failure path.
static void discardSjisToUnicodeOutput(NAWcharBuf* output,
                                       NAWcharBuf* callerBuffer,
                                       CollHeap *heap)
{
  if ( callerBuffer != NULL )
    return;                       // the caller owns the buffer

  if ( heap )
    NADELETE(output, NAWcharBuf, heap);
  else
    delete output;
}

// This function takes a SJIS string and returns its Unicode equivalent.
// The optional result argument holds the buffer into which the Unicode string
// will be stored. In case the result argument is NULL or the buffer it points
// at is not big enough, the function allocates memory from the heap
// (if the heap pointer is not NULL), or from the C run-time system heap
// and returns the result.
//
// If no output buffer can be obtained (checkSpace() returns 0), the function
// returns NULL.
// If any illegal characters are encountered (a lead byte without a trailing
// byte, a bad lead or trailing byte, or a code that maps to the 0xFFFF
// filler), the function also returns NULL. In that case a buffer that was
// allocated because result was NULL is released before returning.
//
// When addNullAtEnd is TRUE a 0 character is stored after the last converted
// character; it is not counted in the string length set on the output.
//
NAWcharBuf*
sjisToUnicode(const charBuf& sjisString, CollHeap *heap,
              NAWcharBuf*& result, NABoolean addNullAtEnd)
{
  //
  // These arrays are generated by the script /MiscVOB/i18n/sjis.ksh
  // and are copied into /sqlvob4/common. The seed file is
  // /MiscVOB/i18n/mx_sjis.txt which defines the Sjis to Unicode mapping.
  // Whenever a change is made to mx_sjis.txt, these arrays have to be
  // regenerated by running sh sjis.ksh.
  //
  // Each of them (array_x for x in 0 to 4) represents the
  // Unicode code values corresponding to blocks of SJIS code in a particular
  // range. The content of each array is included from a .h file whose name
  // contains two hex numbers that define the SJIS char range the array
  // represents. Note the range is relevant to the leading byte of the SJIS
  // characters only. For example, sjis_81_84.h holds all Unicode chars
  // mappable from SJIS in the range from 0x81 to 0x84.
  //
  // The trailing bytes of SJIS characters in each range run continuously
  // in strict ascending order, from 0x40 to 0xFC, except the "gap" characters
  // at 0x7F. Such gap characters are excluded from these arrays because they
  // are not in SJIS.
  //
  // A few filler non-Unicode characters (0xFFFF) are purposely inserted in
  // these arrays to represent any un-defined characters in SJIS. These fillers
  // are useful in making fast algorithmic conversion possible through code
  // point value manipulation. No filler character will be returned however.
  //
  // In each included file generated by the tool sjis.ksh, each line is in the
  // format
  //   { /* SJIS code value */ Unicode code value }, /* remark */
  //
  // For example
  //   { /* 0x8140 */ 0x3000 }, /* IDEOGRAPHIC SPACE */
  //
  static const NAWchar array_0[] = {
#include "sjis_81_84.h"
  };
  static const NAWchar array_1[] = {
#include "sjis_87_9f.h"
  };
  static const NAWchar array_2[] = {
#include "sjis_e0_ea.h"
  };
  static const NAWchar array_3[] = {
#include "sjis_ed_ee.h"
  };
  static const NAWchar array_4[] = {
#include "sjis_fa_fc.h"
  };

  // ranges determined by the leading byte; entry k corresponds to array_k
  static const SJISCodeBoundT SJISLeadByteBounds[] =
  {
    {0x81, 0x84}, { 0x87, 0x9F}, {0xE0, 0xEA}, {0xED, 0xEE}, {0xFA, 0xFC},
    {0x0, 0xFF } // catch all
  };

  // blocks determined by the trailing byte
  static const SJISCodeBoundT SJISTrailByteBounds[] =
  {
    {0x40, 0x7E}, {0x80, 0xFC},
    {0x0, 0xFF } // catch all
  };

  // Number of table entries per lead byte. The trailing byte spans
  // 0x40..0xFF, i.e. 16 * 12 = 192 values; the four values 0x7F, 0xFD, 0xFE
  // and 0xFF are never valid, leaving 192 - 4 = 188 entries per chunk.
  const Int32 charsPerLeadByte = 188;

  unsigned char* source = sjisString.data();
  Int32 sourceLen = sjisString.getStrLen();

  // the output Unicode string will have at most sjisString.length()
  // characters. An extra char may be added depending on addNullAtEnd.
  NAWcharBuf* output = checkSpace(heap, sourceLen, result, addNullAtEnd);

  if ( output == 0 )
    return 0;

  NAWchar *base, *target;
  base = target = output -> data();

  unsigned char c, d;
  NAWchar u;
  Int32 i=0;

  while ( i < sourceLen ) {

    c = source[i++];

    if ( isSingleByteSJIS1stBlock(c) )
      u = (NAWchar)c; // found in the first single-byte block
    else
    if ( isSingleByteSJIS2ndBlock(c) )
      u = (NAWchar)c - 0xA1 + 0xFF61; // found in the 2nd single-byte block
    else {

      // the second byte does not exist!
      if ( i >= sourceLen ) {
        discardSjisToUnicodeOutput(output, result, heap);
        return 0;
      }

      // get the trailing byte
      d = source[i++];

      // is d a bad trailing byte?
      if ( d == 0x7F || d == 0xFD || d == 0xFE || d == 0xFF ) {
        discardSjisToUnicodeOutput(output, result, heap);
        return 0;
      }

      Int32 lead, trail;

      // find the range in which c is in; lead ends up as 5 when c is in
      // none of the five table ranges.
      for ( lead = 0; lead<=4; lead++ )
        if ( in_range(c, SJISLeadByteBounds[lead].lower,
                         SJISLeadByteBounds[lead].upper))
          break;

      // find the first or the second block for d; the catch-all entry
      // makes trail end up as 2 when d is in neither block.
      for (trail=0; trail<=2; trail++ )
        if ( in_range(d, SJISTrailByteBounds[trail].lower,
                         SJISTrailByteBounds[trail].upper ))
          break;

      // out of range of SJIS legal code values
      if ( (c == 0xEA && d >= 0xA4) ||
           (c == 0xFC && d >= 0x4C) ||
           (lead == 5 ) || (trail == 2)
         )
      {
        discardSjisToUnicodeOutput(output, result, heap);
        return 0;
      }

      // Index of the character inside its chunk. A trailing byte above
      // 0x7F (trail == 1) is stored one slot lower than (d - 0x40) because
      // 0x7F itself has no entry; subtracting trail makes that adjustment.
      const Int32 offsetInChunk = d - 0x40 - trail;

      switch (lead) {
        case 0:
          u = array_0[(c-0x81) * charsPerLeadByte + offsetInChunk];
          break;
        case 1:
          u = array_1[(c-0x87) * charsPerLeadByte + offsetInChunk];
          break;
        case 2:
          u = array_2[(c-0xe0) * charsPerLeadByte + offsetInChunk];
          break;
        case 3:
          u = array_3[(c-0xed) * charsPerLeadByte + offsetInChunk];
          break;
        default: // lead == 4; lead == 5 was rejected above
          u = array_4[(c-0xfa) * charsPerLeadByte + offsetInChunk];
      }
    }

    if ( u == 0xFFFF ) { // filler chars are not defined in SJIS standard
      discardSjisToUnicodeOutput(output, result, heap);
      return 0;
    }

    *target = u;
    target++;
  }

  Int32 finalLength = target-base;

  if ( addNullAtEnd == TRUE )
    (output -> data())[finalLength] = 0;

  output -> setStrLen(finalLength);
  return output;
}
// One row of the Unicode to SJIS lookup table.
struct Unicode2SjisMap
{
  NAWchar Unicode;   // Unicode code value (the search key)
  NAWchar sjis;      // corresponding SJIS code value
};
typedef struct Unicode2SjisMap Unicode2SjisMapT;
//
// Uses binary search to find the SJIS code for the Unicode character
// contained in argument u. The SJIS code is returned in the argument sjis. The
// function returns TRUE if the conversion is successful, FALSE otherwise.
//
// The function only returns SJIS codes that are double-byte. Hence it is static.
//
static
NABoolean binarySearchU2STable(NAWchar u, NAWchar& sjis)
{
  //
  // Unicode to SJIS mapping table. Each line in the include file has
  // the format
  //
  //   { Unicode_code_value, SJIS_code_value }, /* remark */
  //
  // The search below relies on the Unicode_code_value column being in
  // strict ascending order.
  //
  static const Unicode2SjisMapT array_u2s[] = {
#include "sjis_from_ucs2.h"
  };

  Int32 lo = 0;
  Int32 hi = sizeof(array_u2s)/sizeof(array_u2s[0]) - 1;

  // Probe the middle of [lo, hi] and halve the interval until the key is
  // found or the interval becomes empty.
  for (;;) {
    const Int32 mid = (lo + hi) / 2;
    const Unicode2SjisMapT& probe = array_u2s[mid];

    if ( probe.Unicode == u ) {
      sjis = probe.sjis;
      return TRUE;
    }

    if ( probe.Unicode < u )
      lo = mid + 1;    // key is in the upper half
    else
      hi = mid - 1;    // key is in the lower half

    if ( lo > hi )
      break;
  }

  // u has no entry; sjis is left untouched
  return FALSE;
}
//
// Convert a single Unicode character (in argument wc) into SJIS multibyte
// format. The result is placed into the sjis argument. The number of SJIS
// bytes is returned by the function. The function returns 0 if the
// Unicode character is not mappable to SJIS and allowInvalidCodePoint is
// FALSE; otherwise an unmappable character is replaced by a single '?'.
//
Int32 unicodeToSjisChar(NAWchar wc, unsigned char *sjis, NABoolean allowInvalidCodePoint)
{
  const Int32 codePoint = wc;

  // Unicode [0, 0x007F] maps to the identical single-byte SJIS
  // value in [0, 0x7F].
  if ( in_range(codePoint, 0x0000, 0x007F) ) {
    *sjis = (unsigned char)wc;
    return 1;
  }

  // Unicode [0xFF61, 0xFF9F] maps to the single-byte SJIS
  // block [0xA1, 0xDF].
  if ( in_range(codePoint, 0xFF61, 0xFF9F) ) {
    *sjis = (unsigned char)(wc - 0xFF61 + 0xA1);
    return 1;
  }

  // Everything else has to come from the double-byte table. Table values
  // below 0x8140 are not accepted as double-byte codes.
  NAWchar doubleByte;
  if ( binarySearchU2STable(wc, doubleByte) ) {
    if ( doubleByte >= 0x8140 ) {
      const unsigned char leadByte  = (unsigned char)((doubleByte >> 8) & (0xFF));
      const unsigned char trailByte = (unsigned char)(doubleByte & (0xFF));
      sjis[0] = leadByte;
      sjis[1] = trailByte;
      return 2;
    }
  }

  // No mapping: either report failure or substitute a question mark.
  if ( allowInvalidCodePoint == FALSE )
    return 0;

  sjis[0] = '?';
  return 1;
}
// Unicode to SJIS conversion.
//
// This function takes a Unicode string and returns its SJIS equivalent.
// The optional sjisString argument holds the buffer into which the SJIS string
// will be stored. In case the argument is NULL or it is not big enough,
// the function allocates memory from the heap (if the heap pointer is not NULL),
// or from the C run-time system heap.
// If the memory allocation fails, the function returns 0. If any illegal
// characters are encountered, the function also returns 0.
//
charBuf* unicodeToSjis(const NAWcharBuf& unicodeString, CollHeap *heap,
                       charBuf*& sjisString, NABoolean addNullAtEnd,
                       NABoolean allowInvalidCodePoint)
{
  NAWchar* input = unicodeString.data();
  Int32 inputLen = unicodeString.getStrLen();

  // Reserve BYTES_PER_NAWCHAR bytes for every input character; one more
  // byte may be needed depending on addNullAtEnd.
  charBuf* output = checkSpace(heap, BYTES_PER_NAWCHAR*unicodeString.getStrLen(), sjisString, addNullAtEnd);

  if ( output == 0 )
    return 0;

  unsigned char* const start = (unsigned char*) (output -> data());
  unsigned char* cursor = start;

  Int32 pos = 0;
  while ( pos < inputLen ) {

    const Int32 written = unicodeToSjisChar(input[pos], cursor, allowInvalidCodePoint);

    if ( written == 0 ) {
      // Unmappable character: give back a buffer that was not supplied
      // by the caller, then report the failure.
      if ( sjisString == NULL ) {
        if (heap)
          NADELETE(output, charBuf, heap);
        else
          delete output;
      }
      return NULL;
    }

    cursor += written;
    pos++;
  }

  Int32 finalLength = cursor - start;

  if ( addNullAtEnd == TRUE )
    (output -> data())[finalLength] = 0;

  output -> setStrLen(finalLength);
  return output;
}
#ifdef MODULE_DEBUG
//
// testing: dumping all Unicode codes for Sjis chars.
// Use the tool /MiscVOB/i18n/verify.pl to see the
// difference between the dumping and mx_sjis.txt.l2u.
// Should only see complaining about 0xFFFF.
//
// Lead bytes exercised by the double-byte tests below, grouped by the
// lead-byte ranges 0x81-0x84, 0x87-0x9F, 0xE0-0xEA, 0xED-0xEE and 0xFA-0xFC.
static Int32 leadbyte[] = {
  // 0x81 - 0x84
  0x81, 0x82, 0x83, 0x84,
  // 0x87 - 0x9f
  0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
  0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
  0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
  // 0xe0 - 0xea
  0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
  0xe8, 0xe9, 0xea,
  // 0xed - 0xee
  0xed, 0xee,
  // 0xfa - 0xfc
  0xfa, 0xfb, 0xfc
};
void sjisDump()
{
charBuf sjis(2);
NAWcharBuf* unicode = new NAWcharBuf(10);
for ( Int32 i=0; i<0xFF; i++ ) {
sjis.data()[0] = i;
sjis.setLength(1);
unicode = sjisToUnicode(sjis, 0, unicode);
if ( unicode )
printf("0x%X 0x%X\n", i, unicode->data()[0]);
}
for ( i = 0; i<sizeof(leadbyte)/sizeof(Int32); i++ )
for (Int32 j=0x40; j<=0xFF; j++ ) {
sjis.data()[0] = leadbyte[i];
sjis.data()[1] = j;
sjis.setLength(2);
unicode = sjisToUnicode(sjis, 0, unicode);
if ( unicode )
printf("0x%X%X 0x%X\n", leadbyte[i], j, unicode->data()[0]);
}
delete unicode;
}
// Roundtrip verification.
//
// Perform a round-trip conversion testing for each valid SJIS.
//
// should not see any warnings.
//
void sjis2Unicode2sjis()
{
printf("SJIS round-trip testing: ");
charBuf sjis(2);
NAWcharBuf* unicode = new NAWcharBuf(10);
charBuf* remapped_sjis = new charBuf(10);
// do the test for single-byte chars
for ( Int32 i=0; i<0xDF; i++ ) {
// skip non-SJIS chars
if ( in_range(i, 0x80, 0xA0) )
continue;
sjis.data()[0] = i;
sjis.setLength(1);
unicode = sjisToUnicode(sjis, 0, unicode);
if ( unicode == NULL ) {
printf("SJIS to Unicode mapping failed: 0x%X.\n", i);
return;
}
remapped_sjis = unicodeToSjis(*unicode, 0, remapped_sjis);
if ( remapped_sjis == NULL ) {
printf("Unicode 0x%X can not be mapped to SJIS\n",
unicode->data()[0]);
return;
}
if ( remapped_sjis->data()[0] != sjis.data()[0] ) {
printf("Roundtrip mapping failed: 0x%X 0x%X\n",
sjis.data()[0], remapped_sjis->data()[0]
);
return;
}
}
// do the test for double-byte chars
for ( i = 0; i<sizeof(leadbyte)/sizeof(Int32); i++ )
for (Int32 j=0x40; j<=0xFF; j++ ) {
sjis.data()[0] = leadbyte[i];
sjis.data()[1] = j;
sjis.setLength(2);
unicode = sjisToUnicode(sjis, 0, unicode);
if ( unicode == NULL ) {
continue; // skip non-exist chars
}
remapped_sjis = unicodeToSjis(*unicode, 0, remapped_sjis);
if ( remapped_sjis == NULL ) {
printf("Unicode 0x%X can not be mapped to SJIS\n",
unicode->data()[0]);
return;
}
if ( remapped_sjis->data()[0] != sjis.data()[0] ||
remapped_sjis->data()[1] != sjis.data()[1]
) {
printf("Roundtrip mapping failed: sjis=0x%X%X unicode=0x%X, remapped_sjis=0x%X%X\n",
sjis.data()[0], sjis.data()[1],
unicode->data()[0],
remapped_sjis->data()[0], remapped_sjis->data()[1]
);
return;
}
}
delete unicode;
delete remapped_sjis;
printf("OK\n");
}
//
// Testing: assure all SJIS-mappable Unicode chars can be mapped
// to SJIS. Should not see any "Bad code ..." output.
//
void UCS2ToSjis()
{
printf("Unicode to SJIS testing: ");
NAWchar sjisChar;
NAWcharBuf unicode(1);
charBuf* sjis = new charBuf(10);
Int32 n = sizeof(array_u2s)/sizeof(Unicode2SjisMapT);
for ( Int32 i=0; i<n; i++ )
{
unicode.data()[0] = array_u2s[i].Unicode;
unicodeToSjis(unicode, 0, sjis);
switch (sjis->length()) {
case 2:
sjisChar = (NAWchar)(sjis->data()[1]);
sjisChar |= (NAWchar)((sjis->data()[0] <<8));
break;
default:
printf("Bad code 0x%X\n", array_u2s[i].Unicode);
return;
}
if ( sjisChar != array_u2s[i].sjis ) {
printf("Bad code 0x%x for 0x%X\n", sjisChar, array_u2s[i].Unicode);
return;
}
}
for ( i=0; i<=0x7F; i++ )
{
unicode.data()[0] = (NAWchar)i;
unicodeToSjis(unicode, 0, sjis);
switch (sjis->length()) {
case 1:
sjisChar = (NAWchar)sjis->data()[0];
break;
default:
printf("Bad code 0x%X\n", i);
return;
}
if ( sjisChar != i ) {
printf("Bad code 0x%x for 0x%X\n", sjisChar, i);
return;
}
}
for ( i=0xFF61; i<=0xFF9F; i++ )
{
unicode.data()[0] = (NAWchar)i;
unicodeToSjis(unicode, 0, sjis);
switch (sjis->length()) {
case 1:
sjisChar = (NAWchar)sjis->data()[0];
break;
default:
printf("Bad code 0x%X\n", i);
return;
}
if ( sjisChar != i - 0xFF61 + 0xA1 ) {
printf("Bad code 0x%x for 0x%X\n", sjisChar, i);
return;
}
}
printf("OK\n");
}
//
// Define MODULE_DEBUG at the beginning of this file and select one
// of the following to test. Build the test app with the command:
//
// cl /nologo /Zp4 /W3 /GX /Od /MDd /D "_DEBUG" /D "NA_WINNT" /Z7 \
// -o a.exe conversionsjis.cpp
//
// Entry point of the stand-alone test build (MODULE_DEBUG only): runs the
// round-trip test followed by the Unicode to SJIS test.
Int32 main(Int32 argc, char** argv)
{
  // command-line arguments are not used
  (void)argc;
  (void)argv;

  // Uncomment to dump the Unicode value of every SJIS character instead.
  //sjisDump();

  sjis2Unicode2sjis();
  UCS2ToSjis();

  return 0;
}
#endif