| /********************************************************************** |
| // @@@ START COPYRIGHT @@@ |
| // |
| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // |
| // @@@ END COPYRIGHT @@@ |
| **********************************************************************/ |
| /* -*-C++-*- |
| ***************************************************************************** |
| * |
| * File: conversionSJIS.cpp |
| * RCS: $Id: |
| * Description: The implementation of SJIS related conversion routines |
| * |
| * |
| * Created: 7/8/98 |
| * Modified: $ $Date: 1998/08/10 16:00:50 $ (GMT) |
| * Language: C++ |
| * Status: $State: Exp $ |
| * |
| * |
| * |
| * |
| ***************************************************************************** |
| */ |
| // define MODULE_DEBUG when the module is to be debugged separately. |
| //#define MODULE_DEBUG |
| |
| #include "NLSConversion.h" |
| #include "nawstring.h" |
| |
| |
| #ifdef MODULE_DEBUG |
| #include "stringBuf.cpp" |
| #endif |
| |
| |
| /////////////////////////////////////////////////////////////////////// |
| // Standard SJIS code point ranges based on the book "Understanding |
| // Japanese Information Processing" Table 4-9. pp. 73 and the massaged |
| // SJIS Unicode mapping table. The original table is available from |
| // unicode.org. |
| /////////////////////////////////////////////////////////////////////// |
| |
| inline Int32 in_range(Int32 x, Int32 lower, Int32 upper) |
| { |
| return ( lower <= x && x <= upper ) ? 1 : 0; |
| } |
| |
// First run of single-byte SJIS characters: byte values 0x00 through 0x7F.
#define isSingleByteSJIS1stBlock(x) in_range((x), 0x00, 0x7F)

// Second run of single-byte SJIS characters: byte values 0xA1 through 0xDF.
#define isSingleByteSJIS2ndBlock(x) in_range((x), 0xA1, 0xDF)
| |
// One inclusive run [lower, upper] of SJIS byte values.  The conversion
// tables below use it to describe lead-byte and trail-byte ranges.
struct SJISCodeBound {
  unsigned char lower;  // first byte value of the run
  unsigned char upper;  // last byte value of the run
};
typedef struct SJISCodeBound SJISCodeBoundT;
| |
| // This function takes a SJIS string and returns its Unicode equivalent. |
| // The optional result argument holds the buffer into which the Unicode string |
| // will be stored. In case the result argument is NULL or the buffer it points |
| // at is not big enough, the function allocates memory from the heap |
| // (if the heap pointer is not NULL), or from the C run-time system heap |
| // and returns the result. |
| // |
| // If memory allocation fails, the function returns NULL. |
| // If any illegal characters are encountered, the function also returns NULL. |
| // |
| NAWcharBuf* |
| sjisToUnicode(const charBuf& sjisString, CollHeap *heap, |
| NAWcharBuf*& result, NABoolean addNullAtEnd) |
| { |
| |
| // |
| // These arrays are generated by the script /MiscVOB/i18n/sjis.ksh |
| // and are copied into /sqlvob4/common. The seed file is |
| // /MiscVOB/i18n/mx_sjis.txt which defines the Sjis to Unicode mapping. |
| // Whenever a change is made to mx_sjis.txt, these arrays have to be |
| // regenerated by running sh sjis.ksh. |
| // |
| // Each of them (array_x for x in 0 to 4) represents the |
| // Unicode code values corresponding to blocks of SJIS code in a particular |
| // range. The content of each array is included from a .h file whose name |
| // contains two hex numbers that defines the SJIS char range the array represents. |
| // Note the range is relevant to the leading byte of the SJIS characters only. |
| // For example, sjis_81_84.h holds all Unicode chars mappable from SJIS in the |
| // range from 0x81 to 0x84. |
| // |
| // The trailing bytes of SJIS characters in each range run continuously |
| // in strict ascending order, from 0x40 to 0xFC, except the "gap" characters |
| // at 0x7F. Such gap characters are excluded from these arrays because they are |
| // not in SJIS. |
| // |
| // A few filler non-Unicode characters (0xFFFF) are purposely inserted in |
| // these arrays to represent any un-defined characters in SJIS. These fillers |
| // are useful in making fast algorithmic conversion possible through code point |
| // value manipulation. No filler character will be returned however. |
| // |
| // In each included file generated by the tool sjis.ksh, each line is in the |
| // format |
| // { /* SJIS code value */ Unicode code value }, /* remark */ |
| // |
| // For example |
| // { /* 0x8140 */ 0x3000 }, /* IDEOGRAPHIC SPACE */ |
| // |
| // |
| static const NAWchar array_0[] = { |
| #include "sjis_81_84.h" |
| }; |
| |
| static const NAWchar array_1[] = { |
| #include "sjis_87_9f.h" |
| }; |
| |
| static const NAWchar array_2[] = { |
| #include "sjis_e0_ea.h" |
| }; |
| |
| static const NAWchar array_3[] = { |
| #include "sjis_ed_ee.h" |
| }; |
| |
| static const NAWchar array_4[] = { |
| #include "sjis_fa_fc.h" |
| }; |
| |
| // ranges determined by the leading byte |
| static const SJISCodeBoundT SJISLeadByteBounds[] = |
| { |
| {0x81, 0x84}, { 0x87, 0x9F}, {0xE0, 0xEA}, {0xED, 0xEE}, {0xFA, 0xFC}, |
| {0x0, 0xFF } // catch all |
| }; |
| |
| // blocks determined by the trailing byte |
| static const SJISCodeBoundT SJISTrailByteBounds[] = |
| { |
| {0x40, 0x7E}, {0x80, 0xFC}, |
| {0x0, 0xFF } // catch all |
| }; |
| |
| |
| unsigned char* source = sjisString.data(); |
| Int32 sourceLen = sjisString.getStrLen(); |
| |
| // the output Unicode string will have at most sjisString.length() |
| // characters. An extra char may be added depending on addNullAtEnd. |
| NAWcharBuf* output = checkSpace(heap, sjisString.getStrLen(), result, addNullAtEnd); |
| |
| if ( output == 0 ) |
| return 0; |
| |
| NAWchar *base, *target; |
| base = target = output -> data(); |
| |
| unsigned char c, d; |
| NAWchar u; |
| Int32 i=0; |
| |
| while ( i < sourceLen ) { |
| |
| c = source[i++]; |
| |
| if ( isSingleByteSJIS1stBlock(c) ) |
| u = (NAWchar)c; // found in the first single-byte block |
| else |
| if ( isSingleByteSJIS2ndBlock(c) ) |
| u = (NAWchar)c - 0xA1 + 0xFF61; // found in the 2nd single-byte block |
| else { |
| |
| // the second byte does not exist! |
| if ( i == sjisString.getStrLen() ) { |
| return 0; |
| } |
| |
| // get the trailing byte |
| d = source[i++]; |
| |
| // is d a bad trailing byte? |
| if ( d == 0x7F || d == 0xFD || d == 0xFE || d == 0xFF ) { |
| return 0; |
| } |
| |
| Int32 lead, trail; |
| |
| // find the range in which c is in. |
| for ( lead = 0; lead<=4; lead++ ) |
| if ( in_range(c, SJISLeadByteBounds[lead].lower, |
| SJISLeadByteBounds[lead].upper)) |
| break; |
| |
| // find the first or the second block for c. |
| for (trail=0; trail<=2; trail++ ) |
| if ( in_range(d, SJISTrailByteBounds[trail].lower, |
| SJISTrailByteBounds[trail].upper )) |
| break; |
| |
| // out of range of SJIS legal code values |
| if ( (c == 0xEA && d >= 0xA4) || |
| (c == 0xFC && d >= 0x4C) || |
| (lead == 5 ) || (trail == 2) |
| ) |
| { |
| return 0; |
| } |
| |
| |
| // Each chunk (all chars with identical lead byte) |
| // has 16 * 18 = 192 chars. But because of the missing |
| // ones at 0x7F, 0xFD, 0xFE and 0xFF tailing byte, |
| // each chunk contains only 192 - 4 = 188 chars. |
| // |
| // In additional, a char with greater than 0x7F |
| // tailing byte (say, d) should be mapped to the entry |
| // with the index (d-1) instead of (d) in the right chunk. |
| // We do this by using the expression "- trail". |
| switch (lead) { |
| case 0: |
| u = array_0[(c-0x81) * 188 + d - 0x40 - trail]; |
| break; |
| case 1: |
| u = array_1[(c-0x87) * 188 + d - 0x40 - trail]; |
| break; |
| case 2: |
| u = array_2[(c-0xe0) * 188 + d - 0x40 - trail]; |
| break; |
| case 3: |
| u = array_3[(c-0xed) * 188 + d - 0x40 - trail]; |
| break; |
| default: |
| u = array_4[(c-0xfa) * 188 + d - 0x40 - trail]; |
| } |
| } |
| |
| if ( u == 0xFFFF ) // filler chars are not defined in SJIS standard |
| return 0; |
| |
| *target = u; |
| target++; |
| } |
| |
| Int32 finalLength = target-base; |
| |
| if ( addNullAtEnd == TRUE ) |
| (output -> data())[finalLength] = 0; |
| |
| output -> setStrLen(finalLength); |
| return output; |
| } |
| |
| typedef struct Unicode2SjisMap |
| { |
| NAWchar Unicode; |
| NAWchar sjis; |
| } Unicode2SjisMapT; |
| |
| // |
| // Using the binary search method to find the SJIS code for a Unicode character |
| // contained in argument u. The SJIS code is returned in the argument sjis. The |
| // function returns TRUE if the conversion is sucessful, FALSE otherwise. |
| // |
| // The function only returns the SJIS that are double-byte. Hense it is static. |
| // |
| static |
| NABoolean binarySearchU2STable(NAWchar u, NAWchar& sjis) |
| { |
| |
| |
| // |
| // This array contains the Unicode to SJIS mapping with. Each line in |
| // the include file is in the format |
| // |
| // { Unicode_code_value, SJIS_code_value }, /* remark */ |
| // |
| // It is assumed the Unicode_code_value column is in strict ascending order. |
| // |
| static const Unicode2SjisMapT array_u2s[] = { |
| #include "sjis_from_ucs2.h" |
| }; |
| |
| Int32 lowerLimit = 0; |
| Int32 upperLimit = sizeof(array_u2s)/sizeof(Unicode2SjisMapT) - 1; |
| Int32 middle = 0; |
| |
| do { |
| middle = (lowerLimit + upperLimit ) / 2; |
| |
| if ( array_u2s[middle].Unicode == u ) { |
| sjis = array_u2s[middle].sjis; |
| return TRUE; |
| } |
| |
| |
| if ( array_u2s[middle].Unicode < u ) |
| lowerLimit = middle+1; |
| else |
| upperLimit = middle-1; |
| |
| } while ( lowerLimit <= upperLimit ); |
| |
| return FALSE; |
| } |
| |
| // |
| // Convert a single Unicode character (in argument wc) into SJIS multibyte |
| // format. The result is placed into the sjis argument. The number of SJIS |
| // bytes is returned by the function. The function returns 0 if the |
| // Unicode character is not mappable from SJIS. |
| // |
| Int32 unicodeToSjisChar(NAWchar wc, unsigned char *sjis, NABoolean allowInvalidCodePoint) |
| { |
| NAWchar t; |
| |
| // single-byte SJIS characters, in [0, 0x7F]. The Unicode range is |
| // also [0, 0x007F] |
| if ( in_range(wc, 0, 0x007F) ) { |
| sjis[0] = (unsigned char)wc; |
| return 1; |
| } |
| |
| // single-byte SJIS characters, in [0xA1, 0xDF]. The Unicode range |
| // is [0xFF61, 0xFF9F] |
| if ( in_range(wc, 0xFF61, 0xFF9F) ) { |
| sjis[0] = (unsigned char)(wc - 0xFF61 + 0xA1); |
| return 1; |
| } |
| |
| // double byte |
| if ( binarySearchU2STable(wc, t) && t >= 0x8140 ) { |
| sjis[0] = (unsigned char)((t>>8) & (0xFF)); |
| sjis[1] = (unsigned char)(t & (0xFF)); |
| return 2; |
| } |
| |
| // bad case |
| if ( allowInvalidCodePoint == FALSE ) |
| return 0; |
| else { |
| sjis[0] = '?'; |
| return 1; |
| } |
| } |
| |
| // Unicode to SJIS conversion. |
| // |
| // This function takes a Unicode string and returns its SJIS equivalent. |
| // The optional sjisString argument holds the buffer into which the Unicode string |
| // will be stored. In case the argument is NULL or it is not big enough, |
| // the function allocates memory from the heap (if the heap pointer is not NULL), |
| // or from the C run-time system heap. |
| // If the memory allocation fails, the function returns 0. If any illegal |
| // characters are encountered, the function also returns 0. |
| // |
| charBuf* unicodeToSjis(const NAWcharBuf& unicodeString, CollHeap *heap, |
| charBuf*& sjisString, NABoolean addNullAtEnd, |
| NABoolean allowInvalidCodePoint) |
| { |
| NAWchar* source = unicodeString.data(); |
| Int32 sourceLen = unicodeString.getStrLen(); |
| |
| // the output Unicode string will have at most 2*unicodeString.length() |
| // characters. An extra char may be added depending on addNullAtEnd. |
| charBuf* output = checkSpace(heap, BYTES_PER_NAWCHAR*unicodeString.getStrLen(), sjisString, addNullAtEnd); |
| |
| if ( output == 0 ) |
| return 0; |
| |
| unsigned char *base, *target; |
| base = target = (unsigned char*) (output -> data()); |
| |
| Int32 ct = 0; |
| for ( Int32 i=0; i<sourceLen; i++ ) { |
| |
| ct = unicodeToSjisChar(source[i], target, allowInvalidCodePoint); |
| |
| if ( ct == 0 ) { |
| if ( sjisString == NULL ) { |
| if (heap) |
| NADELETE(output, charBuf, heap); |
| else |
| delete output; |
| } |
| return NULL; |
| } else |
| target += ct; |
| } |
| |
| Int32 finalLength = target - base; |
| |
| if ( addNullAtEnd == TRUE ) |
| (output -> data())[finalLength] = 0; |
| |
| output -> setStrLen(finalLength); |
| return output; |
| } |
| |
| #ifdef MODULE_DEBUG |
| |
| // |
| // testing: dumping all Unicode codes for Sjis chars. |
| // Use the tool /MiscVOB/i18n/verify.pl to see the |
| // difference between the dumping and mx_sjis.txt.l2u. |
| // Should only see complaining about 0xFFFF. |
| // |
| |
| static |
| Int32 leadbyte[] = { |
| 0x81, 0x82, 0x83, 0x84, 0x87, 0x88, |
| 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, |
| 0x8f, |
| 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, |
| 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, |
| 0x9e, 0x9f, |
| 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, |
| 0xe7, 0xe8, 0xe9, 0xea, 0xed, 0xee, 0xfa, |
| 0xfb, 0xfc |
| }; |
| |
| void sjisDump() |
| { |
| charBuf sjis(2); |
| NAWcharBuf* unicode = new NAWcharBuf(10); |
| |
| for ( Int32 i=0; i<0xFF; i++ ) { |
| sjis.data()[0] = i; |
| sjis.setLength(1); |
| |
| unicode = sjisToUnicode(sjis, 0, unicode); |
| if ( unicode ) |
| printf("0x%X 0x%X\n", i, unicode->data()[0]); |
| } |
| |
| for ( i = 0; i<sizeof(leadbyte)/sizeof(Int32); i++ ) |
| for (Int32 j=0x40; j<=0xFF; j++ ) { |
| sjis.data()[0] = leadbyte[i]; |
| sjis.data()[1] = j; |
| sjis.setLength(2); |
| |
| unicode = sjisToUnicode(sjis, 0, unicode); |
| if ( unicode ) |
| printf("0x%X%X 0x%X\n", leadbyte[i], j, unicode->data()[0]); |
| } |
| |
| delete unicode; |
| } |
| |
| // Roundtrip verification. |
| // |
| // Perform a round-trip conversion testing for each valid SJIS. |
| // |
| // should not see any warnings. |
| // |
| void sjis2Unicode2sjis() |
| { |
| printf("SJIS round-trip testing: "); |
| |
| charBuf sjis(2); |
| NAWcharBuf* unicode = new NAWcharBuf(10); |
| charBuf* remapped_sjis = new charBuf(10); |
| |
| // do the test for single-byte chars |
| for ( Int32 i=0; i<0xDF; i++ ) { |
| |
| // skip non-SJIS chars |
| if ( in_range(i, 0x80, 0xA0) ) |
| continue; |
| |
| sjis.data()[0] = i; |
| sjis.setLength(1); |
| |
| unicode = sjisToUnicode(sjis, 0, unicode); |
| |
| if ( unicode == NULL ) { |
| printf("SJIS to Unicode mapping failed: 0x%X.\n", i); |
| return; |
| } |
| |
| remapped_sjis = unicodeToSjis(*unicode, 0, remapped_sjis); |
| |
| if ( remapped_sjis == NULL ) { |
| printf("Unicode 0x%X can not be mapped to SJIS\n", |
| unicode->data()[0]); |
| return; |
| } |
| |
| if ( remapped_sjis->data()[0] != sjis.data()[0] ) { |
| printf("Roundtrip mapping failed: 0x%X 0x%X\n", |
| sjis.data()[0], remapped_sjis->data()[0] |
| ); |
| return; |
| } |
| } |
| |
| // do the test for double-byte chars |
| for ( i = 0; i<sizeof(leadbyte)/sizeof(Int32); i++ ) |
| for (Int32 j=0x40; j<=0xFF; j++ ) { |
| |
| sjis.data()[0] = leadbyte[i]; |
| sjis.data()[1] = j; |
| sjis.setLength(2); |
| |
| unicode = sjisToUnicode(sjis, 0, unicode); |
| if ( unicode == NULL ) { |
| continue; // skip non-exist chars |
| } |
| |
| remapped_sjis = unicodeToSjis(*unicode, 0, remapped_sjis); |
| |
| if ( remapped_sjis == NULL ) { |
| printf("Unicode 0x%X can not be mapped to SJIS\n", |
| unicode->data()[0]); |
| return; |
| } |
| |
| if ( remapped_sjis->data()[0] != sjis.data()[0] || |
| remapped_sjis->data()[1] != sjis.data()[1] |
| ) { |
| printf("Roundtrip mapping failed: sjis=0x%X%X unicode=0x%X, remapped_sjis=0x%X%X\n", |
| sjis.data()[0], sjis.data()[1], |
| unicode->data()[0], |
| remapped_sjis->data()[0], remapped_sjis->data()[1] |
| ); |
| return; |
| } |
| } |
| |
| delete unicode; |
| delete remapped_sjis; |
| |
| printf("OK\n"); |
| } |
| |
| // |
| // Testing: assure all SJIS-mappable Unicode chars can be mapped |
| // to SJIS. Should not see any "Bad code ..." output. |
| // |
| void UCS2ToSjis() |
| { |
| printf("Unicode to SJIS testing: "); |
| |
| NAWchar sjisChar; |
| NAWcharBuf unicode(1); |
| charBuf* sjis = new charBuf(10); |
| |
| Int32 n = sizeof(array_u2s)/sizeof(Unicode2SjisMapT); |
| for ( Int32 i=0; i<n; i++ ) |
| { |
| unicode.data()[0] = array_u2s[i].Unicode; |
| unicodeToSjis(unicode, 0, sjis); |
| |
| switch (sjis->length()) { |
| case 2: |
| sjisChar = (NAWchar)(sjis->data()[1]); |
| sjisChar |= (NAWchar)((sjis->data()[0] <<8)); |
| break; |
| default: |
| printf("Bad code 0x%X\n", array_u2s[i].Unicode); |
| return; |
| } |
| if ( sjisChar != array_u2s[i].sjis ) { |
| printf("Bad code 0x%x for 0x%X\n", sjisChar, array_u2s[i].Unicode); |
| return; |
| } |
| } |
| |
| for ( i=0; i<=0x7F; i++ ) |
| { |
| unicode.data()[0] = (NAWchar)i; |
| unicodeToSjis(unicode, 0, sjis); |
| |
| switch (sjis->length()) { |
| case 1: |
| sjisChar = (NAWchar)sjis->data()[0]; |
| break; |
| default: |
| printf("Bad code 0x%X\n", i); |
| return; |
| } |
| if ( sjisChar != i ) { |
| printf("Bad code 0x%x for 0x%X\n", sjisChar, i); |
| return; |
| } |
| } |
| |
| for ( i=0xFF61; i<=0xFF9F; i++ ) |
| { |
| unicode.data()[0] = (NAWchar)i; |
| unicodeToSjis(unicode, 0, sjis); |
| |
| switch (sjis->length()) { |
| case 1: |
| sjisChar = (NAWchar)sjis->data()[0]; |
| break; |
| default: |
| printf("Bad code 0x%X\n", i); |
| return; |
| } |
| if ( sjisChar != i - 0xFF61 + 0xA1 ) { |
| printf("Bad code 0x%x for 0x%X\n", sjisChar, i); |
| return; |
| } |
| } |
| printf("OK\n"); |
| } |
| |
| // |
| // Define MODULE_DEBUG at the beginning of this file and select one |
| // of the following to test. Build the test app with the command: |
| // |
| // cl /nologo /Zp4 /W3 /GX /Od /MDd /D "_DEBUG" /D "NA_WINNT" /Z7 \ |
| // -o a.exe conversionsjis.cpp |
| // |
| Int32 main(Int32 argc, char** argv) |
| { |
| //sjisDump(); |
| sjis2Unicode2sjis(); |
| UCS2ToSjis(); |
| return 0; |
| } |
| |
| #endif |