// core/sql/common/conversionSJIS.cpp - trafodion - Git at Google

 /**********************************************************************
 // @@@ START COPYRIGHT @@@
 //
 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
 //
 // @@@ END COPYRIGHT @@@
 **********************************************************************/
 /* -*-C++-*-
  *****************************************************************************
  *
  * File:         conversionSJIS.cpp
  * RCS:          $Id:
  * Description:  The implementation of SJIS related conversion routines
  *
  *
  * Created:      7/8/98
  * Modified:     $ $Date: 1998/08/10 16:00:50 $ (GMT)
  * Language:     C++
  * Status:       $State: Exp $
  *
  *
  *
  *
  *****************************************************************************
  */
 // define MODULE_DEBUG when the module is to be debugged separately.
 //#define MODULE_DEBUG

 #include "NLSConversion.h"
 #include "nawstring.h"


 #ifdef MODULE_DEBUG
 #include "stringBuf.cpp"
 #endif


 ///////////////////////////////////////////////////////////////////////
 // Standard SJIS code point ranges based on the book "Understanding
 // Japanese Information Processing" Table 4-9. pp. 73 and the massaged
 // SJIS Unicode mapping table.  The original table is available from
 // unicode.org.
 ///////////////////////////////////////////////////////////////////////

 // Returns 1 when x lies in the closed interval [lower, upper], 0 otherwise.
 inline Int32 in_range(Int32 x, Int32 lower, Int32 upper)
 {
    if ( x < lower || upper < x )
       return 0;

    return 1;
 }

 // Tests for the 1st block of single-byte SJIS characters, [0x00, 0x7F]
 #define isSingleByteSJIS1stBlock(x) in_range(x, 0x00, 0x7F)

 // Tests for the 2nd block of single-byte SJIS characters, [0xA1, 0xDF]
 #define isSingleByteSJIS2ndBlock(x) in_range(x, 0xA1, 0xDF)

 // An inclusive range [lower, upper] of byte values. The tables built from
 // this type in sjisToUnicode() classify the lead and trail bytes of
 // double-byte SJIS characters.
 typedef struct SJISCodeBound {
         unsigned char lower;   // smallest byte value in the range
         unsigned char upper;   // largest byte value in the range
 } SJISCodeBoundT;

 // Converts an SJIS byte string to its Unicode (NAWchar) equivalent.
 //
 //   sjisString    the SJIS input; getStrLen() bytes are converted.
 //   heap          handed to checkSpace() when the output buffer is obtained,
 //                 and used to release that buffer if the conversion fails.
 //   result        optional caller-supplied output buffer, handed to
 //                 checkSpace(). In case it is NULL or the buffer it points at
 //                 is not big enough, memory is allocated from the heap (if
 //                 the heap pointer is not NULL) or from the C run-time
 //                 system heap.
 //   addNullAtEnd  when TRUE, a 0 code unit is stored after the last
 //                 converted character.
 //
 // Returns the buffer holding the converted string. Returns 0 if no buffer
 // can be obtained, or if the input contains a truncated, illegal or unmapped
 // SJIS character. In the latter case a buffer obtained while result was NULL
 // is released before returning (the same clean-up unicodeToSjis() performs),
 // so the failure path does not leak it.
 //
 NAWcharBuf*
 sjisToUnicode(const charBuf& sjisString, CollHeap *heap,
               NAWcharBuf*& result, NABoolean addNullAtEnd)
 {

 //
 // These arrays are generated by the script /MiscVOB/i18n/sjis.ksh
 // and are copied into /sqlvob4/common. The seed file is
 // /MiscVOB/i18n/mx_sjis.txt which defines the Sjis to Unicode mapping.
 // Whenever a change is made to mx_sjis.txt, these arrays have to be
 // regenerated by running sh sjis.ksh.
 //
 // Each of them (array_x for x in 0 to 4) represents the
 // Unicode code values corresponding to blocks of SJIS code in a particular
 // range. The content of each array is included from a .h file whose name
 // contains two hex numbers that define the SJIS char range the array
 // represents. Note the range is relevant to the leading byte of the SJIS
 // characters only. For example, sjis_81_84.h holds all Unicode chars
 // mappable from SJIS with a leading byte in the range from 0x81 to 0x84.
 //
 // The trailing bytes of SJIS characters in each range run continuously
 // in strict ascending order, from 0x40 to 0xFC, except the "gap" characters
 // at 0x7F. Such gap characters are excluded from these arrays because they
 // are not in SJIS.
 //
 // A few filler non-Unicode characters (0xFFFF) are purposely inserted in
 // these arrays to represent any un-defined characters in SJIS. These fillers
 // are useful in making fast algorithmic conversion possible through code
 // point value manipulation. No filler character will be returned however.
 //
 // In each included file generated by the tool sjis.ksh, each line is in the
 // format
 //     { /* SJIS code value */  Unicode code value }, /* remark */
 //
 // For example
 //     { /* 0x8140 */ 0x3000 }, /* IDEOGRAPHIC SPACE */
 //
    static const NAWchar array_0[] = {
    #include "sjis_81_84.h"
    };

    static const NAWchar array_1[] = {
    #include "sjis_87_9f.h"
    };

    static const NAWchar array_2[] = {
    #include "sjis_e0_ea.h"
    };

    static const NAWchar array_3[] = {
    #include "sjis_ed_ee.h"
    };

    static const NAWchar array_4[] = {
    #include "sjis_fa_fc.h"
    };

    // ranges determined by the leading byte; entry k corresponds to array_k
    static const SJISCodeBoundT SJISLeadByteBounds[] =
    {
       {0x81, 0x84}, { 0x87, 0x9F}, {0xE0, 0xEA}, {0xED, 0xEE}, {0xFA, 0xFC},
       {0x0, 0xFF } // catch all
    };

    // blocks determined by the trailing byte
    static const SJISCodeBoundT SJISTrailByteBounds[] =
    {
       {0x40, 0x7E}, {0x80, 0xFC},
       {0x0, 0xFF } // catch all
    };


    unsigned char* source = sjisString.data();
    Int32 sourceLen = sjisString.getStrLen();

    // the output Unicode string will have at most sourceLen
    // characters. An extra char may be added depending on addNullAtEnd.
    NAWcharBuf* output = checkSpace(heap, sourceLen, result, addNullAtEnd);

    if ( output == 0 )
       return 0;

    NAWchar *base, *target;
    base = target = output -> data();

    unsigned char c, d;
    NAWchar u = 0;
    NABoolean legal = TRUE;   // becomes FALSE as soon as a bad character is seen
    Int32 i=0;

    while ( i < sourceLen ) {

       c = source[i++];

       if ( isSingleByteSJIS1stBlock(c) )
          u = (NAWchar)c;                 // found in the first single-byte block
       else
       if ( isSingleByteSJIS2ndBlock(c) )
          u = (NAWchar)c - 0xA1 + 0xFF61; // found in the 2nd single-byte block
       else {

          // c starts a double-byte character but the second byte does not exist!
          if ( i == sourceLen ) {
             legal = FALSE;
             break;
          }

          // get the trailing byte
          d = source[i++];

          // is d a bad trailing byte?
          if ( d == 0x7F || d == 0xFD || d == 0xFE || d == 0xFF ) {
             legal = FALSE;
             break;
          }

          Int32 lead, trail;

          // find the range in which c is in; lead ends up as 5 when c
          // belongs to none of the five ranges backed by an array.
          for ( lead = 0; lead<=4; lead++ )
             if ( in_range(c, SJISLeadByteBounds[lead].lower,
                              SJISLeadByteBounds[lead].upper))
                break;

          // find the first or the second block for d; the catch-all entry
          // (trail == 2) matches only when d is in neither block.
          for (trail=0; trail<=2; trail++ )
             if ( in_range(d, SJISTrailByteBounds[trail].lower,
                              SJISTrailByteBounds[trail].upper ))
                break;

          // out of range of SJIS legal code values
          if ( (c == 0xEA && d >= 0xA4) ||
               (c == 0xFC && d >= 0x4C) ||
               (lead == 5 ) || (trail == 2)
             )
          {
             legal = FALSE;
             break;
          }


          // Each chunk (all chars with identical lead byte) spans the
          // trailing bytes 0x40 to 0xFF, i.e. 16 * 12 = 192 values. But
          // because of the missing ones at 0x7F, 0xFD, 0xFE and 0xFF
          // trailing byte, each chunk contains only 192 - 4 = 188 chars.
          //
          // In addition, a char with a trailing byte greater than 0x7F
          // (say, d) should be mapped to the entry with the index (d-1)
          // instead of (d) in the right chunk.
          // We do this by using the expression "- trail".
          switch (lead) {
             case 0:
                u = array_0[(c-0x81) * 188 + d - 0x40 - trail];
                break;
             case 1:
                u = array_1[(c-0x87) * 188 + d - 0x40 - trail];
                break;
             case 2:
                u = array_2[(c-0xe0) * 188 + d - 0x40 - trail];
                break;
             case 3:
                u = array_3[(c-0xed) * 188 + d - 0x40 - trail];
                break;
             default:
                u = array_4[(c-0xfa) * 188 + d - 0x40 - trail];
          }
       }

       if ( u == 0xFFFF ) { // filler chars are not defined in SJIS standard
          legal = FALSE;
          break;
       }

       *target = u;
       target++;
    }

    if ( legal == FALSE ) {
       // Mirror unicodeToSjis(): release the output buffer only when the
       // caller did not supply one.
       if ( result == NULL ) {
          if (heap)
             NADELETE(output, NAWcharBuf, heap);
          else
             delete output;
       }
       return 0;
    }

    Int32 finalLength = target-base;

    if ( addNullAtEnd == TRUE )
       (output -> data())[finalLength] = 0;

    output -> setStrLen(finalLength);
    return output;
 }

 // One row of the Unicode to SJIS lookup table searched by
 // binarySearchU2STable().
 typedef struct Unicode2SjisMap
 {
    NAWchar Unicode;   // Unicode code value (the search key)
    NAWchar sjis;      // SJIS code value; lead byte in the high 8 bits
 } Unicode2SjisMapT;

 //
 // Uses binary search to find the SJIS code for the Unicode character
 // contained in argument u. The SJIS code is returned in the argument sjis. The
 // function returns TRUE if the conversion is successful, FALSE otherwise.
 //
 // The function only returns SJIS codes that are double-byte. Hence it is static.
 //
 static
 NABoolean binarySearchU2STable(NAWchar u, NAWchar& sjis)
 {
    //
    // The Unicode to SJIS mapping table. Each line in the include file
    // is in the format
    //
    //    { Unicode_code_value, SJIS_code_value }, /* remark */
    //
    // The Unicode_code_value column is assumed to be in strict ascending
    // order; the search below relies on it.
    //
    static const Unicode2SjisMapT array_u2s[] = {
       #include "sjis_from_ucs2.h"
    };

    Int32 first = 0;
    Int32 last = sizeof(array_u2s)/sizeof(Unicode2SjisMapT) - 1;
    Int32 probe = 0;

    do {
       probe = (first + last) / 2;

       const NAWchar candidate = array_u2s[probe].Unicode;

       // exact hit: hand back the SJIS code stored next to the key
       if ( candidate == u ) {
          sjis = array_u2s[probe].sjis;
          return TRUE;
       }

       // otherwise discard the half that cannot contain u
       if ( candidate < u )
          first = probe + 1;
       else
          last = probe - 1;

    } while ( first <= last );

    // u is not in the table; sjis is left untouched
    return FALSE;
 }

 //
 // Convert a single Unicode character (in argument wc) into SJIS multibyte
 // format. The result is placed into the sjis argument. The number of SJIS
 // bytes is returned by the function. If the Unicode character is not
 // mappable to SJIS, the function returns 0 when allowInvalidCodePoint is
 // FALSE; otherwise it stores '?' and returns 1.
 //
 Int32 unicodeToSjisChar(NAWchar wc, unsigned char *sjis, NABoolean allowInvalidCodePoint)
 {
    // Unicode [0, 0x007F] maps onto the identical single-byte
    // SJIS values [0, 0x7F].
    if ( in_range(wc, 0, 0x007F) ) {
       sjis[0] = (unsigned char)wc;
       return 1;
    }

    // Unicode [0xFF61, 0xFF9F] maps onto the single-byte SJIS
    // block [0xA1, 0xDF].
    if ( in_range(wc, 0xFF61, 0xFF9F) ) {
       sjis[0] = (unsigned char)(wc - 0xFF61 + 0xA1);
       return 1;
    }

    // Anything else has to come from the double-byte table. The lead byte
    // is kept in the high 8 bits of the table value, the trail byte in the
    // low 8 bits.
    NAWchar doubleByte;
    if ( binarySearchU2STable(wc, doubleByte) && doubleByte >= 0x8140 ) {
       sjis[0] = (unsigned char)((doubleByte >> 8) & 0xFF);
       sjis[1] = (unsigned char)(doubleByte & 0xFF);
       return 2;
    }

    // No SJIS equivalent: substitute '?' if the caller tolerates invalid
    // code points, otherwise report failure.
    if ( allowInvalidCodePoint != FALSE ) {
       sjis[0] = '?';
       return 1;
    }

    return 0;
 }

 // Unicode to SJIS conversion.
 //
 // This function takes a Unicode string and returns its SJIS equivalent.
 // The optional sjisString argument holds the buffer into which the SJIS string
 // will be stored. In case the argument is NULL or it is not big enough,
 // the function allocates memory from the heap (if the heap pointer is not NULL),
 // or from the C run-time system heap.
 // If the memory allocation fails, the function returns 0. If any illegal
 // characters are encountered, the function also returns 0.
 //
 charBuf* unicodeToSjis(const NAWcharBuf& unicodeString, CollHeap *heap,
                        charBuf*& sjisString, NABoolean addNullAtEnd,
                        NABoolean allowInvalidCodePoint)
 {
    Int32 sourceLen = unicodeString.getStrLen();
    NAWchar* source = unicodeString.data();

    // The output SJIS string is sized at BYTES_PER_NAWCHAR bytes per source
    // character (unicodeToSjisChar() emits 1 or 2 bytes for each one).
    // An extra char may be added depending on addNullAtEnd.
    charBuf* output = checkSpace(heap, BYTES_PER_NAWCHAR*sourceLen, sjisString, addNullAtEnd);

    if ( output == 0 )
       return 0;

    unsigned char* base = (unsigned char*) (output -> data());
    unsigned char* target = base;

    Int32 i = 0;
    while ( i < sourceLen ) {

       // convert one character directly into the output buffer
       Int32 ct = unicodeToSjisChar(source[i], target, allowInvalidCodePoint);

       if ( ct == 0 ) {
          // Unmappable character: give up, releasing the buffer only
          // when the caller did not supply one.
          if ( sjisString == NULL ) {
             if (heap)
               NADELETE(output, charBuf, heap);
             else
               delete output;
          }
          return NULL;
       }

       target += ct;
       i++;
    }

    Int32 finalLength = target - base;

    if ( addNullAtEnd == TRUE )
       (output -> data())[finalLength] = 0;

    output -> setStrLen(finalLength);
    return output;
 }

 #ifdef MODULE_DEBUG

 //
 // testing: dumping all Unicode codes for Sjis chars.
 // Use the tool /MiscVOB/i18n/verify.pl to see the
 // difference between the dumping and mx_sjis.txt.l2u.
 // Should only see complaining about 0xFFFF.
 //

 // Lead bytes exercised by the double-byte test loops below. The values
 // enumerate the ranges 0x81-0x84, 0x87-0x9F, 0xE0-0xEA, 0xED-0xEE and
 // 0xFA-0xFC, i.e. the same lead-byte ranges sjisToUnicode() recognizes.
 static
 Int32 leadbyte[] = {
         0x81, 0x82, 0x83, 0x84, 0x87, 0x88,
         0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
         0x8f,
         0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
         0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d,
         0x9e, 0x9f,
         0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
         0xe7, 0xe8, 0xe9, 0xea, 0xed, 0xee, 0xfa,
         0xfb, 0xfc
    };

 void sjisDump()
 {
    charBuf sjis(2);
    NAWcharBuf* unicode = new NAWcharBuf(10);

    for ( Int32 i=0; i<0xFF; i++ ) {
       sjis.data()[0] = i;
       sjis.setLength(1);

       unicode = sjisToUnicode(sjis, 0, unicode);
       if ( unicode )
          printf("0x%X  0x%X\n", i, unicode->data()[0]);
    }

    for ( i = 0; i<sizeof(leadbyte)/sizeof(Int32); i++ )
       for (Int32 j=0x40; j<=0xFF; j++ ) {
          sjis.data()[0] = leadbyte[i];
          sjis.data()[1] = j;
          sjis.setLength(2);

          unicode = sjisToUnicode(sjis, 0, unicode);
          if ( unicode )
             printf("0x%X%X  0x%X\n", leadbyte[i], j, unicode->data()[0]);
       }

    delete unicode;
 }

 // Roundtrip verification.
 //
 // Perform a round-trip conversion testing for each valid SJIS.
 //
 // should not see any warnings.
 //
 void sjis2Unicode2sjis()
 {
    printf("SJIS round-trip testing: ");

    charBuf sjis(2);
    NAWcharBuf* unicode = new NAWcharBuf(10);
    charBuf* remapped_sjis = new charBuf(10);

    // do the test for single-byte chars
    for ( Int32 i=0; i<0xDF; i++ ) {

       // skip non-SJIS chars
       if ( in_range(i, 0x80, 0xA0) )
          continue;

       sjis.data()[0] = i;
       sjis.setLength(1);

       unicode = sjisToUnicode(sjis, 0, unicode);

       if ( unicode == NULL ) {
          printf("SJIS to Unicode mapping failed: 0x%X.\n", i);
          return;
       }

       remapped_sjis = unicodeToSjis(*unicode, 0, remapped_sjis);

       if ( remapped_sjis == NULL ) {
         printf("Unicode 0x%X can not be mapped to SJIS\n",
                unicode->data()[0]);
         return;
       }

       if ( remapped_sjis->data()[0] != sjis.data()[0] ) {
         printf("Roundtrip mapping failed: 0x%X  0x%X\n",
                sjis.data()[0], remapped_sjis->data()[0]
               );
         return;
       }
    }

    // do the test for double-byte chars
    for ( i = 0; i<sizeof(leadbyte)/sizeof(Int32); i++ )
       for (Int32 j=0x40; j<=0xFF; j++ ) {

          sjis.data()[0] = leadbyte[i];
          sjis.data()[1] = j;
          sjis.setLength(2);

          unicode = sjisToUnicode(sjis, 0, unicode);
          if ( unicode == NULL ) {
             continue; // skip non-exist chars
          }

          remapped_sjis = unicodeToSjis(*unicode, 0, remapped_sjis);

          if ( remapped_sjis == NULL ) {
             printf("Unicode 0x%X can not be mapped to SJIS\n",
                    unicode->data()[0]);
             return;
          }

          if ( remapped_sjis->data()[0] != sjis.data()[0] ||
               remapped_sjis->data()[1] != sjis.data()[1]
             ) {
            printf("Roundtrip mapping failed: sjis=0x%X%X  unicode=0x%X, remapped_sjis=0x%X%X\n",
                   sjis.data()[0], sjis.data()[1],
                   unicode->data()[0],
                   remapped_sjis->data()[0], remapped_sjis->data()[1]
                  );
            return;
          }
       }

    delete unicode;
    delete remapped_sjis;

    printf("OK\n");
 }

 //
 // Testing: assure all SJIS-mappable Unicode chars can be mapped
 // to SJIS. Should not see any "Bad code ..." output.
 //
 void UCS2ToSjis()
 {
    printf("Unicode to SJIS testing: ");

    NAWchar sjisChar;
    NAWcharBuf unicode(1);
    charBuf* sjis = new charBuf(10);

    Int32 n = sizeof(array_u2s)/sizeof(Unicode2SjisMapT);
    for ( Int32 i=0; i<n; i++ )
    {
       unicode.data()[0] = array_u2s[i].Unicode;
       unicodeToSjis(unicode, 0, sjis);

       switch (sjis->length()) {
          case 2:
             sjisChar = (NAWchar)(sjis->data()[1]);
             sjisChar |= (NAWchar)((sjis->data()[0] <<8));
             break;
          default:
             printf("Bad code 0x%X\n", array_u2s[i].Unicode);
             return;
       }
       if ( sjisChar != array_u2s[i].sjis ) {
          printf("Bad code 0x%x for 0x%X\n", sjisChar, array_u2s[i].Unicode);
          return;
       }
    }

    for ( i=0; i<=0x7F; i++ )
    {
      unicode.data()[0] = (NAWchar)i;
      unicodeToSjis(unicode, 0, sjis);

      switch (sjis->length()) {
         case 1:
            sjisChar = (NAWchar)sjis->data()[0];
            break;
         default:
            printf("Bad code 0x%X\n", i);
            return;
      }
      if ( sjisChar != i ) {
         printf("Bad code 0x%x for 0x%X\n", sjisChar, i);
         return;
      }
    }

    for ( i=0xFF61; i<=0xFF9F; i++ )
    {
      unicode.data()[0] = (NAWchar)i;
      unicodeToSjis(unicode, 0, sjis);

      switch (sjis->length()) {
         case 1:
            sjisChar = (NAWchar)sjis->data()[0];
            break;
         default:
            printf("Bad code 0x%X\n", i);
            return;
      }
      if ( sjisChar != i - 0xFF61 + 0xA1 ) {
         printf("Bad code 0x%x for 0x%X\n", sjisChar, i);
         return;
      }
    }
    printf("OK\n");
 }

 //
 // Define MODULE_DEBUG at the beginning of this file and select one
 // of the following to test.  Build the test app with the command:
 //
 // cl /nologo /Zp4 /W3 /GX /Od /MDd /D "_DEBUG" /D "NA_WINNT" /Z7 \
 // -o a.exe conversionsjis.cpp
 //
 // Entry point of the stand-alone test program (built only when
 // MODULE_DEBUG is defined). argc and argv are not used.
 Int32 main(Int32 argc, char** argv)
 {
    //sjisDump();
    sjis2Unicode2sjis();   // SJIS -> Unicode -> SJIS round trip
    UCS2ToSjis();          // Unicode -> SJIS coverage
    return 0;
 }

 #endif
	/**********************************************************************
	// @@@ START COPYRIGHT @@@
	//
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.
	//
	// @@@ END COPYRIGHT @@@
	**********************************************************************/
	/* -*-C++-*-
	*****************************************************************************
	*
	* File: conversionSJIS.cpp
	* RCS: $Id:
	* Description: The implementation of SJIS related conversion routines
	*
	*
	* Created: 7/8/98
	* Modified: $ $Date: 1998/08/10 16:00:50 $ (GMT)
	* Language: C++
	* Status: $State: Exp $
	*
	*
	*
	*
	*****************************************************************************
	*/
	// define MODULE_DEBUG when the module is to be debugged separately.
	//#define MODULE_DEBUG

	#include "NLSConversion.h"
	#include "nawstring.h"


	#ifdef MODULE_DEBUG
	#include "stringBuf.cpp"
	#endif


	///////////////////////////////////////////////////////////////////////
	// Standard SJIS code point ranges based on the book "Understanding
	// Japanese Information Processing" Table 4-9. pp. 73 and the massaged
	// SJIS Unicode mapping table. The original table is available from
	// unicode.org.
	///////////////////////////////////////////////////////////////////////

	// Returns 1 when x lies in the closed interval [lower, upper], 0 otherwise.
	inline Int32 in_range(Int32 x, Int32 lower, Int32 upper)
	{
	   if ( x < lower || upper < x )
	      return 0;

	   return 1;
	}

	// Tests for the 1st block of single-byte SJIS characters, [0x00, 0x7F]
	#define isSingleByteSJIS1stBlock(x) in_range(x, 0x00, 0x7F)

	// Tests for the 2nd block of single-byte SJIS characters, [0xA1, 0xDF]
	#define isSingleByteSJIS2ndBlock(x) in_range(x, 0xA1, 0xDF)

	// An inclusive range [lower, upper] of byte values. The tables built from
	// this type in sjisToUnicode() classify the lead and trail bytes of
	// double-byte SJIS characters.
	typedef struct SJISCodeBound {
	unsigned char lower;   // smallest byte value in the range
	unsigned char upper;   // largest byte value in the range
	} SJISCodeBoundT;

	// Converts an SJIS byte string to its Unicode (NAWchar) equivalent.
	//
	//   sjisString    the SJIS input; getStrLen() bytes are converted.
	//   heap          handed to checkSpace() when the output buffer is obtained,
	//                 and used to release that buffer if the conversion fails.
	//   result        optional caller-supplied output buffer, handed to
	//                 checkSpace(). In case it is NULL or the buffer it points at
	//                 is not big enough, memory is allocated from the heap (if
	//                 the heap pointer is not NULL) or from the C run-time
	//                 system heap.
	//   addNullAtEnd  when TRUE, a 0 code unit is stored after the last
	//                 converted character.
	//
	// Returns the buffer holding the converted string. Returns 0 if no buffer
	// can be obtained, or if the input contains a truncated, illegal or unmapped
	// SJIS character. In the latter case a buffer obtained while result was NULL
	// is released before returning (the same clean-up unicodeToSjis() performs),
	// so the failure path does not leak it.
	//
	NAWcharBuf*
	sjisToUnicode(const charBuf& sjisString, CollHeap *heap,
	              NAWcharBuf*& result, NABoolean addNullAtEnd)
	{

	//
	// These arrays are generated by the script /MiscVOB/i18n/sjis.ksh
	// and are copied into /sqlvob4/common. The seed file is
	// /MiscVOB/i18n/mx_sjis.txt which defines the Sjis to Unicode mapping.
	// Whenever a change is made to mx_sjis.txt, these arrays have to be
	// regenerated by running sh sjis.ksh.
	//
	// Each of them (array_x for x in 0 to 4) represents the
	// Unicode code values corresponding to blocks of SJIS code in a particular
	// range. The content of each array is included from a .h file whose name
	// contains two hex numbers that define the SJIS char range the array
	// represents. Note the range is relevant to the leading byte of the SJIS
	// characters only. For example, sjis_81_84.h holds all Unicode chars
	// mappable from SJIS with a leading byte in the range from 0x81 to 0x84.
	//
	// The trailing bytes of SJIS characters in each range run continuously
	// in strict ascending order, from 0x40 to 0xFC, except the "gap" characters
	// at 0x7F. Such gap characters are excluded from these arrays because they
	// are not in SJIS.
	//
	// A few filler non-Unicode characters (0xFFFF) are purposely inserted in
	// these arrays to represent any un-defined characters in SJIS. These fillers
	// are useful in making fast algorithmic conversion possible through code
	// point value manipulation. No filler character will be returned however.
	//
	// In each included file generated by the tool sjis.ksh, each line is in the
	// format
	//     { /* SJIS code value */  Unicode code value }, /* remark */
	//
	// For example
	//     { /* 0x8140 */ 0x3000 }, /* IDEOGRAPHIC SPACE */
	//
	   static const NAWchar array_0[] = {
	   #include "sjis_81_84.h"
	   };

	   static const NAWchar array_1[] = {
	   #include "sjis_87_9f.h"
	   };

	   static const NAWchar array_2[] = {
	   #include "sjis_e0_ea.h"
	   };

	   static const NAWchar array_3[] = {
	   #include "sjis_ed_ee.h"
	   };

	   static const NAWchar array_4[] = {
	   #include "sjis_fa_fc.h"
	   };

	   // ranges determined by the leading byte; entry k corresponds to array_k
	   static const SJISCodeBoundT SJISLeadByteBounds[] =
	   {
	      {0x81, 0x84}, { 0x87, 0x9F}, {0xE0, 0xEA}, {0xED, 0xEE}, {0xFA, 0xFC},
	      {0x0, 0xFF } // catch all
	   };

	   // blocks determined by the trailing byte
	   static const SJISCodeBoundT SJISTrailByteBounds[] =
	   {
	      {0x40, 0x7E}, {0x80, 0xFC},
	      {0x0, 0xFF } // catch all
	   };


	   unsigned char* source = sjisString.data();
	   Int32 sourceLen = sjisString.getStrLen();

	   // the output Unicode string will have at most sourceLen
	   // characters. An extra char may be added depending on addNullAtEnd.
	   NAWcharBuf* output = checkSpace(heap, sourceLen, result, addNullAtEnd);

	   if ( output == 0 )
	      return 0;

	   NAWchar *base, *target;
	   base = target = output -> data();

	   unsigned char c, d;
	   NAWchar u = 0;
	   NABoolean legal = TRUE;   // becomes FALSE as soon as a bad character is seen
	   Int32 i=0;

	   while ( i < sourceLen ) {

	      c = source[i++];

	      if ( isSingleByteSJIS1stBlock(c) )
	         u = (NAWchar)c;                 // found in the first single-byte block
	      else
	      if ( isSingleByteSJIS2ndBlock(c) )
	         u = (NAWchar)c - 0xA1 + 0xFF61; // found in the 2nd single-byte block
	      else {

	         // c starts a double-byte character but the second byte does not exist!
	         if ( i == sourceLen ) {
	            legal = FALSE;
	            break;
	         }

	         // get the trailing byte
	         d = source[i++];

	         // is d a bad trailing byte?
	         if ( d == 0x7F || d == 0xFD || d == 0xFE || d == 0xFF ) {
	            legal = FALSE;
	            break;
	         }

	         Int32 lead, trail;

	         // find the range in which c is in; lead ends up as 5 when c
	         // belongs to none of the five ranges backed by an array.
	         for ( lead = 0; lead<=4; lead++ )
	            if ( in_range(c, SJISLeadByteBounds[lead].lower,
	                             SJISLeadByteBounds[lead].upper))
	               break;

	         // find the first or the second block for d; the catch-all entry
	         // (trail == 2) matches only when d is in neither block.
	         for (trail=0; trail<=2; trail++ )
	            if ( in_range(d, SJISTrailByteBounds[trail].lower,
	                             SJISTrailByteBounds[trail].upper ))
	               break;

	         // out of range of SJIS legal code values
	         if ( (c == 0xEA && d >= 0xA4) ||
	              (c == 0xFC && d >= 0x4C) ||
	              (lead == 5 ) || (trail == 2)
	            )
	         {
	            legal = FALSE;
	            break;
	         }


	         // Each chunk (all chars with identical lead byte) spans the
	         // trailing bytes 0x40 to 0xFF, i.e. 16 * 12 = 192 values. But
	         // because of the missing ones at 0x7F, 0xFD, 0xFE and 0xFF
	         // trailing byte, each chunk contains only 192 - 4 = 188 chars.
	         //
	         // In addition, a char with a trailing byte greater than 0x7F
	         // (say, d) should be mapped to the entry with the index (d-1)
	         // instead of (d) in the right chunk.
	         // We do this by using the expression "- trail".
	         switch (lead) {
	            case 0:
	               u = array_0[(c-0x81) * 188 + d - 0x40 - trail];
	               break;
	            case 1:
	               u = array_1[(c-0x87) * 188 + d - 0x40 - trail];
	               break;
	            case 2:
	               u = array_2[(c-0xe0) * 188 + d - 0x40 - trail];
	               break;
	            case 3:
	               u = array_3[(c-0xed) * 188 + d - 0x40 - trail];
	               break;
	            default:
	               u = array_4[(c-0xfa) * 188 + d - 0x40 - trail];
	         }
	      }

	      if ( u == 0xFFFF ) { // filler chars are not defined in SJIS standard
	         legal = FALSE;
	         break;
	      }

	      *target = u;
	      target++;
	   }

	   if ( legal == FALSE ) {
	      // Mirror unicodeToSjis(): release the output buffer only when the
	      // caller did not supply one.
	      if ( result == NULL ) {
	         if (heap)
	            NADELETE(output, NAWcharBuf, heap);
	         else
	            delete output;
	      }
	      return 0;
	   }

	   Int32 finalLength = target-base;

	   if ( addNullAtEnd == TRUE )
	      (output -> data())[finalLength] = 0;

	   output -> setStrLen(finalLength);
	   return output;
	}

	// One row of the Unicode to SJIS lookup table searched by
	// binarySearchU2STable().
	typedef struct Unicode2SjisMap
	{
	NAWchar Unicode;   // Unicode code value (the search key)
	NAWchar sjis;      // SJIS code value; lead byte in the high 8 bits
	} Unicode2SjisMapT;

	//
	// Uses binary search to find the SJIS code for the Unicode character
	// contained in argument u. The SJIS code is returned in the argument sjis. The
	// function returns TRUE if the conversion is successful, FALSE otherwise.
	//
	// The function only returns SJIS codes that are double-byte. Hence it is static.
	//
	static
	NABoolean binarySearchU2STable(NAWchar u, NAWchar& sjis)
	{
	   //
	   // The Unicode to SJIS mapping table. Each line in the include file
	   // is in the format
	   //
	   //    { Unicode_code_value, SJIS_code_value }, /* remark */
	   //
	   // The Unicode_code_value column is assumed to be in strict ascending
	   // order; the search below relies on it.
	   //
	   static const Unicode2SjisMapT array_u2s[] = {
	      #include "sjis_from_ucs2.h"
	   };

	   Int32 first = 0;
	   Int32 last = sizeof(array_u2s)/sizeof(Unicode2SjisMapT) - 1;
	   Int32 probe = 0;

	   do {
	      probe = (first + last) / 2;

	      const NAWchar candidate = array_u2s[probe].Unicode;

	      // exact hit: hand back the SJIS code stored next to the key
	      if ( candidate == u ) {
	         sjis = array_u2s[probe].sjis;
	         return TRUE;
	      }

	      // otherwise discard the half that cannot contain u
	      if ( candidate < u )
	         first = probe + 1;
	      else
	         last = probe - 1;

	   } while ( first <= last );

	   // u is not in the table; sjis is left untouched
	   return FALSE;
	}

	//
	// Convert a single Unicode character (in argument wc) into SJIS multibyte
	// format. The result is placed into the sjis argument. The number of SJIS
	// bytes is returned by the function. If the Unicode character is not
	// mappable to SJIS, the function returns 0 when allowInvalidCodePoint is
	// FALSE; otherwise it stores '?' and returns 1.
	//
	Int32 unicodeToSjisChar(NAWchar wc, unsigned char *sjis, NABoolean allowInvalidCodePoint)
	{
	   // Unicode [0, 0x007F] maps onto the identical single-byte
	   // SJIS values [0, 0x7F].
	   if ( in_range(wc, 0, 0x007F) ) {
	      sjis[0] = (unsigned char)wc;
	      return 1;
	   }

	   // Unicode [0xFF61, 0xFF9F] maps onto the single-byte SJIS
	   // block [0xA1, 0xDF].
	   if ( in_range(wc, 0xFF61, 0xFF9F) ) {
	      sjis[0] = (unsigned char)(wc - 0xFF61 + 0xA1);
	      return 1;
	   }

	   // Anything else has to come from the double-byte table. The lead byte
	   // is kept in the high 8 bits of the table value, the trail byte in the
	   // low 8 bits.
	   NAWchar doubleByte;
	   if ( binarySearchU2STable(wc, doubleByte) && doubleByte >= 0x8140 ) {
	      sjis[0] = (unsigned char)((doubleByte >> 8) & 0xFF);
	      sjis[1] = (unsigned char)(doubleByte & 0xFF);
	      return 2;
	   }

	   // No SJIS equivalent: substitute '?' if the caller tolerates invalid
	   // code points, otherwise report failure.
	   if ( allowInvalidCodePoint != FALSE ) {
	      sjis[0] = '?';
	      return 1;
	   }

	   return 0;
	}

	// Unicode to SJIS conversion.
	//
	// This function takes a Unicode string and returns its SJIS equivalent.
	// The optional sjisString argument holds the buffer into which the SJIS string
	// will be stored. In case the argument is NULL or it is not big enough,
	// the function allocates memory from the heap (if the heap pointer is not NULL),
	// or from the C run-time system heap.
	// If the memory allocation fails, the function returns 0. If any illegal
	// characters are encountered, the function also returns 0.
	//
	charBuf* unicodeToSjis(const NAWcharBuf& unicodeString, CollHeap *heap,
	                       charBuf*& sjisString, NABoolean addNullAtEnd,
	                       NABoolean allowInvalidCodePoint)
	{
	   Int32 sourceLen = unicodeString.getStrLen();
	   NAWchar* source = unicodeString.data();

	   // The output SJIS string is sized at BYTES_PER_NAWCHAR bytes per source
	   // character (unicodeToSjisChar() emits 1 or 2 bytes for each one).
	   // An extra char may be added depending on addNullAtEnd.
	   charBuf* output = checkSpace(heap, BYTES_PER_NAWCHAR*sourceLen, sjisString, addNullAtEnd);

	   if ( output == 0 )
	      return 0;

	   // base and target are pointers into the output buffer: base marks its
	   // start, target the position of the next byte to be written.
	   unsigned char* base = (unsigned char*) (output -> data());
	   unsigned char* target = base;

	   Int32 i = 0;
	   while ( i < sourceLen ) {

	      // convert one character directly into the output buffer
	      Int32 ct = unicodeToSjisChar(source[i], target, allowInvalidCodePoint);

	      if ( ct == 0 ) {
	         // Unmappable character: give up, releasing the buffer only
	         // when the caller did not supply one.
	         if ( sjisString == NULL ) {
	            if (heap)
	              NADELETE(output, charBuf, heap);
	            else
	              delete output;
	         }
	         return NULL;
	      }

	      target += ct;
	      i++;
	   }

	   Int32 finalLength = target - base;

	   if ( addNullAtEnd == TRUE )
	      (output -> data())[finalLength] = 0;

	   output -> setStrLen(finalLength);
	   return output;
	}

	#ifdef MODULE_DEBUG

	//
	// testing: dumping all Unicode codes for Sjis chars.
	// Use the tool /MiscVOB/i18n/verify.pl to see the
	// difference between the dumping and mx_sjis.txt.l2u.
	// Should only see complaining about 0xFFFF.
	//

	// Lead bytes exercised by the double-byte test loops below. The values
	// enumerate the ranges 0x81-0x84, 0x87-0x9F, 0xE0-0xEA, 0xED-0xEE and
	// 0xFA-0xFC, i.e. the same lead-byte ranges sjisToUnicode() recognizes.
	static
	Int32 leadbyte[] = {
	0x81, 0x82, 0x83, 0x84, 0x87, 0x88,
	0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
	0x8f,
	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
	0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d,
	0x9e, 0x9f,
	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
	0xe7, 0xe8, 0xe9, 0xea, 0xed, 0xee, 0xfa,
	0xfb, 0xfc
	};

	void sjisDump()
	{
	charBuf sjis(2);
	NAWcharBuf* unicode = new NAWcharBuf(10);

	for ( Int32 i=0; i<0xFF; i++ ) {
	sjis.data()[0] = i;
	sjis.setLength(1);

	unicode = sjisToUnicode(sjis, 0, unicode);
	if ( unicode )
	printf("0x%X 0x%X\n", i, unicode->data()[0]);
	}

	for ( i = 0; i<sizeof(leadbyte)/sizeof(Int32); i++ )
	for (Int32 j=0x40; j<=0xFF; j++ ) {
	sjis.data()[0] = leadbyte[i];
	sjis.data()[1] = j;
	sjis.setLength(2);

	unicode = sjisToUnicode(sjis, 0, unicode);
	if ( unicode )
	printf("0x%X%X 0x%X\n", leadbyte[i], j, unicode->data()[0]);
	}

	delete unicode;
	}

	// Roundtrip verification.
	//
	// Perform a round-trip conversion testing for each valid SJIS.
	//
	// should not see any warnings.
	//
	void sjis2Unicode2sjis()
	{
	printf("SJIS round-trip testing: ");

	charBuf sjis(2);
	NAWcharBuf* unicode = new NAWcharBuf(10);
	charBuf* remapped_sjis = new charBuf(10);

	// do the test for single-byte chars
	for ( Int32 i=0; i<0xDF; i++ ) {

	// skip non-SJIS chars
	if ( in_range(i, 0x80, 0xA0) )
	continue;

	sjis.data()[0] = i;
	sjis.setLength(1);

	unicode = sjisToUnicode(sjis, 0, unicode);

	if ( unicode == NULL ) {
	printf("SJIS to Unicode mapping failed: 0x%X.\n", i);
	return;
	}

	remapped_sjis = unicodeToSjis(*unicode, 0, remapped_sjis);

	if ( remapped_sjis == NULL ) {
	printf("Unicode 0x%X can not be mapped to SJIS\n",
	unicode->data()[0]);
	return;
	}

	if ( remapped_sjis->data()[0] != sjis.data()[0] ) {
	printf("Roundtrip mapping failed: 0x%X 0x%X\n",
	sjis.data()[0], remapped_sjis->data()[0]
	);
	return;
	}
	}

	// do the test for double-byte chars
	for ( i = 0; i<sizeof(leadbyte)/sizeof(Int32); i++ )
	for (Int32 j=0x40; j<=0xFF; j++ ) {

	sjis.data()[0] = leadbyte[i];
	sjis.data()[1] = j;
	sjis.setLength(2);

	unicode = sjisToUnicode(sjis, 0, unicode);
	if ( unicode == NULL ) {
	continue; // skip non-exist chars
	}

	remapped_sjis = unicodeToSjis(*unicode, 0, remapped_sjis);

	if ( remapped_sjis == NULL ) {
	printf("Unicode 0x%X can not be mapped to SJIS\n",
	unicode->data()[0]);
	return;
	}

	if ( remapped_sjis->data()[0] != sjis.data()[0] \|\|
	remapped_sjis->data()[1] != sjis.data()[1]
	) {
	printf("Roundtrip mapping failed: sjis=0x%X%X unicode=0x%X, remapped_sjis=0x%X%X\n",
	sjis.data()[0], sjis.data()[1],
	unicode->data()[0],
	remapped_sjis->data()[0], remapped_sjis->data()[1]
	);
	return;
	}
	}

	delete unicode;
	delete remapped_sjis;

	printf("OK\n");
	}

	//
	// Testing: assure all SJIS-mappable Unicode chars can be mapped
	// to SJIS. Should not see any "Bad code ..." output.
	//
	void UCS2ToSjis()
	{
	printf("Unicode to SJIS testing: ");

	NAWchar sjisChar;
	NAWcharBuf unicode(1);
	charBuf* sjis = new charBuf(10);

	Int32 n = sizeof(array_u2s)/sizeof(Unicode2SjisMapT);
	for ( Int32 i=0; i<n; i++ )
	{
	unicode.data()[0] = array_u2s[i].Unicode;
	unicodeToSjis(unicode, 0, sjis);

	switch (sjis->length()) {
	case 2:
	sjisChar = (NAWchar)(sjis->data()[1]);
	sjisChar \|= (NAWchar)((sjis->data()[0] <<8));
	break;
	default:
	printf("Bad code 0x%X\n", array_u2s[i].Unicode);
	return;
	}
	if ( sjisChar != array_u2s[i].sjis ) {
	printf("Bad code 0x%x for 0x%X\n", sjisChar, array_u2s[i].Unicode);
	return;
	}
	}

	for ( i=0; i<=0x7F; i++ )
	{
	unicode.data()[0] = (NAWchar)i;
	unicodeToSjis(unicode, 0, sjis);

	switch (sjis->length()) {
	case 1:
	sjisChar = (NAWchar)sjis->data()[0];
	break;
	default:
	printf("Bad code 0x%X\n", i);
	return;
	}
	if ( sjisChar != i ) {
	printf("Bad code 0x%x for 0x%X\n", sjisChar, i);
	return;
	}
	}

	for ( i=0xFF61; i<=0xFF9F; i++ )
	{
	unicode.data()[0] = (NAWchar)i;
	unicodeToSjis(unicode, 0, sjis);

	switch (sjis->length()) {
	case 1:
	sjisChar = (NAWchar)sjis->data()[0];
	break;
	default:
	printf("Bad code 0x%X\n", i);
	return;
	}
	if ( sjisChar != i - 0xFF61 + 0xA1 ) {
	printf("Bad code 0x%x for 0x%X\n", sjisChar, i);
	return;
	}
	}
	printf("OK\n");
	}

	//
	// Define MODULE_DEBUG at the beginning of this file and select one
	// of the following to test. Build the test app with the command:
	//
	// cl /nologo /Zp4 /W3 /GX /Od /MDd /D "_DEBUG" /D "NA_WINNT" /Z7 \
	// -o a.exe conversionsjis.cpp
	//
	// Entry point of the stand-alone test program (built only when
	// MODULE_DEBUG is defined). argc and argv are not used.
	Int32 main(Int32 argc, char** argv)
	{
	//sjisDump();
	sjis2Unicode2sjis();   // SJIS -> Unicode -> SJIS round trip
	UCS2ToSjis();          // Unicode -> SJIS coverage
	return 0;
	}

	#endif