blob: 3d456c6599d2b8632e7398fb58dc34d3e6ba20b0 [file] [log] [blame]
////////////////////////////////////////////////////////////////////////////////
//
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////
/*
CharConv.cpp
Author: Sudhakar Pandey <sudhakar@adobe.com>
*/
#include <iostream>
#include <cstring>
#include <stdexcept>
#include "EncConv.h"
extern "C"
{
#include "unicode/ucnv.h" //Adding this as an alternative to native iconv library
#include "unicode/ustring.h" //Adding this as an alternative to native iconv library
}
#ifndef MAX
#define MAX(a,b) (((a)>(b))?(a):(b))
#endif /* MAX */
// =============================================================================
namespace EncConv
{
// -----------------------------------------------------------------------------
/**
 * Returns a newly allocated copy of a UTF-16 string in which typographic
 * quotation marks are replaced by their plain ASCII counterparts:
 *   U+2018 / U+2019 (left/right single quotation mark) -> '\''
 *   U+201C / U+201D (left/right double quotation mark) -> '"'
 * Every other code unit is copied unchanged.
 *
 * @param src  Source UTF-16 string; its length is taken from GetNumOfUnits(src)
 *             (presumably the number of code units before the terminator).
 * @return     A zero-terminated buffer allocated with new[]. The caller owns
 *             it and must free it, e.g. with releaseU16Str().
 */
U16Char_t* convSpecialCharsInU16Str(const U16Char_t* src)
{
    const LM_UInt32 unitCount = GetNumOfUnits(src);
    U16Char_t* const result = new U16Char_t [unitCount + 1];

    for (LM_UInt32 pos = 0; pos < unitCount; ++pos)
    {
        const U16Char_t unit = src[pos];
        if (unit == 0x2018 || unit == 0x2019)
        {
            // Left / right single quotation mark.
            result[pos] = '\'';
        }
        else if (unit == 0x201C || unit == 0x201D)
        {
            // Left / right double quotation mark.
            result[pos] = '"';
        }
        else
        {
            result[pos] = unit;
        }
    }

    // Terminate the copy.
    result[unitCount] = 0;
    return result;
}
/**
 * Translates an encoding name into the spelling used when opening a converter
 * (the result is what this file hands to ucnv_open()).
 *
 * @param enc  Encoding name, e.g. "ISO8859-1". May be NULL or empty, in which
 *             case the default "ISO-8859-1" is returned.
 * @return     For a known alias, a pointer to a string literal holding the
 *             mapped name. For any other non-empty name, `enc` itself is
 *             returned unchanged, so the result is then only valid for as
 *             long as the caller's string is.
 */
const char * getPlatformEncoding(const char* enc)
{
    // A null or empty name means "not specified": use the default encoding.
    // (The null test must come first; strlen/strcmp on NULL is undefined.)
    if (enc == NULL || enc[0] == '\0')
        return "ISO-8859-1";

    struct EncodingAlias
    {
        const char* name;          // name as supplied by callers
        const char* platformName;  // name handed to the converter
    };
    static const EncodingAlias aliases[] =
    {
        { "ISO8859-1",        "ISO-8859-1"  },
        { "ISO8859-2",        "ISO-8859-2"  },
        { "ISO8859-3",        "ISO-8859-3"  },
        { "ISO8859-4",        "ISO-8859-4"  },
        { "ISO8859-5",        "ISO-8859-5"  },
        { "ISO8859-6",        "ISO-8859-6"  },
        { "ISO8859-7",        "ISO-8859-7"  },
        { "ISO8859-8",        "ISO-8859-8"  },
        { "ISO8859-9",        "ISO-8859-9"  },
        { "ISO8859-10",       "ISO-8859-10" },
        { "KOI8-R",           "KOI8-R"      },
        { "KOI8-U",           "KOI8-U"      },
        { "microsoft-cp1251", "cp1251"      },
        { "ISO8859-13",       "ISO-8859-13" },
        { "ISO8859-14",       "ISO-8859-14" },
        { "ISO8859-15",       "ISO-8859-15" },
        { "ISCII-DEVANAGARI", "ibm-1137"    },
        { "TIS620-2533",      "TIS-620"     },
        { "UTF-8",            "UTF-8"       }
    };
    const std::size_t aliasCount = sizeof(aliases) / sizeof(aliases[0]);

    for (std::size_t i = 0; i < aliasCount; ++i)
    {
        if (std::strcmp(enc, aliases[i].name) == 0)
            return aliases[i].platformName;
    }

    // Unknown name: pass it through untouched and let the converter decide.
    return enc;
}
const std::string convU16StrToCharStr(const U16Char_t* src, const char* Encoding)
{
//static char const* const tocode = CHARCONV_ICONV_UTF16;
char const* const tocode = getPlatformEncoding(Encoding);
UErrorCode status = U_ZERO_ERROR;
#ifdef ENCCONV_DEBUG
std::cout << "\t" "convString" << std::endl;
std::cout << "\t\t" "tocode = " << tocode << std::endl;
//std::cout << "\t\t" "fromcode = " << fromcode << std::endl;
#endif
//iconv_t cd = iconv_open(tocode, fromcode);
// Initializing ICU converter
UConverter *conv = ucnv_open(tocode, &status);
#ifdef CHARCONV_DEBUG
std::cout << "\t\t" "aft ucnv_open: status = " << status << std::endl;
#endif
if (conv == NULL)
{ // try default encoding "ISO-8859-1"
//throw std::runtime_error("Unable to create Unicode converter object");
status = U_ZERO_ERROR;
conv = ucnv_open("ISO-8859-1", &status);
}
//still if conv is null simply return blank string
if (conv == NULL)
{
return std::string("");
}
U16Char_t const* srcWrk = src;
const size_t srcSizeInUnits = GetNumOfUnits(src);
const size_t srcSizeInBytes = srcSizeInUnits * sizeof(U16Char_t);
const size_t dstSizeInBytes = MAX(256, (srcSizeInUnits + 1)) * 4; // How much byte buffer is needed? (UTF16 --> MBCS)
char* dst = new char [dstSizeInBytes];
if(dst==NULL)
{
//Fix for #3211945
ucnv_close(conv);
return std::string("");
}
char* dstWrk =(char*)(dst);
size_t srcLeftInBytes = srcSizeInBytes;
size_t dstLeftInBytes = dstSizeInBytes - sizeof(char);
status = U_ZERO_ERROR;
ucnv_fromUChars(conv, dstWrk, dstLeftInBytes, (UChar*)srcWrk, -1, &status);
U16Char_t* reverseConvertedVal = convCharStrToU16Str(dstWrk,Encoding);
if(strcmp((char*)reverseConvertedVal,(char*)src)!=0)
{
EncConv::releaseU16Str(reverseConvertedVal);
//Fix for #3211945
dstWrk = NULL;
ucnv_close(conv);
delete[] dst;
return std::string("");
}
EncConv::releaseU16Str(reverseConvertedVal);
#ifdef CHARCONV_DEBUG
std::cout << "\t\t" "aft iconv: status = " << status << std::endl;
#endif
if (status != U_ZERO_ERROR )
{
// throw std::runtime_error("Unable to convert to string");
*dstWrk = 0;
}
std::string dst2(dst);
//Fix for #3211945
dstWrk = NULL;
delete[] dst;
//const int err = iconv_close(cd);
ucnv_close(conv);
//if (err == -1)
// throw std::runtime_error("Unable to deallocate iconv_t object");
return dst2;
}
/**
 * Converts a NUL-terminated byte string in the given encoding to a newly
 * allocated, zero-terminated UTF-16 string using an ICU converter.
 *
 * @param src       NUL-terminated input bytes. NULL is treated as "".
 * @param Encoding  Encoding name understood by getPlatformEncoding(); NULL is
 *                  treated like an empty (unspecified) name. If no converter
 *                  can be opened for it, "ISO-8859-1" is tried instead.
 * @return          A buffer allocated with new[] (never NULL). It holds the
 *                  converted text, or an empty string when no converter could
 *                  be opened or ICU reported a non-zero status. The caller
 *                  owns the buffer and must free it, e.g. with releaseU16Str().
 */
U16Char_t* convCharStrToU16Str(const char* src, const char* Encoding)
{
    const char* const input = (src != NULL) ? src : "";
    char const* const fromcode = getPlatformEncoding((Encoding != NULL) ? Encoding : "");
#ifdef ENCCONV_DEBUG
    std::cout << "\t" "convString" << std::endl;
    std::cout << "\t\t" "fromcode = " << fromcode << std::endl;
#endif

    // Output buffer: one UTF-16 unit per input byte plus a terminator, with a
    // floor of 256 units. It starts out as an empty string so that every
    // early exit returns a valid blank result.
    const size_t srcSizeInBytes = std::strlen(input);
    const size_t dstSizeInUnits = MAX(256, (srcSizeInBytes + 1));
    U16Char_t* dst = new U16Char_t [dstSizeInUnits];
    dst[0] = 0;

    // Initializing ICU converter
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = ucnv_open(fromcode, &status);
#ifdef CHARCONV_DEBUG
    std::cout << "\t\t" "aft ucnv_open: status = " << status << std::endl;
#endif
    if (conv == NULL)
    {
        // Try the default encoding "ISO-8859-1". The status must be cleared
        // first: ICU calls do nothing when entered with a failure status, so
        // without the reset this fallback could never succeed.
        status = U_ZERO_ERROR;
        conv = ucnv_open("ISO-8859-1", &status);
    }
    // If there is still no converter, simply return the blank string.
    if (conv == NULL)
        return dst;

    // ICU expects the destination capacity in UChars, not bytes; one unit is
    // kept back for the terminator.
    status = U_ZERO_ERROR;
    ucnv_toUChars(conv,
                  reinterpret_cast<UChar*>(dst),
                  static_cast<int32_t>(dstSizeInUnits - 1),
                  input,
                  static_cast<int32_t>(srcSizeInBytes),
                  &status);
#ifdef CHARCONV_DEBUG
    std::cout << "\t\t" "aft iconv: status = " << status << std::endl;
#endif
    ucnv_close(conv);

    // Any non-zero status (errors, but also warnings such as "output not
    // terminated") invalidates the result: hand back a blank string instead.
    if (status != U_ZERO_ERROR)
        dst[0] = 0;

    return dst;
}
/**
 * Frees a UTF-16 buffer with delete[], as used for the buffers returned by
 * the conversion functions in this file.
 *
 * @param buf  Buffer to free; NULL is accepted and ignored.
 */
void releaseU16Str(const U16Char_t* buf)
{
    // delete[] on a null pointer is a no-op, so no explicit check is needed.
    // `buf` is passed by value, so clearing it here would not be visible to
    // the caller; the caller must not use its pointer after this call.
    delete[] buf;
}
}// namespace
// -----------------------------------------------------------------------------