//////////////////////////////////////////////////////////////////////////////// | |
// | |
// Licensed to the Apache Software Foundation (ASF) under one or more | |
// contributor license agreements. See the NOTICE file distributed with | |
// this work for additional information regarding copyright ownership. | |
// The ASF licenses this file to You under the Apache License, Version 2.0 | |
// (the "License"); you may not use this file except in compliance with | |
// the License. You may obtain a copy of the License at | |
// | |
// http://www.apache.org/licenses/LICENSE-2.0 | |
// | |
// Unless required by applicable law or agreed to in writing, software | |
// distributed under the License is distributed on an "AS IS" BASIS, | |
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
// See the License for the specific language governing permissions and | |
// limitations under the License. | |
// | |
//////////////////////////////////////////////////////////////////////////////// | |
/* | |
CharConv.cpp | |
Author: Sudhakar Pandey <sudhakar@adobe.com> | |
*/ | |
#include <iostream> | |
#include <cstring> | |
#include <stdexcept> | |
#include "EncConv.h" | |
#include "unicode/ucnv.h" //Adding this as an alternative to native iconv library | |
#include "unicode/ustring.h" //Adding this as an alternative to native iconv library | |
// MAX(a,b): function-like macro that expands to the larger of its two
// arguments. It is only defined here when no MAX macro exists already.
// The selected argument appears twice in the expansion, so it is evaluated
// twice; do not pass expressions with side effects.
#ifndef MAX | |
#define MAX(a,b) (((a)>(b))?(a):(b)) | |
#endif /* MAX */ | |
// ============================================================================= | |
namespace EncConv | |
{ | |
// ----------------------------------------------------------------------------- | |
/**
 * Copies a UTF-16 string, replacing typographic quotation marks with their
 * plain ASCII counterparts.
 *
 *   U+2018 / U+2019 (left/right single quotation mark) -> '\''
 *   U+201C / U+201D (left/right double quotation mark) -> '"'
 *
 * Every other code unit is copied unchanged.
 *
 * @param src  Source string; its length is the unit count reported by
 *             GetNumOfUnits(src).
 * @return     A buffer allocated with new[] holding the converted units
 *             followed by a 0 terminator. The caller owns the buffer and must
 *             free it with a delete[]-based release such as releaseU16Str().
 */
U16Char_t* convSpecialCharsInU16Str(const U16Char_t* src)
{
    const LM_UInt32 unitCount = GetNumOfUnits(src);

    // One extra slot for the terminating 0.
    U16Char_t* const result = new U16Char_t [unitCount + 1];

    for (LM_UInt32 pos = 0; pos != unitCount; ++pos)
    {
        const U16Char_t unit = src[pos];

        if (unit == 0x2018 || unit == 0x2019)
        {
            // Single quotation marks collapse to an apostrophe.
            result[pos] = '\'';
        }
        else if (unit == 0x201C || unit == 0x201D)
        {
            // Double quotation marks collapse to a straight double quote.
            result[pos] = '"';
        }
        else
        {
            result[pos] = unit;
        }
    }

    result[unitCount] = 0;
    return result;
}
/**
 * Translates an encoding name as used by callers into the converter name
 * that is handed to ucnv_open().
 *
 * @param enc  Encoding name, or NULL / "" when no encoding was specified.
 * @return     - "ISO-8859-1" when enc is NULL or empty (the default);
 *             - the mapped name when enc exactly matches (case-sensitive) one
 *               of the known aliases below;
 *             - enc itself, unchanged, for any other name.
 *             Mapped and default results are string literals with static
 *             lifetime; a passed-through result shares the lifetime of enc.
 */
const char * getPlatformEncoding(const char* enc)
{
    // A null pointer is treated like an unspecified encoding instead of being
    // dereferenced.
    if (enc == NULL || *enc == '\0')
        return "ISO-8859-1";

    struct EncodingAlias
    {
        const char* name;          // name accepted from callers
        const char* platformName;  // name returned for the converter
    };

    // Identity entries (KOI8-R, KOI8-U, UTF-8) are kept on purpose: for these
    // names the result is a static literal rather than the caller's pointer.
    static const EncodingAlias aliases[] =
    {
        { "ISO8859-1",        "ISO-8859-1"  },
        { "ISO8859-2",        "ISO-8859-2"  },
        { "ISO8859-3",        "ISO-8859-3"  },
        { "ISO8859-4",        "ISO-8859-4"  },
        { "ISO8859-5",        "ISO-8859-5"  },
        { "ISO8859-6",        "ISO-8859-6"  },
        { "ISO8859-7",        "ISO-8859-7"  },
        { "ISO8859-8",        "ISO-8859-8"  },
        { "ISO8859-9",        "ISO-8859-9"  },
        { "ISO8859-10",       "ISO-8859-10" },
        { "KOI8-R",           "KOI8-R"      },
        { "KOI8-U",           "KOI8-U"      },
        { "microsoft-cp1251", "cp1251"      },
        { "ISO8859-13",       "ISO-8859-13" },
        { "ISO8859-14",       "ISO-8859-14" },
        { "ISO8859-15",       "ISO-8859-15" },
        { "ISCII-DEVANAGARI", "ibm-1137"    },
        { "TIS620-2533",      "TIS-620"     },
        { "UTF-8",            "UTF-8"       }
    };
    const size_t aliasCount = sizeof(aliases) / sizeof(aliases[0]);

    for (size_t i = 0; i < aliasCount; ++i)
    {
        if (std::strcmp(enc, aliases[i].name) == 0)
            return aliases[i].platformName;
    }

    // Unknown names are passed through unchanged.
    return enc;
}
const std::string convU16StrToCharStr(const U16Char_t* src, const char* Encoding) | |
{ | |
//static char const* const tocode = CHARCONV_ICONV_UTF16; | |
char const* const tocode = getPlatformEncoding(Encoding); | |
UErrorCode status = U_ZERO_ERROR; | |
#ifdef ENCCONV_DEBUG | |
std::cout << "\t" "convString" << std::endl; | |
std::cout << "\t\t" "tocode = " << tocode << std::endl; | |
//std::cout << "\t\t" "fromcode = " << fromcode << std::endl; | |
#endif | |
//iconv_t cd = iconv_open(tocode, fromcode); | |
// Initializing ICU converter | |
UConverter *conv = ucnv_open(tocode, &status); | |
#ifdef CHARCONV_DEBUG | |
std::cout << "\t\t" "aft ucnv_open: status = " << status << std::endl; | |
#endif | |
if (conv == NULL) | |
{ // try default encoding "ISO-8859-1" | |
//throw std::runtime_error("Unable to create Unicode converter object"); | |
status = U_ZERO_ERROR; | |
conv = ucnv_open("ISO-8859-1", &status); | |
} | |
//still if conv is null simply return blank string | |
if (conv == NULL) | |
{ | |
return std::string(""); | |
} | |
U16Char_t const* srcWrk = src; | |
const size_t srcSizeInUnits = GetNumOfUnits(src); | |
const size_t srcSizeInBytes = srcSizeInUnits * sizeof(U16Char_t); | |
const size_t dstSizeInBytes = MAX(256, (srcSizeInUnits + 1)) * 4; // How much byte buffer is needed? (UTF16 --> MBCS) | |
char* dst = new char [dstSizeInBytes]; | |
if(dst==NULL) return std::string(""); | |
char* dstWrk =(char*)(dst); | |
size_t srcLeftInBytes = srcSizeInBytes; | |
size_t dstLeftInBytes = dstSizeInBytes - sizeof(char); | |
status = U_ZERO_ERROR; | |
ucnv_fromUChars(conv, dstWrk, dstLeftInBytes, (UChar*)srcWrk, -1, &status); | |
U16Char_t* reverseConvertedVal = convCharStrToU16Str(dstWrk,Encoding); | |
if(strcmp((char*)reverseConvertedVal,(char*)src)!=0) | |
{ | |
EncConv::releaseU16Str(reverseConvertedVal); | |
delete[] dst; | |
return std::string(""); | |
} | |
EncConv::releaseU16Str(reverseConvertedVal); | |
#ifdef CHARCONV_DEBUG | |
std::cout << "\t\t" "aft iconv: status = " << status << std::endl; | |
#endif | |
if (status != U_ZERO_ERROR ) | |
{ | |
// throw std::runtime_error("Unable to convert to string"); | |
*dstWrk = 0; | |
} | |
std::string dst2(dst); | |
delete[] dst; | |
//const int err = iconv_close(cd); | |
ucnv_close(conv); | |
//if (err == -1) | |
// throw std::runtime_error("Unable to deallocate iconv_t object"); | |
return dst2; | |
} | |
/**
 * Converts a 0-terminated byte string in the given encoding into a newly
 * allocated, 0-terminated UTF-16 string using an ICU converter.
 *
 * The encoding name is first mapped through getPlatformEncoding(). If ICU
 * cannot open a converter for it, "ISO-8859-1" is tried instead.
 *
 * @param src       0-terminated input bytes (NULL is treated like "").
 * @param Encoding  Source encoding name (NULL is treated like "").
 * @return          A buffer allocated with new[] that the caller must free
 *                  with releaseU16Str(). It holds the converted text, or an
 *                  empty string when no converter could be opened or the
 *                  conversion failed. The function never returns NULL.
 */
U16Char_t* convCharStrToU16Str(const char* src, const char* Encoding)
{
    const char* const safeSrc = (src != NULL) ? src : "";
    const char* const safeEncoding = (Encoding != NULL) ? Encoding : "";

    // Allocate the destination before opening the converter, so a failed
    // allocation cannot leak a converter. The size is one UTF-16 unit per
    // source byte plus a terminator, with a floor of 256 units.
    const size_t srcSizeInBytes = std::strlen(safeSrc);
    const size_t dstSizeInUnits = MAX(256, (srcSizeInBytes + 1));
    U16Char_t* dst = new U16Char_t [dstSizeInUnits];
    dst[0] = 0;
    // The last unit is never offered to the converter (see capacity below),
    // so the buffer is always terminated.
    dst[dstSizeInUnits - 1] = 0;

    char const* const fromcode = getPlatformEncoding(safeEncoding);
    UErrorCode status = U_ZERO_ERROR;
#ifdef ENCCONV_DEBUG
    std::cout << "\t" "convString" << std::endl;
    std::cout << "\t\t" "fromcode = " << fromcode << std::endl;
#endif
    // Initializing ICU converter
    UConverter* conv = ucnv_open(fromcode, &status);
#ifdef CHARCONV_DEBUG
    std::cout << "\t\t" "aft ucnv_open: status = " << status << std::endl;
#endif
    if (conv == NULL)
    {
        // Try the default encoding. The status must be cleared first: ICU
        // functions return immediately when entered with a failure status.
        status = U_ZERO_ERROR;
        conv = ucnv_open("ISO-8859-1", &status);
    }
    // If there is still no converter, simply return the blank string.
    if (conv == NULL)
        return dst;

    status = U_ZERO_ERROR;
    // destCapacity is counted in UChars, not bytes.
    ucnv_toUChars(conv,
                  reinterpret_cast<UChar*>(dst),
                  static_cast<int32_t>(dstSizeInUnits - 1),
                  safeSrc,
                  static_cast<int32_t>(srcSizeInBytes),
                  &status);
#ifdef CHARCONV_DEBUG
    std::cout << "\t\t" "aft iconv: status = " << status << std::endl;
#endif
    if (U_FAILURE(status))
    {
        // Conversion failed: hand back an empty string rather than partial
        // output.
        dst[0] = 0;
    }
    ucnv_close(conv);
    return dst;
}
/**
 * Frees a UTF-16 buffer that was allocated with new[].
 *
 * @param buf  Buffer to free; NULL is accepted and ignored.
 */
void releaseU16Str(const U16Char_t* buf)
{
    // delete[] on a null pointer does nothing, so no explicit check is
    // needed. The parameter is a by-value copy, so the caller's pointer is
    // not modified by this call.
    delete[] buf;
}
}// namespace | |
// ----------------------------------------------------------------------------- | |