blob: 5070d35a7f70496669328c951102f3160b131483 [file]
/* $Id$
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version
* 2.0 (the "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* etch_encoding.c -- character encoding
*/
/* we include an APR header only to get the required platform-specific headers (windows.h or whatever) */
#include <apr_thread_cond.h>
#include "etch_encoding.h"
#include "etch_global.h"
#include "etchlog.h"
#if IS_WINDOWS_ETCH
#include "winnls.h" /* for WideCharToMultiByte etc */
#endif
/* log category tag handed to etchlog() by etch_encoding_errcheck() */
char *ETCH_ENCODING_LOGID = "ENCO";

/* log message format used by etch_encoding_errcheck(); %d receives the error indicator */
char *ETCH_ENCODING_EMASK = "encoding conversion error %d\n";

/* forward declaration, the definition appears further down in this file */
void etch_encoding_errcheck(const int result);
/*
 * etch_unicode_to_8bit()
 * converts a null-terminated wide character string to a null-terminated
 * 8-bit character string in either the utf-8 or the system ansi code page.
 * currently windows-specific: on other platforms no conversion is done and
 * -1 is returned. todo need portable version.
 * @param bit8_string_out out parameter receiving the converted string.
 * on success caller assumes ownership of this memory, which must be
 * etch_free()'d. on failure it is set to NULL.
 * @param unicode_string_in a null-terminated wide character string,
 * caller retains ownership.
 * @param which which codepage to use, utf-8 (1) or ansi (any other value)
 * @return 0 on success, or -1 if an argument is NULL, the out buffer could
 * not be allocated, the conversion failed, or the platform is unsupported.
 */
int etch_unicode_to_8bit(char** bit8_string_out, wchar_t* unicode_string_in, const int which)
{
    int result = -1;
#if IS_WINDOWS_ETCH
    int codepage = 0, bytecount_needed = 0, bytecount_out = 0;
    char* bit8_buf = NULL;
#endif

    if (NULL == bit8_string_out) return -1;
    *bit8_string_out = NULL;  /* never leave the out parameter indeterminate */
    if (NULL == unicode_string_in) return -1;

#if IS_WINDOWS_ETCH /* todo: portable implementation */

    codepage = which == 1? CP_UTF8: CP_ACP;

    /* call system first to calculate the out buffer size required for the
     * given unicode input and output code page. out buffer length zero
     * instructs system to do so. input length -1 declares the input to be
     * null-terminated, so the terminator is counted and converted as well,
     * which lets an empty input string convert to an empty output string. */
    bytecount_needed = WideCharToMultiByte(codepage, 0,
        unicode_string_in, -1, NULL, 0, NULL, NULL);

    if (bytecount_needed <= 0)
    {   etch_encoding_errcheck(-1);
        return -1;
    }

    bit8_buf = etch_malloc(bytecount_needed, ETCHTYPEB_BYTES);
    if (NULL == bit8_buf) return -1;
    memset(bit8_buf, 0, (size_t) bytecount_needed);

    bytecount_out = WideCharToMultiByte(codepage, 0, /* do conversion */
        unicode_string_in, -1, bit8_buf, bytecount_needed, NULL, NULL);

    if (bytecount_out > 0) /* transfer ownership of out buffer to caller */
    {   *bit8_string_out = bit8_buf;
        result = 0;
    }
    else
    {   /* log first, so that freeing cannot disturb the system error code */
        etch_encoding_errcheck(-1);
        etch_free(bit8_buf);
        bit8_buf = NULL;
    }

#else

    (void) which;  /* unused until a portable implementation exists */

#endif /* IS_WINDOWS_ETCH */

    return result;
}
/*
 * etch_unicode_to_utf8()
 * convenience wrapper: calls etch_unicode_to_8bit() requesting utf-8 output.
 * @param utf8_string_out out parameter receiving the utf-8 encoded string,
 * caller assumes ownership of this memory.
 * @param unicode_string_in a wide character string owned by caller.
 * @return 0 or -1, exactly as returned by etch_unicode_to_8bit()
 */
int etch_unicode_to_utf8(char** utf8_string_out, wchar_t* unicode_string_in)
{
    const int want_utf8 = 1;  /* code page selector value meaning utf-8 */
    int result;

    result = etch_unicode_to_8bit(utf8_string_out, unicode_string_in, want_utf8);

    return result;
}
/*
 * etch_unicode_to_ansi()
 * convenience wrapper: calls etch_unicode_to_8bit() requesting output in the
 * ansi code page.
 * @param ansi_string_out out parameter receiving the ansi encoded string,
 * caller assumes ownership of this memory, which must be etch_free()'d
 * @param unicode_string_in a wide character string owned by caller.
 * @return 0 or -1, exactly as returned by etch_unicode_to_8bit()
 */
int etch_unicode_to_ansi(char** ansi_string_out, wchar_t* unicode_string_in)
{
    const int want_ansi = 0;  /* code page selector value meaning ansi */
    int result;

    result = etch_unicode_to_8bit(ansi_string_out, unicode_string_in, want_ansi);

    return result;
}
/*
 * etch_8bit_to_unicode()
 * converts a null-terminated 8-bit character string, encoded in either the
 * utf-8 or the system ansi code page, to a null-terminated wide character
 * string. currently windows-specific: on other platforms no conversion is
 * done and -1 is returned. todo need portable version.
 * @param unicode_string_out out parameter receiving the converted string.
 * on success caller assumes ownership of this memory, which must be
 * etch_free()'d. on failure it is set to NULL.
 * @param bit8_string_in a null-terminated character string owned by caller.
 * @param which which codepage the input is in, utf-8 (1) or ansi (any other value)
 * @return 0 on success, or -1 if an argument is NULL, the out buffer could
 * not be allocated, the conversion failed, or the platform is unsupported.
 */
int etch_8bit_to_unicode(wchar_t** unicode_string_out, char* bit8_string_in, const int which)
{
    int result = -1;
#if IS_WINDOWS_ETCH
    int codepage = 0, charcount_needed = 0, charcount_out = 0;
    size_t unicode_bufsize = 0;
    wchar_t* ubuf = NULL;
#endif

    if (NULL == unicode_string_out) return -1;
    *unicode_string_out = NULL;  /* never leave the out parameter indeterminate */
    if (NULL == bit8_string_in) return -1;

#if IS_WINDOWS_ETCH /* todo: portable implementation */

    codepage = which == 1? CP_UTF8: CP_ACP;

    /* call system first to calculate the number of wide characters needed.
     * out buffer length zero instructs system to do so. input length -1
     * declares the input to be null-terminated, so the terminator is
     * counted and converted as well. */
    charcount_needed = MultiByteToWideChar(codepage, 0,
        bit8_string_in, -1, NULL, 0);

    if (charcount_needed <= 0)
    {   etch_encoding_errcheck(-1);
        return -1;
    }

    unicode_bufsize = (size_t) charcount_needed * sizeof(wchar_t);
    ubuf = etch_malloc(unicode_bufsize, ETCHTYPEB_BYTES);
    if (NULL == ubuf) return -1;
    memset(ubuf, 0, unicode_bufsize);

    /* the out buffer size is passed as a count of wide characters, not bytes */
    charcount_out = MultiByteToWideChar(codepage, 0, /* do conversion */
        bit8_string_in, -1, ubuf, charcount_needed);

    if (charcount_out > 0) /* transfer ownership of out buffer to caller */
    {   *unicode_string_out = ubuf;
        result = 0;
    }
    else
    {   /* log first, so that freeing cannot disturb the system error code */
        etch_encoding_errcheck(-1);
        etch_free(ubuf);
        ubuf = NULL;
    }

#else

    (void) which;  /* unused until a portable implementation exists */

#endif /* IS_WINDOWS_ETCH */

    return result;
}
/*
 * etch_utf8_to_unicode()
 * convenience wrapper: calls etch_8bit_to_unicode() declaring the input to
 * be utf-8 encoded.
 * @param unicode_string_out out parameter receiving the wide character string,
 * caller assumes ownership of this memory, which must be etch_free()'d
 * @param utf8_string_in a utf-8 character string owned by caller.
 * @return 0 or -1, exactly as returned by etch_8bit_to_unicode()
 */
int etch_utf8_to_unicode(wchar_t** unicode_string_out, char* utf8_string_in)
{
    const int input_is_utf8 = 1;  /* code page selector value meaning utf-8 */
    int result;

    result = etch_8bit_to_unicode(unicode_string_out, utf8_string_in, input_is_utf8);

    return result;
}
/*
 * etch_ansi_to_unicode()
 * convenience wrapper: calls etch_8bit_to_unicode() declaring the input to
 * be in the ansi code page.
 * @param unicode_string_out out parameter receiving the wide character string,
 * caller assumes ownership of this memory, which must be etch_free()'d
 * @param ascii_string_in an ansi character string owned by caller.
 * @return 0 or -1, exactly as returned by etch_8bit_to_unicode()
 */
int etch_ansi_to_unicode(wchar_t** unicode_string_out, char* ascii_string_in)
{
    const int input_is_ansi = 0;  /* code page selector value meaning ansi */
    int result;

    result = etch_8bit_to_unicode(unicode_string_out, ascii_string_in, input_is_ansi);

    return result;
}
/*
 * etch_encoding_errcheck()
 * private method to check result of an encoding conversion and log error.
 * does nothing when result is zero. otherwise, on windows builds, the last
 * system error code is fetched and logged, with three well known codes
 * abbreviated as 1 (insufficient buffer), 2 (invalid flags) and
 * 3 (invalid parameter); any other code is logged as is.
 * @param result the conversion result, zero meaning success.
 */
void etch_encoding_errcheck(const int result)
{
    int lasterror = 0, logged_code = 0;

    if (0 != result)
    {
#if IS_WINDOWS_ETCH
        lasterror = GetLastError();

        if (ERROR_INSUFFICIENT_BUFFER == lasterror)
            logged_code = 1;
        else if (ERROR_INVALID_FLAGS == lasterror)
            logged_code = 2;
        else if (ERROR_INVALID_PARAMETER == lasterror)
            logged_code = 3;
        else
            logged_code = lasterror;  /* unrecognized, log the raw system code */

        etchlog(ETCH_ENCODING_LOGID, ETCHLOG_ERROR, ETCH_ENCODING_EMASK, logged_code);
#endif
    }
}
/*
 * etch_get_unicode_bytecount()
 * @param widestring a null-terminated unicode string, may be NULL.
 * @return number of bytes in the string, including the null terminator,
 * or zero if the string is NULL or empty.
 */
size_t etch_get_unicode_bytecount (wchar_t* widestring)
{
    int length = 0;

    if (NULL != widestring)
        length = (int) wcslen(widestring);

    if (length == 0)
        return 0;

    /* the characters plus one terminator, each sizeof(wchar_t) bytes wide */
    return ((size_t) length + 1) * sizeof(wchar_t);
}