binding-c/runtime/c/src/common/etch_encoding.c - etch - Git at Google

 /* $Id$
  *
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to you under the Apache License, Version
  * 2.0 (the "License"); you may not use this file except in compliance
  * with the License. You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /**
  * etch_encoding.c -- character encoding
  */

 /* we include an APR header only to get required platform specific headers (windows.h or whatever) */
 #include <apr_thread_cond.h>
 #include "etch_encoding.h"
 #include "etch_global.h"
 #include "etchlog.h"

 #if IS_WINDOWS_ETCH
 #include "winnls.h" /* for WideCharToMultiByte etc */
 #endif
 /* passed as the first argument to etchlog() by etch_encoding_errcheck() */
 char* ETCH_ENCODING_LOGID = "ENCO";
 /* etchlog() format string for a failed conversion; the %d receives the
  * error code selected in etch_encoding_errcheck() */
 char* ETCH_ENCODING_EMASK = "encoding conversion error %d\n";
 /* forward declaration, the definition appears later in this file */
 void etch_encoding_errcheck(const int);


 /*
  * etch_unicode_to_8bit()
  * converts a wide character string to an 8-bit string in the requested
  * code page. currently windows-specific: when IS_WINDOWS_ETCH is not set
  * no conversion is attempted and -1 is returned. todo need portable version.
  * @param bit8_string_out receives the converted, null terminated string.
  * on success caller assumes ownership of this memory, which is allocated
  * with etch_malloc(); on failure NULL is stored.
  * @param unicode_string_in a null terminated wide character string,
  * caller retains ownership.
  * @param which which codepage to use, utf-8 (1) or ansi (any other value)
  * @return 0 success, or -1 if an argument is null, the buffer could not
  * be allocated, or the conversion failed.
  * note: WideCharToMultiByte is documented to fail for a zero length input,
  * so an empty input string is presumably still reported as -1 -- confirm.
  */
 int etch_unicode_to_8bit(char** bit8_string_out, wchar_t* unicode_string_in, const int which)
 {
     int charcount_in = 0, bit8_charcount = 0, bytecount_out = 0, codepage = 0, result = -1;
     char* bit8_buf = NULL;

     if (NULL == bit8_string_out) return -1;
     *bit8_string_out = NULL;   /* caller never sees a stale pointer on failure */
     if (NULL == unicode_string_in) return -1;   /* must be checked before wcslen() */

     #if IS_WINDOWS_ETCH  /* todo: portable implementation */

     charcount_in = (int) wcslen(unicode_string_in);
     codepage = which == 1? CP_UTF8: CP_ACP;

     /* call system first to calculate the buffer size required for given
      * unicode input and output code page. out buffer length zero
      * instructs system to do so. */
     bit8_charcount = WideCharToMultiByte(codepage, 0,
         unicode_string_in, charcount_in, NULL, 0, NULL, NULL);

     if (bit8_charcount > 0)
         bit8_buf = etch_malloc(++bit8_charcount, ETCHTYPEB_BYTES); /* add null term */

     if (NULL != bit8_buf)   /* sizing and allocation both succeeded */
     {
         memset(bit8_buf, 0, bit8_charcount);

         bytecount_out = WideCharToMultiByte(codepage, 0, /* do conversion */
             unicode_string_in, charcount_in, bit8_buf, bit8_charcount, NULL, NULL);

         if (bytecount_out > 0) result = 0;
     }

     /* log before freeing, in case etch_free() disturbs the system
      * last-error value read by etch_encoding_errcheck() */
     etch_encoding_errcheck(result);

     if (result == 0)    /* transfer ownership of out buffer to caller */
         *bit8_string_out = bit8_buf;
     else
     if (NULL != bit8_buf)
         etch_free(bit8_buf);

     #endif /* IS_WINDOWS_ETCH */

     return result;
 }


 /*
  * etch_unicode_to_utf8()
  * convenience wrapper which asks etch_unicode_to_8bit() for utf-8 output.
  * @param utf8_string_out receives the UTF-8 encoded string,
  * caller assumes ownership of this memory.
  * @param unicode_string_in a character string owned by caller.
  * @return 0 or -1
  */
 int etch_unicode_to_utf8(char** utf8_string_out, wchar_t* unicode_string_in)
 {
     const int want_utf8 = 1;  /* codepage selector understood by etch_unicode_to_8bit() */
     const int rc = etch_unicode_to_8bit(utf8_string_out, unicode_string_in, want_utf8);
     return rc;
 }


 /*
  * etch_unicode_to_ansi()
  * convenience wrapper which asks etch_unicode_to_8bit() for ansi output.
  * @param ansi_string_out receives the ansi encoded string,
  * caller assumes ownership of this memory, which must be etch_free()'d
  * @param unicode_string_in a character string owned by caller.
  * @return 0 or -1
  */
 int etch_unicode_to_ansi(char** ansi_string_out, wchar_t* unicode_string_in)
 {
     const int want_ansi = 0;  /* codepage selector understood by etch_unicode_to_8bit() */
     const int rc = etch_unicode_to_8bit(ansi_string_out, unicode_string_in, want_ansi);
     return rc;
 }


 /*
  * etch_8bit_to_unicode()
  * converts an 8-bit string in the given code page to a wide character
  * string. currently windows-specific: when IS_WINDOWS_ETCH is not set no
  * conversion is attempted and -1 is returned. todo need portable version.
  * @param unicode_string_out receives the converted, null terminated string.
  * on success caller assumes ownership of this memory, which must be
  * etch_free()'d; on failure NULL is stored.
  * @param bit8_string_in a null terminated character string owned by caller.
  * @param which which codepage the input uses, utf-8 (1) or ansi (any other value)
  * @return 0 success, or -1 if an argument is null, the buffer could not
  * be allocated, or the conversion failed.
  */
 int etch_8bit_to_unicode(wchar_t** unicode_string_out, char* bit8_string_in, const int which)
 {
     int charcount_in = 0, unicode_charcount = 0, unicode_bufsize = 0;
     int charcount_out = 0, codepage = 0, result = -1;
     wchar_t* ubuf = NULL;

     if (NULL == unicode_string_out) return -1;
     *unicode_string_out = NULL;   /* caller never sees a stale pointer on failure */
     if (NULL == bit8_string_in) return -1;   /* must be checked before strlen() */

     #if IS_WINDOWS_ETCH  /* todo: portable implementation */

     charcount_in = (int) strlen(bit8_string_in);
     codepage = which == 1? CP_UTF8: CP_ACP;

     /* as before, allow one wide character per input byte plus null term */
     unicode_charcount = charcount_in + 1;
     unicode_bufsize   = unicode_charcount * (int) sizeof(wchar_t);
     ubuf = etch_malloc(unicode_bufsize, ETCHTYPEB_BYTES);

     if (NULL != ubuf)
     {
         memset(ubuf, 0, unicode_bufsize);

         /* input length includes the null term so that it is converted too.
          * the out buffer size is passed in wide characters, not bytes, so
          * the system can never be told the buffer is larger than it is. */
         charcount_out = MultiByteToWideChar(codepage, 0, /* do conversion */
             bit8_string_in, charcount_in + 1, ubuf, unicode_charcount);

         if (charcount_out > 0) result = 0;
     }

     /* log before freeing, in case etch_free() disturbs the system
      * last-error value read by etch_encoding_errcheck() */
     etch_encoding_errcheck(result);

     if (result == 0)    /* transfer ownership of out buffer to caller */
         *unicode_string_out = ubuf;
     else
     if (NULL != ubuf)
         etch_free(ubuf);

     #endif /* IS_WINDOWS_ETCH */

     return result;
 }


 /*
  * etch_utf8_to_unicode()
  * convenience wrapper which tells etch_8bit_to_unicode() the input is utf-8.
  * @param unicode_string_out receives the wide character string,
  * caller assumes ownership of this memory, which must be etch_free()'d
  * @param utf8_string_in a character string owned by caller.
  * @return 0 or -1
  */
 int etch_utf8_to_unicode(wchar_t** unicode_string_out, char* utf8_string_in)
 {
     const int from_utf8 = 1;  /* codepage selector understood by etch_8bit_to_unicode() */
     const int rc = etch_8bit_to_unicode(unicode_string_out, utf8_string_in, from_utf8);
     return rc;
 }


 /*
  * etch_ansi_to_unicode()
  * convenience wrapper which tells etch_8bit_to_unicode() the input is ansi.
  * @param unicode_string_out receives the wide character string,
  * caller assumes ownership of this memory, which must be etch_free()'d
  * @param ascii_string_in a character string owned by caller.
  * @return 0 or -1
  */
 int etch_ansi_to_unicode(wchar_t** unicode_string_out, char* ascii_string_in)
 {
     const int from_ansi = 0;  /* codepage selector understood by etch_8bit_to_unicode() */
     const int rc = etch_8bit_to_unicode(unicode_string_out, ascii_string_in, from_ansi);
     return rc;
 }


 /*
  * etch_encoding_errcheck()
  * private method which logs the system error after a failed conversion.
  * does nothing when result is zero, or when IS_WINDOWS_ETCH is not set.
  * @param result the 0 or -1 outcome of an encoding conversion.
  */
 void etch_encoding_errcheck(const int result)
 {
     int lasterror, logcode;
     if (result == 0) return;

     #if IS_WINDOWS_ETCH

     lasterror = GetLastError();

     /* three known errors are logged as 1, 2, 3; anything else is
      * logged as the raw system error code */
     if      (lasterror == ERROR_INSUFFICIENT_BUFFER) logcode = 1;
     else if (lasterror == ERROR_INVALID_FLAGS)       logcode = 2;
     else if (lasterror == ERROR_INVALID_PARAMETER)   logcode = 3;
     else logcode = lasterror;

     etchlog(ETCH_ENCODING_LOGID,ETCHLOG_ERROR,ETCH_ENCODING_EMASK, logcode);

    #endif
 }


 /*
  * etch_get_unicode_bytecount()
  * @param widestring a null terminated unicode string, may be NULL.
  * @return number of bytes in the string, including the null terminator,
  * or zero if the string is NULL or empty.
  */
 size_t etch_get_unicode_bytecount (wchar_t* widestring)
 {
     int nchars;

     if (NULL == widestring) return 0;

     nchars = (int) wcslen(widestring);
     if (nchars == 0) return 0;

     /* the characters plus the terminator, sizeof(wchar_t) bytes each */
     return ((size_t) nchars + 1) * sizeof(wchar_t);
 }
	/* $Id$
	*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to you under the Apache License, Version
	* 2.0 (the "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/**
	* etch_encoding.c -- character encoding
	*/

	/* we include an APR header only to get required platform specific headers (windows.h or whatever) */
	#include <apr_thread_cond.h>
	#include "etch_encoding.h"
	#include "etch_global.h"
	#include "etchlog.h"

	#if IS_WINDOWS_ETCH
	#include "winnls.h" /* for WideCharToMultiByte etc */
	#endif
	/* passed as the first argument to etchlog() by etch_encoding_errcheck() */
	char* ETCH_ENCODING_LOGID = "ENCO";
	/* etchlog() format string for a failed conversion; the %d receives the
	* error code selected in etch_encoding_errcheck() */
	char* ETCH_ENCODING_EMASK = "encoding conversion error %d\n";
	/* forward declaration, the definition appears later in this file */
	void etch_encoding_errcheck(const int);


	/*
	* etch_unicode_to_8bit()
	* converts a wide character string to an 8-bit string in the requested
	* code page. currently windows-specific: when IS_WINDOWS_ETCH is not set
	* no conversion is attempted and -1 is returned. todo need portable version.
	* @param bit8_string_out receives the converted, null terminated string.
	* on success caller assumes ownership of this memory, which is allocated
	* with etch_malloc(); on failure NULL is stored.
	* @param unicode_string_in a null terminated wide character string,
	* caller retains ownership.
	* @param which which codepage to use, utf-8 (1) or ansi (any other value)
	* @return 0 success, or -1 if an argument is null, the buffer could not
	* be allocated, or the conversion failed.
	* note: WideCharToMultiByte is documented to fail for a zero length input,
	* so an empty input string is presumably still reported as -1 -- confirm.
	*/
	int etch_unicode_to_8bit(char** bit8_string_out, wchar_t* unicode_string_in, const int which)
	{
	    int charcount_in = 0, bit8_charcount = 0, bytecount_out = 0, codepage = 0, result = -1;
	    char* bit8_buf = NULL;

	    if (NULL == bit8_string_out) return -1;
	    *bit8_string_out = NULL;   /* caller never sees a stale pointer on failure */
	    if (NULL == unicode_string_in) return -1;   /* must be checked before wcslen() */

	    #if IS_WINDOWS_ETCH /* todo: portable implementation */

	    charcount_in = (int) wcslen(unicode_string_in);
	    codepage = which == 1? CP_UTF8: CP_ACP;

	    /* call system first to calculate the buffer size required for given
	    * unicode input and output code page. out buffer length zero
	    * instructs system to do so. */
	    bit8_charcount = WideCharToMultiByte(codepage, 0,
	        unicode_string_in, charcount_in, NULL, 0, NULL, NULL);

	    if (bit8_charcount > 0)
	        bit8_buf = etch_malloc(++bit8_charcount, ETCHTYPEB_BYTES); /* add null term */

	    if (NULL != bit8_buf)   /* sizing and allocation both succeeded */
	    {
	        memset(bit8_buf, 0, bit8_charcount);

	        bytecount_out = WideCharToMultiByte(codepage, 0, /* do conversion */
	            unicode_string_in, charcount_in, bit8_buf, bit8_charcount, NULL, NULL);

	        if (bytecount_out > 0) result = 0;
	    }

	    /* log before freeing, in case etch_free() disturbs the system
	    * last-error value read by etch_encoding_errcheck() */
	    etch_encoding_errcheck(result);

	    if (result == 0) /* transfer ownership of out buffer to caller */
	        *bit8_string_out = bit8_buf;
	    else
	    if (NULL != bit8_buf)
	        etch_free(bit8_buf);

	    #endif /* IS_WINDOWS_ETCH */

	    return result;
	}


	/*
	* etch_unicode_to_utf8()
	* convenience wrapper which asks etch_unicode_to_8bit() for utf-8 output.
	* @param utf8_string_out receives the UTF-8 encoded string,
	* caller assumes ownership of this memory.
	* @param unicode_string_in a character string owned by caller.
	* @return 0 or -1
	*/
	int etch_unicode_to_utf8(char** utf8_string_out, wchar_t* unicode_string_in)
	{
	    const int want_utf8 = 1; /* codepage selector understood by etch_unicode_to_8bit() */
	    const int rc = etch_unicode_to_8bit(utf8_string_out, unicode_string_in, want_utf8);
	    return rc;
	}


	/*
	* etch_unicode_to_ansi()
	* convenience wrapper which asks etch_unicode_to_8bit() for ansi output.
	* @param ansi_string_out receives the ansi encoded string,
	* caller assumes ownership of this memory, which must be etch_free()'d
	* @param unicode_string_in a character string owned by caller.
	* @return 0 or -1
	*/
	int etch_unicode_to_ansi(char** ansi_string_out, wchar_t* unicode_string_in)
	{
	    const int want_ansi = 0; /* codepage selector understood by etch_unicode_to_8bit() */
	    const int rc = etch_unicode_to_8bit(ansi_string_out, unicode_string_in, want_ansi);
	    return rc;
	}


	/*
	* etch_8bit_to_unicode()
	* converts an 8-bit string in the given code page to a wide character
	* string. currently windows-specific: when IS_WINDOWS_ETCH is not set no
	* conversion is attempted and -1 is returned. todo need portable version.
	* @param unicode_string_out receives the converted, null terminated string.
	* on success caller assumes ownership of this memory, which must be
	* etch_free()'d; on failure NULL is stored.
	* @param bit8_string_in a null terminated character string owned by caller.
	* @param which which codepage the input uses, utf-8 (1) or ansi (any other value)
	* @return 0 success, or -1 if an argument is null, the buffer could not
	* be allocated, or the conversion failed.
	*/
	int etch_8bit_to_unicode(wchar_t** unicode_string_out, char* bit8_string_in, const int which)
	{
	    int charcount_in = 0, unicode_charcount = 0, unicode_bufsize = 0;
	    int charcount_out = 0, codepage = 0, result = -1;
	    wchar_t* ubuf = NULL;

	    if (NULL == unicode_string_out) return -1;
	    *unicode_string_out = NULL;   /* caller never sees a stale pointer on failure */
	    if (NULL == bit8_string_in) return -1;   /* must be checked before strlen() */

	    #if IS_WINDOWS_ETCH /* todo: portable implementation */

	    charcount_in = (int) strlen(bit8_string_in);
	    codepage = which == 1? CP_UTF8: CP_ACP;

	    /* as before, allow one wide character per input byte plus null term */
	    unicode_charcount = charcount_in + 1;
	    unicode_bufsize   = unicode_charcount * (int) sizeof(wchar_t);
	    ubuf = etch_malloc(unicode_bufsize, ETCHTYPEB_BYTES);

	    if (NULL != ubuf)
	    {
	        memset(ubuf, 0, unicode_bufsize);

	        /* input length includes the null term so that it is converted too.
	        * the out buffer size is passed in wide characters, not bytes, so
	        * the system can never be told the buffer is larger than it is. */
	        charcount_out = MultiByteToWideChar(codepage, 0, /* do conversion */
	            bit8_string_in, charcount_in + 1, ubuf, unicode_charcount);

	        if (charcount_out > 0) result = 0;
	    }

	    /* log before freeing, in case etch_free() disturbs the system
	    * last-error value read by etch_encoding_errcheck() */
	    etch_encoding_errcheck(result);

	    if (result == 0) /* transfer ownership of out buffer to caller */
	        *unicode_string_out = ubuf;
	    else
	    if (NULL != ubuf)
	        etch_free(ubuf);

	    #endif /* IS_WINDOWS_ETCH */

	    return result;
	}


	/*
	* etch_utf8_to_unicode()
	* convenience wrapper which tells etch_8bit_to_unicode() the input is utf-8.
	* @param unicode_string_out receives the wide character string,
	* caller assumes ownership of this memory, which must be etch_free()'d
	* @param utf8_string_in a character string owned by caller.
	* @return 0 or -1
	*/
	int etch_utf8_to_unicode(wchar_t** unicode_string_out, char* utf8_string_in)
	{
	    const int from_utf8 = 1; /* codepage selector understood by etch_8bit_to_unicode() */
	    const int rc = etch_8bit_to_unicode(unicode_string_out, utf8_string_in, from_utf8);
	    return rc;
	}


	/*
	* etch_ansi_to_unicode()
	* convenience wrapper which tells etch_8bit_to_unicode() the input is ansi.
	* @param unicode_string_out receives the wide character string,
	* caller assumes ownership of this memory, which must be etch_free()'d
	* @param ascii_string_in a character string owned by caller.
	* @return 0 or -1
	*/
	int etch_ansi_to_unicode(wchar_t** unicode_string_out, char* ascii_string_in)
	{
	    const int from_ansi = 0; /* codepage selector understood by etch_8bit_to_unicode() */
	    const int rc = etch_8bit_to_unicode(unicode_string_out, ascii_string_in, from_ansi);
	    return rc;
	}


	/*
	* etch_encoding_errcheck()
	* private method which logs the system error after a failed conversion.
	* does nothing when result is zero, or when IS_WINDOWS_ETCH is not set.
	* @param result the 0 or -1 outcome of an encoding conversion.
	*/
	void etch_encoding_errcheck(const int result)
	{
	    int lasterror, logcode;
	    if (result == 0) return;

	    #if IS_WINDOWS_ETCH

	    lasterror = GetLastError();

	    /* three known errors are logged as 1, 2, 3; anything else is
	    * logged as the raw system error code */
	    if      (lasterror == ERROR_INSUFFICIENT_BUFFER) logcode = 1;
	    else if (lasterror == ERROR_INVALID_FLAGS)       logcode = 2;
	    else if (lasterror == ERROR_INVALID_PARAMETER)   logcode = 3;
	    else logcode = lasterror;

	    etchlog(ETCH_ENCODING_LOGID,ETCHLOG_ERROR,ETCH_ENCODING_EMASK, logcode);

	    #endif
	}


	/*
	* etch_get_unicode_bytecount()
	* @param widestring a null terminated unicode string, may be NULL.
	* @return number of bytes in the string, including the null terminator,
	* or zero if the string is NULL or empty.
	*/
	size_t etch_get_unicode_bytecount (wchar_t* widestring)
	{
	    const size_t charcount = widestring? wcslen(widestring): 0;
	    if (0 == charcount) return 0;

	    /* the characters plus the terminator, sizeof(wchar_t) bytes each.
	    * computed arithmetically: casting the pointers to a one byte char
	    * discarded most of each address and gave a wrong byte count. */
	    return (charcount + 1) * sizeof(wchar_t);
	}