misc/win32/utf8.c - apr - Git at Google

 /* Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include "apr.h"
 #include "apr_private.h"
 #include "apr_errno.h"
 #include "apr_arch_utf8.h"

 /* Implement the design principal specified by RFC 2718 2.2.5
  * Guidelines for new URL Schemes - within the APR.
  *
  * Since many architectures support unicode, and UCS2 is the most
  * efficient storage used by those archictures, these functions
  * exist to validate a UCS string.  It is up to the operating system
  * to determine the validitity of the string in the context of it's
  * native language support.  File systems that support filename
  * characters of 0x80-0xff but have no support of Unicode will find
  * this function useful only for validating the character sequences
  * and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is
  * desired.
  *
  * from RFC 2279 UTF-8, a transformation format of ISO 10646
  *
  *     UCS-4 range (hex.)    UTF-8 octet sequence (binary)
  * 1:2 0000 0000-0000 007F   0xxxxxxx
  * 2:2 0000 0080-0000 07FF   110XXXXx 10xxxxxx
  * 3:2 0000 0800-0000 FFFF   1110XXXX 10Xxxxxx 10xxxxxx
  * 4:4 0001 0000-001F FFFF   11110zXX 10XXxxxx 10xxxxxx 10xxxxxx
  * inv 0020 0000-03FF FFFF   111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
  * inv 0400 0000-7FFF FFFF   1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  *
  * One of the X values must be one for the encoding length to be legit.
  * Neither the z bit, nor the final two forms, are used for ucs-2
  *
  *   "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
  *   Unicode parlance), being actually UCS-4 characters transformed
  *   through UTF-16, need special treatment: the UTF-16 transformation
  *   must be undone, yielding a UCS-4 character that is then transformed
  *   as above."
  *
  * from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
  *
  *  U' = U - 0x10000
  *  U' = 000000000000yyyyyyyyyyxxxxxxxxxx
  *                  W1 = 110110yyyyyyyyyy
  *                  W2 = 110111xxxxxxxxxx
  *
  * apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
  *
  * apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
  */

 APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in,
                                                 apr_size_t *inbytes,
                                                 apr_wchar_t *out,
                                                 apr_size_t *outwords)
 {
     apr_int64_t newch, mask;
     apr_size_t expect, eating;
     int ch;

     while (*inbytes && *outwords)
     {
         ch = (unsigned char)(*in++);
         if (!(ch & 0200)) {
             /* US-ASCII-7 plain text
              */
             --*inbytes;
             --*outwords;
             *(out++) = ch;
         }
         else
         {
             if ((ch & 0300) != 0300) {
                 /* Multibyte Continuation is out of place
                  */
                 return APR_EINVAL;
             }
             else
             {
                 /* Multibyte Sequence Lead Character
                  *
                  * Compute the expected bytes while adjusting
                  * or lead byte and leading zeros mask.
                  */
                 mask = 0340;
                 expect = 1;
                 while ((ch & mask) == mask) {
                     mask |= mask >> 1;
                     if (++expect > 3) /* (truly 5 for ucs-4) */
                         return APR_EINVAL;
                 }
                 newch = ch & ~mask;
                 eating = expect + 1;
                 if (*inbytes <= expect)
                     return APR_INCOMPLETE;
                 /* Reject values of excessive leading 0 bits
                  * utf-8 _demands_ the shortest possible byte length
                  */
                 if (expect == 1) {
                     if (!(newch & 0036))
                         return APR_EINVAL;
                 }
                 else {
                     /* Reject values of excessive leading 0 bits
                      */
                     if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
                         return APR_EINVAL;
                     if (expect == 2) {
                         /* Reject values D800-DFFF when not utf16 encoded
                          * (may not be an appropriate restriction for ucs-4)
                          */
                         if (newch == 0015 && ((unsigned char)*in & 0040))
                             return APR_EINVAL;
                     }
                     else if (expect == 3) {
                         /* Short circuit values > 110000
                          */
                         if (newch > 4)
                             return APR_EINVAL;
                         if (newch == 4 && ((unsigned char)*in & 0060))
                             return APR_EINVAL;
                     }
                 }
                 /* Where the boolean (expect > 2) is true, we will need
                  * an extra word for the output.
                  */
                 if (*outwords < (apr_size_t)(expect > 2) + 1)
                     break; /* buffer full */
                 while (expect--)
                 {
                     /* Multibyte Continuation must be legal */
                     if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
                         return APR_EINVAL;
                     newch <<= 6;
                     newch |= (ch & 0077);
                 }
                 *inbytes -= eating;
                 /* newch is now a true ucs-4 character
                  *
                  * now we need to fold to ucs-2
                  */
                 if (newch < 0x10000)
                 {
                     --*outwords;
                     *(out++) = (apr_wchar_t) newch;
                 }
                 else
                 {
                     *outwords -= 2;
                     newch -= 0x10000;
                     *(out++) = (apr_wchar_t) (0xD800 | (newch >> 10));
                     *(out++) = (apr_wchar_t) (0xDC00 | (newch & 0x03FF));
                 }
             }
         }
     }
     /* Buffer full 'errors' aren't errors, the client must inspect both
      * the inbytes and outwords values
      */
     return APR_SUCCESS;
 }

 APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in,
                                                 apr_size_t *inwords,
                                                 char *out,
                                                 apr_size_t *outbytes)
 {
     apr_int64_t newch, require;
     apr_size_t need;
     char *invout;
     int ch;

     while (*inwords && *outbytes)
     {
         ch = (unsigned short)(*in++);
         if (ch < 0x80)
         {
             --*inwords;
             --*outbytes;
             *(out++) = (unsigned char) ch;
         }
         else
         {
             if ((ch & 0xFC00) == 0xDC00) {
                 /* Invalid Leading ucs-2 Multiword Continuation Character
                  */
                 return APR_EINVAL;
             }
             if ((ch & 0xFC00) == 0xD800) {
                 /* Leading ucs-2 Multiword Character
                  */
                 if (*inwords < 2) {
                     /* Missing ucs-2 Multiword Continuation Character
                      */
                     return APR_INCOMPLETE;
                 }
                 if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
                     /* Invalid ucs-2 Multiword Continuation Character
                      */
                     return APR_EINVAL;
                 }
                 newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
                 newch += 0x10000;
             }
             else {
                 /* ucs-2 Single Word Character
                  */
                 newch = ch;
             }
             /* Determine the absolute minimum utf-8 bytes required
              */
             require = newch >> 11;
             need = 1;
             while (require)
                 require >>= 5, ++need;
             if (need >= *outbytes)
                 break; /* Insufficient buffer */
             *inwords -= (need > 2) + 1;
             *outbytes -= need + 1;
             /* Compute the utf-8 characters in last to first order,
              * calculating the lead character length bits along the way.
              */
             ch = 0200;
             out += need + 1;
             invout = out;
             while (need--) {
                 ch |= ch >> 1;
                 *(--invout) = (unsigned char)(0200 | (newch & 0077));
                 newch >>= 6;
             }
             /* Compute the lead utf-8 character and move the dest offset
              */
             *(--invout) = (unsigned char)(ch | newch);
         }
     }
     /* Buffer full 'errors' aren't errors, the client must inspect both
      * the inwords and outbytes values
      */
     return APR_SUCCESS;
 }
	/* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include "apr.h"
	#include "apr_private.h"
	#include "apr_errno.h"
	#include "apr_arch_utf8.h"

	/* Implement the design principal specified by RFC 2718 2.2.5
	* Guidelines for new URL Schemes - within the APR.
	*
	* Since many architectures support unicode, and UCS2 is the most
	* efficient storage used by those archictures, these functions
	* exist to validate a UCS string. It is up to the operating system
	* to determine the validitity of the string in the context of it's
	* native language support. File systems that support filename
	* characters of 0x80-0xff but have no support of Unicode will find
	* this function useful only for validating the character sequences
	* and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is
	* desired.
	*
	* from RFC 2279 UTF-8, a transformation format of ISO 10646
	*
	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
	* 1:2 0000 0000-0000 007F 0xxxxxxx
	* 2:2 0000 0080-0000 07FF 110XXXXx 10xxxxxx
	* 3:2 0000 0800-0000 FFFF 1110XXXX 10Xxxxxx 10xxxxxx
	* 4:4 0001 0000-001F FFFF 11110zXX 10XXxxxx 10xxxxxx 10xxxxxx
	* inv 0020 0000-03FF FFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
	* inv 0400 0000-7FFF FFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	*
	* One of the X values must be one for the encoding length to be legit.
	* Neither the z bit, nor the final two forms, are used for ucs-2
	*
	* "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
	* Unicode parlance), being actually UCS-4 characters transformed
	* through UTF-16, need special treatment: the UTF-16 transformation
	* must be undone, yielding a UCS-4 character that is then transformed
	* as above."
	*
	* from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
	*
	* U' = U - 0x10000
	* U' = 000000000000yyyyyyyyyyxxxxxxxxxx
	* W1 = 110110yyyyyyyyyy
	* W2 = 110111xxxxxxxxxx
	*
	* apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
	*
	* apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
	*/

	APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in,
	apr_size_t *inbytes,
	apr_wchar_t *out,
	apr_size_t *outwords)
	{
	apr_int64_t newch, mask;
	apr_size_t expect, eating;
	int ch;

	while (inbytes && outwords)
	{
	ch = (unsigned char)(*in++);
	if (!(ch & 0200)) {
	/* US-ASCII-7 plain text
	*/
	--*inbytes;
	--*outwords;
	*(out++) = ch;
	}
	else
	{
	if ((ch & 0300) != 0300) {
	/* Multibyte Continuation is out of place
	*/
	return APR_EINVAL;
	}
	else
	{
	/* Multibyte Sequence Lead Character
	*
	* Compute the expected bytes while adjusting
	* or lead byte and leading zeros mask.
	*/
	mask = 0340;
	expect = 1;
	while ((ch & mask) == mask) {
	mask \|= mask >> 1;
	if (++expect > 3) /* (truly 5 for ucs-4) */
	return APR_EINVAL;
	}
	newch = ch & ~mask;
	eating = expect + 1;
	if (*inbytes <= expect)
	return APR_INCOMPLETE;
	/* Reject values of excessive leading 0 bits
	* utf-8 _demands_ the shortest possible byte length
	*/
	if (expect == 1) {
	if (!(newch & 0036))
	return APR_EINVAL;
	}
	else {
	/* Reject values of excessive leading 0 bits
	*/
	if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
	return APR_EINVAL;
	if (expect == 2) {
	/* Reject values D800-DFFF when not utf16 encoded
	* (may not be an appropriate restriction for ucs-4)
	*/
	if (newch == 0015 && ((unsigned char)*in & 0040))
	return APR_EINVAL;
	}
	else if (expect == 3) {
	/* Short circuit values > 110000
	*/
	if (newch > 4)
	return APR_EINVAL;
	if (newch == 4 && ((unsigned char)*in & 0060))
	return APR_EINVAL;
	}
	}
	/* Where the boolean (expect > 2) is true, we will need
	* an extra word for the output.
	*/
	if (*outwords < (apr_size_t)(expect > 2) + 1)
	break; /* buffer full */
	while (expect--)
	{
	/* Multibyte Continuation must be legal */
	if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
	return APR_EINVAL;
	newch <<= 6;
	newch \|= (ch & 0077);
	}
	*inbytes -= eating;
	/* newch is now a true ucs-4 character
	*
	* now we need to fold to ucs-2
	*/
	if (newch < 0x10000)
	{
	--*outwords;
	*(out++) = (apr_wchar_t) newch;
	}
	else
	{
	*outwords -= 2;
	newch -= 0x10000;
	*(out++) = (apr_wchar_t) (0xD800 \| (newch >> 10));
	*(out++) = (apr_wchar_t) (0xDC00 \| (newch & 0x03FF));
	}
	}
	}
	}
	/* Buffer full 'errors' aren't errors, the client must inspect both
	* the inbytes and outwords values
	*/
	return APR_SUCCESS;
	}

	APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in,
	apr_size_t *inwords,
	char *out,
	apr_size_t *outbytes)
	{
	apr_int64_t newch, require;
	apr_size_t need;
	char *invout;
	int ch;

	while (inwords && outbytes)
	{
	ch = (unsigned short)(*in++);
	if (ch < 0x80)
	{
	--*inwords;
	--*outbytes;
	*(out++) = (unsigned char) ch;
	}
	else
	{
	if ((ch & 0xFC00) == 0xDC00) {
	/* Invalid Leading ucs-2 Multiword Continuation Character
	*/
	return APR_EINVAL;
	}
	if ((ch & 0xFC00) == 0xD800) {
	/* Leading ucs-2 Multiword Character
	*/
	if (*inwords < 2) {
	/* Missing ucs-2 Multiword Continuation Character
	*/
	return APR_INCOMPLETE;
	}
	if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
	/* Invalid ucs-2 Multiword Continuation Character
	*/
	return APR_EINVAL;
	}
	newch = (ch & 0x03FF) << 10 \| ((unsigned short)(*in++) & 0x03FF);
	newch += 0x10000;
	}
	else {
	/* ucs-2 Single Word Character
	*/
	newch = ch;
	}
	/* Determine the absolute minimum utf-8 bytes required
	*/
	require = newch >> 11;
	need = 1;
	while (require)
	require >>= 5, ++need;
	if (need >= *outbytes)
	break; /* Insufficient buffer */
	*inwords -= (need > 2) + 1;
	*outbytes -= need + 1;
	/* Compute the utf-8 characters in last to first order,
	* calculating the lead character length bits along the way.
	*/
	ch = 0200;
	out += need + 1;
	invout = out;
	while (need--) {
	ch \|= ch >> 1;
	*(--invout) = (unsigned char)(0200 \| (newch & 0077));
	newch >>= 6;
	}
	/* Compute the lead utf-8 character and move the dest offset
	*/
	*(--invout) = (unsigned char)(ch \| newch);
	}
	}
	/* Buffer full 'errors' aren't errors, the client must inspect both
	* the inwords and outbytes values
	*/
	return APR_SUCCESS;
	}