c_src/utf8.c - couchdb-jiffy - Git at Google

 // This file is part of Jiffy released under the MIT license.
 // See the LICENSE file for more information.
 #include "jiffy.h"
 #include <stdio.h>

 // Lookup table indexed by byte value. The entries for '0'-'9',
 // 'A'-'F' and 'a'-'f' hold that digit's numeric value (0-15); every
 // other entry is 255, which int_from_hex treats as "not a hex digit".
 static const unsigned char hexvals[256] = {
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     // Indices 48-57: '0'-'9'.
       0,   1,   2,   3,   4,   5,   6,   7,
       8,   9, 255, 255, 255, 255, 255, 255,
     // Indices 65-70: 'A'-'F'.
     255,  10,  11,  12,  13,  14,  15, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     // Indices 97-102: 'a'-'f'.
     255,  10,  11,  12,  13,  14,  15, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,

     // Indices 128-255: never hex digits.
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255,
     255, 255, 255, 255, 255, 255, 255, 255
 };

 // Uppercase hex digit characters indexed by nibble value (0-15);
 // used by int_to_hex to format output.
 static const char hexdigits[16] = {
     '0', '1', '2', '3',
     '4', '5', '6', '7',
     '8', '9', 'A', 'B',
     'C', 'D', 'E', 'F'
 };

 // Parse the four hex digits at p[0..3] (either case) into an integer
 // in the range 0..0xFFFF.
 //
 // Returns the parsed value, or -1 as soon as a byte that is not a hex
 // digit is encountered.
 int
 int_from_hex(const unsigned char* p)
 {
     int acc = 0;
     int i;

     for(i = 0; i < 4; i++) {
         unsigned char digit = hexvals[p[i]];

         // 255 is the table's marker for a non-hex byte.
         if(digit == 255) {
             return -1;
         }
         acc = (acc << 4) | digit;
     }

     return acc;
 }

 // Format val as exactly four uppercase hex digits into p[0..3].
 // No terminator is written.
 //
 // Returns 1 on success, or -1 (leaving p untouched) when val is
 // outside 0..65535.
 int
 int_to_hex(int val, unsigned char* p)
 {
     int shift;

     if(val < 0 || val > 0xFFFF) {
         return -1;
     }

     // Emit nibbles from most to least significant.
     for(shift = 12; shift >= 0; shift -= 4) {
         *p++ = hexdigits[(val >> shift) & 0xF];
     }

     return 1;
 }

 // Number of bytes needed to encode code point c as UTF-8.
 //
 // Returns 1..4, or -1 when c cannot be encoded: a negative value, a
 // UTF-16 surrogate (0xD800..0xDFFF), or a value above 0x10FFFF.
 int
 utf8_len(int c)
 {
     if(c < 0) {
         // Negative values are not code points.
         return -1;
     } else if(c < 0x80) {
         return 1;
     } else if(c < 0x800) {
         return 2;
     } else if(c < 0x10000) {
         // Surrogates have no UTF-8 encoding.
         if(c >= 0xD800 && c <= 0xDFFF) {
             return -1;
         }
         return 3;
     } else if(c <= 0x10FFFF) {
         return 4;
     }
     return -1;
 }

 // Number of bytes needed to write code point c as \uXXXX escapes.
 //
 // Returns 6 for values below 0x10000 (one escape), 12 for values up
 // to 0x10FFFF (two escapes), or -1 when c is negative or above
 // 0x10FFFF. Surrogate values are not rejected here.
 int
 utf8_esc_len(int c)
 {
     if(c < 0 || c > 0x10FFFF) {
         return -1;
     }
     return (c < 0x10000) ? 6 : 12;
 }

 // Validate the single UTF-8 sequence that starts at data[0].
 //
 // data: bytes to inspect.
 // size: number of readable bytes at data.
 //
 // Returns the length of the sequence in bytes (1..4) when it is well
 // formed. Returns -1 when size is 0, the lead byte is not a valid
 // UTF-8 lead byte, the sequence needs more than size bytes, a
 // continuation byte is not of the form 10xxxxxx, the encoding is
 // overlong, or the decoded value is a surrogate or above 0x10FFFF.
 int
 utf8_validate(unsigned char* data, size_t size)
 {
     int ulen = -1;
     int ui;

     // An empty buffer holds no sequence; do not read data[0].
     if(size == 0) {
         return -1;
     }

     // Classify the lead byte to find the expected sequence length.
     if((data[0] & 0x80) == 0x00) {
         ulen = 1;
     } else if((data[0] & 0xE0) == 0xC0) {
         ulen = 2;
     } else if((data[0] & 0xF0) == 0xE0) {
         ulen = 3;
     } else if((data[0] & 0xF8) == 0xF0) {
         ulen = 4;
     }
     if(ulen < 0 || (size_t) ulen > size) {
         return -1;
     }

     // Check each continuation byte.
     for(ui = 1; ui < ulen; ui++) {
         if((data[ui] & 0xC0) != 0x80) return -1;
     }

     // A UTF-8 encoding must use as few bytes as possible, so that
     // e.g. 't' cannot be encoded in three bytes. It is enough to
     // require at least one 1 bit among the x's of each pattern:
     //  1: 0yyyyyyy
     //  2: 110xxxxy 10yyyyyy
     //  3: 1110xxxx 10xyyyyy 10yyyyyy
     //  4: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy

     // ulen == 1 passes by definition
     if(ulen == 2) {
         if((data[0] & 0x1E) == 0)
             return -1;
     } else if(ulen == 3) {
         if((data[0] & 0x0F) + (data[1] & 0x20) == 0)
             return -1;
     } else if(ulen == 4) {
         if((data[0] & 0x07) + (data[1] & 0x30) == 0)
             return -1;
     }

     // Lastly, reject surrogates and values beyond the Unicode range;
     // only three- and four-byte sequences can encode them.
     if(ulen >= 3) {
         ui = utf8_to_unicode(data, ulen);
         if(ui < 0 || (ui >= 0xD800 && ui <= 0xDFFF) || ui > 0x10FFFF) {
             return -1;
         }
     }

     return ulen;
 }

 // Decode the UTF-8 sequence at buf into a Unicode code point.
 //
 // buf:  start of the sequence.
 // size: number of readable bytes at buf.
 //
 // Continuation bytes are not checked for the 10xxxxxx form and
 // overlong encodings are not rejected here.
 //
 // Returns the code point, or -1 when size is 0, the lead byte is
 // invalid, size is too small for the sequence the lead byte
 // announces, the value is a surrogate (0xD800..0xDFFF), or the value
 // is above 0x10FFFF.
 int
 utf8_to_unicode(unsigned char* buf, size_t size)
 {
     int ret;

     // Nothing to decode; do not read buf[0].
     if(size == 0) {
         return -1;
     }

     if((buf[0] & 0x80) == 0x00) {
         // 0xxxxxxx
         ret = buf[0];
     } else if((buf[0] & 0xE0) == 0xC0 && size >= 2) {
         // 110xxxxy 10yyyyyy
         ret = ((buf[0] & 0x1F) << 6)
             | ((buf[1] & 0x3F));
     } else if((buf[0] & 0xF0) == 0xE0 && size >= 3) {
         // 1110xxxx 10xyyyyy 10yyyyyy
         ret = ((buf[0] & 0x0F) << 12)
             | ((buf[1] & 0x3F) << 6)
             | ((buf[2] & 0x3F));
         if(ret >= 0xD800 && ret <= 0xDFFF) {
             ret = -1;
         }
     } else if((buf[0] & 0xF8) == 0xF0 && size >= 4) {
         // 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
         ret = ((buf[0] & 0x07) << 18)
             | ((buf[1] & 0x3F) << 12)
             | ((buf[2] & 0x3F) << 6)
             | ((buf[3] & 0x3F));
         // Four bytes can carry values past the end of Unicode.
         if(ret > 0x10FFFF) {
             ret = -1;
         }
     } else {
         ret = -1;
     }
     return ret;
 }

 // Encode code point c as UTF-8 into buf.
 //
 // buf must have room for up to four bytes; no terminator is written.
 //
 // Returns the number of bytes written (1..4), or -1 without writing
 // anything when c is negative, a surrogate (0xD800..0xDFFF), or above
 // 0x10FFFF.
 int
 unicode_to_utf8(int c, unsigned char* buf)
 {
     if(c < 0) {
         // Negative values are not code points.
         return -1;
     } else if(c < 0x80) {
         buf[0] = c;
         return 1;
     } else if(c < 0x800) {
         buf[0] = 0xC0 + (c >> 6);
         buf[1] = 0x80 + (c & 0x3F);
         return 2;
     } else if(c < 0x10000) {
         // Surrogates have no UTF-8 encoding.
         if(c >= 0xD800 && c <= 0xDFFF) {
             return -1;
         }
         buf[0] = 0xE0 + (c >> 12);
         buf[1] = 0x80 + ((c >> 6) & 0x3F);
         buf[2] = 0x80 + (c & 0x3F);
         return 3;
     } else if(c <= 0x10FFFF) {
         buf[0] = 0xF0 + (c >> 18);
         buf[1] = 0x80 + ((c >> 12) & 0x3F);
         buf[2] = 0x80 + ((c >> 6) & 0x3F);
         buf[3] = 0x80 + (c & 0x3F);
         return 4;
     }
     return -1;
 }

 // Combine a UTF-16 surrogate pair into a single code point.
 //
 // hi must lie in 0xD800..0xDBFF and lo in 0xDC00..0xDFFF.
 // Returns the combined code point (0x10000..0x10FFFF), or -1 when
 // either half is out of range.
 int
 unicode_from_pair(int hi, int lo)
 {
     int hi_ok = (hi >= 0xD800 && hi <= 0xDBFF);
     int lo_ok = (lo >= 0xDC00 && lo <= 0xDFFF);

     if(!hi_ok || !lo_ok) {
         return -1;
     }

     // Each half contributes its low ten bits.
     return 0x10000 + ((hi & 0x3FF) << 10) + (lo & 0x3FF);
 }

 // Write code point val at p as \uXXXX escape text.
 //
 // Values below 0x10000 produce one escape (6 bytes); values up to
 // 0x10FFFF are split into a surrogate pair and produce two escapes
 // (12 bytes). No terminator is written.
 //
 // Returns the number of bytes written, or -1 on failure. Values above
 // 0x10FFFF fail before anything is written; a negative val fails
 // after the leading "\u" has already been stored in p[0..1].
 int
 unicode_uescape(int val, unsigned char* p)
 {
     int offset;
     int hi;
     int lo;

     if(val > 0x10FFFF) {
         return -1;
     }

     p[0] = '\\';
     p[1] = 'u';

     if(val < 0x10000) {
         // int_to_hex rejects negative values.
         return (int_to_hex(val, p + 2) < 0) ? -1 : 6;
     }

     // Split the 20-bit offset into high and low surrogates.
     offset = val - 0x10000;
     hi = 0xD800 | ((offset >> 10) & 0x03FF);
     lo = 0xDC00 | (offset & 0x03FF);

     if(int_to_hex(hi, p + 2) < 0) {
         return -1;
     }
     p[6] = '\\';
     p[7] = 'u';
     if(int_to_hex(lo, p + 8) < 0) {
         return -1;
     }
     return 12;
 }
	// This file is part of Jiffy released under the MIT license.
	// See the LICENSE file for more information.
	#include "jiffy.h"
	#include <stdio.h>

	// Lookup table indexed by byte value. The entries for '0'-'9',
	// 'A'-'F' and 'a'-'f' hold that digit's numeric value (0-15); every
	// other entry is 255, which int_from_hex treats as "not a hex digit".
	static const unsigned char hexvals[256] = {
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	// Indices 48-57: '0'-'9'.
	0, 1, 2, 3, 4, 5, 6, 7,
	8, 9, 255, 255, 255, 255, 255, 255,
	// Indices 65-70: 'A'-'F'.
	255, 10, 11, 12, 13, 14, 15, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	// Indices 97-102: 'a'-'f'.
	255, 10, 11, 12, 13, 14, 15, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,

	// Indices 128-255: never hex digits.
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255,
	255, 255, 255, 255, 255, 255, 255, 255
	};

	// Uppercase hex digit characters indexed by nibble value (0-15);
	// used by int_to_hex to format output.
	static const char hexdigits[16] = {
	'0', '1', '2', '3',
	'4', '5', '6', '7',
	'8', '9', 'A', 'B',
	'C', 'D', 'E', 'F'
	};

	// Parse the four hex digits at p[0..3] (either case) into an integer
	// in the range 0..0xFFFF.
	//
	// Returns the parsed value, or -1 as soon as a byte that is not a hex
	// digit is encountered.
	int
	int_from_hex(const unsigned char* p)
	{
	    int acc = 0;
	    int i;

	    for(i = 0; i < 4; i++) {
	        unsigned char digit = hexvals[p[i]];

	        // 255 is the table's marker for a non-hex byte.
	        if(digit == 255) {
	            return -1;
	        }
	        acc = (acc << 4) | digit;
	    }

	    return acc;
	}

	// Format val as exactly four uppercase hex digits into p[0..3].
	// No terminator is written.
	//
	// Returns 1 on success, or -1 (leaving p untouched) when val is
	// outside 0..65535.
	int
	int_to_hex(int val, unsigned char* p)
	{
	    if(val < 0 || val > 65535) {
	        return -1;
	    }

	    // Most significant nibble first.
	    p[0] = hexdigits[(val >> 12) & 0xF];
	    p[1] = hexdigits[(val >> 8) & 0xF];
	    p[2] = hexdigits[(val >> 4) & 0xF];
	    p[3] = hexdigits[val & 0xF];

	    return 1;
	}

	// Number of bytes needed to encode code point c as UTF-8.
	//
	// Returns 1..4, or -1 when c cannot be encoded: a negative value, a
	// UTF-16 surrogate (0xD800..0xDFFF), or a value above 0x10FFFF.
	int
	utf8_len(int c)
	{
	    if(c < 0) {
	        // Negative values are not code points.
	        return -1;
	    } else if(c < 0x80) {
	        return 1;
	    } else if(c < 0x800) {
	        return 2;
	    } else if(c < 0x10000) {
	        // Surrogates have no UTF-8 encoding.
	        if(c >= 0xD800 && c <= 0xDFFF) {
	            return -1;
	        }
	        return 3;
	    } else if(c <= 0x10FFFF) {
	        return 4;
	    }
	    return -1;
	}

	// Number of bytes needed to write code point c as \uXXXX escapes.
	//
	// Returns 6 for values below 0x10000 (one escape), 12 for values up
	// to 0x10FFFF (two escapes), or -1 when c is negative or above
	// 0x10FFFF. Surrogate values are not rejected here.
	int
	utf8_esc_len(int c)
	{
	    if(c < 0 || c > 0x10FFFF) {
	        return -1;
	    }
	    return (c < 0x10000) ? 6 : 12;
	}

	// Validate the single UTF-8 sequence that starts at data[0].
	//
	// data: bytes to inspect.
	// size: number of readable bytes at data.
	//
	// Returns the length of the sequence in bytes (1..4) when it is well
	// formed. Returns -1 when size is 0, the lead byte is not a valid
	// UTF-8 lead byte, the sequence needs more than size bytes, a
	// continuation byte is not of the form 10xxxxxx, the encoding is
	// overlong, or the decoded value is a surrogate or above 0x10FFFF.
	int
	utf8_validate(unsigned char* data, size_t size)
	{
	    int ulen = -1;
	    int ui;

	    // An empty buffer holds no sequence; do not read data[0].
	    if(size == 0) {
	        return -1;
	    }

	    // Classify the lead byte to find the expected sequence length.
	    if((data[0] & 0x80) == 0x00) {
	        ulen = 1;
	    } else if((data[0] & 0xE0) == 0xC0) {
	        ulen = 2;
	    } else if((data[0] & 0xF0) == 0xE0) {
	        ulen = 3;
	    } else if((data[0] & 0xF8) == 0xF0) {
	        ulen = 4;
	    }
	    if(ulen < 0 || (size_t) ulen > size) {
	        return -1;
	    }

	    // Check each continuation byte.
	    for(ui = 1; ui < ulen; ui++) {
	        if((data[ui] & 0xC0) != 0x80) return -1;
	    }

	    // A UTF-8 encoding must use as few bytes as possible, so that
	    // e.g. 't' cannot be encoded in three bytes. It is enough to
	    // require at least one 1 bit among the x's of each pattern:
	    //  1: 0yyyyyyy
	    //  2: 110xxxxy 10yyyyyy
	    //  3: 1110xxxx 10xyyyyy 10yyyyyy
	    //  4: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy

	    // ulen == 1 passes by definition
	    if(ulen == 2) {
	        if((data[0] & 0x1E) == 0)
	            return -1;
	    } else if(ulen == 3) {
	        if((data[0] & 0x0F) + (data[1] & 0x20) == 0)
	            return -1;
	    } else if(ulen == 4) {
	        if((data[0] & 0x07) + (data[1] & 0x30) == 0)
	            return -1;
	    }

	    // Lastly, reject surrogates and values beyond the Unicode range;
	    // only three- and four-byte sequences can encode them.
	    if(ulen >= 3) {
	        ui = utf8_to_unicode(data, ulen);
	        if(ui < 0 || (ui >= 0xD800 && ui <= 0xDFFF) || ui > 0x10FFFF) {
	            return -1;
	        }
	    }

	    return ulen;
	}

	// Decode the UTF-8 sequence at buf into a Unicode code point.
	//
	// buf:  start of the sequence.
	// size: number of readable bytes at buf.
	//
	// Continuation bytes are not checked for the 10xxxxxx form and
	// overlong encodings are not rejected here.
	//
	// Returns the code point, or -1 when size is 0, the lead byte is
	// invalid, size is too small for the sequence the lead byte
	// announces, the value is a surrogate (0xD800..0xDFFF), or the value
	// is above 0x10FFFF.
	int
	utf8_to_unicode(unsigned char* buf, size_t size)
	{
	    int ret;

	    // Nothing to decode; do not read buf[0].
	    if(size == 0) {
	        return -1;
	    }

	    if((buf[0] & 0x80) == 0x00) {
	        // 0xxxxxxx
	        ret = buf[0];
	    } else if((buf[0] & 0xE0) == 0xC0 && size >= 2) {
	        // 110xxxxy 10yyyyyy
	        ret = ((buf[0] & 0x1F) << 6)
	            | ((buf[1] & 0x3F));
	    } else if((buf[0] & 0xF0) == 0xE0 && size >= 3) {
	        // 1110xxxx 10xyyyyy 10yyyyyy
	        ret = ((buf[0] & 0x0F) << 12)
	            | ((buf[1] & 0x3F) << 6)
	            | ((buf[2] & 0x3F));
	        if(ret >= 0xD800 && ret <= 0xDFFF) {
	            ret = -1;
	        }
	    } else if((buf[0] & 0xF8) == 0xF0 && size >= 4) {
	        // 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy
	        ret = ((buf[0] & 0x07) << 18)
	            | ((buf[1] & 0x3F) << 12)
	            | ((buf[2] & 0x3F) << 6)
	            | ((buf[3] & 0x3F));
	        // Four bytes can carry values past the end of Unicode.
	        if(ret > 0x10FFFF) {
	            ret = -1;
	        }
	    } else {
	        ret = -1;
	    }
	    return ret;
	}

	// Encode code point c as UTF-8 into buf.
	//
	// buf must have room for up to four bytes; no terminator is written.
	//
	// Returns the number of bytes written (1..4), or -1 without writing
	// anything when c is negative, a surrogate (0xD800..0xDFFF), or above
	// 0x10FFFF.
	int
	unicode_to_utf8(int c, unsigned char* buf)
	{
	    if(c < 0) {
	        // Negative values are not code points.
	        return -1;
	    } else if(c < 0x80) {
	        buf[0] = c;
	        return 1;
	    } else if(c < 0x800) {
	        buf[0] = 0xC0 + (c >> 6);
	        buf[1] = 0x80 + (c & 0x3F);
	        return 2;
	    } else if(c < 0x10000) {
	        // Surrogates have no UTF-8 encoding.
	        if(c >= 0xD800 && c <= 0xDFFF) {
	            return -1;
	        }
	        buf[0] = 0xE0 + (c >> 12);
	        buf[1] = 0x80 + ((c >> 6) & 0x3F);
	        buf[2] = 0x80 + (c & 0x3F);
	        return 3;
	    } else if(c <= 0x10FFFF) {
	        buf[0] = 0xF0 + (c >> 18);
	        buf[1] = 0x80 + ((c >> 12) & 0x3F);
	        buf[2] = 0x80 + ((c >> 6) & 0x3F);
	        buf[3] = 0x80 + (c & 0x3F);
	        return 4;
	    }
	    return -1;
	}

	// Combine a UTF-16 surrogate pair into a single code point.
	//
	// hi must lie in 0xD800..0xDBFF and lo in 0xDC00..0xDFFF.
	// Returns the combined code point (0x10000..0x10FFFF), or -1 when
	// either half is out of range.
	int
	unicode_from_pair(int hi, int lo)
	{
	    if(hi < 0xD800 || hi >= 0xDC00) return -1;
	    if(lo < 0xDC00 || lo > 0xDFFF) return -1;
	    // Each half contributes its low ten bits.
	    return ((hi & 0x3FF) << 10) + (lo & 0x3FF) + 0x10000;
	}

	// Write code point val at p as \uXXXX escape text.
	//
	// Values below 0x10000 produce one escape (6 bytes); values up to
	// 0x10FFFF are split into a surrogate pair and produce two escapes
	// (12 bytes). No terminator is written.
	//
	// Returns the number of bytes written, or -1 on failure. Values above
	// 0x10FFFF fail before anything is written; a negative val fails
	// after the leading "\u" has already been stored in p[0..1].
	int
	unicode_uescape(int val, unsigned char* p)
	{
	    int n;
	    if(val < 0x10000) {
	        p[0] = '\\';
	        p[1] = 'u';
	        // int_to_hex rejects negative values.
	        if(int_to_hex(val, p+2) < 0) {
	            return -1;
	        }
	        return 6;
	    } else if (val <= 0x10FFFF) {
	        // Split the 20-bit offset into high and low surrogates.
	        n = val - 0x10000;
	        p[0] = '\\';
	        p[1] = 'u';
	        if(int_to_hex((0xD800 | ((n >> 10) & 0x03FF)), p+2) < 0) {
	            return -1;
	        }
	        p[6] = '\\';
	        p[7] = 'u';
	        if(int_to_hex((0xDC00 | (n & 0x03FF)), p+8) < 0) {
	            return -1;
	        }
	        return 12;
	    }
	    return -1;
	}