| /******************************************************************************* |
| * Copyright (c) 2009, 2013 IBM Corp. |
| * |
| * All rights reserved. This program and the accompanying materials |
| * are made available under the terms of the Eclipse Public License v1.0 |
| * and Eclipse Distribution License v1.0 which accompany this distribution. |
| * |
| * The Eclipse Public License is available at |
| * http://www.eclipse.org/legal/epl-v10.html |
| * and the Eclipse Distribution License is available at |
| * http://www.eclipse.org/org/documents/edl-v10.php. |
| * |
| * Contributors: |
| * Ian Craggs - initial API and implementation and/or initial documentation |
| *******************************************************************************/ |
| |
| |
/**
 * @file
 * \brief Functions for checking that strings contain only well-formed UTF-8
 * byte sequences
 *
 * See page 104 of the Unicode Standard 5.0 for the list of well-formed
 * UTF-8 byte sequences.
 *
 */
| #include "utf-8.h" |
| |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "StackTrace.h" |
| |
/**
 * Number of elements in a single-dimension array.
 *
 * The argument must be an actual array object. Applied to a pointer (which
 * includes any array-typed function parameter) the division does not yield
 * an element count. The argument is parenthesized in the expansion so that
 * operator precedence inside the argument expression cannot alter the result.
 */
#if !defined(ARRAY_SIZE)
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#endif
| |
| |
/**
 * Table of the valid byte ranges of UTF-8 characters, for each byte up to 4.
 *
 * Every row describes one family of encodings: "len" is the number of bytes
 * in the sequence and bytes[k] holds the inclusive range permitted for the
 * k-th byte. A character is accepted when it satisfies any single row.
 *
 * The table is read-only and private to this file, hence "static const".
 *
 * The bounds are stored as plain char, the same type as the data bytes they
 * are checked against, so a bound above 0x7F and a data byte above 0x7F
 * undergo the same conversion. No range crosses the 0x7F/0x80 boundary, so
 * the ordering of lower/upper holds whether char is signed or unsigned.
 */
static const struct
{
    int len; /**< number of elements in the following array (1 to 4) */
    struct
    {
        char lower; /**< lower limit of valid range (inclusive) */
        char upper; /**< upper limit of valid range (inclusive) */
    } bytes[4]; /**< up to 4 bytes can be used per character */
}
valid_ranges[] =
{
    /* 1 byte: ASCII */
    {1, { {0x00, 0x7F} } },
    /* 2 bytes: lead bytes 0xC0 and 0xC1 are excluded (overlong forms) */
    {2, { {0xC2, 0xDF}, {0x80, 0xBF} } },
    /* 3 bytes: after 0xE0 the second byte starts at 0xA0 (no overlong forms) */
    {3, { {0xE0, 0xE0}, {0xA0, 0xBF}, {0x80, 0xBF} } },
    {3, { {0xE1, 0xEC}, {0x80, 0xBF}, {0x80, 0xBF} } },
    /* 3 bytes: after 0xED the second byte stops at 0x9F (no surrogates) */
    {3, { {0xED, 0xED}, {0x80, 0x9F}, {0x80, 0xBF} } },
    {3, { {0xEE, 0xEF}, {0x80, 0xBF}, {0x80, 0xBF} } },
    /* 4 bytes: after 0xF0 the second byte starts at 0x90 (no overlong forms) */
    {4, { {0xF0, 0xF0}, {0x90, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF} } },
    {4, { {0xF1, 0xF3}, {0x80, 0xBF}, {0x80, 0xBF}, {0x80, 0xBF} } },
    /* 4 bytes: after 0xF4 the second byte stops at 0x8F (nothing above U+10FFFF) */
    {4, { {0xF4, 0xF4}, {0x80, 0x8F}, {0x80, 0xBF}, {0x80, 0xBF} } },
};
| |
| |
| static const char* UTF8_char_validate(int len, const char* data); |
| |
| |
| /** |
| * Validate a single UTF-8 character |
| * @param len the length of the string in "data" |
| * @param data the bytes to check for a valid UTF-8 char |
| * @return pointer to the start of the next UTF-8 character in "data" |
| */ |
| static const char* UTF8_char_validate(int len, const char* data) |
| { |
| int good = 0; |
| int charlen = 2; |
| int i, j; |
| const char *rc = NULL; |
| |
| FUNC_ENTRY; |
| /* first work out how many bytes this char is encoded in */ |
| if ((data[0] & 128) == 0) |
| charlen = 1; |
| else if ((data[0] & 0xF0) == 0xF0) |
| charlen = 4; |
| else if ((data[0] & 0xE0) == 0xE0) |
| charlen = 3; |
| |
| if (charlen > len) |
| goto exit; /* not enough characters in the string we were given */ |
| |
| for (i = 0; i < ARRAY_SIZE(valid_ranges); ++i) |
| { /* just has to match one of these rows */ |
| if (valid_ranges[i].len == charlen) |
| { |
| good = 1; |
| for (j = 0; j < charlen; ++j) |
| { |
| if (data[j] < valid_ranges[i].bytes[j].lower || |
| data[j] > valid_ranges[i].bytes[j].upper) |
| { |
| good = 0; /* failed the check */ |
| break; |
| } |
| } |
| if (good) |
| break; |
| } |
| } |
| |
| if (good) |
| rc = data + charlen; |
| exit: |
| FUNC_EXIT; |
| return rc; |
| } |
| |
| |
| /** |
| * Validate a length-delimited string has only UTF-8 characters |
| * @param len the length of the string in "data" |
| * @param data the bytes to check for valid UTF-8 characters |
| * @return 1 (true) if the string has only UTF-8 characters, 0 (false) otherwise |
| */ |
| int UTF8_validate(int len, const char* data) |
| { |
| const char* curdata = NULL; |
| int rc = 0; |
| |
| FUNC_ENTRY; |
| if (len == 0) |
| { |
| rc = 1; |
| goto exit; |
| } |
| curdata = UTF8_char_validate(len, data); |
| while (curdata && (curdata < data + len)) |
| curdata = UTF8_char_validate(len, curdata); |
| |
| rc = curdata != NULL; |
| exit: |
| FUNC_EXIT_RC(rc); |
| return rc; |
| } |
| |
| |
| /** |
| * Validate a null-terminated string has only UTF-8 characters |
| * @param string the string to check for valid UTF-8 characters |
| * @return 1 (true) if the string has only UTF-8 characters, 0 (false) otherwise |
| */ |
| int UTF8_validateString(const char* string) |
| { |
| int rc = 0; |
| |
| FUNC_ENTRY; |
| rc = UTF8_validate((int)strlen(string), string); |
| FUNC_EXIT_RC(rc); |
| return rc; |
| } |
| |
| |
| |
| #if defined(UNIT_TESTS) |
| #include <stdio.h> |
| |
/**
 * One test vector for the validator: a byte buffer and the number of its
 * leading bytes that make up the input.
 */
typedef struct
{
    int len;       /**< number of bytes of "data" that form the test input */
    char data[20]; /**< the input bytes; room for up to 20 */
} tests;
| |
| tests valid_strings[] = |
| { |
| {3, "hjk" }, |
| {7, {0x41, 0xE2, 0x89, 0xA2, 0xCE, 0x91, 0x2E} }, |
| {3, {'f', 0xC9, 0xB1 } }, |
| {9, {0xED, 0x95, 0x9C, 0xEA, 0xB5, 0xAD, 0xEC, 0x96, 0xB4} }, |
| {9, {0xE6, 0x97, 0xA5, 0xE6, 0x9C, 0xAC, 0xE8, 0xAA, 0x9E} }, |
| {4, {0x2F, 0x2E, 0x2E, 0x2F} }, |
| {7, {0xEF, 0xBB, 0xBF, 0xF0, 0xA3, 0x8E, 0xB4} }, |
| }; |
| |
| tests invalid_strings[] = |
| { |
| {2, {0xC0, 0x80} }, |
| {5, {0x2F, 0xC0, 0xAE, 0x2E, 0x2F} }, |
| {6, {0xED, 0xA1, 0x8C, 0xED, 0xBE, 0xB4} }, |
| {1, {0xF4} }, |
| }; |
| |
| int main (int argc, char *argv[]) |
| { |
| int i, failed = 0; |
| |
| for (i = 0; i < ARRAY_SIZE(valid_strings); ++i) |
| { |
| if (!UTF8_validate(valid_strings[i].len, valid_strings[i].data)) |
| { |
| printf("valid test %d failed\n", i); |
| failed = 1; |
| } |
| else |
| printf("valid test %d passed\n", i); |
| } |
| |
| for (i = 0; i < ARRAY_SIZE(invalid_strings); ++i) |
| { |
| if (UTF8_validate(invalid_strings[i].len, invalid_strings[i].data)) |
| { |
| printf("invalid test %d failed\n", i); |
| failed = 1; |
| } |
| else |
| printf("invalid test %d passed\n", i); |
| } |
| |
| if (failed) |
| printf("Failed\n"); |
| else |
| printf("Passed\n"); |
| |
| return 0; |
| } /* End of main function*/ |
| |
| #endif |
| |