| /** |
| * @file unicode.cpp |
| * Detects, reads and writes characters in the proper format. |
| * |
| * @author Ben Gardner |
| * @license GPL v2+ |
| */ |
| #include "uncrustify_types.h" |
| #include "prototypes.h" |
| #include "unc_ctype.h" |
| #include <cstring> |
| #include <cstdlib> |
| |
| |
| /** |
| * See if all characters are ASCII (0-127) |
| */ |
| static bool is_ascii(const vector<UINT8>& data, int& non_ascii_cnt, int& zero_cnt) |
| { |
| non_ascii_cnt = zero_cnt = 0; |
| for (int idx = 0; idx < (int)data.size(); idx++) |
| { |
| if (data[idx] & 0x80) |
| { |
| non_ascii_cnt++; |
| } |
| if (!data[idx]) |
| { |
| zero_cnt++; |
| } |
| } |
| return((non_ascii_cnt + zero_cnt) == 0); |
| } |
| |
| |
| /** |
| * Convert the array of bytes into an array of ints |
| */ |
| static bool decode_bytes(const vector<UINT8>& in_data, deque<int>& out_data) |
| { |
| out_data.resize(in_data.size()); |
| for (int idx = 0; idx < (int)in_data.size(); idx++) |
| { |
| out_data[idx] = in_data[idx]; |
| } |
| return true; |
| } |
| |
| |
| void encode_utf8(int ch, vector<UINT8>& res) |
| { |
| if (ch < 0) |
| { |
| /* illegal code - do not store */ |
| } |
| else if (ch < 0x80) |
| { |
| /* 0xxxxxxx */ |
| res.push_back(ch); |
| } |
| else if (ch < 0x0800) |
| { |
| /* 110xxxxx 10xxxxxx */ |
| res.push_back(0xC0 | (ch >> 6)); |
| res.push_back(0x80 | (ch & 0x3f)); |
| } |
| else if (ch < 0x10000) |
| { |
| /* 1110xxxx 10xxxxxx 10xxxxxx */ |
| res.push_back(0xE0 | (ch >> 12)); |
| res.push_back(0x80 | ((ch >> 6) & 0x3f)); |
| res.push_back(0x80 | (ch & 0x3f)); |
| } |
| else if (ch < 0x200000) |
| { |
| /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
| res.push_back(0xF0 | (ch >> 18)); |
| res.push_back(0x80 | ((ch >> 12) & 0x3f)); |
| res.push_back(0x80 | ((ch >> 6) & 0x3f)); |
| res.push_back(0x80 | (ch & 0x3f)); |
| } |
| else if (ch < 0x4000000) |
| { |
| /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
| res.push_back(0xF8 | (ch >> 24)); |
| res.push_back(0x80 | ((ch >> 18) & 0x3f)); |
| res.push_back(0x80 | ((ch >> 12) & 0x3f)); |
| res.push_back(0x80 | ((ch >> 6) & 0x3f)); |
| res.push_back(0x80 | (ch & 0x3f)); |
| } |
| else /* (ch <= 0x7fffffff) */ |
| { |
| /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
| res.push_back(0xFC | (ch >> 30)); |
| res.push_back(0x80 | ((ch >> 24) & 0x3f)); |
| res.push_back(0x80 | ((ch >> 18) & 0x3f)); |
| res.push_back(0x80 | ((ch >> 12) & 0x3f)); |
| res.push_back(0x80 | ((ch >> 6) & 0x3f)); |
| res.push_back(0x80 | (ch & 0x3f)); |
| } |
| } |
| |
| |
| /** |
| * Decode UTF-8 sequences from in_data and put the chars in out_data. |
| * If there are any decoding errors, then return false. |
| */ |
| static bool decode_utf8(const vector<UINT8>& in_data, deque<int>& out_data) |
| { |
| int idx = 0; |
| int ch, tmp, cnt; |
| |
| out_data.clear(); |
| |
| /* check for UTF-8 BOM silliness and skip */ |
| if (in_data.size() >= 3) |
| { |
| if ((in_data[0] == 0xef) && |
| (in_data[1] == 0xbb) && |
| (in_data[2] == 0xbf)) |
| { |
| /* skip it */ |
| idx = 3; |
| } |
| } |
| |
| while (idx < (int)in_data.size()) |
| { |
| ch = in_data[idx++]; |
| if (ch < 0x80) /* 1-byte sequence */ |
| { |
| out_data.push_back(ch); |
| continue; |
| } |
| else if ((ch & 0xE0) == 0xC0) /* 2-byte sequence */ |
| { |
| ch &= 0x1F; |
| cnt = 1; |
| } |
| else if ((ch & 0xF0) == 0xE0) /* 3-byte sequence */ |
| { |
| ch &= 0x0F; |
| cnt = 2; |
| } |
| else if ((ch & 0xF8) == 0xF0) /* 4-byte sequence */ |
| { |
| ch &= 0x07; |
| cnt = 3; |
| } |
| else if ((ch & 0xFC) == 0xF8) /* 5-byte sequence */ |
| { |
| ch &= 0x03; |
| cnt = 4; |
| } |
| else if ((ch & 0xFE) == 0xFC) /* 6-byte sequence */ |
| { |
| ch &= 0x01; |
| cnt = 5; |
| } |
| else |
| { |
| /* invalid UTF-8 sequence */ |
| return false; |
| } |
| |
| while ((cnt-- > 0) && (idx < (int)in_data.size())) |
| { |
| tmp = in_data[idx++]; |
| if ((tmp & 0xC0) != 0x80) |
| { |
| /* invalid UTF-8 sequence */ |
| return false; |
| } |
| ch = (ch << 6) | (tmp & 0x3f); |
| } |
| if (cnt >= 0) |
| { |
| /* short UTF-8 sequence */ |
| return false; |
| } |
| out_data.push_back(ch); |
| } |
| return true; |
| } |
| |
| |
| /** |
| * Extract 2 bytes from the stream and increment idx by 2 |
| */ |
| static int get_word(const vector<UINT8>& in_data, int& idx, bool be) |
| { |
| int ch; |
| |
| if ((idx + 2) > (int)in_data.size()) |
| { |
| ch = -1; |
| } |
| else if (be) |
| { |
| ch = (in_data[idx] << 8) | in_data[idx + 1]; |
| } |
| else |
| { |
| ch = in_data[idx] | (in_data[idx + 1] << 8); |
| } |
| idx += 2; |
| return ch; |
| } |
| |
| |
| /** |
| * Deocde a UTF-16 sequence. |
| * Sets enc based on the BOM. |
| * Must have the BOM as the first two bytes. |
| */ |
| static bool decode_utf16(const vector<UINT8>& in_data, deque<int>& out_data, CharEncoding& enc) |
| { |
| out_data.clear(); |
| |
| if (in_data.size() & 1) |
| { |
| /* can't have and odd length */ |
| return false; |
| } |
| |
| if (in_data.size() < 2) |
| { |
| /* we require the BOM or at least 1 char */ |
| return false; |
| } |
| |
| int idx = 2; |
| if ((in_data[0] == 0xfe) && (in_data[1] == 0xff)) |
| { |
| enc = ENC_UTF16_BE; |
| } |
| else if ((in_data[0] == 0xff) && (in_data[1] == 0xfe)) |
| { |
| enc = ENC_UTF16_LE; |
| } |
| else |
| { |
| /* If we have a few words, we can take a guess, assuming the first few |
| * chars are ASCII */ |
| enc = ENC_ASCII; |
| idx = 0; |
| if (in_data.size() >= 6) |
| { |
| if ((in_data[0] == 0) && (in_data[2] == 0) && (in_data[4] == 0)) |
| { |
| enc = ENC_UTF16_BE; |
| } |
| else if ((in_data[1] == 0) && (in_data[3] == 0) && (in_data[5] == 0)) |
| { |
| enc = ENC_UTF16_LE; |
| } |
| } |
| if (enc == ENC_ASCII) |
| { |
| return false; |
| } |
| } |
| |
| bool be = (enc == ENC_UTF16_BE); |
| |
| while (idx < (int)in_data.size()) |
| { |
| int ch = get_word(in_data, idx, be); |
| if ((ch & 0xfc00) == 0xd800) |
| { |
| ch &= 0x3ff; |
| ch <<= 10; |
| int tmp = get_word(in_data, idx, be); |
| if ((tmp & 0xfc00) != 0xdc00) |
| { |
| return false; |
| } |
| ch |= (tmp & 0x3ff); |
| ch += 0x10000; |
| out_data.push_back(ch); |
| } |
| else if (((ch >= 0) && (ch < 0xD800)) || (ch >= 0xE000)) |
| { |
| out_data.push_back(ch); |
| } |
| else |
| { |
| /* invalid character */ |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| |
| /** |
| * Looks for the BOM of UTF-16 BE/LE and UTF-8. |
| * If found, set enc and return true. |
| * Sets enc to ENC_ASCII and returns false if not found. |
| */ |
| static bool decode_bom(const vector<UINT8>& in_data, CharEncoding& enc) |
| { |
| enc = ENC_ASCII; |
| if (in_data.size() >= 2) |
| { |
| if ((in_data[0] == 0xfe) && (in_data[1] == 0xff)) |
| { |
| enc = ENC_UTF16_BE; |
| return true; |
| } |
| else if ((in_data[0] == 0xff) && (in_data[1] == 0xfe)) |
| { |
| enc = ENC_UTF16_LE; |
| return true; |
| } |
| else if ((in_data.size() >= 3) && |
| (in_data[0] == 0xef) && |
| (in_data[1] == 0xbb) && |
| (in_data[2] == 0xbf)) |
| { |
| enc = ENC_UTF8; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| |
| /** |
| * Figure out the encoding and convert to an int sequence |
| */ |
| bool decode_unicode(const vector<UINT8>& in_data, deque<int>& out_data, CharEncoding& enc, bool& has_bom) |
| { |
| /* check for a BOM */ |
| if (decode_bom(in_data, enc)) |
| { |
| has_bom = true; |
| if (enc == ENC_UTF8) |
| { |
| return decode_utf8(in_data, out_data); |
| } |
| else |
| { |
| return decode_utf16(in_data, out_data, enc); |
| } |
| } |
| has_bom = false; |
| |
| /* Check for simple ASCII */ |
| int non_ascii_cnt; |
| int zero_cnt; |
| if (is_ascii(in_data, non_ascii_cnt, zero_cnt)) |
| { |
| enc = ENC_ASCII; |
| return decode_bytes(in_data, out_data); |
| } |
| |
| /* There are alot of 0's in UTF-16 (~50%) */ |
| if ((zero_cnt > ((int)in_data.size() / 4)) && |
| (zero_cnt <= ((int)in_data.size() / 2))) |
| { |
| /* likely is UTF-16 */ |
| if (decode_utf16(in_data, out_data, enc)) |
| { |
| return true; |
| } |
| } |
| |
| if (decode_utf8(in_data, out_data)) |
| { |
| enc = ENC_UTF8; |
| return true; |
| } |
| |
| /* it is an unrecognized byte sequence */ |
| enc = ENC_BYTE; |
| return decode_bytes(in_data, out_data); |
| } |
| |
| |
| /** |
| * Write for ASCII and BYTE encoding |
| */ |
| static void write_byte(int ch) |
| { |
| if ((ch & 0xff) == ch) |
| { |
| if (cpd.fout) |
| { |
| fputc(ch, cpd.fout); |
| } |
| if (cpd.bout) |
| { |
| cpd.bout->push_back((UINT8)ch); |
| } |
| } |
| else |
| { |
| /* illegal code - do not store */ |
| } |
| } |
| |
| |
| /** |
| * Writes a single character to a file using UTF-8 encoding |
| */ |
| static void write_utf8(int ch) |
| { |
| vector<UINT8> vv; |
| vv.reserve(6); |
| |
| encode_utf8(ch, vv); |
| for (int idx = 0; idx < (int)vv.size(); idx++) |
| { |
| write_byte(vv[idx]); |
| } |
| } |
| |
| |
| static void write_utf16(int ch, bool be) |
| { |
| /* U+0000 to U+D7FF and U+E000 to U+FFFF */ |
| if (((ch >= 0) && (ch < 0xD800)) || ((ch >= 0xE000) && (ch < 0x10000))) |
| { |
| if (be) |
| { |
| write_byte(ch >> 8); |
| write_byte(ch & 0xff); |
| } |
| else |
| { |
| write_byte(ch & 0xff); |
| write_byte(ch >> 8); |
| } |
| } |
| else if ((ch >= 0x10000) && (ch < 0x110000)) |
| { |
| int v1 = ch - 0x10000; |
| int w1 = 0xD800 + (v1 >> 10); |
| int w2 = 0xDC00 + (v1 & 0x3ff); |
| if (be) |
| { |
| write_byte(w1 >> 8); |
| write_byte(w1 & 0xff); |
| write_byte(w2 >> 8); |
| write_byte(w2 & 0xff); |
| } |
| else |
| { |
| write_byte(w1 & 0xff); |
| write_byte(w1 >> 8); |
| write_byte(w2 & 0xff); |
| write_byte(w2 >> 8); |
| } |
| } |
| else |
| { |
| /* illegal code - do not store */ |
| } |
| } |
| |
| |
| void write_bom() |
| { |
| switch (cpd.enc) |
| { |
| case ENC_UTF8: |
| write_byte(0xef); |
| write_byte(0xbb); |
| write_byte(0xbf); |
| break; |
| |
| case ENC_UTF16_LE: |
| write_utf16(0xfeff, false); |
| break; |
| |
| case ENC_UTF16_BE: |
| write_utf16(0xfeff, true); |
| break; |
| |
| default: |
| break; |
| } |
| } |
| |
| |
| /** |
| * @param ch the 31-bit char value |
| */ |
| void write_char(int ch) |
| { |
| if (ch >= 0) |
| { |
| switch (cpd.enc) |
| { |
| case ENC_BYTE: |
| write_byte(ch & 0xff); |
| break; |
| |
| case ENC_ASCII: |
| default: |
| write_byte(ch); |
| break; |
| |
| case ENC_UTF8: |
| write_utf8(ch); |
| break; |
| |
| case ENC_UTF16_LE: |
| write_utf16(ch, false); |
| break; |
| |
| case ENC_UTF16_BE: |
| write_utf16(ch, true); |
| break; |
| } |
| } |
| } |
| |
| |
| void write_string(const unc_text& text) |
| { |
| for (int idx = 0; idx < (int)text.size(); idx++) |
| { |
| write_char(text[idx]); |
| } |
| } |