blob: 8e3fe314b43bcd26550c5c64d77de02bd4791e43 [file] [log] [blame]
/**
* @file unicode.cpp
* Detects, reads, and writes characters in the proper encoding.
*
* @author Ben Gardner
* @license GPL v2+
*/
#include "uncrustify_types.h"
#include "prototypes.h"
#include "unc_ctype.h"
#include <cstring>
#include <cstdlib>
/**
* See if all characters are ASCII (0-127)
*/
static bool is_ascii(const vector<UINT8>& data, int& non_ascii_cnt, int& zero_cnt)
{
non_ascii_cnt = zero_cnt = 0;
for (int idx = 0; idx < (int)data.size(); idx++)
{
if (data[idx] & 0x80)
{
non_ascii_cnt++;
}
if (!data[idx])
{
zero_cnt++;
}
}
return((non_ascii_cnt + zero_cnt) == 0);
}
/**
 * Widens each input byte to an int, producing exactly one output element
 * per input byte. No multi-byte interpretation is attempted.
 *
 * @param in_data   the raw bytes
 * @param out_data  [out] its previous content is replaced; element i holds
 *                  the value of in_data[i]
 * @return          always true
 */
static bool decode_bytes(const vector<UINT8>& in_data, deque<int>& out_data)
{
   /* range-assign resizes out_data and converts each byte in one step */
   out_data.assign(in_data.begin(), in_data.end());
   return true;
}
/**
 * Appends the UTF-8 style encoding of a single character value to res.
 *
 * Values below 0x80 take one byte. Larger values use a lead byte followed
 * by 1 to 5 continuation bytes (10xxxxxx), so any non-negative int up to
 * 0x7fffffff is encoded using at most 6 bytes. Negative values are
 * silently ignored and nothing is appended.
 *
 * @param ch   the character value to encode
 * @param res  [in,out] the encoded bytes are pushed onto the end
 */
void encode_utf8(int ch, vector<UINT8>& res)
{
   if (ch < 0)
   {
      /* illegal code - do not store */
      return;
   }
   if (ch < 0x80)
   {
      /* 0xxxxxxx */
      res.push_back(ch);
      return;
   }

   int lead_marker; /* the high bits that tag the length in the lead byte */
   int trail_cnt;   /* how many 10xxxxxx bytes follow the lead byte */

   if (ch < 0x0800)
   {
      /* 110xxxxx + 1 continuation byte */
      lead_marker = 0xC0;
      trail_cnt   = 1;
   }
   else if (ch < 0x10000)
   {
      /* 1110xxxx + 2 continuation bytes */
      lead_marker = 0xE0;
      trail_cnt   = 2;
   }
   else if (ch < 0x200000)
   {
      /* 11110xxx + 3 continuation bytes */
      lead_marker = 0xF0;
      trail_cnt   = 3;
   }
   else if (ch < 0x4000000)
   {
      /* 111110xx + 4 continuation bytes */
      lead_marker = 0xF8;
      trail_cnt   = 4;
   }
   else /* (ch <= 0x7fffffff) */
   {
      /* 1111110x + 5 continuation bytes */
      lead_marker = 0xFC;
      trail_cnt   = 5;
   }

   /* the lead byte carries the bits above all of the continuation payloads */
   res.push_back(lead_marker | (ch >> (6 * trail_cnt)));

   /* emit the remaining bits 6 at a time, most significant group first */
   for (int shift = 6 * (trail_cnt - 1); shift >= 0; shift -= 6)
   {
      res.push_back(0x80 | ((ch >> shift) & 0x3f));
   }
}
/**
 * Decodes UTF-8 style sequences (1 to 6 bytes each) from in_data and
 * appends one int per decoded character to out_data.
 *
 * A leading EF BB BF (the UTF-8 BOM) is skipped and not emitted.
 * Decoding stops at the first problem: a byte that cannot start a
 * sequence, a continuation byte that is not of the form 10xxxxxx, or a
 * sequence cut short by the end of the data. In those cases out_data keeps
 * the characters decoded before the failure.
 *
 * @param in_data   the raw bytes
 * @param out_data  [out] cleared first, then filled with decoded values
 * @return          true if the whole buffer decoded, false on any error
 */
static bool decode_utf8(const vector<UINT8>& in_data, deque<int>& out_data)
{
   const int total = (int)in_data.size();
   int       pos   = 0;

   out_data.clear();

   /* check for UTF-8 BOM silliness and skip */
   if ((total >= 3) &&
       (in_data[0] == 0xef) &&
       (in_data[1] == 0xbb) &&
       (in_data[2] == 0xbf))
   {
      pos = 3;
   }

   while (pos < total)
   {
      const int lead = in_data[pos++];
      int       value;
      int       trail_cnt;

      if (lead < 0x80)
      {
         /* 1-byte sequence: the byte is the character */
         out_data.push_back(lead);
         continue;
      }

      /* the count of leading 1 bits gives the sequence length */
      if ((lead & 0xE0) == 0xC0)        /* 2-byte sequence */
      {
         value     = lead & 0x1F;
         trail_cnt = 1;
      }
      else if ((lead & 0xF0) == 0xE0)   /* 3-byte sequence */
      {
         value     = lead & 0x0F;
         trail_cnt = 2;
      }
      else if ((lead & 0xF8) == 0xF0)   /* 4-byte sequence */
      {
         value     = lead & 0x07;
         trail_cnt = 3;
      }
      else if ((lead & 0xFC) == 0xF8)   /* 5-byte sequence */
      {
         value     = lead & 0x03;
         trail_cnt = 4;
      }
      else if ((lead & 0xFE) == 0xFC)   /* 6-byte sequence */
      {
         value     = lead & 0x01;
         trail_cnt = 5;
      }
      else
      {
         /* invalid UTF-8 sequence */
         return false;
      }

      for (int k = 0; k < trail_cnt; k++)
      {
         if (pos >= total)
         {
            /* short UTF-8 sequence */
            return false;
         }
         const int cont = in_data[pos++];
         if ((cont & 0xC0) != 0x80)
         {
            /* invalid UTF-8 sequence */
            return false;
         }
         value = (value << 6) | (cont & 0x3f);
      }
      out_data.push_back(value);
   }
   return true;
}
/**
 * Reads one 16-bit value from the byte stream at position idx.
 *
 * idx is always advanced by 2, even when there are fewer than 2 bytes
 * left to read.
 *
 * @param in_data  the raw bytes
 * @param idx      [in,out] position of the first byte; incremented by 2
 * @param be       true: first byte is the high byte (big endian);
 *                 false: first byte is the low byte (little endian)
 * @return         the 16-bit value, or -1 if fewer than 2 bytes remain
 */
static int get_word(const vector<UINT8>& in_data, int& idx, bool be)
{
   const int start = idx;

   idx = start + 2;

   if ((start + 2) > (int)in_data.size())
   {
      return -1;
   }

   const int first  = in_data[start];
   const int second = in_data[start + 1];

   return be ? ((first << 8) | second) : (first | (second << 8));
}
/**
 * Decodes a UTF-16 byte stream into one int per character.
 *
 * The byte order is taken from the BOM (FE FF = big endian, FF FE = little
 * endian) when the data starts with one; the BOM itself is not emitted.
 * Without a BOM, and only if there are at least 6 bytes, the order is
 * guessed by assuming the first three characters are ASCII: zeros at byte
 * offsets 0/2/4 mean big endian, zeros at 1/3/5 mean little endian.
 * If no guess can be made, enc is set to ENC_ASCII and false is returned.
 *
 * Surrogate pairs are combined into a single value >= 0x10000. An unpaired
 * surrogate or a truncated pair is an error.
 *
 * @param in_data   the raw bytes; must have an even, non-zero length
 * @param out_data  [out] cleared first, then filled with decoded values
 * @param enc       [out] the detected encoding; left untouched when the
 *                  length check fails
 * @return          true if the whole buffer decoded, false on any error
 */
static bool decode_utf16(const vector<UINT8>& in_data, deque<int>& out_data, CharEncoding& enc)
{
   out_data.clear();

   /* can't have an odd length, and we require the BOM or at least 1 char */
   if ((in_data.size() & 1) || (in_data.size() < 2))
   {
      return false;
   }

   const int  total  = (int)in_data.size();
   const bool bom_be = (in_data[0] == 0xfe) && (in_data[1] == 0xff);
   const bool bom_le = (in_data[0] == 0xff) && (in_data[1] == 0xfe);
   int        pos;

   if (bom_be || bom_le)
   {
      enc = bom_be ? ENC_UTF16_BE : ENC_UTF16_LE;
      pos = 2;   /* step over the BOM */
   }
   else
   {
      /* If we have a few words, we can take a guess, assuming the first few
       * chars are ASCII */
      const bool can_guess = (total >= 6);

      if (can_guess && (in_data[0] == 0) && (in_data[2] == 0) && (in_data[4] == 0))
      {
         enc = ENC_UTF16_BE;
      }
      else if (can_guess && (in_data[1] == 0) && (in_data[3] == 0) && (in_data[5] == 0))
      {
         enc = ENC_UTF16_LE;
      }
      else
      {
         enc = ENC_ASCII;
         return false;
      }
      pos = 0;   /* no BOM, so the first word is data */
   }

   const bool be = (enc == ENC_UTF16_BE);

   while (pos < total)
   {
      const int unit = get_word(in_data, pos, be);

      if ((unit & 0xfc00) == 0xd800)
      {
         /* high surrogate: the next word must be the matching low surrogate */
         const int low = get_word(in_data, pos, be);

         if ((low & 0xfc00) != 0xdc00)
         {
            return false;
         }
         out_data.push_back((((unit & 0x3ff) << 10) | (low & 0x3ff)) + 0x10000);
         continue;
      }

      if ((unit < 0) || ((unit >= 0xD800) && (unit < 0xE000)))
      {
         /* invalid character: a failed read (-1) or a stray low surrogate */
         return false;
      }
      out_data.push_back(unit);
   }
   return true;
}
/**
 * Checks whether the data starts with a byte order mark.
 *
 * Recognized marks: FE FF (UTF-16 BE), FF FE (UTF-16 LE) and
 * EF BB BF (UTF-8).
 *
 * @param in_data  the raw bytes
 * @param enc      [out] the encoding named by the BOM, or ENC_ASCII if
 *                 there is none
 * @return         true if a BOM was found, false otherwise
 */
static bool decode_bom(const vector<UINT8>& in_data, CharEncoding& enc)
{
   const int len = (int)in_data.size();

   /* ENC_ASCII doubles as the "no BOM seen" marker below */
   enc = ENC_ASCII;

   if (len < 2)
   {
      return false;
   }

   const int b0 = in_data[0];
   const int b1 = in_data[1];

   if ((b0 == 0xfe) && (b1 == 0xff))
   {
      enc = ENC_UTF16_BE;
   }
   else if ((b0 == 0xff) && (b1 == 0xfe))
   {
      enc = ENC_UTF16_LE;
   }
   else if ((len >= 3) && (b0 == 0xef) && (b1 == 0xbb) && (in_data[2] == 0xbf))
   {
      enc = ENC_UTF8;
   }

   return(enc != ENC_ASCII);
}
/**
 * Works out the encoding of a byte buffer and converts it to a sequence
 * of ints.
 *
 * Detection order:
 *  1. A BOM decides the encoding; the result of that decoder is returned
 *     as-is, with no fallback if it fails.
 *  2. No high-bit bytes and no zero bytes: ENC_ASCII.
 *  3. Zero bytes make up more than 1/4 and at most 1/2 of the data:
 *     try UTF-16.
 *  4. Try UTF-8.
 *  5. Otherwise treat the data as raw bytes (ENC_BYTE).
 *
 * @param in_data   the raw bytes
 * @param out_data  [out] the decoded characters
 * @param enc       [out] the encoding that was selected
 * @param has_bom   [out] true if the data started with a BOM
 * @return          true on success; false only if a BOM was present and
 *                  the matching decoder rejected the data
 */
bool decode_unicode(const vector<UINT8>& in_data, deque<int>& out_data, CharEncoding& enc, bool& has_bom)
{
   /* check for a BOM */
   has_bom = decode_bom(in_data, enc);
   if (has_bom)
   {
      return((enc == ENC_UTF8) ? decode_utf8(in_data, out_data)
                               : decode_utf16(in_data, out_data, enc));
   }

   /* Check for simple ASCII */
   int high_bit_cnt;
   int nul_cnt;

   if (is_ascii(in_data, high_bit_cnt, nul_cnt))
   {
      enc = ENC_ASCII;
      return decode_bytes(in_data, out_data);
   }

   /* There are a lot of 0's in UTF-16 (~50%) */
   const int  total       = (int)in_data.size();
   const bool maybe_utf16 = (nul_cnt > (total / 4)) && (nul_cnt <= (total / 2));

   if (maybe_utf16 && decode_utf16(in_data, out_data, enc))
   {
      return true;
   }

   if (!decode_utf8(in_data, out_data))
   {
      /* it is an unrecognized byte sequence */
      enc = ENC_BYTE;
      return decode_bytes(in_data, out_data);
   }

   enc = ENC_UTF8;
   return true;
}
/**
 * Emits a single byte; used directly for the ASCII and BYTE encodings and
 * as the low-level writer for the other encodings.
 *
 * The byte goes to cpd.fout (via fputc) and is appended to cpd.bout,
 * whichever of the two are set. Values outside 0-255 are dropped.
 *
 * @param ch  the byte value to write
 */
static void write_byte(int ch)
{
   if ((ch < 0) || (ch > 0xff))
   {
      /* illegal code - do not store */
      return;
   }

   if (cpd.fout)
   {
      fputc(ch, cpd.fout);
   }
   if (cpd.bout)
   {
      cpd.bout->push_back((UINT8)ch);
   }
}
/**
* Writes a single character to a file using UTF-8 encoding
*/
static void write_utf8(int ch)
{
vector<UINT8> vv;
vv.reserve(6);
encode_utf8(ch, vv);
for (int idx = 0; idx < (int)vv.size(); idx++)
{
write_byte(vv[idx]);
}
}
/**
 * Writes a single character to the output using UTF-16 encoding.
 *
 * U+0000..U+D7FF and U+E000..U+FFFF are written as one 16-bit word.
 * U+10000..U+10FFFF are written as a surrogate pair (two words).
 * Anything else (negative values, the surrogate range itself, values of
 * 0x110000 and up) is silently dropped.
 *
 * @param ch  the character value to write
 * @param be  true to write each word high byte first (big endian),
 *            false to write low byte first (little endian)
 */
static void write_utf16(int ch, bool be)
{
   int words[2] = { 0, 0 };
   int word_cnt;

   if (((ch >= 0) && (ch < 0xD800)) || ((ch >= 0xE000) && (ch < 0x10000)))
   {
      /* U+0000 to U+D7FF and U+E000 to U+FFFF */
      words[0] = ch;
      word_cnt = 1;
   }
   else if ((ch >= 0x10000) && (ch < 0x110000))
   {
      /* split the 20-bit offset into high and low surrogates */
      const int offset = ch - 0x10000;

      words[0] = 0xD800 + (offset >> 10);
      words[1] = 0xDC00 + (offset & 0x3ff);
      word_cnt = 2;
   }
   else
   {
      /* illegal code - do not store */
      return;
   }

   for (int w = 0; w < word_cnt; w++)
   {
      const int hi = words[w] >> 8;
      const int lo = words[w] & 0xff;

      write_byte(be ? hi : lo);
      write_byte(be ? lo : hi);
   }
}
/**
 * Writes the byte order mark that matches cpd.enc.
 *
 * UTF-8 gets EF BB BF; UTF-16 LE/BE get U+FEFF in the matching byte
 * order. Nothing is written for any other encoding.
 */
void write_bom()
{
   if (cpd.enc == ENC_UTF8)
   {
      static const int utf8_bom[] = { 0xef, 0xbb, 0xbf };

      for (int i = 0; i < 3; i++)
      {
         write_byte(utf8_bom[i]);
      }
   }
   else if (cpd.enc == ENC_UTF16_LE)
   {
      write_utf16(0xfeff, false);
   }
   else if (cpd.enc == ENC_UTF16_BE)
   {
      write_utf16(0xfeff, true);
   }
}
/**
 * Writes one character to the output in the encoding selected by cpd.enc.
 *
 * Negative values are ignored. For ENC_BYTE only the low 8 bits are
 * written. For ENC_ASCII, and for any encoding without its own case, the
 * value is handed to write_byte() unchanged.
 *
 * @param ch the 31-bit char value
 */
void write_char(int ch)
{
   if (ch < 0)
   {
      return;
   }

   switch (cpd.enc)
   {
   case ENC_UTF8:
      write_utf8(ch);
      break;

   case ENC_UTF16_LE:
      write_utf16(ch, false);
      break;

   case ENC_UTF16_BE:
      write_utf16(ch, true);
      break;

   case ENC_BYTE:
      write_byte(ch & 0xff);
      break;

   case ENC_ASCII:
   default:
      write_byte(ch);
      break;
   }
}
/**
 * Writes every character of text, in order, using write_char().
 *
 * @param text  the characters to write
 */
void write_string(const unc_text& text)
{
   const int len = (int)text.size();
   int       pos = 0;

   while (pos < len)
   {
      write_char(text[pos]);
      pos++;
   }
}