Fix incorrect JSON decoding and encoding of Unicode code points above 0xFFFF (handle them as UTF-16 surrogate pairs of \uXXXX escapes)
diff --git a/lang/c++/impl/json/JsonIO.cc b/lang/c++/impl/json/JsonIO.cc
index 6254948..da2d85f 100644
--- a/lang/c++/impl/json/JsonIO.cc
+++ b/lang/c++/impl/json/JsonIO.cc
@@ -314,11 +314,37 @@
}
}
+// Reads the four hex digits after b (callers pass b pointing at the 'u'/'U' of an escape) and accumulates them into n.
+// Returns an iterator to the last digit read; throws Exception on truncated input or a non-hex character.
+static string::const_iterator unicodeParse(string::const_iterator b, string::const_iterator e, uint32_t &n) {
+ string::const_iterator start = b;
+ for (int i = 0; i < 4; i++) {
+ if (++b == e) {
+ throw Exception(boost::format("Invalid unicode escape: %1%") % string(start, b));
+ }
+ n *= 16; // n is not reset here: both callers pass in 0
+ char c = *b;
+ if (c >= '0' && c <= '9') { // explicit range test: isdigit() on a negative char is undefined behaviour
+ n += c - '0';
+ } else if (c >= 'a' && c <= 'f') {
+ n += c - 'a' + 10;
+ } else if (c >= 'A' && c <= 'F') {
+ n += c - 'A' + 10;
+ } else {
+ throw Exception(boost::format("Invalid hex character: %1%") % c);
+ }
+ }
+ return b;
+}
+
+// Decode the given string and return contents as UTF8-encoded bytes.
+// The input does not have the enclosing double-quotes.
string JsonParser::decodeString(const string &s, bool binary) {
string result;
for (string::const_iterator it = s.begin(); it != s.end(); ++it) {
char ch = *it;
if (ch == '\\') {
+ string::const_iterator startSeq = it;
ch = *++it;
switch (ch) {
case '"':
@@ -344,29 +370,49 @@
case 'u':
case 'U': {
uint32_t n = 0;
- char e[4];
- for (char &i : e) {
- n *= 16;
- char c = *++it;
- i = c;
- if (isdigit(c)) {
- n += c - '0';
- } else if (c >= 'a' && c <= 'f') {
- n += c - 'a' + 10;
- } else if (c >= 'A' && c <= 'F') {
- n += c - 'A' + 10;
- }
- }
+ it = unicodeParse(it, s.end(), n);
if (binary) {
if (n > 0xff) {
throw Exception(boost::format(
"Invalid byte for binary: %1%%2%")
- % ch % string(e, 4));
+ % ch % string(startSeq, ++it));
} else {
result.push_back(n);
continue;
}
}
+ if (n >= 0xd800 && n <= 0xdfff) { // surrogates only: 0xe000-0xffff are ordinary code points, not pair halves
+ ++it;
+ if (n > 0xdbff || it == s.end()) { // lone low surrogate, or nothing follows the high surrogate
+ throw Exception(boost::format(
+ "Invalid unicode sequence: %1%")
+ % string(startSeq, it));
+ }
+ if (*it != '\\') { // the low surrogate must arrive as a second escape
+ throw Exception(boost::format(
+ "Invalid unicode sequence: %1%")
+ % string(startSeq, ++it));
+ }
+ ++it;
+ if (it == s.end()) { // input ends right after the backslash
+ throw Exception(boost::format(
+ "Invalid unicode sequence: %1%")
+ % string(startSeq, it));
+ }
+ if (*it != 'u' && *it != 'U') { // only a \u / \U escape can carry the low surrogate
+ throw Exception(boost::format(
+ "Invalid unicode sequence: %1%")
+ % string(startSeq, ++it));
+ }
+ uint32_t m = 0;
+ it = unicodeParse(it, s.end(), m); // leaves it on the last hex digit of the second escape
+ if (m < 0xdc00 || m > 0xdfff) { // second escape is not a low surrogate
+ throw Exception(boost::format(
+ "Invalid unicode sequence: %1%")
+ % string(startSeq, ++it));
+ }
+ n = 0x10000 + (((n - 0xd800) << 10) | (m - 0xdc00)); // combine the pair into one code point
+ }
if (n < 0x80) {
result.push_back(n);
} else if (n < 0x800) {
@@ -376,15 +422,15 @@
result.push_back((n >> 12) | 0xe0);
result.push_back(((n >> 6) & 0x3f) | 0x80);
result.push_back((n & 0x3f) | 0x80);
- } else if (n < 110000) {
+ } else if (n < 0x110000) {
result.push_back((n >> 18) | 0xf0);
result.push_back(((n >> 12) & 0x3f) | 0x80);
result.push_back(((n >> 6) & 0x3f) | 0x80);
result.push_back((n & 0x3f) | 0x80);
} else {
throw Exception(boost::format(
- "Invalid unicode value: %1%i%2%")
- % ch % string(e, 4));
+ "Invalid unicode value: %1%%2%")
+ % n % string(startSeq, ++it));
}
}
continue;
diff --git a/lang/c++/impl/json/JsonIO.hh b/lang/c++/impl/json/JsonIO.hh
index 94889e5..447c0b0 100644
--- a/lang/c++/impl/json/JsonIO.hh
+++ b/lang/c++/impl/json/JsonIO.hh
@@ -263,12 +263,23 @@
out_.write(toHex((static_cast<unsigned char>(c)) % 16));
}
- void escapeUnicode(uint32_t c) {
+ void escapeUnicode16(uint32_t c) {
out_.write('\\');
out_.write('u');
writeHex((c >> 8) & 0xff);
writeHex(c & 0xff);
}
+ // Writes c as one \uXXXX escape, or as a UTF-16 surrogate pair of escapes when c is 0x10000 or above.
+ void escapeUnicode(uint32_t c) {
+ if (c >= 0x110000) {
+ throw Exception(boost::format("Invalid code-point: %1%") % c);
+ } else if (c < 0x10000) {
+ escapeUnicode16(c);
+ } else {
+ escapeUnicode16(0xd800 | (((c - 0x10000) >> 10) & 0x3ff));
+ escapeUnicode16(0xdc00 | ((c - 0x10000) & 0x3ff));
+ }
+ }
void doEncodeString(const char *b, size_t len, bool binary) {
const char *e = b + len;
out_.write('"');
diff --git a/lang/c++/test/JsonTests.cc b/lang/c++/test/JsonTests.cc
index da9722f..125b6d6 100644
--- a/lang/c++/test/JsonTests.cc
+++ b/lang/c++/test/JsonTests.cc
@@ -68,6 +68,7 @@
{R"("\/")", EntityType::String, "/", R"("\/")"},
{R"("\u20ac")", EntityType::String, "\xe2\x82\xac", R"("\u20ac")"},
{R"("\u03c0")", EntityType::String, "\xcf\x80", R"("\u03c0")"},
+ {R"("\Ud8ab\udccd")", EntityType::String, "\xf0\xba\xb3\x8d", R"("\ud8ab\udccd")"},
};
void testBool(const TestData<bool> &d) {