From e2ec3f976803b360c70d9ae2ba13852fa5d11665 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Thu, 11 Apr 2013 18:07:21 +0200 Subject: qjson: to_json() case QTYPE_QSTRING is buggy, rewrite Known bugs in to_json(): * A start byte for a three-byte sequence followed by less than two continuation bytes is split into one-byte sequences. * Start bytes for sequences longer than three bytes get misinterpreted as start bytes for three-byte sequences. Continuation bytes beyond byte three become one-byte sequences. This means all characters outside the BMP are decoded incorrectly. * One-byte sequences with the MSB are put into the JSON string verbatim when char is unsigned, producing invalid UTF-8. When char is signed, they're replaced by "\\uFFFF" instead. This includes \xFE, \xFF, and stray continuation bytes. * Overlong sequences are happily accepted, unless screwed up by the bugs above. * Likewise, sequences encoding surrogate code points or noncharacters. * Unlike other control characters, ASCII DEL is not escaped. Except in overlong encodings. My rewrite fixes them as follows: * Malformed UTF-8 sequences are replaced. Except the overlong encoding \xC0\x80 of U+0000 is still accepted. Permits embedding NUL characters in C strings. This trick is known as "Modified UTF-8". * Sequences encoding code points beyond Unicode range are replaced. * Sequences encoding code points beyond the BMP produce a surrogate pair. * Sequences encoding surrogate code points are replaced. * Sequences encoding noncharacters are replaced. * ASCII DEL is now always escaped. The replacement character is U+FFFD. Signed-off-by: Markus Armbruster Reviewed-by: Laszlo Ersek Signed-off-by: Blue Swirl --- qobject/qjson.c | 102 +++++++++++++++++++++++++------------------------------- 1 file changed, 45 insertions(+), 57 deletions(-) (limited to 'qobject') diff --git a/qobject/qjson.c b/qobject/qjson.c index 83a6b4f7c1..19085a1bb7 100644 --- a/qobject/qjson.c +++ b/qobject/qjson.c @@ -136,68 +136,56 @@ static void to_json(const QObject *obj, QString *str, int pretty, int indent) case QTYPE_QSTRING: { QString *val = qobject_to_qstring(obj); const char *ptr; + int cp; + char buf[16]; + char *end; ptr = qstring_get_str(val); qstring_append(str, "\""); - while (*ptr) { - if ((ptr[0] & 0xE0) == 0xE0 && - (ptr[1] & 0x80) && (ptr[2] & 0x80)) { - uint16_t wchar; - char escape[7]; - - wchar = (ptr[0] & 0x0F) << 12; - wchar |= (ptr[1] & 0x3F) << 6; - wchar |= (ptr[2] & 0x3F); - ptr += 2; - - snprintf(escape, sizeof(escape), "\\u%04X", wchar); - qstring_append(str, escape); - } else if ((ptr[0] & 0xE0) == 0xC0 && (ptr[1] & 0x80)) { - uint16_t wchar; - char escape[7]; - - wchar = (ptr[0] & 0x1F) << 6; - wchar |= (ptr[1] & 0x3F); - ptr++; - - snprintf(escape, sizeof(escape), "\\u%04X", wchar); - qstring_append(str, escape); - } else switch (ptr[0]) { - case '\"': - qstring_append(str, "\\\""); - break; - case '\\': - qstring_append(str, "\\\\"); - break; - case '\b': - qstring_append(str, "\\b"); - break; - case '\f': - qstring_append(str, "\\f"); - break; - case '\n': - qstring_append(str, "\\n"); - break; - case '\r': - qstring_append(str, "\\r"); - break; - case '\t': - qstring_append(str, "\\t"); - break; - default: { - if (ptr[0] <= 0x1F) { - char escape[7]; - snprintf(escape, sizeof(escape), "\\u%04X", ptr[0]); - qstring_append(str, escape); - } else { - char buf[2] = { ptr[0], 0 }; - qstring_append(str, buf); - } - break; + + for (; *ptr; ptr = end) { + cp = mod_utf8_codepoint(ptr, 6, &end); + switch (cp) { + case '\"': + qstring_append(str, "\\\""); + break; + case '\\': + qstring_append(str, "\\\\"); + break; + case '\b': + qstring_append(str, "\\b"); + break; + case '\f': + qstring_append(str, "\\f"); + break; + case '\n': + qstring_append(str, "\\n"); + break; + case '\r': + qstring_append(str, "\\r"); + break; + case '\t': + qstring_append(str, "\\t"); + break; + default: + if (cp < 0) { + cp = 0xFFFD; /* replacement character */ } + if (cp > 0xFFFF) { + /* beyond BMP; need a surrogate pair */ + snprintf(buf, sizeof(buf), "\\u%04X\\u%04X", + 0xD800 + ((cp - 0x10000) >> 10), + 0xDC00 + ((cp - 0x10000) & 0x3FF)); + } else if (cp < 0x20 || cp >= 0x7F) { + snprintf(buf, sizeof(buf), "\\u%04X", cp); + } else { + buf[0] = cp; + buf[1] = 0; } - ptr++; - } + qstring_append(str, buf); + } + }; + qstring_append(str, "\""); break; } -- cgit v1.2.1