Fix bug #9618: Invalid utf8 causes JSON dissector assertion failure "g_utf8_validate"

Validate UTF-8 characters in JSON strings and replace each invalid one with '?'.
svn path=/trunk/; revision=54633
author: Jakub Zawadzki <darkjames-ws@darkjames.pl> 2014-01-07 22:17:32 +0000
committer: Jakub Zawadzki <darkjames-ws@darkjames.pl> 2014-01-07 22:17:32 +0000
commit: abda30e9e6d8fd9aa28edc4677796e61a9c88997 (patch)
tree: ab90b4994f29142a1554c36b6160033b3d499405
parent: d1dcee936b2a0ed257c526889b664e2b314d3eb0 (diff)
download: wireshark-abda30e9e6d8fd9aa28edc4677796e61a9c88997.tar.gz
7 files changed, 71 insertions, 20 deletions
diff --git a/epan/dissectors/packet-json.c b/epan/dissectors/packet-json.c
index dc339b5e61..3c9f09193d 100644
--- a/epan/dissectors/packet-json.c
+++ b/epan/dissectors/packet-json.c
@@ -249,6 +249,30 @@ static void after_array(void *tvbparse_data, const void *wanted_data _U_, tvbpar
 	wmem_stack_pop(data->stack);
 }
 
+static int
+json_tvb_memcpy_utf8(char *buf, tvbuff_t *tvb, int offset, int offset_max)
+{
+	int len = ws_utf8_char_len((guint8) *buf); /* buf[0] must already hold the lead byte; len is the sequence length it announces, or -1 */
+
+	/* Copies one UTF-8 sequence into buf and returns how many bytes of buf were filled (1, holding '?', if invalid).
+	 * XXX, before moving to core API check if it's off-by-one safe. For JSON analyzer it's not a problem
+	 * (string always terminated by ", which is not valid UTF-8 continuation character) */
+	if (len == -1 || ((guint) (offset + len)) >= (guint) offset_max) { /* bad lead byte, or offset + len reaches offset_max */
+		*buf = '?';
+		return 1;
+	}
+
+	/* lead byte is already in buf[0]; fetch the remaining len - 1 bytes (none when len == 1) */
+	tvb_memcpy(tvb, buf + 1, offset + 1, len - 1); /* offset is handed to tvb_memcpy as-is, i.e. it is an offset into tvb */
+
+	if (!g_utf8_validate(buf, len, NULL)) { /* the len bytes now in buf are not valid UTF-8: collapse to a single '?' */
+		*buf = '?';
+		return 1; /* only one input byte is accounted for; bytes copied past buf[0] are left for the caller to overwrite */
+	}
+
+	return len; /* number of bytes written to buf, equal to the number of tvb bytes consumed */
+}
+
 static char *json_string_unescape(tvbparse_elem_t *tok)
 {
 	char *str = (char *)wmem_alloc(wmem_packet_scope(), tok->len - 1);
@@ -266,7 +290,6 @@ static char *json_string_unescape(tvbparse_elem_t *tok)
 				case '\"':
 				case '\\':
 				case '/':
-				default:
 					str[j++] = ch;
 					break;
 
@@ -361,10 +384,22 @@ static char *json_string_unescape(tvbparse_elem_t *tok)
 						str[j++] = '?';
 					break;
 				}
+
+				default:
+					/* not valid by JSON grammar (also tvbparse rules should not allow it) */
+					DISSECTOR_ASSERT_NOT_REACHED();
+					break;
 			}
 
-		} else
-			str[j++] = ch;
+		} else {
+			int utf_len;
+
+			str[j] = ch;
+			/* XXX if it's not valid UTF-8 character, add some expert info? (it violates JSON grammar) */
+			utf_len = json_tvb_memcpy_utf8(&str[j], tok->tvb, i, tok->len);
+			j += utf_len;
+			i += (utf_len - 1);
+		}
 
 	}
 	str[j] = '\0';
diff --git a/wsutil/CMakeLists.txt b/wsutil/CMakeLists.txt
index b14c236238..c4251ea0bb 100644
--- a/wsutil/CMakeLists.txt
+++ b/wsutil/CMakeLists.txt
@@ -30,7 +30,6 @@ IF(WIN32)
     inet_ntop.c
     inet_pton.c
     strptime.c
-    unicode-utils.c
     wsgetopt.c
   )
 ENDIF(WIN32)
@@ -69,6 +68,7 @@ set(WSUTIL_FILES
   tempfile.c
   type_util.c
   u3.c
+  unicode-utils.c
   ${WSUTIL_PLATFORM_FILES}
 )
 
diff --git a/wsutil/Makefile.am b/wsutil/Makefile.am
index 6ead86a92f..0b1256c7e8 100644
--- a/wsutil/Makefile.am
+++ b/wsutil/Makefile.am
@@ -99,8 +99,6 @@ EXTRA_DIST =		\
 	Makefile.nmake	\
 	file_util.c	\
 	file_util.h 	\
-	unicode-utils.c	\
-	unicode-utils.h \
 	wsgcrypt.h
 
 CLEANFILES = \
diff --git a/wsutil/Makefile.common b/wsutil/Makefile.common
index 96b5a7cfa3..0efc26fb1a 100644
--- a/wsutil/Makefile.common
+++ b/wsutil/Makefile.common
@@ -61,7 +61,8 @@ LIBWSUTIL_SRC = 	\
 	report_err.c	\
 	tempfile.c	\
 	type_util.c	\
-	u3.c
+	u3.c		\
+	unicode-utils.c
 
 # Header files that are not generated from other files
 LIBWSUTIL_INCLUDES = 	\
@@ -100,4 +101,5 @@ LIBWSUTIL_INCLUDES = 	\
 	report_err.h	\
 	tempfile.h	\
 	type_util.h	\
-	u3.h
+	u3.h		\
+	unicode-utils.h
diff --git a/wsutil/Makefile.nmake b/wsutil/Makefile.nmake
index 673c6e4817..4da5224075 100644
--- a/wsutil/Makefile.nmake
+++ b/wsutil/Makefile.nmake
@@ -28,7 +28,6 @@ OBJECTS = file_util.obj		\
 	inet_pton.obj		\
 	$(LIBWSUTIL_SRC:.c=.obj) \
 	strptime.obj		\
-	unicode-utils.obj	\
 	wsgetopt.obj
 
 # For use when making libwsutil.dll
diff --git a/wsutil/unicode-utils.c b/wsutil/unicode-utils.c
index 8935e46f38..21cc489df7 100644
--- a/wsutil/unicode-utils.c
+++ b/wsutil/unicode-utils.c
@@ -22,12 +22,23 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
-#ifndef _WIN32
-#error "This is only for Windows"
-#endif
-
 #include "unicode-utils.h"
 
+int
+ws_utf8_char_len(guint8 ch) /* byte length of the UTF-8 sequence announced by lead byte ch, or -1 if ch announces none */
+{
+  if (ch >= 0xfe) return -1; /* 0xfe, 0xff */
+  if (ch >= 0xfc) return  6; /* 0xfc-0xfd; 6-byte form, not valid under RFC 3629 */
+  if (ch >= 0xf8) return  5; /* 0xf8-0xfb; 5-byte form, not valid under RFC 3629 */
+  if (ch >= 0xf0) return  4; /* 0xf0-0xf7 */
+  if (ch >= 0xe0) return  3; /* 0xe0-0xef */
+  if (ch >= 0xc0) return  2; /* 0xc0-0xdf */
+  else            return  1; /* 0x00-0xbf; note this includes 0x80-0xbf, not only ASCII */
+}
+
+
+#ifdef _WIN32
+
 #include <shellapi.h>
 
 /** @file
@@ -157,3 +168,5 @@ arg_list_utf_16to8(int argc, char *argv[]) {
     }
   } /* XXX else bail because something is horribly, horribly wrong? */
 }
+
+#endif
diff --git a/wsutil/unicode-utils.h b/wsutil/unicode-utils.h
index a9d5318f6b..7b29d8460a 100644
--- a/wsutil/unicode-utils.h
+++ b/wsutil/unicode-utils.h
@@ -25,21 +25,25 @@
 #ifndef __UNICODEUTIL_H__
 #define __UNICODEUTIL_H__
 
-#include "ws_symbol_export.h"
-
-#ifdef _WIN32
-
 #include "config.h"
 
+#include "ws_symbol_export.h"
+
 #include <glib.h>
-#include <windows.h>
-#include <tchar.h>
-#include <wchar.h>
 
 /**
  * @file Unicode convenience routines.
  */
 
+WS_DLL_PUBLIC
+int ws_utf8_char_len(guint8 ch);
+
+#ifdef _WIN32
+
+#include <windows.h>
+#include <tchar.h>
+#include <wchar.h>
+
 /** Given a UTF-8 string, convert it to UTF-16.  This is meant to be used
  * to convert between GTK+ 2.x (UTF-8) to Windows (UTF-16).
  *
author	Jakub Zawadzki <darkjames-ws@darkjames.pl>	2014-01-07 22:17:32 +0000
committer	Jakub Zawadzki <darkjames-ws@darkjames.pl>	2014-01-07 22:17:32 +0000
commit	abda30e9e6d8fd9aa28edc4677796e61a9c88997 (patch)
tree	ab90b4994f29142a1554c36b6160033b3d499405
parent	d1dcee936b2a0ed257c526889b664e2b314d3eb0 (diff)
download	wireshark-abda30e9e6d8fd9aa28edc4677796e61a9c88997.tar.gz