From 7a87d7b6a3fc6f8d2a6ad5c163379cfd6ee97cb9 Mon Sep 17 00:00:00 2001 From: Guy Harris Date: Fri, 11 May 2012 23:55:54 +0000 Subject: Add ENC_UTF_16 and ENC_UCS_2. Note that UTF-16 and UCS-2 are not the same, and that the routines to get "Unicode" strings are really doing UCS-2 (and not doing anything about code values that aren't valid in UCS-2 strings). Have tvb_get_ephemeral_string_enc() separate cases for ASCII and UTF-8, even though they're *currently* treated the same. For FT_UINT_STRING, treat an encoding value of TRUE as meaning "little-endian ASCII"; pass all other encodings through to tvb_get_ephemeral_string_enc(). svn path=/trunk/; revision=42592 --- epan/proto.h | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) (limited to 'epan/proto.h') diff --git a/epan/proto.h b/epan/proto.h index 0828020485..1a77b336d9 100644 --- a/epan/proto.h +++ b/epan/proto.h @@ -245,41 +245,43 @@ typedef struct _protocol protocol_t; * was with FT_UINT_STRINGs, where we had FALSE for the string length * being big-endian and TRUE for it being little-endian. * - * This is a quick and dirty hack for bug 6084, which doesn't require - * support for multiple character encodings in FT_UINT_STRING. We - * introduce ENC_UTF_8 and ENC_EBCDIC, with ENC_UTF_8 being 0 and - * ENC_EBCDIC being the unlikely value 0x0EBCD000, and treat all values - * other than ENC_EBCDIC as UTF-8. That way, no matter how a dissector - * not converted to use ENC_ values calculates the last argument to - * proto_tree_add_item(), it's unlikely to get EBCDIC. + * We now have encoding values for the character encoding. The encoding + * values are encoded in all but the top bit (which is the byte-order + * bit, required for FT_UINT_STRING and for UCS-2 and UTF-16 strings) + * and the bottom bit (which we ignore for now so that programs that + * pass TRUE for the encoding just do ASCII). 
* - * The value for ENC_EBCDIC is subject to change in a future release (or - * to replacement with multiple values for different flavors of EBCDIC). + * We don't yet process ASCII and UTF-8 differently. Ultimately, for + * ASCII, all bytes with the 8th bit set should be mapped to some "this + * is not a valid character" code point, as ENC_ASCII should mean "this + * is ASCII, not some extended variant thereof". We should map + * 0x00 to that as well - null-terminated and null-padded strings + * never have NULs in them, but counted strings might. (Either that, + * or the values for strings should be counted, not null-terminated.) + * For UTF-8, invalid UTF-8 sequences should be mapped to the same + * code point. * - * We currently add some additional encodings, for various ASCII-based - * encodings, but use the same value as ENC_UTF_8, for now, so that we - * can mark the appropriate encoding. Ultimately, we should handle - * those encodings by mapping them to UTF-8 for display; for ASCII, - * all bytes with the 8th bit set should be mapped to some "this is - * not a valid character" glyph, as ENC_ASCII should mean "this is - * ASCII, not some extended variant thereof". Perhaps we should also - * map control characters to the Unicode glyphs showing the name of - * the control character in small caps, diagonally. (Unfortunately, - * those only exist for C0, not C1.) + * We also don't process UTF-16 or UCS-2 differently - we don't + * handle surrogate pairs, and don't handle 2-byte values that + * aren't valid in UTF-16 or UCS-2 strings. + * + * For display, perhaps we should also map control characters to the + * Unicode glyphs showing the name of the control character in small + * caps, diagonally. (Unfortunately, those only exist for C0, not C1.) 
*/ #define ENC_CHARENCODING_MASK 0x7FFFFFFE /* mask out byte-order bits */ -#define ENC_UTF_8 0x00000000 -#define ENC_ASCII 0x00000000 -#define ENC_EBCDIC 0x0EBCD1C0 +#define ENC_ASCII (0 << 1) /* shift up to avoid low-order bit */ +#define ENC_UTF_8 (1 << 1) +#define ENC_UTF_16 (2 << 1) +#define ENC_UCS_2 (3 << 1) +#define ENC_EBCDIC (4 << 1) /* * TODO: * * These could probably be used by existing code: * - * ENC_UTF_16 - UTF-16 * ENC_UCS_4 - UCS-4 - * ENC_UCS_2 - UCS-2 (not the same as UTF-16!) * ENC_ISO_8859_1 - ISO 8859/1 * ENC_ISO_8859_8 - ISO 8859/8 * - "IBM MS DBCS" -- cgit v1.2.1