From 7dbff5189e7ab2e31c1b711ce25be067c648a3a2 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Sun, 28 Jun 2015 01:08:51 +0200
Subject: Add cleanup routine script and notes

For tracking purposes and in case I need to do something similar again.
---
 one-off/cleanup-notes.txt  |  54 +++++
 one-off/cleanup-rewrite.py | 480 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 534 insertions(+)
 create mode 100644 one-off/cleanup-notes.txt
 create mode 100755 one-off/cleanup-rewrite.py

diff --git a/one-off/cleanup-notes.txt b/one-off/cleanup-notes.txt
new file mode 100644
index 0000000..7383598
--- /dev/null
+++ b/one-off/cleanup-notes.txt
@@ -0,0 +1,54 @@
+
+grep -rnw register_init_routine | grep -P 'register_init_routine *\( *&?\K[a-z0-9_]+(?= *\))' -o
+
+grep -P 'register_init_routine *\( *&?\K[a-z0-9_]+(?= *\))' -oHr
+
+# Read files and format in a standard way
+time while IFS=: read filename func; do out=/tmp/out/$filename;
+    mkdir -p "${out%/*}" && clang-format "$filename" >"$out"; done < /tmp/1
+# definition is in different file
+clang-format plugins/mate/mate_runtime.c > /tmp/out/plugins/mate/packet-mate.c
+# change initialize_mate_runtime from extern to static
+clang-format epan/dissectors/packet-ncp2222.inc > /tmp/out/tools/ncp2222.py
+
+# Extract all functions
+while IFS=: read filename func; do out=/tmp/fns/$filename; mkdir -p "${out%/*}" && awk "/^(static )?void $func\\(/&&! /;$/{p=1;if(/}$/){print;exit}};p;p&&/^}/{exit}" "/tmp/out/$filename" > "$out"; done < /tmp/1
+
+Do not forget to regenerate:
+--- asn1/camel/packet-camel-template.c  2015-06-23 22:58:45.547098846 +0200
++++ -   2015-06-23 23:40:03.580116775 +0200
+@@ -504,7 +504,7 @@
+  * Routine called when the TAP is initialized.
+  * so hash table are (re)created
+  */
+-void camelsrt_init_routine(void)
++static void camelsrt_init_routine(void)
+ {
+ 
+     /* free hash-table for SRT */
+
+TODO lua only has init, needs free?
+wslua_init_routine
+
+TODO rtp uses wmem_register_callback() to clean memory... move this to cleanup
+func?  rtp_dyn_payloads_init
+
+mp2t dissector leaks at:
+1524     heur_subdissector_list = register_heur_dissector_list("mp2t.pid");
+view -p $(cat /tmp/fns/.x/epan.txt)
+
+# Extract functions from a file given file:funcName
+while IFS=: read filename func; do awk "function f(){print \"/// $func\n\";exit};/\\<$func *\(/&&!/;$/{p=1;print \"//\", FILENAME; if(/}$/){print;f()}};p;p&&/^}/{f()}" $filename; done < /tmp/fns/.x/epan-func.txt
+
+# given a list of files, extract the functions from files
+while read filename; do
+func=$(awk -F: "\"$filename\"~\$1{print \$2;exit}" /tmp/fns/.x/epan-func.txt)
+[ -n "$func" ] || continue
+    awk "function f(){print \"/// $func\n\";exit};/\\<$func *\(/&&!/;$/{p=1;print \"//\", FILENAME; if(/}$/){print;f()}};p;p&&/^}/{f()}" $filename; done
+
+# Test dissectors with cleanup script, run from /tmp/wireshark
+f(){ read x;n=${x:-$n}; f=epan/dissectors/packet-$n.c && [ -f $f ] && ~/projects/wireshark-notes/one-off/cleanup-rewrite.py $f |& colordiff -u $f - | less -R;}
+while :;do f;done
+
+# convert all dissectors with cleanup script
+time for i in $(cat /tmp/fns/.x/epan.txt);do ../wireshark-notes/one-off/cleanup-rewrite.py -w $i 2>/tmp/errs/${i##*/}.txt;done
diff --git a/one-off/cleanup-rewrite.py b/one-off/cleanup-rewrite.py
new file mode 100755
index 0000000..0024e8e
--- /dev/null
+++ b/one-off/cleanup-rewrite.py
@@ -0,0 +1,480 @@
+#!/usr/bin/env python
+# Detects init functions with just reassembly functionality and adds a
+# corresponding cleanup function for it.
+
+# 1. Load file containing lines with: path/to/file.c:foo_init
+# 2. Find function, extract it.
+# 3. Append cleanup func.
+# 4. Find register_init_routine call and append cleanup call.
+#
+# Detect init function:
+#   static void foo_init(void) {
+#       // one or more lines. Non-empty lines are processed as shown below.
+#       // Note that function calls may be split over multiple lines and that
+#       // the indentation might differ.
+#   }
+#
+# Keep comments in output:
+#       /* optional comments,
+#        * possibly multiline */
+#
+# Keep reassembly, remember R_NAME:
+#       reassembly_table_init(&R_NAME, &functions);
+#
+# Strip hash table destroy and if conditions, remember name:
+#       if (HT_NAME) g_hash_table_destroy(HT_NAME);
+#       if (HT_NAME != NULL) { /* ... */ }
+#       if (HT_NAME) {
+#           g_hash_table_destroy(HT_NAME);
+#           HT_NAME = NULL; // ignore this as well if any
+#       }
+#
+# Keep hash table init:
+#       HT_NAME = g_hash_table_new_full(...);
+#       HT_NAME = g_hash_table_new(...);
+#
+# Keep, but mark as TODO (or ignore for now?):
+#       varname = 0;
+#       varname = NULL;
+#
+#
+# After init function:
+# Output g_hash_table_destroy for each HT_NAME
+# Output reassembly_table_destroy for each R_NAME.
+
+import sys, re, logging
+_logger = logging.getLogger(__name__)
+
+# For quick sanity checking (funcName)
+RE_FUNCTION_HEADER = re.compile(
+        r'(?:static\s+)?void\s+(?P<funcName>\w+)\s*\(\s*void\s*\)')
+# TODO: maybe detect prototypes?
+# Matches init/cleanup function signature (funcName, body)
+RE_FUNCTION = re.compile(
+        r'''
+        ^(?:static \s+ )?void   \s+ # "static void" - prefix
+        (?P<funcName>\w+)       \s* # "foo_init" - function name
+        \([^)]*\) \s*   \{          # "(void) {" - function params
+        (?P<body>
+            [^\n]+                  # everything on one line { ... }
+            |
+            (?:                     # Handle multiple lines
+                \n[^}][^\n]+        # heh, forget '\n' and you run into a loop...
+                |
+                \n                  # Handle empty lines
+            )+
+        )               \}[^\n]*\n  # "} /* foo_init */" - end of function
+        ''', re.M | re.X)
+RE_IF = re.compile(
+        r'''
+        if\s*\(\s*                  # "if ("
+            (?P<varName>[.\w]+)\s*  # "HT_NAME "
+            (?:!=\s* (?:NULL|0))?   # "!= NULL
+        \)                          # ")"
+        ''', re.X)
+# Matches "reassembly_table_init(NAME" (name = first argument, '&' included)
+RE_REASS = re.compile(r'reassembly_table_init\s*\(\s*(?P<name>[^\s,]+)')
+# Matches "g_hash_table_destroy(HT_NAME)"
+RE_HT_DESTROY = re.compile(r'''
+        g_hash_table_destroy\s*\(\s*    # "g_hash_table_destroy("
+        (?P<varName>[.\w]+)\s*          # "struct.ht_name"
+        \)                              # ")"
+        ''', re.X)
+# Matches "NAME = VALUE" (value = everything up to the first ';')
+RE_ASSIGNMENT = re.compile(r'(?P<varName>[.\w]+)\s*=\s*(?P<value>[^;]*)')
+
+class Function(object):  # one matched C routine plus what parse() learns about it
+    def __init__(self, name, body, func_match):
+        self.name = name
+        self.body = body
+        self.func_match = func_match
+        self.lines_keep = ''        # statements kept in the rewritten init body
+        self.reassemble_names = []  # first argument of each reassembly_table_init
+        self.ht_names = []          # hash tables destroyed inside an if statement
+        self.unknown_lines = ''     # source lines that could not be classified
+
+    def detect_comment(self, text, multiline_comment):  # -> (is_comment, in_multiline)
+        if multiline_comment:
+            multiline_comment = not text.endswith('*/')
+            # Assume that there is no code after the end marker
+            return True, multiline_comment
+        else:
+            multiline_comment = text.startswith('/*')
+            if multiline_comment:
+                multiline_comment = not text.endswith('*/')
+                return True, multiline_comment
+            if text.startswith('//'):
+                return True, False
+        # Not a comment, not a multi-line comment
+        return False, False
+
+    def parse(self):
+        """Call it once to parse the given function body."""
+        multiline_comment = False
+        # Find all functional lines
+        self._lines_iter = iter(self.body.splitlines(True))
+        for line in self._lines_iter:
+            # Track whether the line was understood or not
+            # None = needs check, False = invalid, True = handled
+            handled = None
+
+            # Ignore empty lines.
+            text = line.strip()
+            if not text:
+                continue
+
+            # Keep comments, but ignore them for parsing
+            is_comment, multiline_comment = self.detect_comment(text,
+                    multiline_comment)
+            if is_comment:
+                # Uncomment to keep comments (might also have to do this for
+                # RE_ASSIGNMENT below).
+                #self.lines_keep += line
+                handled = True
+
+            # detect reassembly function
+            if handled is None:
+                reass_match = RE_REASS.match(text)
+                if reass_match:
+                    handled = self.handle_reasembly(reass_match, line)
+
+            if handled is None:
+                # Find if/hashtable stuff
+                if_match = RE_IF.match(text)
+                if if_match:
+                    _logger.debug('Found if in: %s', text)
+                    handled = self.handle_if(if_match, line)
+
+            if handled is None:
+                # Find assignments such as hash table things
+                assignment_match = RE_ASSIGNMENT.match(text)
+                if assignment_match:
+                    _logger.debug('Found assignment in: %s', text)
+                    varName = assignment_match.group('varName')
+                    # Hash table creation
+                    line, text = self._read_stmt(line)
+                    if 'g_hash_table_new' in text:
+                        _logger.debug('Found hash table in: %s', text)
+                        if varName not in self.ht_names:
+                            _logger.warning('HT %s was not destructed', varName)
+                            #self.ht_names.append(varName)
+                        self.lines_keep += line
+                        handled = True
+
+            if not handled:
+                self.unknown_lines += line
+
+        if self.unknown_lines:
+            _logger.error('Unknown lines in %s:\n%s',
+                self.name, self.unknown_lines)
+            return False
+        _logger.info('Found function %s', self.name)
+        _logger.info('Keep function  %s:\n%s', self.name, self.lines_keep)
+        return True
+
+    def _read_stmt(self, line='', terminator=';'):
+        """
+        Reads lines until a full statement is ready.
+        :param line: current buffer that needs to be finished
+        """
+        text = line.strip()
+        ml_comment = False
+        while terminator not in text:
+            line2 = next(self._lines_iter)
+            text2 = line2.strip()
+            is_comment, ml_comment = self.detect_comment(text2, ml_comment)
+            line += line2
+            if not is_comment:
+                text += '\n' + text2
+        return line, text
+
+    def handle_reasembly(self, reass_match, line):  # remember table, keep the call
+        self.reassemble_names.append(reass_match.group('name'))
+        # Handle following lines and jump to next detection.
+        line, _ = self._read_stmt(line)
+        self.lines_keep += line
+        return True
+
+    def handle_if(self, if_match, line):  # always True; failures go to unknown_lines
+        text = line.strip()
+        # Expected more?
+        if '{' in text:
+            # Look for if (...) { ... }
+            line, text = self._read_stmt(line, '}')
+        else:
+            # Look for if (...) ...;
+            line, text = self._read_stmt(line, ';')
+
+        # Check for else that is not understood (raw string: \b = word boundary)
+        if re.search(r'\}\s*else\b', text):
+            self.unknown_lines += line
+            return True # Cannot handle else yet! True to avoid double append
+
+        # Get rid of if condition and brackets
+        if '{' in text:
+            text = text.split('{', 1)[1].split('}')[0]
+        else:
+            text = text.split(')', 1)[1]
+
+        # The variable that was tested for destruction
+        varName = if_match.group('varName')
+        # For each statement in the if-body, check validity
+        for stmt in text.split(';'):
+            stmt = stmt.strip()
+            if not stmt:
+                continue
+            ht_destroy_match = RE_HT_DESTROY.match(stmt)
+            if ht_destroy_match:
+                if ht_destroy_match.group('varName') != varName:
+                    _logger.error('cond %s != destroy %s' %
+                            (varName, ht_destroy_match.group('varName')))
+                    self.unknown_lines += line
+                    return True
+                # Remember name for later destruction
+                self.ht_names.append(varName)
+                _logger.debug('Skipping line for ht destroy %s', varName)
+                continue
+            assignment_match = RE_ASSIGNMENT.match(stmt)
+            if assignment_match:
+                if assignment_match.group('varName') == varName and \
+                    assignment_match.group('value') in ('NULL', '0') and \
+                    self._is_ht_name(varName):
+                    # Ignore clearing variable for hash table
+                    continue
+            _logger.warning('Unhandled if stmt: %s', stmt)
+            self.unknown_lines += line
+            return True
+
+        return True
+
+    def _is_ht_name(self, varName):  # does the body assign g_hash_table_new* to it?
+        patt_ht_new = r'^\s*' + re.escape(varName) + r'\s*=\s*g_hash_table_new'
+        return re.search(patt_ht_new, self.body, re.M) is not None
+
+    def get_indent(self):  # leading whitespace of the first indented body line
+        indent_match = re.search(r'^\n*([ \t]+)', self.body, re.M)
+        if not indent_match:
+            _logger.error('Could not detect indent level for %s!', self.name)
+            # XXX can this actually happen?
+            return ''
+        return indent_match.group(1)
+
+    def _make_function(self, funcName, body, keep_trailer=False):
+        # "static void" funcName "(void) {" body "}\n"
+        begin,   end   = self.func_match.span()
+        f_begin, f_end = self.func_match.span('funcName')
+        b_begin, b_end = self.func_match.span('body')
+        context = self.func_match.string
+        code = ''
+        code += context[begin:f_begin] + funcName   # "static void" funcName
+        code += context[f_end:b_begin] + '\n'       # "(void) {\n"
+        code += body
+        # Keep the original "} /* ... */\n" only for init; stop at the match end
+        code += context[b_end:end] if keep_trailer else '}\n'
+        return code
+
+    def make_cleanup_function(self, cleanupFuncName):  # None when nothing to clean
+        body = self._make_cleanup_function_body()
+        if not body:
+            return
+        code = self._make_function(cleanupFuncName, body)
+        _logger.debug('Emitting cleanup routine %s:\n%s', cleanupFuncName, code)
+        return code
+
+    def _make_cleanup_function_body(self):  # one destroy call per remembered name
+        body = ''
+        indent = self.get_indent()
+        for name in self.reassemble_names:
+            body += '%sreassembly_table_destroy(%s);\n' % (indent, name)
+        for name in self.ht_names:
+            body += '%sg_hash_table_destroy(%s);\n' % (indent, name)
+        return body
+
+    def make_init_function(self):
+        """Generates the stripped init routine."""
+        code = self._make_function(self.name, self.lines_keep, keep_trailer=True)
+        assert code
+        _logger.debug('Emitting init routine %s:\n%s', self.name, code)
+        # As the block is replaced, remember the context
+        begin, end = self.func_match.span()
+        context = self.func_match.string
+        return context[0:begin] + code + context[end:]
+
+
+class Source(object):  # one input file, split into blocks that can be rewritten
+    def __init__(self, filename):
+        self.filename = filename
+        self.blocks = []  # file contents in order; ''.join() gives the output
+        # map from function names to a tuple
+        # (blockIndex:int, func:Function, func_match:re.Match)
+        self.functions = {}
+
+    def parse_func(self, block, blockIndex):
+        """
+        Parses the code block. The blockIndex parameter is used for indexing the
+        functions.
+        """
+        # Quick sanity check (multiple names may show up as it matches
+        # prototypes and other functions with any number of parameters).
+        funcNames_guessed = RE_FUNCTION_HEADER.findall(block)
+        if not funcNames_guessed:
+            return
+        _logger.debug('Found functions %s', ', '.join(funcNames_guessed))
+
+        # Try to match the init function
+        func_match = RE_FUNCTION.search(block)
+        if not func_match:
+            _logger.info('No function body detected for %s',
+                    ', '.join(funcNames_guessed))
+            return
+
+        # Try to parse everything from the function body
+        funcName = func_match.group('funcName')
+        body = func_match.group('body')
+        func = Function(funcName, body, func_match)
+        if funcName in self.functions:
+            _logger.error('Function %s is already known, overwriting!', funcName)
+        _logger.debug('Saving function %s', funcName)
+        self.functions[funcName] = (blockIndex, func, func_match)
+
+    def parse_block(self, block):  # index the block's function, then store it
+        self.parse_func(block, len(self.blocks))
+        self.blocks.append(block)
+
+    def parse(self):  # True once one register_init_routine call has been fixed
+        block = ''
+        # Pass 1: read file contents and extract functions
+        with open(self.filename) as f:
+            for line in f:
+                block += line
+                # Assume end of line / begin of block
+                # use heuristics to match:
+                # static void reset_dissector(void) { ...; }
+                if line.startswith('}') or (
+                        line.startswith('static void') and
+                        '(void)' in line and
+                        line.endswith('}\n')
+                    ):
+                    self.parse_block(block)
+                    block = ''
+                    continue
+            # Remainder
+            if block:
+                self.parse_block(block)
+                block = ''
+
+        # Pass 2: find register_init_routine, append cleanup call and append
+        # cleanup function.
+        for blockIndex, block in enumerate(self.blocks):
+            if self.try_init_fix(block, blockIndex):
+                # Ok, cleanup routine is fixed.
+                return True
+        return False
+
+    def make_cleanup_name(self, funcName):  # "init"/"setup" -> "cleanup"
+        newName = funcName.replace('init', 'cleanup')
+        newName = newName.replace('setup', 'cleanup')
+        if funcName == newName:
+            _logger.error('Cannot create unique cleanup function name %s',
+                funcName)
+        return newName
+
+    def try_init_fix(self, block, blockIndex):  # True = fixed, False = keep looking
+        # Matches " register_init_routine (&foo_init);"
+        caller_match = re.search(
+                r'''
+                ^([ \t]*)register_init_routine\s*
+                \(\s* &? \s*(?P<name>\w+)\s* \);\n
+                ''', block, re.M | re.X)
+        if not caller_match:
+            # Sanity check
+            if re.search(r'register_init_routine\s*\(', block):
+                _logger.error('Could not detect register_init_routine properly!')
+            return False # Continue searching
+
+        # Locate init function and generate matching cleanup function
+        funcName = caller_match.group('name')
+        cleanupFuncName = self.make_cleanup_name(funcName)
+        if not self.fix_cleanup_function(funcName, cleanupFuncName):
+            return False # Continue searching
+
+        # Yields " register_cleanup_routine (&foo_cleanup);"
+        extra_line = caller_match.group() \
+            .replace('register_init_routine', 'register_cleanup_routine') \
+            .replace(funcName, cleanupFuncName)
+        begin, end = caller_match.span()
+        self.blocks[blockIndex] = block[0:end] + extra_line + block[end:]
+        return True # Done searching
+
+    def fix_cleanup_function(self, funcName, cleanupFuncName):
+        if funcName not in self.functions:
+            _logger.error('Init routine %s not found!', funcName)
+            return False
+
+        if cleanupFuncName in self.functions:
+            _logger.error('Cleanup routine %s already exists!', cleanupFuncName)
+            return False
+
+        blockIndex, func, func_match = self.functions[funcName]
+        if not func.parse():
+            return False
+
+        initCode = func.make_init_function()
+        cleanupCode = func.make_cleanup_function(cleanupFuncName)
+        if not cleanupCode:
+            return False # Empty function
+
+        self.blocks[blockIndex] = initCode
+        self.blocks[blockIndex] += '\n' + cleanupCode
+        return True
+
+    def __str__(self):
+        return ''.join(self.blocks)
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG,
+        format='%(name)s:%(levelname)s: %(message)s')
+    # Color!
+    for _level, _color in {
+        'ERROR':    31,
+        'WARNING':  33,
+        'INFO':     37,
+        'DEBUG':    34,
+    }.items():
+        logging.addLevelName(getattr(logging, _level),
+            '\033[%d;1m%s\033[m' % (_color, _level))
+
+    args = sys.argv[1:]
+    if not args:
+        _logger.error('Usage: cleanup-rewrite.py [-w] files..')
+        sys.exit(1)
+
+    # Without -w the result goes to stdout; with -w the output file is written.
+    write_files = args[0] == '-w'
+    if write_files:
+        args = args[1:]
+        _logger.info('Will write new files')
+
+    ok = None
+    for filename in args:
+        # Support aliasing files such as /dev/stdin:/dev/stdout
+        if ':' in filename:
+            filename_in, filename = filename.split(':', 1)
+        else:
+            filename_in = filename
+
+        # Linux-only hack: alias - as stdin or stdout
+        if filename_in == '-':
+            filename_in = '/dev/stdin'
+        if filename == '-':
+            filename = '/dev/stdout'
+
+        src = Source(filename_in)
+        parsed = src.parse()
+        ok = parsed and ok is not False  # stays False once any file has failed
+        if parsed and write_files:
+            with open(filename, 'w') as out:  # closed and flushed on block exit
+                out.write(str(src))
+        elif parsed:
+            sys.stdout.write(str(src))
+
+    sys.exit(0 if ok else 1)
-- 
cgit v1.2.1