#!/usr/bin/env python """Compiles C into assembly for the practicum processor (PP2) All rights reserved, you may not redistribute or use this program without prior permission from Peter Wu or Xander Houtman. Use of this program is entirely your own risk. In no circumstances can the authors of this program be held responsible for any damage including, but not limited to, financial damage or data loss. Modification of this program is not allowed without prior permission. The generated output (assembly and messages) are not subject to this license. """ import sys, re from pycparser import c_parser, c_ast __author__ = "Peter Wu" __copyright__ = "Copyright 2011, Peter Wu" __credits__ = ["Peter Wu", "Xander Houtman"] __license__ = "Proprietary" __version__ = "1.0" __maintainer__ = "Peter Wu" __email__ = "uwretep@gmail.com" parser = c_parser.CParser() class Registers(object): """Register related functions Some functions were supposed to optimize things (alloc and free), but it's not implemented """ def __init__(self): # which registers are in use? self.registers = {} for reg in range(0, 8): self.registers[str(reg)] = False def alloc(self): """Retrieves a register which is marked unused and marks it as in use""" for register in range(0, 6): # find a free register if not self.registers[register]: self.registers[register] = True return "R" + register raise RuntimeError("No free registers are available") def free(self, register): """Marks a register as unused Keyword arguments: register -- a register in the format Rn where 0 <= n < 8 """ # remove leading R register = register[1:] if self.registers[register]: self.registers[register] = False; else: raise RuntimeError("free() of unused register") def get_register(self, text): """Retrieve the first register from a binary instruction""" text = text.strip() # ignore commented lines if text.startswith(";"): return None # skip labels if any text = text.upper().split(":", 1)[-1] # exclude the first operand, e.g. LOAD text = text.lstrip(" ").split(" ", 1)[-1] # retieve the first element before the next space, e.g. R0 text = text.lstrip(" ").split(" ", 1)[0] # is it a register? if len(text) == 2 and text[0] == "R" and text[1] in self.registers: return text # out of luck return None def find_register(self, instructions_list, fatal=False): """Finds the last modified register in a list of instructions""" for line in reversed(instructions_list): reg = self.get_register(line) if reg: return reg if fatal: raise RuntimeError("No register found in instructions list") return None class Asm(object): def __init__(self): self.level = 0 def format_line(self, line, label="", indent_size=-1): """Returns an indented line optionally prefixed with label""" if indent_size < 0: indent_size = 8 + self.level * 4 if label: indent_size -= len(label) + 2 if indent_size >= 0: label += ": " else: label += ":" # whitespace is generated if indent_size > 0 indent = indent_size * " " return indent + label + line def binary_op(self, binop, reg, operand, label=""): # output format: BBBB RR OO... return self.format_line("{:<4} {:<2} {}".format(binop, reg, operand), label) def unary_op(self, unop, operand, label=""): # output format: UUUU OO... return self.format_line("{:<4} {}".format(unop, operand), label) def branch_op(self, brop, branch_label, label=""): # output format: BBB LL... return self.format_line("{:<4} {}".format(brop, branch_label), label) def push(self, register, label=""): return self.format_line("PUSH " + register, label) def pull(self, register, label=""): return self.format_line("PULL " + register, label) def insert_label(self, instruction, label): new_instruction = instruction.lstrip(" ") indent_size = len(instruction) - len(new_instruction) return self.format_line(new_instruction, label, indent_size) def has_label(self, line): """"Returns True if a line appears to contain a label, False otherwise""" return Parse.is_identifier(line.split(":", 1)[0].strip()) def noop(self, label, register="R0"): """Returns a labelled no operation operator Note that the Zero, Negative and oVerflow flags are modified """ return self.binary_op("LOAD", register, register, label) class LinkedNode(object): """Stores nodes with a reference to the parent""" def __init__(self, node, parent=None, level_increment=False): self.node = node self.parent = parent self.function = None self.level = 0 # inherit level from parent if parent: self.level = parent.level if level_increment: self.incrementLevel() def incrementLevel(self): self.level += 1 def getFunctionNode(self): if isinstance(self.node, c_ast.FuncDef): return self if self.parent: assert isinstance(self.parent, LinkedNode), "parent is not a LinkedNode!" return self.parent.getFunctionNode() return None def setFunction(self, function): """Sets the function object containing label information""" self.function = function def getLocation(self): if hasattr(self.node, "coord"): return self.node.coord return "Unknown" class Function(object): def __init__(self, node): self.name = node.decl.name self.node = node def labelBegin(self): """Returns a label pointing to the begin of a function""" return "fn_" + self.name def labelEnd(self): """Returns a label pointing to the end of a function""" return "fne_" + self.name class Logger(object): def __init__(self): pass def warning(self, message, linked_node=None): self.log(message, linked_node=linked_node, type="warning") def log(self, message, linked_node=None, type="log"): source = "" if isinstance(linked_node, LinkedNode): source = str(linked_node.getLocation()) + ": " print type + ":" + source + message class Parse(object): re_identifier = re.compile("^[0-9a-zA-Z_][0-9a-zA-Z_]*$") @classmethod def is_identifier(cls, text): return cls.re_identifier.match(text) def __init__(self, filename, source=''): if source is "": file = open(filename, 'r') source = file.read() file.close() self.node = parser.parse(source, filename=filename); self.logger = Logger() # word size in bits self.WORDSIZE = 18 # preload operators and their instructions self.binary_ops = { "+": "ADD", "-": "SUB", "*": "MULS", "/": "DIV", "%": "MOD", "&": "AND", "|": "OR", "^": "XOR" } self.comparison_ops = { "<" : "BLT", ">" : "BGT", "<=": "BLE", ">=": "BGE", "==": "BEQ", "!=": "BNE" } self.binary_ops.update(self.comparison_ops) self.globalVars = [] self.functions = {} self.codeSegment = [] self.labels = set() # helper for asm code generation self.asm = Asm() # watches which registers are used and which are not self.registers = Registers() def show(self): self.node.show(showcoord=True) def uniqLbl(self, labelname): """Generates an unique label name, save and return it""" lbl = labelname if labelname in self.labels: i = 0 while True: lbl = labelname + "_" + str(i) if not lbl in self.labels: break i += 1 self.addLabel(lbl) return lbl def addLabel(self, labelname): if labelname in self.labels: raise RuntimeError("Redefinition of label '{}'".format(labelname)) self.labels.add(labelname) def compile(self): """Loops through the nodes in an AST syntax tree and generates ASM""" root_node = LinkedNode(self.node) for thing in self.node.ext: linked_node = LinkedNode(thing, root_node) self.asm.level = linked_node.level if isinstance(thing, c_ast.Decl): self.defineGlobalVar(linked_node) elif isinstance(thing, c_ast.FuncDef): self.defineFunction(linked_node) else: raise RuntimeError("I don't know a node in the global scope: " + str(thing)) def getSource(self): """Retrieves the ASM source. You need to compile it first""" self.addSource("@DATA") for varName in self.globalVars: padding = " " * (16 - len(varName) - 1) self.addSource(varName + padding + " DW 1") self.addSource() self.addSource("@CODE") for line in self.codeSegment: self.addSource(line) self.addSource() self.addSource() self.addSource("@END") def addSource(self, line=''): print line def defineGlobalVar(self, linked_node): varname = linked_node.node.name self.globalVars.append(varname) def defineFunction(self, linked_node): node = linked_node.node linked_node.incrementLevel() self.asm.level = linked_node.level funcname = node.decl.name if funcname in self.functions: raise RuntimeError("Redefinition of function '{}'".format(funcname)) function = self.functions[funcname] = Function(node) lbl_func = function.labelBegin() lbl_end = function.labelEnd() self.addLabel(lbl_func) self.addLabel(lbl_end) linked_node.setFunction(function) self.codeSegment.append(self.asm.noop(lbl_func)) stmts = self.parseStatement(node.body, linked_node) self.asm.level = linked_node.level for stmt in stmts: self.codeSegment.append(stmt) # functions other than main() are subroutines #if funcname != "main": # since main should never be left, add the instruction anyway. # Otherwise, it'd crash anyway because it loops into the stack (if it # had not crashed before) self.codeSegment.append(self.asm.format_line("RTS", label=lbl_end)) # add an extra newline self.codeSegment.append("") def functionLabel(self, name): if name in self.functions: function = self.functions[name] return function.labelBegin() # name not found :? perhaps an external declaration lbl = "fn_" + name self.logger.warning("call to undefined function, assuming label '{}'".format(lbl)) return lbl def parseCompound(self, linked_node): linked_node.incrementLevel() lines = [] if linked_node.node.block_items: for cn in linked_node.node.block_items: lines += self.parseStatement(cn, linked_node) return lines def parseBinaryLogicalOp(self, linked_node): """Returns lines of assembly for a logical OR or AND operation ASM for logical OR: operand1 BEQ fal ; if operand1 is false operand2 BEQ fal ; if operand2 is false tru:LOAD R0 1 BRA end fal:LOAD R0 0 end:NOOP ASM for logical AND: operand1 BNE tru ; if operand1 is true operand2 BNE tru ; if operand2 is true fal:LOAD R0 0 BRA end tru:LOAD R0 1 end:NOOP """ node = linked_node.node assert node.op in ("&&", "||"), "Expected a && or || operator, found '{}' instead".format(node.op) isAnd = node.op == "&&" reg_result = "R0" lbl_false = self.uniqLbl("lOp_false") lbl_true = self.uniqLbl("lOp_true") lbl_end = self.uniqLbl("lOp_end") # branch instruction to be inserted if && yields false or if || is true if isAnd: branch_early = self.asm.branch_op("BEQ", lbl_false) else: branch_early = self.asm.branch_op("BNE", lbl_true) result_true = self.asm.binary_op("LOAD", reg_result, 1, label=lbl_true) result_false = self.asm.binary_op("LOAD", reg_result, 0, label=lbl_false) lines = self.parseStatement(node.left, linked_node, level_increment=True) lines.append(branch_early) lines += self.parseStatement(node.right, linked_node, level_increment=True) lines.append(branch_early) # fix indentation self.asm.level = linked_node.level # if 'true && true' or 'false || false' if isAnd: lines.append(result_true) else: lines.append(result_false) lines.append(self.asm.branch_op("BRA", lbl_end)) # if 'false && x' or 'true && false' or 'x || true' or 'true || x' if isAnd: lines.append(result_false) else: lines.append(result_true) lines.append(self.asm.noop(lbl_end, reg_result)) return lines def parseBinaryOp(self, linked_node): node = linked_node.node lines = [] if node.op in self.binary_ops: op = self.binary_ops[node.op] elif node.op in ("&&", "||"): return self.parseBinaryLogicalOp(linked_node) elif node.op in ("<<", ">>"): op = node.op else: raise RuntimeError("Binary op is not implemented yet: " + node.op) # process the first operand, the result is likely in R0 or R1 ops_first = self.parseStatement(node.left, linked_node, level_increment=True) assert len(ops_first), "Empty operand for binary operation (first part)" lines += ops_first reg_first = self.registers.find_register(ops_first, fatal=True) # fix indentation after running parseStatement self.asm.level = linked_node.level # optimization, but this should probably be done afterwards if isinstance(node.right, c_ast.Constant): operand = self.convertInteger(node=node.right) else: lines.append(self.asm.push(reg_first)) ops_second = self.parseStatement(node.right, linked_node, level_increment=True) assert len(ops_second), "Empty operand for binary operation (second part)" lines += ops_second # the register storing the result of the second operand reg_second = self.registers.find_register(ops_second, fatal=True) operand = reg_second # since we use the stack for calculations, we can use less # registers: R0 and R1 if reg_second == "R0": reg_first = "R1" else: reg_first = "R0" # put the data back in the free register lines.append(self.asm.pull(reg_first)) # perform binary operation if node.op in self.comparison_ops: bin_end = self.uniqLbl("cmpEnd") bin_true = self.uniqLbl("cmpTrue") lines.append(self.asm.binary_op("CMP", reg_first, operand)) # a comparison is more likely to be true, so optimize for that: # true will follow the path CMP - - LOAD - NOOP, where false # will follow: CMP - - LOAD - BRA - NOOP. This just happened # to be the easiest to implement, so don't blame me for premature # optimization ;) lines.append(self.asm.branch_op(op, bin_true)) lines.append(self.asm.binary_op("LOAD", reg_first, 0)) lines.append(self.asm.branch_op("BRA", bin_end)) lines.append(self.asm.binary_op("LOAD", reg_first, 1, label=bin_true)) lines.append(self.asm.noop(bin_end, register=reg_first)) elif op in ("<<", ">>"): op = { "<<": "MULS", ">>": "DIV" }[op] if isinstance(node.right, c_ast.Constant): shift_bits = self.convertStrToInteger(operand) if shift_bits < 0: self.logger.warning("The number of bits to be shifted are" "negative, no operation will be performed", linked_node=linked_node) lines.append(self.asm.binary_op("LOAD", reg_first, reg_first)) elif shift_bits > 0: # shifting only makes a difference if shifting a number # larger than 0 (e.g. 1 << 1) operand = pow(2, shift_bits) lines.append(self.asm.binary_op(op, reg_first, operand)) else: lbl_gen = self.uniqLbl("gen2_loop") lbl_gen_end = self.uniqLbl("gen2_done") # necessary to set the status flags for the branch conditions lines.append(self.asm.binary_op("CMP", reg_second, 0)) # if operand 2 is lower than 1, do not bother shifting operand # 1 and jump to the end lines.append(self.asm.branch_op("BLE", lbl_gen_end)) lines.append(self.asm.binary_op(op, reg_first, 2, label=lbl_gen)) lines.append(self.asm.binary_op("SUB", reg_second, 1)) # if we came from BLE, x <= 0. BGT only jumps if x > 0, so # we can safe another NOOP by adding the label here lines.append(self.asm.branch_op("BGT", lbl_gen, label=lbl_gen_end)) else: lines.append(self.asm.binary_op(op, reg_first, operand)) return lines def parseUnaryOp(self, linked_node): """Returns lines of assembly for unary operators Supported operators are logical negation, one's complement (bitwise NOT), plus and minus ASM for logical NOT: operand BEQ fal ; if operand is false tru:LOAD R0 1 BRA end fal:LOAD R0 0 end:NOOP """ op = linked_node.node.op # first fetch the operand lines = self.parseStatement(linked_node.node.expr, linked_node) # determine the register in which the result is stored reg = self.registers.find_register(lines, fatal=True) if op == "+": # K&R A7.4.4 "[the unary +] was added for symmetry with unary -" pass elif op == "-": lines.append(self.asm.binary_op("MULS", reg, "-1")) elif op == "~": # XXX unsigned vs signed integers. Assume signed for now mask = "1" * (self.WORDSIZE - 1) lines.append(self.asm.binary_op("XOR", reg, "%0" + mask)) elif op == "!": lbl_false = self.uniqLbl("not_false") lbl_true = self.uniqLbl("not_true") lbl_end = self.uniqLbl("not_end") # assume that the status words of the registers describes the value # of the expression lines.append(self.asm.branch_op("BEQ", lbl_false)) lines.append(self.asm.binary_op("LOAD", reg, 1, label=lbl_true)) lines.append(self.asm.branch_op("BRA", lbl_end)) lines.append(self.asm.binary_op("LOAD", reg, 0, label=lbl_false)) lines.append(self.asm.noop(lbl_end, register=reg)) elif op == "--": lines.append(self.asm.binary_op("SUB", reg, 1)) elif op == "++": lines.append(self.asm.binary_op("ADD", reg, 1)) elif op == "&": raise RuntimeError("Address operator '&' is not supported.") elif op == "*": # XXX support writing to addresses? (IOAREA) raise RuntimeError("Indirection operator '*' is not supported.") elif op == "sizeof": raise RuntimeError("Sizeof operator 'sizeof' is not supported.") elif op == "p++" or op == "p--": # XXX postfix/prefix raise RuntimeError("Post-inc or post-dec is not supported (yet?).") else: raise RuntimeError("Unknown unary operator '{}'".format(op)) return lines def parseTernaryOp(self, linked_node): """Returns lines of ASM for ternary operation cond?expr_true:expr_false It looks very similar to an If expect that the return value is important """ linked_node.incrementLevel() node = linked_node.node lbl_el = self.uniqLbl("el") # else lbl_end = self.uniqLbl("fi") # endif lines = self.parseStatement(node.cond, linked_node) lines.append(self.asm.branch_op("BEQ", lbl_el)) # if true... then_block = self.parseStatement(node.iftrue, linked_node) # use a single register for the result, using the one of expr_true result_reg = self.registers.find_register(then_block, fatal=True) lines += then_block lines.append(self.asm.branch_op("BRA", lbl_end)) # else... else_block = self.parseStatement(node.iffalse, linked_node) assert len(else_block), "Else block is empty" result_reg_else = self.registers.find_register(else_block, fatal=True) # for consistency with the result of expr_true if result_reg_else != result_reg: lines.append(self.asm.binary_op("LOAD", result_reg, result_reg_else)) # add labels for the else block if self.asm.has_label(else_block[0]): lines.append(self.asm.noop(lbl_el)) else: else_block[0] = self.asm.insert_label(else_block[0], lbl_el) lines += else_block lines.append(self.asm.noop(lbl_end, register=result_reg)) return lines def convertInteger(self, value=None, node=None): if value is None: if isinstance(node, c_ast.Constant) and node.type == "int": value = node.value else: raise RuntimeError("Value is not given and node is not a valid integer constant") value = value.upper() if "LL" in value: # long long integer, ignore value = value.replace("LL", "", 1) elif "L" in value: # long integer, ignore value = value.replace("L", "", 1) if "U" in value: # unsigned integer, ignore for now value = value.replace("U", "", 1) if value.startswith("0X"): value = value[2:] if len(value) == 5: # highest possible value is 3FFFF, so adding a zero would not make sense value = "$" + value else: value = "$0" + value elif value.startswith("0"): # octal numbers are not understood by PP2 assembly, therefore convert it value = str(int(value, 8)) return value def convertStrToInteger(self, str): """Converts a PP2 assembly integer string to an integer""" str = str.upper() if str.startswith("$"): return int(str[1:], 16) elif str.startswith("%"): return int(str[1:], 8) return int(str, 10) def parseConstant(self, linked_node, register="R0"): node = linked_node.node if node.type == "int": value = self.convertInteger(node.value) else: raise RuntimeError("Unsupported constant type: " + node.type) return [self.asm.binary_op("LOAD", register, value)] def parseID(self, linked_node): raise RuntimeError("Not implemented yet: symbolic names") def parseIf(self, linked_node): linked_node.incrementLevel() node = linked_node.node #lbl_if = self.uniqLbl("if") # if #lbl_th = self.uniqLbl("th") # then if node.iffalse: lbl_el = self.uniqLbl("el") # else lbl_end = self.uniqLbl("fi") # endif else: lbl_end = lbl_fi = self.uniqLbl("fi") # endif lines = self.parseStatement(node.cond, linked_node) # unused label, but insert it for clearness if possible #if len(lines) > 0: # if not self.asm.has_label(lines[0]): # lines[0] = self.asm.insert_label(lines[0], lbl_if) # if cond is zero, go to else (BEQ branches if Z=0). Do not use CMP lines.append(self.asm.branch_op("BEQ", lbl_el)) # if true... then_block = self.parseStatement(node.iftrue, linked_node) # required label, so insert a NOOP if necessary #if len(then_block) > 0: # if self.asm.has_label(then_block[0]): # lines.append(self.asm.noop(lbl_th)) # else: # then_block[0] = self.asm.insert_label(then_block, lbl_th) lines += then_block lines.append(self.asm.branch_op("BRA", lbl_end)) # else... if node.iffalse: else_block = self.parseStatement(node.iffalse, linked_node) if len(else_block) > 0: if self.asm.has_label(else_block[0]): lines.append(self.asm.noop(lbl_el)) else: else_block[0] = self.asm.insert_label(else_block[0], lbl_el) lines += else_block # why branch if already at the end of the block? hmm #lines.append(self.asm.branch_op("BRA", lbl_end)) lines.append(self.asm.noop(lbl_end)) return lines def parseReturn(self, linked_node): """Return from a function. For non-void functions the result is in R0""" func_node = linked_node.getFunctionNode() if not func_node: raise RuntimeError("return not in function") return_value = linked_node.node.expr if return_value: lines = self.parseStatement(return_value, linked_node) else: lines = [] if return_value: reg = self.registers.find_register(lines, fatal=True) # ensure that the return value is in R0 for consistency if reg != "R0": lines.append(self.asm.binary_op("LOAD", "R0", reg)) self.asm.level = linked_node.level - 1 lines.append(self.asm.branch_op("BRA", func_node.function.labelEnd())) # don't do the below, it may not cleanup properly #lines.append(self.asm.format_line("RTS")) return lines def parseFuncCall(self, linked_node): # XXX function arguments # node.name is a c_ast.ID, the real function name is in name funcname = linked_node.node.name.name return [self.asm.branch_op("BRS", self.functionLabel(funcname))] def parseCast(self, linked_node): self.logger.warning("Found a type cast, but these are unsupported.", linked_node=linked_node) return self.parseStatement(linked_node.node.expr, linked_node) def parseDoWhile(self, linked_node): node = linked_node.node lbl_do = self.uniqLbl("do") # provides a way to redo the loop lines = [self.asm.noop(lbl_do)] # do { ... }, expect do{/*empty*/} as well lines += self.parseStatement(node.stmt, linked_node) # while (...) lines += self.parseStatement(node.cond, linked_node) lines.append(self.asm.branch_op("BNE", lbl_do)) return lines def parseStatement(self, node, parent_linked_node, level_increment=False): linked_node = LinkedNode(node, parent_linked_node, level_increment=level_increment) self.asm.level = linked_node.level lines = [] if isinstance(node, c_ast.Assignment): raise RuntimeError("Not implemented for type " + repr(node)) elif isinstance(node, c_ast.Decl): self.defineGlobalVar(linked_node, parent_linked_node) elif isinstance(node, c_ast.ExprList): raise RuntimeError("Not implemented for type " + repr(node)) elif isinstance(node, c_ast.For): raise RuntimeError("Not implemented for type " + repr(node)) elif isinstance(node, c_ast.FuncDecl): raise RuntimeError("Not implemented for type " + repr(node)) elif isinstance(node, c_ast.PtrDecl): raise RuntimeError("Not implemented for type " + repr(node)) elif isinstance(node, c_ast.While): raise RuntimeError("Not implemented for type " + repr(node)) else: for entry in ("Compound", "BinaryOp", "If", "Constant", "ID", "Return", "FuncCall", "UnaryOp", "Cast", "TernaryOp", "DoWhile"): if isinstance(node, getattr(c_ast, entry)): lines += getattr(self, "parse" + entry)(linked_node) break else: raise RuntimeError("Not implemented and unknown: " + repr(node)) return lines # currently, there is no difference between an expression and statement. This might # change in the future def parseExpression(self, node, parent): pass try: filename = sys.argv[1] parse = Parse(filename) if len(sys.argv) == 3: parse.show() #node.ext[0].ext[0].show() else: parse.compile() parse.getSource() except c_parser.ParseError: e = sys.exc_info()[1] print "Parse error:" + str(e)