optimize: optimize using nonzero bits

This adds two optimizations using the non-zero bit mask. In some cases involving shifts or ANDs the value can become zero, and can thus be optimized to a move of zero. Second, useless zero-extension or an AND with constant can be detected that would only zero bits that are already zero. The main advantage of this optimization is that it turns zero-extensions into moves, thus enabling much better copy propagation (around 1% code reduction). Here is for example a "test $0xff0000,%ecx + je" before optimization: mov_i64 tmp0,rcx movi_i64 tmp1,$0xff0000 discard cc_src and_i64 cc_dst,tmp0,tmp1 movi_i32 cc_op,$0x1c ext32u_i64 tmp0,cc_dst movi_i64 tmp12,$0x0 brcond_i64 tmp0,tmp12,eq,$0x0 and after (without patch on the left, with on the right): movi_i64 tmp1,$0xff0000 movi_i64 tmp1,$0xff0000 discard cc_src discard cc_src and_i64 cc_dst,rcx,tmp1 and_i64 cc_dst,rcx,tmp1 movi_i32 cc_op,$0x1c movi_i32 cc_op,$0x1c ext32u_i64 tmp0,cc_dst movi_i64 tmp12,$0x0 movi_i64 tmp12,$0x0 brcond_i64 tmp0,tmp12,eq,$0x0 brcond_i64 cc_dst,tmp12,eq,$0x0 Other similar cases: "test %eax, %eax + jne" where eax is already 32-bit (after optimization, without patch on the left, with on the right): discard cc_src discard cc_src mov_i64 cc_dst,rax mov_i64 cc_dst,rax movi_i32 cc_op,$0x1c movi_i32 cc_op,$0x1c ext32u_i64 tmp0,cc_dst movi_i64 tmp12,$0x0 movi_i64 tmp12,$0x0 brcond_i64 tmp0,tmp12,ne,$0x0 brcond_i64 rax,tmp12,ne,$0x0 "test $0x1, %dl + je": movi_i64 tmp1,$0x1 movi_i64 tmp1,$0x1 discard cc_src discard cc_src and_i64 cc_dst,rdx,tmp1 and_i64 cc_dst,rdx,tmp1 movi_i32 cc_op,$0x1a movi_i32 cc_op,$0x1a ext8u_i64 tmp0,cc_dst movi_i64 tmp12,$0x0 movi_i64 tmp12,$0x0 brcond_i64 tmp0,tmp12,eq,$0x0 brcond_i64 cc_dst,tmp12,eq,$0x0 In some cases TCG even outsmarts GCC. :) Here the input code has "and $0x2,%eax + movslq %eax,%rbx + test %rbx, %rbx" and the optimizer, thanks to copy propagation, does the following: movi_i64 tmp12,$0x2 movi_i64 tmp12,$0x2 and_i64 rax,rax,tmp12 and_i64 rax,rax,tmp12 mov_i64 cc_dst,rax mov_i64 cc_dst,rax ext32s_i64 tmp0,rax -> nop mov_i64 rbx,tmp0 -> mov_i64 rbx,cc_dst and_i64 cc_dst,rbx,rbx -> nop Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> Signed-off-by: Richard Henderson <rth@twiddle.net> Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
author: Paolo Bonzini <pbonzini@redhat.com> 2013-01-11 15:42:53 -0800
committer: Blue Swirl <blauwirbel@gmail.com> 2013-01-19 10:13:16 +0000
commit: 633f6502544e3dd6a615679ce440875be7ebbc58 (patch)
tree: 7dc66485bcdff64982cfd5a116592bf89b03647e /tcg/optimize.c
parent: 3a9d8b179b1e43237365ede641d5aa6779ba91bc (diff)
download: qemu-633f6502544e3dd6a615679ce440875be7ebbc58.tar.gz
1 files changed, 28 insertions, 2 deletions
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 090efbc193..973d2d679f 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -484,7 +484,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
                                     TCGArg *args, TCGOpDef *tcg_op_defs)
 {
     int i, nb_ops, op_index, nb_temps, nb_globals, nb_call_args;
-    tcg_target_ulong mask;
+    tcg_target_ulong mask, affected;
     TCGOpcode op;
     const TCGOpDef *def;
     TCGArg *gen_args;
@@ -629,6 +629,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
 
         /* Simplify using known-zero bits */
         mask = -1;
+        affected = -1;
         switch (op) {
         CASE_OP_32_64(ext8s):
             if ((temps[args[1]].mask & 0x80) != 0) {
@@ -656,7 +657,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             mask = temps[args[2]].mask;
             if (temps[args[2]].state == TCG_TEMP_CONST) {
         and_const:
-                ;
+                affected = temps[args[1]].mask & ~mask;
             }
             mask = temps[args[1]].mask & mask;
             break;
@@ -708,6 +709,31 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             break;
         }
 
+        if (mask == 0) {
+            assert(def->nb_oargs == 1);
+            s->gen_opc_buf[op_index] = op_to_movi(op);
+            tcg_opt_gen_movi(gen_args, args[0], 0);
+            args += def->nb_oargs + def->nb_iargs + def->nb_cargs;
+            gen_args += 2;
+            continue;
+        }
+        if (affected == 0) {
+            assert(def->nb_oargs == 1);
+            if (temps_are_copies(args[0], args[1])) {
+                s->gen_opc_buf[op_index] = INDEX_op_nop;
+            } else if (temps[args[1]].state != TCG_TEMP_CONST) {
+                s->gen_opc_buf[op_index] = op_to_mov(op);
+                tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
+                gen_args += 2;
+            } else {
+                s->gen_opc_buf[op_index] = op_to_movi(op);
+                tcg_opt_gen_movi(gen_args, args[0], temps[args[1]].val);
+                gen_args += 2;
+            }
+            args += def->nb_iargs + 1;
+            continue;
+        }
+
         /* Simplify expression for "op r, a, 0 => movi r, 0" cases */
         switch (op) {
         CASE_OP_32_64(and):
author	Paolo Bonzini <pbonzini@redhat.com>	2013-01-11 15:42:53 -0800
committer	Blue Swirl <blauwirbel@gmail.com>	2013-01-19 10:13:16 +0000
commit	633f6502544e3dd6a615679ce440875be7ebbc58 (patch)
tree	7dc66485bcdff64982cfd5a116592bf89b03647e /tcg/optimize.c
parent	3a9d8b179b1e43237365ede641d5aa6779ba91bc (diff)
download	qemu-633f6502544e3dd6a615679ce440875be7ebbc58.tar.gz