From b76f0d8c2e3eac94bc7fd90a510cb7426b2a2699 Mon Sep 17 00:00:00 2001 From: Yeongkyoon Lee Date: Wed, 31 Oct 2012 16:04:25 +0900 Subject: tcg: Optimize qemu_ld/st by generating slow paths at the end of a block Add optimized TCG qemu_ld/st generation which locates the code of TLB miss cases at the end of a block after generating the other IRs. Currently, this optimization supports only i386 and x86_64 hosts. Signed-off-by: Yeongkyoon Lee Signed-off-by: Blue Swirl --- tcg/i386/tcg-target.c | 404 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 278 insertions(+), 126 deletions(-) (limited to 'tcg/i386/tcg-target.c') diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index e45a5a0426..6f3ad3ceab 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -1002,6 +1002,17 @@ static const void *qemu_st_helpers[4] = { helper_stq_mmu, }; +static void add_qemu_ldst_label(TCGContext *s, + int is_ld, + int opc, + int data_reg, + int data_reg2, + int addrlo_reg, + int addrhi_reg, + int mem_index, + uint8_t *raddr, + uint8_t **label_ptr); + /* Perform the TLB load and compare. Inputs: @@ -1060,19 +1071,19 @@ static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx, tcg_out_mov(s, type, r1, addrlo); - /* jne label1 */ - tcg_out8(s, OPC_JCC_short + JCC_JNE); + /* jne slow_path */ + tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); label_ptr[0] = s->code_ptr; - s->code_ptr++; + s->code_ptr += 4; if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { /* cmp 4(r0), addrhi */ tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r0, 4); - /* jne label1 */ - tcg_out8(s, OPC_JCC_short + JCC_JNE); + /* jne slow_path */ + tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); label_ptr[1] = s->code_ptr; - s->code_ptr++; + s->code_ptr += 4; } /* TLB Hit. */ @@ -1193,10 +1204,7 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int addrlo_idx; #if defined(CONFIG_SOFTMMU) int mem_index, s_bits; -#if TCG_TARGET_REG_BITS == 32 - int stack_adjust; -#endif - uint8_t *label_ptr[3]; + uint8_t *label_ptr[2]; #endif data_reg = args[0]; @@ -1216,87 +1224,17 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, /* TLB Hit. */ tcg_out_qemu_ld_direct(s, data_reg, data_reg2, TCG_REG_L1, 0, 0, opc); - /* jmp label2 */ - tcg_out8(s, OPC_JMP_short); - label_ptr[2] = s->code_ptr; - s->code_ptr++; - - /* TLB Miss. */ - - /* label1: */ - *label_ptr[0] = s->code_ptr - label_ptr[0] - 1; - if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { - *label_ptr[1] = s->code_ptr - label_ptr[1] - 1; - } - - /* XXX: move that code at the end of the TB */ -#if TCG_TARGET_REG_BITS == 32 - tcg_out_pushi(s, mem_index); - stack_adjust = 4; - if (TARGET_LONG_BITS == 64) { - tcg_out_push(s, args[addrlo_idx + 1]); - stack_adjust += 4; - } - tcg_out_push(s, args[addrlo_idx]); - stack_adjust += 4; - tcg_out_push(s, TCG_AREG0); - stack_adjust += 4; -#else - tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0); - /* The second argument is already loaded with addrlo. */ - tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index); -#endif - - tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]); - -#if TCG_TARGET_REG_BITS == 32 - if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) { - /* Pop and discard. This is 2 bytes smaller than the add. */ - tcg_out_pop(s, TCG_REG_ECX); - } else if (stack_adjust != 0) { - tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust); - } -#endif - - switch(opc) { - case 0 | 4: - tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW); - break; - case 1 | 4: - tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW); - break; - case 0: - tcg_out_ext8u(s, data_reg, TCG_REG_EAX); - break; - case 1: - tcg_out_ext16u(s, data_reg, TCG_REG_EAX); - break; - case 2: - tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); - break; -#if TCG_TARGET_REG_BITS == 64 - case 2 | 4: - tcg_out_ext32s(s, data_reg, TCG_REG_EAX); - break; -#endif - case 3: - if (TCG_TARGET_REG_BITS == 64) { - tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX); - } else if (data_reg == TCG_REG_EDX) { - /* xchg %edx, %eax */ - tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0); - tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX); - } else { - tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); - tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX); - } - break; - default: - tcg_abort(); - } - - /* label2: */ - *label_ptr[2] = s->code_ptr - label_ptr[2] - 1; + /* Record the current context of a load into ldst label */ + add_qemu_ldst_label(s, + 1, + opc, + data_reg, + data_reg2, + args[addrlo_idx], + args[addrlo_idx + 1], + mem_index, + s->code_ptr, + label_ptr); #else { int32_t offset = GUEST_BASE; @@ -1393,8 +1331,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, int addrlo_idx; #if defined(CONFIG_SOFTMMU) int mem_index, s_bits; - int stack_adjust; - uint8_t *label_ptr[3]; + uint8_t *label_ptr[2]; #endif data_reg = args[0]; @@ -1414,20 +1351,221 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, /* TLB Hit. */ tcg_out_qemu_st_direct(s, data_reg, data_reg2, TCG_REG_L1, 0, 0, opc); - /* jmp label2 */ + /* Record the current context of a store into ldst label */ + add_qemu_ldst_label(s, + 0, + opc, + data_reg, + data_reg2, + args[addrlo_idx], + args[addrlo_idx + 1], + mem_index, + s->code_ptr, + label_ptr); +#else + { + int32_t offset = GUEST_BASE; + int base = args[addrlo_idx]; + int seg = 0; + + /* ??? We assume all operations have left us with register contents + that are zero extended. So far this appears to be true. If we + want to enforce this, we can either do an explicit zero-extension + here, or (if GUEST_BASE == 0, or a segment register is in use) + use the ADDR32 prefix. For now, do nothing. */ + if (GUEST_BASE && guest_base_flags) { + seg = guest_base_flags; + offset = 0; + } else if (TCG_TARGET_REG_BITS == 64 && offset != GUEST_BASE) { + tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE); + tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base); + base = TCG_REG_L1; + offset = 0; + } + + tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, seg, opc); + } +#endif +} + +#if defined(CONFIG_SOFTMMU) +/* + * Record the context of a call to the out of line helper code for the slow path + * for a load or store, so that we can later generate the correct helper code + */ +static void add_qemu_ldst_label(TCGContext *s, + int is_ld, + int opc, + int data_reg, + int data_reg2, + int addrlo_reg, + int addrhi_reg, + int mem_index, + uint8_t *raddr, + uint8_t **label_ptr) +{ + int idx; + TCGLabelQemuLdst *label; + + if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) { + tcg_abort(); + } + + idx = s->nb_qemu_ldst_labels++; + label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx]; + label->is_ld = is_ld; + label->opc = opc; + label->datalo_reg = data_reg; + label->datahi_reg = data_reg2; + label->addrlo_reg = addrlo_reg; + label->addrhi_reg = addrhi_reg; + label->mem_index = mem_index; + label->raddr = raddr; + label->label_ptr[0] = label_ptr[0]; + if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { + label->label_ptr[1] = label_ptr[1]; + } +} + +/* + * Generate code for the slow path for a load at the end of block + */ +static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *label) +{ + int s_bits; + int opc = label->opc; + int mem_index = label->mem_index; +#if TCG_TARGET_REG_BITS == 32 + int stack_adjust; + int addrlo_reg = label->addrlo_reg; + int addrhi_reg = label->addrhi_reg; +#endif + int data_reg = label->datalo_reg; + int data_reg2 = label->datahi_reg; + uint8_t *raddr = label->raddr; + uint8_t **label_ptr = &label->label_ptr[0]; + + s_bits = opc & 3; + + /* resolve label address */ + *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4); + if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { + *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4); + } + +#if TCG_TARGET_REG_BITS == 32 + tcg_out_pushi(s, mem_index); + stack_adjust = 4; + if (TARGET_LONG_BITS == 64) { + tcg_out_push(s, addrhi_reg); + stack_adjust += 4; + } + tcg_out_push(s, addrlo_reg); + stack_adjust += 4; + tcg_out_push(s, TCG_AREG0); + stack_adjust += 4; +#else + tcg_out_mov(s, TCG_TYPE_I64, tcg_target_call_iarg_regs[0], TCG_AREG0); + /* The second argument is already loaded with addrlo. */ + tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], mem_index); +#endif + + /* Code generation of qemu_ld/st's slow path calling MMU helper + + PRE_PROC ... + call MMU helper + jmp POST_PROC (2b) : short forward jump <- GETRA() + jmp next_code (5b) : dummy long backward jump which is never executed + POST_PROC ... : do post-processing <- GETRA() + 7 + jmp next_code : jump to the code corresponding to next IR of qemu_ld/st + */ + + tcg_out_calli(s, (tcg_target_long)qemu_ld_helpers[s_bits]); + + /* Jump to post-processing code */ tcg_out8(s, OPC_JMP_short); - label_ptr[2] = s->code_ptr; - s->code_ptr++; + tcg_out8(s, 5); + /* Dummy backward jump having information of fast path'pc for MMU helpers */ + tcg_out8(s, OPC_JMP_long); + *(int32_t *)s->code_ptr = (int32_t)(raddr - s->code_ptr - 4); + s->code_ptr += 4; - /* TLB Miss. */ +#if TCG_TARGET_REG_BITS == 32 + if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) { + /* Pop and discard. This is 2 bytes smaller than the add. */ + tcg_out_pop(s, TCG_REG_ECX); + } else if (stack_adjust != 0) { + tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust); + } +#endif - /* label1: */ - *label_ptr[0] = s->code_ptr - label_ptr[0] - 1; + switch(opc) { + case 0 | 4: + tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW); + break; + case 1 | 4: + tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW); + break; + case 0: + tcg_out_ext8u(s, data_reg, TCG_REG_EAX); + break; + case 1: + tcg_out_ext16u(s, data_reg, TCG_REG_EAX); + break; + case 2: + tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); + break; +#if TCG_TARGET_REG_BITS == 64 + case 2 | 4: + tcg_out_ext32s(s, data_reg, TCG_REG_EAX); + break; +#endif + case 3: + if (TCG_TARGET_REG_BITS == 64) { + tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX); + } else if (data_reg == TCG_REG_EDX) { + /* xchg %edx, %eax */ + tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0); + tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EAX); + } else { + tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX); + tcg_out_mov(s, TCG_TYPE_I32, data_reg2, TCG_REG_EDX); + } + break; + default: + tcg_abort(); + } + + /* Jump to the code corresponding to next IR of qemu_st */ + tcg_out_jmp(s, (tcg_target_long)raddr); +} + +/* + * Generate code for the slow path for a store at the end of block + */ +static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *label) +{ + int s_bits; + int stack_adjust; + int opc = label->opc; + int mem_index = label->mem_index; + int data_reg = label->datalo_reg; +#if TCG_TARGET_REG_BITS == 32 + int data_reg2 = label->datahi_reg; + int addrlo_reg = label->addrlo_reg; + int addrhi_reg = label->addrhi_reg; +#endif + uint8_t *raddr = label->raddr; + uint8_t **label_ptr = &label->label_ptr[0]; + + s_bits = opc & 3; + + /* resolve label address */ + *(uint32_t *)label_ptr[0] = (uint32_t)(s->code_ptr - label_ptr[0] - 4); if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { - *label_ptr[1] = s->code_ptr - label_ptr[1] - 1; + *(uint32_t *)label_ptr[1] = (uint32_t)(s->code_ptr - label_ptr[1] - 4); } - /* XXX: move that code at the end of the TB */ #if TCG_TARGET_REG_BITS == 32 tcg_out_pushi(s, mem_index); stack_adjust = 4; @@ -1438,10 +1576,10 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, tcg_out_push(s, data_reg); stack_adjust += 4; if (TARGET_LONG_BITS == 64) { - tcg_out_push(s, args[addrlo_idx + 1]); + tcg_out_push(s, addrhi_reg); stack_adjust += 4; } - tcg_out_push(s, args[addrlo_idx]); + tcg_out_push(s, addrlo_reg); stack_adjust += 4; tcg_out_push(s, TCG_AREG0); stack_adjust += 4; @@ -1454,8 +1592,26 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, stack_adjust = 0; #endif + /* Code generation of qemu_ld/st's slow path calling MMU helper + + PRE_PROC ... + call MMU helper + jmp POST_PROC (2b) : short forward jump <- GETRA() + jmp next_code (5b) : dummy long backward jump which is never executed + POST_PROC ... : do post-processing <- GETRA() + 7 + jmp next_code : jump to the code corresponding to next IR of qemu_ld/st + */ + tcg_out_calli(s, (tcg_target_long)qemu_st_helpers[s_bits]); + /* Jump to post-processing code */ + tcg_out8(s, OPC_JMP_short); + tcg_out8(s, 5); + /* Dummy backward jump having information of fast path'pc for MMU helpers */ + tcg_out8(s, OPC_JMP_long); + *(int32_t *)s->code_ptr = (int32_t)(raddr - s->code_ptr - 4); + s->code_ptr += 4; + if (stack_adjust == (TCG_TARGET_REG_BITS / 8)) { /* Pop and discard. This is 2 bytes smaller than the add. */ tcg_out_pop(s, TCG_REG_ECX); @@ -1463,33 +1619,29 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, tcg_out_addi(s, TCG_REG_CALL_STACK, stack_adjust); } - /* label2: */ - *label_ptr[2] = s->code_ptr - label_ptr[2] - 1; -#else - { - int32_t offset = GUEST_BASE; - int base = args[addrlo_idx]; - int seg = 0; + /* Jump to the code corresponding to next IR of qemu_st */ + tcg_out_jmp(s, (tcg_target_long)raddr); +} - /* ??? We assume all operations have left us with register contents - that are zero extended. So far this appears to be true. If we - want to enforce this, we can either do an explicit zero-extension - here, or (if GUEST_BASE == 0, or a segment register is in use) - use the ADDR32 prefix. For now, do nothing. */ - if (GUEST_BASE && guest_base_flags) { - seg = guest_base_flags; - offset = 0; - } else if (TCG_TARGET_REG_BITS == 64 && offset != GUEST_BASE) { - tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, GUEST_BASE); - tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base); - base = TCG_REG_L1; - offset = 0; +/* + * Generate TB finalization at the end of block + */ +void tcg_out_tb_finalize(TCGContext *s) +{ + int i; + TCGLabelQemuLdst *label; + + /* qemu_ld/st slow paths */ + for (i = 0; i < s->nb_qemu_ldst_labels; i++) { + label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[i]; + if (label->is_ld) { + tcg_out_qemu_ld_slow_path(s, label); + } else { + tcg_out_qemu_st_slow_path(s, label); } - - tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, seg, opc); } -#endif } +#endif /* CONFIG_SOFTMMU */ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args) -- cgit v1.2.1