From 321c535105a182501b888f095f7ec4dbb5f3f6ae Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 21 Jan 2013 13:32:02 -0800 Subject: target-i386: Implement tzcnt and fix lzcnt We weren't computing flags for lzcnt at all. At the same time, adjust the implementation of bsf/bsr to avoid the local branch, using movcond instead. Signed-off-by: Richard Henderson --- target-i386/helper.h | 5 ++- target-i386/int_helper.c | 11 ++----- target-i386/translate.c | 86 +++++++++++++++++++++++++++--------------------- 3 files changed, 54 insertions(+), 48 deletions(-) (limited to 'target-i386') diff --git a/target-i386/helper.h b/target-i386/helper.h index e1ecdb81e9..26a0cc80a4 100644 --- a/target-i386/helper.h +++ b/target-i386/helper.h @@ -195,9 +195,8 @@ DEF_HELPER_3(frstor, void, env, tl, int) DEF_HELPER_3(fxsave, void, env, tl, int) DEF_HELPER_3(fxrstor, void, env, tl, int) -DEF_HELPER_FLAGS_1(bsf, TCG_CALL_NO_RWG_SE, tl, tl) -DEF_HELPER_FLAGS_1(bsr, TCG_CALL_NO_RWG_SE, tl, tl) -DEF_HELPER_FLAGS_2(lzcnt, TCG_CALL_NO_RWG_SE, tl, tl, int) +DEF_HELPER_FLAGS_1(clz, TCG_CALL_NO_RWG_SE, tl, tl) +DEF_HELPER_FLAGS_1(ctz, TCG_CALL_NO_RWG_SE, tl, tl) DEF_HELPER_FLAGS_2(pdep, TCG_CALL_NO_RWG_SE, tl, tl, tl) DEF_HELPER_FLAGS_2(pext, TCG_CALL_NO_RWG_SE, tl, tl, tl) diff --git a/target-i386/int_helper.c b/target-i386/int_helper.c index 7bec4ebdd6..3b56075a9d 100644 --- a/target-i386/int_helper.c +++ b/target-i386/int_helper.c @@ -456,19 +456,14 @@ void helper_idivq_EAX(CPUX86State *env, target_ulong t0) #endif /* bit operations */ -target_ulong helper_bsf(target_ulong t0) +target_ulong helper_ctz(target_ulong t0) { return ctztl(t0); } -target_ulong helper_lzcnt(target_ulong t0, int wordsize) +target_ulong helper_clz(target_ulong t0) { - return clztl(t0) - (TARGET_LONG_BITS - wordsize); -} - -target_ulong helper_bsr(target_ulong t0) -{ - return clztl(t0) ^ (TARGET_LONG_BITS - 1); + return clztl(t0); } target_ulong helper_pdep(target_ulong src, target_ulong mask) diff --git a/target-i386/translate.c b/target-i386/translate.c index 436658a33a..97c565f697 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -7157,46 +7157,58 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s, tcg_gen_movi_tl(cpu_cc_dst, 0); } break; - case 0x1bc: /* bsf */ - case 0x1bd: /* bsr */ - { - int label1; - TCGv t0; - - ot = dflag + OT_WORD; - modrm = cpu_ldub_code(env, s->pc++); - reg = ((modrm >> 3) & 7) | rex_r; - gen_ldst_modrm(env, s,modrm, ot, OR_TMP0, 0); - gen_extu(ot, cpu_T[0]); - t0 = tcg_temp_local_new(); - tcg_gen_mov_tl(t0, cpu_T[0]); - if ((b & 1) && (prefixes & PREFIX_REPZ) && - (s->cpuid_ext3_features & CPUID_EXT3_ABM)) { - switch(ot) { - case OT_WORD: gen_helper_lzcnt(cpu_T[0], t0, - tcg_const_i32(16)); break; - case OT_LONG: gen_helper_lzcnt(cpu_T[0], t0, - tcg_const_i32(32)); break; - case OT_QUAD: gen_helper_lzcnt(cpu_T[0], t0, - tcg_const_i32(64)); break; - } - gen_op_mov_reg_T0(ot, reg); + case 0x1bc: /* bsf / tzcnt */ + case 0x1bd: /* bsr / lzcnt */ + ot = dflag + OT_WORD; + modrm = cpu_ldub_code(env, s->pc++); + reg = ((modrm >> 3) & 7) | rex_r; + gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); + gen_extu(ot, cpu_T[0]); + + /* Note that lzcnt and tzcnt are in different extensions. */ + if ((prefixes & PREFIX_REPZ) + && (b & 1 + ? s->cpuid_ext3_features & CPUID_EXT3_ABM + : s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) { + int size = 8 << ot; + tcg_gen_mov_tl(cpu_cc_src, cpu_T[0]); + if (b & 1) { + /* For lzcnt, reduce the target_ulong result by the + number of zeros that we expect to find at the top. */ + gen_helper_clz(cpu_T[0], cpu_T[0]); + tcg_gen_subi_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - size); } else { - label1 = gen_new_label(); - tcg_gen_movi_tl(cpu_cc_dst, 0); - tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0, label1); - if (b & 1) { - gen_helper_bsr(cpu_T[0], t0); - } else { - gen_helper_bsf(cpu_T[0], t0); - } - gen_op_mov_reg_T0(ot, reg); - tcg_gen_movi_tl(cpu_cc_dst, 1); - gen_set_label(label1); - set_cc_op(s, CC_OP_LOGICB + ot); + /* For tzcnt, a zero input must return the operand size: + force all bits outside the operand size to 1. */ + target_ulong mask = (target_ulong)-2 << (size - 1); + tcg_gen_ori_tl(cpu_T[0], cpu_T[0], mask); + gen_helper_ctz(cpu_T[0], cpu_T[0]); + } + /* For lzcnt/tzcnt, C and Z bits are defined and are + related to the result. */ + gen_op_update1_cc(); + set_cc_op(s, CC_OP_BMILGB + ot); + } else { + /* For bsr/bsf, only the Z bit is defined and it is related + to the input and not the result. */ + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); + set_cc_op(s, CC_OP_LOGICB + ot); + if (b & 1) { + /* For bsr, return the bit index of the first 1 bit, + not the count of leading zeros. */ + gen_helper_clz(cpu_T[0], cpu_T[0]); + tcg_gen_xori_tl(cpu_T[0], cpu_T[0], TARGET_LONG_BITS - 1); + } else { + gen_helper_ctz(cpu_T[0], cpu_T[0]); } - tcg_temp_free(t0); + /* ??? The manual says that the output is undefined when the + input is zero, but real hardware leaves it unchanged, and + real programs appear to depend on that. */ + tcg_gen_movi_tl(cpu_tmp0, 0); + tcg_gen_movcond_tl(TCG_COND_EQ, cpu_T[0], cpu_cc_dst, cpu_tmp0, + cpu_regs[reg], cpu_T[0]); } + gen_op_mov_reg_T0(ot, reg); break; /************************/ /* bcd */ -- cgit v1.2.1