--- valgrind-3.5.0/configure.in 2009-08-19 15:37:48.000000000 +0200
+++ valgrind-3.5.0/configure.in 2010-01-23 01:30:14.991575477 +0100
@@ -1441,6 +1441,24 @@
 AM_CONDITIONAL(BUILD_SSSE3_TESTS, test x$ac_have_as_ssse3 = xyes)
 
 
+AC_MSG_CHECKING([if x86/amd64 assembler speaks SSE4a])
+
+AC_TRY_COMPILE(, [
+  do {
+   __asm__ __volatile__("lzcnt %rax,%rax" ); }
+  while (0)
+],
+[
+ac_have_as_sse4a=yes
+AC_MSG_RESULT([yes])
+], [
+ac_have_as_sse4a=no
+AC_MSG_RESULT([no])
+])
+
+AM_CONDITIONAL(BUILD_SSE4A_TESTS, test x$ac_have_as_sse4a = xyes)
+
+
 # Check for TLS support in the compiler and linker
 if test "x${cross_compiling}" = "xno"; then
 # Native compilation: check whether running a program using TLS succeeds.
--- valgrind-3.5.0/none/tests/amd64/insn_sse4a.def 1970-01-01 01:00:00.000000000 +0100
+++ valgrind-3.5.0/none/tests/amd64/insn_sse4a.def 2010-01-23 01:30:15.101224464 +0100
@@ -0,0 +1,12 @@
+lzcntw r16.uw[0x0468] r16.uw[0] => 1.uw[5]
+lzcntw m16.uw[0x2642] r16.uw[0] => 1.uw[2]
+lzcntw r16.uw[0x0000] r16.uw[0] => 1.uw[16]
+lzcntw r16.uw[0x8000] r16.uw[0] => 1.uw[0]
+lzcntl r32.ud[0x00072468] r32.ud[0] => 1.ud[13]
+lzcntl m32.ud[0x75318642] r32.ud[0] => 1.ud[1]
+lzcntl r32.ud[0x00000000] r32.ud[0] => 1.ud[32]
+lzcntl r32.ud[0x80000000] r32.ud[0] => 1.ud[0]
+lzcntq r64.uq[0x1357246813572468] r64.uq[0] => 1.uq[3]
+lzcntq m64.uq[0x8531864275318642] r64.uq[0] => 1.uq[0]
+lzcntq m64.uq[0x7531864275318642] r64.uq[0] => 1.uq[1]
+lzcntq r64.uq[0x0000000000000000] r64.uq[0] => 1.uq[64]
--- valgrind-3.5.0/none/tests/amd64/Makefile.am 2009-08-19 15:37:15.000000000 +0200
+++ valgrind-3.5.0/none/tests/amd64/Makefile.am 2010-01-23 01:30:15.268229136 +0100
@@ -12,6 +12,10 @@
 if BUILD_SSSE3_TESTS
  INSN_TESTS += insn_ssse3
 endif
+if BUILD_SSE4A_TESTS
+ INSN_TESTS += insn_sse4a test_sse4a_flags
+endif
+
 
 # Explicitly include insn_sse3 even if ! BUILD_SSE3_TESTS,
 # to avoid packaging screwups if 'make dist' is run on a machine
@@ -96,9 +100,11 @@
 insn_sse3_LDADD = -lm
 insn_ssse3_SOURCES = insn_ssse3.def
 insn_ssse3_LDADD = -lm
+insn_sse4a_SOURCES = insn_sse4a.def
 insn_fpu_SOURCES = insn_fpu.def
 insn_fpu_LDADD = -lm
 fxtract_LDADD = -lm
+test_sse4a_flags_SOURCES = test_sse4a_flags.c
 
 .def.c: $(srcdir)/gen_insn_test.pl
 	$(PERL) $(srcdir)/gen_insn_test.pl < $< > $@
--- valgrind-3.5.0/none/tests/amd64/test_sse4a_flags.c 1970-01-01 01:00:00.000000000 +0100
+++ valgrind-3.5.0/none/tests/amd64/test_sse4a_flags.c 2010-01-23 00:22:57.000000000 +0100
@@ -0,0 +1,88 @@
+
+/* Derived from: */
+
+/*
+ *  amd64 CPU test
+ *
+ *  Copyright (c) 2010 Gert Wollny
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdio.h>
+
+#define CARRY_FLAG 0x0001
+#define ZERO_FLAG  0x0040
+#define FLAG_MASK (CARRY_FLAG|ZERO_FLAG)
+
+/* Execute LZCNT on value and compare the resulting carry/zero flags
+   with expect.  Returns non-zero when they match. */
+int test_LZCNT_flags(long value, short expect)
+{
+   unsigned long rflags;
+   short eflags;
+
+   /* PUSHF pushes 8 bytes in 64-bit mode, so pop into a 64-bit
+      register.  value is an input; %rax and the flags are clobbered. */
+   asm volatile ("mov %1,%%rax\n"
+        "lzcnt %%rax,%%rax\n"
+        "pushfq\n"
+        "popq %0\n"
+        : "=r" (rflags)
+        : "r" (value)
+        : "rax", "cc"
+      );
+   eflags = (short)rflags;
+   printf("flags = %d (%d) ", eflags & FLAG_MASK, eflags);
+   return ((eflags & FLAG_MASK) == expect);
+}
+
+
+void test_zero_sets_C()
+{
+   printf("zero_sets_C ... ");
+
+   if (test_LZCNT_flags(0UL, CARRY_FLAG))
+      printf("ok\n");
+   else
+      printf("not ok\n");
+}
+
+void test_FFFF_sets_Z()
+{
+   printf("0xFF..._sets Z ... ");
+   if (test_LZCNT_flags(-1L, ZERO_FLAG))
+      printf("ok\n");
+   else
+      printf("not ok\n");
+}
+
+void test_normal_no_C_or_Z()
+{
+   printf("positive values set no flag ... ");
+   if (test_LZCNT_flags(321L, 0))
+      printf("ok\n");
+   else
+      printf("not ok\n");
+}
+
+int main(int argc, char **args)
+{
+   test_zero_sets_C();
+   test_FFFF_sets_Z();
+   test_normal_no_C_or_Z();
+
+   return 0;
+}
--- valgrind-3.5.0/VEX/priv/guest_amd64_toIR.c 2009-08-19 15:37:52.000000000 +0200
+++ valgrind-3.5.0/VEX/priv/guest_amd64_toIR.c 2010-01-23 01:30:15.310342294 +0100
@@ -7325,6 +7325,124 @@
 
 
 
+/* Handle LZCNT (F3 0F BD): count the leading zero bits of the E operand
+   and write the count to the G register.  Implemented with Iop_Clz64
+   on the zero-extended source; a zero source is special-cased because
+   the result is then the operand width.  Returns the updated delta. */
+static
+ULong dis_LZCNT ( VexAbiInfo* vbi,
+                  Prefix pfx, Int sz, Long delta)
+{
+   Bool   isReg;
+   UChar  modrm;
+   HChar  dis_buf[50];
+
+   IRType ty     = szToITy(sz);
+   IRTemp src    = newTemp(ty);
+   IRTemp dst    = newTemp(ty);
+   IRTemp src64  = newTemp(Ity_I64);
+   IRTemp dst64  = newTemp(Ity_I64);
+   IRTemp src8   = newTemp(Ity_I8);
+   IRTemp dst8   = newTemp(Ity_I8);
+   /* Index of the most significant bit of a sz-byte operand. */
+   ULong  maxbit = 8 * sz - 1;
+
+   vassert(sz == 8 || sz == 4 || sz == 2);
+
+   modrm = getUChar(delta);
+   isReg = epartIsReg(modrm);
+   if (isReg) {
+      delta++;
+      assign( src, getIRegE(sz, pfx, modrm) );
+   } else {
+      Int    len;
+      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+      delta += len;
+      assign( src, loadLE(ty, mkexpr(addr)) );
+   }
+
+   DIP("lzcnt %s, %s\n",
+       ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
+       nameIRegG(sz, pfx, modrm));
+
+   /* First, widen src to 64 bits if it is not already. */
+   assign( src64, widenUto64(mkexpr(src)) );
+
+   /* src8 is zero iff src64 is zero; used as a Mux0X condition. */
+   assign( src8,
+           unop(Iop_1Uto8,
+                binop(Iop_CmpNE64,
+                      mkexpr(src64), mkU64(0))) );
+
+   /* The flags are supplied literally through CC_OP_COPY and CC_DEP1
+      (written further down).  Clear DEP2 and NDEP as well. */
+   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
+   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+
+   /* Compute the result.  If the source is zero, the result is the
+      operand width in bits.  Otherwise 63 - Clz64(src64) is the
+      index of the highest set bit of the widened source (the value
+      BSR would produce), and the result is
+      (8 * sizeof(src) - 1) - BSR(src). */
+   assign( dst64,
+           IRExpr_Mux0X(
+              mkexpr(src8),
+              /* src == 0 -- set result to width of src */
+              mkU64(8 * sz),
+              /* src != 0 */
+              binop(Iop_Sub64,
+                    mkU64(maxbit),
+                    binop(Iop_Sub64,
+                          mkU64(63),
+                          unop(Iop_Clz64, mkexpr(src64)))
+                 )
+              )
+      );
+
+   /* dst8 is zero iff the result is zero. */
+   assign( dst8,
+           unop(Iop_1Uto8,
+                binop(Iop_CmpNE64,
+                      mkexpr(dst64), mkU64(0))) );
+
+   /* set c flag if src is zero and z flag if dst is zero */
+   stmt( IRStmt_Put(
+            OFFB_CC_DEP1,
+            binop(Iop_Or64,
+                  IRExpr_Mux0X(
+                     mkexpr(src8),
+                     /* src==0 */
+                     mkU64(AMD64G_CC_MASK_C),
+                     /* src!=0 */
+                     mkU64(0)
+                     ),
+                  IRExpr_Mux0X(
+                     mkexpr(dst8),
+                     /* dst==0 */
+                     mkU64(AMD64G_CC_MASK_Z),
+                     /* dst!=0 */
+                     mkU64(0)
+                     )
+               )
+            )
+      );
+
+   /* Narrow the 64-bit result back to the operand size. */
+   if (sz == 2)
+      assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
+   else if (sz == 4)
+      assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
+   else
+      assign( dst, mkexpr(dst64) );
+
+   /* dump result back */
+   putIRegG( sz, pfx, modrm, mkexpr(dst) );
+
+   return delta;
+}
+
+
 /* Handle BSF/BSR.  Only v-size seems necessary. */
 static
 ULong dis_bs_E_G ( VexAbiInfo* vbi,
@@ -15348,9 +15466,14 @@
          if (haveF2orF3(pfx)) goto decode_failure;
          delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
          break;
-      case 0xBD: /* BSR Gv,Ev */
-         if (haveF2orF3(pfx)) goto decode_failure;
-         delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
+      case 0xBD:
+         /* F2 is never valid here, with or without F3. */
+         if (haveF2(pfx)) goto decode_failure;
+         if (haveF3(pfx))
+            /* LZCNT Gv,Ev */
+            delta = dis_LZCNT ( vbi, pfx, sz, delta );
+         else
+            /* BSR Gv,Ev */
+            delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
          break;
 
       /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */