Index: gcc/doc/md.texi =================================================================== --- gcc/doc/md.texi (.../trunk) (revision 144197) +++ gcc/doc/md.texi (.../branches/ix86/atom) (revision 144446) @@ -7505,6 +7505,11 @@ recognize complicated bypasses, e.g.@: when the consumer is only an address of insn @samp{store} (not a stored value). +If there is more than one bypass with the same output and input insns, the +chosen bypass is the first bypass with a guard in the description whose +guard function returns nonzero. If there is no such bypass, then the +bypass without the guard function is chosen. + @findex exclusion_set @findex presence_set @findex final_presence_set Index: gcc/rtlanal.c =================================================================== --- gcc/rtlanal.c (.../trunk) (revision 144197) +++ gcc/rtlanal.c (.../branches/ix86/atom) (revision 144446) @@ -728,6 +728,129 @@ } return 0; } + +static int +reg_mentioned_by_mem_p_1 (const_rtx reg, const_rtx in, + bool *mem_p) +{ + const char *fmt; + int i; + enum rtx_code code; + + if (in == 0) + return 0; + + if (reg == in) + return 1; + + if (GET_CODE (in) == LABEL_REF) + return reg == XEXP (in, 0); + + code = GET_CODE (in); + + switch (code) + { + /* Compare registers by number. */ + case REG: + return REG_P (reg) && REGNO (in) == REGNO (reg); + + /* These codes have no constituent expressions + and are unique. */ + case SCRATCH: + case CC0: + case PC: + return 0; + + case CONST_INT: + case CONST_VECTOR: + case CONST_DOUBLE: + case CONST_FIXED: + /* These are kept unique for a given value. */ + return 0; + + default: + break; + } + + if (GET_CODE (reg) == code && rtx_equal_p (reg, in)) + return 1; + + fmt = GET_RTX_FORMAT (code); + + for (i = GET_RTX_LENGTH (code) - 1; i >= 0; i--) + { + if (fmt[i] == 'E') + { + int j; + for (j = XVECLEN (in, i) - 1; j >= 0; j--) + if (reg_mentioned_by_mem_p_1 (reg, XVECEXP (in, i, j), mem_p)) + { + if (code == MEM) + *mem_p = true; + + return 1; + } + } + else if (fmt[i] == 'e' + && reg_mentioned_by_mem_p_1 (reg, XEXP (in, i), mem_p)) + { + if (code == MEM) + *mem_p = true; + + return 1; + } + } + return 0; +} + +/* Similar to the function reg_mentioned_p, but return true only when + register REG appears in a MEM container of RTX IN. */ + +bool +reg_mentioned_by_mem_p (const_rtx reg, const_rtx in) +{ + bool mem = false; + + reg_mentioned_by_mem_p_1 (reg, in, &mem); + return mem; +} + +/* Return true if the dest register in set_insn is used in use_insn for + address calculation. + For example, returns true if + set_insn: reg_a = reg_b + use_insn: reg_c = (reg_a) # reg_a used in addr calculation + Returns false if + set_insn: reg_a = reg_b + use_insn: (reg_c) = reg_a # reg_a is used, but not as addr. */ + +bool +reg_dep_by_addr_p (const_rtx set_insn, const_rtx use_insn) +{ + rtx pattern = PATTERN (set_insn); + rtx set_dest = NULL; + + switch (GET_CODE (pattern)) + { + case SET: + set_dest = SET_DEST (pattern); + break; + case PARALLEL: + { + rtx pattern2 = XVECEXP (PATTERN (set_insn), 0, 0); + if (GET_CODE (pattern2) == SET) + set_dest = SET_DEST (pattern2); + break; + } + default: + set_dest = NULL; + } + + /* True if destination of set is reg and used as address. */ + return set_dest && REG_P (set_dest) + && reg_mentioned_by_mem_p (set_dest, use_insn); +} + /* Return 1 if in between BEG and END, exclusive of BEG and END, there is no CODE_LABEL insn.
*/ Index: gcc/genautomata.c =================================================================== --- gcc/genautomata.c (.../trunk) (revision 144197) +++ gcc/genautomata.c (.../branches/ix86/atom) (revision 144446) @@ -1,5 +1,5 @@ /* Pipeline hazard description translator. - Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008 + Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc. Written by Vladimir Makarov @@ -22,21 +22,25 @@ /* References: - 1. Detecting pipeline structural hazards quickly. T. Proebsting, + 1. The finite state automaton based pipeline hazard recognizer and + instruction scheduler in GCC. V. Makarov. Proceedings of GCC + summit, 2003. + + 2. Detecting pipeline structural hazards quickly. T. Proebsting, C. Fraser. Proceedings of ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages, pages 280--286, 1994. This article is a good start point to understand usage of finite state automata for pipeline hazard recognizers. But I'd - recommend the 2nd article for more deep understanding. + recommend the 1st and 3rd articles for a deeper understanding. - 2. Efficient Instruction Scheduling Using Finite State Automata: + 3. Efficient Instruction Scheduling Using Finite State Automata: V. Bala and N. Rubin, Proceedings of MICRO-28. This is the best article about usage of finite state automata for pipeline hazard recognizers. - The current implementation is different from the 2nd article in the - following: + The current implementation is described in the 1st article and it + is different from the 3rd article in the following: 1. New operator `|' (alternative) is permitted in functional unit reservation which can be treated deterministically and @@ -463,7 +467,10 @@ insn. */ int insn_num; /* The following field value is list of bypasses in which given insn - is output insn. */ + is output insn. Bypasses with the same input insn stay one after + another in the list, in the same order as their occurrences in the + description, but the bypass without a guard always stays last + in a row of bypasses with the same input insn. */ struct bypass_decl *bypass_list; /* The following fields are defined by automaton generator. */ @@ -2367,18 +2374,67 @@ } -/* The function searches for bypass with given IN_INSN_RESERV in given - BYPASS_LIST. */ -static struct bypass_decl * -find_bypass (struct bypass_decl *bypass_list, - struct insn_reserv_decl *in_insn_reserv) +/* The function inserts BYPASS in the list of bypasses of the + corresponding output insn. The order of bypasses in the list is + described in a comment for member `bypass_list' (see above). If + there is already the same bypass in the list, the function reports + this and does nothing. */ +static void +insert_bypass (struct bypass_decl *bypass) { - struct bypass_decl *bypass; - - for (bypass = bypass_list; bypass != NULL; bypass = bypass->next) - if (bypass->in_insn_reserv == in_insn_reserv) - break; - return bypass; + struct bypass_decl *curr, *last; + struct insn_reserv_decl *out_insn_reserv = bypass->out_insn_reserv; + struct insn_reserv_decl *in_insn_reserv = bypass->in_insn_reserv; + + for (curr = out_insn_reserv->bypass_list, last = NULL; + curr != NULL; + last = curr, curr = curr->next) + if (curr->in_insn_reserv == in_insn_reserv) + { + if ((bypass->bypass_guard_name != NULL + && curr->bypass_guard_name != NULL + && !
strcmp (bypass->bypass_guard_name, curr->bypass_guard_name)) + || bypass->bypass_guard_name == curr->bypass_guard_name) + { + if (bypass->bypass_guard_name == NULL) + { + if (!w_flag) + error ("the same bypass `%s - %s' is already defined", + bypass->out_insn_name, bypass->in_insn_name); + else + warning (0, "the same bypass `%s - %s' is already defined", + bypass->out_insn_name, bypass->in_insn_name); + } + else if (!w_flag) + error ("the same bypass `%s - %s' (guard %s) is already defined", + bypass->out_insn_name, bypass->in_insn_name, + bypass->bypass_guard_name); + else + warning + (0, "the same bypass `%s - %s' (guard %s) is already defined", + bypass->out_insn_name, bypass->in_insn_name, + bypass->bypass_guard_name); + return; + } + if (curr->bypass_guard_name == NULL) + break; + if (curr->next == NULL || curr->next->in_insn_reserv != in_insn_reserv) + { + last = curr; + break; + } + + } + if (last == NULL) + { + bypass->next = out_insn_reserv->bypass_list; + out_insn_reserv->bypass_list = bypass; + } + else + { + bypass->next = last->next; + last->next = bypass; + } } /* The function processes pipeline description declarations, checks @@ -2391,7 +2447,6 @@ decl_t decl_in_table; decl_t out_insn_reserv; decl_t in_insn_reserv; - struct bypass_decl *bypass; int automaton_presence; int i; @@ -2514,36 +2569,7 @@ = DECL_INSN_RESERV (out_insn_reserv); DECL_BYPASS (decl)->in_insn_reserv = DECL_INSN_RESERV (in_insn_reserv); - bypass - = find_bypass (DECL_INSN_RESERV (out_insn_reserv)->bypass_list, - DECL_BYPASS (decl)->in_insn_reserv); - if (bypass != NULL) - { - if (DECL_BYPASS (decl)->latency == bypass->latency) - { - if (!w_flag) - error - ("the same bypass `%s - %s' is already defined", - DECL_BYPASS (decl)->out_insn_name, - DECL_BYPASS (decl)->in_insn_name); - else - warning - (0, "the same bypass `%s - %s' is already defined", - DECL_BYPASS (decl)->out_insn_name, - DECL_BYPASS (decl)->in_insn_name); - } - else - error ("bypass `%s - %s' is already defined", - DECL_BYPASS (decl)->out_insn_name, - DECL_BYPASS (decl)->in_insn_name); - } - else - { - DECL_BYPASS (decl)->next - = DECL_INSN_RESERV (out_insn_reserv)->bypass_list; - DECL_INSN_RESERV (out_insn_reserv)->bypass_list - = DECL_BYPASS (decl); - } + insert_bypass (DECL_BYPASS (decl)); } } } @@ -8159,19 +8185,32 @@ (advance_cycle_insn_decl)->insn_num)); fprintf (output_file, " case %d:\n", bypass->in_insn_reserv->insn_num); - if (bypass->bypass_guard_name == NULL) - fprintf (output_file, " return %d;\n", - bypass->latency); - else + for (;;) { - fprintf (output_file, - " if (%s (%s, %s))\n", - bypass->bypass_guard_name, INSN_PARAMETER_NAME, - INSN2_PARAMETER_NAME); - fprintf (output_file, - " return %d;\n break;\n", - bypass->latency); + if (bypass->bypass_guard_name == NULL) + { + gcc_assert (bypass->next == NULL + || (bypass->in_insn_reserv + != bypass->next->in_insn_reserv)); + fprintf (output_file, " return %d;\n", + bypass->latency); + } + else + { + fprintf (output_file, + " if (%s (%s, %s))\n", + bypass->bypass_guard_name, INSN_PARAMETER_NAME, + INSN2_PARAMETER_NAME); + fprintf (output_file, " return %d;\n", + bypass->latency); + } + if (bypass->next == NULL + || bypass->in_insn_reserv != bypass->next->in_insn_reserv) + break; + bypass = bypass->next; } + if (bypass->bypass_guard_name != NULL) + fprintf (output_file, " break;\n"); } fputs (" }\n break;\n", output_file); } Index: gcc/rtl.def =================================================================== --- gcc/rtl.def (.../trunk) (revision 144197) +++ gcc/rtl.def 
(.../branches/ix86/atom) (revision 144446) @@ -1088,7 +1088,11 @@ guard for the bypass. The function will get the two insns as parameters. If the function returns zero the bypass will be ignored for this case. Additional guard is necessary to recognize - complicated bypasses, e.g. when consumer is load address. */ + complicated bypasses, e.g. when consumer is load address. If there + is more than one bypass with the same output and input insns, the + chosen bypass is the first bypass with a guard in the description whose + guard function returns nonzero. If there is no such bypass, then the + bypass without the guard function is chosen. */ DEF_RTL_EXPR(DEFINE_BYPASS, "define_bypass", "issS", RTX_EXTRA) /* (define_automaton string) describes names of automata generated and Index: gcc/ChangeLog.atom =================================================================== --- gcc/ChangeLog.atom (.../trunk) (revision 0) +++ gcc/ChangeLog.atom (.../branches/ix86/atom) (revision 144446) @@ -0,0 +1,134 @@ +2009-02-05 Joey Ye + Xuepeng Guo + H.J. Lu + + Atom pipeline model, tuning and insn selection. + * rtlanal.c (reg_mentioned_by_mem_p_1): New function. + (reg_mentioned_by_mem_p): New function. + (reg_dep_by_addr_p): New function. + + * rtl.h (reg_mentioned_by_mem_p): Declare new function. + (reg_dep_by_addr_p): Likewise. + + * config.gcc (atom): Add atom config options and target. + + * config/i386/i386.h (TARGET_ATOM): New target macro. + (X86_TUNE_OPT_AGU): New tuning flag. + (TARGET_OPT_AGU): New target option. + (TARGET_CPU_DEFAULT_atom): New CPU default. + (PROCESSOR_ATOM): New processor. + + * config/i386/i386-c.c (ix86_target_macros_internal): New case + PROCESSOR_ATOM. + (ix86_target_macros_internal): Likewise. + + * config/i386/i386-protos.h (ix86_lea_for_add_ok): Declare new + function. + (ix86_dep_by_shift_count): Likewise. + (ix86_agi_dependent): Likewise. + + * config/i386/i386.c (atom_cost): New cost. + (m_ATOM): New macro flag. + (initial_ix86_tune_fe): Set m_ATOM. + (x86_accumulate_outgoing_args): Likewise. + (x86_arch_always_fancy_math_387): Likewise. + (processor_target): Add Atom cost. + (cpu_names): Add Atom cpu name. + (override_options): Set Atom ISA. + (LEA_SEARCH_THRESHOLD): New macro. + (distance_non_agu_define): New function. + (distance_agu_use): Likewise. + (ix86_lea_for_add_ok): Likewise. + (ix86_dep_by_shift_count): Likewise. + (ix86_agi_dependent): Make it global. + (ix86_issue_rate): New case PROCESSOR_ATOM. + (ix86_adjust_cost): Likewise. + + * config/i386/i386.md (cpu): Add new value "atom". + (atom.md): Include atom.md. + (use_carry, movu): New attr. + (adddi3_carry_rex64): Set attr "use_carry". + (addqi3_carry): Likewise. + (addhi3_carry): Likewise. + (addsi3_carry): Likewise. + (*addsi3_carry_zext): Likewise. + (subdi3_carry_rex64): Likewise. + (subqi3_carry): Likewise. + (subhi3_carry): Likewise. + (subsi3_carry): Likewise. + (x86_movdicc_0_m1_rex64): Likewise. + (*x86_movdicc_0_m1_se): Likewise. + (x86_movsicc_0_m1): Likewise. + (*x86_movsicc_0_m1_se): Likewise. + (*adddi_1_rex64): Emit add insn as much as possible. + (*addsi_1): Likewise. + (return_internal): Set atom_unit. + (return_internal_long): Likewise. + (return_pop_internal): Likewise. + (*rcpsf2_sse): Set atom_sse_attr attr. + (*qrt2_sse): Likewise. + (*prefetch_sse): Likewise. + + * config/i386/sse.md (cpu): Set attr "atom_sse_attr". + (*prefetch_sse_rex): Likewise. + (sse_rcpv4sf2): Likewise. + (sse_vmrcpv4sf2): Likewise. + (sse_sqrtv4sf2): Likewise. + (_vmsqrt2): Likewise. + (sse_ldmxcsr): Likewise.
+ (sse_stmxcsr): Likewise. + (*sse_sfence): Likewise. + (sse2_clflush): Likewise. + (*sse2_mfence): Likewise. + (*sse2_lfence): Likewise. + (avx_movup): Set attr "movu". + (_movup): Likewise. + (avx_movdqu): Likewise. + (avx_lddqu): Likewise. + (sse2_movntv2di): Change attr "type" to "ssemov". + (sse2_movntsi): Likewise. + (rsqrtv8sf2): Change attr "type" to "sseadd". + (sse3_addsubv2df3): Set attr "atom_unit". + (sse3_hv4sf3): Likewise. + (*sse2_pmaddwd): Likewise. + (*vec_extractv2di_1_rex64): Likewise. + (*vec_extractv2di_1_avx): Likewise. + (sse2_psadbw): Likewise. + (ssse3_phaddwv8hi3): Likewise. + (ssse3_phaddwv4hi3): Likewise. + (ssse3_phadddv4si3): Likewise. + (ssse3_phadddv2si3): Likewise. + (ssse3_phaddswv8hi3): Likewise. + (ssse3_phaddswv4hi3): Likewise. + (ssse3_phsubwv8hi3): Likewise. + (ssse3_phsubwv4hi3): Likewise. + (ssse3_phsubdv4si3): Likewise. + (ssse3_phsubdv2si3): Likewise. + (ssse3_phsubswv8hi3): Likewise. + (ssse3_phsubswv4hi3): Likewise. + (ssse3_pmaddubsw128): Likewise. + (sse3_pmaddubsw: Likewise. + (ssse3_palignrti): Likewise. + (ssse3_palignrdi): Likewise. + + * config/i386/atom.md: New. + +2009-02-05 H.J. Lu + + * config/i386/i386.c (ix86_agi_dependent): Remove the third + argument. Swap the first 2 arguments. + (ix86_adjust_cost): Updated. + +2009-01-30 Vladimir Makarov + + * genautomata.c: Add a new year to the copyright. Add a new + reference. + (struct insn_reserv_decl): Add comments for member bypass_list. + (find_bypass): Remove. + (insert_bypass): New. + (process_decls): Use insert_bypass. + (output_internal_insn_latency_func): Output all bypasses with the + same input insn in one switch case. + + * rtl.def (define_bypass): Describe bypass choice. + * doc/md.texi (define_bypass): Ditto. Index: gcc/rtl.h =================================================================== --- gcc/rtl.h (.../trunk) (revision 144197) +++ gcc/rtl.h (.../branches/ix86/atom) (revision 144446) @@ -1731,6 +1731,8 @@ extern bool offset_within_block_p (const_rtx, HOST_WIDE_INT); extern void split_const (rtx, rtx *, rtx *); extern int reg_mentioned_p (const_rtx, const_rtx); +extern bool reg_mentioned_by_mem_p (const_rtx, const_rtx); +extern bool reg_dep_by_addr_p (const_rtx, const_rtx); extern int count_occurrences (const_rtx, const_rtx, int); extern int reg_referenced_p (const_rtx, const_rtx); extern int reg_used_between_p (const_rtx, const_rtx, const_rtx); Index: gcc/config.gcc =================================================================== --- gcc/config.gcc (.../trunk) (revision 144197) +++ gcc/config.gcc (.../branches/ix86/atom) (revision 144446) @@ -1087,7 +1087,7 @@ tmake_file="${tmake_file} i386/t-linux64" need_64bit_hwint=yes case X"${with_cpu}" in - Xgeneric|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx) + Xgeneric|Xatom|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx) ;; X) if test x$with_cpu_64 = x; then @@ -1096,7 +1096,7 @@ ;; *) echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2 - echo "generic core2 nocona x86-64 amdfam10 barcelona k8 opteron athlon64 athlon-fx" 1>&2 + echo "generic atom core2 nocona x86-64 amdfam10 barcelona k8 opteron athlon64 athlon-fx" 1>&2 exit 1 ;; esac @@ -1201,7 +1201,7 @@ # libgcc/configure.ac instead. 
need_64bit_hwint=yes case X"${with_cpu}" in - Xgeneric|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx) + Xgeneric|Xatom|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx) ;; X) if test x$with_cpu_64 = x; then @@ -1210,7 +1210,7 @@ ;; *) echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2 - echo "generic core2 nocona x86-64 amdfam10 barcelona k8 opteron athlon64 athlon-fx" 1>&2 + echo "generic atom core2 nocona x86-64 amdfam10 barcelona k8 opteron athlon64 athlon-fx" 1>&2 exit 1 ;; esac @@ -2803,7 +2803,7 @@ esac # OK ;; - "" | amdfam10 | barcelona | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic) + "" | amdfam10 | barcelona | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | atom | generic) # OK ;; *) Index: gcc/config/i386/i386.h =================================================================== --- gcc/config/i386/i386.h (.../trunk) (revision 144197) +++ gcc/config/i386/i386.h (.../branches/ix86/atom) (revision 144446) @@ -231,6 +231,7 @@ #define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64) #define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64) #define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10) +#define TARGET_ATOM (ix86_tune == PROCESSOR_ATOM) /* Feature tests against the various tunings. */ enum ix86_tune_indices { @@ -295,6 +296,7 @@ X86_TUNE_USE_VECTOR_FP_CONVERTS, X86_TUNE_USE_VECTOR_CONVERTS, X86_TUNE_FUSE_CMP_AND_BRANCH, + X86_TUNE_OPT_AGU, X86_TUNE_LAST }; @@ -382,6 +384,7 @@ ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS] #define TARGET_FUSE_CMP_AND_BRANCH \ ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH] +#define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { @@ -564,6 +567,7 @@ TARGET_CPU_DEFAULT_prescott, TARGET_CPU_DEFAULT_nocona, TARGET_CPU_DEFAULT_core2, + TARGET_CPU_DEFAULT_atom, TARGET_CPU_DEFAULT_geode, TARGET_CPU_DEFAULT_k6, @@ -2256,6 +2260,7 @@ PROCESSOR_GENERIC32, PROCESSOR_GENERIC64, PROCESSOR_AMDFAM10, + PROCESSOR_ATOM, PROCESSOR_max }; Index: gcc/config/i386/i386.md =================================================================== --- gcc/config/i386/i386.md (.../trunk) (revision 144197) +++ gcc/config/i386/i386.md (.../branches/ix86/atom) (revision 144446) @@ -297,7 +297,7 @@ ;; Processor type. -(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2, +(define_attr "cpu" "none,pentium,pentiumpro,geode,k6,athlon,k8,core2,atom, generic64,amdfam10" (const (symbol_ref "ix86_schedule"))) @@ -593,6 +593,12 @@ (define_attr "i387_cw" "trunc,floor,ceil,mask_pm,uninitialized,any" (const_string "any")) +;; Define attribute to classify add/sub insns that consumes carry flag (CF) +(define_attr "use_carry" "0,1" (const_string "0")) + +;; Define attribute to indicate unaligned ssemov insns +(define_attr "movu" "0,1" (const_string "0")) + ;; Describe a user's asm statement. 
(define_asm_attributes [(set_attr "length" "128") @@ -708,6 +714,7 @@ (include "k6.md") (include "athlon.md") (include "geode.md") +(include "atom.md") ;; Operand and operator predicates and constraints @@ -5775,6 +5782,7 @@ "TARGET_64BIT && ix86_binary_operator_ok (PLUS, DImode, operands)" "adc{q}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "mode" "DI")]) @@ -5849,6 +5857,7 @@ "ix86_binary_operator_ok (PLUS, QImode, operands)" "adc{b}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "mode" "QI")]) @@ -5861,6 +5870,7 @@ "ix86_binary_operator_ok (PLUS, HImode, operands)" "adc{w}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "mode" "HI")]) @@ -5873,6 +5883,7 @@ "ix86_binary_operator_ok (PLUS, SImode, operands)" "adc{l}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "mode" "SI")]) @@ -5886,6 +5897,7 @@ "TARGET_64BIT && ix86_binary_operator_ok (PLUS, SImode, operands)" "adc{l}\t{%2, %k0|%k0, %2}" [(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "mode" "SI")]) @@ -6115,9 +6127,9 @@ (set_attr "mode" "SI")]) (define_insn "*adddi_1_rex64" - [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r") - (plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0,r") - (match_operand:DI 2 "x86_64_general_operand" "rme,re,le"))) + [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r") + (plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0,0,r,r") + (match_operand:DI 2 "x86_64_general_operand" "rme,re,0,le"))) (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT && ix86_binary_operator_ok (PLUS, DImode, operands)" { @@ -6138,6 +6150,10 @@ } default: + /* Use add as much as possible to replace lea for AGU optimization. */ + if (which_alternative == 2 && TARGET_OPT_AGU) + return "add{q}\t{%1, %0|%0, %1}"; + gcc_assert (rtx_equal_p (operands[0], operands[1])); /* Make things pretty and `subl $4,%eax' rather than `addl $-4, %eax'. @@ -6156,8 +6172,11 @@ } } [(set (attr "type") - (cond [(eq_attr "alternative" "2") + (cond [(and (eq_attr "alternative" "2") + (eq (symbol_ref "TARGET_OPT_AGU") (const_int 0))) (const_string "lea") + (eq_attr "alternative" "3") + (const_string "lea") ; Current assemblers are broken and do not allow @GOTOFF in ; ought but a memory context. (match_operand:DI 2 "pic_symbolic_operand" "") @@ -6174,8 +6193,8 @@ (plus:DI (match_operand:DI 1 "register_operand" "") (match_operand:DI 2 "x86_64_nonmemory_operand" ""))) (clobber (reg:CC FLAGS_REG))] - "TARGET_64BIT && reload_completed - && true_regnum (operands[0]) != true_regnum (operands[1])" + "TARGET_64BIT && reload_completed + && ix86_lea_for_add_ok (PLUS, insn, operands)" [(set (match_dup 0) (plus:DI (match_dup 1) (match_dup 2)))] @@ -6379,9 +6398,9 @@ (define_insn "*addsi_1" - [(set (match_operand:SI 0 "nonimmediate_operand" "=r,rm,r") - (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,r") - (match_operand:SI 2 "general_operand" "g,ri,li"))) + [(set (match_operand:SI 0 "nonimmediate_operand" "=r,rm,r,r") + (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,0,r,r") + (match_operand:SI 2 "general_operand" "g,ri,0,li"))) (clobber (reg:CC FLAGS_REG))] "ix86_binary_operator_ok (PLUS, SImode, operands)" { @@ -6402,6 +6421,10 @@ } default: + /* Use add as much as possible to replace lea for AGU optimization. 
*/ + if (which_alternative == 2 && TARGET_OPT_AGU) + return "add{l}\t{%1, %0|%0, %1}"; + gcc_assert (rtx_equal_p (operands[0], operands[1])); /* Make things pretty and `subl $4,%eax' rather than `addl $-4, %eax'. @@ -6418,7 +6441,10 @@ } } [(set (attr "type") - (cond [(eq_attr "alternative" "2") + (cond [(and (eq_attr "alternative" "2") + (eq (symbol_ref "TARGET_OPT_AGU") (const_int 0))) + (const_string "lea") + (eq_attr "alternative" "3") (const_string "lea") ; Current assemblers are broken and do not allow @GOTOFF in ; ought but a memory context. @@ -6436,8 +6462,7 @@ (plus (match_operand 1 "register_operand" "") (match_operand 2 "nonmemory_operand" ""))) (clobber (reg:CC FLAGS_REG))] - "reload_completed - && true_regnum (operands[0]) != true_regnum (operands[1])" + "reload_completed && ix86_lea_for_add_ok (PLUS, insn, operands)" [(const_int 0)] { rtx pat; @@ -7538,6 +7563,7 @@ "TARGET_64BIT && ix86_binary_operator_ok (MINUS, DImode, operands)" "sbb{q}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "mode" "DI")]) @@ -7586,6 +7612,7 @@ "ix86_binary_operator_ok (MINUS, QImode, operands)" "sbb{b}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "mode" "QI")]) @@ -7598,6 +7625,7 @@ "ix86_binary_operator_ok (MINUS, HImode, operands)" "sbb{w}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "mode" "HI")]) @@ -7610,6 +7638,7 @@ "ix86_binary_operator_ok (MINUS, SImode, operands)" "sbb{l}\t{%2, %0|%0, %2}" [(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "mode" "SI")]) @@ -15223,6 +15252,7 @@ "reload_completed" "ret" [(set_attr "length" "1") + (set_attr "atom_unit" "jeu") (set_attr "length_immediate" "0") (set_attr "modrm" "0")]) @@ -15235,6 +15265,7 @@ "reload_completed" "rep\;ret" [(set_attr "length" "1") + (set_attr "atom_unit" "jeu") (set_attr "length_immediate" "0") (set_attr "prefix_rep" "1") (set_attr "modrm" "0")]) @@ -15245,6 +15276,7 @@ "reload_completed" "ret\t%0" [(set_attr "length" "3") + (set_attr "atom_unit" "jeu") (set_attr "length_immediate" "2") (set_attr "modrm" "0")]) @@ -16366,6 +16398,7 @@ "TARGET_SSE_MATH" "%vrcpss\t{%1, %d0|%d0, %1}" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "rcp") (set_attr "prefix" "maybe_vex") (set_attr "mode" "SF")]) @@ -16717,6 +16750,7 @@ "TARGET_SSE_MATH" "%vrsqrtss\t{%1, %d0|%d0, %1}" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "rcp") (set_attr "prefix" "maybe_vex") (set_attr "mode" "SF")]) @@ -16737,6 +16771,7 @@ "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" "%vsqrts\t{%1, %d0|%d0, %1}" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "sqrt") (set_attr "prefix" "maybe_vex") (set_attr "mode" "") (set_attr "athlon_decode" "*") @@ -19732,6 +19767,7 @@ ; Since we don't have the proper number of operands for an alu insn, ; fill in all the blanks. [(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "memory" "none") (set_attr "imm_disp" "false") @@ -19747,6 +19783,7 @@ "" "sbb{q}\t%0, %0" [(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "memory" "none") (set_attr "imm_disp" "false") @@ -19790,6 +19827,7 @@ ; Since we don't have the proper number of operands for an alu insn, ; fill in all the blanks. 
[(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "memory" "none") (set_attr "imm_disp" "false") @@ -19805,6 +19843,7 @@ "" "sbb{l}\t%0, %0" [(set_attr "type" "alu") + (set_attr "use_carry" "1") (set_attr "pent_pair" "pu") (set_attr "memory" "none") (set_attr "imm_disp" "false") @@ -20137,7 +20176,8 @@ } } [(set (attr "type") - (cond [(eq_attr "alternative" "0") + (cond [(and (eq_attr "alternative" "0") + (eq (symbol_ref "TARGET_OPT_AGU") (const_int 0))) (const_string "alu") (match_operand:SI 2 "const0_operand" "") (const_string "imov") @@ -20180,7 +20220,8 @@ } } [(set (attr "type") - (cond [(eq_attr "alternative" "0") + (cond [(and (eq_attr "alternative" "0") + (eq (symbol_ref "TARGET_OPT_AGU") (const_int 0))) (const_string "alu") (match_operand:DI 2 "const0_operand" "") (const_string "imov") @@ -21672,6 +21713,7 @@ return patterns[locality]; } [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "prefetch") (set_attr "memory" "none")]) (define_insn "*prefetch_sse_rex" @@ -21690,6 +21732,7 @@ return patterns[locality]; } [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "prefetch") (set_attr "memory" "none")]) (define_insn "*prefetch_3dnow" Index: gcc/config/i386/atom.md =================================================================== --- gcc/config/i386/atom.md (.../trunk) (revision 0) +++ gcc/config/i386/atom.md (.../branches/ix86/atom) (revision 144446) @@ -0,0 +1,796 @@ +;; Atom Scheduling +;; Copyright (C) 2009 Free Software Foundation, Inc. +;; +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 2, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING. If not, write to +;; the Free Software Foundation, 51 Franklin Street, Fifth Floor, +;; Boston, MA 02110-1301, USA. */ +;; +;; Atom is an in-order core with two integer pipelines. + + +(define_attr "atom_unit" "sishuf,simul,jeu,complex,other" + (const_string "other")) + +(define_attr "atom_sse_attr" "rcp,movdup,lfence,fence,prefetch,sqrt,mxcsr,other" + (const_string "other")) + +(define_automaton "atom") + +;; Atom has two ports: port 0 and port 1 connecting to all execution units +(define_cpu_unit "atom-port-0,atom-port-1" "atom") + +;; EU: Execution Unit +;; Atom EUs are connected by port 0 or port 1. + +(define_cpu_unit "atom-eu-0, atom-eu-1, + atom-imul-1, atom-imul-2, atom-imul-3, atom-imul-4" + "atom") + +;; Some EUs have duplicated copied and can be accessed via either +;; port 0 or port 1 +;; (define_reservation "atom-port-either" "(atom-port-0 | atom-port-1)") + +;;; Some instructions is dual-pipe execution, need both ports +;;; Complex multi-op macro-instructoins need both ports and all EUs +(define_reservation "atom-port-dual" "(atom-port-0 + atom-port-1)") +(define_reservation "atom-all-eu" "(atom-eu-0 + atom-eu-1 + + atom-imul-1 + atom-imul-2 + atom-imul-3 + + atom-imul-4)") + +;;; Most of simple instructions have 1 cycle latency. Some of them +;;; issue in port 0, some in port 0 and some in either port. 
+(define_reservation "atom-simple-0" "(atom-port-0 + atom-eu-0)") +(define_reservation "atom-simple-1" "(atom-port-1 + atom-eu-1)") +(define_reservation "atom-simple-either" "(atom-simple-0 | atom-simple-1)") + +;;; Some insn issues in port 0 with 3 cycle latency and 1 cycle tput +(define_reservation "atom-eu-0-3-1" "(atom-port-0 + atom-eu-0, nothing*2)") + +;;; fmul insn can have 4 or 5 cycles latency +(define_reservation "atom-fmul-5c" "(atom-port-0 + atom-eu-0), nothing*4") +(define_reservation "atom-fmul-4c" "(atom-port-0 + atom-eu-0), nothing*3") + +;;; fadd can has 5 cycles latency depends on instruction forms +(define_reservation "atom-fadd-5c" "(atom-port-1 + atom-eu-1), nothing*5") + +;;; imul insn has 5 cycles latency +(define_reservation "atom-imul-32" + "atom-imul-1, atom-imul-2, atom-imul-3, atom-imul-4, + atom-port-0") +;;; imul instruction excludes other non-FP instructions. +(exclusion_set "atom-eu-0, atom-eu-1" + "atom-imul-1, atom-imul-2, atom-imul-3, atom-imul-4") + +;;; dual-execution instructions can have 1,2,4,5 cycles latency depends on +;;; instruction forms +(define_reservation "atom-dual-1c" "(atom-port-dual + atom-eu-0 + atom-eu-1)") +(define_reservation "atom-dual-2c" + "(atom-port-dual + atom-eu-0 + atom-eu-1, nothing)") +(define_reservation "atom-dual-5c" + "(atom-port-dual + atom-eu-0 + atom-eu-1, nothing*4)") + +;;; Complex macro-instruction has variants of latency, and uses both ports. +(define_reservation "atom-complex" "(atom-port-dual + atom-all-eu)") + +(define_insn_reservation "atom_other" 9 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "other") + (eq_attr "atom_unit" "!jeu"))) + "atom-complex, atom-all-eu*8") + +;; return has type "other" with atom_unit "jeu" +(define_insn_reservation "atom_other_2" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "other") + (eq_attr "atom_unit" "jeu"))) + "atom-dual-1c") + +(define_insn_reservation "atom_multi" 9 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "multi")) + "atom-complex, atom-all-eu*8") + +;; Normal alu insns without carry +(define_insn_reservation "atom_alu" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "alu") + (and (eq_attr "memory" "none") + (eq_attr "use_carry" "0")))) + "atom-simple-either") + +;; Normal alu insns without carry +(define_insn_reservation "atom_alu_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "alu") + (and (eq_attr "memory" "!none") + (eq_attr "use_carry" "0")))) + "atom-simple-either") + +;; Alu insn consuming CF, such as add/sbb +(define_insn_reservation "atom_alu_carry" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "alu") + (and (eq_attr "memory" "none") + (eq_attr "use_carry" "1")))) + "atom-simple-either") + +;; Alu insn consuming CF, such as add/sbb +(define_insn_reservation "atom_alu_carry_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "alu") + (and (eq_attr "memory" "!none") + (eq_attr "use_carry" "1")))) + "atom-simple-either") + +(define_insn_reservation "atom_alu1" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "alu1") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_alu1_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "alu1") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +(define_insn_reservation "atom_negnot" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "negnot") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_negnot_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "negnot") + 
(eq_attr "memory" "!none"))) + "atom-simple-either") + +(define_insn_reservation "atom_imov" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imov") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_imov_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imov") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +;; 16<-16, 32<-32 +(define_insn_reservation "atom_imovx" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imovx") + (and (eq_attr "memory" "none") + (ior (and (match_operand:HI 0 "register_operand") + (match_operand:HI 1 "general_operand")) + (and (match_operand:SI 0 "register_operand") + (match_operand:SI 1 "general_operand")))))) + "atom-simple-either") + +;; 16<-16, 32<-32, mem +(define_insn_reservation "atom_imovx_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imovx") + (and (eq_attr "memory" "!none") + (ior (and (match_operand:HI 0 "register_operand") + (match_operand:HI 1 "general_operand")) + (and (match_operand:SI 0 "register_operand") + (match_operand:SI 1 "general_operand")))))) + "atom-simple-either") + +;; 32<-16, 32<-8, 64<-16, 64<-8, 64<-32, 8<-8 +(define_insn_reservation "atom_imovx_2" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imovx") + (and (eq_attr "memory" "none") + (ior (match_operand:QI 0 "register_operand") + (ior (and (match_operand:SI 0 "register_operand") + (not (match_operand:SI 1 "general_operand"))) + (match_operand:DI 0 "register_operand")))))) + "atom-simple-0") + +;; 32<-16, 32<-8, 64<-16, 64<-8, 64<-32, 8<-8, mem +(define_insn_reservation "atom_imovx_2_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imovx") + (and (eq_attr "memory" "!none") + (ior (match_operand:QI 0 "register_operand") + (ior (and (match_operand:SI 0 "register_operand") + (not (match_operand:SI 1 "general_operand"))) + (match_operand:DI 0 "register_operand")))))) + "atom-simple-0") + +;; 16<-8 +(define_insn_reservation "atom_imovx_3" 3 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imovx") + (and (match_operand:HI 0 "register_operand") + (match_operand:QI 1 "general_operand")))) + "atom-complex, atom-all-eu*2") + +(define_insn_reservation "atom_lea" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "lea") + (eq_attr "mode" "!HI"))) + "atom-simple-either") + +;; lea 16bit address is complex insn +(define_insn_reservation "atom_lea_2" 2 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "lea") + (eq_attr "mode" "HI"))) + "atom-complex, atom-all-eu") + +(define_insn_reservation "atom_incdec" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "incdec") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_incdec_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "incdec") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +;; simple shift instruction use SHIFT eu, none memory +(define_insn_reservation "atom_ishift" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ishift") + (and (eq_attr "memory" "none") (eq_attr "prefix_0f" "0")))) + "atom-simple-0") + +;; simple shift instruction use SHIFT eu, memory +(define_insn_reservation "atom_ishift_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ishift") + (and (eq_attr "memory" "!none") (eq_attr "prefix_0f" "0")))) + "atom-simple-0") + +;; DF shift (prefixed with 0f) is complex insn with latency of 7 cycles +(define_insn_reservation "atom_ishift_3" 7 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ishift") + (eq_attr "prefix_0f" "1"))) + 
"atom-complex, atom-all-eu*6") + +(define_insn_reservation "atom_ishift1" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ishift1") + (eq_attr "memory" "none"))) + "atom-simple-0") + +(define_insn_reservation "atom_ishift1_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ishift1") + (eq_attr "memory" "!none"))) + "atom-simple-0") + +(define_insn_reservation "atom_rotate" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "rotate") + (eq_attr "memory" "none"))) + "atom-simple-0") + +(define_insn_reservation "atom_rotate_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "rotate") + (eq_attr "memory" "!none"))) + "atom-simple-0") + +(define_insn_reservation "atom_rotate1" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "rotate1") + (eq_attr "memory" "none"))) + "atom-simple-0") + +(define_insn_reservation "atom_rotate1_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "rotate1") + (eq_attr "memory" "!none"))) + "atom-simple-0") + +(define_insn_reservation "atom_imul" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imul") + (and (eq_attr "memory" "none") (eq_attr "mode" "SI")))) + "atom-imul-32") + +(define_insn_reservation "atom_imul_mem" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imul") + (and (eq_attr "memory" "!none") (eq_attr "mode" "SI")))) + "atom-imul-32") + +;; latency set to 10 as common 64x64 imul +(define_insn_reservation "atom_imul_3" 10 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "imul") + (eq_attr "mode" "!SI"))) + "atom-complex, atom-all-eu*9") + +(define_insn_reservation "atom_idiv" 65 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "idiv")) + "atom-complex, atom-all-eu*32, nothing*32") + +(define_insn_reservation "atom_icmp" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "icmp") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_icmp_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "icmp") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +(define_insn_reservation "atom_test" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "test") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_test_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "test") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +(define_insn_reservation "atom_ibr" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ibr") + (eq_attr "memory" "!load"))) + "atom-simple-1") + +;; complex if jump target is from address +(define_insn_reservation "atom_ibr_2" 2 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ibr") + (eq_attr "memory" "load"))) + "atom-complex, atom-all-eu") + +(define_insn_reservation "atom_setcc" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "setcc") + (eq_attr "memory" "!store"))) + "atom-simple-either") + +;; 2 cycles complex if target is in memory +(define_insn_reservation "atom_setcc_2" 2 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "setcc") + (eq_attr "memory" "store"))) + "atom-complex, atom-all-eu") + +(define_insn_reservation "atom_icmov" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "icmov") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_icmov_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "icmov") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +;; UCODE if segreg, ignored +(define_insn_reservation "atom_push" 2 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "push")) + "atom-dual-2c") + +;; 
pop r64 is 1 cycle. UCODE if segreg, ignored +(define_insn_reservation "atom_pop" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "pop") + (eq_attr "mode" "DI"))) + "atom-dual-1c") + +;; pop non-r64 is 2 cycles. UCODE if segreg, ignored +(define_insn_reservation "atom_pop_2" 2 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "pop") + (eq_attr "mode" "!DI"))) + "atom-dual-2c") + +;; UCODE if segreg, ignored +(define_insn_reservation "atom_call" 1 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "call")) + "atom-dual-1c") + +(define_insn_reservation "atom_callv" 1 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "callv")) + "atom-dual-1c") + +(define_insn_reservation "atom_leave" 3 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "leave")) + "atom-complex, atom-all-eu*2") + +(define_insn_reservation "atom_str" 3 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "str")) + "atom-complex, atom-all-eu*2") + +(define_insn_reservation "atom_sselog" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sselog") + (eq_attr "memory" "none"))) + "atom-simple-either") + +(define_insn_reservation "atom_sselog_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sselog") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +(define_insn_reservation "atom_sselog1" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sselog1") + (eq_attr "memory" "none"))) + "atom-simple-0") + +(define_insn_reservation "atom_sselog1_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sselog1") + (eq_attr "memory" "!none"))) + "atom-simple-0") + +;; not pmad, not psad +(define_insn_reservation "atom_sseiadd" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseiadd") + (and (not (match_operand:V2DI 0 "register_operand")) + (and (eq_attr "atom_unit" "!simul") + (eq_attr "atom_unit" "!complex"))))) + "atom-simple-either") + +;; pmad, psad and 64 +(define_insn_reservation "atom_sseiadd_2" 4 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseiadd") + (and (not (match_operand:V2DI 0 "register_operand")) + (and (eq_attr "atom_unit" "simul" ) + (eq_attr "mode" "DI"))))) + "atom-fmul-4c") + +;; pmad, psad and 128 +(define_insn_reservation "atom_sseiadd_3" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseiadd") + (and (not (match_operand:V2DI 0 "register_operand")) + (and (eq_attr "atom_unit" "simul" ) + (eq_attr "mode" "TI"))))) + "atom-fmul-5c") + +;; if paddq(64 bit op), phadd/phsub +(define_insn_reservation "atom_sseiadd_4" 6 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseiadd") + (ior (match_operand:V2DI 0 "register_operand") + (eq_attr "atom_unit" "complex")))) + "atom-complex, atom-all-eu*5") + +;; if immediate op. 
+(define_insn_reservation "atom_sseishft" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseishft") + (and (eq_attr "atom_unit" "!sishuf") + (match_operand 2 "immediate_operand")))) + "atom-simple-either") + +;; if palignr or psrldq +(define_insn_reservation "atom_sseishft_2" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseishft") + (and (eq_attr "atom_unit" "sishuf") + (match_operand 2 "immediate_operand")))) + "atom-simple-0") + +;; if reg/mem op +(define_insn_reservation "atom_sseishft_3" 2 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseishft") + (not (match_operand 2 "immediate_operand")))) + "atom-complex, atom-all-eu") + +(define_insn_reservation "atom_sseimul" 1 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "sseimul")) + "atom-simple-0") + +;; rcpss or rsqrtss +(define_insn_reservation "atom_sse" 4 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sse") + (and (eq_attr "atom_sse_attr" "rcp") (eq_attr "mode" "SF")))) + "atom-fmul-4c") + +;; movshdup, movsldup. Suggest to type sseishft +(define_insn_reservation "atom_sse_2" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sse") + (eq_attr "atom_sse_attr" "movdup"))) + "atom-simple-0") + +;; lfence +(define_insn_reservation "atom_sse_3" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sse") + (eq_attr "atom_sse_attr" "lfence"))) + "atom-simple-either") + +;; sfence,clflush,mfence, prefetch +(define_insn_reservation "atom_sse_4" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sse") + (ior (eq_attr "atom_sse_attr" "fence") + (eq_attr "atom_sse_attr" "prefetch")))) + "atom-simple-0") + +;; rcpps, rsqrtss, sqrt, ldmxcsr +(define_insn_reservation "atom_sse_5" 7 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sse") + (ior (ior (eq_attr "atom_sse_attr" "sqrt") + (eq_attr "atom_sse_attr" "mxcsr")) + (and (eq_attr "atom_sse_attr" "rcp") + (eq_attr "mode" "V4SF"))))) + "atom-complex, atom-all-eu*6") + +;; xmm->xmm +(define_insn_reservation "atom_ssemov" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemov") + (and (match_operand 0 "register_operand" "xy") (match_operand 1 "register_operand" "xy")))) + "atom-simple-either") + +;; reg->xmm +(define_insn_reservation "atom_ssemov_2" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemov") + (and (match_operand 0 "register_operand" "xy") (match_operand 1 "register_operand" "r")))) + "atom-simple-0") + +;; xmm->reg +(define_insn_reservation "atom_ssemov_3" 3 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemov") + (and (match_operand 0 "register_operand" "r") (match_operand 1 "register_operand" "xy")))) + "atom-eu-0-3-1") + +;; mov mem +(define_insn_reservation "atom_ssemov_4" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemov") + (and (eq_attr "movu" "0") (eq_attr "memory" "!none")))) + "atom-simple-0") + +;; movu mem +(define_insn_reservation "atom_ssemov_5" 2 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemov") + (ior (eq_attr "movu" "1") (eq_attr "memory" "!none")))) + "atom-complex, atom-all-eu") + +;; no memory simple +(define_insn_reservation "atom_sseadd" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseadd") + (and (eq_attr "memory" "none") + (and (eq_attr "mode" "!V2DF") + (eq_attr "atom_unit" "!complex"))))) + "atom-fadd-5c") + +;; memory simple +(define_insn_reservation "atom_sseadd_mem" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseadd") + (and (eq_attr "memory" "!none") + (and (eq_attr "mode" "!V2DF") + (eq_attr "atom_unit" "!complex"))))) + 
"atom-dual-5c") + +;; maxps, minps, *pd, hadd, hsub +(define_insn_reservation "atom_sseadd_3" 8 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseadd") + (ior (eq_attr "mode" "V2DF") (eq_attr "atom_unit" "complex")))) + "atom-complex, atom-all-eu*7") + +;; Except dppd/dpps +(define_insn_reservation "atom_ssemul" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemul") + (eq_attr "mode" "!SF"))) + "atom-fmul-5c") + +;; Except dppd/dpps, 4 cycle if mulss +(define_insn_reservation "atom_ssemul_2" 4 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssemul") + (eq_attr "mode" "SF"))) + "atom-fmul-4c") + +(define_insn_reservation "atom_ssecmp" 1 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "ssecmp")) + "atom-simple-either") + +(define_insn_reservation "atom_ssecomi" 10 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "ssecomi")) + "atom-complex, atom-all-eu*9") + +;; no memory and cvtpi2ps, cvtps2pi, cvttps2pi +(define_insn_reservation "atom_ssecvt" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssecvt") + (ior (and (match_operand:V2SI 0 "register_operand") + (match_operand:V4SF 1 "register_operand")) + (and (match_operand:V4SF 0 "register_operand") + (match_operand:V2SI 1 "register_operand"))))) + "atom-fadd-5c") + +;; memory and cvtpi2ps, cvtps2pi, cvttps2pi +(define_insn_reservation "atom_ssecvt_2" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssecvt") + (ior (and (match_operand:V2SI 0 "register_operand") + (match_operand:V4SF 1 "memory_operand")) + (and (match_operand:V4SF 0 "register_operand") + (match_operand:V2SI 1 "memory_operand"))))) + "atom-dual-5c") + +;; otherwise. 7 cycles average for cvtss2sd +(define_insn_reservation "atom_ssecvt_3" 7 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "ssecvt") + (not (ior (and (match_operand:V2SI 0 "register_operand") + (match_operand:V4SF 1 "nonimmediate_operand")) + (and (match_operand:V4SF 0 "register_operand") + (match_operand:V2SI 1 "nonimmediate_operand")))))) + "atom-complex, atom-all-eu*6") + +;; memory and cvtsi2sd +(define_insn_reservation "atom_sseicvt" 5 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseicvt") + (and (match_operand:V2DF 0 "register_operand") + (match_operand:SI 1 "memory_operand")))) + "atom-dual-5c") + +;; otherwise. 
8 cycles average for cvtsd2si +(define_insn_reservation "atom_sseicvt_2" 8 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "sseicvt") + (not (and (match_operand:V2DF 0 "register_operand") + (match_operand:SI 1 "memory_operand"))))) + "atom-complex, atom-all-eu*7") + +(define_insn_reservation "atom_ssediv" 62 + (and (eq_attr "cpu" "atom") + (eq_attr "type" "ssediv")) + "atom-complex, atom-all-eu*12, nothing*49") + +;; simple for fmov +(define_insn_reservation "atom_fmov" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "none"))) + "atom-simple-either") + +;; simple for fmov +(define_insn_reservation "atom_fmov_mem" 1 + (and (eq_attr "cpu" "atom") + (and (eq_attr "type" "fmov") + (eq_attr "memory" "!none"))) + "atom-simple-either") + +;; Define bypass here + +;; There will be no stall from lea to non-mem EX insns +(define_bypass 0 "atom_lea" + "atom_alu_carry, + atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx, + atom_incdec, atom_setcc, atom_icmov, atom_pop") + +(define_bypass 0 "atom_lea" + "atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem, + atom_imovx_mem, atom_imovx_2_mem, + atom_imov_mem, atom_icmov_mem, atom_fmov_mem" + "!ix86_agi_dependent") + +;; There will be 3 cycles stall from EX insns to AGAN insns LEA +(define_bypass 4 "atom_alu_carry, + atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx, + atom_incdec,atom_ishift,atom_ishift1,atom_rotate, + atom_rotate1, atom_setcc, atom_icmov, atom_pop, + atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem, + atom_imovx_mem, atom_imovx_2_mem, + atom_imov_mem, atom_icmov_mem, atom_fmov_mem" + "atom_lea") + +;; There will be 3 cycles stall from EX insns to insns need addr calculation +(define_bypass 4 "atom_alu_carry, + atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx, + atom_incdec,atom_ishift,atom_ishift1,atom_rotate, + atom_rotate1, atom_setcc, atom_icmov, atom_pop, + atom_imovx_mem, atom_imovx_2_mem, + atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem, + atom_imov_mem, atom_icmov_mem, atom_fmov_mem" + "atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem, + atom_negnot_mem, atom_imov_mem, atom_incdec_mem, + atom_imovx_mem, atom_imovx_2_mem, + atom_imul_mem, atom_icmp_mem, + atom_test_mem, atom_icmov_mem, atom_sselog_mem, + atom_sselog1_mem, atom_fmov_mem, atom_sseadd_mem, + atom_ishift_mem, atom_ishift1_mem, + atom_rotate_mem, atom_rotate1_mem" + "ix86_agi_dependent") + +;; Stall from imul to lea is 8 cycles. +(define_bypass 9 "atom_imul, atom_imul_mem" "atom_lea") + +;; Stall from imul to memory address is 8 cycles. 
+(define_bypass 9 "atom_imul, atom_imul_mem" + "atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem, + atom_negnot_mem, atom_imov_mem, atom_incdec_mem, + atom_ishift_mem, atom_ishift1_mem, atom_rotate_mem, + atom_rotate1_mem, atom_imul_mem, atom_icmp_mem, + atom_test_mem, atom_icmov_mem, atom_sselog_mem, + atom_sselog1_mem, atom_fmov_mem, atom_sseadd_mem" + "ix86_agi_dependent") + +;; There will be 0 cycle stall from cmp/test to jcc + +;; There will be 1 cycle stall from flag producer to cmov and adc/sbb +(define_bypass 2 "atom_icmp, atom_test, atom_alu, atom_alu_carry, + atom_alu1, atom_negnot, atom_incdec, atom_ishift, + atom_ishift1, atom_rotate, atom_rotate1" + "atom_icmov, atom_alu_carry") + +;; lea to shift count stall is 2 cycles +(define_bypass 3 "atom_lea" + "atom_ishift, atom_ishift1, atom_rotate, atom_rotate1, + atom_ishift_mem, atom_ishift1_mem, + atom_rotate_mem, atom_rotate1_mem" + "ix86_dep_by_shift_count") + +;; lea to shift source stall is 1 cycle +(define_bypass 2 "atom_lea" + "atom_ishift, atom_ishift1, atom_rotate, atom_rotate1" + "!ix86_dep_by_shift_count") + +;; non-lea to shift count stall is 1 cycle +(define_bypass 2 "atom_alu_carry, + atom_alu,atom_alu1,atom_negnot,atom_imov,atom_imovx, + atom_incdec,atom_ishift,atom_ishift1,atom_rotate, + atom_rotate1, atom_setcc, atom_icmov, atom_pop, + atom_alu_mem, atom_alu_carry_mem, atom_alu1_mem, + atom_imovx_mem, atom_imovx_2_mem, + atom_imov_mem, atom_icmov_mem, atom_fmov_mem" + "atom_ishift, atom_ishift1, atom_rotate, atom_rotate1, + atom_ishift_mem, atom_ishift1_mem, + atom_rotate_mem, atom_rotate1_mem" + "ix86_dep_by_shift_count") Index: gcc/config/i386/sse.md =================================================================== --- gcc/config/i386/sse.md (.../trunk) (revision 144197) +++ gcc/config/i386/sse.md (.../branches/ix86/atom) (revision 144446) @@ -338,6 +338,7 @@ && !(MEM_P (operands[0]) && MEM_P (operands[1]))" "vmovup\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") + (set_attr "movu" "1") (set_attr "prefix" "vex") (set_attr "mode" "")]) @@ -363,6 +364,7 @@ && !(MEM_P (operands[0]) && MEM_P (operands[1]))" "movup\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") + (set_attr "movu" "1") (set_attr "mode" "")]) (define_insn "avx_movdqu" @@ -373,6 +375,7 @@ "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" "vmovdqu\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") + (set_attr "movu" "1") (set_attr "prefix" "vex") (set_attr "mode" "")]) @@ -383,6 +386,7 @@ "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))" "movdqu\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") + (set_attr "movu" "1") (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) @@ -424,7 +428,7 @@ UNSPEC_MOVNT))] "TARGET_SSE2" "movntdq\t{%1, %0|%0, %1}" - [(set_attr "type" "ssecvt") + [(set_attr "type" "ssemov") (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) @@ -434,7 +438,7 @@ UNSPEC_MOVNT))] "TARGET_SSE2" "movnti\t{%1, %0|%0, %1}" - [(set_attr "type" "ssecvt") + [(set_attr "type" "ssemov") (set_attr "mode" "V2DF")]) (define_insn "avx_lddqu" @@ -445,6 +449,7 @@ "TARGET_AVX" "vlddqu\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") + (set_attr "movu" "1") (set_attr "prefix" "vex") (set_attr "mode" "")]) @@ -454,7 +459,8 @@ UNSPEC_LDDQU))] "TARGET_SSE3" "lddqu\t{%1, %0|%0, %1}" - [(set_attr "type" "ssecvt") + [(set_attr "type" "ssemov") + (set_attr "movu" "1") (set_attr "prefix_rep" "1") (set_attr "mode" "TI")]) @@ -761,6 +767,7 @@ "TARGET_SSE" "%vrcpps\t{%1, %0|%0, %1}" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "rcp") (set_attr 
"prefix" "maybe_vex") (set_attr "mode" "V4SF")]) @@ -787,6 +794,7 @@ "TARGET_SSE" "rcpss\t{%1, %0|%0, %1}" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "rcp") (set_attr "mode" "SF")]) (define_expand "sqrtv8sf2" @@ -832,6 +840,7 @@ "TARGET_SSE" "%vsqrtps\t{%1, %0|%0, %1}" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "sqrt") (set_attr "prefix" "maybe_vex") (set_attr "mode" "V4SF")]) @@ -876,6 +885,7 @@ "SSE_VEC_FLOAT_MODE_P (mode)" "sqrts\t{%1, %0|%0, %1}" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "sqrt") (set_attr "mode" "")]) (define_expand "rsqrtv8sf2" @@ -1039,7 +1049,7 @@ (const_int 1)))] "SSE_VEC_FLOAT_MODE_P (mode)" "s\t{%2, %0|%0, %2}" - [(set_attr "type" "sse") + [(set_attr "type" "sseadd") (set_attr "mode" "")]) ;; These versions of the min/max patterns implement exactly the operations @@ -1175,6 +1185,7 @@ "TARGET_SSE3" "addsubpd\t{%2, %0|%0, %2}" [(set_attr "type" "sseadd") + (set_attr "atom_unit" "complex") (set_attr "mode" "V2DF")]) (define_insn "avx_hv4df3" @@ -1298,6 +1309,7 @@ "TARGET_SSE3" "hps\t{%2, %0|%0, %2}" [(set_attr "type" "sseadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_rep" "1") (set_attr "mode" "V4SF")]) @@ -5066,6 +5078,7 @@ "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)" "pmaddwd\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "simul") (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) @@ -7025,6 +7038,7 @@ movq\t{%H1, %0|%0, %H1} mov{q}\t{%H1, %0|%0, %H1}" [(set_attr "type" "ssemov,sseishft,ssemov,imov") + (set_attr "atom_unit" "*,sishuf,*,*") (set_attr "memory" "*,none,*,*") (set_attr "mode" "V2SF,TI,TI,DI")]) @@ -7057,6 +7071,7 @@ psrldq\t{$8, %0|%0, 8} movq\t{%H1, %0|%0, %H1}" [(set_attr "type" "ssemov,sseishft,ssemov") + (set_attr "atom_unit" "*,sishuf,*") (set_attr "memory" "*,none,*") (set_attr "mode" "V2SF,TI,TI")]) @@ -7614,6 +7629,7 @@ "TARGET_SSE2" "psadbw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "simul") (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) @@ -7635,7 +7651,7 @@ UNSPEC_MOVMSK))] "SSE_VEC_FLOAT_MODE_P (mode)" "%vmovmskp\t{%1, %0|%0, %1}" - [(set_attr "type" "ssecvt") + [(set_attr "type" "ssemov") (set_attr "prefix" "maybe_vex") (set_attr "mode" "")]) @@ -7645,7 +7661,7 @@ UNSPEC_MOVMSK))] "TARGET_SSE2" "%vpmovmskb\t{%1, %0|%0, %1}" - [(set_attr "type" "ssecvt") + [(set_attr "type" "ssemov") (set_attr "prefix_data16" "1") (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) @@ -7668,7 +7684,7 @@ "TARGET_SSE2 && !TARGET_64BIT" ;; @@@ check ordering of operands in intel/nonintel syntax "%vmaskmovdqu\t{%2, %1|%1, %2}" - [(set_attr "type" "ssecvt") + [(set_attr "type" "ssemov") (set_attr "prefix_data16" "1") (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) @@ -7682,7 +7698,7 @@ "TARGET_SSE2 && TARGET_64BIT" ;; @@@ check ordering of operands in intel/nonintel syntax "%vmaskmovdqu\t{%2, %1|%1, %2}" - [(set_attr "type" "ssecvt") + [(set_attr "type" "ssemov") (set_attr "prefix_data16" "1") (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) @@ -7693,6 +7709,7 @@ "TARGET_SSE" "%vldmxcsr\t%0" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "mxcsr") (set_attr "prefix" "maybe_vex") (set_attr "memory" "load")]) @@ -7702,6 +7719,7 @@ "TARGET_SSE" "%vstmxcsr\t%0" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "mxcsr") (set_attr "prefix" "maybe_vex") (set_attr "memory" "store")]) @@ -7720,6 +7738,7 @@ "TARGET_SSE || TARGET_3DNOW_A" "sfence" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "fence") 
(set_attr "memory" "unknown")]) (define_insn "sse2_clflush" @@ -7728,6 +7747,7 @@ "TARGET_SSE2" "clflush\t%a0" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "fence") (set_attr "memory" "unknown")]) (define_expand "sse2_mfence" @@ -7745,6 +7765,7 @@ "TARGET_64BIT || TARGET_SSE2" "mfence" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "fence") (set_attr "memory" "unknown")]) (define_expand "sse2_lfence" @@ -7762,6 +7783,7 @@ "TARGET_SSE2" "lfence" [(set_attr "type" "sse") + (set_attr "atom_sse_attr" "lfence") (set_attr "memory" "unknown")]) (define_insn "sse3_mwait" @@ -7885,6 +7907,7 @@ "TARGET_SSSE3" "phaddw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) @@ -7913,6 +7936,7 @@ "TARGET_SSSE3" "phaddw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) @@ -7967,6 +7991,7 @@ "TARGET_SSSE3" "phaddd\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) @@ -7987,6 +8012,7 @@ "TARGET_SSSE3" "phaddd\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) @@ -8073,6 +8099,7 @@ "TARGET_SSSE3" "phaddsw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) @@ -8101,6 +8128,7 @@ "TARGET_SSSE3" "phaddsw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) @@ -8187,6 +8215,7 @@ "TARGET_SSSE3" "phsubw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) @@ -8215,6 +8244,7 @@ "TARGET_SSSE3" "phsubw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) @@ -8269,6 +8299,7 @@ "TARGET_SSSE3" "phsubd\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) @@ -8289,6 +8320,7 @@ "TARGET_SSSE3" "phsubd\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) @@ -8375,6 +8407,7 @@ "TARGET_SSSE3" "phsubsw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) @@ -8403,6 +8436,7 @@ "TARGET_SSSE3" "phsubsw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "complex") (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) @@ -8509,6 +8543,7 @@ "TARGET_SSSE3" "pmaddubsw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "simul") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) @@ -8547,6 +8582,7 @@ "TARGET_SSSE3" "pmaddubsw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") + (set_attr "atom_unit" "simul") (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) @@ -8754,6 +8790,7 @@ return "palignr\t{%3, %2, %0|%0, %2, %3}"; } [(set_attr "type" "sseishft") + (set_attr "atom_unit" "sishuf") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") 
(set_attr "mode" "TI")]) @@ -8770,6 +8807,7 @@ return "palignr\t{%3, %2, %0|%0, %2, %3}"; } [(set_attr "type" "sseishft") + (set_attr "atom_unit" "sishuf") (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) @@ -8956,7 +8994,7 @@ UNSPEC_MOVNTDQA))] "TARGET_SSE4_1" "%vmovntdqa\t{%1, %0|%0, %1}" - [(set_attr "type" "ssecvt") + [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) Index: gcc/config/i386/i386-c.c =================================================================== --- gcc/config/i386/i386-c.c (.../trunk) (revision 144197) +++ gcc/config/i386/i386-c.c (.../branches/ix86/atom) (revision 144446) @@ -119,6 +119,10 @@ def_or_undef (parse_in, "__core2"); def_or_undef (parse_in, "__core2__"); break; + case PROCESSOR_ATOM: + def_or_undef (parse_in, "__atom"); + def_or_undef (parse_in, "__atom__"); + break; /* use PROCESSOR_max to not set/unset the arch macro. */ case PROCESSOR_max: break; @@ -187,6 +191,9 @@ case PROCESSOR_CORE2: def_or_undef (parse_in, "__tune_core2__"); break; + case PROCESSOR_ATOM: + def_or_undef (parse_in, "__tune_atom__"); + break; case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: break; Index: gcc/config/i386/i386-protos.h =================================================================== --- gcc/config/i386/i386-protos.h (.../trunk) (revision 144197) +++ gcc/config/i386/i386-protos.h (.../branches/ix86/atom) (revision 144446) @@ -85,6 +85,9 @@ extern void ix86_expand_binary_operator (enum rtx_code, enum machine_mode, rtx[]); extern int ix86_binary_operator_ok (enum rtx_code, enum machine_mode, rtx[]); +extern bool ix86_lea_for_add_ok (enum rtx_code, rtx, rtx[]); +extern bool ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn); +extern bool ix86_agi_dependent (rtx set_insn, rtx use_insn); extern void ix86_expand_unary_operator (enum rtx_code, enum machine_mode, rtx[]); extern rtx ix86_build_const_vector (enum machine_mode, bool, rtx); Index: gcc/config/i386/i386.c =================================================================== --- gcc/config/i386/i386.c (.../trunk) (revision 144197) +++ gcc/config/i386/i386.c (.../branches/ix86/atom) (revision 144446) @@ -1036,6 +1036,79 @@ 1, /* cond_not_taken_branch_cost. */ }; +static const +struct processor_costs atom_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (2)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (26), /* HI */ + COSTS_N_INSNS (42), /* SI */ + COSTS_N_INSNS (74), /* DI */ + COSTS_N_INSNS (74)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 17, /* MOVE_RATIO */ + 2, /* cost for loading QImode using movzbl */ + {4, 4, 4}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {4, 4, 4}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {12, 12, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {8, 8}, /* cost of loading MMX registers + in SImode and DImode */ + {8, 8}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {8, 8, 8}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {8, 8, 8}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 5, /* MMX or SSE register to integer */ + 32, /* size of l1 cache. */ + 256, /* size of l2 cache. */ + 64, /* size of prefetch block */ + 6, /* number of parallel prefetches */ + 3, /* Branch cost */ + COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (8), /* cost of FMUL instruction. */ + COSTS_N_INSNS (20), /* cost of FDIV instruction. */ + COSTS_N_INSNS (8), /* cost of FABS instruction. */ + COSTS_N_INSNS (8), /* cost of FCHS instruction. */ + COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{32, loop}, {64, rep_prefix_4_byte}, + {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {15, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{24, loop}, {32, unrolled_loop}, + {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 1, /* scalar_stmt_cost. */ + 1, /* scalar load_cost. */ + 1, /* scalar_store_cost. */ + 1, /* vec_stmt_cost. */ + 1, /* vec_to_scalar_cost. */ + 1, /* scalar_to_vec_cost. */ + 1, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 1, /* vec_store_cost. */ + 3, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + /* Generic64 should produce code tuned for Nocona and K8. */ static const struct processor_costs generic64_cost = { @@ -1194,6 +1267,7 @@ #define m_PENT4 (1<preds) + if (e->src == bb) + { + simple_loop = true; + break; + } + + if (simple_loop) + { + rtx prev = BB_END (bb); + while (prev + && prev != insn + && distance < LEA_SEARCH_THRESHOLD) + { + if (INSN_P (prev)) + { + distance++; + if ((reg_op1 && reg_set_p (reg_op1, prev)) + || (reg_op2 && reg_set_p (reg_op2, prev))) + return distance; + } + prev = PREV_INSN (prev); + } + } + } + + return -1; +} + +/* Return the distance between this insn and the next insn that uses + result of this insn as memory address. + Return -1 if not found such a use within LEA_SEARCH_THRESHOLD. 
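+   As an illustrative example (not part of the original comment): if INSN
+   is an lea that sets %eax and the second real insn after it loads
+   through %eax (say `movl (%eax), %edx'), the returned distance is 2;
+   if %eax is overwritten before any such use, or no use is found within
+   the threshold, the result is -1.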
*/
+static int
+distance_agu_use (rtx op0, rtx insn)
+{
+  basic_block bb = BLOCK_FOR_INSN (insn);
+  int distance = 0;
+
+  if (insn != BB_END (bb))
+    {
+      rtx next = NEXT_INSN (insn);
+
+      while (next && distance < LEA_SEARCH_THRESHOLD)
+        {
+          if (INSN_P (next))
+            {
+              distance++;
+              if (reg_mentioned_by_mem_p (op0, next))
+                return distance;
+              if (reg_set_p (op0, next))
+                return -1;
+            }
+          if (next == BB_END (bb))
+            break;
+          next = NEXT_INSN (next);
+        }
+    }
+
+  if (distance < LEA_SEARCH_THRESHOLD)
+    {
+      edge e;
+      edge_iterator ei;
+      bool simple_loop = false;
+
+      FOR_EACH_EDGE (e, ei, bb->succs)
+        if (e->dest == bb)
+          {
+            simple_loop = true;
+            break;
+          }
+
+      if (simple_loop)
+        {
+          rtx next = BB_HEAD (bb);
+          while (next && distance < LEA_SEARCH_THRESHOLD)
+            {
+              if (next == insn)
+                break;
+              if (INSN_P (next))
+                {
+                  distance++;
+                  if (reg_mentioned_by_mem_p (op0, next))
+                    return distance;
+                  if (reg_set_p (op0, next))
+                    return -1;
+                }
+              next = NEXT_INSN (next);
+            }
+        }
+    }
+
+  return -1;
+}
+
+/* Define this macro to tune LEA priority vs ADD; it takes effect when
+   there is a choice between the LEA and ADD forms.
+   Negative value: ADD is preferred over LEA
+   Zero: neutral
+   Positive value: LEA is preferred over ADD.  */
+#define IX86_LEA_PRIORITY 2
+
+/* Return true if it is ok to optimize an ADD operation to LEA
+   operation to avoid flag register consumption.  For processors
+   like ATOM, if the destination register of LEA holds an actual
+   address which will be used soon, LEA is better, and otherwise ADD
+   is better.  */
+
+bool
+ix86_lea_for_add_ok (enum rtx_code code ATTRIBUTE_UNUSED,
+                     rtx insn,
+                     rtx operands[])
+{
+  gcc_assert (REG_P (operands[0]));
+  gcc_assert (operands[1] && operands[2]);
+
+  if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
+    {
+      if (true_regnum (operands[0]) != true_regnum (operands[1]))
+        return true;
+      else
+        return false;
+    }
+
+  /* If a = b + c and a != b and a != c, the insn must use the lea form.  */
+  if (true_regnum (operands[0]) != true_regnum (operands[1])
+      && true_regnum (operands[0]) != true_regnum (operands[2]))
+    return true;
+  else
+    {
+      int dist_define, dist_use;
+      dist_define = distance_non_agu_define (operands[1],
+                                             operands[2], insn);
+      if (dist_define <= 0)
+        return true;
+
+      /* If this insn has both a backward non-AGU dependence and a forward
+         AGU dependence, the one with the shorter distance takes effect.  */
+      dist_use = distance_agu_use (operands[0], insn);
+      if (dist_use <= 0
+          || (dist_define + IX86_LEA_PRIORITY) < dist_use)
+        return false;
+
+      return true;
+    }
+}
+
+/* Return true if the destination register of SET_INSN is the shift
+   count of USE_INSN.
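+   As an illustrative example (not part of the original comment): when
+   SET_INSN is the lea `leal (%ebx,%ecx), %ecx' and USE_INSN is the shift
+   `sall %cl, %eax', the shift count comes from the destination of
+   SET_INSN, so true is returned.  This is the situation the atom.md
+   "lea to shift count" bypasses test for.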
*/ + +bool +ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn) +{ + rtx set_pattern = PATTERN (set_insn); + rtx set_dest; + rtx shift_rtx; + rtx use_pattern; + + /* Retrieve destination of set_insn */ + switch (GET_CODE (set_pattern)) + { + case SET: + set_dest = SET_DEST (set_pattern); + break; + case PARALLEL: + set_pattern = XVECEXP (set_pattern, 0, 0); + if (GET_CODE (set_pattern ) == SET) + { + set_dest = SET_DEST (set_pattern); + break; + } + default: + set_dest = NULL; + break; + } + if (!set_dest || !REG_P (set_dest)) + return false; + + /* Retrieve shift count of use_insn */ + use_pattern = PATTERN (use_insn); + switch (GET_CODE (use_pattern)) + { + case SET: + shift_rtx = XEXP (use_pattern, 1); + break; + case PARALLEL: + set_pattern = XVECEXP (use_pattern, 0, 0); + if (GET_CODE (set_pattern) == SET) + { + shift_rtx = XEXP (set_pattern, 1); + break; + } + default: + shift_rtx = NULL; + break; + } + + if (shift_rtx + && (GET_CODE (shift_rtx) == ASHIFT + || GET_CODE (shift_rtx) == LSHIFTRT + || GET_CODE (shift_rtx) == ASHIFTRT + || GET_CODE (shift_rtx) == ROTATE + || GET_CODE (shift_rtx) == ROTATERT)) + { + rtx shift_count = XEXP (shift_rtx, 1); + gcc_assert (shift_count); + + /* Return true if shift count is dest of set_insn */ + if (REG_P (shift_count) + && true_regnum (set_dest) == true_regnum (shift_count)) + return true; + } + + return false; +} + /* Return TRUE or FALSE depending on whether the unary operator meets the appropriate constraints. */ @@ -18943,6 +19291,7 @@ switch (ix86_tune) { case PROCESSOR_PENTIUM: + case PROCESSOR_ATOM: case PROCESSOR_K6: return 2; @@ -19009,41 +19358,21 @@ return 1; } -/* A subroutine of ix86_adjust_cost -- return true iff INSN has a memory - address with operands set by DEP_INSN. */ +/* Return true iff USE_INSN has a memory address with operands set by + SET_INSN. */ -static int -ix86_agi_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type) +bool +ix86_agi_dependent (rtx set_insn, rtx use_insn) { - rtx addr; - - if (insn_type == TYPE_LEA - && TARGET_PENTIUM) - { - addr = PATTERN (insn); - - if (GET_CODE (addr) == PARALLEL) - addr = XVECEXP (addr, 0, 0); - - gcc_assert (GET_CODE (addr) == SET); - - addr = SET_SRC (addr); - } - else - { - int i; - extract_insn_cached (insn); - for (i = recog_data.n_operands - 1; i >= 0; --i) - if (MEM_P (recog_data.operand[i])) - { - addr = XEXP (recog_data.operand[i], 0); - goto found; - } - return 0; - found:; - } - - return modified_in_p (addr, dep_insn); + int i; + extract_insn_cached (use_insn); + for (i = recog_data.n_operands - 1; i >= 0; --i) + if (MEM_P (recog_data.operand[i])) + { + rtx addr = XEXP (recog_data.operand[i], 0); + return modified_in_p (addr, set_insn) != 0; + } + return false; } static int @@ -19071,9 +19400,20 @@ { case PROCESSOR_PENTIUM: /* Address Generation Interlock adds a cycle of latency. */ - if (ix86_agi_dependent (insn, dep_insn, insn_type)) - cost += 1; + if (insn_type == TYPE_LEA) + { + rtx addr = PATTERN (insn); + if (GET_CODE (addr) == PARALLEL) + addr = XVECEXP (addr, 0, 0); + + gcc_assert (GET_CODE (addr) == SET); + + addr = SET_SRC (addr); + if (modified_in_p (addr, dep_insn)) + cost += 1; + } + /* ??? Compares pair with jump/setcc. */ if (ix86_flags_dependent (insn, dep_insn, insn_type)) cost = 0; @@ -19081,7 +19421,7 @@ /* Floating point stores require value to be ready one cycle earlier. 
*/ if (insn_type == TYPE_FMOV && get_attr_memory (insn) == MEMORY_STORE - && !ix86_agi_dependent (insn, dep_insn, insn_type)) + && !ix86_agi_dependent (dep_insn, insn)) cost += 1; break; @@ -19104,7 +19444,7 @@ in parallel with previous instruction in case previous instruction is not needed to compute the address. */ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependent (insn, dep_insn, insn_type)) + && !ix86_agi_dependent (dep_insn, insn)) { /* Claim moves to take one cycle, as core can issue one load at time and the next load can start cycle later. */ @@ -19133,7 +19473,7 @@ in parallel with previous instruction in case previous instruction is not needed to compute the address. */ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependent (insn, dep_insn, insn_type)) + && !ix86_agi_dependent (dep_insn, insn)) { /* Claim moves to take one cycle, as core can issue one load at time and the next load can start cycle later. */ @@ -19150,6 +19490,7 @@ case PROCESSOR_ATHLON: case PROCESSOR_K8: case PROCESSOR_AMDFAM10: + case PROCESSOR_ATOM: case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: memory = get_attr_memory (insn); @@ -19158,7 +19499,7 @@ in parallel with previous instruction in case previous instruction is not needed to compute the address. */ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependent (insn, dep_insn, insn_type)) + && !ix86_agi_dependent (dep_insn, insn)) { enum attr_unit unit = get_attr_unit (insn); int loadcost = 3;