Attachment 902411 Details for Bug 939335 – GCC 15 Zen5 tunings for GCC 14 (experimental backport)

[patch] GCC 15 Zen5 tunings for GCC 14 (experimental backport)

backport-zen5-gcc14.patch (text/plain), 13.77 KB, created by Adrien Dessemond on 2024-09-08 16:41:13 UTC

(hide)

Description:

Filename:

MIME Type:

Creator: Adrien Dessemond

Created: 2024-09-08 16:41:13 UTC

Size: 13.77 KB

patch

obsolete

>diff '--color=auto' -aurd a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
>--- a/gcc/config/i386/i386.cc	2024-08-31 18:32:19.000000000 -0400
>+++ b/gcc/config/i386/i386.cc	2024-09-08 10:03:44.138531523 -0400
>@@ -24502,14 +24502,19 @@
>       if (width == 1)
> 	return 1;
> 
>-      /* Integer vector instructions execute in FP unit
>+      /* Znver1-4 Integer vector instructions execute in FP unit
> 	 and can execute 3 additions and one multiplication per cycle.  */
>       if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2
>-	   || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4
>-	   || ix86_tune == PROCESSOR_ZNVER5)
>+	    || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4)
>    	  && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
> 	return 1;
> 
>+     /* Znver5 can do 2 integer multiplications per cycle with latency
>+       of 3.  */
>+      if (ix86_tune == PROCESSOR_ZNVER5
>+         && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
>+        width = 6;
>+
>       /* Account for targets that splits wide vectors into multiple parts.  */
>       if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256)
> 	div = GET_MODE_BITSIZE (mode) / 256;
>diff '--color=auto' -aurd a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
>--- a/gcc/config/i386/i386.h	2024-08-31 18:32:19.000000000 -0400
>+++ b/gcc/config/i386/i386.h	2024-09-08 09:48:54.521515174 -0400
>@@ -427,6 +427,8 @@
> 	ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
> #define TARGET_FUSE_ALU_AND_BRANCH \
> 	ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
>+#define TARGET_FUSE_MOV_AND_ALU \
>+   ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]   
> #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
> #define TARGET_AVOID_LEA_FOR_ADDR \
> 	ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
>diff '--color=auto' -aurd a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
>--- a/gcc/config/i386/x86-tune-costs.h	2024-08-31 18:32:19.000000000 -0400
>+++ b/gcc/config/i386/x86-tune-costs.h	2024-09-08 11:51:50.178162215 -0400
>@@ -2034,6 +2034,7 @@
>   COSTS_N_INSNS (1),			/* cost of a lea instruction.  */
>   COSTS_N_INSNS (1),			/* variable shift costs.  */
>   COSTS_N_INSNS (1),			/* constant shift costs.  */
>+  /* mul has latency 3, executes in 3 integer units.  */
>   {COSTS_N_INSNS (3),			/* cost of starting multiply for QI.  */
>    COSTS_N_INSNS (3),			/* 				 HI.  */
>    COSTS_N_INSNS (3),			/*				 SI.  */
>@@ -2041,6 +2042,9 @@
>    COSTS_N_INSNS (3)},			/*			other.  */
>   0,					/* cost of multiply per each bit
> 					   set.  */
>+
>+ /* integer divide has latency of 8 cycles
>+    plus 1 for every 9 bits of quotient.  */
>   {COSTS_N_INSNS (10),			/* cost of a divide/mod for QI.  */
>    COSTS_N_INSNS (11),			/* 			    HI.  */
>    COSTS_N_INSNS (13),			/*			    SI.  */
>@@ -2048,7 +2052,7 @@
>    COSTS_N_INSNS (16)},			/*			    other.  */
>   COSTS_N_INSNS (1),			/* cost of movsx.  */
>   COSTS_N_INSNS (1),			/* cost of movzx.  */
>-  8,					/* "large" insn.  */
>+  15,					/* "large" insn.  */
>   9,					/* MOVE_RATIO.  */
>   6,					/* CLEAR_RATIO */
>   {6, 6, 6},				/* cost of loading integer registers
>@@ -2070,7 +2074,7 @@
>      is 5 uops.  */
>   14, 10,				/* Gather load static, per_elt.  */
>   14, 20,				/* Gather store static, per_elt.  */
>-  32,					/* size of l1 cache.  */
>+  48,					/* size of l1 cache.  */
>   1024,					/* size of l2 cache.  */
>   64,					/* size of prefetch block.  */
>   /* New AMD processors never drop prefetches; if they cannot be performed
>@@ -2080,6 +2084,8 @@
>      time).  */
>   100,					/* number of parallel prefetches.  */
>   3,					/* Branch cost.  */
>+  /* TODO x87 latencies are still based on znver4.
>+     Probably not very important these days.  */
>   COSTS_N_INSNS (7),			/* cost of FADD and FSUB insns.  */
>   COSTS_N_INSNS (7),			/* cost of FMUL instruction.  */
>   /* Latency of fdiv is 8-15.  */
>@@ -2088,28 +2094,38 @@
>   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
>   /* Latency of fsqrt is 4-10.  */
>   COSTS_N_INSNS (25),			/* cost of FSQRT instruction.  */
>-
>+  /* SSE instructions have typical throughput 4 and latency 1.  */
>   COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
>-  COSTS_N_INSNS (3),			/* cost of ADDSS/SD SUBSS/SD insns.  */
>+  /* ADDSS has throughput 2 and latency 2
>+     (in some cases when source is another addition).  */
>+  COSTS_N_INSNS (2),      /* cost of ADDSS/SD SUBSS/SD insns.  */
>+  /* MULSS has throughput 2 and latency 3.  */
>   COSTS_N_INSNS (3),			/* cost of MULSS instruction.  */
>   COSTS_N_INSNS (3),			/* cost of MULSD instruction.  */
>+  /* FMA has throughput 2 and latency 4.  */
>   COSTS_N_INSNS (4),			/* cost of FMA SS instruction.  */
>   COSTS_N_INSNS (4),			/* cost of FMA SD instruction.  */
>+  /* DIVSS has throughput 0.4 and latency 10.  */
>   COSTS_N_INSNS (10),			/* cost of DIVSS instruction.  */
>-  /* 9-13.  */
>+  /* DIVSD has throughput 0.25 and latency 13.  */
>   COSTS_N_INSNS (13),			/* cost of DIVSD instruction.  */
>+  /* SQRTSS has throughput 0.22 and latency 14.  */
>   COSTS_N_INSNS (14),			/* cost of SQRTSS instruction.  */
>+  /* SQRTSD has throughput 0.13 and latency 20.  */
>   COSTS_N_INSNS (20),			/* cost of SQRTSD instruction.  */
>-  /* Zen can execute 4 integer operations per cycle.  FP operations
>-     take 3 cycles and it can execute 2 integer additions and 2
>-     multiplications thus reassociation may make sense up to with of 6.
>-     SPEC2k6 bencharks suggests
>-     that 4 works better than 6 probably due to register pressure.
>+  /* Zen5 can execute:
>+      - integer ops: 6 per cycle, at most 3 multiplications.
>+       latency 1 for additions, 3 for multiplications (pipelined)
> 
>-     Integer vector operations are taken by FP unit and execute 3 vector
>-     plus/minus operations per cycle but only one multiply.  This is adjusted
>-     in ix86_reassociation_width.  */
>-  4, 4, 3, 6,				/* reassoc int, fp, vec_int, vec_fp.  */
>+       Setting width of 9 for multiplication is probably excessive
>+       for register pressure.
>+      - fp ops: 2 additions per cycle, latency 2-3
>+               2 multiplications per cycle, latency 3
>+      - vector integer ops: 4 additions, latency 1
>+                          2 multiplications, latency 4
>+       We increase width to 6 for multiplications
>+       in ix86_reassociation_width.  */
>+  6, 6, 4, 6,                          /* reassoc int, fp, vec_int, vec_fp.  */
>   znver2_memcpy,
>   znver2_memset,
>   COSTS_N_INSNS (4),			/* cond_taken_branch_cost.  */
>diff '--color=auto' -aurd a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc
>--- a/gcc/config/i386/x86-tune-sched.cc	2024-08-31 18:32:19.000000000 -0400
>+++ b/gcc/config/i386/x86-tune-sched.cc	2024-09-08 11:52:33.241167442 -0400
>@@ -69,7 +69,6 @@
>     case PROCESSOR_ZNVER2:
>     case PROCESSOR_ZNVER3:
>     case PROCESSOR_ZNVER4:
>-    case PROCESSOR_ZNVER5:
>     case PROCESSOR_CORE2:
>     case PROCESSOR_NEHALEM:
>     case PROCESSOR_SANDYBRIDGE:
>@@ -92,6 +91,13 @@
>       return 5;
> 
>     case PROCESSOR_SAPPHIRERAPIDS:
>+    /* For znver5 decoder can handle 4 or 8 instructions per cycle,
>+       op cache 12 instruction/cycle, dispatch 8 instructions
>+       integer rename 8 instructions and Fp 6 instructions.
>+
>+       The scheduler, without understanding out of order nature of the CPU
>+       is unlikely going to be able to fill all of these.  */
>+    case PROCESSOR_ZNVER5:
>       return 6;
> 
>     default:
>@@ -434,7 +440,8 @@
> 	{
> 	  enum attr_unit unit = get_attr_unit (insn);
> 	  int loadcost;
>-
>+    /* TODO: On znver5 complex addressing modes have
>+       greater latency.  */
> 	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
> 	    loadcost = 4;
> 	  else
>@@ -565,6 +572,61 @@
>   return TARGET_FUSE_CMP_AND_BRANCH;
> }
> 
>+/* Return true if MOV followed by ALU form a pair eligible for mov+alu
>+   fusion: MOV must be a reg-reg move between general registers and ALU
>+   an arithmetic, logic or shift insn that has MOV's destination as
>+   exactly one of its operands.  */
>+static bool
>+ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
>+{
>+  /* Validate mov:
>+      - It should be reg-reg move with opcode 0x89 or 0x8B.  */
>+  rtx set1 = PATTERN (mov);
>+  if (GET_CODE (set1) != SET
>+      || !GENERAL_REG_P (SET_SRC (set1))
>+      || !GENERAL_REG_P (SET_DEST (set1)))
>+    return false;
>+  rtx reg = SET_DEST (set1);
>+  /*  - it should have 0x89 or 0x8B opcode.  */
>+  if (!INTEGRAL_MODE_P (GET_MODE (reg))
>+      || GET_MODE_SIZE (GET_MODE (reg)) < 2
>+      || GET_MODE_SIZE (GET_MODE (reg)) > 8)
>+    return false;
>+  /* Validate ALU: it must be a PARALLEL starting with a SET.  */
>+  if (GET_CODE (PATTERN (alu)) != PARALLEL)
>+    return false;
>+  rtx set2 = XVECEXP (PATTERN (alu), 0, 0);
>+  if (GET_CODE (set2) != SET)
>+    return false;
>+  /* Match one of:
>+     ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR
>+     We also may add insn attribute to handle some of sporadic
>+     case we output those with different RTX expressions.  */
>+  enum rtx_code code = GET_CODE (SET_SRC (set2));
>+  if (code != PLUS && code != MINUS && code != XOR && code != AND
>+      && code != IOR && code != NOT && code != ASHIFT
>+      && code != ASHIFTRT && code != LSHIFTRT)
>+    return false;
>+  rtx op0 = XEXP (SET_SRC (set2), 0);
>+  /* NOT is unary, so it has no second operand.  */
>+  rtx op1 = code != NOT ? XEXP (SET_SRC (set2), 1) : NULL;
>+  /* Arrange for op0 to be the operand matching the move destination.  */
>+  if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg)))
>+    std::swap (op0, op1);
>+  /* Test op0 here: op1 may be NULL or a non-register.  */
>+  if (!REG_P (op0) || REGNO (op0) != REGNO (reg))
>+    return false;
>+  if (op1
>+      && !REG_P (op1)
>+      && !x86_64_immediate_operand (op1, VOIDmode))
>+    return false;
>+  /* Only one of two parameters must be move destination.  */
>+  if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg))
>+    return false;
>+  return true;
>+}
>+
>+
> /* Check whether current microarchitecture support macro fusion
>    for insn pair "CONDGEN + CONDJMP". Refer to
>    "Intel Architectures Optimization Reference Manual". */
>@@ -572,6 +634,9 @@
> bool
> ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
> {
>+  if (TARGET_FUSE_MOV_AND_ALU
>+      && ix86_fuse_mov_alu_p (condgen, condjmp))
>+    return true;
>   rtx src, dest;
>   enum rtx_code ccode;
>   rtx compare_set = NULL_RTX, test_if, cond;
>diff '--color=auto' -aurd a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
>--- a/gcc/config/i386/x86-tune.def	2024-08-31 18:32:19.000000000 -0400
>+++ b/gcc/config/i386/x86-tune.def	2024-09-08 11:52:08.570164243 -0400
>@@ -143,10 +143,16 @@
> 
> /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
>    jump instruction when the alu instruction produces the CCFLAG consumed by
>-   the conditional jump instruction. */
>+   the conditional jump instruction.
>+   TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND.
>+   There are also limits on the immediates and displacements supported.  */
> DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
>-		  m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC)
>+		  m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5)
> 
>+/* X86_TUNE_FUSE_MOV_AND_ALU: Fuse mov and alu in case mov is reg-reg mov
>+   and the destination is used by alu.  alu must be one of
>+   ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR.  */
>+DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", m_ZNVER5)
> 
> /*****************************************************************************/
> /* Function prologue, epilogue and function calling sequences.               */
>@@ -483,49 +489,50 @@
> /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
>    elements.  */
> DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
>-	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
>+	  ~(m_ZNVER | m_CORE_HYBRID
> 	    | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS))
> 
> /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
>    elements.  */
> DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
>-	  ~(m_ZNVER4))
>+	  ~(m_ZNVER4 | m_ZNVER5))
> 
> /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
>    elements.  */
> DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
>-	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
>+	  ~(m_ZNVER | m_CORE_HYBRID
> 	    | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS))
> 
> /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
>    elements.  */
> DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
>-	  ~(m_ZNVER4))
>+	  ~(m_ZNVER4  | m_ZNVER5))
> 
> /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
>    elements.  */
> DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
>-	  ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM
>+	  ~(m_ZNVER | m_CORE_HYBRID | m_CORE_ATOM
> 	    | m_YONGFENG | m_GENERIC | m_GDS))
> 
> /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
>    elements.  */
> DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
>-	  ~(m_ZNVER4))
>+	  ~(m_ZNVER4 | m_ZNVER5))
> 
> /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
>    smaller FMA chain.  */
>-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4
>+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER
>           | m_YONGFENG | m_GENERIC)
> 
> /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
>    smaller FMA chain.  */
>-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 | m_ZNVER4
>-	  | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
>+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
>+         m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID
>+         | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
> 
> /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
>    smaller FMA chain.  */
>-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
>+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
> 
> /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
>    for v2df vector reduction.  */

Actions: View | Diff

Attachments on bug 939335: 902411