Go to:
Gentoo Home
Documentation
Forums
Lists
Bugs
Planet
Store
Wiki
Get Gentoo!
Gentoo's Bugzilla – Attachment 902411 Details for
Bug 939335
<sys-devel/gcc-15: AMD Zen5 (znver5) tunings backport for GCC 14
Home
|
New
–
[Ex]
|
Browse
|
Search
|
Privacy Policy
|
[?]
|
Reports
|
Requests
|
Help
|
New Account
|
Log In
[x]
|
Forgot Password
Login:
[x]
[patch]
GCC 15 Zen5 tunings for GCC 14 (experimental backport)
backport-zen5-gcc14.patch (text/plain), 13.77 KB, created by
Adrien Dessemond
on 2024-09-08 16:41:13 UTC
(
hide
)
Description:
GCC 15 Zen5 tunings for GCC 14 (experimental backport)
Filename:
MIME Type:
Creator:
Adrien Dessemond
Created:
2024-09-08 16:41:13 UTC
Size:
13.77 KB
patch
obsolete
>diff '--color=auto' -aurd a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc >--- a/gcc/config/i386/i386.cc 2024-08-31 18:32:19.000000000 -0400 >+++ b/gcc/config/i386/i386.cc 2024-09-08 10:03:44.138531523 -0400 >@@ -24502,14 +24502,19 @@ > if (width == 1) > return 1; > >- /* Integer vector instructions execute in FP unit >+ /* Znver1-4 Integer vector instructions execute in FP unit > and can execute 3 additions and one multiplication per cycle. */ > if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2 >- || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4 >- || ix86_tune == PROCESSOR_ZNVER5) >+ || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4) > && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS) > return 1; > >+ /* Znver5 can do 2 integer multiplications per cycle with latency >+ of 3. */ >+ if (ix86_tune == PROCESSOR_ZNVER5 >+ && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS) >+ width = 6; >+ > /* Account for targets that splits wide vectors into multiple parts. 
*/ > if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256) > div = GET_MODE_BITSIZE (mode) / 256; >diff '--color=auto' -aurd a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h >--- a/gcc/config/i386/i386.h 2024-08-31 18:32:19.000000000 -0400 >+++ b/gcc/config/i386/i386.h 2024-09-08 09:48:54.521515174 -0400 >@@ -427,6 +427,8 @@ > ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS] > #define TARGET_FUSE_ALU_AND_BRANCH \ > ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH] >+#define TARGET_FUSE_MOV_AND_ALU \ >+ ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU] > #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU] > #define TARGET_AVOID_LEA_FOR_ADDR \ > ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR] >diff '--color=auto' -aurd a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h >--- a/gcc/config/i386/x86-tune-costs.h 2024-08-31 18:32:19.000000000 -0400 >+++ b/gcc/config/i386/x86-tune-costs.h 2024-09-08 11:51:50.178162215 -0400 >@@ -2034,6 +2034,7 @@ > COSTS_N_INSNS (1), /* cost of a lea instruction. */ > COSTS_N_INSNS (1), /* variable shift costs. */ > COSTS_N_INSNS (1), /* constant shift costs. */ >+ /* mul has latency 3, executes in 3 integer units. */ > {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */ > COSTS_N_INSNS (3), /* HI. */ > COSTS_N_INSNS (3), /* SI. */ >@@ -2041,6 +2042,9 @@ > COSTS_N_INSNS (3)}, /* other. */ > 0, /* cost of multiply per each bit > set. */ >+ >+ /* integer divide has latency of 8 cycles >+ plus 1 for every 9 bits of quotient. */ > {COSTS_N_INSNS (10), /* cost of a divide/mod for QI. */ > COSTS_N_INSNS (11), /* HI. */ > COSTS_N_INSNS (13), /* SI. */ >@@ -2048,7 +2052,7 @@ > COSTS_N_INSNS (16)}, /* other. */ > COSTS_N_INSNS (1), /* cost of movsx. */ > COSTS_N_INSNS (1), /* cost of movzx. */ >- 8, /* "large" insn. */ >+ 15, /* "large" insn. */ > 9, /* MOVE_RATIO. */ > 6, /* CLEAR_RATIO */ > {6, 6, 6}, /* cost of loading integer registers >@@ -2070,7 +2074,7 @@ > is 5 uops. 
*/ > 14, 10, /* Gather load static, per_elt. */ > 14, 20, /* Gather store static, per_elt. */ >- 32, /* size of l1 cache. */ >+ 48, /* size of l1 cache. */ > 1024, /* size of l2 cache. */ > 64, /* size of prefetch block. */ > /* New AMD processors never drop prefetches; if they cannot be performed >@@ -2080,6 +2084,8 @@ > time). */ > 100, /* number of parallel prefetches. */ > 3, /* Branch cost. */ >+ /* TODO x87 latencies are still based on znver4. >+ Probably not very important these days. */ > COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */ > COSTS_N_INSNS (7), /* cost of FMUL instruction. */ > /* Latency of fdiv is 8-15. */ >@@ -2088,28 +2094,38 @@ > COSTS_N_INSNS (1), /* cost of FCHS instruction. */ > /* Latency of fsqrt is 4-10. */ > COSTS_N_INSNS (25), /* cost of FSQRT instruction. */ >- >+ /* SSE instructions have typical throughput 4 and latency 1. */ > COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ >- COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */ >+ /* ADDSS has throughput 2 and latency 2 >+ (in some cases when source is another addition). */ >+ COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */ >+ /* MULSS has throughput 2 and latency 3. */ > COSTS_N_INSNS (3), /* cost of MULSS instruction. */ > COSTS_N_INSNS (3), /* cost of MULSD instruction. */ >+ /* FMA has throughput 2 and latency 4. */ > COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ > COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ >+ /* DIVSS has throughput 0.4 and latency 10. */ > COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ >- /* 9-13. */ >+ /* DIVSD has throughput 0.25 and latency 13. */ > COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ >+ /* SQRTSS has throughput 0.22 and latency 14. */ > COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */ >+ /* SQRTSD has throughput 0.13 and latency 20. */ > COSTS_N_INSNS (20), /* cost of SQRTSD instruction. */ >- /* Zen can execute 4 integer operations per cycle. 
FP operations >- take 3 cycles and it can execute 2 integer additions and 2 >- multiplications thus reassociation may make sense up to with of 6. >- SPEC2k6 bencharks suggests >- that 4 works better than 6 probably due to register pressure. >+ /* Zen5 can execute: >+ - integer ops: 6 per cycle, at most 3 multiplications. >+ latency 1 for additions, 3 for multiplications (pipelined) > >- Integer vector operations are taken by FP unit and execute 3 vector >- plus/minus operations per cycle but only one multiply. This is adjusted >- in ix86_reassociation_width. */ >- 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */ >+ Setting width of 9 for multiplication is probably excessive >+ for register pressure. >+ - fp ops: 2 additions per cycle, latency 2-3 >+ 2 multiplications per cycle, latency 3 >+ - vector integer ops: 4 additions, latency 1 >+ 2 multiplications, latency 4 >+ We increase width to 6 for multiplications >+ in ix86_reassociation_width. */ >+ 6, 6, 4, 6, /* reassoc int, fp, vec_int, vec_fp. */ > znver2_memcpy, > znver2_memset, > COSTS_N_INSNS (4), /* cond_taken_branch_cost. */ >diff '--color=auto' -aurd a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc >--- a/gcc/config/i386/x86-tune-sched.cc 2024-08-31 18:32:19.000000000 -0400 >+++ b/gcc/config/i386/x86-tune-sched.cc 2024-09-08 11:52:33.241167442 -0400 >@@ -69,7 +69,6 @@ > case PROCESSOR_ZNVER2: > case PROCESSOR_ZNVER3: > case PROCESSOR_ZNVER4: >- case PROCESSOR_ZNVER5: > case PROCESSOR_CORE2: > case PROCESSOR_NEHALEM: > case PROCESSOR_SANDYBRIDGE: >@@ -92,6 +91,13 @@ > return 5; > > case PROCESSOR_SAPPHIRERAPIDS: >+ /* For znver5 decoder can handle 4 or 8 instructions per cycle, >+ op cache 12 instruction/cycle, dispatch 8 instructions >+ integer rename 8 instructions and Fp 6 instructions. >+ >+ The scheduler, without understanding out of order nature of the CPU >+ is unlikely going to be able to fill all of these. 
*/ >+ case PROCESSOR_ZNVER5: > return 6; > > default: >@@ -434,7 +440,8 @@ > { > enum attr_unit unit = get_attr_unit (insn); > int loadcost; >- >+ /* TODO: On znver5 complex addressing modes have >+ greater latency. */ > if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) > loadcost = 4; > else >@@ -565,6 +572,61 @@ > return TARGET_FUSE_CMP_AND_BRANCH; > } > >+static bool >+ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu) >+{ >+ /* Validate mov: >+ - It should be reg-reg move with opcode 0x89 or 0x8B. */ >+ rtx set1 = PATTERN (mov); >+ if (GET_CODE (set1) != SET >+ || !GENERAL_REG_P (SET_SRC (set1)) >+ || !GENERAL_REG_P (SET_DEST (set1))) >+ return false; >+ rtx reg = SET_DEST (set1); >+ /* - it should have 0x89 or 0x8B opcode. */ >+ if (!INTEGRAL_MODE_P (GET_MODE (reg)) >+ || GET_MODE_SIZE (GET_MODE (reg)) < 2 >+ || GET_MODE_SIZE (GET_MODE (reg)) > 8) >+ return false; >+ /* Validate ALU. */ >+ if (GET_CODE (PATTERN (alu)) != PARALLEL) >+ return false; >+ rtx set2 = XVECEXP (PATTERN (alu), 0, 0); >+ if (GET_CODE (set2) != SET) >+ return false; >+ /* Match one of: >+ ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR >+ We also may add insn attribute to handle some of sporadic >+ case we output those with different RTX expressions. */ >+ >+ if (GET_CODE (SET_SRC (set2)) != PLUS >+ && GET_CODE (SET_SRC (set2)) != MINUS >+ && GET_CODE (SET_SRC (set2)) != XOR >+ && GET_CODE (SET_SRC (set2)) != AND >+ && GET_CODE (SET_SRC (set2)) != IOR >+ && GET_CODE (SET_SRC (set2)) != NOT >+ && GET_CODE (SET_SRC (set2)) != ASHIFT >+ && GET_CODE (SET_SRC (set2)) != ASHIFTRT >+ && GET_CODE (SET_SRC (set2)) != LSHIFTRT) >+ return false; >+ rtx op0 = XEXP (SET_SRC (set2), 0); >+ rtx op1 = GET_CODE (SET_SRC (set2)) != NOT ? XEXP (SET_SRC (set2), 1) : NULL; >+ /* One of operands should be register. 
*/ >+ if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg))) >+ std::swap (op0, op1); >+ if (!REG_P (op0) || REGNO (op1) != REGNO (reg)) >+ return false; >+ if (op1 >+ && !REG_P (op1) >+ && !x86_64_immediate_operand (op1, VOIDmode)) >+ return false; >+ /* Only one of the two parameters may be the move destination. */ >+ if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg)) >+ return false; >+ return true; >+} >+ >+ > /* Check whether current microarchitecture support macro fusion > for insn pair "CONDGEN + CONDJMP". Refer to > "Intel Architectures Optimization Reference Manual". */ >@@ -572,6 +634,9 @@ > bool > ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp) > { >+ if (TARGET_FUSE_MOV_AND_ALU >+ && ix86_fuse_mov_alu_p (condgen, condjmp)) >+ return true; > rtx src, dest; > enum rtx_code ccode; > rtx compare_set = NULL_RTX, test_if, cond; >diff '--color=auto' -aurd a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def >--- a/gcc/config/i386/x86-tune.def 2024-08-31 18:32:19.000000000 -0400 >+++ b/gcc/config/i386/x86-tune.def 2024-09-08 11:52:08.570164243 -0400 >@@ -143,10 +143,16 @@ > > /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional > jump instruction when the alu instruction produces the CCFLAG consumed by >- the conditional jump instruction. */ >+ the conditional jump instruction. >+ TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND. >+ There is also a limitation on the immediate and displacement supported. */ > DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", >- m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC) >+ m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5) > >+/* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is reg-reg mov >+ and the destination is used by alu. alu must be one of >+ ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR. 
*/ >+DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", m_ZNVER5) > > /*****************************************************************************/ > /* Function prologue, epilogue and function calling sequences. */ >@@ -483,49 +489,50 @@ > /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2 > elements. */ > DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts", >- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID >+ ~(m_ZNVER | m_CORE_HYBRID > | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS)) > > /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2 > elements. */ > DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts", >- ~(m_ZNVER4)) >+ ~(m_ZNVER4 | m_ZNVER5)) > > /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4 > elements. */ > DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts", >- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID >+ ~(m_ZNVER | m_CORE_HYBRID > | m_YONGFENG | m_CORE_ATOM | m_GENERIC | m_GDS)) > > /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4 > elements. */ > DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts", >- ~(m_ZNVER4)) >+ ~(m_ZNVER4 | m_ZNVER5)) > > /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more > elements. */ > DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts", >- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM >+ ~(m_ZNVER | m_CORE_HYBRID | m_CORE_ATOM > | m_YONGFENG | m_GENERIC | m_GDS)) > > /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more > elements. */ > DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts", >- ~(m_ZNVER4)) >+ ~(m_ZNVER4 | m_ZNVER5)) > > /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or > smaller FMA chain. 
*/ >-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 >+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER > | m_YONGFENG | m_GENERIC) > > /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or > smaller FMA chain. */ >-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | m_ZNVER3 | m_ZNVER4 >- | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC) >+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", >+ m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID >+ | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC) > > /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or > smaller FMA chain. */ >-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE) >+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5) > > /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd > for v2df vector reduction. */
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 939335
: 902411