Lines 768-784
static void init_f32_gemm_config(void) {
Link Here
|
768 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
768 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
769 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
769 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
770 |
assert(hardware_config != NULL); |
770 |
assert(hardware_config != NULL); |
771 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) { |
771 |
if (hardware_config->use_x86_fma3) { |
772 |
f32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast); |
|
|
773 |
f32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast); |
774 |
f32_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast); |
775 |
f32_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast); |
776 |
f32_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params; |
777 |
f32_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w; |
778 |
f32_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16__avx512f_u4_prfm; |
779 |
f32_gemm_config.mr = 7; |
780 |
f32_gemm_config.nr = 16; |
781 |
} else if (hardware_config->use_x86_fma3) { |
782 |
switch (cpuinfo_get_core(0)->uarch) { |
772 |
switch (cpuinfo_get_core(0)->uarch) { |
783 |
case cpuinfo_uarch_zen: |
773 |
case cpuinfo_uarch_zen: |
784 |
case cpuinfo_uarch_dhyana: |
774 |
case cpuinfo_uarch_dhyana: |
Lines 1246-1259
static void init_f32_qc4w_gemm_config(void) {
Link Here
|
1246 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
1236 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
1247 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1237 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1248 |
assert(hardware_config != NULL); |
1238 |
assert(hardware_config != NULL); |
1249 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { |
1239 |
if (hardware_config->use_x86_avx2) { |
1250 |
f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_1x32__avx512skx_broadcast); |
|
|
1251 |
f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_7x32__avx512skx_broadcast); |
1252 |
f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512_params; |
1253 |
f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_qc4w_gemm_goi_w; |
1254 |
f32_qc4w_gemm_config.mr = 7; |
1255 |
f32_qc4w_gemm_config.nr = 32; |
1256 |
} else if (hardware_config->use_x86_avx2) { |
1257 |
f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_1x16__avx2_broadcast); |
1240 |
f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_1x16__avx2_broadcast); |
1258 |
f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_3x16__avx2_broadcast); |
1241 |
f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_3x16__avx2_broadcast); |
1259 |
f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx_params; |
1242 |
f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx_params; |
Lines 1420-1434
static void init_f32_qc8w_gemm_config(void) {
Link Here
|
1420 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
1403 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
1421 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1404 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1422 |
assert(hardware_config != NULL); |
1405 |
assert(hardware_config != NULL); |
1423 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { |
1406 |
if (hardware_config->use_x86_avx2) { |
1424 |
f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_1x32__avx512skx_broadcast); |
|
|
1425 |
f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_7x32__avx512skx_broadcast); |
1426 |
f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params; |
1427 |
f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w; |
1428 |
f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x32__scalar_int_u2; |
1429 |
f32_qc8w_gemm_config.mr = 7; |
1430 |
f32_qc8w_gemm_config.nr = 32; |
1431 |
} else if (hardware_config->use_x86_avx2) { |
1432 |
f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_1x16__avx2_broadcast); |
1407 |
f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_1x16__avx2_broadcast); |
1433 |
f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_5x16__avx2_broadcast); |
1408 |
f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_5x16__avx2_broadcast); |
1434 |
f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avx_params; |
1409 |
f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avx_params; |
Lines 1631-1679
static void init_qd8_f16_qc4w_gemm_config(void) {
Link Here
|
1631 |
#elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE |
1606 |
#elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE |
1632 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1607 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1633 |
assert(hardware_config != NULL); |
1608 |
assert(hardware_config != NULL); |
1634 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnnigfni) { |
1609 |
if (hardware_config->use_x86_avx2) { |
1635 |
qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx512vnnigfni_prfm); |
|
|
1636 |
qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx512vnnigfni_prfm); |
1637 |
qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avxvnni_params; |
1638 |
qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w; |
1639 |
qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w; |
1640 |
qd8_f16_qc4w_gemm_config.mr = 7; |
1641 |
qd8_f16_qc4w_gemm_config.nr = 8; |
1642 |
qd8_f16_qc4w_gemm_config.log2_kr = 3; |
1643 |
qd8_f16_qc4w_gemm_config.planes = 2; |
1644 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) { |
1645 |
qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx512vnni_prfm); |
1646 |
qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx512vnni_prfm); |
1647 |
qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avxvnni_params; |
1648 |
qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w; |
1649 |
qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w; |
1650 |
qd8_f16_qc4w_gemm_config.mr = 7; |
1651 |
qd8_f16_qc4w_gemm_config.nr = 8; |
1652 |
qd8_f16_qc4w_gemm_config.log2_kr = 3; |
1653 |
qd8_f16_qc4w_gemm_config.planes = 2; |
1654 |
#if XNN_ENABLE_AVXVNNI |
1655 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) { |
1656 |
qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm); |
1657 |
qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm); |
1658 |
qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avxvnni_params; |
1659 |
qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w; |
1660 |
qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w; |
1661 |
qd8_f16_qc4w_gemm_config.mr = 5; |
1662 |
qd8_f16_qc4w_gemm_config.nr = 8; |
1663 |
qd8_f16_qc4w_gemm_config.log2_kr = 3; |
1664 |
qd8_f16_qc4w_gemm_config.planes = 2; |
1665 |
#endif |
1666 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { |
1667 |
qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx512skx); |
1668 |
qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx512skx); |
1669 |
qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avx_params; |
1670 |
qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w; |
1671 |
qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w; |
1672 |
qd8_f16_qc4w_gemm_config.mr = 5; |
1673 |
qd8_f16_qc4w_gemm_config.nr = 8; |
1674 |
qd8_f16_qc4w_gemm_config.log2_kr = 3; |
1675 |
qd8_f16_qc4w_gemm_config.planes = 2; |
1676 |
} else if (hardware_config->use_x86_avx2) { |
1677 |
qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx2); |
1610 |
qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx2); |
1678 |
qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__avx2); |
1611 |
qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__avx2); |
1679 |
qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avx_params; |
1612 |
qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avx_params; |
Lines 1776-1844
static void init_qd8_f32_qc4w_gemm_config(void) {
Link Here
|
1776 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
1709 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
1777 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1710 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
1778 |
assert(hardware_config != NULL); |
1711 |
assert(hardware_config != NULL); |
1779 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnnigfni) { |
1712 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) { |
1780 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni_prfm); |
|
|
1781 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnnigfni_prfm); |
1782 |
qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512vnni_params; |
1783 |
qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w; |
1784 |
qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w; |
1785 |
qd8_f32_qc4w_gemm_config.mr = 7; |
1786 |
qd8_f32_qc4w_gemm_config.nr = 16; |
1787 |
qd8_f32_qc4w_gemm_config.log2_kr = 3; |
1788 |
qd8_f32_qc4w_gemm_config.planes = 2; |
1789 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) { |
1790 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm); |
1791 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnni_prfm); |
1792 |
qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512vnni_params; |
1793 |
qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w; |
1794 |
qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w; |
1795 |
qd8_f32_qc4w_gemm_config.mr = 7; |
1796 |
qd8_f32_qc4w_gemm_config.nr = 16; |
1797 |
qd8_f32_qc4w_gemm_config.log2_kr = 3; |
1798 |
qd8_f32_qc4w_gemm_config.planes = 2; |
1799 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnnigfni) { |
1800 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx512vnnigfni_prfm); |
1801 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx512vnnigfni_prfm); |
1802 |
qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512vnni_params; |
1803 |
qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w; |
1804 |
qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w; |
1805 |
qd8_f32_qc4w_gemm_config.mr = 7; |
1806 |
qd8_f32_qc4w_gemm_config.nr = 8; |
1807 |
qd8_f32_qc4w_gemm_config.log2_kr = 3; |
1808 |
qd8_f32_qc4w_gemm_config.planes = 2; |
1809 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) { |
1810 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx512vnni_prfm); |
1811 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx512vnni_prfm); |
1812 |
qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512vnni_params; |
1813 |
qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w; |
1814 |
qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w; |
1815 |
qd8_f32_qc4w_gemm_config.mr = 7; |
1816 |
qd8_f32_qc4w_gemm_config.nr = 8; |
1817 |
qd8_f32_qc4w_gemm_config.log2_kr = 3; |
1818 |
qd8_f32_qc4w_gemm_config.planes = 2; |
1819 |
#if XNN_ENABLE_AVXVNNI |
1820 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) { |
1821 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm); |
1822 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm); |
1823 |
qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avxvnni_params; |
1824 |
qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w; |
1825 |
qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w; |
1826 |
qd8_f32_qc4w_gemm_config.mr = 5; |
1827 |
qd8_f32_qc4w_gemm_config.nr = 8; |
1828 |
qd8_f32_qc4w_gemm_config.log2_kr = 3; |
1829 |
qd8_f32_qc4w_gemm_config.planes = 2; |
1830 |
#endif |
1831 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { |
1832 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512skx_prfm); |
1833 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512skx_prfm); |
1834 |
qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params; |
1835 |
qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w; |
1836 |
qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w; |
1837 |
qd8_f32_qc4w_gemm_config.mr = 7; |
1838 |
qd8_f32_qc4w_gemm_config.nr = 16; |
1839 |
qd8_f32_qc4w_gemm_config.log2_kr = 3; |
1840 |
qd8_f32_qc4w_gemm_config.planes = 2; |
1841 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) { |
1842 |
// XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels |
1713 |
// XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels |
1843 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__xop_ld128); |
1714 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__xop_ld128); |
1844 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__xop_ld128); |
1715 |
qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__xop_ld128); |
Lines 2246-2288
static void init_qd8_f16_qc8w_gemm_config(void) {
Link Here
|
2246 |
#elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE |
2117 |
#elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE |
2247 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
2118 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
2248 |
assert(hardware_config != NULL); |
2119 |
assert(hardware_config != NULL); |
2249 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) { |
2120 |
if (hardware_config->use_x86_avx2) { |
2250 |
qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx512vnni_prfm); |
|
|
2251 |
qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x8c8__avx512vnni_prfm); |
2252 |
qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx512vnni_prfm); |
2253 |
qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_7x8c8__avx512vnni_prfm); |
2254 |
qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_avxvnni_params; |
2255 |
qd8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w; |
2256 |
qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w; |
2257 |
qd8_f16_qc8w_gemm_config.mr = 7; |
2258 |
qd8_f16_qc8w_gemm_config.nr = 8; |
2259 |
qd8_f16_qc8w_gemm_config.log2_kr = 3; |
2260 |
#if XNN_ENABLE_AVXVNNI |
2261 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) { |
2262 |
// AVX VNNI should be checked before AVX512SKX as it performs better with VNNI microkernels |
2263 |
qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm); |
2264 |
qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm); |
2265 |
qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avxvnni_prfm); |
2266 |
qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_5x8c8__avxvnni_prfm); |
2267 |
qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_avxvnni_params; |
2268 |
qd8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w; |
2269 |
qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w; |
2270 |
qd8_f16_qc8w_gemm_config.mr = 5; |
2271 |
qd8_f16_qc8w_gemm_config.nr = 8; |
2272 |
qd8_f16_qc8w_gemm_config.log2_kr = 3; |
2273 |
#endif |
2274 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { |
2275 |
qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx512skx); |
2276 |
qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avx512skx); |
2277 |
qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx512skx); |
2278 |
qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_5x8c8__avx512skx); |
2279 |
qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_avx_params; |
2280 |
qd8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w; |
2281 |
qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w; |
2282 |
qd8_f16_qc8w_gemm_config.mr = 5; |
2283 |
qd8_f16_qc8w_gemm_config.nr = 8; |
2284 |
qd8_f16_qc8w_gemm_config.log2_kr = 3; |
2285 |
} else if (hardware_config->use_x86_avx2) { |
2286 |
qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx2); |
2121 |
qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx2); |
2287 |
qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x8c8__avx2); |
2122 |
qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x8c8__avx2); |
2288 |
qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx2); |
2123 |
qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx2); |
Lines 2618-2671
static void init_qd8_f32_qc8w_gemm_config(void) {
Link Here
|
2618 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
2453 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
2619 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
2454 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
2620 |
assert(hardware_config != NULL); |
2455 |
assert(hardware_config != NULL); |
2621 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) { |
2456 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) { |
2622 |
qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm); |
|
|
2623 |
qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c8__avx512vnni_prfm); |
2624 |
qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__avx512vnni_prfm); |
2625 |
qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c8__avx512vnni_prfm); |
2626 |
qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avx512vnni_params; |
2627 |
qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w; |
2628 |
qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w; |
2629 |
qd8_f32_qc8w_gemm_config.mr = 7; |
2630 |
qd8_f32_qc8w_gemm_config.nr = 16; |
2631 |
qd8_f32_qc8w_gemm_config.log2_kr = 3; |
2632 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) { |
2633 |
qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx512vnni_prfm); |
2634 |
qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x8c8__avx512vnni_prfm); |
2635 |
qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx512vnni_prfm); |
2636 |
qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x8c8__avx512vnni_prfm); |
2637 |
qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avxvnni_params; |
2638 |
qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w; |
2639 |
qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w; |
2640 |
qd8_f32_qc8w_gemm_config.mr = 7; |
2641 |
qd8_f32_qc8w_gemm_config.nr = 8; |
2642 |
qd8_f32_qc8w_gemm_config.log2_kr = 3; |
2643 |
#if XNN_ENABLE_AVXVNNI |
2644 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) { |
2645 |
// AVX VNNI should be checked before AVX512SKX as it performs better with VNNI microkernels |
2646 |
qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm); |
2647 |
qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm); |
2648 |
qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avxvnni_prfm); |
2649 |
qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x8c8__avxvnni_prfm); |
2650 |
qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avxvnni_params; |
2651 |
qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w; |
2652 |
qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w; |
2653 |
qd8_f32_qc8w_gemm_config.mr = 5; |
2654 |
qd8_f32_qc8w_gemm_config.nr = 8; |
2655 |
qd8_f32_qc8w_gemm_config.log2_kr = 3; |
2656 |
#endif |
2657 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { |
2658 |
qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__avx512skx_prfm); |
2659 |
qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c8__avx512skx_prfm); |
2660 |
qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__avx512skx_prfm); |
2661 |
qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c8__avx512skx_prfm); |
2662 |
qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params; |
2663 |
qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w; |
2664 |
qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w; |
2665 |
qd8_f32_qc8w_gemm_config.mr = 7; |
2666 |
qd8_f32_qc8w_gemm_config.nr = 16; |
2667 |
qd8_f32_qc8w_gemm_config.log2_kr = 3; |
2668 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) { |
2669 |
// XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels |
2457 |
// XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels |
2670 |
qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__xop_ld64); |
2458 |
qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__xop_ld64); |
2671 |
qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__xop_ld64); |
2459 |
qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__xop_ld64); |
Lines 3393-3471
static void init_qs8_qc8w_gemm_config(void) {
Link Here
|
3393 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
3181 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
3394 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
3182 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
3395 |
assert(hardware_config != NULL); |
3183 |
assert(hardware_config != NULL); |
3396 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) { |
3184 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) { |
3397 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni_prfm); |
|
|
3398 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c8__avx512vnni_prfm); |
3399 |
qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni_prfm); |
3400 |
qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c8__avx512vnni_prfm); |
3401 |
qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512vnni_params; |
3402 |
qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w; |
3403 |
qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_goi_w; |
3404 |
qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w; |
3405 |
qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w; |
3406 |
qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w; |
3407 |
qs8_qc8w_gemm_config.mr = 7; |
3408 |
qs8_qc8w_gemm_config.nr = 16; |
3409 |
qs8_qc8w_gemm_config.log2_kr = 3; |
3410 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) { |
3411 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx512vnni_prfm); |
3412 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x8c8__avx512vnni_prfm); |
3413 |
qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx512vnni_prfm); |
3414 |
qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x8c8__avx512vnni_prfm); |
3415 |
qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avxvnni_params; |
3416 |
qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w; |
3417 |
qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_goi_w; |
3418 |
qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w; |
3419 |
qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w; |
3420 |
qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w; |
3421 |
qs8_qc8w_gemm_config.mr = 7; |
3422 |
qs8_qc8w_gemm_config.nr = 8; |
3423 |
qs8_qc8w_gemm_config.log2_kr = 3; |
3424 |
#if XNN_ENABLE_AVXVNNI |
3425 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) { |
3426 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm); |
3427 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm); |
3428 |
qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm); |
3429 |
qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm); |
3430 |
qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512vnni_params; |
3431 |
qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w; |
3432 |
qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_goi_w; |
3433 |
qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w; |
3434 |
qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w; |
3435 |
qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w; |
3436 |
qs8_qc8w_gemm_config.mr = 5; |
3437 |
qs8_qc8w_gemm_config.nr = 8; |
3438 |
qs8_qc8w_gemm_config.log2_kr = 3; |
3439 |
#endif |
3440 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { |
3441 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm); |
3442 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm); |
3443 |
qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm); |
3444 |
qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm); |
3445 |
qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params; |
3446 |
qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w; |
3447 |
qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w; |
3448 |
qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w; |
3449 |
qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w; |
3450 |
qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w; |
3451 |
qs8_qc8w_gemm_config.mr = 7; |
3452 |
qs8_qc8w_gemm_config.nr = 16; |
3453 |
qs8_qc8w_gemm_config.log2_kr = 3; |
3454 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { |
3455 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx512skx); |
3456 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__avx512skx); |
3457 |
qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx512skx); |
3458 |
qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__avx512skx); |
3459 |
qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params; |
3460 |
qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w; |
3461 |
qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w; |
3462 |
qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w; |
3463 |
qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w; |
3464 |
qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w; |
3465 |
qs8_qc8w_gemm_config.mr = 4; // TODO: upgrade to 5x8 prfm when supported |
3466 |
qs8_qc8w_gemm_config.nr = 8; |
3467 |
qs8_qc8w_gemm_config.log2_kr = 3; |
3468 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) { |
3469 |
// XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels |
3185 |
// XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels |
3470 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64); |
3186 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64); |
3471 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64); |
3187 |
qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64); |
Lines 3991-4019
static void init_qu8_gemm_config(void) {
Link Here
|
3991 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
3707 |
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
3992 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
3708 |
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
3993 |
assert(hardware_config != NULL); |
3709 |
assert(hardware_config != NULL); |
3994 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { |
3710 |
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) { |
3995 |
qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm); |
|
|
3996 |
qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm); |
3997 |
qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm); |
3998 |
qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm); |
3999 |
qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params; |
4000 |
qu8_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qu8_gemm_gio_w; |
4001 |
qu8_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qu8_gemm_goi_w; |
4002 |
qu8_gemm_config.mr = 7; |
4003 |
qu8_gemm_config.nr = 16; |
4004 |
qu8_gemm_config.log2_kr = 3; |
4005 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) { |
4006 |
qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx512skx); |
4007 |
qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_4x8c8__avx512skx); |
4008 |
qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx512skx); |
4009 |
qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_4x8c8__avx512skx); |
4010 |
qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params; |
4011 |
qu8_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qu8_gemm_gio_w; |
4012 |
qu8_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qu8_gemm_goi_w; |
4013 |
qu8_gemm_config.mr = 4; // TODO: upgrade to 5x8 prfm when supported |
4014 |
qu8_gemm_config.nr = 8; |
4015 |
qu8_gemm_config.log2_kr = 3; |
4016 |
} else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) { |
4017 |
// XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels |
3711 |
// XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels |
4018 |
qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64); |
3712 |
qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64); |
4019 |
qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64); |
3713 |
qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64); |