Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 931623
Collapse All | Expand All

(-)a/skia/BUILD.gn (-22 lines)
Lines 766-797 if (current_cpu == "x64") { Link Here
766
    }
766
    }
767
    visibility = [ ":skcms" ]
767
    visibility = [ ":skcms" ]
768
  }
768
  }
769
  skia_source_set("skcms_TransformSkx") {
770
    sources = skcms_TransformSkx
771
    if (!is_win) {
772
      cflags = [
773
        "-w",
774
        "-mavx512f",
775
        "-mavx512dq",
776
        "-mavx512cd",
777
        "-mavx512bw",
778
        "-mavx512vl",
779
        "-std=c11",
780
      ]
781
    } else {
782
      cflags = [ "/arch:AVX512" ]
783
    }
784
    visibility = [ ":skcms" ]
785
  }
786
} else {
769
} else {
787
  skia_source_set("skcms_TransformHsw") {
770
  skia_source_set("skcms_TransformHsw") {
788
    sources = []
771
    sources = []
789
    visibility = [ ":skcms" ]
772
    visibility = [ ":skcms" ]
790
  }
773
  }
791
  skia_source_set("skcms_TransformSkx") {
792
    sources = []
793
    visibility = [ ":skcms" ]
794
  }
795
}
774
}
796
775
797
source_set("skcms_TransformBaseline_and_public") {
776
source_set("skcms_TransformBaseline_and_public") {
Lines 820-826 source_set("skcms") { Link Here
820
  deps = [
799
  deps = [
821
    ":skcms_TransformBaseline_and_public",
800
    ":skcms_TransformBaseline_and_public",
822
    ":skcms_TransformHsw",
801
    ":skcms_TransformHsw",
823
    ":skcms_TransformSkx",
824
  ]
802
  ]
825
  public =
803
  public =
826
      rebase_path(skcms_public_headers, ".", "//third_party/skia/modules/skcms")
804
      rebase_path(skcms_public_headers, ".", "//third_party/skia/modules/skcms")
(-)a/third_party/skia/modules/skcms/skcms.cc (-5 lines)
Lines 2783-2793 bool skcms_Transform(const void* src, Link Here
2783
    auto run = baseline::run_program;
2783
    auto run = baseline::run_program;
2784
    switch (cpu_type()) {
2784
    switch (cpu_type()) {
2785
        case CpuType::SKX:
2785
        case CpuType::SKX:
2786
            #if !defined(SKCMS_DISABLE_SKX)
2787
                run = skx::run_program;
2788
                break;
2789
            #endif
2790
2791
        case CpuType::HSW:
2786
        case CpuType::HSW:
2792
            #if !defined(SKCMS_DISABLE_HSW)
2787
            #if !defined(SKCMS_DISABLE_HSW)
2793
                run = hsw::run_program;
2788
                run = hsw::run_program;
(-)a/third_party/xnnpack/BUILD.gn (-359 lines)
Lines 61-73 config("xnnpack_config") { Link Here
61
if (current_cpu == "x64" || current_cpu == "x86") {
61
if (current_cpu == "x64" || current_cpu == "x86") {
62
  xnnpack_deps = [
62
  xnnpack_deps = [
63
    ":amalgam_avx-no-avx2-no-f16c-no-fma",
63
    ":amalgam_avx-no-avx2-no-f16c-no-fma",
64
    ":amalgam_avx2-avxvnni-f16c-fma",
65
    ":amalgam_avx512f",
66
    ":amalgam_f16c-fma-avx2",
64
    ":amalgam_f16c-fma-avx2",
67
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl",
68
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vbmi",
69
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni",
70
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni-gfni",
71
    ":amalgam_f16c-fma-no-avx2",
65
    ":amalgam_f16c-fma-no-avx2",
72
    ":amalgam_f16c-no-avx2-no-fma",
66
    ":amalgam_f16c-no-avx2-no-fma",
73
    ":amalgam_sse2-no-sse3",
67
    ":amalgam_sse2-no-sse3",
Lines 84-96 if (current_cpu == "x64" || current_cpu == "x86") { Link Here
84
78
85
  xnnpack_standalone_deps = [
79
  xnnpack_standalone_deps = [
86
    ":amalgam_avx-no-avx2-no-f16c-no-fma_standalone",
80
    ":amalgam_avx-no-avx2-no-f16c-no-fma_standalone",
87
    ":amalgam_avx2-avxvnni-f16c-fma_standalone",
88
    ":amalgam_avx512f_standalone",
89
    ":amalgam_f16c-fma-avx2_standalone",
81
    ":amalgam_f16c-fma-avx2_standalone",
90
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vbmi_standalone",
91
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni-gfni_standalone",
92
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni_standalone",
93
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl_standalone",
94
    ":amalgam_f16c-fma-no-avx2_standalone",
82
    ":amalgam_f16c-fma-no-avx2_standalone",
95
    ":amalgam_f16c-no-avx2-no-fma_standalone",
83
    ":amalgam_f16c-no-avx2-no-fma_standalone",
96
    ":amalgam_sse2-no-sse3_standalone",
84
    ":amalgam_sse2-no-sse3_standalone",
Lines 306-407 if (current_cpu == "x64" || current_cpu == "x86") { Link Here
306
    }
294
    }
307
  }
295
  }
308
296
309
  source_set("amalgam_avx2-avxvnni-f16c-fma") {
310
    cflags = [
311
      "-mavx2",
312
      "-mavxvnni",
313
      "-mf16c",
314
      "-mfma",
315
    ]
316
317
    sources = [ "src/src/amalgam/gen/avxvnni.c" ]
318
319
    configs -= [ "//build/config/compiler:chromium_code" ]
320
    configs += [ "//build/config/compiler:no_chromium_code" ]
321
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
322
323
    deps = [
324
      "//third_party/cpuinfo",
325
      "//third_party/fp16",
326
      "//third_party/fxdiv",
327
      "//third_party/pthreadpool",
328
    ]
329
330
    public_configs = [ ":xnnpack_config" ]
331
  }
332
333
  # This is a target that cannot depend on //base.
334
  source_set("amalgam_avx2-avxvnni-f16c-fma_standalone") {
335
    cflags = [
336
      "-mavx2",
337
      "-mavxvnni",
338
      "-mf16c",
339
      "-mfma",
340
    ]
341
342
    sources = [ "src/src/amalgam/gen/avxvnni.c" ]
343
344
    configs -= [ "//build/config/compiler:chromium_code" ]
345
    configs += [ "//build/config/compiler:no_chromium_code" ]
346
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
347
348
    deps = [
349
      "//third_party/cpuinfo",
350
      "//third_party/fp16",
351
      "//third_party/fxdiv",
352
      "//third_party/pthreadpool:pthreadpool_standalone",
353
    ]
354
355
    public_configs = [ ":xnnpack_config" ]
356
357
    if (!(is_android && use_order_profiling)) {
358
      assert_no_deps = [ "//base" ]
359
    }
360
  }
361
362
  source_set("amalgam_avx512f") {
363
    cflags = [ "-mavx512f" ]
364
365
    sources = [ "src/src/amalgam/gen/avx512f.c" ]
366
367
    configs -= [ "//build/config/compiler:chromium_code" ]
368
    configs += [ "//build/config/compiler:no_chromium_code" ]
369
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
370
371
    deps = [
372
      "//third_party/cpuinfo",
373
      "//third_party/fp16",
374
      "//third_party/fxdiv",
375
      "//third_party/pthreadpool",
376
    ]
377
378
    public_configs = [ ":xnnpack_config" ]
379
  }
380
381
  # This is a target that cannot depend on //base.
382
  source_set("amalgam_avx512f_standalone") {
383
    cflags = [ "-mavx512f" ]
384
385
    sources = [ "src/src/amalgam/gen/avx512f.c" ]
386
387
    configs -= [ "//build/config/compiler:chromium_code" ]
388
    configs += [ "//build/config/compiler:no_chromium_code" ]
389
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
390
391
    deps = [
392
      "//third_party/cpuinfo",
393
      "//third_party/fp16",
394
      "//third_party/fxdiv",
395
      "//third_party/pthreadpool:pthreadpool_standalone",
396
    ]
397
398
    public_configs = [ ":xnnpack_config" ]
399
400
    if (!(is_android && use_order_profiling)) {
401
      assert_no_deps = [ "//base" ]
402
    }
403
  }
404
405
  source_set("amalgam_f16c-fma-avx2") {
297
  source_set("amalgam_f16c-fma-avx2") {
406
    cflags = [
298
    cflags = [
407
      "-mavx2",
299
      "-mavx2",
Lines 453-709 if (current_cpu == "x64" || current_cpu == "x86") { Link Here
453
    }
345
    }
454
  }
346
  }
455
347
456
  source_set("amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl") {
457
    cflags = [
458
      "-mavx512bw",
459
      "-mavx512cd",
460
      "-mavx512dq",
461
      "-mavx512f",
462
      "-mavx512vl",
463
      "-mf16c",
464
      "-mfma",
465
    ]
466
467
    sources = [ "src/src/amalgam/gen/avx512skx.c" ]
468
469
    configs -= [ "//build/config/compiler:chromium_code" ]
470
    configs += [ "//build/config/compiler:no_chromium_code" ]
471
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
472
473
    deps = [
474
      "//third_party/cpuinfo",
475
      "//third_party/fp16",
476
      "//third_party/fxdiv",
477
      "//third_party/pthreadpool",
478
    ]
479
480
    public_configs = [ ":xnnpack_config" ]
481
  }
482
483
  # This is a target that cannot depend on //base.
484
  source_set(
485
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl_standalone") {
486
    cflags = [
487
      "-mavx512bw",
488
      "-mavx512cd",
489
      "-mavx512dq",
490
      "-mavx512f",
491
      "-mavx512vl",
492
      "-mf16c",
493
      "-mfma",
494
    ]
495
496
    sources = [ "src/src/amalgam/gen/avx512skx.c" ]
497
498
    configs -= [ "//build/config/compiler:chromium_code" ]
499
    configs += [ "//build/config/compiler:no_chromium_code" ]
500
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
501
502
    deps = [
503
      "//third_party/cpuinfo",
504
      "//third_party/fp16",
505
      "//third_party/fxdiv",
506
      "//third_party/pthreadpool:pthreadpool_standalone",
507
    ]
508
509
    public_configs = [ ":xnnpack_config" ]
510
511
    if (!(is_android && use_order_profiling)) {
512
      assert_no_deps = [ "//base" ]
513
    }
514
  }
515
516
  source_set(
517
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vbmi") {
518
    cflags = [
519
      "-mavx512bw",
520
      "-mavx512cd",
521
      "-mavx512dq",
522
      "-mavx512f",
523
      "-mavx512vbmi",
524
      "-mavx512vl",
525
      "-mf16c",
526
      "-mfma",
527
    ]
528
529
    sources = [ "src/src/amalgam/gen/avx512vbmi.c" ]
530
531
    configs -= [ "//build/config/compiler:chromium_code" ]
532
    configs += [ "//build/config/compiler:no_chromium_code" ]
533
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
534
535
    deps = [
536
      "//third_party/cpuinfo",
537
      "//third_party/fp16",
538
      "//third_party/fxdiv",
539
      "//third_party/pthreadpool",
540
    ]
541
542
    public_configs = [ ":xnnpack_config" ]
543
  }
544
545
  # This is a target that cannot depend on //base.
546
  source_set(
547
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vbmi_standalone") {
548
    cflags = [
549
      "-mavx512bw",
550
      "-mavx512cd",
551
      "-mavx512dq",
552
      "-mavx512f",
553
      "-mavx512vbmi",
554
      "-mavx512vl",
555
      "-mf16c",
556
      "-mfma",
557
    ]
558
559
    sources = [ "src/src/amalgam/gen/avx512vbmi.c" ]
560
561
    configs -= [ "//build/config/compiler:chromium_code" ]
562
    configs += [ "//build/config/compiler:no_chromium_code" ]
563
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
564
565
    deps = [
566
      "//third_party/cpuinfo",
567
      "//third_party/fp16",
568
      "//third_party/fxdiv",
569
      "//third_party/pthreadpool:pthreadpool_standalone",
570
    ]
571
572
    public_configs = [ ":xnnpack_config" ]
573
574
    if (!(is_android && use_order_profiling)) {
575
      assert_no_deps = [ "//base" ]
576
    }
577
  }
578
579
  source_set(
580
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni") {
581
    cflags = [
582
      "-mavx512bw",
583
      "-mavx512cd",
584
      "-mavx512dq",
585
      "-mavx512f",
586
      "-mavx512vl",
587
      "-mavx512vnni",
588
      "-mf16c",
589
      "-mfma",
590
    ]
591
592
    sources = [ "src/src/amalgam/gen/avx512vnni.c" ]
593
594
    configs -= [ "//build/config/compiler:chromium_code" ]
595
    configs += [ "//build/config/compiler:no_chromium_code" ]
596
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
597
598
    deps = [
599
      "//third_party/cpuinfo",
600
      "//third_party/fp16",
601
      "//third_party/fxdiv",
602
      "//third_party/pthreadpool",
603
    ]
604
605
    public_configs = [ ":xnnpack_config" ]
606
  }
607
608
  # This is a target that cannot depend on //base.
609
  source_set(
610
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni_standalone") {
611
    cflags = [
612
      "-mavx512bw",
613
      "-mavx512cd",
614
      "-mavx512dq",
615
      "-mavx512f",
616
      "-mavx512vl",
617
      "-mavx512vnni",
618
      "-mf16c",
619
      "-mfma",
620
    ]
621
622
    sources = [ "src/src/amalgam/gen/avx512vnni.c" ]
623
624
    configs -= [ "//build/config/compiler:chromium_code" ]
625
    configs += [ "//build/config/compiler:no_chromium_code" ]
626
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
627
628
    deps = [
629
      "//third_party/cpuinfo",
630
      "//third_party/fp16",
631
      "//third_party/fxdiv",
632
      "//third_party/pthreadpool:pthreadpool_standalone",
633
    ]
634
635
    public_configs = [ ":xnnpack_config" ]
636
637
    if (!(is_android && use_order_profiling)) {
638
      assert_no_deps = [ "//base" ]
639
    }
640
  }
641
642
  source_set(
643
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni-gfni") {
644
    cflags = [
645
      "-mavx512bw",
646
      "-mavx512cd",
647
      "-mavx512dq",
648
      "-mavx512f",
649
      "-mavx512vl",
650
      "-mavx512vnni",
651
      "-mf16c",
652
      "-mfma",
653
      "-mgfni",
654
    ]
655
656
    sources = [ "src/src/amalgam/gen/avx512vnnigfni.c" ]
657
658
    configs -= [ "//build/config/compiler:chromium_code" ]
659
    configs += [ "//build/config/compiler:no_chromium_code" ]
660
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
661
662
    deps = [
663
      "//third_party/cpuinfo",
664
      "//third_party/fp16",
665
      "//third_party/fxdiv",
666
      "//third_party/pthreadpool",
667
    ]
668
669
    public_configs = [ ":xnnpack_config" ]
670
  }
671
672
  # This is a target that cannot depend on //base.
673
  source_set(
674
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni-gfni_standalone") {
675
    cflags = [
676
      "-mavx512bw",
677
      "-mavx512cd",
678
      "-mavx512dq",
679
      "-mavx512f",
680
      "-mavx512vl",
681
      "-mavx512vnni",
682
      "-mf16c",
683
      "-mfma",
684
      "-mgfni",
685
    ]
686
687
    sources = [ "src/src/amalgam/gen/avx512vnnigfni.c" ]
688
689
    configs -= [ "//build/config/compiler:chromium_code" ]
690
    configs += [ "//build/config/compiler:no_chromium_code" ]
691
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
692
693
    deps = [
694
      "//third_party/cpuinfo",
695
      "//third_party/fp16",
696
      "//third_party/fxdiv",
697
      "//third_party/pthreadpool:pthreadpool_standalone",
698
    ]
699
700
    public_configs = [ ":xnnpack_config" ]
701
702
    if (!(is_android && use_order_profiling)) {
703
      assert_no_deps = [ "//base" ]
704
    }
705
  }
706
707
  source_set("amalgam_f16c-fma-no-avx2") {
348
  source_set("amalgam_f16c-fma-no-avx2") {
708
    cflags = [
349
    cflags = [
709
      "-mf16c",
350
      "-mf16c",
(-)a/third_party/xnnpack/src/src/configs/binary-elementwise-config.c (-60 / +9 lines)
Lines 338-350 static void init_f32_vadd_config(void) { Link Here
338
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
338
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
339
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
339
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
340
    assert(hardware_config != NULL);
340
    assert(hardware_config != NULL);
341
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
341
    if (hardware_config->use_x86_avx) {
342
      f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__avx512f_u32;
343
      f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx512f_u32;
344
      f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx512f_u32;
345
      f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params;
346
      f32_vadd_config.minmax.element_tile = 32;
347
    } else if (hardware_config->use_x86_avx) {
348
      f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__avx_u16;
342
      f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__avx_u16;
349
      f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_u16;
343
      f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_u16;
350
      f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_u16;
344
      f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_u16;
Lines 430-442 static void init_f32_vdiv_config(void) { Link Here
430
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
424
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
431
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
425
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
432
    assert(hardware_config != NULL);
426
    assert(hardware_config != NULL);
433
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
427
    if (hardware_config->use_x86_avx) {
434
      f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__avx512f_u32;
435
      f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__avx512f_u32;
436
      f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__avx512f_u32;
437
      f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params;
438
      f32_vdiv_config.minmax.element_tile = 32;
439
    } else if (hardware_config->use_x86_avx) {
440
      f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__avx_u16;
428
      f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__avx_u16;
441
      f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__avx_u16;
429
      f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__avx_u16;
442
      f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__avx_u16;
430
      f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__avx_u16;
Lines 519-530 static void init_f32_vmax_config(void) { Link Here
519
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
507
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
520
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
508
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
521
    assert(hardware_config != NULL);
509
    assert(hardware_config != NULL);
522
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
510
    if (hardware_config->use_x86_avx) {
523
      f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx512f_u32;
524
      f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_u32;
525
      f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_u32;
526
      f32_vmax_config.minmax.element_tile = 32;
527
    } else if (hardware_config->use_x86_avx) {
528
      f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx_u16;
511
      f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx_u16;
529
      f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16;
512
      f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16;
530
      f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16;
513
      f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16;
Lines 593-604 static void init_f32_vmin_config(void) { Link Here
593
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
576
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
594
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
577
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
595
    assert(hardware_config != NULL);
578
    assert(hardware_config != NULL);
596
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
579
    if (hardware_config->use_x86_avx) {
597
      f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx512f_u32;
598
      f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_u32;
599
      f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_u32;
600
      f32_vmin_config.minmax.element_tile = 32;
601
    } else if (hardware_config->use_x86_avx) {
602
      f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx_u16;
580
      f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx_u16;
603
      f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16;
581
      f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16;
604
      f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16;
582
      f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16;
Lines 670-682 static void init_f32_vmul_config(void) { Link Here
670
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
648
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
671
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
649
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
672
    assert(hardware_config != NULL);
650
    assert(hardware_config != NULL);
673
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
651
    if (hardware_config->use_x86_avx) {
674
      f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__avx512f_u32;
675
      f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx512f_u32;
676
      f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx512f_u32;
677
      f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params;
678
      f32_vmul_config.minmax.element_tile = 32;
679
    } else if (hardware_config->use_x86_avx) {
680
      f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__avx_u16;
652
      f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__avx_u16;
681
      f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_u16;
653
      f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_u16;
682
      f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_u16;
654
      f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_u16;
Lines 762-774 static void init_f32_vsub_config(void) { Link Here
762
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
734
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
763
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
735
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
764
    assert(hardware_config != NULL);
736
    assert(hardware_config != NULL);
765
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
737
    if (hardware_config->use_x86_avx) {
766
      f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__avx512f_u32;
767
      f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__avx512f_u32;
768
      f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__avx512f_u32;
769
      f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params;
770
      f32_vsub_config.minmax.element_tile = 32;
771
    } else if (hardware_config->use_x86_avx) {
772
      f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__avx_u16;
738
      f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__avx_u16;
773
      f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__avx_u16;
739
      f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__avx_u16;
774
      f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__avx_u16;
740
      f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__avx_u16;
Lines 851-862 static void init_f32_vsqrdiff_config(void) { Link Here
851
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
817
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
852
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
818
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
853
    assert(hardware_config != NULL);
819
    assert(hardware_config != NULL);
854
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
820
    if (hardware_config->use_x86_avx) {
855
      f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx512f_u32;
856
      f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_u32;
857
      f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_u32;
858
      f32_vsqrdiff_config.minmax.element_tile = 32;
859
    } else if (hardware_config->use_x86_avx) {
860
      f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx_u16;
821
      f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx_u16;
861
      f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16;
822
      f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16;
862
      f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16;
823
      f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16;
Lines 919-931 static void init_qs8_vadd_config(void) { Link Here
919
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
880
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
920
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
881
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
921
    assert(hardware_config != NULL);
882
    assert(hardware_config != NULL);
922
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
883
    if (hardware_config->use_x86_xop) {
923
      qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16;
924
      qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16;
925
      qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16;
926
      qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_avx512_params;
927
      qs8_vadd_config.minmax.element_tile = 16;
928
    } else if (hardware_config->use_x86_xop) {
929
      qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_u8;
884
      qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_u8;
930
      qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_u8;
885
      qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_u8;
931
      qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_u8;
886
      qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_u8;
Lines 1093-1105 static void init_qu8_vadd_config(void) { Link Here
1093
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1048
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1094
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1049
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1095
    assert(hardware_config != NULL);
1050
    assert(hardware_config != NULL);
1096
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1051
    if (hardware_config->use_x86_xop) {
1097
      qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16;
1098
      qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16;
1099
      qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16;
1100
      qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_avx512_params;
1101
      qu8_vadd_config.minmax.element_tile = 16;
1102
    } else if (hardware_config->use_x86_xop) {
1103
      qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_u8;
1052
      qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_u8;
1104
      qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_u8;
1053
      qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_u8;
1105
      qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_u8;
1054
      qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_u8;
(-)a/third_party/xnnpack/src/src/configs/dwconv-config.c (-83 / +4 lines)
Lines 301-348 static void init_f32_dwconv_config(void) { Link Here
301
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
301
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
302
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
302
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
303
    assert(hardware_config != NULL);
303
    assert(hardware_config != NULL);
304
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
304
    if (hardware_config->use_x86_fma3) {
305
      f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f;
306
      f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_scalar_params;
307
      f32_dwconv_config[0].channel_tile = 16;
308
      f32_dwconv_config[0].channel_subtile = 16;
309
      f32_dwconv_config[0].channel_round = 1;
310
      f32_dwconv_config[0].primary_tile = 3;
311
312
      f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p16c__avx512f;
313
      f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_scalar_params;
314
      f32_dwconv_config[1].channel_tile = 16;
315
      f32_dwconv_config[1].channel_subtile = 16;
316
      f32_dwconv_config[1].channel_round = 1;
317
      f32_dwconv_config[1].primary_tile = 4;
318
319
      f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f;
320
      f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_scalar_params;
321
      f32_dwconv_config[2].channel_tile = 16;
322
      f32_dwconv_config[2].channel_subtile = 16;
323
      f32_dwconv_config[2].channel_round = 1;
324
      f32_dwconv_config[2].primary_tile = 9;
325
326
      // Multipass microkernel "acc" value should match unipass and also match across different hardware config.
327
      // Accumulation (FMA) can produce different results, which results in tests only failing on certain platforms.
328
      #if XNN_ENABLE_DWCONV_MULTIPASS
329
        f32_dwconv_config[3].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f;
330
        f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
331
        f32_dwconv_config[3].channel_tile = 32;
332
        f32_dwconv_config[3].channel_subtile = 16;
333
        f32_dwconv_config[3].channel_round = 1;
334
        f32_dwconv_config[3].primary_tile = 5;
335
        f32_dwconv_config[3].middle_tile = 5;
336
        f32_dwconv_config[3].last_tile = 5;
337
      #else
338
        f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f;
339
        f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
340
        f32_dwconv_config[3].channel_tile = 16;
341
        f32_dwconv_config[3].channel_subtile = 16;
342
        f32_dwconv_config[3].channel_round = 1;
343
        f32_dwconv_config[3].primary_tile = 25;
344
      #endif  // XNN_ENABLE_DWCONV_MULTIPASS
345
    } else if (hardware_config->use_x86_fma3) {
346
      f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p16c__fma3;
305
      f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p16c__fma3;
347
      f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_avx_params;
306
      f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_avx_params;
348
      f32_dwconv_config[0].channel_tile = 16;
307
      f32_dwconv_config[0].channel_tile = 16;
Lines 800-822 static void init_qs8_qc8w_dwconv_config(void) { Link Here
800
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
759
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
801
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
760
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
802
    assert(hardware_config != NULL);
761
    assert(hardware_config != NULL);
803
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
762
    if (hardware_config->use_x86_xop) {
804
      qs8_qc8w_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p32c__avx512skx_mul32;
805
      qs8_qc8w_dwconv_config[0].init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params;
806
      qs8_qc8w_dwconv_config[0].channel_tile = 32;
807
      qs8_qc8w_dwconv_config[0].channel_subtile = 32;
808
      qs8_qc8w_dwconv_config[0].channel_round = 1;
809
      qs8_qc8w_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32;
810
      qs8_qc8w_dwconv_config[1].init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params;
811
      qs8_qc8w_dwconv_config[1].channel_tile = 32;
812
      qs8_qc8w_dwconv_config[1].channel_subtile = 32;
813
      qs8_qc8w_dwconv_config[1].channel_round = 1;
814
      qs8_qc8w_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32;
815
      qs8_qc8w_dwconv_config[2].init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params;
816
      qs8_qc8w_dwconv_config[2].channel_tile = 32;
817
      qs8_qc8w_dwconv_config[2].channel_subtile = 32;
818
      qs8_qc8w_dwconv_config[2].channel_round = 1;
819
    } else if (hardware_config->use_x86_xop) {
820
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
763
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
821
      qs8_qc8w_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__xop_mul16_add16;
764
      qs8_qc8w_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__xop_mul16_add16;
822
      qs8_qc8w_dwconv_config[0].init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_sse4_params;
765
      qs8_qc8w_dwconv_config[0].init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_sse4_params;
Lines 1050-1067 static void init_qs8_dwconv_config(void) { Link Here
1050
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
993
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1051
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
994
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1052
    assert(hardware_config != NULL);
995
    assert(hardware_config != NULL);
1053
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
996
    if (hardware_config->use_x86_xop) {
1054
      qs8_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32;
1055
      qs8_dwconv_config[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
1056
      qs8_dwconv_config[0].channel_tile = 32;
1057
      qs8_dwconv_config[0].channel_subtile = 32;
1058
      qs8_dwconv_config[0].channel_round = 1;
1059
      qs8_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32;
1060
      qs8_dwconv_config[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
1061
      qs8_dwconv_config[1].channel_tile = 32;
1062
      qs8_dwconv_config[1].channel_subtile = 32;
1063
      qs8_dwconv_config[1].channel_round = 1;
1064
    } else if (hardware_config->use_x86_xop) {
1065
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
997
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
1066
      qs8_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__xop_mul16_add16;
998
      qs8_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__xop_mul16_add16;
1067
      qs8_dwconv_config[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
999
      qs8_dwconv_config[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
Lines 1239-1256 static void init_qu8_dwconv_config(void) { Link Here
1239
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1171
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1240
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1172
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1241
    assert(hardware_config != NULL);
1173
    assert(hardware_config != NULL);
1242
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1174
    if (hardware_config->use_x86_xop) {
1243
      qu8_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32;
1244
      qu8_dwconv_config[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
1245
      qu8_dwconv_config[0].channel_tile = 32;
1246
      qu8_dwconv_config[0].channel_subtile = 32;
1247
      qu8_dwconv_config[0].channel_round = 1;
1248
      qu8_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32;
1249
      qu8_dwconv_config[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
1250
      qu8_dwconv_config[1].channel_tile = 32;
1251
      qu8_dwconv_config[1].channel_subtile = 32;
1252
      qu8_dwconv_config[1].channel_round = 1;
1253
    } else if (hardware_config->use_x86_xop) {
1254
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
1175
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
1255
      qu8_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__xop_mul32;
1176
      qu8_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__xop_mul32;
1256
      qu8_dwconv_config[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
1177
      qu8_dwconv_config[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
(-)a/third_party/xnnpack/src/src/configs/gemm-config.c (-315 / +9 lines)
Lines 768-784 static void init_f32_gemm_config(void) { Link Here
768
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
768
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
769
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
769
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
770
    assert(hardware_config != NULL);
770
    assert(hardware_config != NULL);
771
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
771
    if (hardware_config->use_x86_fma3) {
772
      f32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
773
      f32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
774
      f32_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
775
      f32_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
776
      f32_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
777
      f32_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
778
      f32_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16__avx512f_u4_prfm;
779
      f32_gemm_config.mr = 7;
780
      f32_gemm_config.nr = 16;
781
    } else if (hardware_config->use_x86_fma3) {
782
      switch (cpuinfo_get_core(0)->uarch) {
772
      switch (cpuinfo_get_core(0)->uarch) {
783
        case cpuinfo_uarch_zen:
773
        case cpuinfo_uarch_zen:
784
        case cpuinfo_uarch_dhyana:
774
        case cpuinfo_uarch_dhyana:
Lines 1246-1259 static void init_f32_qc4w_gemm_config(void) { Link Here
1246
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1236
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1247
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1237
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1248
    assert(hardware_config != NULL);
1238
    assert(hardware_config != NULL);
1249
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1239
    if (hardware_config->use_x86_avx2) {
1250
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_1x32__avx512skx_broadcast);
1251
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_7x32__avx512skx_broadcast);
1252
      f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512_params;
1253
      f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_qc4w_gemm_goi_w;
1254
      f32_qc4w_gemm_config.mr = 7;
1255
      f32_qc4w_gemm_config.nr = 32;
1256
    } else if (hardware_config->use_x86_avx2) {
1257
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_1x16__avx2_broadcast);
1240
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_1x16__avx2_broadcast);
1258
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_3x16__avx2_broadcast);
1241
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_3x16__avx2_broadcast);
1259
      f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx_params;
1242
      f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx_params;
Lines 1420-1434 static void init_f32_qc8w_gemm_config(void) { Link Here
1420
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1403
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1421
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1404
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1422
    assert(hardware_config != NULL);
1405
    assert(hardware_config != NULL);
1423
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1406
    if (hardware_config->use_x86_avx2) {
1424
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_1x32__avx512skx_broadcast);
1425
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_7x32__avx512skx_broadcast);
1426
      f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1427
      f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
1428
      f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x32__scalar_int_u2;
1429
      f32_qc8w_gemm_config.mr = 7;
1430
      f32_qc8w_gemm_config.nr = 32;
1431
    } else if (hardware_config->use_x86_avx2) {
1432
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_1x16__avx2_broadcast);
1407
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_1x16__avx2_broadcast);
1433
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_5x16__avx2_broadcast);
1408
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_5x16__avx2_broadcast);
1434
      f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avx_params;
1409
      f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avx_params;
Lines 1631-1679 static void init_qd8_f16_qc4w_gemm_config(void) { Link Here
1631
  #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
1606
  #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
1632
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1607
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1633
    assert(hardware_config != NULL);
1608
    assert(hardware_config != NULL);
1634
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnnigfni) {
1609
    if (hardware_config->use_x86_avx2) {
1635
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx512vnnigfni_prfm);
1636
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx512vnnigfni_prfm);
1637
      qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avxvnni_params;
1638
      qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1639
      qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1640
      qd8_f16_qc4w_gemm_config.mr = 7;
1641
      qd8_f16_qc4w_gemm_config.nr = 8;
1642
      qd8_f16_qc4w_gemm_config.log2_kr = 3;
1643
      qd8_f16_qc4w_gemm_config.planes = 2;
1644
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
1645
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx512vnni_prfm);
1646
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx512vnni_prfm);
1647
      qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avxvnni_params;
1648
      qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1649
      qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1650
      qd8_f16_qc4w_gemm_config.mr = 7;
1651
      qd8_f16_qc4w_gemm_config.nr = 8;
1652
      qd8_f16_qc4w_gemm_config.log2_kr = 3;
1653
      qd8_f16_qc4w_gemm_config.planes = 2;
1654
    #if XNN_ENABLE_AVXVNNI
1655
      } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) {
1656
        qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm);
1657
        qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm);
1658
        qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avxvnni_params;
1659
        qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1660
        qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1661
        qd8_f16_qc4w_gemm_config.mr = 5;
1662
        qd8_f16_qc4w_gemm_config.nr = 8;
1663
        qd8_f16_qc4w_gemm_config.log2_kr = 3;
1664
        qd8_f16_qc4w_gemm_config.planes = 2;
1665
    #endif
1666
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1667
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx512skx);
1668
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx512skx);
1669
      qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avx_params;
1670
      qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1671
      qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1672
      qd8_f16_qc4w_gemm_config.mr = 5;
1673
      qd8_f16_qc4w_gemm_config.nr = 8;
1674
      qd8_f16_qc4w_gemm_config.log2_kr = 3;
1675
      qd8_f16_qc4w_gemm_config.planes = 2;
1676
    } else if (hardware_config->use_x86_avx2) {
1677
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx2);
1610
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx2);
1678
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__avx2);
1611
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__avx2);
1679
      qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avx_params;
1612
      qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avx_params;
Lines 1776-1844 static void init_qd8_f32_qc4w_gemm_config(void) { Link Here
1776
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1709
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1777
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1710
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1778
    assert(hardware_config != NULL);
1711
    assert(hardware_config != NULL);
1779
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnnigfni) {
1712
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) {
1780
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni_prfm);
1781
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnnigfni_prfm);
1782
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512vnni_params;
1783
      qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1784
      qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1785
      qd8_f32_qc4w_gemm_config.mr = 7;
1786
      qd8_f32_qc4w_gemm_config.nr = 16;
1787
      qd8_f32_qc4w_gemm_config.log2_kr = 3;
1788
      qd8_f32_qc4w_gemm_config.planes = 2;
1789
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
1790
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm);
1791
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnni_prfm);
1792
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512vnni_params;
1793
      qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1794
      qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1795
      qd8_f32_qc4w_gemm_config.mr = 7;
1796
      qd8_f32_qc4w_gemm_config.nr = 16;
1797
      qd8_f32_qc4w_gemm_config.log2_kr = 3;
1798
      qd8_f32_qc4w_gemm_config.planes = 2;
1799
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnnigfni) {
1800
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx512vnnigfni_prfm);
1801
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx512vnnigfni_prfm);
1802
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512vnni_params;
1803
      qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1804
      qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1805
      qd8_f32_qc4w_gemm_config.mr = 7;
1806
      qd8_f32_qc4w_gemm_config.nr = 8;
1807
      qd8_f32_qc4w_gemm_config.log2_kr = 3;
1808
      qd8_f32_qc4w_gemm_config.planes = 2;
1809
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
1810
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx512vnni_prfm);
1811
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx512vnni_prfm);
1812
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512vnni_params;
1813
      qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1814
      qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1815
      qd8_f32_qc4w_gemm_config.mr = 7;
1816
      qd8_f32_qc4w_gemm_config.nr = 8;
1817
      qd8_f32_qc4w_gemm_config.log2_kr = 3;
1818
      qd8_f32_qc4w_gemm_config.planes = 2;
1819
    #if XNN_ENABLE_AVXVNNI
1820
      } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) {
1821
        qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm);
1822
        qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm);
1823
        qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avxvnni_params;
1824
        qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1825
        qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1826
        qd8_f32_qc4w_gemm_config.mr = 5;
1827
        qd8_f32_qc4w_gemm_config.nr = 8;
1828
        qd8_f32_qc4w_gemm_config.log2_kr = 3;
1829
        qd8_f32_qc4w_gemm_config.planes = 2;
1830
    #endif
1831
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1832
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512skx_prfm);
1833
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512skx_prfm);
1834
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
1835
      qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1836
      qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1837
      qd8_f32_qc4w_gemm_config.mr = 7;
1838
      qd8_f32_qc4w_gemm_config.nr = 16;
1839
      qd8_f32_qc4w_gemm_config.log2_kr = 3;
1840
      qd8_f32_qc4w_gemm_config.planes = 2;
1841
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) {
1842
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
1713
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
1843
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__xop_ld128);
1714
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x4c8__xop_ld128);
1844
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__xop_ld128);
1715
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_4x4c8__xop_ld128);
Lines 2246-2288 static void init_qd8_f16_qc8w_gemm_config(void) { Link Here
2246
  #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
2117
  #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
2247
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2118
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2248
    assert(hardware_config != NULL);
2119
    assert(hardware_config != NULL);
2249
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
2120
    if (hardware_config->use_x86_avx2) {
2250
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx512vnni_prfm);
2251
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x8c8__avx512vnni_prfm);
2252
      qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx512vnni_prfm);
2253
      qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_7x8c8__avx512vnni_prfm);
2254
      qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_avxvnni_params;
2255
      qd8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2256
      qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2257
      qd8_f16_qc8w_gemm_config.mr = 7;
2258
      qd8_f16_qc8w_gemm_config.nr = 8;
2259
      qd8_f16_qc8w_gemm_config.log2_kr = 3;
2260
    #if XNN_ENABLE_AVXVNNI
2261
      } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) {
2262
        // AVX VNNI should be checked before AVX512SKX as it performs better with VNNI microkernels
2263
        qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm);
2264
        qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm);
2265
        qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avxvnni_prfm);
2266
        qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_5x8c8__avxvnni_prfm);
2267
        qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_avxvnni_params;
2268
        qd8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2269
        qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2270
        qd8_f16_qc8w_gemm_config.mr = 5;
2271
        qd8_f16_qc8w_gemm_config.nr = 8;
2272
        qd8_f16_qc8w_gemm_config.log2_kr = 3;
2273
    #endif
2274
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
2275
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx512skx);
2276
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avx512skx);
2277
      qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx512skx);
2278
      qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_5x8c8__avx512skx);
2279
      qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_avx_params;
2280
      qd8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2281
      qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2282
      qd8_f16_qc8w_gemm_config.mr = 5;
2283
      qd8_f16_qc8w_gemm_config.nr = 8;
2284
      qd8_f16_qc8w_gemm_config.log2_kr = 3;
2285
    } else if (hardware_config->use_x86_avx2) {
2286
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx2);
2121
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx2);
2287
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x8c8__avx2);
2122
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x8c8__avx2);
2288
      qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx2);
2123
      qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx2);
Lines 2618-2671 static void init_qd8_f32_qc8w_gemm_config(void) { Link Here
2618
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
2453
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
2619
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2454
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2620
    assert(hardware_config != NULL);
2455
    assert(hardware_config != NULL);
2621
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
2456
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) {
2622
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm);
2623
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c8__avx512vnni_prfm);
2624
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__avx512vnni_prfm);
2625
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c8__avx512vnni_prfm);
2626
      qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avx512vnni_params;
2627
      qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2628
      qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2629
      qd8_f32_qc8w_gemm_config.mr = 7;
2630
      qd8_f32_qc8w_gemm_config.nr = 16;
2631
      qd8_f32_qc8w_gemm_config.log2_kr = 3;
2632
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
2633
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx512vnni_prfm);
2634
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x8c8__avx512vnni_prfm);
2635
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx512vnni_prfm);
2636
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x8c8__avx512vnni_prfm);
2637
      qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avxvnni_params;
2638
      qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2639
      qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2640
      qd8_f32_qc8w_gemm_config.mr = 7;
2641
      qd8_f32_qc8w_gemm_config.nr = 8;
2642
      qd8_f32_qc8w_gemm_config.log2_kr = 3;
2643
    #if XNN_ENABLE_AVXVNNI
2644
      } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) {
2645
        // AVX VNNI should be checked before AVX512SKX as it performs better with VNNI microkernels
2646
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm);
2647
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm);
2648
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avxvnni_prfm);
2649
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x8c8__avxvnni_prfm);
2650
        qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avxvnni_params;
2651
        qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2652
        qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2653
        qd8_f32_qc8w_gemm_config.mr = 5;
2654
        qd8_f32_qc8w_gemm_config.nr = 8;
2655
        qd8_f32_qc8w_gemm_config.log2_kr = 3;
2656
    #endif
2657
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
2658
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__avx512skx_prfm);
2659
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c8__avx512skx_prfm);
2660
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__avx512skx_prfm);
2661
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c8__avx512skx_prfm);
2662
      qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
2663
      qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2664
      qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2665
      qd8_f32_qc8w_gemm_config.mr = 7;
2666
      qd8_f32_qc8w_gemm_config.nr = 16;
2667
      qd8_f32_qc8w_gemm_config.log2_kr = 3;
2668
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) {
2669
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
2457
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
2670
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__xop_ld64);
2458
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x4c8__xop_ld64);
2671
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__xop_ld64);
2459
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x4c8__xop_ld64);
Lines 3393-3471 static void init_qs8_qc8w_gemm_config(void) { Link Here
3393
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3181
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3394
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3182
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3395
    assert(hardware_config != NULL);
3183
    assert(hardware_config != NULL);
3396
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
3184
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) {
3397
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni_prfm);
3398
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c8__avx512vnni_prfm);
3399
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni_prfm);
3400
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c8__avx512vnni_prfm);
3401
      qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512vnni_params;
3402
      qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w;
3403
      qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_goi_w;
3404
      qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w;
3405
      qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w;
3406
      qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w;
3407
      qs8_qc8w_gemm_config.mr = 7;
3408
      qs8_qc8w_gemm_config.nr = 16;
3409
      qs8_qc8w_gemm_config.log2_kr = 3;
3410
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
3411
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx512vnni_prfm);
3412
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x8c8__avx512vnni_prfm);
3413
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx512vnni_prfm);
3414
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x8c8__avx512vnni_prfm);
3415
      qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avxvnni_params;
3416
      qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w;
3417
      qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_goi_w;
3418
      qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w;
3419
      qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w;
3420
      qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w;
3421
      qs8_qc8w_gemm_config.mr = 7;
3422
      qs8_qc8w_gemm_config.nr = 8;
3423
      qs8_qc8w_gemm_config.log2_kr = 3;
3424
    #if XNN_ENABLE_AVXVNNI
3425
      } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) {
3426
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm);
3427
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm);
3428
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm);
3429
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm);
3430
        qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512vnni_params;
3431
        qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w;
3432
        qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_goi_w;
3433
        qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w;
3434
        qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w;
3435
        qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w;
3436
        qs8_qc8w_gemm_config.mr = 5;
3437
        qs8_qc8w_gemm_config.nr = 8;
3438
        qs8_qc8w_gemm_config.log2_kr = 3;
3439
    #endif
3440
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
3441
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm);
3442
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm);
3443
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm);
3444
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm);
3445
      qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params;
3446
      qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
3447
      qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
3448
      qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
3449
      qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
3450
      qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
3451
      qs8_qc8w_gemm_config.mr = 7;
3452
      qs8_qc8w_gemm_config.nr = 16;
3453
      qs8_qc8w_gemm_config.log2_kr = 3;
3454
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
3455
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx512skx);
3456
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__avx512skx);
3457
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx512skx);
3458
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__avx512skx);
3459
      qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params;
3460
      qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
3461
      qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
3462
      qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
3463
      qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
3464
      qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
3465
      qs8_qc8w_gemm_config.mr = 4;  // TODO: upgrade to 5x8 prfm when supported
3466
      qs8_qc8w_gemm_config.nr = 8;
3467
      qs8_qc8w_gemm_config.log2_kr = 3;
3468
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) {
3469
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3185
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3470
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3186
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3471
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3187
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
Lines 3991-4019 static void init_qu8_gemm_config(void) { Link Here
3991
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3707
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3992
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3708
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3993
    assert(hardware_config != NULL);
3709
    assert(hardware_config != NULL);
3994
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
3710
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) {
3995
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm);
3996
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm);
3997
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm);
3998
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm);
3999
      qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
4000
      qu8_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qu8_gemm_gio_w;
4001
      qu8_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qu8_gemm_goi_w;
4002
      qu8_gemm_config.mr = 7;
4003
      qu8_gemm_config.nr = 16;
4004
      qu8_gemm_config.log2_kr = 3;
4005
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
4006
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx512skx);
4007
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_4x8c8__avx512skx);
4008
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx512skx);
4009
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_4x8c8__avx512skx);
4010
      qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
4011
      qu8_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qu8_gemm_gio_w;
4012
      qu8_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qu8_gemm_goi_w;
4013
      qu8_gemm_config.mr = 4;  // TODO: upgrade to 5x8 prfm when supported
4014
      qu8_gemm_config.nr = 8;
4015
      qu8_gemm_config.log2_kr = 3;
4016
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_xop) {
4017
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3711
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
4018
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3712
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
4019
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3713
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
(-)a/third_party/xnnpack/src/src/configs/hardware-config.c (-8 / +7 lines)
Lines 144-159 static void init_hardware_config(void) { Link Here
144
    hardware_config.use_x86_fma3 = cpuinfo_has_x86_fma3();
144
    hardware_config.use_x86_fma3 = cpuinfo_has_x86_fma3();
145
    hardware_config.use_x86_xop = cpuinfo_has_x86_xop();
145
    hardware_config.use_x86_xop = cpuinfo_has_x86_xop();
146
    hardware_config.use_x86_avx2 = cpuinfo_has_x86_avx2();
146
    hardware_config.use_x86_avx2 = cpuinfo_has_x86_avx2();
147
    hardware_config.use_x86_avx512f = cpuinfo_has_x86_avx512f();
147
    hardware_config.use_x86_avx512f = 0;
148
    hardware_config.use_x86_avx512skx = hardware_config.use_x86_avx512f &&
148
    hardware_config.use_x86_avx512skx = 0;
149
      cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl();
149
    hardware_config.use_x86_avx512vbmi = 0;
150
    hardware_config.use_x86_avx512vbmi = hardware_config.use_x86_avx512skx && cpuinfo_has_x86_avx512vbmi();
150
    hardware_config.use_x86_avx512vnni = 0;
151
    hardware_config.use_x86_avx512vnni = hardware_config.use_x86_avx512skx && cpuinfo_has_x86_avx512vnni();
151
    hardware_config.use_x86_avx512vnnigfni = 0;
152
    hardware_config.use_x86_avx512vnnigfni = hardware_config.use_x86_avx512vnni && cpuinfo_has_x86_gfni();
153
#if XNN_ENABLE_AVX512AMX
152
#if XNN_ENABLE_AVX512AMX
154
    // TODO(fbarchard): Use cpuinfo_has_x86_amx_int8 when available.
153
    // TODO(fbarchard): Use cpuinfo_has_x86_amx_int8 when available.
155
    // Infer AMX support from Sapphire Rapids having fp16 and amx.
154
    // Infer AMX support from Sapphire Rapids having fp16 and amx.
156
    hardware_config.use_x86_avx512amx = hardware_config.use_x86_avx512vnnigfni && cpuinfo_has_x86_avx512fp16();
155
    hardware_config.use_x86_avx512amx = 0;
157
#if XNN_ARCH_X86_64 && defined(__linux__)
156
#if XNN_ARCH_X86_64 && defined(__linux__)
158
    if (hardware_config.use_x86_avx512amx) {
157
    if (hardware_config.use_x86_avx512amx) {
159
      size_t status = xnn_syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA, 0);
158
      size_t status = xnn_syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA, 0);
Lines 167-173 static void init_hardware_config(void) { Link Here
167
    hardware_config.use_x86_avx512amx = 0;
166
    hardware_config.use_x86_avx512amx = 0;
168
#endif
167
#endif
169
#if XNN_ENABLE_AVXVNNI
168
#if XNN_ENABLE_AVXVNNI
170
    hardware_config.use_x86_avxvnni = hardware_config.use_x86_avx2 && cpuinfo_has_x86_avxvnni();
169
    hardware_config.use_x86_avxvnni = 0;
171
#else
170
#else
172
    hardware_config.use_x86_avxvnni = 0;
171
    hardware_config.use_x86_avxvnni = 0;
173
#endif
172
#endif
(-)a/third_party/xnnpack/src/src/configs/prelu-config.c (-5 / +1 lines)
Lines 77-87 static void init_f32_prelu_config(void) { Link Here
77
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
77
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
78
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
78
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
79
    assert(hardware_config != NULL);
79
    assert(hardware_config != NULL);
80
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
80
    if (hardware_config->use_x86_avx) {
81
      f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__avx512f_2x16;
82
      f32_prelu_config.row_tile = 2;
83
      f32_prelu_config.channel_tile = 16;
84
    } else if (hardware_config->use_x86_avx) {
85
      f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__avx_2x16;
81
      f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__avx_2x16;
86
      f32_prelu_config.row_tile = 2;
82
      f32_prelu_config.row_tile = 2;
87
      f32_prelu_config.channel_tile = 16;
83
      f32_prelu_config.channel_tile = 16;
(-)a/third_party/xnnpack/src/src/configs/reduce-config.c (-13 / +2 lines)
Lines 106-117 static void init_f32_rminmax_config(void) { Link Here
106
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
106
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
107
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
107
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
108
    assert(hardware_config != NULL);
108
    assert(hardware_config != NULL);
109
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
109
    if (hardware_config->use_x86_avx) {
110
      f32_rminmax_config = (struct xnn_reduce_config) {
111
        .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rminmax_ukernel__avx512f_u64_acc4,
112
        .element_tile = 64,
113
      };
114
    } else if (hardware_config->use_x86_avx) {
115
      f32_rminmax_config = (struct xnn_reduce_config) {
110
      f32_rminmax_config = (struct xnn_reduce_config) {
116
        .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rminmax_ukernel__avx_u32_acc4,
111
        .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rminmax_ukernel__avx_u32_acc4,
117
        .init.f32_default = xnn_init_f32_default_avx_params,
112
        .init.f32_default = xnn_init_f32_default_avx_params,
Lines 180-192 static void init_f32_rsum_config(void) { Link Here
180
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
175
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
181
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
176
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
182
    assert(hardware_config != NULL);
177
    assert(hardware_config != NULL);
183
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
178
    if (hardware_config->use_x86_avx) {
184
      f32_rsum_config = (struct xnn_reduce_config) {
185
        .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rsum_ukernel__avx512f_u64_acc4,
186
        .init.f32_scale = xnn_init_f32_scale_scalar_params,
187
        .element_tile = 64,
188
      };
189
    } else if (hardware_config->use_x86_avx) {
190
      f32_rsum_config = (struct xnn_reduce_config) {
179
      f32_rsum_config = (struct xnn_reduce_config) {
191
        .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rsum_ukernel__avx_u32_acc4,
180
        .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rsum_ukernel__avx_u32_acc4,
192
        .init.f32_scale = xnn_init_f32_scale_avx_params,
181
        .init.f32_scale = xnn_init_f32_scale_avx_params,
(-)a/third_party/xnnpack/src/src/configs/rmax-config.c (-3 / +1 lines)
Lines 68-76 static void init_f32_rmax_config(void) { Link Here
68
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
68
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
69
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
69
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
70
    assert(hardware_config != NULL);
70
    assert(hardware_config != NULL);
71
    if (hardware_config->use_x86_avx512f) {
71
    if (hardware_config->use_x86_avx) {
72
      f32_rmax_config.ukernel = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__avx512f_u64_acc4;
73
    } else if (hardware_config->use_x86_avx) {
74
      f32_rmax_config.ukernel = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__avx_u32_acc4;
72
      f32_rmax_config.ukernel = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__avx_u32_acc4;
75
      f32_rmax_config.init.f32 = xnn_init_f32_default_avx_params;
73
      f32_rmax_config.init.f32 = xnn_init_f32_default_avx_params;
76
    } else {
74
    } else {
(-)a/third_party/xnnpack/src/src/configs/unary-elementwise-config.c (-93 / +20 lines)
Lines 554-563 static void init_f16_to_f32_cvt_config(void) { Link Here
554
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
554
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
555
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
555
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
556
    assert(hardware_config != NULL);
556
    assert(hardware_config != NULL);
557
    if (hardware_config->use_x86_avx512skx) {
557
    if (hardware_config->use_x86_f16c) {
558
      f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__avx512skx_u16;
559
      f16_to_f32_cvt_config.element_tile = 16;
560
    } else if (hardware_config->use_x86_f16c) {
561
      f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__f16c_u16;
558
      f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__f16c_u16;
562
      f16_to_f32_cvt_config.element_tile = 16;
559
      f16_to_f32_cvt_config.element_tile = 16;
563
    } else if (hardware_config->use_x86_avx) {
560
    } else if (hardware_config->use_x86_avx) {
Lines 631-641 static void init_f32_abs_config(void) { Link Here
631
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
628
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
632
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
629
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
633
    assert(hardware_config != NULL);
630
    assert(hardware_config != NULL);
634
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
631
    if (hardware_config->use_x86_avx) {
635
      f32_abs_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__avx512f_u16;
636
      f32_abs_config.init.f32_abs = xnn_init_f32_abs_avx512_params;
637
      f32_abs_config.element_tile = 16;
638
    } else if (hardware_config->use_x86_avx) {
639
      f32_abs_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__avx_u16;
632
      f32_abs_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__avx_u16;
640
      f32_abs_config.init.f32_abs = xnn_init_f32_abs_avx_params;
633
      f32_abs_config.init.f32_abs = xnn_init_f32_abs_avx_params;
641
      f32_abs_config.element_tile = 16;
634
      f32_abs_config.element_tile = 16;
Lines 680-690 static void init_f32_clamp_config(void) { Link Here
680
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
673
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
681
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
674
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
682
    assert(hardware_config != NULL);
675
    assert(hardware_config != NULL);
683
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
676
    if (hardware_config->use_x86_avx) {
684
      f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__avx512f_u16;
685
      f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params;
686
      f32_clamp_config.element_tile = 16;
687
    } else if (hardware_config->use_x86_avx) {
688
      f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__avx_u16;
677
      f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__avx_u16;
689
      f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_avx_params;
678
      f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_avx_params;
690
      f32_clamp_config.element_tile = 16;
679
      f32_clamp_config.element_tile = 16;
Lines 746-756 static void init_f32_elu_config(void) { Link Here
746
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
735
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
747
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
736
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
748
    assert(hardware_config != NULL);
737
    assert(hardware_config != NULL);
749
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
738
    if (hardware_config->use_x86_avx2) {
750
      f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__avx512f_rr1_p6_u128;
751
      f32_elu_config.init.f32_elu = xnn_init_f32_elu_avx512_rr1_p6_params;
752
      f32_elu_config.element_tile = 128;
753
    } else if (hardware_config->use_x86_avx2) {
754
      f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_u56;
739
      f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_u56;
755
      f32_elu_config.init.f32_elu = xnn_init_f32_elu_avx2_rr1_lut4_p4_params;
740
      f32_elu_config.init.f32_elu = xnn_init_f32_elu_avx2_rr1_lut4_p4_params;
756
      f32_elu_config.element_tile = 56;
741
      f32_elu_config.element_tile = 56;
Lines 824-834 static void init_f32_hswish_config(void) { Link Here
824
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
809
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
825
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
810
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
826
    assert(hardware_config != NULL);
811
    assert(hardware_config != NULL);
827
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
812
    if (hardware_config->use_x86_fma3) {
828
      f32_hswish_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__avx512f_u16;
829
      f32_hswish_config.init.f32_hswish = xnn_init_f32_hswish_avx512_params;
830
      f32_hswish_config.element_tile = 16;
831
    } else if (hardware_config->use_x86_fma3) {
832
      f32_hswish_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__fma3_u16;
813
      f32_hswish_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__fma3_u16;
833
      f32_hswish_config.init.f32_hswish = xnn_init_f32_hswish_avx_params;
814
      f32_hswish_config.init.f32_hswish = xnn_init_f32_hswish_avx_params;
834
      f32_hswish_config.element_tile = 16;
815
      f32_hswish_config.element_tile = 16;
Lines 888-898 static void init_f32_lrelu_config(void) { Link Here
888
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
869
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
889
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
870
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
890
    assert(hardware_config != NULL);
871
    assert(hardware_config != NULL);
891
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
872
    if (hardware_config->use_x86_avx) {
892
      f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__avx512f_u16;
893
      f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params;
894
      f32_lrelu_config.element_tile = 16;
895
    } else if (hardware_config->use_x86_avx) {
896
      f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__avx_u16;
873
      f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__avx_u16;
897
      f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_avx_params;
874
      f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_avx_params;
898
      f32_lrelu_config.element_tile = 16;
875
      f32_lrelu_config.element_tile = 16;
Lines 961-971 static void init_f32_neg_config(void) { Link Here
961
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
938
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
962
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
939
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
963
    assert(hardware_config != NULL);
940
    assert(hardware_config != NULL);
964
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
941
    if (hardware_config->use_x86_avx) {
965
      f32_neg_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__avx512f_u16;
966
      f32_neg_config.init.f32_neg = xnn_init_f32_neg_avx512_params;
967
      f32_neg_config.element_tile = 16;
968
    } else if (hardware_config->use_x86_avx) {
969
      f32_neg_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__avx_u16;
942
      f32_neg_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__avx_u16;
970
      f32_neg_config.init.f32_neg = xnn_init_f32_neg_avx_params;
943
      f32_neg_config.init.f32_neg = xnn_init_f32_neg_avx_params;
971
      f32_neg_config.element_tile = 16;
944
      f32_neg_config.element_tile = 16;
Lines 1029-1038 static void init_f32_rndd_config(void) { Link Here
1029
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1002
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1030
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1003
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1031
    assert(hardware_config != NULL);
1004
    assert(hardware_config != NULL);
1032
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1005
    if (hardware_config->use_x86_avx) {
1033
      f32_rndd_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__avx512f_u16;
1034
      f32_rndd_config.element_tile = 16;
1035
    } else if (hardware_config->use_x86_avx) {
1036
      f32_rndd_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__avx_u16;
1006
      f32_rndd_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__avx_u16;
1037
      f32_rndd_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1007
      f32_rndd_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1038
      f32_rndd_config.element_tile = 16;
1008
      f32_rndd_config.element_tile = 16;
Lines 1081-1090 static void init_f32_rndne_config(void) { Link Here
1081
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1051
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1082
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1052
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1083
    assert(hardware_config != NULL);
1053
    assert(hardware_config != NULL);
1084
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1054
    if (hardware_config->use_x86_avx) {
1085
      f32_rndne_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__avx512f_u16;
1086
      f32_rndne_config.element_tile = 16;
1087
    } else if (hardware_config->use_x86_avx) {
1088
      f32_rndne_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__avx_u16;
1055
      f32_rndne_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__avx_u16;
1089
      f32_rndne_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1056
      f32_rndne_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1090
      f32_rndne_config.element_tile = 16;
1057
      f32_rndne_config.element_tile = 16;
Lines 1133-1142 static void init_f32_rndu_config(void) { Link Here
1133
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1100
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1134
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1101
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1135
    assert(hardware_config != NULL);
1102
    assert(hardware_config != NULL);
1136
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1103
    if (hardware_config->use_x86_avx) {
1137
      f32_rndu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__avx512f_u16;
1138
      f32_rndu_config.element_tile = 16;
1139
    } else if (hardware_config->use_x86_avx) {
1140
      f32_rndu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__avx_u16;
1104
      f32_rndu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__avx_u16;
1141
      f32_rndu_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1105
      f32_rndu_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1142
      f32_rndu_config.element_tile = 16;
1106
      f32_rndu_config.element_tile = 16;
Lines 1185-1194 static void init_f32_rndz_config(void) { Link Here
1185
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1149
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1186
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1150
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1187
    assert(hardware_config != NULL);
1151
    assert(hardware_config != NULL);
1188
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1152
    if (hardware_config->use_x86_avx) {
1189
      f32_rndz_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__avx512f_u16;
1190
      f32_rndz_config.element_tile = 16;
1191
    } else if (hardware_config->use_x86_avx) {
1192
      f32_rndz_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__avx_u16;
1153
      f32_rndz_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__avx_u16;
1193
      f32_rndz_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1154
      f32_rndz_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1194
      f32_rndz_config.element_tile = 16;
1155
      f32_rndz_config.element_tile = 16;
Lines 1235-1245 static void init_f32_sigmoid_config(void) { Link Here
1235
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1196
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1236
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1197
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1237
    assert(hardware_config != NULL);
1198
    assert(hardware_config != NULL);
1238
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1199
    if (hardware_config->use_x86_avx2) {
1239
      f32_sigmoid_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u64;
1240
      f32_sigmoid_config.init.f32_sigmoid = xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params;
1241
      f32_sigmoid_config.element_tile = 64;
1242
    } else if (hardware_config->use_x86_avx2) {
1243
      f32_sigmoid_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u40;
1200
      f32_sigmoid_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u40;
1244
      f32_sigmoid_config.init.f32_sigmoid = xnn_init_f32_sigmoid_avx2_rr1_p5_params;
1201
      f32_sigmoid_config.init.f32_sigmoid = xnn_init_f32_sigmoid_avx2_rr1_p5_params;
1245
      f32_sigmoid_config.element_tile = 40;
1202
      f32_sigmoid_config.element_tile = 40;
Lines 1298-1307 static void init_f32_sqr_config(void) { Link Here
1298
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1255
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1299
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1256
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1300
    assert(hardware_config != NULL);
1257
    assert(hardware_config != NULL);
1301
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1258
    if (hardware_config->use_x86_avx) {
1302
      f32_sqr_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__avx512f_u16;
1303
      f32_sqr_config.element_tile = 16;
1304
    } else if (hardware_config->use_x86_avx) {
1305
      f32_sqr_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__avx_u16;
1259
      f32_sqr_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__avx_u16;
1306
      f32_sqr_config.init.f32_default = xnn_init_f32_default_avx_params;
1260
      f32_sqr_config.init.f32_default = xnn_init_f32_default_avx_params;
1307
      f32_sqr_config.element_tile = 16;
1261
      f32_sqr_config.element_tile = 16;
Lines 1368-1378 static void init_f32_rsqrt_config(void) { Link Here
1368
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1322
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1369
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1323
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1370
    assert(hardware_config != NULL);
1324
    assert(hardware_config != NULL);
1371
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1325
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_fma3) {
1372
      f32_rsqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32;
1373
      f32_rsqrt_config.init.f32_rsqrt = xnn_init_f32_rsqrt_avx512_params;
1374
      f32_rsqrt_config.element_tile = 32;
1375
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_fma3) {
1376
      f32_rsqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16;
1326
      f32_rsqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16;
1377
      f32_rsqrt_config.init.f32_rsqrt = xnn_init_f32_rsqrt_fma3_params;
1327
      f32_rsqrt_config.init.f32_rsqrt = xnn_init_f32_rsqrt_fma3_params;
1378
      f32_rsqrt_config.element_tile = 16;
1328
      f32_rsqrt_config.element_tile = 16;
Lines 1416-1426 static void init_f32_tanh_config(void) { Link Here
1416
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1366
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1417
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1367
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1418
    assert(hardware_config != NULL);
1368
    assert(hardware_config != NULL);
1419
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1369
    if (hardware_config->use_x86_avx2) {
1420
      f32_tanh_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vtanh_ukernel__avx512skx_expm1minus_rr1_lut4_p4h3ts_perm_div_u64;
1421
      f32_tanh_config.init.f32_tanh = xnn_init_f32_tanh_avx512_expm1minus_rr1_lut4_p4h3_perm_params;
1422
      f32_tanh_config.element_tile = 64;
1423
    } else if (hardware_config->use_x86_avx2) {
1424
      f32_tanh_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vtanh_ukernel__avx2_expm1minus_rr1_lut4_p4h3ts_perm_div_u32;
1370
      f32_tanh_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vtanh_ukernel__avx2_expm1minus_rr1_lut4_p4h3ts_perm_div_u32;
1425
      f32_tanh_config.init.f32_tanh = xnn_init_f32_tanh_avx_expm1minus_rr1_lut4_p4h3_perm_params;
1371
      f32_tanh_config.init.f32_tanh = xnn_init_f32_tanh_avx_expm1minus_rr1_lut4_p4h3_perm_params;
1426
      f32_tanh_config.element_tile = 32;
1372
      f32_tanh_config.element_tile = 32;
Lines 1500-1509 static void init_f32_to_f16_cvt_config(void) { Link Here
1500
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1446
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1501
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1447
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1502
    assert(hardware_config != NULL);
1448
    assert(hardware_config != NULL);
1503
    if (hardware_config->use_x86_avx512skx) {
1449
    if (hardware_config->use_x86_f16c) {
1504
      f32_to_f16_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__avx512skx_u16;
1505
      f32_to_f16_cvt_config.element_tile = 16;
1506
    } else if (hardware_config->use_x86_f16c) {
1507
      f32_to_f16_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__f16c_u16;
1450
      f32_to_f16_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__f16c_u16;
1508
      f32_to_f16_cvt_config.init.f32_f16_cvt = xnn_init_f32_f16_cvt_f16c_params;
1451
      f32_to_f16_cvt_config.init.f32_f16_cvt = xnn_init_f32_f16_cvt_f16c_params;
1509
      f32_to_f16_cvt_config.element_tile = 16;
1452
      f32_to_f16_cvt_config.element_tile = 16;
Lines 1571-1581 static void init_f32_to_qs8_cvt_config(void) { Link Here
1571
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1514
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1572
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1515
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1573
    assert(hardware_config != NULL);
1516
    assert(hardware_config != NULL);
1574
    if (hardware_config->use_x86_avx512skx) {
1517
    if (hardware_config->use_x86_avx2) {
1575
      f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__avx512skx_u128;
1576
      f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx512_params;
1577
      f32_to_qs8_cvt_config.element_tile = 128;
1578
    } else if (hardware_config->use_x86_avx2) {
1579
      f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__avx2_u64;
1518
      f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__avx2_u64;
1580
      f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx2_params;
1519
      f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx2_params;
1581
      f32_to_qs8_cvt_config.element_tile = 64;
1520
      f32_to_qs8_cvt_config.element_tile = 64;
Lines 1645-1655 static void init_f32_to_qu8_cvt_config(void) { Link Here
1645
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1584
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1646
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1585
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1647
    assert(hardware_config != NULL);
1586
    assert(hardware_config != NULL);
1648
    if (hardware_config->use_x86_avx512skx) {
1587
    if (hardware_config->use_x86_avx2) {
1649
      f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__avx512skx_u128;
1650
      f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx512_params;
1651
      f32_to_qu8_cvt_config.element_tile = 128;
1652
    } else if (hardware_config->use_x86_avx2) {
1653
      f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__avx2_u64;
1588
      f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__avx2_u64;
1654
      f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx2_params;
1589
      f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx2_params;
1655
      f32_to_qu8_cvt_config.element_tile = 64;
1590
      f32_to_qu8_cvt_config.element_tile = 64;
Lines 1939-1949 static void init_qs8_to_f32_cvt_config(void) { Link Here
1939
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1874
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1940
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1875
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1941
    assert(hardware_config != NULL);
1876
    assert(hardware_config != NULL);
1942
    if (hardware_config->use_x86_avx512skx) {
1877
    if (hardware_config->use_x86_avx2) {
1943
      qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__avx512skx_u32;
1944
      qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx512_params;
1945
      qs8_to_f32_cvt_config.element_tile = 32;
1946
    } else if (hardware_config->use_x86_avx2) {
1947
      qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__avx2_u16;
1878
      qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__avx2_u16;
1948
      qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params;
1879
      qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params;
1949
      qs8_to_f32_cvt_config.element_tile = 16;
1880
      qs8_to_f32_cvt_config.element_tile = 16;
Lines 2161-2171 static void init_qu8_to_f32_cvt_config(void) { Link Here
2161
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
2092
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
2162
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2093
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2163
    assert(hardware_config != NULL);
2094
    assert(hardware_config != NULL);
2164
    if (hardware_config->use_x86_avx512skx) {
2095
    if (hardware_config->use_x86_avx2) {
2165
      qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__avx512skx_u32;
2166
      qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx512_params;
2167
      qu8_to_f32_cvt_config.element_tile = 32;
2168
    } else if (hardware_config->use_x86_avx2) {
2169
      qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__avx2_u16;
2096
      qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__avx2_u16;
2170
      qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params;
2097
      qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params;
2171
      qu8_to_f32_cvt_config.element_tile = 16;
2098
      qu8_to_f32_cvt_config.element_tile = 16;
(-)a/third_party/xnnpack/src/src/configs/x8-lut-config.c (-9 / +1 lines)
Lines 36-50 static void init_x8_lut_config(void) { Link Here
36
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
36
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
37
    assert(hardware_config != NULL);
37
    assert(hardware_config != NULL);
38
38
39
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
39
    if (hardware_config->use_x86_avx2) {
40
      if (hardware_config->use_x86_avx512vbmi) {
41
        x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u128;
42
        x8_lut_config.tile_size = 128;
43
      } else {
44
        x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx512skx_vpshufb_u64;
45
        x8_lut_config.tile_size = 64;
46
      }
47
    } else if (hardware_config->use_x86_avx2) {
48
      x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx2_u128;
40
      x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx2_u128;
49
      x8_lut_config.tile_size = 128;
41
      x8_lut_config.tile_size = 128;
50
    } else if (hardware_config->use_x86_avx) {
42
    } else if (hardware_config->use_x86_avx) {

Return to bug 931623