Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 933827
Collapse All | Expand All

(-)a/skia/BUILD.gn (-22 lines)
Lines 769-800 Link Here
769
    }
769
    }
770
    visibility = [ ":skcms" ]
770
    visibility = [ ":skcms" ]
771
  }
771
  }
772
  skia_source_set("skcms_TransformSkx") {
773
    sources = skcms_TransformSkx
774
    if (!is_win) {
775
      cflags = [
776
        "-w",
777
        "-mavx512f",
778
        "-mavx512dq",
779
        "-mavx512cd",
780
        "-mavx512bw",
781
        "-mavx512vl",
782
        "-std=c11",
783
      ]
784
    } else {
785
      cflags = [ "/arch:AVX512" ]
786
    }
787
    visibility = [ ":skcms" ]
788
  }
789
} else {
772
} else {
790
  skia_source_set("skcms_TransformHsw") {
773
  skia_source_set("skcms_TransformHsw") {
791
    sources = []
774
    sources = []
792
    visibility = [ ":skcms" ]
775
    visibility = [ ":skcms" ]
793
  }
776
  }
794
  skia_source_set("skcms_TransformSkx") {
795
    sources = []
796
    visibility = [ ":skcms" ]
797
  }
798
}
777
}
799
778
800
source_set("skcms_TransformBaseline_and_public") {
779
source_set("skcms_TransformBaseline_and_public") {
Lines 823-829 Link Here
823
  deps = [
802
  deps = [
824
    ":skcms_TransformBaseline_and_public",
803
    ":skcms_TransformBaseline_and_public",
825
    ":skcms_TransformHsw",
804
    ":skcms_TransformHsw",
826
    ":skcms_TransformSkx",
827
  ]
805
  ]
828
  public =
806
  public =
829
      rebase_path(skcms_public_headers, ".", "//third_party/skia/modules/skcms")
807
      rebase_path(skcms_public_headers, ".", "//third_party/skia/modules/skcms")
(-)a/third_party/skia/modules/skcms/skcms.cc (-5 lines)
Lines 2787-2797 Link Here
2787
    auto run = baseline::run_program;
2787
    auto run = baseline::run_program;
2788
    switch (cpu_type()) {
2788
    switch (cpu_type()) {
2789
        case CpuType::SKX:
2789
        case CpuType::SKX:
2790
            #if !defined(SKCMS_DISABLE_SKX)
2791
                run = skx::run_program;
2792
                break;
2793
            #endif
2794
2795
        case CpuType::HSW:
2790
        case CpuType::HSW:
2796
            #if !defined(SKCMS_DISABLE_HSW)
2791
            #if !defined(SKCMS_DISABLE_HSW)
2797
                run = hsw::run_program;
2792
                run = hsw::run_program;
(-)a/third_party/xnnpack/BUILD.gn (-359 lines)
Lines 61-73 Link Here
61
if (current_cpu == "x64" || current_cpu == "x86") {
61
if (current_cpu == "x64" || current_cpu == "x86") {
62
  xnnpack_deps = [
62
  xnnpack_deps = [
63
    ":amalgam_avx-no-avx2-no-f16c-no-fma",
63
    ":amalgam_avx-no-avx2-no-f16c-no-fma",
64
    ":amalgam_avx2-avxvnni-f16c-fma",
65
    ":amalgam_avx512f",
66
    ":amalgam_f16c-fma-avx2",
64
    ":amalgam_f16c-fma-avx2",
67
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl",
68
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vbmi",
69
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni",
70
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni-gfni",
71
    ":amalgam_f16c-fma-no-avx2",
65
    ":amalgam_f16c-fma-no-avx2",
72
    ":amalgam_f16c-no-avx2-no-fma",
66
    ":amalgam_f16c-no-avx2-no-fma",
73
    ":amalgam_sse2-no-sse3",
67
    ":amalgam_sse2-no-sse3",
Lines 83-95 Link Here
83
77
84
  xnnpack_standalone_deps = [
78
  xnnpack_standalone_deps = [
85
    ":amalgam_avx-no-avx2-no-f16c-no-fma_standalone",
79
    ":amalgam_avx-no-avx2-no-f16c-no-fma_standalone",
86
    ":amalgam_avx2-avxvnni-f16c-fma_standalone",
87
    ":amalgam_avx512f_standalone",
88
    ":amalgam_f16c-fma-avx2_standalone",
80
    ":amalgam_f16c-fma-avx2_standalone",
89
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vbmi_standalone",
90
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni-gfni_standalone",
91
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni_standalone",
92
    ":amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl_standalone",
93
    ":amalgam_f16c-fma-no-avx2_standalone",
81
    ":amalgam_f16c-fma-no-avx2_standalone",
94
    ":amalgam_f16c-no-avx2-no-fma_standalone",
82
    ":amalgam_f16c-no-avx2-no-fma_standalone",
95
    ":amalgam_sse2-no-sse3_standalone",
83
    ":amalgam_sse2-no-sse3_standalone",
Lines 304-405 Link Here
304
    }
292
    }
305
  }
293
  }
306
294
307
  source_set("amalgam_avx2-avxvnni-f16c-fma") {
308
    cflags = [
309
      "-mavx2",
310
      "-mavxvnni",
311
      "-mf16c",
312
      "-mfma",
313
    ]
314
315
    sources = [ "src/src/amalgam/gen/avxvnni.c" ]
316
317
    configs -= [ "//build/config/compiler:chromium_code" ]
318
    configs += [ "//build/config/compiler:no_chromium_code" ]
319
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
320
321
    deps = [
322
      "//third_party/cpuinfo",
323
      "//third_party/fp16",
324
      "//third_party/fxdiv",
325
      "//third_party/pthreadpool",
326
    ]
327
328
    public_configs = [ ":xnnpack_config" ]
329
  }
330
331
  # This is a target that cannot depend on //base.
332
  source_set("amalgam_avx2-avxvnni-f16c-fma_standalone") {
333
    cflags = [
334
      "-mavx2",
335
      "-mavxvnni",
336
      "-mf16c",
337
      "-mfma",
338
    ]
339
340
    sources = [ "src/src/amalgam/gen/avxvnni.c" ]
341
342
    configs -= [ "//build/config/compiler:chromium_code" ]
343
    configs += [ "//build/config/compiler:no_chromium_code" ]
344
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
345
346
    deps = [
347
      "//third_party/cpuinfo",
348
      "//third_party/fp16",
349
      "//third_party/fxdiv",
350
      "//third_party/pthreadpool:pthreadpool_standalone",
351
    ]
352
353
    public_configs = [ ":xnnpack_config" ]
354
355
    if (!(is_android && use_order_profiling)) {
356
      assert_no_deps = [ "//base" ]
357
    }
358
  }
359
360
  source_set("amalgam_avx512f") {
361
    cflags = [ "-mavx512f" ]
362
363
    sources = [ "src/src/amalgam/gen/avx512f.c" ]
364
365
    configs -= [ "//build/config/compiler:chromium_code" ]
366
    configs += [ "//build/config/compiler:no_chromium_code" ]
367
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
368
369
    deps = [
370
      "//third_party/cpuinfo",
371
      "//third_party/fp16",
372
      "//third_party/fxdiv",
373
      "//third_party/pthreadpool",
374
    ]
375
376
    public_configs = [ ":xnnpack_config" ]
377
  }
378
379
  # This is a target that cannot depend on //base.
380
  source_set("amalgam_avx512f_standalone") {
381
    cflags = [ "-mavx512f" ]
382
383
    sources = [ "src/src/amalgam/gen/avx512f.c" ]
384
385
    configs -= [ "//build/config/compiler:chromium_code" ]
386
    configs += [ "//build/config/compiler:no_chromium_code" ]
387
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
388
389
    deps = [
390
      "//third_party/cpuinfo",
391
      "//third_party/fp16",
392
      "//third_party/fxdiv",
393
      "//third_party/pthreadpool:pthreadpool_standalone",
394
    ]
395
396
    public_configs = [ ":xnnpack_config" ]
397
398
    if (!(is_android && use_order_profiling)) {
399
      assert_no_deps = [ "//base" ]
400
    }
401
  }
402
403
  source_set("amalgam_f16c-fma-avx2") {
295
  source_set("amalgam_f16c-fma-avx2") {
404
    cflags = [
296
    cflags = [
405
      "-mavx2",
297
      "-mavx2",
Lines 435-691 Link Here
435
327
436
    configs -= [ "//build/config/compiler:chromium_code" ]
328
    configs -= [ "//build/config/compiler:chromium_code" ]
437
    configs += [ "//build/config/compiler:no_chromium_code" ]
329
    configs += [ "//build/config/compiler:no_chromium_code" ]
438
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
439
440
    deps = [
441
      "//third_party/cpuinfo",
442
      "//third_party/fp16",
443
      "//third_party/fxdiv",
444
      "//third_party/pthreadpool:pthreadpool_standalone",
445
    ]
446
447
    public_configs = [ ":xnnpack_config" ]
448
449
    if (!(is_android && use_order_profiling)) {
450
      assert_no_deps = [ "//base" ]
451
    }
452
  }
453
454
  source_set("amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl") {
455
    cflags = [
456
      "-mavx512bw",
457
      "-mavx512cd",
458
      "-mavx512dq",
459
      "-mavx512f",
460
      "-mavx512vl",
461
      "-mf16c",
462
      "-mfma",
463
    ]
464
465
    sources = [ "src/src/amalgam/gen/avx512skx.c" ]
466
467
    configs -= [ "//build/config/compiler:chromium_code" ]
468
    configs += [ "//build/config/compiler:no_chromium_code" ]
469
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
470
471
    deps = [
472
      "//third_party/cpuinfo",
473
      "//third_party/fp16",
474
      "//third_party/fxdiv",
475
      "//third_party/pthreadpool",
476
    ]
477
478
    public_configs = [ ":xnnpack_config" ]
479
  }
480
481
  # This is a target that cannot depend on //base.
482
  source_set(
483
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl_standalone") {
484
    cflags = [
485
      "-mavx512bw",
486
      "-mavx512cd",
487
      "-mavx512dq",
488
      "-mavx512f",
489
      "-mavx512vl",
490
      "-mf16c",
491
      "-mfma",
492
    ]
493
494
    sources = [ "src/src/amalgam/gen/avx512skx.c" ]
495
496
    configs -= [ "//build/config/compiler:chromium_code" ]
497
    configs += [ "//build/config/compiler:no_chromium_code" ]
498
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
499
500
    deps = [
501
      "//third_party/cpuinfo",
502
      "//third_party/fp16",
503
      "//third_party/fxdiv",
504
      "//third_party/pthreadpool:pthreadpool_standalone",
505
    ]
506
507
    public_configs = [ ":xnnpack_config" ]
508
509
    if (!(is_android && use_order_profiling)) {
510
      assert_no_deps = [ "//base" ]
511
    }
512
  }
513
514
  source_set(
515
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vbmi") {
516
    cflags = [
517
      "-mavx512bw",
518
      "-mavx512cd",
519
      "-mavx512dq",
520
      "-mavx512f",
521
      "-mavx512vbmi",
522
      "-mavx512vl",
523
      "-mf16c",
524
      "-mfma",
525
    ]
526
527
    sources = [ "src/src/amalgam/gen/avx512vbmi.c" ]
528
529
    configs -= [ "//build/config/compiler:chromium_code" ]
530
    configs += [ "//build/config/compiler:no_chromium_code" ]
531
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
532
533
    deps = [
534
      "//third_party/cpuinfo",
535
      "//third_party/fp16",
536
      "//third_party/fxdiv",
537
      "//third_party/pthreadpool",
538
    ]
539
540
    public_configs = [ ":xnnpack_config" ]
541
  }
542
543
  # This is a target that cannot depend on //base.
544
  source_set(
545
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vbmi_standalone") {
546
    cflags = [
547
      "-mavx512bw",
548
      "-mavx512cd",
549
      "-mavx512dq",
550
      "-mavx512f",
551
      "-mavx512vbmi",
552
      "-mavx512vl",
553
      "-mf16c",
554
      "-mfma",
555
    ]
556
557
    sources = [ "src/src/amalgam/gen/avx512vbmi.c" ]
558
559
    configs -= [ "//build/config/compiler:chromium_code" ]
560
    configs += [ "//build/config/compiler:no_chromium_code" ]
561
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
562
563
    deps = [
564
      "//third_party/cpuinfo",
565
      "//third_party/fp16",
566
      "//third_party/fxdiv",
567
      "//third_party/pthreadpool:pthreadpool_standalone",
568
    ]
569
570
    public_configs = [ ":xnnpack_config" ]
571
572
    if (!(is_android && use_order_profiling)) {
573
      assert_no_deps = [ "//base" ]
574
    }
575
  }
576
577
  source_set(
578
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni") {
579
    cflags = [
580
      "-mavx512bw",
581
      "-mavx512cd",
582
      "-mavx512dq",
583
      "-mavx512f",
584
      "-mavx512vl",
585
      "-mavx512vnni",
586
      "-mf16c",
587
      "-mfma",
588
    ]
589
590
    sources = [ "src/src/amalgam/gen/avx512vnni.c" ]
591
592
    configs -= [ "//build/config/compiler:chromium_code" ]
593
    configs += [ "//build/config/compiler:no_chromium_code" ]
594
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
595
596
    deps = [
597
      "//third_party/cpuinfo",
598
      "//third_party/fp16",
599
      "//third_party/fxdiv",
600
      "//third_party/pthreadpool",
601
    ]
602
603
    public_configs = [ ":xnnpack_config" ]
604
  }
605
606
  # This is a target that cannot depend on //base.
607
  source_set(
608
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni_standalone") {
609
    cflags = [
610
      "-mavx512bw",
611
      "-mavx512cd",
612
      "-mavx512dq",
613
      "-mavx512f",
614
      "-mavx512vl",
615
      "-mavx512vnni",
616
      "-mf16c",
617
      "-mfma",
618
    ]
619
620
    sources = [ "src/src/amalgam/gen/avx512vnni.c" ]
621
622
    configs -= [ "//build/config/compiler:chromium_code" ]
623
    configs += [ "//build/config/compiler:no_chromium_code" ]
624
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
625
626
    deps = [
627
      "//third_party/cpuinfo",
628
      "//third_party/fp16",
629
      "//third_party/fxdiv",
630
      "//third_party/pthreadpool:pthreadpool_standalone",
631
    ]
632
633
    public_configs = [ ":xnnpack_config" ]
634
635
    if (!(is_android && use_order_profiling)) {
636
      assert_no_deps = [ "//base" ]
637
    }
638
  }
639
640
  source_set(
641
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni-gfni") {
642
    cflags = [
643
      "-mavx512bw",
644
      "-mavx512cd",
645
      "-mavx512dq",
646
      "-mavx512f",
647
      "-mavx512vl",
648
      "-mavx512vnni",
649
      "-mf16c",
650
      "-mfma",
651
      "-mgfni",
652
    ]
653
654
    sources = [ "src/src/amalgam/gen/avx512vnnigfni.c" ]
655
656
    configs -= [ "//build/config/compiler:chromium_code" ]
657
    configs += [ "//build/config/compiler:no_chromium_code" ]
658
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
659
660
    deps = [
661
      "//third_party/cpuinfo",
662
      "//third_party/fp16",
663
      "//third_party/fxdiv",
664
      "//third_party/pthreadpool",
665
    ]
666
667
    public_configs = [ ":xnnpack_config" ]
668
  }
669
670
  # This is a target that cannot depend on //base.
671
  source_set(
672
      "amalgam_f16c-fma-avx512f-avx512cd-avx512bw-avx512dq-avx512vl-avx512vnni-gfni_standalone") {
673
    cflags = [
674
      "-mavx512bw",
675
      "-mavx512cd",
676
      "-mavx512dq",
677
      "-mavx512f",
678
      "-mavx512vl",
679
      "-mavx512vnni",
680
      "-mf16c",
681
      "-mfma",
682
      "-mgfni",
683
    ]
684
685
    sources = [ "src/src/amalgam/gen/avx512vnnigfni.c" ]
686
687
    configs -= [ "//build/config/compiler:chromium_code" ]
688
    configs += [ "//build/config/compiler:no_chromium_code" ]
689
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
330
    configs += [ "//build/config/sanitizers:cfi_icall_generalize_pointers" ]
690
331
691
    deps = [
332
    deps = [
(-)a/third_party/xnnpack/src/src/configs/binary-elementwise-config.c (-60 / +9 lines)
Lines 338-350 Link Here
338
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
338
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
339
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
339
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
340
    assert(hardware_config != NULL);
340
    assert(hardware_config != NULL);
341
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
341
    if (hardware_config->use_x86_avx) {
342
      f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__avx512f_u32;
343
      f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx512f_u32;
344
      f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx512f_u32;
345
      f32_vadd_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params;
346
      f32_vadd_config.minmax.element_tile = 32;
347
    } else if (hardware_config->use_x86_avx) {
348
      f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__avx_u16;
342
      f32_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vadd_minmax_ukernel__avx_u16;
349
      f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_u16;
343
      f32_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_u16;
350
      f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_u16;
344
      f32_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vaddc_minmax_ukernel__avx_u16;
Lines 429-441 Link Here
429
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
423
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
430
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
424
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
431
    assert(hardware_config != NULL);
425
    assert(hardware_config != NULL);
432
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
426
    if (hardware_config->use_x86_avx) {
433
      f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__avx512f_u32;
434
      f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__avx512f_u32;
435
      f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__avx512f_u32;
436
      f32_vdiv_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params;
437
      f32_vdiv_config.minmax.element_tile = 32;
438
    } else if (hardware_config->use_x86_avx) {
439
      f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__avx_u16;
427
      f32_vdiv_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdiv_minmax_ukernel__avx_u16;
440
      f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__avx_u16;
428
      f32_vdiv_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vdivc_minmax_ukernel__avx_u16;
441
      f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__avx_u16;
429
      f32_vdiv_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrdivc_minmax_ukernel__avx_u16;
Lines 517-528 Link Here
517
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
505
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
518
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
506
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
519
    assert(hardware_config != NULL);
507
    assert(hardware_config != NULL);
520
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
508
    if (hardware_config->use_x86_avx) {
521
      f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx512f_u32;
522
      f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_u32;
523
      f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx512f_u32;
524
      f32_vmax_config.minmax.element_tile = 32;
525
    } else if (hardware_config->use_x86_avx) {
526
      f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx_u16;
509
      f32_vmax_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmax_ukernel__avx_u16;
527
      f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16;
510
      f32_vmax_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16;
528
      f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16;
511
      f32_vmax_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmaxc_ukernel__avx_u16;
Lines 590-601 Link Here
590
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
573
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
591
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
574
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
592
    assert(hardware_config != NULL);
575
    assert(hardware_config != NULL);
593
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
576
    if (hardware_config->use_x86_avx) {
594
      f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx512f_u32;
595
      f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_u32;
596
      f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx512f_u32;
597
      f32_vmin_config.minmax.element_tile = 32;
598
    } else if (hardware_config->use_x86_avx) {
599
      f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx_u16;
577
      f32_vmin_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmin_ukernel__avx_u16;
600
      f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16;
578
      f32_vmin_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16;
601
      f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16;
579
      f32_vmin_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vminc_ukernel__avx_u16;
Lines 666-678 Link Here
666
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
644
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
667
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
645
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
668
    assert(hardware_config != NULL);
646
    assert(hardware_config != NULL);
669
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
647
    if (hardware_config->use_x86_avx) {
670
      f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__avx512f_u32;
671
      f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx512f_u32;
672
      f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx512f_u32;
673
      f32_vmul_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params;
674
      f32_vmul_config.minmax.element_tile = 32;
675
    } else if (hardware_config->use_x86_avx) {
676
      f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__avx_u16;
648
      f32_vmul_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmul_minmax_ukernel__avx_u16;
677
      f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_u16;
649
      f32_vmul_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_u16;
678
      f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_u16;
650
      f32_vmul_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vmulc_minmax_ukernel__avx_u16;
Lines 757-769 Link Here
757
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
729
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
758
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
730
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
759
    assert(hardware_config != NULL);
731
    assert(hardware_config != NULL);
760
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
732
    if (hardware_config->use_x86_avx) {
761
      f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__avx512f_u32;
762
      f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__avx512f_u32;
763
      f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__avx512f_u32;
764
      f32_vsub_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params;
765
      f32_vsub_config.minmax.element_tile = 32;
766
    } else if (hardware_config->use_x86_avx) {
767
      f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__avx_u16;
733
      f32_vsub_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsub_minmax_ukernel__avx_u16;
768
      f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__avx_u16;
734
      f32_vsub_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsubc_minmax_ukernel__avx_u16;
769
      f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__avx_u16;
735
      f32_vsub_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vrsubc_minmax_ukernel__avx_u16;
Lines 845-856 Link Here
845
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
811
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
846
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
812
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
847
    assert(hardware_config != NULL);
813
    assert(hardware_config != NULL);
848
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
814
    if (hardware_config->use_x86_avx) {
849
      f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx512f_u32;
850
      f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_u32;
851
      f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx512f_u32;
852
      f32_vsqrdiff_config.minmax.element_tile = 32;
853
    } else if (hardware_config->use_x86_avx) {
854
      f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx_u16;
815
      f32_vsqrdiff_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiff_ukernel__avx_u16;
855
      f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16;
816
      f32_vsqrdiff_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16;
856
      f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16;
817
      f32_vsqrdiff_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_f32_vsqrdiffc_ukernel__avx_u16;
Lines 907-919 Link Here
907
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
868
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
908
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
869
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
909
    assert(hardware_config != NULL);
870
    assert(hardware_config != NULL);
910
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
871
    if (hardware_config->use_x86_avx2) {
911
      qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16;
912
      qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16;
913
      qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16;
914
      qs8_vadd_config.init.qs8_add = xnn_init_qs8_add_minmax_avx512_params;
915
      qs8_vadd_config.minmax.element_tile = 16;
916
    } else if (hardware_config->use_x86_avx2) {
917
      qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_u16;
872
      qs8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_u16;
918
      qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16;
873
      qs8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16;
919
      qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16;
874
      qs8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16;
Lines 1045-1057 Link Here
1045
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1000
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1046
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1001
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1047
    assert(hardware_config != NULL);
1002
    assert(hardware_config != NULL);
1048
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1003
    if (hardware_config->use_x86_avx2) {
1049
      qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_u16;
1050
      qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16;
1051
      qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_u16;
1052
      qu8_vadd_config.init.qu8_add = xnn_init_qu8_add_minmax_avx512_params;
1053
      qu8_vadd_config.minmax.element_tile = 16;
1054
    } else if (hardware_config->use_x86_avx2) {
1055
      qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_u16;
1004
      qu8_vadd_config.minmax.op_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_u16;
1056
      qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16;
1005
      qu8_vadd_config.minmax.opc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16;
1057
      qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16;
1006
      qu8_vadd_config.minmax.ropc_ukernel = (xnn_vbinary_ukernel_fn) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_u16;
(-)a/third_party/xnnpack/src/src/configs/dwconv-config.c (-83 / +4 lines)
Lines 301-348 Link Here
301
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
301
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
302
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
302
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
303
    assert(hardware_config != NULL);
303
    assert(hardware_config != NULL);
304
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
304
    if (hardware_config->use_x86_fma3) {
305
      f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p16c__avx512f;
306
      f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_scalar_params;
307
      f32_dwconv_config[0].channel_tile = 16;
308
      f32_dwconv_config[0].channel_subtile = 16;
309
      f32_dwconv_config[0].channel_round = 1;
310
      f32_dwconv_config[0].primary_tile = 3;
311
312
      f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p16c__avx512f;
313
      f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_scalar_params;
314
      f32_dwconv_config[1].channel_tile = 16;
315
      f32_dwconv_config[1].channel_subtile = 16;
316
      f32_dwconv_config[1].channel_round = 1;
317
      f32_dwconv_config[1].primary_tile = 4;
318
319
      f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p16c__avx512f;
320
      f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_scalar_params;
321
      f32_dwconv_config[2].channel_tile = 16;
322
      f32_dwconv_config[2].channel_subtile = 16;
323
      f32_dwconv_config[2].channel_round = 1;
324
      f32_dwconv_config[2].primary_tile = 9;
325
326
      // Multipass microkernel "acc" value should match unipass and also match across different hardware config.
327
      // Accumulation (FMA) can produce different results, which results in tests only failing on certain platforms.
328
      #if XNN_ENABLE_DWCONV_MULTIPASS
329
        f32_dwconv_config[3].minmax.multipass = (xnn_dwconv_multipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_5f5m5l32c16s1r__avx512f;
330
        f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
331
        f32_dwconv_config[3].channel_tile = 32;
332
        f32_dwconv_config[3].channel_subtile = 16;
333
        f32_dwconv_config[3].channel_round = 1;
334
        f32_dwconv_config[3].primary_tile = 5;
335
        f32_dwconv_config[3].middle_tile = 5;
336
        f32_dwconv_config[3].last_tile = 5;
337
      #else
338
        f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p16c__avx512f;
339
        f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params;
340
        f32_dwconv_config[3].channel_tile = 16;
341
        f32_dwconv_config[3].channel_subtile = 16;
342
        f32_dwconv_config[3].channel_round = 1;
343
        f32_dwconv_config[3].primary_tile = 25;
344
      #endif  // XNN_ENABLE_DWCONV_MULTIPASS
345
    } else if (hardware_config->use_x86_fma3) {
346
      f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p16c__fma3;
305
      f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p16c__fma3;
347
      f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_avx_params;
306
      f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_avx_params;
348
      f32_dwconv_config[0].channel_tile = 16;
307
      f32_dwconv_config[0].channel_tile = 16;
Lines 768-790 Link Here
768
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
727
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
769
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
728
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
770
    assert(hardware_config != NULL);
729
    assert(hardware_config != NULL);
771
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
730
    if (hardware_config->use_x86_avx2) {
772
      qs8_qc8w_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p32c__avx512skx_mul32;
773
      qs8_qc8w_dwconv_config[0].init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params;
774
      qs8_qc8w_dwconv_config[0].channel_tile = 32;
775
      qs8_qc8w_dwconv_config[0].channel_subtile = 32;
776
      qs8_qc8w_dwconv_config[0].channel_round = 1;
777
      qs8_qc8w_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32;
778
      qs8_qc8w_dwconv_config[1].init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params;
779
      qs8_qc8w_dwconv_config[1].channel_tile = 32;
780
      qs8_qc8w_dwconv_config[1].channel_subtile = 32;
781
      qs8_qc8w_dwconv_config[1].channel_round = 1;
782
      qs8_qc8w_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32;
783
      qs8_qc8w_dwconv_config[2].init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params;
784
      qs8_qc8w_dwconv_config[2].channel_tile = 32;
785
      qs8_qc8w_dwconv_config[2].channel_subtile = 32;
786
      qs8_qc8w_dwconv_config[2].channel_round = 1;
787
    } else if (hardware_config->use_x86_avx2) {
788
      qs8_qc8w_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx2_mul32;
731
      qs8_qc8w_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_qc8w_dwconv_minmax_fp32_ukernel_3p16c__avx2_mul32;
789
      qs8_qc8w_dwconv_config[0].init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx2_params;
732
      qs8_qc8w_dwconv_config[0].init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx2_params;
790
      qs8_qc8w_dwconv_config[0].channel_tile = 16;
733
      qs8_qc8w_dwconv_config[0].channel_tile = 16;
Lines 982-999 Link Here
982
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
925
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
983
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
926
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
984
    assert(hardware_config != NULL);
927
    assert(hardware_config != NULL);
985
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
928
    if (hardware_config->use_x86_avx2) {
986
      qs8_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32;
987
      qs8_dwconv_config[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
988
      qs8_dwconv_config[0].channel_tile = 32;
989
      qs8_dwconv_config[0].channel_subtile = 32;
990
      qs8_dwconv_config[0].channel_round = 1;
991
      qs8_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32;
992
      qs8_dwconv_config[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
993
      qs8_dwconv_config[1].channel_tile = 32;
994
      qs8_dwconv_config[1].channel_subtile = 32;
995
      qs8_dwconv_config[1].channel_round = 1;
996
    } else if (hardware_config->use_x86_avx2) {
997
      qs8_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32;
929
      qs8_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qs8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32;
998
      qs8_dwconv_config[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
930
      qs8_dwconv_config[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
999
      qs8_dwconv_config[0].channel_tile = 16;
931
      qs8_dwconv_config[0].channel_tile = 16;
Lines 1146-1163 Link Here
1146
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1078
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1147
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1079
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1148
    assert(hardware_config != NULL);
1080
    assert(hardware_config != NULL);
1149
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1081
    if (hardware_config->use_x86_avx2) {
1150
      qu8_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p32c__avx512skx_mul32;
1151
      qu8_dwconv_config[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
1152
      qu8_dwconv_config[0].channel_tile = 32;
1153
      qu8_dwconv_config[0].channel_subtile = 32;
1154
      qu8_dwconv_config[0].channel_round = 1;
1155
      qu8_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_25p32c__avx512skx_mul32;
1156
      qu8_dwconv_config[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
1157
      qu8_dwconv_config[1].channel_tile = 32;
1158
      qu8_dwconv_config[1].channel_subtile = 32;
1159
      qu8_dwconv_config[1].channel_round = 1;
1160
    } else if (hardware_config->use_x86_avx2) {
1161
      qu8_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32;
1082
      qu8_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_qu8_dwconv_minmax_fp32_ukernel_9p16c__avx2_mul32;
1162
      qu8_dwconv_config[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
1083
      qu8_dwconv_config[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
1163
      qu8_dwconv_config[0].channel_tile = 16;
1084
      qu8_dwconv_config[0].channel_tile = 16;
(-)a/third_party/xnnpack/src/src/configs/gemm-config.c (-329 / +9 lines)
Lines 768-784 Link Here
768
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
768
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
769
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
769
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
770
    assert(hardware_config != NULL);
770
    assert(hardware_config != NULL);
771
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
771
    if (hardware_config->use_x86_fma3) {
772
      f32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
773
      f32_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
774
      f32_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
775
      f32_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
776
      f32_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
777
      f32_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_gemm_gio_w;
778
      f32_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x32_packw_gemm_goi_ukernel_x16__avx512f_u4_prfm;
779
      f32_gemm_config.mr = 7;
780
      f32_gemm_config.nr = 16;
781
    } else if (hardware_config->use_x86_fma3) {
782
      switch (cpuinfo_get_core(0)->uarch) {
772
      switch (cpuinfo_get_core(0)->uarch) {
783
        case cpuinfo_uarch_zen:
773
        case cpuinfo_uarch_zen:
784
        case cpuinfo_uarch_dhyana:
774
        case cpuinfo_uarch_dhyana:
Lines 1143-1156 Link Here
1143
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1133
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1144
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1134
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1145
    assert(hardware_config != NULL);
1135
    assert(hardware_config != NULL);
1146
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1136
    if (hardware_config->use_x86_avx2) {
1147
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_1x32__avx512skx_broadcast);
1148
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_7x32__avx512skx_broadcast);
1149
      f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512_params;
1150
      f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_f32_qc4w_gemm_goi_w;
1151
      f32_qc4w_gemm_config.mr = 7;
1152
      f32_qc4w_gemm_config.nr = 32;
1153
    } else if (hardware_config->use_x86_avx2) {
1154
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_1x16__avx2_broadcast);
1137
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_1x16__avx2_broadcast);
1155
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_3x16__avx2_broadcast);
1138
      f32_qc4w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc4w_gemm_minmax_ukernel_3x16__avx2_broadcast);
1156
      f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx_params;
1139
      f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx_params;
Lines 1303-1317 Link Here
1303
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1286
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1304
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1287
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1305
    assert(hardware_config != NULL);
1288
    assert(hardware_config != NULL);
1306
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1289
    if (hardware_config->use_x86_avx2) {
1307
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_1x32__avx512skx_broadcast);
1308
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_7x32__avx512skx_broadcast);
1309
      f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
1310
      f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_f32_qs8w_gemm_gio_w;
1311
      f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_x8_packw_gemm_goi_ukernel_x32__scalar_int_u2;
1312
      f32_qc8w_gemm_config.mr = 7;
1313
      f32_qc8w_gemm_config.nr = 32;
1314
    } else if (hardware_config->use_x86_avx2) {
1315
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_1x16__avx2_broadcast);
1290
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_1x16__avx2_broadcast);
1316
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_5x16__avx2_broadcast);
1291
      f32_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_f32_qc8w_gemm_minmax_ukernel_5x16__avx2_broadcast);
1317
      f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avx_params;
1292
      f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avx_params;
Lines 1498-1546 Link Here
1498
  #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
1473
  #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
1499
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1474
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1500
    assert(hardware_config != NULL);
1475
    assert(hardware_config != NULL);
1501
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnnigfni) {
1476
    if (hardware_config->use_x86_avx2) {
1502
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx512vnnigfni_prfm);
1503
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx512vnnigfni_prfm);
1504
      qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avxvnni_params;
1505
      qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1506
      qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1507
      qd8_f16_qc4w_gemm_config.mr = 7;
1508
      qd8_f16_qc4w_gemm_config.nr = 8;
1509
      qd8_f16_qc4w_gemm_config.log2_kr = 3;
1510
      qd8_f16_qc4w_gemm_config.planes = 2;
1511
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
1512
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx512vnni_prfm);
1513
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_7x8c8__avx512vnni_prfm);
1514
      qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avxvnni_params;
1515
      qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1516
      qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1517
      qd8_f16_qc4w_gemm_config.mr = 7;
1518
      qd8_f16_qc4w_gemm_config.nr = 8;
1519
      qd8_f16_qc4w_gemm_config.log2_kr = 3;
1520
      qd8_f16_qc4w_gemm_config.planes = 2;
1521
    #if XNN_ENABLE_AVXVNNI
1522
      } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) {
1523
        qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm);
1524
        qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm);
1525
        qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avxvnni_params;
1526
        qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1527
        qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1528
        qd8_f16_qc4w_gemm_config.mr = 5;
1529
        qd8_f16_qc4w_gemm_config.nr = 8;
1530
        qd8_f16_qc4w_gemm_config.log2_kr = 3;
1531
        qd8_f16_qc4w_gemm_config.planes = 2;
1532
    #endif
1533
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1534
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx512skx);
1535
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_5x8c8__avx512skx);
1536
      qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avx_params;
1537
      qd8_f16_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1538
      qd8_f16_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1539
      qd8_f16_qc4w_gemm_config.mr = 5;
1540
      qd8_f16_qc4w_gemm_config.nr = 8;
1541
      qd8_f16_qc4w_gemm_config.log2_kr = 3;
1542
      qd8_f16_qc4w_gemm_config.planes = 2;
1543
    } else if (hardware_config->use_x86_avx2) {
1544
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx2);
1477
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_1x8c8__avx2);
1545
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__avx2);
1478
      qd8_f16_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc4w_gemm_minmax_ukernel_3x8c8__avx2);
1546
      qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avx_params;
1479
      qd8_f16_qc4w_gemm_config.init.f16_qc4w = xnn_init_f16_qc4w_minmax_avx_params;
Lines 1643-1711 Link Here
1643
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1576
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1644
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1577
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1645
    assert(hardware_config != NULL);
1578
    assert(hardware_config != NULL);
1646
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnnigfni) {
1579
    if (hardware_config->use_x86_avx2) {
1647
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnnigfni_prfm);
1648
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnnigfni_prfm);
1649
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512vnni_params;
1650
      qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1651
      qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1652
      qd8_f32_qc4w_gemm_config.mr = 7;
1653
      qd8_f32_qc4w_gemm_config.nr = 16;
1654
      qd8_f32_qc4w_gemm_config.log2_kr = 3;
1655
      qd8_f32_qc4w_gemm_config.planes = 2;
1656
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
1657
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm);
1658
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512vnni_prfm);
1659
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512vnni_params;
1660
      qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1661
      qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1662
      qd8_f32_qc4w_gemm_config.mr = 7;
1663
      qd8_f32_qc4w_gemm_config.nr = 16;
1664
      qd8_f32_qc4w_gemm_config.log2_kr = 3;
1665
      qd8_f32_qc4w_gemm_config.planes = 2;
1666
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnnigfni) {
1667
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx512vnnigfni_prfm);
1668
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx512vnnigfni_prfm);
1669
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512vnni_params;
1670
      qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1671
      qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1672
      qd8_f32_qc4w_gemm_config.mr = 7;
1673
      qd8_f32_qc4w_gemm_config.nr = 8;
1674
      qd8_f32_qc4w_gemm_config.log2_kr = 3;
1675
      qd8_f32_qc4w_gemm_config.planes = 2;
1676
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
1677
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx512vnni_prfm);
1678
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x8c8__avx512vnni_prfm);
1679
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx512vnni_params;
1680
      qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1681
      qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1682
      qd8_f32_qc4w_gemm_config.mr = 7;
1683
      qd8_f32_qc4w_gemm_config.nr = 8;
1684
      qd8_f32_qc4w_gemm_config.log2_kr = 3;
1685
      qd8_f32_qc4w_gemm_config.planes = 2;
1686
    #if XNN_ENABLE_AVXVNNI
1687
      } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) {
1688
        qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm);
1689
        qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm);
1690
        qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avxvnni_params;
1691
        qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1692
        qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1693
        qd8_f32_qc4w_gemm_config.mr = 5;
1694
        qd8_f32_qc4w_gemm_config.nr = 8;
1695
        qd8_f32_qc4w_gemm_config.log2_kr = 3;
1696
        qd8_f32_qc4w_gemm_config.planes = 2;
1697
    #endif
1698
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1699
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x16c8__avx512skx_prfm);
1700
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_7x16c8__avx512skx_prfm);
1701
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_scalar_params;
1702
      qd8_f32_qc4w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_qc4w_gemm_gio_w;
1703
      qd8_f32_qc4w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_qc4w_gemm_goi_w;
1704
      qd8_f32_qc4w_gemm_config.mr = 7;
1705
      qd8_f32_qc4w_gemm_config.nr = 16;
1706
      qd8_f32_qc4w_gemm_config.log2_kr = 3;
1707
      qd8_f32_qc4w_gemm_config.planes = 2;
1708
    } else if (hardware_config->use_x86_avx2) {
1709
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx2);
1580
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_1x8c8__avx2);
1710
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x8c8__avx2);
1581
      qd8_f32_qc4w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc4w_gemm_minmax_ukernel_3x8c8__avx2);
1711
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx_params;
1582
      qd8_f32_qc4w_gemm_config.init.f32_qc4w = xnn_init_f32_qc4w_minmax_avx_params;
Lines 2093-2135 Link Here
2093
  #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
1964
  #elif (XNN_ARCH_X86 || XNN_ARCH_X86_64) && !XNN_PLATFORM_MOBILE
2094
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1965
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2095
    assert(hardware_config != NULL);
1966
    assert(hardware_config != NULL);
2096
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
1967
    if (hardware_config->use_x86_avx2) {
2097
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx512vnni_prfm);
2098
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_7x8c8__avx512vnni_prfm);
2099
      qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx512vnni_prfm);
2100
      qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_7x8c8__avx512vnni_prfm);
2101
      qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_avxvnni_params;
2102
      qd8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2103
      qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2104
      qd8_f16_qc8w_gemm_config.mr = 7;
2105
      qd8_f16_qc8w_gemm_config.nr = 8;
2106
      qd8_f16_qc8w_gemm_config.log2_kr = 3;
2107
    #if XNN_ENABLE_AVXVNNI
2108
      } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) {
2109
        // AVX VNNI should be checked before AVX512SKX as it performs better with VNNI microkernels
2110
        qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm);
2111
        qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm);
2112
        qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avxvnni_prfm);
2113
        qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_5x8c8__avxvnni_prfm);
2114
        qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_avxvnni_params;
2115
        qd8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2116
        qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2117
        qd8_f16_qc8w_gemm_config.mr = 5;
2118
        qd8_f16_qc8w_gemm_config.nr = 8;
2119
        qd8_f16_qc8w_gemm_config.log2_kr = 3;
2120
    #endif
2121
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
2122
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx512skx);
2123
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_5x8c8__avx512skx);
2124
      qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx512skx);
2125
      qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_5x8c8__avx512skx);
2126
      qd8_f16_qc8w_gemm_config.init.f16 = xnn_init_f16_minmax_avx_params;
2127
      qd8_f16_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2128
      qd8_f16_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2129
      qd8_f16_qc8w_gemm_config.mr = 5;
2130
      qd8_f16_qc8w_gemm_config.nr = 8;
2131
      qd8_f16_qc8w_gemm_config.log2_kr = 3;
2132
    } else if (hardware_config->use_x86_avx2) {
2133
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx2);
1968
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_1x8c8__avx2);
2134
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x8c8__avx2);
1969
      qd8_f16_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f16_qc8w_gemm_minmax_ukernel_3x8c8__avx2);
2135
      qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx2);
1970
      qd8_f16_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f16_qc8w_igemm_minmax_ukernel_1x8c8__avx2);
Lines 2465-2532 Link Here
2465
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
2300
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
2466
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2301
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2467
    assert(hardware_config != NULL);
2302
    assert(hardware_config != NULL);
2468
    #if XNN_ENABLE_AVX512AMX
2303
    if (hardware_config->use_x86_avx2) {
2469
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512amx) {
2470
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__avx512amx);
2471
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__avx512amx);
2472
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c4__avx512vnni_prfm);
2473
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c4__avx512vnni_prfm);
2474
      qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avx512vnni_params;
2475
      qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2476
      qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2477
      qd8_f32_qc8w_gemm_config.mr = 7;
2478
      qd8_f32_qc8w_gemm_config.nr = 16;
2479
      qd8_f32_qc8w_gemm_config.log2_kr = 2;
2480
    } else
2481
    #endif
2482
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
2483
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__avx512vnni_prfm);
2484
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c8__avx512vnni_prfm);
2485
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__avx512vnni_prfm);
2486
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c8__avx512vnni_prfm);
2487
      qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avx512vnni_params;
2488
      qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2489
      qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2490
      qd8_f32_qc8w_gemm_config.mr = 7;
2491
      qd8_f32_qc8w_gemm_config.nr = 16;
2492
      qd8_f32_qc8w_gemm_config.log2_kr = 3;
2493
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
2494
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx512vnni_prfm);
2495
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x8c8__avx512vnni_prfm);
2496
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx512vnni_prfm);
2497
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x8c8__avx512vnni_prfm);
2498
      qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avxvnni_params;
2499
      qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2500
      qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2501
      qd8_f32_qc8w_gemm_config.mr = 7;
2502
      qd8_f32_qc8w_gemm_config.nr = 8;
2503
      qd8_f32_qc8w_gemm_config.log2_kr = 3;
2504
    #if XNN_ENABLE_AVXVNNI
2505
      } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) {
2506
        // AVX VNNI should be checked before AVX512SKX as it performs better with VNNI microkernels
2507
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avxvnni_prfm);
2508
        qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x8c8__avxvnni_prfm);
2509
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avxvnni_prfm);
2510
        qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_5x8c8__avxvnni_prfm);
2511
        qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_avxvnni_params;
2512
        qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2513
        qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2514
        qd8_f32_qc8w_gemm_config.mr = 5;
2515
        qd8_f32_qc8w_gemm_config.nr = 8;
2516
        qd8_f32_qc8w_gemm_config.log2_kr = 3;
2517
    #endif
2518
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
2519
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c8__avx512skx_prfm);
2520
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c8__avx512skx_prfm);
2521
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x16c8__avx512skx_prfm);
2522
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_7x16c8__avx512skx_prfm);
2523
      qd8_f32_qc8w_gemm_config.init.f32 = xnn_init_f32_minmax_scalar_params;
2524
      qd8_f32_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
2525
      qd8_f32_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
2526
      qd8_f32_qc8w_gemm_config.mr = 7;
2527
      qd8_f32_qc8w_gemm_config.nr = 16;
2528
      qd8_f32_qc8w_gemm_config.log2_kr = 3;
2529
    } else if (hardware_config->use_x86_avx2) {
2530
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx2);
2304
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx2);
2531
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2);
2305
      qd8_f32_qc8w_gemm_config.minmax.dqgemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_dqgemm_ukernel((xnn_dqgemm_ukernel_fn) xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2);
2532
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx2);
2306
      qd8_f32_qc8w_gemm_config.minmax.dqigemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_dqigemm_ukernel((xnn_dqigemm_ukernel_fn) xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx2);
Lines 3234-3312 Link Here
3234
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3008
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3235
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3009
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3236
    assert(hardware_config != NULL);
3010
    assert(hardware_config != NULL);
3237
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
3011
    if (hardware_config->use_x86_avx2) {
3238
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512vnni_prfm);
3239
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c8__avx512vnni_prfm);
3240
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512vnni_prfm);
3241
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c8__avx512vnni_prfm);
3242
      qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512vnni_params;
3243
      qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w;
3244
      qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_goi_w;
3245
      qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w;
3246
      qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w;
3247
      qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w;
3248
      qs8_qc8w_gemm_config.mr = 7;
3249
      qs8_qc8w_gemm_config.nr = 16;
3250
      qs8_qc8w_gemm_config.log2_kr = 3;
3251
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512vnni) {
3252
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx512vnni_prfm);
3253
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x8c8__avx512vnni_prfm);
3254
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx512vnni_prfm);
3255
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x8c8__avx512vnni_prfm);
3256
      qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avxvnni_params;
3257
      qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w;
3258
      qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_goi_w;
3259
      qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w;
3260
      qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w;
3261
      qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w;
3262
      qs8_qc8w_gemm_config.mr = 7;
3263
      qs8_qc8w_gemm_config.nr = 8;
3264
      qs8_qc8w_gemm_config.log2_kr = 3;
3265
    #if XNN_ENABLE_AVXVNNI
3266
      } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avxvnni) {
3267
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm);
3268
        qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm);
3269
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avxvnni_prfm);
3270
        qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_5x8c8__avxvnni_prfm);
3271
        qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512vnni_params;
3272
        qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_gio_w;
3273
        qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_to_qu8_gemm_goi_w;
3274
        qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_to_qu8_conv_goki_w;
3275
        qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_to_qu8_conv_kgo_w;
3276
        qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_to_qu8_deconv_goki_w;
3277
        qs8_qc8w_gemm_config.mr = 5;
3278
        qs8_qc8w_gemm_config.nr = 8;
3279
        qs8_qc8w_gemm_config.log2_kr = 3;
3280
    #endif
3281
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
3282
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm);
3283
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm);
3284
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm);
3285
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm);
3286
      qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params;
3287
      qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
3288
      qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
3289
      qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
3290
      qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
3291
      qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
3292
      qs8_qc8w_gemm_config.mr = 7;
3293
      qs8_qc8w_gemm_config.nr = 16;
3294
      qs8_qc8w_gemm_config.log2_kr = 3;
3295
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
3296
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx512skx);
3297
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_4x8c8__avx512skx);
3298
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx512skx);
3299
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_4x8c8__avx512skx);
3300
      qs8_qc8w_gemm_config.init.qs8_qc8w = xnn_init_qs8_qc8w_conv_minmax_fp32_avx512_params;
3301
      qs8_qc8w_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qs8_gemm_gio_w;
3302
      qs8_qc8w_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qs8_gemm_goi_w;
3303
      qs8_qc8w_gemm_config.pack_igemm_goki = (xnn_pack_conv_goki_w_fn) xnn_pack_qs8_conv_goki_w;
3304
      qs8_qc8w_gemm_config.pack_igemm_kgo = (xnn_pack_conv_kgo_w_fn) xnn_pack_qs8_conv_kgo_w;
3305
      qs8_qc8w_gemm_config.pack_deconv_goki = (xnn_pack_deconv_goki_w_fn) xnn_pack_qs8_deconv_goki_w;
3306
      qs8_qc8w_gemm_config.mr = 4;  // TODO: upgrade to 5x8 prfm when supported
3307
      qs8_qc8w_gemm_config.nr = 8;
3308
      qs8_qc8w_gemm_config.log2_kr = 3;
3309
    } else if (hardware_config->use_x86_avx2) {
3310
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3012
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3311
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3013
      qs8_qc8w_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qs8_qc8w_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3312
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3014
      qs8_qc8w_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qs8_qc8w_igemm_minmax_fp32_ukernel_1x8c8__avx2);
Lines 3696-3724 Link Here
3696
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3398
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3697
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3399
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
3698
    assert(hardware_config != NULL);
3400
    assert(hardware_config != NULL);
3699
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
3401
    if (hardware_config->use_x86_avx2) {
3700
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm);
3701
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm);
3702
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx_prfm);
3703
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_7x16c8__avx512skx_prfm);
3704
      qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3705
      qu8_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qu8_gemm_gio_w;
3706
      qu8_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qu8_gemm_goi_w;
3707
      qu8_gemm_config.mr = 7;
3708
      qu8_gemm_config.nr = 16;
3709
      qu8_gemm_config.log2_kr = 3;
3710
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
3711
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx512skx);
3712
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_4x8c8__avx512skx);
3713
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx512skx);
3714
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_4x8c8__avx512skx);
3715
      qu8_gemm_config.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3716
      qu8_gemm_config.pack_gemm_gio = (xnn_packw_gemm_gio_ukernel_fn) xnn_pack_qu8_gemm_gio_w;
3717
      qu8_gemm_config.pack_gemm_goi = (xnn_packw_gemm_goi_ukernel_fn) xnn_pack_qu8_gemm_goi_w;
3718
      qu8_gemm_config.mr = 4;  // TODO: upgrade to 5x8 prfm when supported
3719
      qu8_gemm_config.nr = 8;
3720
      qu8_gemm_config.log2_kr = 3;
3721
    } else if (hardware_config->use_x86_avx2) {
3722
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3402
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3723
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3403
      qu8_gemm_config.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_fn) xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3724
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3404
      qu8_gemm_config.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_fn) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
(-)a/third_party/xnnpack/src/src/configs/hardware-config.c (-8 / +7 lines)
Lines 112-127 Link Here
112
    hardware_config.use_x86_f16c = cpuinfo_has_x86_f16c();
112
    hardware_config.use_x86_f16c = cpuinfo_has_x86_f16c();
113
    hardware_config.use_x86_fma3 = cpuinfo_has_x86_fma3();
113
    hardware_config.use_x86_fma3 = cpuinfo_has_x86_fma3();
114
    hardware_config.use_x86_avx2 = cpuinfo_has_x86_avx2();
114
    hardware_config.use_x86_avx2 = cpuinfo_has_x86_avx2();
115
    hardware_config.use_x86_avx512f = cpuinfo_has_x86_avx512f();
115
    hardware_config.use_x86_avx512f = 0;
116
    hardware_config.use_x86_avx512skx = hardware_config.use_x86_avx512f &&
116
    hardware_config.use_x86_avx512skx = 0;
117
      cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl();
117
    hardware_config.use_x86_avx512vbmi = 0;
118
    hardware_config.use_x86_avx512vbmi = hardware_config.use_x86_avx512skx && cpuinfo_has_x86_avx512vbmi();
118
    hardware_config.use_x86_avx512vnni = 0;
119
    hardware_config.use_x86_avx512vnni = hardware_config.use_x86_avx512skx && cpuinfo_has_x86_avx512vnni();
119
    hardware_config.use_x86_avx512vnnigfni = 0;
120
    hardware_config.use_x86_avx512vnnigfni = hardware_config.use_x86_avx512vnni && cpuinfo_has_x86_gfni();
121
#if XNN_ENABLE_AVX512AMX
120
#if XNN_ENABLE_AVX512AMX
122
    // TODO(fbarchard): Use cpuinfo_has_x86_amx_int8 when available.
121
    // TODO(fbarchard): Use cpuinfo_has_x86_amx_int8 when available.
123
    // Infer AMX support from Sapphire Rapids having fp16 and amx.
122
    // Infer AMX support from Sapphire Rapids having fp16 and amx.
124
    hardware_config.use_x86_avx512amx = hardware_config.use_x86_avx512vnnigfni && cpuinfo_has_x86_avx512fp16();
123
    hardware_config.use_x86_avx512amx = 0;
125
#if XNN_ARCH_X86_64 && defined(__linux__) && !defined(CHROMIUM)
124
#if XNN_ARCH_X86_64 && defined(__linux__) && !defined(CHROMIUM)
126
    if (hardware_config.use_x86_avx512amx) {
125
    if (hardware_config.use_x86_avx512amx) {
127
      size_t status = xnn_syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA, 0);
126
      size_t status = xnn_syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA, 0);
Lines 135-141 Link Here
135
    hardware_config.use_x86_avx512amx = 0;
134
    hardware_config.use_x86_avx512amx = 0;
136
#endif
135
#endif
137
#if XNN_ENABLE_AVXVNNI
136
#if XNN_ENABLE_AVXVNNI
138
    hardware_config.use_x86_avxvnni = hardware_config.use_x86_avx2 && cpuinfo_has_x86_avxvnni();
137
    hardware_config.use_x86_avxvnni = 0;
139
#else
138
#else
140
    hardware_config.use_x86_avxvnni = 0;
139
    hardware_config.use_x86_avxvnni = 0;
141
#endif
140
#endif
(-)a/third_party/xnnpack/src/src/configs/prelu-config.c (-5 / +1 lines)
Lines 77-87 Link Here
77
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
77
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
78
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
78
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
79
    assert(hardware_config != NULL);
79
    assert(hardware_config != NULL);
80
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
80
    if (hardware_config->use_x86_avx) {
81
      f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__avx512f_2x16;
82
      f32_prelu_config.row_tile = 2;
83
      f32_prelu_config.channel_tile = 16;
84
    } else if (hardware_config->use_x86_avx) {
85
      f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__avx_2x16;
81
      f32_prelu_config.ukernel = (xnn_prelu_ukernel_fn) xnn_f32_prelu_ukernel__avx_2x16;
86
      f32_prelu_config.row_tile = 2;
82
      f32_prelu_config.row_tile = 2;
87
      f32_prelu_config.channel_tile = 16;
83
      f32_prelu_config.channel_tile = 16;
(-)a/third_party/xnnpack/src/src/configs/reduce-config.c (-13 / +2 lines)
Lines 106-117 Link Here
106
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
106
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
107
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
107
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
108
    assert(hardware_config != NULL);
108
    assert(hardware_config != NULL);
109
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
109
    if (hardware_config->use_x86_avx) {
110
      f32_rminmax_config = (struct xnn_reduce_config) {
111
        .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rminmax_ukernel__avx512f_u64_acc4,
112
        .element_tile = 64,
113
      };
114
    } else if (hardware_config->use_x86_avx) {
115
      f32_rminmax_config = (struct xnn_reduce_config) {
110
      f32_rminmax_config = (struct xnn_reduce_config) {
116
        .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rminmax_ukernel__avx_u32_acc4,
111
        .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rminmax_ukernel__avx_u32_acc4,
117
        .init.f32_default = xnn_init_f32_default_avx_params,
112
        .init.f32_default = xnn_init_f32_default_avx_params,
Lines 173-185 Link Here
173
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
168
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
174
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
169
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
175
    assert(hardware_config != NULL);
170
    assert(hardware_config != NULL);
176
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
171
    if (hardware_config->use_x86_avx) {
177
      f32_rsum_config = (struct xnn_reduce_config) {
178
        .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rsum_ukernel__avx512f_u64_acc4,
179
        .init.f32_scale = xnn_init_f32_scale_scalar_params,
180
        .element_tile = 64,
181
      };
182
    } else if (hardware_config->use_x86_avx) {
183
      f32_rsum_config = (struct xnn_reduce_config) {
172
      f32_rsum_config = (struct xnn_reduce_config) {
184
        .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rsum_ukernel__avx_u32_acc4,
173
        .ukernel = (xnn_reduce_ukernel_fn) xnn_f32_rsum_ukernel__avx_u32_acc4,
185
        .init.f32_scale = xnn_init_f32_scale_avx_params,
174
        .init.f32_scale = xnn_init_f32_scale_avx_params,
(-)a/third_party/xnnpack/src/src/configs/rmax-config.c (-3 / +1 lines)
Lines 68-76 Link Here
68
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
68
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
69
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
69
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
70
    assert(hardware_config != NULL);
70
    assert(hardware_config != NULL);
71
    if (hardware_config->use_x86_avx512f) {
71
    if (hardware_config->use_x86_avx) {
72
      f32_rmax_config.ukernel = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__avx512f_u64_acc4;
73
    } else if (hardware_config->use_x86_avx) {
74
      f32_rmax_config.ukernel = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__avx_u32_acc4;
72
      f32_rmax_config.ukernel = (xnn_rmax_ukernel_fn) xnn_f32_rmax_ukernel__avx_u32_acc4;
75
      f32_rmax_config.init.f32 = xnn_init_f32_default_avx_params;
73
      f32_rmax_config.init.f32 = xnn_init_f32_default_avx_params;
76
    } else {
74
    } else {
(-)a/third_party/xnnpack/src/src/configs/unary-elementwise-config.c (-93 / +20 lines)
Lines 554-563 Link Here
554
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
554
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
555
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
555
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
556
    assert(hardware_config != NULL);
556
    assert(hardware_config != NULL);
557
    if (hardware_config->use_x86_avx512skx) {
557
    if (hardware_config->use_x86_f16c) {
558
      f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__avx512skx_u16;
559
      f16_to_f32_cvt_config.element_tile = 16;
560
    } else if (hardware_config->use_x86_f16c) {
561
      f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__f16c_u16;
558
      f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__f16c_u16;
562
      f16_to_f32_cvt_config.element_tile = 16;
559
      f16_to_f32_cvt_config.element_tile = 16;
563
    } else if (hardware_config->use_x86_avx) {
560
    } else if (hardware_config->use_x86_avx) {
Lines 631-641 Link Here
631
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
628
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
632
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
629
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
633
    assert(hardware_config != NULL);
630
    assert(hardware_config != NULL);
634
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
631
    if (hardware_config->use_x86_avx) {
635
      f32_abs_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__avx512f_u16;
636
      f32_abs_config.init.f32_abs = xnn_init_f32_abs_avx512_params;
637
      f32_abs_config.element_tile = 16;
638
    } else if (hardware_config->use_x86_avx) {
639
      f32_abs_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__avx_u16;
632
      f32_abs_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vabs_ukernel__avx_u16;
640
      f32_abs_config.init.f32_abs = xnn_init_f32_abs_avx_params;
633
      f32_abs_config.init.f32_abs = xnn_init_f32_abs_avx_params;
641
      f32_abs_config.element_tile = 16;
634
      f32_abs_config.element_tile = 16;
Lines 680-690 Link Here
680
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
673
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
681
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
674
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
682
    assert(hardware_config != NULL);
675
    assert(hardware_config != NULL);
683
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
676
    if (hardware_config->use_x86_avx) {
684
      f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__avx512f_u16;
685
      f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_scalar_params;
686
      f32_clamp_config.element_tile = 16;
687
    } else if (hardware_config->use_x86_avx) {
688
      f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__avx_u16;
677
      f32_clamp_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vclamp_ukernel__avx_u16;
689
      f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_avx_params;
678
      f32_clamp_config.init.f32_minmax = xnn_init_f32_minmax_avx_params;
690
      f32_clamp_config.element_tile = 16;
679
      f32_clamp_config.element_tile = 16;
Lines 746-756 Link Here
746
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
735
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
747
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
736
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
748
    assert(hardware_config != NULL);
737
    assert(hardware_config != NULL);
749
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
738
    if (hardware_config->use_x86_avx2) {
750
      f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__avx512f_rr1_p6_u128;
751
      f32_elu_config.init.f32_elu = xnn_init_f32_elu_avx512_rr1_p6_params;
752
      f32_elu_config.element_tile = 128;
753
    } else if (hardware_config->use_x86_avx2) {
754
      f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_u56;
739
      f32_elu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_u56;
755
      f32_elu_config.init.f32_elu = xnn_init_f32_elu_avx2_rr1_lut4_p4_params;
740
      f32_elu_config.init.f32_elu = xnn_init_f32_elu_avx2_rr1_lut4_p4_params;
756
      f32_elu_config.element_tile = 56;
741
      f32_elu_config.element_tile = 56;
Lines 824-834 Link Here
824
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
809
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
825
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
810
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
826
    assert(hardware_config != NULL);
811
    assert(hardware_config != NULL);
827
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
812
    if (hardware_config->use_x86_fma3) {
828
      f32_hswish_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__avx512f_u16;
829
      f32_hswish_config.init.f32_hswish = xnn_init_f32_hswish_avx512_params;
830
      f32_hswish_config.element_tile = 16;
831
    } else if (hardware_config->use_x86_fma3) {
832
      f32_hswish_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__fma3_u16;
813
      f32_hswish_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vhswish_ukernel__fma3_u16;
833
      f32_hswish_config.init.f32_hswish = xnn_init_f32_hswish_avx_params;
814
      f32_hswish_config.init.f32_hswish = xnn_init_f32_hswish_avx_params;
834
      f32_hswish_config.element_tile = 16;
815
      f32_hswish_config.element_tile = 16;
Lines 888-898 Link Here
888
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
869
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
889
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
870
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
890
    assert(hardware_config != NULL);
871
    assert(hardware_config != NULL);
891
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
872
    if (hardware_config->use_x86_avx) {
892
      f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__avx512f_u16;
893
      f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_scalar_params;
894
      f32_lrelu_config.element_tile = 16;
895
    } else if (hardware_config->use_x86_avx) {
896
      f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__avx_u16;
873
      f32_lrelu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vlrelu_ukernel__avx_u16;
897
      f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_avx_params;
874
      f32_lrelu_config.init.f32_lrelu = xnn_init_f32_lrelu_avx_params;
898
      f32_lrelu_config.element_tile = 16;
875
      f32_lrelu_config.element_tile = 16;
Lines 961-971 Link Here
961
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
938
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
962
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
939
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
963
    assert(hardware_config != NULL);
940
    assert(hardware_config != NULL);
964
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
941
    if (hardware_config->use_x86_avx) {
965
      f32_neg_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__avx512f_u16;
966
      f32_neg_config.init.f32_neg = xnn_init_f32_neg_avx512_params;
967
      f32_neg_config.element_tile = 16;
968
    } else if (hardware_config->use_x86_avx) {
969
      f32_neg_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__avx_u16;
942
      f32_neg_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vneg_ukernel__avx_u16;
970
      f32_neg_config.init.f32_neg = xnn_init_f32_neg_avx_params;
943
      f32_neg_config.init.f32_neg = xnn_init_f32_neg_avx_params;
971
      f32_neg_config.element_tile = 16;
944
      f32_neg_config.element_tile = 16;
Lines 1029-1038 Link Here
1029
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1002
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1030
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1003
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1031
    assert(hardware_config != NULL);
1004
    assert(hardware_config != NULL);
1032
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1005
    if (hardware_config->use_x86_avx) {
1033
      f32_rndd_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__avx512f_u16;
1034
      f32_rndd_config.element_tile = 16;
1035
    } else if (hardware_config->use_x86_avx) {
1036
      f32_rndd_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__avx_u16;
1006
      f32_rndd_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndd_ukernel__avx_u16;
1037
      f32_rndd_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1007
      f32_rndd_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1038
      f32_rndd_config.element_tile = 16;
1008
      f32_rndd_config.element_tile = 16;
Lines 1081-1090 Link Here
1081
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1051
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1082
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1052
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1083
    assert(hardware_config != NULL);
1053
    assert(hardware_config != NULL);
1084
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1054
    if (hardware_config->use_x86_avx) {
1085
      f32_rndne_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__avx512f_u16;
1086
      f32_rndne_config.element_tile = 16;
1087
    } else if (hardware_config->use_x86_avx) {
1088
      f32_rndne_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__avx_u16;
1055
      f32_rndne_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndne_ukernel__avx_u16;
1089
      f32_rndne_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1056
      f32_rndne_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1090
      f32_rndne_config.element_tile = 16;
1057
      f32_rndne_config.element_tile = 16;
Lines 1133-1142 Link Here
1133
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1100
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1134
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1101
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1135
    assert(hardware_config != NULL);
1102
    assert(hardware_config != NULL);
1136
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1103
    if (hardware_config->use_x86_avx) {
1137
      f32_rndu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__avx512f_u16;
1138
      f32_rndu_config.element_tile = 16;
1139
    } else if (hardware_config->use_x86_avx) {
1140
      f32_rndu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__avx_u16;
1104
      f32_rndu_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndu_ukernel__avx_u16;
1141
      f32_rndu_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1105
      f32_rndu_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1142
      f32_rndu_config.element_tile = 16;
1106
      f32_rndu_config.element_tile = 16;
Lines 1185-1194 Link Here
1185
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1149
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1186
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1150
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1187
    assert(hardware_config != NULL);
1151
    assert(hardware_config != NULL);
1188
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1152
    if (hardware_config->use_x86_avx) {
1189
      f32_rndz_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__avx512f_u16;
1190
      f32_rndz_config.element_tile = 16;
1191
    } else if (hardware_config->use_x86_avx) {
1192
      f32_rndz_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__avx_u16;
1153
      f32_rndz_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrndz_ukernel__avx_u16;
1193
      f32_rndz_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1154
      f32_rndz_config.init.f32_rnd = xnn_init_f32_rnd_avx_params;
1194
      f32_rndz_config.element_tile = 16;
1155
      f32_rndz_config.element_tile = 16;
Lines 1235-1245 Link Here
1235
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1196
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1236
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1197
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1237
    assert(hardware_config != NULL);
1198
    assert(hardware_config != NULL);
1238
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1199
    if (hardware_config->use_x86_avx2) {
1239
      f32_sigmoid_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_u64;
1240
      f32_sigmoid_config.init.f32_sigmoid = xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params;
1241
      f32_sigmoid_config.element_tile = 64;
1242
    } else if (hardware_config->use_x86_avx2) {
1243
      f32_sigmoid_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u40;
1200
      f32_sigmoid_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_u40;
1244
      f32_sigmoid_config.init.f32_sigmoid = xnn_init_f32_sigmoid_avx2_rr1_p5_params;
1201
      f32_sigmoid_config.init.f32_sigmoid = xnn_init_f32_sigmoid_avx2_rr1_p5_params;
1245
      f32_sigmoid_config.element_tile = 40;
1202
      f32_sigmoid_config.element_tile = 40;
Lines 1298-1307 Link Here
1298
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1255
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1299
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1256
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1300
    assert(hardware_config != NULL);
1257
    assert(hardware_config != NULL);
1301
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1258
    if (hardware_config->use_x86_avx) {
1302
      f32_sqr_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__avx512f_u16;
1303
      f32_sqr_config.element_tile = 16;
1304
    } else if (hardware_config->use_x86_avx) {
1305
      f32_sqr_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__avx_u16;
1259
      f32_sqr_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vsqr_ukernel__avx_u16;
1306
      f32_sqr_config.init.f32_default = xnn_init_f32_default_avx_params;
1260
      f32_sqr_config.init.f32_default = xnn_init_f32_default_avx_params;
1307
      f32_sqr_config.element_tile = 16;
1261
      f32_sqr_config.element_tile = 16;
Lines 1368-1378 Link Here
1368
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1322
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1369
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1323
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1370
    assert(hardware_config != NULL);
1324
    assert(hardware_config != NULL);
1371
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
1325
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_fma3) {
1372
      f32_rsqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrsqrt_ukernel__avx512f_rsqrt_u32;
1373
      f32_rsqrt_config.init.f32_rsqrt = xnn_init_f32_rsqrt_avx512_params;
1374
      f32_rsqrt_config.element_tile = 32;
1375
    } else if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_fma3) {
1376
      f32_rsqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16;
1326
      f32_rsqrt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vrsqrt_ukernel__fma3_rsqrt_u16;
1377
      f32_rsqrt_config.init.f32_rsqrt = xnn_init_f32_rsqrt_fma3_params;
1327
      f32_rsqrt_config.init.f32_rsqrt = xnn_init_f32_rsqrt_fma3_params;
1378
      f32_rsqrt_config.element_tile = 16;
1328
      f32_rsqrt_config.element_tile = 16;
Lines 1416-1426 Link Here
1416
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1366
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1417
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1367
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1418
    assert(hardware_config != NULL);
1368
    assert(hardware_config != NULL);
1419
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
1369
    if (hardware_config->use_x86_avx2) {
1420
      f32_tanh_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vtanh_ukernel__avx512skx_expm1minus_rr1_lut4_p4h3ts_perm_div_u64;
1421
      f32_tanh_config.init.f32_tanh = xnn_init_f32_tanh_avx512_expm1minus_rr1_lut4_p4h3_perm_params;
1422
      f32_tanh_config.element_tile = 64;
1423
    } else if (hardware_config->use_x86_avx2) {
1424
      f32_tanh_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vtanh_ukernel__avx2_expm1minus_rr1_lut4_p4h3ts_perm_div_u32;
1370
      f32_tanh_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_vtanh_ukernel__avx2_expm1minus_rr1_lut4_p4h3ts_perm_div_u32;
1425
      f32_tanh_config.init.f32_tanh = xnn_init_f32_tanh_avx_expm1minus_rr1_lut4_p4h3_perm_params;
1371
      f32_tanh_config.init.f32_tanh = xnn_init_f32_tanh_avx_expm1minus_rr1_lut4_p4h3_perm_params;
1426
      f32_tanh_config.element_tile = 32;
1372
      f32_tanh_config.element_tile = 32;
Lines 1500-1509 Link Here
1500
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1446
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1501
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1447
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1502
    assert(hardware_config != NULL);
1448
    assert(hardware_config != NULL);
1503
    if (hardware_config->use_x86_avx512skx) {
1449
    if (hardware_config->use_x86_f16c) {
1504
      f32_to_f16_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__avx512skx_u16;
1505
      f32_to_f16_cvt_config.element_tile = 16;
1506
    } else if (hardware_config->use_x86_f16c) {
1507
      f32_to_f16_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__f16c_u16;
1450
      f32_to_f16_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_f16_vcvt_ukernel__f16c_u16;
1508
      f32_to_f16_cvt_config.init.f32_f16_cvt = xnn_init_f32_f16_cvt_f16c_params;
1451
      f32_to_f16_cvt_config.init.f32_f16_cvt = xnn_init_f32_f16_cvt_f16c_params;
1509
      f32_to_f16_cvt_config.element_tile = 16;
1452
      f32_to_f16_cvt_config.element_tile = 16;
Lines 1571-1581 Link Here
1571
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1514
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1572
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1515
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1573
    assert(hardware_config != NULL);
1516
    assert(hardware_config != NULL);
1574
    if (hardware_config->use_x86_avx512skx) {
1517
    if (hardware_config->use_x86_avx2) {
1575
      f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__avx512skx_u128;
1576
      f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx512_params;
1577
      f32_to_qs8_cvt_config.element_tile = 128;
1578
    } else if (hardware_config->use_x86_avx2) {
1579
      f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__avx2_u64;
1518
      f32_to_qs8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qs8_vcvt_ukernel__avx2_u64;
1580
      f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx2_params;
1519
      f32_to_qs8_cvt_config.init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx2_params;
1581
      f32_to_qs8_cvt_config.element_tile = 64;
1520
      f32_to_qs8_cvt_config.element_tile = 64;
Lines 1645-1655 Link Here
1645
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1584
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1646
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1585
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1647
    assert(hardware_config != NULL);
1586
    assert(hardware_config != NULL);
1648
    if (hardware_config->use_x86_avx512skx) {
1587
    if (hardware_config->use_x86_avx2) {
1649
      f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__avx512skx_u128;
1650
      f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx512_params;
1651
      f32_to_qu8_cvt_config.element_tile = 128;
1652
    } else if (hardware_config->use_x86_avx2) {
1653
      f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__avx2_u64;
1588
      f32_to_qu8_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f32_qu8_vcvt_ukernel__avx2_u64;
1654
      f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx2_params;
1589
      f32_to_qu8_cvt_config.init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx2_params;
1655
      f32_to_qu8_cvt_config.element_tile = 64;
1590
      f32_to_qu8_cvt_config.element_tile = 64;
Lines 1939-1949 Link Here
1939
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1874
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1940
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1875
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
1941
    assert(hardware_config != NULL);
1876
    assert(hardware_config != NULL);
1942
    if (hardware_config->use_x86_avx512skx) {
1877
    if (hardware_config->use_x86_avx2) {
1943
      qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__avx512skx_u32;
1944
      qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx512_params;
1945
      qs8_to_f32_cvt_config.element_tile = 32;
1946
    } else if (hardware_config->use_x86_avx2) {
1947
      qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__avx2_u16;
1878
      qs8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qs8_f32_vcvt_ukernel__avx2_u16;
1948
      qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params;
1879
      qs8_to_f32_cvt_config.init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params;
1949
      qs8_to_f32_cvt_config.element_tile = 16;
1880
      qs8_to_f32_cvt_config.element_tile = 16;
Lines 2161-2171 Link Here
2161
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
2092
  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
2162
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2093
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
2163
    assert(hardware_config != NULL);
2094
    assert(hardware_config != NULL);
2164
    if (hardware_config->use_x86_avx512skx) {
2095
    if (hardware_config->use_x86_avx2) {
2165
      qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__avx512skx_u32;
2166
      qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx512_params;
2167
      qu8_to_f32_cvt_config.element_tile = 32;
2168
    } else if (hardware_config->use_x86_avx2) {
2169
      qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__avx2_u16;
2096
      qu8_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_qu8_f32_vcvt_ukernel__avx2_u16;
2170
      qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params;
2097
      qu8_to_f32_cvt_config.init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params;
2171
      qu8_to_f32_cvt_config.element_tile = 16;
2098
      qu8_to_f32_cvt_config.element_tile = 16;
(-)a/third_party/xnnpack/src/src/configs/x8-lut-config.c (-9 / +1 lines)
Lines 36-50 Link Here
36
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
36
    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
37
    assert(hardware_config != NULL);
37
    assert(hardware_config != NULL);
38
38
39
    if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512skx) {
39
    if (hardware_config->use_x86_avx2) {
40
      if (hardware_config->use_x86_avx512vbmi) {
41
        x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx512vbmi_vpermx2b_u128;
42
        x8_lut_config.tile_size = 128;
43
      } else {
44
        x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx512skx_vpshufb_u64;
45
        x8_lut_config.tile_size = 64;
46
      }
47
    } else if (hardware_config->use_x86_avx2) {
48
      x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx2_u128;
40
      x8_lut_config.microkernel = xnn_x8_lut_ukernel__avx2_u128;
49
      x8_lut_config.tile_size = 128;
41
      x8_lut_config.tile_size = 128;
50
    } else if (hardware_config->use_x86_avx) {
42
    } else if (hardware_config->use_x86_avx) {

Return to bug 933827