/usr/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp(566): error: static assertion failed with "Row broadcast doesn't support smem usage" static_assert(Stages == 0, "Row broadcast doesn't support smem usage"); ^ detected during: instantiation of class "cutlass::epilogue::fusion::Sm90RowBroadcast<Stages, CtaTileShapeMNK, Element, StrideMNL, Alignment, EnableNullptr> [with Stages=2, CtaTileShapeMNK=cute::tuple<cute::_128, cute::_128, cute::_128>, Element=<unnamed>:: DtypeScale, StrideMNL=cute::tuple<cute::_0, cute::_1, cute::_0>, Alignment=4, EnableNullptr=true]" at line 909 of /usr/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp instantiation of class "cutlass::epilogue::fusion::detail::Sm90VisitorImplBase<Op0, Op1, Op2> [with Op0=cutlass::epilogue::fusion::Sm90RowBroadcast<2, cute::tuple<cute::_128, cute::_128, cute::_128>, <unnamed>::DtypeScale, cute::tuple<cute ::_0, cute::_1, cute::_0>, 4, true>, Op1=cutlass::epilogue::fusion::Sm90AccFetch, Op2=<unnamed>::Multiply]" at line 305 of /usr/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp instantiation of class "cutlass::epilogue::fusion::detail::Sm90VisitorImpl<Ops...> [with Ops=<cutlass::epilogue::fusion::Sm90RowBroadcast<2, cute::tuple<cute::_128, cute::_128, cute::_128>, <unnamed>::DtypeScale, cute::tuple<cute::_0, cute ::_1, cute::_0>, 4, true>, cutlass::epilogue::fusion::Sm90AccFetch, <unnamed>::Multiply>]" at line 564 of /usr/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp instantiation of class "cutlass::epilogue::fusion::Sm90TreeVisitor<NodeOp, ChildOps...> [with NodeOp=<unnamed>::Multiply, ChildOps=<cutlass::epilogue::fusion::Sm90RowBroadcast<2, cute::tuple<cute::_128, cute::_128, cute::_128>, <unnamed>:: DtypeScale, cute::tuple<cute::_0, cute::_1, cute::_0>, 4, true>, cutlass::epilogue::fusion::Sm90AccFetch>]" at line 910 of /usr/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp instantiation of class 
"cutlass::epilogue::fusion::detail::Sm90VisitorImplBase<Op0, Op1, Op2> [with Op0=cutlass::epilogue::fusion::Sm90ColBroadcast<0, cute::tuple<cute::_128, cute::_128, cute::_128>, <unnamed>::DtypeScale, cute::tuple<cute ::_1, cute::_0, cute::C<0>>, 4, true>, Op1=cutlass::epilogue::fusion::Sm90TreeVisitor<<unnamed>::Multiply, cutlass::epilogue::fusion::Sm90RowBroadcast<2, cute::tuple<cute::_128, cute::_128, cute::_128>, <unnamed>::DtypeScale, cute::tuple<cute::_0, cut e::_1, cute::_0>, 4, true>, cutlass::epilogue::fusion::Sm90AccFetch>, Op2=<unnamed>::Multiply]" at line 305 of /usr/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp [ 11 instantiation contexts not shown ] instantiation of "void <unnamed>::dispatch_fp8_rowwise_kernel_on_tile_size<ClusterShape,Types...>(at::Tensor, at::Tensor, at::Tensor, at::Tensor, std::optional<at::Tensor>, at::Tensor) [with ClusterShape=cute::tuple<cute::_1, cute::_2, cut e::_1>, Types=<std::false_type, std::true_type, cutlass::float_e5m2_t, cutlass::float_e4m3_t, cutlass::bfloat16_t>]" at line 357 of /tmp/portage/sci-libs/caffe2-2.5.1-r2/work/pytorch-2.5.1/aten/src/ATen/native/cuda/RowwiseScaledMM.cu instantiation of "void <unnamed>::handle_transposition<ClusterShape,Transposed,FastAccum,DtypeA,DtypeB,DtypeBias>(at::Tensor, at::Tensor, at::Tensor, at::Tensor, std::optional<at::Tensor>, at::Tensor) [with ClusterShape=cute::tuple<cute::_ 1, cute::_2, cute::_1>, Transposed=std::false_type, FastAccum=std::true_type, DtypeA=cutlass::float_e5m2_t, DtypeB=cutlass::float_e4m3_t, DtypeBias=cutlass::bfloat16_t]" at line 390 of /tmp/portage/sci-libs/caffe2-2.5.1-r2/work/pytorch-2.5.1/aten/src/ATen/native/cuda/RowwiseScaledMM.cu instantiation of "void <unnamed>::dispatch_fp8_rowwise_kernel_on_cluster_size_and_transpose<Types...>(at::Tensor, at::Tensor, at::Tensor, at::Tensor, std::optional<at::Tensor>, at::Tensor) [with Types=<std::true_type, cutlass::float_e5m2_t, cutlass::float_e4m3_t, cutlass::bfloat16_t>]" at line 466 of 
/tmp/portage/sci-libs/caffe2-2.5.1-r2/work/pytorch-2.5.1/aten/src/ATen/native/cuda/RowwiseScaledMM.cu instantiation of "void <unnamed>::dispatch_fp8_rowwise_kernel_on_fast_accum<Types...>(at::Tensor, at::Tensor, at::Tensor, at::Tensor, std::optional<at::Tensor>, __nv_bool, at::Tensor) [with Types=<cutlass::float_e5m2_t, cutlass::float_e4m3_t, cutlass::bfloat16_t>]" at line 487 of /tmp/portage/sci-libs/caffe2-2.5.1-r2/work/pytorch-2.5.1/aten/src/ATen/native/cuda/RowwiseScaledMM.cu instantiation of "void <unnamed>::dispatch_fp8_rowwise_kernel_on_input_dtypes<Types...>(at::Tensor, at::Tensor, at::Tensor, at::Tensor, std::optional<at::Tensor>, __nv_bool, at::Tensor) [with Types=<cutlass::bfloat16_t>]" at line 505 of /tmp/portage/sci-libs/caffe2-2.5.1-r2/work/pytorch-2.5.1/aten/src/ATen/native/cuda/RowwiseScaledMM.cu TORCH_CUDA_ARCH_LIST="8.6 7.5" Reproducible: Always emerge --info sci-libs/caffe2 Portage 3.0.66.1 (python 3.12.7-final-0, default/linux/amd64/23.0/desktop/plasma/systemd, gcc-14, glibc-2.40-r5, 6.12.0-gentoo x86_64) ================================================================= System Settings ================================================================= System uname: Linux-6.12.0-gentoo-x86_64-AMD_Ryzen_9_7950X_16-Core_Processor-with-glibc2.40 KiB Mem: 131195172 total, 53605076 free KiB Swap: 205520892 total, 205389924 free Timestamp of repository gentoo: Tue, 19 Nov 2024 08:48:41 +0000 Head commit of repository gentoo: be21079a5ff0b75ef8e8bc7ab77953164bc0614e Timestamp of repository 4nykey: Sun, 17 Nov 2024 05:33:33 +0000 Head commit of repository 4nykey: ab686e7753127fb4664eb3131fba0c4dd8a3d0bf Timestamp of repository benzene-overlay: Tue, 19 Nov 2024 08:03:32 +0000 Head commit of repository benzene-overlay: a252b5c2610cc0ca55b98f6baaff0fa6c7c90655 Timestamp of repository brother-overlay: Tue, 08 Oct 2024 15:51:03 +0000 Head commit of repository brother-overlay: 928bbe8f324720cbb3dd74c3db524c0e674f1349 Timestamp of repository cg: Sun, 17 Nov 2024 
05:33:33 +0000 Head commit of repository cg: 741157670a48bb061c00ad85e9009e1408f35808 Timestamp of repository guru: Tue, 19 Nov 2024 09:03:38 +0000 Head commit of repository guru: 56cbcef3cce5e608ed45c5dcd538f36469fc0d03 Timestamp of repository haarp: Sun, 17 Nov 2024 05:33:30 +0000 Head commit of repository haarp: 493fee10a8a47ed3ee8f4d4bbfce493248d23d9b Timestamp of repository kde: Mon, 18 Nov 2024 17:03:18 +0000 Head commit of repository kde: dbaf002a01e9d8cb39e70cfb813a6fc475df6d5f Timestamp of repository mv: Mon, 18 Nov 2024 17:03:18 +0000 Head commit of repository mv: 92f42ca2204d7a099bb6689610d81d21abb3442b Timestamp of repository qt: Fri, 08 Nov 2024 19:18:38 +0000 Head commit of repository qt: c7468208fb0377022a2debe5e4167d23e06d01f7 Timestamp of repository science: Sun, 17 Nov 2024 05:33:36 +0000 Head commit of repository science: 85518758faa565881972c27482a7571fd3d26eee Timestamp of repository wayland-desktop: Sun, 17 Nov 2024 05:33:37 +0000 Head commit of repository wayland-desktop: de46a80a5b8e00e5426f3a0930a7b7137f72eb33 sh bash 5.2_p37 ld GNU ld (Gentoo 2.43 p3) 2.43.1 app-misc/pax-utils: 1.3.8::gentoo app-shells/bash: 5.2_p37::gentoo dev-build/autoconf: 2.13-r8::gentoo, 2.72-r1::gentoo dev-build/automake: 1.16.5-r2::gentoo, 1.17-r1::gentoo dev-build/cmake: 3.31.0::gentoo dev-build/libtool: 2.5.3::gentoo dev-build/make: 4.4.1-r100::gentoo dev-build/meson: 1.6.0::gentoo dev-java/java-config: 2.3.4::gentoo dev-lang/perl: 5.40.0::gentoo dev-lang/python: 3.12.7_p1::gentoo, 3.13.0::gentoo dev-lang/rust-bin: 1.81.0-r100::gentoo, 1.82.0-r100::gentoo sys-apps/baselayout: 2.17::gentoo sys-apps/sandbox: 2.40::gentoo sys-apps/systemd: 256.7::gentoo sys-devel/binutils: 2.43-r2::gentoo sys-devel/binutils-config: 5.5.2::gentoo sys-devel/clang: 18.1.8-r6::gentoo, 19.1.3::gentoo sys-devel/gcc: 13.3.1_p20241025::gentoo, 14.2.1_p20241026::gentoo sys-devel/gcc-config: 2.11::gentoo sys-devel/llvm: 18.1.8-r6::gentoo, 19.1.3::gentoo sys-kernel/linux-headers: 6.11::gentoo 
(virtual/os-headers) sys-libs/glibc: 2.40-r5::gentoo Repositories: gentoo location: /var/db/repos/gentoo sync-type: git sync-uri: https://github.com/gentoo-mirror/gentoo.git priority: -1000 volatile: False 4nykey location: /var/db/repos/4nykey sync-type: git sync-uri: https://github.com/gentoo-mirror/4nykey.git masters: gentoo volatile: False benzene-overlay location: /var/db/repos/benzene-overlay sync-type: git sync-uri: https://github.com/gentoo-mirror/benzene-overlay.git masters: gentoo volatile: False brother-overlay location: /var/db/repos/brother-overlay sync-type: git sync-uri: https://github.com/gentoo-mirror/brother-overlay.git masters: gentoo volatile: False cg location: /var/db/repos/cg sync-type: git sync-uri: https://github.com/gentoo-mirror/cg.git masters: gentoo volatile: False guru location: /var/db/repos/guru sync-type: git sync-uri: https://github.com/gentoo-mirror/guru.git masters: gentoo volatile: False haarp location: /var/db/repos/haarp sync-type: git sync-uri: https://github.com/gentoo-mirror/haarp.git masters: gentoo volatile: False kde location: /var/db/repos/kde sync-type: git sync-uri: https://github.com/gentoo-mirror/kde.git masters: gentoo volatile: False local location: /var/db/repos/local masters: gentoo volatile: False mv location: /var/db/repos/mv sync-type: git sync-uri: https://github.com/gentoo-mirror/mv.git masters: gentoo volatile: False qt location: /var/db/repos/qt sync-type: git sync-uri: https://github.com/gentoo-mirror/qt.git masters: gentoo volatile: False science location: /var/db/repos/science sync-type: git sync-uri: https://github.com/gentoo-mirror/science.git masters: gentoo volatile: False wayland-desktop location: /var/db/repos/wayland-desktop sync-type: git sync-uri: https://github.com/gentoo-mirror/wayland-desktop.git masters: gentoo volatile: False Binary Repositories: gentoobinhost priority: 1 sync-uri: https://gentoo.osuosl.org/releases/amd64/binpackages/23.0/x86-64 ACCEPT_KEYWORDS="amd64 ~amd64" 
ACCEPT_LICENSE="@FREE" CBUILD="x86_64-pc-linux-gnu" CFLAGS="-O2 -march=native -pipe" CHOST="x86_64-pc-linux-gnu" CONFIG_PROTECT="/etc /usr/lib64/libreoffice/program/sofficerc /usr/share/config /usr/share/gnupg/qualified.txt /usr/share/maven-bin-3.9/conf /usr/share/themes/oxygen-gtk/gtk-3.0 /var/bind" CONFIG_PROTECT_MASK="/etc/ca-certificates.conf /etc/dconf /etc/env.d /etc/fonts/fonts.conf /etc/gconf /etc/gentoo-release /etc/revdep-rebuild /etc/sandbox.d /etc/texmf/language.dat.d /etc/texmf/language.def.d /etc/texmf/updmap.d /etc/texmf/web2c" CXXFLAGS="-O2 -march=native -pipe" DISTDIR="/var/cache/distfiles" EMERGE_DEFAULT_OPTS=" --quiet-build --load-average=24 --keep-going --jobs=3" ENV_UNSET="CARGO_HOME DBUS_SESSION_BUS_ADDRESS DISPLAY GDK_PIXBUF_MODULE_FILE GOBIN GOPATH PERL5LIB PERL5OPT PERLPREFIX PERL_CORE PERL_MB_OPT PERL_MM_OPT XAUTHORITY XDG_CACHE_HOME XDG_CONFIG_HOME XDG_DATA_HOME XDG_RUNTIME_DIR XDG_STATE_HOME" FCFLAGS="-O2 -march=native -pipe" FEATURES="assume-digests binpkg-docompress binpkg-dostrip binpkg-logs binpkg-multi-instance buildpkg-live config-protect-if-modified distlocks ebuild-locks fixlafiles ipc-sandbox merge-sync merge-wait multilib-strict network-sandbox news parallel-fetch pid-sandbox pkgdir-index-trusted preserve-libs protect-owned qa-unresolved-soname-deps sandbox sfperms splitdebug strict strict-keepdir unknown-features-warn unmerge-logs unmerge-orphans userfetch userpriv usersandbox usersync warn-on-large-env xattr" FFLAGS="-O2 -march=native -pipe" GENTOO_MIRRORS="ftp://mirror.netcologne.de/gentoo/ http://mirror.netcologne.de/gentoo/" LANG="C.UTF-8" LDFLAGS="-Wl,-O1 -Wl,--as-needed -Wl,-z,pack-relative-relocs" LEX="flex" MAKEOPTS="-j24" PKGDIR="/var/cache/binpkgs" PORTAGE_COMPRESS="zstd" PORTAGE_CONFIGROOT="/" PORTAGE_RSYNC_OPTS="--recursive --links --safe-links --perms --times --omit-dir-times --compress --force --whole-file --delete --stats --human-readable --timeout=180 --exclude=/distfiles --exclude=/local --exclude=/packages 
--exclude=/.git" PORTAGE_TMPDIR="/tmp" SHELL="/bin/zsh" USE="X a52 aac acl acpi activities alsa amd64 bash-completion bluetooth branding bzip2 cairo cdda cdr cet crypt cups dbus declarative dri dts dvd dvdr encode exif flac fontconfig gdbm gif gpm gtk gui heif iconv icu idn ipv6 jpeg jpegxl kde kerberos kf6 kf6compat kwallet lcms libnotify libtirpc mad mng mp3 mp4 mpeg multilib ncurses networkmanager nls numa ogg openexr opengl openmp pam pango pcre pdf pipewire plasma png policykit ppds pulseaudio qml qt5 qt6 readline samba screencast sdl seccomp semantic-desktop sound spell ssl startup-notification svg systemd test-rust tiff truetype udev udisks unicode upower usb v4l vaapi vorbis vulkan wayland widgets wxwidgets x264 x265 xattr xcb xft xml xv xvid zeroconf zlib zsh-completion zstd" ABI_X86="64" ADA_TARGET="gcc_12" APACHE2_MODULES="authn_core authz_core socache_shmcb unixd actions alias auth_basic authn_anon authn_dbm authn_file authz_dbm authz_groupfile authz_host authz_owner authz_user autoindex cache cgi cgid dav dav_fs dav_lock deflate dir env expires ext_filter file_cache filter headers include info log_config logio mime mime_magic negotiation rewrite setenvif speling status unique_id userdir usertrack vhost_alias" CALLIGRA_FEATURES="karbon sheets words" COLLECTD_PLUGINS="df interface irq load memory rrdtool swap syslog" CPU_FLAGS_X86="mmx mmxext sse sse2 aes avx avx2 avx512bw avx512cd avx512dq avx512f avx512vbmi avx512vl f16c fma3 pclmul popcnt rdrand sha sse3 sse4_1 sse4_2 sse4a ssse3" ELIBC="glibc" GPSD_PROTOCOLS="ashtech aivdm earthmate evermore fv18 garmin garmintxt gpsclock greis isync itrax navcom oceanserver oncore rtcm104v2 rtcm104v3 sirf skytraq superstar2 tsip tripmate tnt ublox" GUILE_SINGLE_TARGET="3-0" GUILE_TARGETS="3-0" INPUT_DEVICES="libinput" KERNEL="linux" L10N="de en en-GB uk" LCD_DEVICES="bayrad cfontz glk hd44780 lb216 lcdm001 mtxorb text" LUA_SINGLE_TARGET="lua5-1" LUA_TARGETS="lua5-1" OFFICE_IMPLEMENTATION="libreoffice" 
PHP_TARGETS="php8-2" POSTGRES_TARGETS="postgres16" PYTHON_SINGLE_TARGET="python3_12" PYTHON_TARGETS="python3_12" RUBY_TARGETS="ruby33" VIDEO_CARDS="amdgpu radeon radeonsi nvidia nouveau nvk" XTABLES_ADDONS="quota2 psd pknock lscan length2 ipv4options ipp2p iface geoip fuzzy condition tarpit sysrq proto logmark ipmark dhcpmac delude chaos account" Unset: ADDR2LINE, AR, ARFLAGS, AS, ASFLAGS, CC, CCLD, CONFIG_SHELL, CPP, CPPFLAGS, CTARGET, CXX, CXXFILT, ELFEDIT, EXTRA_ECONF, F77FLAGS, FC, GCOV, GPROF, INSTALL_MASK, LC_ALL, LD, LFLAGS, LIBTOOL, LINGUAS, MAKE, MAKEFLAGS, NM, OBJCOPY, OBJDUMP, PORTAGE_BINHOST, PORTAGE_BUNZIP2_COMMAND, PORTAGE_COMPRESS_FLAGS, PORTAGE_RSYNC_EXTRA_OPTS, PYTHONPATH, RANLIB, READELF, RUSTFLAGS, SIZE, STRINGS, STRIP, YACC, YFLAGS ================================================================= Package Settings ================================================================= sci-libs/caffe2-2.4.1-r4::gentoo was built with the following: USE="cuda fbgemm nnpack numpy opencl openmp qnnpack xnnpack -distributed -flash -gloo -mkl -mpi -onednn -openblas -rocm" ABI_X86="(64)" AMDGPU_TARGETS="gfx1030 gfx1100 gfx906 gfx908 gfx90a gfx942 -gfx1010 -gfx1011 -gfx1012 -gfx1031 -gfx1101 -gfx1102 -gfx803 -gfx900 -gfx940 -gfx941" PYTHON_SINGLE_TARGET="python3_12 -python3_10 -python3_11"
Created attachment 910165 [details] build log
Which version of cutlass are you using?
(In reply to Tupone Alfredo from comment #2) > What is the version of cutlass that are you using ? dev-libs/cutlass-3.5.1
+1 Downgrading to =dev-libs/cutlass-3.4.1 works for me as a workaround. I didn't dig into it but it is a bit weird that we are seeing SM90 headers regardless of the targeted CUDA architectures; I am trying to build for 8.9 only. Not familiar enough with caffe2/cutlass to know why that is.
The bug has been closed via the following commit(s): https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=589da8f799c441e138c5d26feacc8bfdd98c6ea2 commit 589da8f799c441e138c5d26feacc8bfdd98c6ea2 Author: Alfredo Tupone <tupone@gentoo.org> AuthorDate: 2024-11-28 05:38:41 +0000 Commit: Alfredo Tupone <tupone@gentoo.org> CommitDate: 2024-11-28 05:40:08 +0000 sci-libs/caffe2: fix dep for cutlass Closes: https://bugs.gentoo.org/943981 Signed-off-by: Alfredo Tupone <tupone@gentoo.org> sci-libs/caffe2/{caffe2-2.5.1-r2.ebuild => caffe2-2.5.1-r3.ebuild} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)