Fix compilation failures for targets like gfx1100. Loss.cu has a distinctive pattern of using CUDA_KERNEL_ASSERT after 64-bit getelementptr, which causes LLVM backend error. The problem is addressed in https://github.com/llvm/llvm-project/pull/174774, but requires a workaround for LLVM-22 (e.g., using __assert_fail, which does not emit llvm.trap()). See also: https://bugs.gentoo.org/978229 --- a/aten/src/ATen/native/cuda/Loss.cu +++ b/aten/src/ATen/native/cuda/Loss.cu @@ -157,12 +157,38 @@ int nll_loss_threads(int64_t nframe){ AT_PRIVATE_CASE_TYPE_USING_HINT(at::ScalarType::Byte, index_t, __VA_ARGS__) \ AT_PRIVATE_CASE_TYPE_USING_HINT(at::ScalarType::Long, index_t, __VA_ARGS__)) +// Workaround for LLVM bug in insertSimulatedTrap (fixed in +// https://github.com/llvm/llvm-project/pull/174774): abort() generates +// llvm.trap() which causes FinalizeISel to skip subsequent blocks containing +// S_ADD_U64_PSEUDO, leaving it unexpanded. Using __assert_fail avoids +// llvm.trap() while preserving the same assertion behavior. +#ifdef USE_ROCM +#define CHECK_INDEX_IN_CLASS(INDEX, N_CLASSES) \ + if constexpr(std::is_unsigned_v) { \ + if (C10_UNLIKELY(!(INDEX < N_CLASSES))) { \ + __assert_fail( \ + #INDEX " < " #N_CLASSES, \ + __FILE__, \ + static_cast(__LINE__), \ + __func__); \ + } \ + } else { \ + if (C10_UNLIKELY(!(INDEX >= 0 && INDEX < N_CLASSES))) { \ + __assert_fail( \ + #INDEX " >= 0 && " #INDEX " < " #N_CLASSES, \ + __FILE__, \ + static_cast(__LINE__), \ + __func__); \ + } \ + } +#else #define CHECK_INDEX_IN_CLASS(INDEX, N_CLASSES) \ if constexpr(std::is_unsigned_v) { \ CUDA_KERNEL_ASSERT(INDEX < N_CLASSES); \ } else { \ CUDA_KERNEL_ASSERT(INDEX >= 0 && INDEX < N_CLASSES); \ } +#endif template __global__ void nll_loss_forward_no_reduce_cuda_kernel(