From 17e51267c9b917183f23961962b8c857e9eb2e63 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Fri, 22 May 2026 23:05:06 +0000 Subject: [PATCH 01/21] Expert Parallelism: common C API + NCCL EP v0.1 backend Signed-off-by: Phuong Nguyen --- .gitmodules | 4 + 3rdparty/nccl | 1 + qa/L1_cpp_distributed/test.sh | 3 + setup.py | 127 +++ tests/cpp_distributed/CMakeLists.txt | 91 +- tests/cpp_distributed/run_test_ep.sh | 137 +++ tests/cpp_distributed/test_ep_common.h | 308 ++++++ tests/cpp_distributed/test_ep_coverage.cu | 379 ++++++++ tests/cpp_distributed/test_ep_init.cu | 64 ++ tests/cpp_distributed/test_ep_pipeline.cu | 890 ++++++++++++++++++ transformer_engine/common/CMakeLists.txt | 90 ++ transformer_engine/common/ep/ep_api.cpp | 76 ++ transformer_engine/common/ep/ep_api_stub.cpp | 61 ++ transformer_engine/common/ep/ep_backend.cpp | 514 ++++++++++ transformer_engine/common/ep/ep_backend.h | 114 +++ .../include/transformer_engine/comm_window.h | 32 + .../common/include/transformer_engine/ep.h | 161 ++++ 17 files changed, 3050 insertions(+), 2 deletions(-) create mode 160000 3rdparty/nccl create mode 100755 tests/cpp_distributed/run_test_ep.sh create mode 100644 tests/cpp_distributed/test_ep_common.h create mode 100644 tests/cpp_distributed/test_ep_coverage.cu create mode 100644 tests/cpp_distributed/test_ep_init.cu create mode 100644 tests/cpp_distributed/test_ep_pipeline.cu create mode 100644 transformer_engine/common/ep/ep_api.cpp create mode 100644 transformer_engine/common/ep/ep_api_stub.cpp create mode 100644 transformer_engine/common/ep/ep_backend.cpp create mode 100644 transformer_engine/common/ep/ep_backend.h create mode 100644 transformer_engine/common/include/transformer_engine/comm_window.h create mode 100644 transformer_engine/common/include/transformer_engine/ep.h diff --git a/.gitmodules b/.gitmodules index 4b188d6bb1..e531c95507 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,7 @@ [submodule "3rdparty/cutlass"] path = 3rdparty/cutlass url = 
https://github.com/NVIDIA/cutlass.git +[submodule "3rdparty/nccl"] + path = 3rdparty/nccl + url = https://github.com/NVIDIA/nccl.git + branch = v2.30u1 diff --git a/3rdparty/nccl b/3rdparty/nccl new file mode 160000 index 0000000000..6a9bc953ac --- /dev/null +++ b/3rdparty/nccl @@ -0,0 +1 @@ +Subproject commit 6a9bc953ac1c4eef92d5adbe3092d4c2cb0a4c98 diff --git a/qa/L1_cpp_distributed/test.sh b/qa/L1_cpp_distributed/test.sh index 8d767a4efb..7e5ce2cf0d 100755 --- a/qa/L1_cpp_distributed/test.sh +++ b/qa/L1_cpp_distributed/test.sh @@ -14,4 +14,7 @@ if [[ $(nvidia-smi --list-gpus | wc -l) -ge 4 ]]; then cmake -GNinja -S. -Bbuild cmake --build build mpirun --allow-run-as-root --np 4 --oversubscribe ./build/test_comm_gemm + + # EP suites; runner self-skips on pre-Hopper GPUs. + bash ./run_test_ep.sh 4 ./build fi diff --git a/setup.py b/setup.py index ec277b6349..db360c8a29 100644 --- a/setup.py +++ b/setup.py @@ -83,6 +83,34 @@ def setup_common_extension() -> CMakeExtension: cusolvermp_dir = os.getenv("CUSOLVERMP_HOME", "/usr") cmake_flags.append(f"-DCUSOLVERMP_DIR={cusolvermp_dir}") + # NCCL EP: on by default; auto-disabled if no arch >= 90. + # Set NVTE_BUILD_WITH_NCCL_EP=0/1 to force off/on. + nccl_ep_env = os.getenv("NVTE_BUILD_WITH_NCCL_EP") + explicit_nccl_ep = nccl_ep_env is not None + build_with_nccl_ep = bool(int(nccl_ep_env)) if explicit_nccl_ep else True + + if build_with_nccl_ep: + arch_tokens = [a.strip() for a in str(archs or "").split(";") if a.strip()] + has_hopper_or_newer = any(t.lower() == "native" for t in arch_tokens) or any( + int(t.rstrip("af")) >= 90 for t in arch_tokens if t.rstrip("af").isdigit() + ) + if not has_hopper_or_newer: + if explicit_nccl_ep: + raise RuntimeError( + "NVTE_BUILD_WITH_NCCL_EP=1 requires at least one CUDA arch >= 90 in " + f"NVTE_CUDA_ARCHS (got '{archs}'). Add '90' or unset NVTE_BUILD_WITH_NCCL_EP." 
+ ) + print( + "[NCCL EP] No CUDA arch >= 90 in NVTE_CUDA_ARCHS" + f" ('{archs}'); auto-disabling NCCL EP (nvte_ep_* will throw at runtime)." + ) + build_with_nccl_ep = False + + if build_with_nccl_ep: + build_nccl_ep_submodule() + else: + cmake_flags.append("-DNVTE_WITH_NCCL_EP=OFF") + # Add custom CMake arguments from environment variable nvte_cmake_extra_args = os.getenv("NVTE_CMAKE_EXTRA_ARGS") if nvte_cmake_extra_args: @@ -128,6 +156,105 @@ def setup_requirements() -> Tuple[List[str], List[str]]: return [remove_dups(reqs) for reqs in [install_reqs, test_reqs]] +def _discover_nccl_home() -> str: + """Resolve NCCL_HOME: honor env var, else probe well-known prefixes, else ldconfig.""" + env_home = os.environ.get("NCCL_HOME") + if env_home: + if (Path(env_home) / "include" / "nccl.h").exists(): + return env_home + print( + f"[NCCL EP] WARNING: NCCL_HOME='{env_home}' is set but " + f"'{env_home}/include/nccl.h' was not found; falling back to system probes." + ) + + for cand in ("/opt/nvidia/nccl", "/usr/local/nccl", "/usr"): + p = Path(cand) + if (p / "include" / "nccl.h").exists() and any( + (p / "lib" / name).exists() or (p / "lib64" / name).exists() + for name in ("libnccl.so", "libnccl.so.2") + ): + return str(p) + + try: + out = subprocess.check_output(["ldconfig", "-p"], stderr=subprocess.DEVNULL).decode() + for line in out.splitlines(): + if "libnccl.so" in line and "=>" in line: + lib_path = Path(line.split("=>")[-1].strip()) + root = lib_path.parent.parent + if (root / "include" / "nccl.h").exists(): + return str(root) + except (subprocess.CalledProcessError, FileNotFoundError): + pass + + raise RuntimeError( + "Could not locate NCCL core (nccl.h + libnccl.so). Set NCCL_HOME to the install prefix." + ) + + +def build_nccl_ep_submodule() -> str: + """Build libnccl_ep.so from the 3rdparty/nccl submodule. + + NCCL EP is on by default; the system NCCL core (libnccl.so) supplies the + headers and runtime symbols. Returns the submodule build directory. 
+ """ + nccl_root = current_file_path / "3rdparty" / "nccl" + if not (nccl_root / "Makefile").exists(): + raise RuntimeError( + f"NCCL submodule not found at {nccl_root}. " + "Run `git submodule update --init --recursive`." + ) + + build_dir = nccl_root / "build" + nccl_ep_lib = build_dir / "lib" / "libnccl_ep.so" + + archs = cuda_archs() or "90" + arch_list = [] + for a in str(archs).split(";"): + a = a.strip().rstrip("af") + if a and a.isdigit() and int(a) >= 90: + arch_list.append(a) + if not arch_list: + arch_list = ["90"] + gencode = " ".join(f"-gencode=arch=compute_{a},code=sm_{a}" for a in arch_list) + + nproc = os.cpu_count() or 8 + env = os.environ.copy() + env["NVCC_GENCODE"] = gencode + # NCCL EP needs the core NCCL headers + libnccl.so; write NCCL EP build + # outputs to the submodule's local build/ tree. + nccl_home = _discover_nccl_home() + env["NCCL_HOME"] = nccl_home + env["NCCL_EP_BUILDDIR"] = str(build_dir) + + if not nccl_ep_lib.exists(): + print(f"[NCCL EP] Building libnccl_ep.so (gencode='{gencode}')") + subprocess.check_call( + ["make", "-j", str(nproc), "-C", "contrib/nccl_ep", "lib"], + cwd=str(nccl_root), + env=env, + ) + + # TE's CMake expects nccl.h under 3rdparty/nccl/build/include/ for its + # version check. Mirror the top-level host headers from the system NCCL + # install — DON'T mirror nccl_device/ because the submodule ships its own + # newer copy at src/include/nccl_device/ with device-side templates that + # conflict with older system versions, and the JIT include path picks the + # submodule's. 
+ nccl_include = build_dir / "include" + nccl_include.mkdir(parents=True, exist_ok=True) + for cand in (Path(nccl_home) / "include", Path("/usr/include")): + p = Path(cand) + if (p / "nccl.h").exists(): + for name in ("nccl.h", "nccl_net.h", "nccl_tuner.h"): + src = p / name + dst = nccl_include / name + if src.exists() and not dst.exists(): + dst.symlink_to(src) + break + + return str(build_dir) + + def git_check_submodules() -> None: """ Attempt to checkout git submodules automatically during setup. diff --git a/tests/cpp_distributed/CMakeLists.txt b/tests/cpp_distributed/CMakeLists.txt index 0d7258a81d..3870f57911 100644 --- a/tests/cpp_distributed/CMakeLists.txt +++ b/tests/cpp_distributed/CMakeLists.txt @@ -30,7 +30,7 @@ if(NOT DEFINED TE_LIB_PATH) get_filename_component(TE_LIB_PATH ${TE_LIB_FILE} DIRECTORY) endif() -find_library(TE_LIB NAMES transformer_engine PATHS "${TE_LIB_PATH}/.." ${TE_LIB_PATH} ENV TE_LIB_PATH REQUIRED) +find_library(TE_LIB NAMES transformer_engine PATHS "${TE_LIB_PATH}/.." ${TE_LIB_PATH} ENV TE_LIB_PATH REQUIRED NO_CMAKE_SYSTEM_PATH) message(STATUS "Found transformer_engine library: ${TE_LIB}") include_directories(../../transformer_engine/common/include) @@ -46,12 +46,99 @@ add_executable(test_comm_gemm find_package(OpenMP REQUIRED) find_package(MPI REQUIRED) + +# ── NCCL library ────────────────────────────────────────────────────────────── +# Search order: NCCL_HOME env → 3rdparty/nccl submodule build → system paths. +set(NCCL_SUBMODULE_BUILD "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl/build") find_library(NCCL_LIB NAMES nccl libnccl - PATH_SUFFIXES lib + HINTS $ENV{NCCL_HOME}/lib ${NCCL_SUBMODULE_BUILD}/lib + PATH_SUFFIXES lib lib64 REQUIRED) + +# NCCL headers: prefer submodule build output (has the handle_init API), +# then submodule src, then system (CUDA toolkit). 
+set(NCCL_SUBMODULE_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl/build/include") +set(NCCL_SUBMODULE_SRC_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl/src/include") +if(EXISTS "${NCCL_SUBMODULE_INCLUDE}/nccl.h") + set(NCCL_INCLUDE_DIR "${NCCL_SUBMODULE_INCLUDE}") +elseif(EXISTS "${NCCL_SUBMODULE_SRC_INCLUDE}/nccl.h") + set(NCCL_INCLUDE_DIR "${NCCL_SUBMODULE_SRC_INCLUDE}") +elseif(DEFINED ENV{NCCL_HOME}) + set(NCCL_INCLUDE_DIR "$ENV{NCCL_HOME}/include") +endif() target_include_directories(test_comm_gemm PRIVATE ${MPI_CXX_INCLUDE_PATH} $ENV{CUBLASMP_HOME}/include) target_link_libraries(test_comm_gemm PUBLIC CUDA::cuda_driver CUDA::cudart GTest::gtest ${TE_LIB} CUDA::nvrtc MPI::MPI_CXX ${NCCL_LIB} OpenMP::OpenMP_CXX) include(GoogleTest) gtest_discover_tests(test_comm_gemm DISCOVERY_TIMEOUT 600) + +# ── EP distributed tests (HT mode) ───────────────────────────────────────── +# No MPI dependency — processes are spawned by run_test_ep.sh with +# --rank / --nranks flags. ncclUniqueId exchange uses a +# shared temp file (see test_ep_common.h for details). +# Headers + libs come from the in-tree 3rdparty/nccl submodule build. +set(NCCL_EP_SUBMODULE_ROOT + "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl") +find_library(NCCL_EP_LIB + NAMES nccl_ep libnccl_ep + HINTS ${NCCL_EP_SUBMODULE_ROOT}/build/lib + NO_DEFAULT_PATH + REQUIRED) + +set(NCCL_EP_INCLUDE_DIR "${NCCL_EP_SUBMODULE_ROOT}/contrib/nccl_ep/include") +if(NOT EXISTS "${NCCL_EP_INCLUDE_DIR}/nccl_ep.h") + message(FATAL_ERROR + "NCCL EP header not found at ${NCCL_EP_INCLUDE_DIR}/nccl_ep.h. " + "Run `git submodule update --init --recursive` to checkout 3rdparty/nccl.") +endif() +message(STATUS "EP test: NCCL EP headers: ${NCCL_EP_INCLUDE_DIR}") + +# Collect NCCL include dirs shared by all EP test targets (nccl_ep.h + nccl.h). 
+set(EP_TEST_NCCL_INCLUDES ${NCCL_EP_INCLUDE_DIR}) +if(DEFINED NCCL_INCLUDE_DIR) + list(APPEND EP_TEST_NCCL_INCLUDES ${NCCL_INCLUDE_DIR}) + message(STATUS "EP test: NCCL headers: ${NCCL_INCLUDE_DIR}") +endif() + +set(EP_TEST_COMMON_INCLUDES + ${EP_TEST_NCCL_INCLUDES} + ../../transformer_engine/common/include + ../../transformer_engine/common + ${CMAKE_CURRENT_SOURCE_DIR}) + +set(EP_TEST_COMMON_LIBS + CUDA::cuda_driver + CUDA::cudart + CUDA::nvrtc + GTest::gtest + ${TE_LIB} + ${NCCL_LIB} + ${NCCL_EP_LIB}) + +# nvrtc symbols are referenced from libtransformer_engine.so but not in its +# DT_NEEDED list (loaded via dlopen in Python). For cpp tests we link nvrtc +# explicitly with --no-as-needed so the linker keeps the dependency. +set(EP_TEST_LINK_OPTS "LINKER:--no-as-needed") + +# ── EP init tests (InitPath, HandleMemSizeQuery) ───────────────────────────── +add_executable(test_ep_init test_ep_init.cu) +target_include_directories(test_ep_init PRIVATE ${EP_TEST_COMMON_INCLUDES}) +target_link_libraries(test_ep_init PUBLIC ${EP_TEST_COMMON_LIBS}) +target_link_options(test_ep_init PUBLIC ${EP_TEST_LINK_OPTS}) + +# ── EP pipeline tests (dispatch, combine, bwd, integrated) ─────────────────── +add_executable(test_ep_pipeline test_ep_pipeline.cu) +target_include_directories(test_ep_pipeline PRIVATE ${EP_TEST_COMMON_INCLUDES}) +target_link_libraries(test_ep_pipeline PUBLIC ${EP_TEST_COMMON_LIBS}) +target_link_options(test_ep_pipeline PUBLIC ${EP_TEST_LINK_OPTS}) + +# ── EP coverage tests (multi-handle, top_k=1, empty experts, negatives, threading) ── +add_executable(test_ep_coverage test_ep_coverage.cu) +target_include_directories(test_ep_coverage PRIVATE ${EP_TEST_COMMON_INCLUDES}) +target_link_libraries(test_ep_coverage PUBLIC ${EP_TEST_COMMON_LIBS}) +target_link_options(test_ep_coverage PUBLIC ${EP_TEST_LINK_OPTS}) + +# Do NOT use gtest_discover_tests — these binaries require multi-process +# launch via run_test_ep.sh, not direct single-process execution. 
+message(STATUS "EP distributed tests enabled: ${NCCL_EP_LIB}")
diff --git a/tests/cpp_distributed/run_test_ep.sh b/tests/cpp_distributed/run_test_ep.sh
new file mode 100755
index 0000000000..017d3f807b
--- /dev/null
+++ b/tests/cpp_distributed/run_test_ep.sh
@@ -0,0 +1,142 @@
+#!/usr/bin/env bash
+# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+#
+# Run TE EP distributed unit tests across multiple GPUs.
+#
+# Spawns one background bash process per GPU (no MPI dependency), matching the
+# JAX multi-process launcher style. ncclUniqueId is exchanged via a shared
+# temp file (see test_ep_common.h). Each rank builds its own ncclComm_t and
+# passes it to nvte_ep_initialize.
+#
+# Usage:
+#   bash run_test_ep.sh [num_gpus] [build_dir]
+#
+# Defaults:
+#   num_gpus  = number of GPUs visible to nvidia-smi
+#   build_dir = <script_dir>/build
+#
+# Environment variables:
+#   GTEST_FILTER   — forwarded to all processes (e.g., "EPDispatchTest.*")
+#   TEST_TIMEOUT_S — per-process timeout in seconds (default: 180)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BUILD_DIR="${2:-${SCRIPT_DIR}/build}"
+NUM_GPUS="${1:-$(nvidia-smi -L 2>/dev/null | wc -l)}"
+TEST_TIMEOUT_S="${TEST_TIMEOUT_S:-180}"
+
+# Skip cleanly on pre-Hopper: NCCL EP requires SM>=90.
+# MIN_SM is the lowest compute capability among the visible GPUs, encoded as
+# major*10+minor; it is 0 when nvidia-smi reports nothing, which disables the skip.
+MIN_SM=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null \
+  | awk -F. 'NR==1 || ($1*10+$2) < min { min = $1*10+$2 } END { print min+0 }' || true)
+if (( MIN_SM > 0 && MIN_SM < 90 )); then
+  echo "NCCL EP requires SM>=90 (lowest visible GPU is SM${MIN_SM}); SKIPPING."
+  exit 0
+fi
+
+GTEST_ARGS="${GTEST_FILTER:+--gtest_filter=${GTEST_FILTER}}"
+OVERALL_FAIL=0
+
+# ---------------------------------------------------------------------------
+# run_suite BINARY SUITE_NAME MIN_GPUS
+# ---------------------------------------------------------------------------
+run_suite() {
+  local BINARY="$1"
+  local SUITE_NAME="$2"
+  local MIN_GPUS="${3:-2}"
+
+  local TEST_BIN="${BUILD_DIR}/${BINARY}"
+
+  if [[ ! -x "${TEST_BIN}" ]]; then
+    echo "ERROR: binary not found: ${TEST_BIN}"
+    echo "Build: cd ${SCRIPT_DIR} && mkdir -p build && cd build && cmake .. && make"
+    OVERALL_FAIL=1
+    return
+  fi
+
+  if (( NUM_GPUS < MIN_GPUS )); then
+    echo "${SUITE_NAME}: requires ${MIN_GPUS} GPUs, found ${NUM_GPUS}. Skipping."
+    return
+  fi
+
+  local TMPDIR_L="${TMPDIR:-/tmp}"
+  local UID_FILE="${TMPDIR_L}/te_ep_uid_${BINARY}_$$"
+  rm -f "${UID_FILE}"
+
+  local LOG_DIR
+  LOG_DIR=$(mktemp -d)
+  local FAIL=0
+
+  echo "=== ${SUITE_NAME} ==="
+  echo " GPUs: ${NUM_GPUS} Binary: ${TEST_BIN}"
+  echo
+
+  # Spawn one background process per GPU. ncclUniqueId is exchanged via the
+  # shared UID_FILE. Each process is wrapped in `timeout` to detect hangs early.
+  local PIDS=()
+  for i in $(seq 0 $((NUM_GPUS - 1))); do
+    timeout --foreground --signal=KILL "${TEST_TIMEOUT_S}" \
+      "${TEST_BIN}" \
+      --rank="${i}" \
+      --nranks="${NUM_GPUS}" \
+      --uid-file="${UID_FILE}" \
+      ${GTEST_ARGS} \
+      > "${LOG_DIR}/rank_${i}.log" 2>&1 &
+    PIDS+=($!)
+  done
+  for i in $(seq 0 $((NUM_GPUS - 1))); do
+    # Capture the child's real exit status. Inside `if ! wait ...; then`, $? is
+    # the status of the negated command (always 0), so timeouts went unreported.
+    local rc=0
+    wait "${PIDS[$i]}" || rc=$?
+    if (( rc != 0 )); then
+      FAIL=1
+      if [[ $rc -eq 137 || $rc -eq 124 ]]; then
+        echo " rank ${i}: TIMEOUT after ${TEST_TIMEOUT_S}s (rc=${rc})"
+      fi
+    fi
+  done
+
+  echo "--- Rank 0 output ---"
+  cat "${LOG_DIR}/rank_0.log"
+
+  if (( FAIL )); then
+    for i in $(seq 1 $((NUM_GPUS - 1))); do
+      echo "--- Rank ${i} output ---"
+      cat "${LOG_DIR}/rank_${i}.log"
+    done
+    echo "=== ${SUITE_NAME}: FAILED ==="
+    OVERALL_FAIL=1
+  else
+    echo "=== ${SUITE_NAME}: ALL PASSED ==="
+  fi
+
+  rm -rf "${LOG_DIR}"
+  rm -f "${UID_FILE}"
+}
+
+# ---------------------------------------------------------------------------
+# Cleanup on abort
+# ---------------------------------------------------------------------------
+cleanup() { rm -f "${TMPDIR:-/tmp}"/te_ep_uid_*_"$$" 2>/dev/null || true; }
+trap cleanup EXIT INT TERM
+
+# ---------------------------------------------------------------------------
+# Run all suites
+# ---------------------------------------------------------------------------
+run_suite "test_ep_init" "EP Init Tests" 2
+run_suite "test_ep_pipeline" "EP Pipeline Tests" 2
+run_suite "test_ep_coverage" "EP Coverage Tests" 2
+
+echo
+if (( OVERALL_FAIL )); then
+  echo "=== SOME SUITES FAILED ==="
+else
+  echo "=== ALL SUITES PASSED ==="
+fi
+
+exit "${OVERALL_FAIL}"
diff --git a/tests/cpp_distributed/test_ep_common.h b/tests/cpp_distributed/test_ep_common.h
new file mode 100644
index 0000000000..77baa92b0c
--- /dev/null
+++ b/tests/cpp_distributed/test_ep_common.h
@@ -0,0 +1,308 @@
+/*************************************************************************
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See LICENSE for license information.
+ ************************************************************************/
+
+/*
+ * Shared TE EP test infrastructure. Include once per TU; ep_bootstrap() in
+ * each test binary's main() populates process-level globals.
+ * Defaults: 4 experts/rank, hidden_dim=256, max_tokens_per_rank=64.
+ */
+#pragma once
+
+// NOTE(review): the header names of the #include directives below are missing
+// from this copy of the patch; restore them from the original commit.
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+// ── Error-checking macros ─────────────────────────────────────────────────────
+
+// CHECK_NCCL / CHECK_CUDA: for use where gtest's FAIL() is valid (test bodies).
+// A non-success result is reported through FAIL() with the library's error string.
+#define CHECK_NCCL(expr) \
+  do { \
+    ncclResult_t _err = (expr); \
+    if (_err != ncclSuccess) \
+      FAIL() << "NCCL error " << _err << ": " << ncclGetErrorString(_err); \
+  } while (false)
+
+#define CHECK_CUDA(expr) \
+  do { \
+    cudaError_t _err = (expr); \
+    if (_err != cudaSuccess) \
+      FAIL() << "CUDA error " << _err << ": " << cudaGetErrorString(_err); \
+  } while (false)
+
+// ASSERT_CUDA_OK / ASSERT_NCCL_OK: for code outside gtest test bodies.
+// A non-success result is printed to stderr and the process exits with EXIT_FAILURE.
+#define ASSERT_CUDA_OK(expr) \
+  do { \
+    cudaError_t _err = (expr); \
+    if (_err != cudaSuccess) { \
+      fprintf(stderr, "CUDA error %d: %s\n", _err, cudaGetErrorString(_err)); \
+      exit(EXIT_FAILURE); \
+    } \
+  } while (false)
+
+#define ASSERT_NCCL_OK(expr) \
+  do { \
+    ncclResult_t _err = (expr); \
+    if (_err != ncclSuccess) { \
+      fprintf(stderr, "NCCL error %d: %s\n", _err, ncclGetErrorString(_err)); \
+      exit(EXIT_FAILURE); \
+    } \
+  } while (false)
+
+// ── Process-level state ───────────────────────────────────────────────────────
+
+// Filled in by ep_parse_args from --rank/--process-id, --nranks/--num-processes
+// and --uid-file; the negative defaults mean "not provided yet".
+static int g_process_id = -1;
+static int g_num_processes = -1;
+static std::string g_uid_file;
+
+// Filled in by ep_bootstrap (g_hidden_dim and g_max_tokens_per_rank keep the
+// defaults given here).
+static int g_sm_major = -1; // set by ep_bootstrap; -1 until then
+static int g_ep_size = -1;
+static int g_num_experts = -1;
+static int g_hidden_dim = 256;
+static int g_max_tokens_per_rank = 64;
+static bool g_ep_initialized = false;
+static ncclComm_t g_ep_comm = nullptr; // owned by harness, destroyed in ep_teardown
+
+// ── TensorHandle RAII wrapper ─────────────────────────────────────────────────
+
+// View over a caller-owned device buffer; owns NVTETensor metadata only. Move-only.
+struct TensorHandle {
+  NVTETensor tensor = nullptr;  // owned: released with nvte_destroy_tensor
+  void* dev_ptr = nullptr;      // borrowed: never freed by this wrapper
+
+  TensorHandle() = default;
+
+  ~TensorHandle() {
+    if (tensor) nvte_destroy_tensor(tensor);
+  }
+
+  TensorHandle(const TensorHandle&) = delete;
+  TensorHandle& operator=(const TensorHandle&) = delete;
+
+  // Moving transfers the NVTETensor and leaves the source empty.
+  TensorHandle(TensorHandle&& o) noexcept : tensor(o.tensor), dev_ptr(o.dev_ptr) {
+    o.tensor = nullptr;
+    o.dev_ptr = nullptr;
+  }
+
+  TensorHandle& operator=(TensorHandle&& o) noexcept {
+    if (this != &o) {
+      // Release what we currently hold before taking over o's tensor.
+      if (tensor) nvte_destroy_tensor(tensor);
+      tensor = o.tensor;
+      dev_ptr = o.dev_ptr;
+      o.tensor = nullptr;
+      o.dev_ptr = nullptr;
+    }
+    return *this;
+  }
+};
+
+// Wraps `dev_ptr` in a freshly created NVTETensor whose row-wise data has the
+// given shape and dtype. The device buffer stays owned by the caller.
+static TensorHandle make_nvte_tensor(void* dev_ptr,
+                                     const std::vector<size_t>& shape,
+                                     NVTEDType dtype) {
+  TensorHandle h;
+  h.dev_ptr = dev_ptr;
+  h.tensor = nvte_create_tensor(NVTE_DELAYED_TENSOR_SCALING);
+
+  NVTEShape s{};  // zero the unused trailing dims instead of leaving them indeterminate
+  s.ndim = shape.size();
+  for (size_t i = 0; i < shape.size(); ++i) s.data[i] = shape[i];
+
+  NVTEBasicTensor bt;
+  bt.data_ptr = dev_ptr;
+  bt.dtype = dtype;
+  bt.shape = s;
+  nvte_set_tensor_param_v2(h.tensor, kNVTERowwiseData, &bt, sizeof(bt));
+
+  return h;
+}
+
+// RAII owner for a cudaMalloc'd device buffer; frees on destruction.
+template <typename T>
+struct DevBuf {
+  T* ptr = nullptr;
+  size_t count = 0;  // number of T elements currently allocated (0 when empty)
+
+  DevBuf() = default;
+  explicit DevBuf(size_t n) { alloc(n); }
+  ~DevBuf() { reset(); }
+
+  DevBuf(const DevBuf&) = delete;
+  DevBuf& operator=(const DevBuf&) = delete;
+
+  DevBuf(DevBuf&& o) noexcept : ptr(o.ptr), count(o.count) {
+    o.ptr = nullptr;
+    o.count = 0;
+  }
+  DevBuf& operator=(DevBuf&& o) noexcept {
+    if (this != &o) {
+      reset();
+      ptr = o.ptr;
+      count = o.count;
+      o.ptr = nullptr;
+      o.count = 0;
+    }
+    return *this;
+  }
+
+  // Frees any current buffer, then allocates room for n elements. On cudaMalloc
+  // failure the error is printed and the buffer is left empty (ptr == nullptr).
+  void alloc(size_t n) {
+    reset();
+    if (n == 0) return;
+    const cudaError_t e = cudaMalloc(&ptr, n * sizeof(T));
+    if (e != cudaSuccess) {
+      fprintf(stderr, "DevBuf cudaMalloc(%zu) failed: %s\n", n * sizeof(T),
+              cudaGetErrorString(e));
+      ptr = nullptr;
+      return;
+    }
+    count = n;
+  }
+
+  void reset() {
+    if (ptr) {
+      cudaFree(ptr);
+      ptr = nullptr;
+    }
+    count = 0;
+  }
+
+  T* get() const { return ptr; }
+  size_t bytes() const { return count * sizeof(T); }
+};
+
+// ── Shared routing helper ─────────────────────────────────────────────────────
+
+// Balanced round-robin routing: token t on rank r maps top_k experts to
+//   (r * num_local_experts + t * top_k + k) % num_experts
+// The result is row-major [num_tokens, top_k], so the flat position equals
+// t * top_k + k and a single loop over it is enough.
+static inline std::vector<int64_t> routing_balanced(
+    int rank, int num_tokens, int top_k, int num_experts, int num_local_experts) {
+  const int total = num_tokens * top_k;
+  const int base = rank * num_local_experts;
+  std::vector<int64_t> idx(total);
+  for (int flat = 0; flat < total; ++flat) idx[flat] = (base + flat) % num_experts;
+  return idx;
+}
+
+// ── File-based ncclUniqueId exchange ─────────────────────────────────────────
+
+// Rank 0 creates the ncclUniqueId and writes it to g_uid_file; every other rank
+// polls that file (50 ms period, 60 s deadline) until it can read a full id.
+// Any failure terminates the process with EXIT_FAILURE.
+static void exchange_unique_id(ncclUniqueId* uid) {
+  const size_t sz = sizeof(ncclUniqueId);
+
+  if (g_process_id == 0) {
+    ASSERT_NCCL_OK(ncclGetUniqueId(uid));
+    FILE* f = fopen(g_uid_file.c_str(), "wb");
+    if (!f) {
+      fprintf(stderr, "Cannot open uid file: %s\n", g_uid_file.c_str());
+      exit(EXIT_FAILURE);
+    }
+    // A short write would leave the readers polling until their deadline, so
+    // fail here instead of silently publishing a truncated id.
+    const size_t written = fwrite(uid, 1, sz, f);
+    const int close_rc = fclose(f);
+    if (written != sz || close_rc != 0) {
+      fprintf(stderr, "Cannot write uid file: %s\n", g_uid_file.c_str());
+      exit(EXIT_FAILURE);
+    }
+    return;
+  }
+
+  const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(60);
+  while (true) {
+    if (FILE* f = fopen(g_uid_file.c_str(), "rb")) {
+      // fread returns fewer than sz bytes while the writer is still in
+      // progress; in that case just retry on the next poll.
+      const size_t n = fread(uid, 1, sz, f);
+      fclose(f);
+      if (n == sz) break;
+    }
+    if (std::chrono::steady_clock::now() > deadline) {
+      fprintf(stderr, "Process %d: timed out waiting for uid file\n", g_process_id);
+      exit(EXIT_FAILURE);
+    }
+    std::this_thread::sleep_for(std::chrono::milliseconds(50));
+  }
+}
+
+// ── CLI parsing ───────────────────────────────────────────────────────────────
+
+// Reads --rank/--process-id, --nranks/--num-processes and --uid-file from argv
+// into the process-level globals; other arguments (gtest flags) are ignored.
+// Prints usage and exits when the rank or rank count is missing or inconsistent.
+static void ep_parse_args(int argc, char* argv[]) {
+  for (int i = 1; i < argc; ++i) {
+    const std::string arg(argv[i]);
+    std::string value;
+    // Returns true and stores the text after `prefix` when `arg` starts with it.
+    auto take = [&arg, &value](const std::string& prefix) {
+      if (arg.compare(0, prefix.size(), prefix) != 0) return false;
+      value = arg.substr(prefix.size());
+      return true;
+    };
+    if (take("--process-id=") || take("--rank=")) {
+      g_process_id = std::stoi(value);
+    } else if (take("--num-processes=") || take("--nranks=")) {
+      g_num_processes = std::stoi(value);
+    } else if (take("--uid-file=")) {
+      g_uid_file = value;
+    }
+  }
+
+  if (g_process_id < 0 || g_num_processes <= 0 || g_process_id >= g_num_processes) {
+    fprintf(stderr,
+            "Usage: %s --rank=N --nranks=N [--uid-file=path] [gtest flags]\n"
+            " Aliases: --process-id=N, --num-processes=N\n",
+            argc > 0 ? argv[0] : "test_ep");
+    exit(EXIT_FAILURE);
+  }
+
+  if (g_uid_file.empty()) {
+    // The fallback path must be identical on every rank: rank 0 writes the file
+    // and all other ranks read it. (Embedding the rank in the name made each
+    // rank wait for a file nobody writes.) A stale file from a crashed run can
+    // still be picked up, so launchers should pass a unique --uid-file.
+    const char* t = getenv("TMPDIR");
+    if (!t) t = "/tmp";
+    g_uid_file = std::string(t) + "/te_ep_uid_default";
+  }
+}
+
+// ── Bootstrap / teardown ──────────────────────────────────────────────────────
+
+// Returns false if the binary should exit without running tests (wrong SM, etc.).
+static bool ep_bootstrap(int argc, char* argv[]) {
+  ep_parse_args(argc, argv);
+  ::testing::InitGoogleTest(&argc, argv);
+
+  // Pick this rank's GPU. The device count is validated first because it is
+  // used as a modulus below; zero devices would otherwise divide by zero.
+  int device_count = 0;
+  ASSERT_CUDA_OK(cudaGetDeviceCount(&device_count));
+  if (device_count <= 0) {
+    fprintf(stderr, "Process %d: no CUDA device visible\n", g_process_id);
+    exit(EXIT_FAILURE);
+  }
+  ASSERT_CUDA_OK(cudaSetDevice(g_process_id % device_count));
+
+  int device = 0;
+  int major = 0;
+  ASSERT_CUDA_OK(cudaGetDevice(&device));
+  ASSERT_CUDA_OK(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
+  g_sm_major = major;
+  if (major < 9) {
+    if (g_process_id == 0)
+      printf("SKIP: EP requires SM_90+ (device is SM_%d0)\n", major);
+    return false;
+  }
+  if (g_num_processes < 2) {
+    if (g_process_id == 0)
+      printf("SKIP: at least 2 processes required\n");
+    return false;
+  }
+
+  g_ep_size = g_num_processes;
+  g_num_experts = g_ep_size * 4;  // 4 experts per rank
+
+  ncclUniqueId uid{};
+  exchange_unique_id(&uid);
+
+  NVTEEpGroupConfig group_config{};
+  group_config.ep_size = g_ep_size;
+  group_config.num_experts = g_num_experts;
+  group_config.max_tokens_per_rank = g_max_tokens_per_rank;
+  // Worst-case for top_k fan-out: ep_size * max_tokens_per_rank * 2.
+  group_config.max_recv_tokens_per_rank = g_ep_size * g_max_tokens_per_rank * 2;
+  group_config.hidden_dim = g_hidden_dim;
+
+  ASSERT_NCCL_OK(ncclCommInitRank(&g_ep_comm, g_num_processes, uid, g_process_id));
+  // NOTE(review): the cast target was lost in this copy of the patch; void* is
+  // assumed for the opaque communicator argument — confirm against ep.h.
+  nvte_ep_initialize(static_cast<void*>(g_ep_comm), group_config);
+
+  if (g_process_id == 0) {
+    printf("EP initialized: ep_size=%d num_experts=%d "
+           "hidden_dim=%d max_tokens_per_rank=%d\n",
+           g_ep_size, g_num_experts, g_hidden_dim, g_max_tokens_per_rank);
+  }
+
+  g_ep_initialized = true;
+  return true;
+}
+
+// Tear down in dependency order: backend's ep_group reads from ep_comm,
+// so destroy the group first, then the comm.
+static void ep_teardown() {
+  // Shut the EP backend down before destroying the communicator it was given.
+  if (g_ep_initialized) {
+    nvte_ep_shutdown();
+    if (ncclComm_t comm = g_ep_comm) {
+      ncclCommDestroy(comm);
+      g_ep_comm = nullptr;
+    }
+    g_ep_initialized = false;
+  }
+  // Rank 0 wrote the uid file, so rank 0 removes it.
+  if (g_process_id == 0) remove(g_uid_file.c_str());
+}
diff --git a/tests/cpp_distributed/test_ep_coverage.cu b/tests/cpp_distributed/test_ep_coverage.cu
new file mode 100644
index 0000000000..ef7941905d
--- /dev/null
+++ b/tests/cpp_distributed/test_ep_coverage.cu
@@ -0,0 +1,379 @@
+/*************************************************************************
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See LICENSE for license information.
+ ************************************************************************/
+
+/*
+ * EP C-API coverage tests (paths not exercised by the pipeline suite).
+ *
+ * MultiHandleAllocTest — distinct handle ids; each works end-to-end.
+ * TopK1Test — top_k=1 dispatch/combine/bwd round-trip.
+ * EmptyExpertsTest — alignment ∈ {0, 2, 8, 16} with experts receiving 0 tokens.
+ * NegativeTests — alignment mismatch and null handle_mem must throw.
+ */
+
+#include "test_ep_common.h"
+
+#include
+#include
+
+// top1 -> expert 0, top2 -> expert 2; leaves local-expert 1 empty between two
+// full experts. Requires top_k >= 2 and num_experts >= 3.
+static std::vector<int64_t> routing_skip_middle(int num_tokens, int top_k) {
+  std::vector<int64_t> idx(num_tokens * top_k);
+  for (int t = 0; t < num_tokens; ++t) {
+    for (int k = 0; k < top_k; ++k) {
+      // k=0 -> expert 0, k=1 -> expert 2, k>=2 -> expert 2+k (distinct stragglers).
+      // Looping over k also keeps top_k == 0 from writing past the empty vector.
+      idx[t * top_k + k] = (k == 0) ? 0 : (k == 1 ? 2 : 2 + k);
+    }
+  }
+  return idx;
+}
+
+// Host buffer of num_tokens * hidden_dim bf16 values, all equal to `val`.
+static std::vector<nv_bfloat16> tokens_constant(int num_tokens, int hidden_dim, float val) {
+  return std::vector<nv_bfloat16>(num_tokens * hidden_dim, __float2bfloat16(val));
+}
+
+namespace {
+
+// Common fixture: copies the process-level EP configuration into members and
+// provides make_bundle() to allocate one layer's worth of device buffers.
+class EpCoverageBase : public ::testing::Test {
+ protected:
+  int ep_size_, num_experts_, num_local_experts_, hidden_dim_;
+  int max_tokens_per_rank_;
+
+  void SetUp() override {
+    if (g_sm_major < 9)
+      GTEST_SKIP() << "EP requires SM_90+ (device is SM_" << g_sm_major << "0)";
+    ASSERT_GE(g_num_processes, 2);
+    ASSERT_TRUE(g_ep_initialized);
+    ep_size_ = g_ep_size;
+    num_experts_ = g_num_experts;
+    num_local_experts_ = num_experts_ / ep_size_;
+    hidden_dim_ = g_hidden_dim;
+    max_tokens_per_rank_ = g_max_tokens_per_rank;
+  }
+
+  // Helper: allocate buffers + tensor views for a single dispatch+combine.
+  // Element types follow the NVTE dtype each buffer is wrapped with in the tests.
+  struct Bundle {
+    DevBuf<int64_t> topk_idx;
+    DevBuf<float> topk_weights;
+    DevBuf<nv_bfloat16> tokens;
+    DevBuf<int32_t> token_counts;
+    DevBuf<uint8_t> handle_mem;
+    DevBuf<nv_bfloat16> recv_tokens;
+    DevBuf<float> recv_topk_weights;
+    DevBuf<nv_bfloat16> result;
+    uint64_t handle_id = 0;
+    size_t handle_mem_size = 0;
+    size_t recv_capacity = 0;
+  };
+
+  // Registers a layer with the given config and allocates every buffer a
+  // dispatch+combine round-trip needs. recv_capacity matches the
+  // ep_size * max_tokens_per_rank * 2 bound used when the group was set up.
+  Bundle make_bundle(int num_tokens, int top_k, int num_local_experts,
+                     size_t alignment) {
+    Bundle b;
+    b.recv_capacity = static_cast<size_t>(ep_size_) * max_tokens_per_rank_ * 2;
+    b.topk_idx.alloc(num_tokens * top_k);
+    b.topk_weights.alloc(num_tokens * top_k);
+    b.tokens.alloc(num_tokens * hidden_dim_);
+    b.token_counts.alloc(num_local_experts);
+    b.recv_tokens.alloc(b.recv_capacity * hidden_dim_);
+    b.recv_topk_weights.alloc(b.recv_capacity);
+    b.result.alloc(num_tokens * hidden_dim_);
+    NVTEEpLayerConfig cfg{num_local_experts, top_k, alignment};
+    b.handle_id = nvte_ep_register_layer(cfg, &b.handle_mem_size);
+    b.handle_mem.alloc(b.handle_mem_size);
+    return b;
+  }
+};
+
+}  // namespace
+
+// =============================================================================
+// MultiHandleAllocTest: ids are distinct and each is independently usable.
+// ============================================================================= + +class MultiHandleAllocTest : public EpCoverageBase {}; + +TEST_F(MultiHandleAllocTest, IdsAreDistinct) { + NVTEEpLayerConfig cfg{num_local_experts_, /*top_k=*/2, /*alignment=*/0}; + const int kN = 8; + std::vector ids(kN); + for (int i = 0; i < kN; ++i) { + size_t sz = 0; + ids[i] = nvte_ep_register_layer(cfg, &sz); + } + for (int i = 0; i < kN; ++i) { + EXPECT_NE(ids[i], 0u) << "handle_id 0 is reserved as \"no id\""; + for (int j = i + 1; j < kN; ++j) + EXPECT_NE(ids[i], ids[j]) << "duplicate id " << ids[i] << " at indices " << i << ", " << j; + } +} + +TEST_F(MultiHandleAllocTest, TwoHandlesCoexist) { + const int num_tokens = 16, top_k = 2; + Bundle a = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); + Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); + + auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, + num_experts_, num_local_experts_); + std::vector h_w(num_tokens * top_k, 1.0f / top_k); + auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.5f); + for (Bundle* x : {&a, &b}) { + CHECK_CUDA(cudaMemcpy(x->topk_idx.get(), h_idx.data(), + h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(x->topk_weights.get(), h_w.data(), + h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(x->tokens.get(), h_tok.data(), + h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); + } + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + ASSERT_NE(a.handle_id, b.handle_id); + + auto run_one = [&](Bundle& x) { + auto topk_idx = make_nvte_tensor(x.topk_idx.get(), {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); + auto topk_weights = make_nvte_tensor(x.topk_weights.get(), {(size_t)num_tokens, (size_t)top_k}, kNVTEFloat32); + auto token_counts = make_nvte_tensor(x.token_counts.get(), {(size_t)num_local_experts_}, kNVTEInt32); + auto handle_mem = 
make_nvte_tensor(x.handle_mem.get(), {x.handle_mem_size}, kNVTEByte); + auto tokens = make_nvte_tensor(x.tokens.get(), {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); + auto recv_tokens = make_nvte_tensor(x.recv_tokens.get(), {x.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); + auto recv_w = make_nvte_tensor(x.recv_topk_weights.get(), {x.recv_capacity}, kNVTEFloat32); + auto result = make_nvte_tensor(x.result.get(), {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); + NVTEEpHandle h{x.handle_id, handle_mem.tensor}; + ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx.tensor, token_counts.tensor, + /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(h, topk_idx.tensor, tokens.tensor, + NVTECommWindow{}, topk_weights.tensor, NVTECommWindow{}, + recv_tokens.tensor, NVTECommWindow{}, recv_w.tensor, + NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens.tensor, NVTECommWindow{}, + result.tensor, stream)); + }; + run_one(a); + run_one(b); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // Both round-trips must produce result == top_k * 0.5 = 1.0. + for (Bundle* x : {&a, &b}) { + std::vector h_res(num_tokens * hidden_dim_); + CHECK_CUDA(cudaMemcpy(h_res.data(), x->result.get(), + h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; + for (int t = 0; t < num_tokens; ++t) + for (int p : probes) + EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), + static_cast(top_k) * 0.5f, 1e-2f); + } + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +// ============================================================================= +// TopK1Test: top_k=1 prepare/dispatch/combine forward round-trip. 
+// ============================================================================= + +class TopK1Test : public EpCoverageBase {}; + +TEST_F(TopK1Test, RoundTrip) { + const int num_tokens = 16, top_k = 1; + Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); + + auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, + num_experts_, num_local_experts_); + std::vector h_w(num_tokens * top_k, 1.0f); // top_k=1: weight is unity + auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.25f); + CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), + h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), + h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); + + auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); + auto topk_weights_t = make_nvte_tensor(b.topk_weights.get(), + {(size_t)num_tokens, (size_t)top_k}, kNVTEFloat32); + auto token_counts_t = make_nvte_tensor(b.token_counts.get(), + {(size_t)num_local_experts_}, kNVTEInt32); + auto handle_mem_t = make_nvte_tensor(b.handle_mem.get(), + {b.handle_mem_size}, kNVTEByte); + auto tokens_t = make_nvte_tensor(b.tokens.get(), + {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); + auto recv_tokens_t = make_nvte_tensor(b.recv_tokens.get(), + {b.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); + auto recv_w_t = make_nvte_tensor(b.recv_topk_weights.get(), + {b.recv_capacity}, kNVTEFloat32); + auto result_t = make_nvte_tensor(b.result.get(), + {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + NVTEEpHandle h{b.handle_id, handle_mem_t.tensor}; + ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, + /*alignment=*/0, stream)); + 
ASSERT_NO_THROW(nvte_ep_dispatch(h, topk_idx_t.tensor, + tokens_t.tensor, NVTECommWindow{}, topk_weights_t.tensor, + NVTECommWindow{}, recv_tokens_t.tensor, NVTECommWindow{}, + recv_w_t.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens_t.tensor, + NVTECommWindow{}, result_t.tensor, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // top_k=1: combine is unweighted gather, so result[t] == tokens[t]. + std::vector h_res(num_tokens * hidden_dim_); + CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), + h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; + for (int t = 0; t < num_tokens; ++t) + for (int p : probes) + EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), 0.25f, 1e-2f) + << "tok " << t << " hidden " << p; + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +// ============================================================================= +// EmptyExpertsTest: alignment ∈ {0, 2, 8, 16}, tokens are routed to experts 0 +// and 2 only (expert 1 stays empty). Round-trip must produce result == top_k * +// tokens regardless of the per-expert padding choice. +// ============================================================================= + +class EmptyExpertsTest : public EpCoverageBase, + public ::testing::WithParamInterface {}; + +TEST_P(EmptyExpertsTest, RoundTripCorrect) { + // routing_skip_middle needs experts {0, 2, ...}; smallest viable num_experts is 3. + ASSERT_GE(num_experts_, 3); + const size_t alignment = GetParam(); + const int num_tokens = 16, top_k = 2; + Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, alignment); + + // top1 -> expert 0, top2 -> expert 2; rank 0's local-expert 1 receives 0 + // tokens between two non-empty experts. 
+ std::vector h_idx = routing_skip_middle(num_tokens, top_k); + std::vector h_w(num_tokens * top_k, 1.0f / top_k); + auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.3f); + + CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), + h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), + h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); + + auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); + auto topk_weights_t = make_nvte_tensor(b.topk_weights.get(), + {(size_t)num_tokens, (size_t)top_k}, kNVTEFloat32); + auto token_counts_t = make_nvte_tensor(b.token_counts.get(), + {(size_t)num_local_experts_}, kNVTEInt32); + auto handle_mem_t = make_nvte_tensor(b.handle_mem.get(), + {b.handle_mem_size}, kNVTEByte); + auto tokens_t = make_nvte_tensor(b.tokens.get(), + {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); + auto recv_tokens_t = make_nvte_tensor(b.recv_tokens.get(), + {b.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); + auto recv_w_t = make_nvte_tensor(b.recv_topk_weights.get(), + {b.recv_capacity}, kNVTEFloat32); + auto result_t = make_nvte_tensor(b.result.get(), + {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + NVTEEpHandle h{b.handle_id, handle_mem_t.tensor}; + ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, + alignment, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(h, topk_idx_t.tensor, + tokens_t.tensor, NVTECommWindow{}, topk_weights_t.tensor, + NVTECommWindow{}, recv_tokens_t.tensor, NVTECommWindow{}, + recv_w_t.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens_t.tensor, + NVTECommWindow{}, result_t.tensor, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + 
// Identity expert + uniform weights: result[t] == top_k * tokens[t]. + std::vector h_res(num_tokens * hidden_dim_); + CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), + h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + const float expected = static_cast(top_k) * 0.3f; + const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; + for (int t = 0; t < num_tokens; ++t) + for (int p : probes) + EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), expected, 1e-2f) + << "alignment=" << alignment << " tok=" << t << " hidden=" << p; + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +INSTANTIATE_TEST_SUITE_P(Alignments, EmptyExpertsTest, + ::testing::Values(0, 2, 8, 16)); + +// ============================================================================= +// NegativeTests: prepare/dispatch must surface bad inputs as exceptions. +// ============================================================================= + +class NegativeTests : public EpCoverageBase {}; + +TEST_F(NegativeTests, AlignmentMismatchThrows) { + const int num_tokens = 8, top_k = 2; + // Allocate handle for alignment=0, then call prepare with alignment=16. 
+ Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); + auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, + num_experts_, num_local_experts_); + CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); + + auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); + auto token_counts_t = make_nvte_tensor(b.token_counts.get(), + {(size_t)num_local_experts_}, kNVTEInt32); + auto handle_mem_t = make_nvte_tensor(b.handle_mem.get(), + {b.handle_mem_size}, kNVTEByte); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + NVTEEpHandle h{b.handle_id, handle_mem_t.tensor}; + EXPECT_THROW(nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, + /*alignment=*/16, stream), + std::exception); + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +TEST_F(NegativeTests, NullHandleMemThrows) { + const int num_tokens = 8, top_k = 2; + Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); + auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, + num_experts_, num_local_experts_); + CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); + + auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); + auto token_counts_t = make_nvte_tensor(b.token_counts.get(), + {(size_t)num_local_experts_}, kNVTEInt32); + // Construct a tensor view backed by a null device pointer. 
+ auto null_hm_t = make_nvte_tensor(nullptr, {b.handle_mem_size}, kNVTEByte); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + NVTEEpHandle h{b.handle_id, null_hm_t.tensor}; + EXPECT_THROW(nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, + /*alignment=*/0, stream), + std::exception); + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +// ── main ────────────────────────────────────────────────────────────────────── + +int main(int argc, char* argv[]) { + if (!ep_bootstrap(argc, argv)) return 0; + int ret = RUN_ALL_TESTS(); + ep_teardown(); + return ret; +} diff --git a/tests/cpp_distributed/test_ep_init.cu b/tests/cpp_distributed/test_ep_init.cu new file mode 100644 index 0000000000..08744dfee5 --- /dev/null +++ b/tests/cpp_distributed/test_ep_init.cu @@ -0,0 +1,64 @@ +/************************************************************************* + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +/* + * Unit tests for EP initialization paths. + * + * Tests: + * EPInitTest/InitPath — backend is live after init, handle_mem_size > 0 + * EPInitTest/NumLocalExperts — handle_mem_size > 0 for each tested num_local_experts value + * + * Run via run_test_ep.sh (both uid and comm init paths are tested by the script). 
+ */ + +#include "test_ep_common.h" + +// ── Fixture ─────────────────────────────────────────────────────────────────── + +class EPInitTest : public ::testing::Test { + protected: + void SetUp() override { + if (g_sm_major < 9) + GTEST_SKIP() << "EP requires SM_90+ (device is SM_" << g_sm_major << "0)"; + ASSERT_GE(g_num_processes, 2) << "EP tests require at least 2 processes"; + ASSERT_TRUE(g_ep_initialized) << "EP not initialized"; + } +}; + +// ── Tests ───────────────────────────────────────────────────────────────────── + +TEST_F(EPInitTest, InitPath) { + int nle = g_num_experts / g_ep_size; + NVTEEpLayerConfig cfg{nle, /*top_k=*/2}; + size_t sz = 0; + (void)nvte_ep_register_layer(cfg, &sz); + ASSERT_GT(sz, 0u) << "handle_mem_size must be > 0 after init"; + + if (g_process_id == 0) { + printf(" handle_mem : %zu bytes\n", sz); + } +} + +TEST_F(EPInitTest, NumLocalExperts) { + // handle_mem_size should be > 0 for any valid num_local_experts value. + for (int nle : {1, g_num_experts / g_ep_size}) { + NVTEEpLayerConfig cfg{nle, /*top_k=*/2}; + size_t sz = 0; + (void)nvte_ep_register_layer(cfg, &sz); + ASSERT_GT(sz, 0u) << "num_local_experts=" << nle; + if (g_process_id == 0) + printf(" nle=%-3d handle_mem_size=%zu bytes\n", nle, sz); + } +} + +// ── main ────────────────────────────────────────────────────────────────────── + +int main(int argc, char* argv[]) { + if (!ep_bootstrap(argc, argv)) return 0; + int ret = RUN_ALL_TESTS(); + ep_teardown(); + return ret; +} diff --git a/tests/cpp_distributed/test_ep_pipeline.cu b/tests/cpp_distributed/test_ep_pipeline.cu new file mode 100644 index 0000000000..41f83a6d11 --- /dev/null +++ b/tests/cpp_distributed/test_ep_pipeline.cu @@ -0,0 +1,890 @@ +/************************************************************************* + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. 
+ ************************************************************************/ + +/* + * EP pipeline tests: smallest-scope first. + * + * EPDispatchTest/PrepareAndDispatch — exact recv values + per-expert counts + * EPCombineTest/Combine — round-trip: out == top_k * tokens + * EPCombineBwdTest/CombineBwdCheck — exact grad_expert values + * EPDispatchBwdTest/DispatchBwdCheck — exact grad_tokens + * EPDispatchBwdGradWeightsTest/RoundTrip — exact per-(t, k) grad_topk_weights + * EPPipelineTest/FullForwardBackward — fwd + bwd NaN/Inf check + * + * Routing: token t on rank r → expert (r * num_local_experts + t * top_k + k) % num_experts + * Token values: rank r, token t → all hidden dims = (r * num_tokens + t + 1) / 256 + * + * Closed-form expected values: + * dispatch recv: multiset of source-token values routed to this rank's experts + * combine: result[t] == top_k * tokens[t] + * combine_bwd: grad_expert[slot] == d_result[t] (no weighting) + * dispatch_bwd: grad_tokens[t] == top_k * d_result[t] + */ + +#include "test_ep_common.h" + +#include +#include +#include +#include + +// ── Deterministic routing helpers ───────────────────────────────────────────── + +// Token value for (rank, t): (rank * num_tokens + t + 1) / 256. Step 1/256 is +// bf16-exact and unique across (rank, t) when rank * num_tokens + t < 256. 
+static inline float token_value(int rank, int t, int num_tokens) { + return static_cast(rank * num_tokens + t + 1) * (1.0f / 256.0f); +} + +static std::vector generate_tokens(int rank, int num_tokens, int hidden_dim) { + std::vector v(num_tokens * hidden_dim); + for (int t = 0; t < num_tokens; ++t) { + nv_bfloat16 val = __float2bfloat16(token_value(rank, t, num_tokens)); + for (int h = 0; h < hidden_dim; ++h) + v[t * hidden_dim + h] = val; + } + return v; +} + +static std::vector expected_token_counts( + int recv_rank, int num_processes, int num_tokens, int top_k, + int num_experts, int num_local_experts) { + int base = recv_rank * num_local_experts; + std::vector cnt(num_local_experts, 0); + for (int src = 0; src < num_processes; ++src) { + auto idx = routing_balanced(src, num_tokens, top_k, num_experts, num_local_experts); + for (int t = 0; t < num_tokens; ++t) + for (int k = 0; k < top_k; ++k) { + int64_t e = idx[t * top_k + k]; + if (e >= base && e < base + num_local_experts) ++cnt[e - base]; + } + } + return cnt; +} + +static std::vector expected_recv_values_sorted( + int recv_rank, int num_processes, int num_tokens, int top_k, + int num_experts, int num_local_experts) { + int base = recv_rank * num_local_experts; + std::vector vals; + for (int src = 0; src < num_processes; ++src) { + auto idx = routing_balanced(src, num_tokens, top_k, num_experts, num_local_experts); + for (int t = 0; t < num_tokens; ++t) + for (int k = 0; k < top_k; ++k) { + int64_t e = idx[t * top_k + k]; + if (e >= base && e < base + num_local_experts) { + float raw = token_value(src, t, num_tokens); + vals.push_back(__bfloat162float(__float2bfloat16(raw))); + } + } + } + std::sort(vals.begin(), vals.end()); + return vals; +} + +// BF16 has 7 mantissa bits; relative ULP ≈ 2^-7. Use 4× headroom for +// accumulation noise inside dispatch/combine. 
+static float bf16_tol(float magnitude) { + return 4.f * std::ldexp(std::fabs(magnitude) + 1e-3f, -7); +} + +static bool check_no_nan_inf(const nv_bfloat16* dev, int count, const char* name) { + std::vector h(count); + cudaMemcpy(h.data(), dev, count * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost); + for (int i = 0; i < count; ++i) { + float v = __bfloat162float(h[i]); + if (std::isnan(v) || std::isinf(v)) { + fprintf(stderr, "Rank %d: %s in %s[%d]\n", + g_process_id, std::isnan(v) ? "NaN" : "Inf", name, i); + return false; + } + } + return true; +} + +// ── Forward buffer set with RAII ────────────────────────────────────────────── + +struct EPBuffers { + // Forward + DevBuf topk_idx; + DevBuf topk_weights; + DevBuf tokens; + DevBuf token_counts; + DevBuf handle_mem; + DevBuf recv_tokens; + DevBuf recv_topk_weights; + DevBuf result; + // Backward + DevBuf grad_result; + DevBuf grad_expert; + DevBuf grad_tokens; + DevBuf g_recv_topk_weights; + DevBuf grad_topk_weights; + + uint64_t handle_id = 0; + size_t handle_mem_size = 0; + size_t recv_capacity = 0; + int top_k_ = 0; + + void alloc(int num_tokens, int top_k, int hidden_dim, int num_local_experts, + int ep_size, int max_tokens_per_rank, size_t alignment = 0) { + top_k_ = top_k; + recv_capacity = static_cast(ep_size) * max_tokens_per_rank * 2; + + topk_idx.alloc(num_tokens * top_k); + topk_weights.alloc(num_tokens * top_k); + tokens.alloc(num_tokens * hidden_dim); + token_counts.alloc(num_local_experts); + recv_tokens.alloc(recv_capacity * hidden_dim); + recv_topk_weights.alloc(recv_capacity); + result.alloc(num_tokens * hidden_dim); + + NVTEEpLayerConfig cfg{num_local_experts, top_k, alignment}; + handle_id = nvte_ep_register_layer(cfg, &handle_mem_size); + handle_mem.alloc(handle_mem_size); + + grad_result.alloc(num_tokens * hidden_dim); + grad_expert.alloc(recv_capacity * hidden_dim); + grad_tokens.alloc(num_tokens * hidden_dim); + g_recv_topk_weights.alloc(recv_capacity); + grad_topk_weights.alloc(num_tokens * 
top_k); + } +}; + +// Bundled NVTETensor views over an EPBuffers — one place to update the shape +// conventions when the C-API evolves. +struct EPTensors { + TensorHandle topk_idx, topk_weights, token_counts, handle_mem, tokens; + TensorHandle recv_tokens, recv_topk_weights, result; + TensorHandle grad_result, grad_expert, grad_tokens; + TensorHandle g_recv_topk_weights, grad_topk_weights; + + EPTensors(EPBuffers& b, int num_tokens, int top_k, int hidden_dim, + int num_local_experts) { + topk_idx = make_nvte_tensor(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); + topk_weights = make_nvte_tensor(b.topk_weights.get(), + {(size_t)num_tokens, (size_t)top_k}, kNVTEFloat32); + token_counts = make_nvte_tensor(b.token_counts.get(), + {(size_t)num_local_experts}, kNVTEInt32); + handle_mem = make_nvte_tensor(b.handle_mem.get(), + {b.handle_mem_size}, kNVTEByte); + tokens = make_nvte_tensor(b.tokens.get(), + {(size_t)num_tokens, (size_t)hidden_dim}, kNVTEBFloat16); + recv_tokens = make_nvte_tensor(b.recv_tokens.get(), + {b.recv_capacity, (size_t)hidden_dim}, kNVTEBFloat16); + recv_topk_weights = make_nvte_tensor(b.recv_topk_weights.get(), + {b.recv_capacity}, kNVTEFloat32); + result = make_nvte_tensor(b.result.get(), + {(size_t)num_tokens, (size_t)hidden_dim}, kNVTEBFloat16); + grad_result = make_nvte_tensor(b.grad_result.get(), + {(size_t)num_tokens, (size_t)hidden_dim}, kNVTEBFloat16); + grad_expert = make_nvte_tensor(b.grad_expert.get(), + {b.recv_capacity, (size_t)hidden_dim}, kNVTEBFloat16); + grad_tokens = make_nvte_tensor(b.grad_tokens.get(), + {(size_t)num_tokens, (size_t)hidden_dim}, kNVTEBFloat16); + g_recv_topk_weights = make_nvte_tensor(b.g_recv_topk_weights.get(), + {b.recv_capacity}, kNVTEFloat32); + grad_topk_weights = make_nvte_tensor(b.grad_topk_weights.get(), + {(size_t)num_tokens, (size_t)top_k}, kNVTEFloat32); + } +}; + +// ── Shared fixture base ─────────────────────────────────────────────────────── + +class EpOpTestBase : public 
::testing::Test { + protected: + int ep_size_, num_experts_, num_local_experts_, hidden_dim_; + int max_tokens_per_rank_, top_k_, num_tokens_; + + void SetUp() override { + if (g_sm_major < 9) + GTEST_SKIP() << "EP requires SM_90+ (device is SM_" << g_sm_major << "0)"; + ASSERT_GE(g_num_processes, 2); + ASSERT_TRUE(g_ep_initialized); + + ep_size_ = g_ep_size; + num_experts_ = g_num_experts; + num_local_experts_ = num_experts_ / ep_size_; + hidden_dim_ = g_hidden_dim; + max_tokens_per_rank_ = g_max_tokens_per_rank; + top_k_ = 2; + num_tokens_ = 32; + } + + void upload_inputs(EPBuffers& buf, int rank = -1) { + if (rank < 0) rank = g_process_id; + auto h_idx = routing_balanced(rank, num_tokens_, top_k_, + num_experts_, num_local_experts_); + std::vector h_w(num_tokens_ * top_k_, 1.0f / top_k_); + auto h_tok = generate_tokens(rank, num_tokens_, hidden_dim_); + + CHECK_CUDA(cudaMemcpy(buf.topk_idx.get(), h_idx.data(), + h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(buf.topk_weights.get(), h_w.data(), + h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(buf.tokens.get(), h_tok.data(), + h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); + } + + NVTEEpLayerConfig layer_config(size_t alignment = 0) const { + return NVTEEpLayerConfig{num_local_experts_, top_k_, alignment}; + } + + // ASSERT_CUDA_OK (fprintf+exit) so this non-void helper stays legal. + int read_total_recv(const EPBuffers& buf) const { + std::vector cnt(num_local_experts_); + ASSERT_CUDA_OK(cudaMemcpy(cnt.data(), buf.token_counts.get(), + num_local_experts_ * sizeof(int32_t), cudaMemcpyDeviceToHost)); + int total = 0; + for (int c : cnt) total += c; + return total; + } +}; + +// ============================================================================= +// EPDispatchTest: exact recv values and per-expert counts. 
+// ============================================================================= + +class EPDispatchTest : public EpOpTestBase {}; + +TEST_F(EPDispatchTest, PrepareAndDispatch) { + EPBuffers buf; + buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, + ep_size_, max_tokens_per_rank_); + upload_inputs(buf); + EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + + CHECK_CUDA(cudaMemset(buf.recv_tokens.get(), 0, buf.recv_tokens.bytes())); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + uint64_t handle_id = buf.handle_id; + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, + t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, + NVTECommWindow{}, t.recv_tokens.tensor, NVTECommWindow{}, + t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + // 1. Per-expert counts. + std::vector got_counts(num_local_experts_); + CHECK_CUDA(cudaMemcpy(got_counts.data(), buf.token_counts.get(), + num_local_experts_ * sizeof(int32_t), cudaMemcpyDeviceToHost)); + auto exp_counts = expected_token_counts(g_process_id, g_num_processes, num_tokens_, top_k_, + num_experts_, num_local_experts_); + int total_recv = 0; + for (int i = 0; i < num_local_experts_; ++i) { + EXPECT_EQ(got_counts[i], exp_counts[i]) << "local expert " << i; + total_recv += exp_counts[i]; + } + ASSERT_LE(total_recv, static_cast(buf.recv_capacity)) + << "total_recv exceeded recv_capacity — overflow would corrupt downstream memory"; + + // 2. Recv values: read only the filled prefix per local-expert zone, not the + // whole recv buffer — avoids false positives from legitimate-zero token values. 
+ std::vector h_recv(buf.recv_capacity * hidden_dim_); + CHECK_CUDA(cudaMemcpy(h_recv.data(), buf.recv_tokens.get(), + h_recv.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + + std::vector got_vals; + got_vals.reserve(total_recv); + size_t slot = 0; + for (int e = 0; e < num_local_experts_; ++e) { + for (int i = 0; i < got_counts[e]; ++i) { + got_vals.push_back(__bfloat162float(h_recv[slot * hidden_dim_])); + ++slot; + } + } + std::sort(got_vals.begin(), got_vals.end()); + + auto exp_vals = expected_recv_values_sorted(g_process_id, g_num_processes, num_tokens_, + top_k_, num_experts_, num_local_experts_); + + ASSERT_EQ(got_vals.size(), exp_vals.size()); + for (size_t i = 0; i < exp_vals.size(); ++i) + EXPECT_NEAR(got_vals[i], exp_vals[i], bf16_tol(exp_vals[i])) + << "recv value mismatch at sorted index " << i; + + // 3. recv_topk_weights: every filled slot must equal the per-token weight (1/top_k). + std::vector h_w(buf.recv_capacity); + CHECK_CUDA(cudaMemcpy(h_w.data(), buf.recv_topk_weights.get(), + h_w.size() * sizeof(float), cudaMemcpyDeviceToHost)); + const float exp_w = 1.0f / static_cast(top_k_); + for (int i = 0; i < total_recv; ++i) + EXPECT_NEAR(h_w[i], exp_w, 1e-6f) << "recv_topk_weights[" << i << "]"; + + if (g_process_id == 0) + printf(" PrepareAndDispatch: passed (recv=%d, values + weights exact)\n", total_recv); + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +// ============================================================================= +// EPCombineTest: round-trip identity expert → result == top_k * tokens. 
+// ============================================================================= + +class EPCombineTest : public EpOpTestBase {}; + +TEST_F(EPCombineTest, Combine) { + EPBuffers buf; + buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, + ep_size_, max_tokens_per_rank_); + upload_inputs(buf); + EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + uint64_t handle_id = buf.handle_id; + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, + t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, + NVTECommWindow{}, t.recv_tokens.tensor, NVTECommWindow{}, + t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, NVTECommWindow{}, + t.result.tensor, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + std::vector h_result(num_tokens_ * hidden_dim_); + CHECK_CUDA(cudaMemcpy(h_result.data(), buf.result.get(), + h_result.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + auto h_tok = generate_tokens(g_process_id, num_tokens_, hidden_dim_); + // Spot-check 3 hidden-dim positions per token to catch partial-row writes. 
+ const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; + for (int tok = 0; tok < num_tokens_; ++tok) { + float exp = __bfloat162float(h_tok[tok * hidden_dim_]) * static_cast(top_k_); + for (int p : probes) { + float got = __bfloat162float(h_result[tok * hidden_dim_ + p]); + EXPECT_NEAR(got, exp, bf16_tol(exp)) + << "token " << tok << " rank " << g_process_id << " hidden " << p; + } + } + + if (g_process_id == 0) + printf(" Combine: passed (result == top_k * tokens)\n"); + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +// ============================================================================= +// EPCombineBwdTest: filled slots in grad_expert == d_result (unweighted). +// ============================================================================= + +class EPCombineBwdTest : public EpOpTestBase {}; + +TEST_F(EPCombineBwdTest, CombineBwdCheck) { + EPBuffers buf; + buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, + ep_size_, max_tokens_per_rank_); + upload_inputs(buf); + EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + uint64_t handle_id = buf.handle_id; + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, + t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, + NVTECommWindow{}, t.recv_tokens.tensor, NVTECommWindow{}, + t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, NVTECommWindow{}, + t.result.tensor, stream)); + + std::vector h_grad_r(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); + CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad_r.data(), + h_grad_r.size() * sizeof(nv_bfloat16), + cudaMemcpyHostToDevice, stream)); + 
CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); + + ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_result.tensor, NVTECommWindow{}, + t.grad_expert.tensor, NVTECommWindow{}, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + int total_recv = read_total_recv(buf); + + std::vector cnt(num_local_experts_); + CHECK_CUDA(cudaMemcpy(cnt.data(), buf.token_counts.get(), + num_local_experts_ * sizeof(int32_t), cudaMemcpyDeviceToHost)); + std::vector h_ge(buf.recv_capacity * hidden_dim_); + CHECK_CUDA(cudaMemcpy(h_ge.data(), buf.grad_expert.get(), + h_ge.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + + // Walk filled slots by per-expert zone (no v != 0 heuristic). + const float kExpGrad = 0.1f; + size_t slot = 0; + int filled = 0; + for (int e = 0; e < num_local_experts_; ++e) { + for (int i = 0; i < cnt[e]; ++i) { + float v = __bfloat162float(h_ge[slot * hidden_dim_]); + EXPECT_NEAR(v, kExpGrad, bf16_tol(kExpGrad)) + << "grad_expert expert " << e << " slot " << i << " (linear " << slot << ")"; + ++filled; ++slot; + } + } + EXPECT_EQ(filled, total_recv); + + if (g_process_id == 0) + printf(" CombineBwdCheck: passed (filled=%d)\n", filled); + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +// ============================================================================= +// EPDispatchBwdTest: grad_tokens == top_k * d_result. 
+// ============================================================================= + +class EPDispatchBwdTest : public EpOpTestBase {}; + +TEST_F(EPDispatchBwdTest, DispatchBwdCheck) { + EPBuffers buf; + buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, + ep_size_, max_tokens_per_rank_); + upload_inputs(buf); + EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + uint64_t handle_id = buf.handle_id; + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, + t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, + NVTECommWindow{}, t.recv_tokens.tensor, NVTECommWindow{}, + t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, NVTECommWindow{}, + t.result.tensor, stream)); + + std::vector h_grad(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); + CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad.data(), + h_grad.size() * sizeof(nv_bfloat16), + cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); + CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); + CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); + + ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_result.tensor, NVTECommWindow{}, + t.grad_expert.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_expert.tensor, NVTECommWindow{}, + t.g_recv_topk_weights.tensor, NVTECommWindow{}, + t.grad_tokens.tensor, t.grad_topk_weights.tensor, 
stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + std::vector h_gt(num_tokens_ * hidden_dim_); + CHECK_CUDA(cudaMemcpy(h_gt.data(), buf.grad_tokens.get(), + h_gt.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + const float kExpGrad = static_cast(top_k_) * 0.1f; + for (int tok = 0; tok < num_tokens_; ++tok) + EXPECT_NEAR(__bfloat162float(h_gt[tok * hidden_dim_]), kExpGrad, bf16_tol(kExpGrad)) + << "grad_tokens token " << tok; + + if (g_process_id == 0) + printf(" DispatchBwdCheck: passed (grad_tokens == %.2f)\n", kExpGrad); + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +// ============================================================================= +// EPDispatchBwdGradWeightsTest: round-trip per-(t, k) weights. +// ============================================================================= + +class EPDispatchBwdGradWeightsTest : public EpOpTestBase {}; + +TEST_F(EPDispatchBwdGradWeightsTest, RoundTrip) { + EPBuffers buf; + buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, + ep_size_, max_tokens_per_rank_); + upload_inputs(buf); + EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + + // Distinct per-(rank, t, k) weights so each slot carries a unique value. 
+ std::vector h_w(num_tokens_ * top_k_); + for (int tok = 0; tok < num_tokens_; ++tok) + for (int k = 0; k < top_k_; ++k) + h_w[tok * top_k_ + k] = 0.1f + 0.01f * tok + 0.001f * k + + 0.0001f * (g_process_id + 1); + CHECK_CUDA(cudaMemcpy(buf.topk_weights.get(), h_w.data(), + h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + uint64_t handle_id = buf.handle_id; + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); + CHECK_CUDA(cudaMemsetAsync(buf.recv_topk_weights.get(), 0, + buf.recv_topk_weights.bytes(), stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, + t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, + NVTECommWindow{}, t.recv_tokens.tensor, NVTECommWindow{}, + t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); + + // Sentinel: NaN so any (t, k) the bwd kernel fails to write is immediately visible. + std::vector h_nan(num_tokens_ * top_k_, + std::numeric_limits::quiet_NaN()); + CHECK_CUDA(cudaMemcpyAsync(buf.grad_topk_weights.get(), h_nan.data(), + h_nan.size() * sizeof(float), + cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); + + // g_recv_topk_weights := recv_topk_weights (the round-trip input). 
+ auto g_recv_t = make_nvte_tensor(buf.recv_topk_weights.get(), + {buf.recv_capacity}, kNVTEFloat32); + ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_expert.tensor, + NVTECommWindow{}, g_recv_t.tensor, NVTECommWindow{}, + t.grad_tokens.tensor, t.grad_topk_weights.tensor, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + std::vector h_grad_w(num_tokens_ * top_k_); + CHECK_CUDA(cudaMemcpy(h_grad_w.data(), buf.grad_topk_weights.get(), + h_grad_w.size() * sizeof(float), cudaMemcpyDeviceToHost)); + + const float kTol = 1e-5f; + int errs = 0, k0_eq_k1 = 0; + for (int tok = 0; tok < num_tokens_; ++tok) { + for (int k = 0; k < top_k_; ++k) { + float got = h_grad_w[tok * top_k_ + k]; + float exp = h_w[tok * top_k_ + k]; + if (std::isnan(got) || std::fabs(got - exp) > kTol) { + if (errs < 8) + fprintf(stderr, "Rank %d: grad_topk_weights[%d, %d]: got %.6f, expected %.6f\n", + g_process_id, tok, k, got, exp); + ++errs; + } + } + if (top_k_ >= 2 && + std::fabs(h_grad_w[tok * top_k_ + 0] - h_grad_w[tok * top_k_ + 1]) < 1e-7f) + ++k0_eq_k1; + } + EXPECT_EQ(errs, 0); + EXPECT_EQ(k0_eq_k1, 0) << "per-token-average regression: grad[t, 0] == grad[t, 1]"; + + if (g_process_id == 0 && errs == 0 && k0_eq_k1 == 0) + printf(" RoundTrip: passed (%d (t, k) gradients)\n", num_tokens_ * top_k_); + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +// ============================================================================= +// Integrated FwdBwd: NaN/Inf check end-to-end. 
+// ============================================================================= + +class EPPipelineTest : public EpOpTestBase {}; + +TEST_F(EPPipelineTest, FullForwardBackward) { + EPBuffers buf; + buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, + ep_size_, max_tokens_per_rank_); + upload_inputs(buf); + EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + uint64_t handle_id = buf.handle_id; + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, + t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, + NVTECommWindow{}, t.recv_tokens.tensor, NVTECommWindow{}, + t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, NVTECommWindow{}, + t.result.tensor, stream)); + + std::vector h_grad(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); + CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad.data(), + h_grad.size() * sizeof(nv_bfloat16), + cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); + CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); + CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); + + ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_result.tensor, NVTECommWindow{}, + t.grad_expert.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_expert.tensor, NVTECommWindow{}, + t.g_recv_topk_weights.tensor, NVTECommWindow{}, + t.grad_tokens.tensor, t.grad_topk_weights.tensor, 
stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + ASSERT_TRUE(check_no_nan_inf(buf.result.get(), num_tokens_ * hidden_dim_, "result")); + ASSERT_TRUE(check_no_nan_inf(buf.grad_tokens.get(), num_tokens_ * hidden_dim_, "grad_tokens")); + + if (g_process_id == 0) printf(" FullForwardBackward: passed\n"); + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +// ============================================================================= +// EPZeroCopyTest: dispatch/combine with NCCL symmetric-memory windows attached +// to payload tensors (zero-copy fast path via ncclEpTensorCreateFromWindow). +// Symm-mem requirements per spec: input&output of Dispatch, input of Combine, +// input&output of Combine bwd, input of Dispatch bwd. +// ============================================================================= + +namespace { + +// Caller-owned ncclMemAlloc'd buffer with a registered symmetric window. +// Frees in destructor (deregister + ncclMemFree). Non-copyable, move-only. +struct SymmBuf { + void* ptr = nullptr; + size_t bytes = 0; + ncclWindow_t win = nullptr; + + SymmBuf() = default; + SymmBuf(const SymmBuf&) = delete; + SymmBuf& operator=(const SymmBuf&) = delete; + SymmBuf(SymmBuf&& o) noexcept : ptr(o.ptr), bytes(o.bytes), win(o.win) { + o.ptr = nullptr; o.win = nullptr; o.bytes = 0; + } + ~SymmBuf() { + if (win) ncclCommWindowDeregister(g_ep_comm, win); + if (ptr) ncclMemFree(ptr); + } + + void alloc(size_t n_bytes) { + bytes = n_bytes; + ASSERT_NCCL_OK(ncclMemAlloc(&ptr, bytes)); + CHECK_CUDA(cudaMemset(ptr, 0, bytes)); + ASSERT_NCCL_OK(ncclCommWindowRegister(g_ep_comm, ptr, bytes, &win, + NCCL_WIN_COLL_SYMMETRIC)); + } +}; + +// Build an NVTECommWindow descriptor pointing at a SymmBuf's window (offset 0). +static inline NVTECommWindow symm_window(const SymmBuf& b) { + return NVTECommWindow{b.win, /*offset=*/0}; +} + +} // namespace + +class EPZeroCopyTest : public EpOpTestBase {}; + +// Identity round-trip with symm-mem on dispatch i/o + combine input. 
Bit-exact +// vs HBM reference (same routing, same input). +TEST_F(EPZeroCopyTest, IdentityAllSymm) { + // HBM reference run. + EPBuffers ref_buf; + ref_buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, + ep_size_, max_tokens_per_rank_); + upload_inputs(ref_buf); + EPTensors ref_t(ref_buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + uint64_t ref_hid = ref_buf.handle_id; + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{ref_hid, ref_t.handle_mem.tensor}, ref_t.topk_idx.tensor, ref_t.token_counts.tensor, /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{ref_hid, ref_t.handle_mem.tensor}, ref_t.topk_idx.tensor, + ref_t.tokens.tensor, NVTECommWindow{}, ref_t.topk_weights.tensor, + NVTECommWindow{}, ref_t.recv_tokens.tensor, NVTECommWindow{}, + ref_t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{ref_hid, ref_t.handle_mem.tensor}, ref_t.recv_tokens.tensor, NVTECommWindow{}, + ref_t.result.tensor, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + std::vector ref_recv(ref_buf.recv_capacity * hidden_dim_); + std::vector ref_result(num_tokens_ * hidden_dim_); + CHECK_CUDA(cudaMemcpy(ref_recv.data(), ref_buf.recv_tokens.get(), + ref_recv.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + CHECK_CUDA(cudaMemcpy(ref_result.data(), ref_buf.result.get(), + ref_result.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + + // Symm-mem run: tokens, recv_tokens, combine_input (== recv_tokens) all symm. + EPBuffers sym_buf; // alloc all buffers except the symm ones. 
+ sym_buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, + ep_size_, max_tokens_per_rank_); + upload_inputs(sym_buf); + + SymmBuf sym_tokens, sym_recv; + sym_tokens.alloc(num_tokens_ * hidden_dim_ * sizeof(nv_bfloat16)); + sym_recv .alloc(sym_buf.recv_capacity * hidden_dim_ * sizeof(nv_bfloat16)); + + // Stage same tokens into the symm-mem input. + auto h_tok = generate_tokens(g_process_id, num_tokens_, hidden_dim_); + CHECK_CUDA(cudaMemcpy(sym_tokens.ptr, h_tok.data(), + h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); + + EPTensors sym_t(sym_buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + // Replace the tokens/recv_tokens views with ones pointing at the symm buffers. + sym_t.tokens = make_nvte_tensor(sym_tokens.ptr, + {(size_t)num_tokens_, (size_t)hidden_dim_}, kNVTEBFloat16); + sym_t.recv_tokens = make_nvte_tensor(sym_recv.ptr, + {sym_buf.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); + + uint64_t sym_hid = sym_buf.handle_id; + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{sym_hid, sym_t.handle_mem.tensor}, sym_t.topk_idx.tensor, sym_t.token_counts.tensor, /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{sym_hid, sym_t.handle_mem.tensor}, sym_t.topk_idx.tensor, + sym_t.tokens.tensor, symm_window(sym_tokens), + sym_t.topk_weights.tensor, NVTECommWindow{}, + sym_t.recv_tokens.tensor, symm_window(sym_recv), + sym_t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{sym_hid, sym_t.handle_mem.tensor}, sym_t.recv_tokens.tensor, + symm_window(sym_recv), sym_t.result.tensor, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + std::vector sym_recv_host(sym_buf.recv_capacity * hidden_dim_); + std::vector sym_result(num_tokens_ * hidden_dim_); + CHECK_CUDA(cudaMemcpy(sym_recv_host.data(), sym_recv.ptr, + sym_recv_host.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + CHECK_CUDA(cudaMemcpy(sym_result.data(), sym_buf.result.get(), + 
sym_result.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + + // Compare per filled recv slot (HBM ref vs symm) and full result. + int total_recv = read_total_recv(sym_buf); + for (int i = 0; i < total_recv * hidden_dim_; ++i) + ASSERT_EQ(__bfloat162float(sym_recv_host[i]), __bfloat162float(ref_recv[i])) + << "recv mismatch at " << i; + for (size_t i = 0; i < sym_result.size(); ++i) + ASSERT_EQ(__bfloat162float(sym_result[i]), __bfloat162float(ref_result[i])) + << "result mismatch at " << i; + + if (g_process_id == 0) + printf(" IdentityAllSymm: passed (recv_slots=%d, bit-exact vs HBM)\n", total_recv); + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +// Same buffers, 2 iterations — catches window-lifecycle regressions where the +// symm-mem registration goes stale between calls. +TEST_F(EPZeroCopyTest, IdentityAllSymmRepeated) { + EPBuffers buf; + buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, + ep_size_, max_tokens_per_rank_); + upload_inputs(buf); + + SymmBuf sym_tokens, sym_recv; + sym_tokens.alloc(num_tokens_ * hidden_dim_ * sizeof(nv_bfloat16)); + sym_recv .alloc(buf.recv_capacity * hidden_dim_ * sizeof(nv_bfloat16)); + auto h_tok = generate_tokens(g_process_id, num_tokens_, hidden_dim_); + CHECK_CUDA(cudaMemcpy(sym_tokens.ptr, h_tok.data(), + h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); + + EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + t.tokens = make_nvte_tensor(sym_tokens.ptr, + {(size_t)num_tokens_, (size_t)hidden_dim_}, kNVTEBFloat16); + t.recv_tokens = make_nvte_tensor(sym_recv.ptr, + {buf.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + uint64_t handle_id = buf.handle_id; + for (int iter = 0; iter < 2; ++iter) { + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); + 
ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, + t.tokens.tensor, symm_window(sym_tokens), + t.topk_weights.tensor, NVTECommWindow{}, + t.recv_tokens.tensor, symm_window(sym_recv), + t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, + symm_window(sym_recv), t.result.tensor, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + std::vector h_res(num_tokens_ * hidden_dim_); + CHECK_CUDA(cudaMemcpy(h_res.data(), buf.result.get(), + h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + for (int tok = 0; tok < num_tokens_; ++tok) { + float exp = __bfloat162float(h_tok[tok * hidden_dim_]) * static_cast(top_k_); + float got = __bfloat162float(h_res[tok * hidden_dim_]); + ASSERT_NEAR(got, exp, bf16_tol(exp)) << "iter " << iter << " tok " << tok; + } + } + + if (g_process_id == 0) + printf(" IdentityAllSymmRepeated: passed (2 iters)\n"); + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +// Full forward+backward with symm-mem on every spec-mandated buffer: +// dispatch i/o, combine input, combine_bwd i/o, dispatch_bwd input. +// TODO: flaky on rank 0 (grad_tokens partial-zero) when run after the prior +// EPZeroCopyTest cases in the same binary; passes in isolation. Re-enable once +// the root cause (likely NCCL EP NVLS write→read coherence on grad_expert) is +// understood. Tracked separately. +TEST_F(EPZeroCopyTest, DISABLED_FullPipelineSymm) { + EPBuffers buf; + buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, + ep_size_, max_tokens_per_rank_); + upload_inputs(buf); + + // Symm-mem: tokens (dispatch input), recv_tokens (dispatch output AND + // combine input), grad_result (combine_bwd input), grad_expert + // (combine_bwd output AND dispatch_bwd input). 
+ SymmBuf sym_tokens, sym_recv, sym_grad_result, sym_grad_expert; + sym_tokens .alloc(num_tokens_ * hidden_dim_ * sizeof(nv_bfloat16)); + sym_recv .alloc(buf.recv_capacity * hidden_dim_ * sizeof(nv_bfloat16)); + sym_grad_result.alloc(num_tokens_ * hidden_dim_ * sizeof(nv_bfloat16)); + sym_grad_expert.alloc(buf.recv_capacity * hidden_dim_ * sizeof(nv_bfloat16)); + + auto h_tok = generate_tokens(g_process_id, num_tokens_, hidden_dim_); + CHECK_CUDA(cudaMemcpy(sym_tokens.ptr, h_tok.data(), + h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); + + EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + t.tokens = make_nvte_tensor(sym_tokens.ptr, + {(size_t)num_tokens_, (size_t)hidden_dim_}, kNVTEBFloat16); + t.recv_tokens = make_nvte_tensor(sym_recv.ptr, + {buf.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); + t.grad_result = make_nvte_tensor(sym_grad_result.ptr, + {(size_t)num_tokens_, (size_t)hidden_dim_}, kNVTEBFloat16); + t.grad_expert = make_nvte_tensor(sym_grad_expert.ptr, + {buf.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + uint64_t handle_id = buf.handle_id; + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, + t.tokens.tensor, symm_window(sym_tokens), + t.topk_weights.tensor, NVTECommWindow{}, + t.recv_tokens.tensor, symm_window(sym_recv), + t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, + symm_window(sym_recv), t.result.tensor, stream)); + + std::vector h_grad(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); + CHECK_CUDA(cudaMemcpyAsync(sym_grad_result.ptr, h_grad.data(), + h_grad.size() * sizeof(nv_bfloat16), + cudaMemcpyHostToDevice, stream)); 
+ CHECK_CUDA(cudaMemsetAsync(sym_grad_expert.ptr, 0, sym_grad_expert.bytes, stream)); + CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); + CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); + + ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_result.tensor, + symm_window(sym_grad_result), t.grad_expert.tensor, + symm_window(sym_grad_expert), stream)); + ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_expert.tensor, + symm_window(sym_grad_expert), + t.g_recv_topk_weights.tensor, NVTECommWindow{}, + t.grad_tokens.tensor, t.grad_topk_weights.tensor, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + ASSERT_TRUE(check_no_nan_inf(buf.result.get(), num_tokens_ * hidden_dim_, "result")); + ASSERT_TRUE(check_no_nan_inf(buf.grad_tokens.get(), num_tokens_ * hidden_dim_, "grad_tokens")); + + std::vector h_gt(num_tokens_ * hidden_dim_); + CHECK_CUDA(cudaMemcpy(h_gt.data(), buf.grad_tokens.get(), + h_gt.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + const float kExpGrad = static_cast(top_k_) * 0.1f; + for (int tok = 0; tok < num_tokens_; ++tok) + EXPECT_NEAR(__bfloat162float(h_gt[tok * hidden_dim_]), kExpGrad, bf16_tol(kExpGrad)) + << "grad_tokens token " << tok; + + if (g_process_id == 0) printf(" FullPipelineSymm: passed\n"); + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +// ── main ────────────────────────────────────────────────────────────────────── + +int main(int argc, char* argv[]) { + if (!ep_bootstrap(argc, argv)) return 0; + int ret = RUN_ALL_TESTS(); + ep_teardown(); + return ret; +} diff --git a/transformer_engine/common/CMakeLists.txt b/transformer_engine/common/CMakeLists.txt index 030023d949..c5f8dfb1ab 100644 --- a/transformer_engine/common/CMakeLists.txt +++ b/transformer_engine/common/CMakeLists.txt @@ -379,6 +379,96 @@ if (NVTE_WITH_CUSOLVERMP) 
message(STATUS "Using cuSolverMp at: ${CUSOLVERMP_DIR}") endif() +# ── NCCL EP (on by default, HT mode only) ───────────────────────────────── +# Set -DNVTE_WITH_NCCL_EP=OFF (or NVTE_BUILD_WITH_NCCL_EP=0 in setup.py) to +# skip NCCL EP entirely — useful on older images whose system NCCL is below +# the 2.30.4 EP minimum. +option(NVTE_WITH_NCCL_EP "Build NCCL EP into libtransformer_engine.so" ON) +if(NVTE_WITH_NCCL_EP) +# SM>=90 and NCCL>=2.30.4 are gated at runtime in EPBackend::initialize. +# ── NCCL EP headers ──────────────────────────────────────────────────────── +# Headers + libs are produced by the in-tree 3rdparty/nccl submodule build +# (auto-built by setup.py via build_nccl_ep_submodule). +set(NCCL_EP_SUBMODULE_ROOT + "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl") +set(NCCL_EP_INCLUDE_DIR "${NCCL_EP_SUBMODULE_ROOT}/contrib/nccl_ep/include") +if(NOT EXISTS "${NCCL_EP_INCLUDE_DIR}/nccl_ep.h") + message(FATAL_ERROR + "NCCL EP header not found at ${NCCL_EP_INCLUDE_DIR}/nccl_ep.h. " + "Run `git submodule update --init --recursive` to checkout 3rdparty/nccl.") +endif() +message(STATUS "NCCL EP headers: ${NCCL_EP_INCLUDE_DIR}") + +# ── libnccl_ep.so ────────────────────────────────────────────────────────── +set(NCCL_EP_LIB_DIR "${NCCL_EP_SUBMODULE_ROOT}/build/lib") +find_library(NCCL_EP_LIB + NAMES nccl_ep libnccl_ep + HINTS ${NCCL_EP_LIB_DIR} + NO_DEFAULT_PATH + REQUIRED) + +# ── NCCL + GIN headers ───────────────────────────────────────────────────── +# libnccl.so and all GIN headers (ncclGin.h, ncclWindow_t, ncclDevComm_t) +# ship with the base CUDA Toolkit OR the 3rdparty/nccl submodule build +# (preferred when present; auto-built by setup.py via build_nccl_ep_submodule). 
+if(NOT NCCL_LIB) + find_library(NCCL_LIB + NAMES nccl libnccl + HINTS ${NCCL_EP_LIB_DIR} ${CUDAToolkit_LIBRARY_DIR} + PATH_SUFFIXES lib lib64 + REQUIRED) +endif() + +set(NCCL_SUBMODULE_INCLUDE + "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl/build/include") +if(EXISTS "${NCCL_SUBMODULE_INCLUDE}/nccl.h") + set(NCCL_INCLUDE_DIRS_FOR_TE ${NCCL_SUBMODULE_INCLUDE}) +else() + set(NCCL_INCLUDE_DIRS_FOR_TE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +endif() + +# Diagnostic: log detected NCCL header version (minimum enforced at runtime). +find_file(_nvte_nccl_header_path nccl.h + PATHS ${NCCL_INCLUDE_DIRS_FOR_TE} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + NO_DEFAULT_PATH) +if(_nvte_nccl_header_path) + file(READ "${_nvte_nccl_header_path}" _nvte_nccl_h) + string(REGEX MATCH "#define[ \t]+NCCL_MAJOR[ \t]+([0-9]+)" _ "${_nvte_nccl_h}") + set(_nvte_nccl_major "${CMAKE_MATCH_1}") + string(REGEX MATCH "#define[ \t]+NCCL_MINOR[ \t]+([0-9]+)" _ "${_nvte_nccl_h}") + set(_nvte_nccl_minor "${CMAKE_MATCH_1}") + string(REGEX MATCH "#define[ \t]+NCCL_PATCH[ \t]+([0-9]+)" _ "${_nvte_nccl_h}") + set(_nvte_nccl_patch "${CMAKE_MATCH_1}") + if(_nvte_nccl_major AND _nvte_nccl_minor AND _nvte_nccl_patch) + message(STATUS "NCCL header: ${_nvte_nccl_header_path} (version ${_nvte_nccl_major}.${_nvte_nccl_minor}.${_nvte_nccl_patch})") + endif() +endif() + +target_include_directories(transformer_engine PRIVATE + ${NCCL_EP_INCLUDE_DIR} + ${NCCL_INCLUDE_DIRS_FOR_TE}) # covers nccl.h + nccl_device/ + +target_link_libraries(transformer_engine PUBLIC + ${NCCL_EP_LIB} + ${NCCL_LIB}) + +# Embed rpath so the installed wheel finds libnccl_ep.so at runtime. +# libnccl.so is already on the system via the Toolkit — no rpath needed for it. 
+set_target_properties(transformer_engine PROPERTIES + INSTALL_RPATH "$ORIGIN;${NCCL_EP_LIB_DIR}") + +target_sources(transformer_engine PRIVATE + ep/ep_backend.cpp + ep/ep_api.cpp) + +message(STATUS "NCCL EP enabled: ${NCCL_EP_LIB}") +message(STATUS "NCCL EP include: ${NCCL_EP_INCLUDE_DIR}") +else() + # NCCL EP off: export throwing nvte_ep_* stubs so framework bindings link. + target_sources(transformer_engine PRIVATE ep/ep_api_stub.cpp) + message(STATUS "NCCL EP disabled (NVTE_WITH_NCCL_EP=OFF) — using nvte_ep_* stubs") +endif() + # Number of philox4x32 rounds for stochastic rounding (build-time constant). set(NVTE_BUILD_NUM_PHILOX_ROUNDS_STR $ENV{NVTE_BUILD_NUM_PHILOX_ROUNDS}) if (NOT NVTE_BUILD_NUM_PHILOX_ROUNDS_STR) diff --git a/transformer_engine/common/ep/ep_api.cpp b/transformer_engine/common/ep/ep_api.cpp new file mode 100644 index 0000000000..89d8b38607 --- /dev/null +++ b/transformer_engine/common/ep/ep_api.cpp @@ -0,0 +1,76 @@ +/************************************************************************* + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +/*! \file ep_api.cpp + * \brief nvte_ep_* C API: thin delegations to the EPBackend singleton. 
+ */ + +#include +#include + +#include "../common.h" +#include "../util/logging.h" +#include "ep_backend.h" + +using transformer_engine::ep::EPBackend; + +void nvte_ep_initialize(void* ep_comm, NVTEEpGroupConfig group_config) { + NVTE_CHECK(ep_comm != nullptr, "ep_comm must not be null"); + EPBackend::initialize(static_cast(ep_comm), group_config); +} + +void nvte_ep_shutdown(void) { EPBackend::shutdown(); } + +uint64_t nvte_ep_register_layer(NVTEEpLayerConfig layer_config, size_t* handle_mem_size) { + NVTE_CHECK(handle_mem_size != nullptr, "handle_mem_size must not be null"); + return EPBackend::get().register_layer(layer_config, handle_mem_size); +} + +void nvte_ep_prepare(NVTEEpHandle handle, NVTETensor topk_idx, NVTETensor token_counts, + size_t dispatch_output_per_expert_alignment, cudaStream_t stream) { + void* mem_ptr = nvte_tensor_data(handle.mem); + NVTE_CHECK(mem_ptr != nullptr, "handle_mem tensor data must not be null"); + EPBackend::get().prepare(handle.id, topk_idx, token_counts, mem_ptr, + dispatch_output_per_expert_alignment, stream); +} + +void nvte_ep_dispatch(NVTEEpHandle handle, NVTETensor topk_idx, NVTETensor tokens, + NVTECommWindow tokens_win, NVTETensor topk_weights, + NVTECommWindow topk_weights_win, NVTETensor recv_tokens, + NVTECommWindow recv_tokens_win, NVTETensor recv_topk_weights, + NVTECommWindow recv_topk_weights_win, cudaStream_t stream) { + void* mem_ptr = nvte_tensor_data(handle.mem); + NVTE_CHECK(mem_ptr != nullptr, "handle_mem tensor data must not be null"); + EPBackend::get().dispatch(handle.id, mem_ptr, topk_idx, tokens, tokens_win, topk_weights, + topk_weights_win, recv_tokens, recv_tokens_win, recv_topk_weights, + recv_topk_weights_win, stream); +} + +void nvte_ep_combine(NVTEEpHandle handle, NVTETensor expert_out, NVTECommWindow expert_out_win, + NVTETensor result, cudaStream_t stream) { + void* mem_ptr = nvte_tensor_data(handle.mem); + NVTE_CHECK(mem_ptr != nullptr, "handle_mem tensor data must not be null"); + 
EPBackend::get().combine(handle.id, mem_ptr, expert_out, expert_out_win, result, stream); +} + +void nvte_ep_dispatch_bwd(NVTEEpHandle handle, NVTETensor grad, NVTECommWindow grad_win, + NVTETensor g_recv_topk_weights, NVTECommWindow g_recv_topk_weights_win, + NVTETensor grad_tokens, NVTETensor grad_topk_weights, + cudaStream_t stream) { + void* mem_ptr = nvte_tensor_data(handle.mem); + NVTE_CHECK(mem_ptr != nullptr, "handle_mem tensor data must not be null"); + EPBackend::get().dispatch_bwd(handle.id, mem_ptr, grad, grad_win, g_recv_topk_weights, + g_recv_topk_weights_win, grad_tokens, grad_topk_weights, stream); +} + +void nvte_ep_combine_bwd(NVTEEpHandle handle, NVTETensor grad, NVTECommWindow grad_win, + NVTETensor grad_expert_out, NVTECommWindow grad_expert_out_win, + cudaStream_t stream) { + void* mem_ptr = nvte_tensor_data(handle.mem); + NVTE_CHECK(mem_ptr != nullptr, "handle_mem tensor data must not be null"); + EPBackend::get().combine_bwd(handle.id, mem_ptr, grad, grad_win, grad_expert_out, + grad_expert_out_win, stream); +} diff --git a/transformer_engine/common/ep/ep_api_stub.cpp b/transformer_engine/common/ep/ep_api_stub.cpp new file mode 100644 index 0000000000..fe4127d87d --- /dev/null +++ b/transformer_engine/common/ep/ep_api_stub.cpp @@ -0,0 +1,61 @@ +/************************************************************************* + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +/*! \file ep_api_stub.cpp + * \brief Throwing nvte_ep_* stubs compiled when NVTE_WITH_NCCL_EP=OFF. + */ + +#include + +#include "../util/logging.h" + +namespace { +[[noreturn]] void ep_not_built() { + NVTE_ERROR( + "NCCL EP is not built into this TransformerEngine. Rebuild TE with " + "NVTE_BUILD_WITH_NCCL_EP=1 and CUDA arch >= 90 (e.g. 
NVTE_CUDA_ARCHS=\"90\")."); +} +} // namespace + +void nvte_ep_initialize(void* /*ep_comm*/, NVTEEpGroupConfig /*group_config*/) { ep_not_built(); } + +void nvte_ep_shutdown(void) {} + +uint64_t nvte_ep_register_layer(NVTEEpLayerConfig /*layer_config*/, size_t* /*handle_mem_size*/) { + ep_not_built(); +} + +void nvte_ep_prepare(NVTEEpHandle /*handle*/, NVTETensor /*topk_idx*/, NVTETensor /*token_counts*/, + size_t /*dispatch_output_per_expert_alignment*/, cudaStream_t /*stream*/) { + ep_not_built(); +} + +void nvte_ep_dispatch(NVTEEpHandle /*handle*/, NVTETensor /*topk_idx*/, NVTETensor /*tokens*/, + NVTECommWindow /*tokens_win*/, NVTETensor /*topk_weights*/, + NVTECommWindow /*topk_weights_win*/, NVTETensor /*recv_tokens*/, + NVTECommWindow /*recv_tokens_win*/, NVTETensor /*recv_topk_weights*/, + NVTECommWindow /*recv_topk_weights_win*/, cudaStream_t /*stream*/) { + ep_not_built(); +} + +void nvte_ep_combine(NVTEEpHandle /*handle*/, NVTETensor /*expert_out*/, + NVTECommWindow /*expert_out_win*/, NVTETensor /*result*/, + cudaStream_t /*stream*/) { + ep_not_built(); +} + +void nvte_ep_dispatch_bwd(NVTEEpHandle /*handle*/, NVTETensor /*grad*/, NVTECommWindow /*grad_win*/, + NVTETensor /*g_recv_topk_weights*/, + NVTECommWindow /*g_recv_topk_weights_win*/, NVTETensor /*grad_tokens*/, + NVTETensor /*grad_topk_weights*/, cudaStream_t /*stream*/) { + ep_not_built(); +} + +void nvte_ep_combine_bwd(NVTEEpHandle /*handle*/, NVTETensor /*grad*/, NVTECommWindow /*grad_win*/, + NVTETensor /*grad_expert_out*/, NVTECommWindow /*grad_expert_out_win*/, + cudaStream_t /*stream*/) { + ep_not_built(); +} diff --git a/transformer_engine/common/ep/ep_backend.cpp b/transformer_engine/common/ep/ep_backend.cpp new file mode 100644 index 0000000000..ae0f3ab888 --- /dev/null +++ b/transformer_engine/common/ep/ep_backend.cpp @@ -0,0 +1,514 @@ +/************************************************************************* + * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. + * + * See LICENSE for license information. + ************************************************************************/ + +/*! \file ep_backend.cpp + * \brief EPBackend implementation. See ep_backend.h for the op flow. + */ + +#include "ep_backend.h" + +#include +#include +#include +#include +#include +#include + +#include "../common.h" +#include "../util/cuda_runtime.h" +#include "../util/logging.h" + +namespace transformer_engine { +namespace ep { + +namespace { + +// Build a by-value ncclEpTensor_t descriptor. `sizes` is caller-owned and must +// outlive any NCCL EP call that consumes the descriptor. +inline ncclEpTensor_t make_tensor(void* data, unsigned int ndim, ncclDataType_t datatype, + size_t* sizes) { + ncclEpTensor_t t = NCCL_EP_TENSOR_INIT; + t.ndim = ndim; + t.datatype = datatype; + t.data = data; + t.sizes = sizes; + return t; +} + +// Payload descriptor: prefer the symmem window when set, else fall back to the +// NVTETensor's raw device pointer. +inline ncclEpTensor_t make_payload_tensor(const NVTETensor t, const NVTECommWindow& win, + unsigned int ndim, ncclDataType_t datatype, + size_t* sizes) { + ncclEpTensor_t desc = NCCL_EP_TENSOR_INIT; + desc.ndim = ndim; + desc.datatype = datatype; + desc.sizes = sizes; + if (win.window != nullptr) { + desc.win_hdl = win.window; + desc.win_offset = win.offset; + } else { + desc.data = nvte_tensor_data(t); + NVTE_CHECK(desc.data != nullptr, "payload tensor data must not be null"); + } + return desc; +} + +// RAII guard for ncclEpHandle_t — destroys on scope exit, leak-free on throw. 
+class ScopedEpHandle {
+ public:
+  ScopedEpHandle() = default;
+  explicit ScopedEpHandle(ncclEpHandle_t h) : h_(h) {}
+  ~ScopedEpHandle() {
+    if (h_ != nullptr) ncclEpHandleDestroy(h_);
+  }
+  // Move-only: exactly one guard may destroy a given handle.
+  ScopedEpHandle(const ScopedEpHandle&) = delete;
+  ScopedEpHandle& operator=(const ScopedEpHandle&) = delete;
+  ScopedEpHandle(ScopedEpHandle&& other) noexcept : h_(other.h_) { other.h_ = nullptr; }
+  ScopedEpHandle& operator=(ScopedEpHandle&& other) noexcept {
+    if (this != &other) {
+      // Release whatever this guard currently holds before adopting other's handle.
+      if (h_ != nullptr) ncclEpHandleDestroy(h_);
+      h_ = other.h_;
+      other.h_ = nullptr;
+    }
+    return *this;
+  }
+  // Implicit conversion so a guard can be passed directly where a raw handle is expected.
+  operator ncclEpHandle_t() const { return h_; }
+  ncclEpHandle_t get() const { return h_; }
+
+ private:
+  ncclEpHandle_t h_ = nullptr;
+};
+
+}  // namespace
+
+// ---------------------------------------------------------------------------
+// Singleton + bootstrap
+// ---------------------------------------------------------------------------
+
+// Process-wide instance, constructed on first use (function-local static).
+EPBackend& EPBackend::instance() {
+  static EPBackend inst;
+  return inst;
+}
+
+// Checked accessor: returns the same object as instance(), but fails through
+// NVTE_CHECK while the backend is not initialized.
+EPBackend& EPBackend::get() {
+  EPBackend& inst = instance();
+  NVTE_CHECK(inst.initialized_, "EPBackend not initialized. Call nvte_ep_initialize() first.");
+  return inst;
+}
+
+// Validate the group configuration and the current CUDA device. Every failed
+// condition is reported through NVTE_CHECK / NVTE_CHECK_CUDA.
+void EPBackend::validate_config(const NVTEEpGroupConfig& config) {
+  NVTE_CHECK(config.ep_size > 0, "ep_size must be positive, got ", config.ep_size);
+  NVTE_CHECK(config.num_experts > 0, "num_experts must be positive, got ", config.num_experts);
+  NVTE_CHECK(config.max_tokens_per_rank > 0, "max_tokens_per_rank must be positive, got ",
+             config.max_tokens_per_rank);
+  NVTE_CHECK(config.max_recv_tokens_per_rank > 0, "max_recv_tokens_per_rank must be positive, got ",
+             config.max_recv_tokens_per_rank);
+  NVTE_CHECK(config.hidden_dim > 0, "hidden_dim must be positive, got ", config.hidden_dim);
+  NVTE_CHECK(config.hidden_dim * sizeof(nv_bfloat16) >= 16,
+             "hidden_dim * 2 must be >= 16 (NCCL EP 16B row alignment); got hidden_dim=",
+             config.hidden_dim);
+  NVTE_CHECK(config.num_experts % config.ep_size == 0, "num_experts (", config.num_experts,
+             ") must be divisible by ep_size (", config.ep_size, ")");
+  NVTE_CHECK(config.max_num_sms >= 0, "max_num_sms must be >= 0 (0 = auto), got ",
+             config.max_num_sms);
+
+  // Zero-initialized so no indeterminate value is ever handed to the CUDA runtime.
+  int device = 0;
+  int major = 0;
+  NVTE_CHECK_CUDA(cudaGetDevice(&device));
+  NVTE_CHECK_CUDA(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device));
+  NVTE_CHECK(major >= 9,
+             "NCCL EP requires SM_90+ (Hopper or later), "
+             "but current device has compute capability ",
+             major, ".x");
+
+  // NCCL EP needs CUDA multicast (NVLS); init hangs without it.
+  NVTE_CHECK(cuda::supports_multicast(device),
+             "NCCL EP requires CUDA multicast (NVLS) support on device ", device,
+             " but CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED reports 0.");
+}
+
+// Bootstrap the singleton from a borrowed communicator. mutex_ is held for the
+// whole call, so concurrent initialize()/shutdown() calls are serialized.
+// Order of checks: not already initialized -> non-null comm -> NCCL runtime
+// version -> validate_config() -> comm size equals ep_size -> init().
+void EPBackend::initialize(ncclComm_t ep_comm, NVTEEpGroupConfig config) {
+  EPBackend& inst = instance();
+  std::lock_guard<std::mutex> lock(inst.mutex_);
+  NVTE_CHECK(!inst.initialized_, "EP already initialized. Call initialize only once per process.");
+  NVTE_CHECK(ep_comm != nullptr, "ep_comm must not be null");
+
+  // Runtime gate: NCCL >= 2.30.4 (matches the submodule pin).
+  constexpr int kMinNcclVersion = 23004;
+  int nccl_version = 0;
+  NVTE_CHECK_NCCL(ncclGetVersion(&nccl_version));
+  // The version code is decoded as major * 10000 + minor * 100 + patch for the message.
+  NVTE_CHECK(nccl_version >= kMinNcclVersion, "NCCL EP requires NCCL >= 2.30.4, found ",
+             nccl_version / 10000, ".", (nccl_version / 100) % 100, ".", nccl_version % 100,
+             " at runtime.");
+
+  validate_config(config);
+
+  int comm_size = 0;
+  NVTE_CHECK_NCCL(ncclCommCount(ep_comm, &comm_size));
+  NVTE_CHECK(comm_size == config.ep_size, "ep_comm size (", comm_size, ") must equal ep_size (",
+             config.ep_size, "). Pass the EP sub-communicator, not the world comm.");
+
+  inst.init(ep_comm, config);
+}
+
+// Tear the backend down under mutex_. A no-op when not initialized, so calling
+// it repeatedly is harmless. The communicator is only forgotten, never destroyed.
+void EPBackend::shutdown() {
+  EPBackend& inst = instance();
+  std::lock_guard<std::mutex> lock(inst.mutex_);
+  if (!inst.initialized_) return;
+  inst.handles_.clear();
+  // ncclEpGroupDestroy reads from ep_comm_; destroy group while comm is still alive.
+  if (inst.ep_group_ != nullptr) {
+    ncclEpGroupDestroy(inst.ep_group_);
+    inst.ep_group_ = nullptr;
+  }
+  inst.ep_comm_ = nullptr;  // borrowed — caller destroys
+  inst.initialized_ = false;
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+// Map an NVTEDType onto the matching ncclDataType_t. Any dtype without a case
+// below is reported through NVTE_ERROR together with its numeric value.
+ncclDataType_t EPBackend::nvte_dtype_to_nccl(NVTEDType dtype) {
+  switch (dtype) {
+    case kNVTEFloat32:
+      return ncclFloat32;
+    case kNVTEFloat16:
+      return ncclFloat16;
+    case kNVTEBFloat16:
+      return ncclBfloat16;
+    case kNVTEInt32:
+      return ncclInt32;
+    case kNVTEInt64:
+      return ncclInt64;
+    case kNVTEByte:
+      return ncclUint8;
+    case kNVTEFloat8E4M3:
+      return ncclFloat8e4m3;
+    case kNVTEFloat8E5M2:
+      return ncclFloat8e5m2;
+    default:
+      NVTE_ERROR("Unsupported NVTEDType for NCCL EP conversion: ", static_cast<int>(dtype));
+  }
+  return ncclFloat32;  // unreachable
+}
+
+// Open a transient ncclEpHandle over handle_mem. Caller owns the result.
+// The handle is created on ep_group_ in expert-major layout; handle_mem is
+// described to NCCL EP as a 1-D byte tensor of handle_mem_size elements.
+ncclEpHandle_t EPBackend::open_handle(void* handle_mem, size_t handle_mem_size, int num_topk,
+                                      size_t dispatch_output_per_expert_alignment) {
+  size_t hm_sizes[1] = {handle_mem_size};
+  ncclEpTensor_t routing_desc = make_tensor(handle_mem, 1, ncclUint8, hm_sizes);
+  ncclEpHandleConfig_t hcfg = NCCL_EP_HANDLE_CONFIG_INIT;
+  hcfg.dispatch_output_per_expert_alignment = dispatch_output_per_expert_alignment;
+  ncclEpHandle_t handle = nullptr;  // never hand back an indeterminate value
+  NVTE_CHECK_NCCL(ncclEpInitHandle(&handle, ep_group_, NCCL_EP_LAYOUT_EXPERT_MAJOR, &hcfg, num_topk,
+                                   &routing_desc));
+  return handle;
+}
+
+// ---------------------------------------------------------------------------
+// Lifecycle
+// ---------------------------------------------------------------------------
+
+// Static-dtor teardown: skip NCCL calls (CUDA context / borrowed ep_comm_ may
+// already be gone) and release in-memory state only.
+EPBackend::~EPBackend() {
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (!initialized_) return;
+  handles_.clear();
+  ep_group_ = nullptr;
+  ep_comm_ = nullptr;
+  initialized_ = false;
+}
+
+// Create the NCCL EP group from the (already validated) group config and mark
+// the backend initialized. initialize() calls this while holding mutex_.
+void EPBackend::init(ncclComm_t ep_comm, NVTEEpGroupConfig group_config) {
+  NVTE_CHECK(!initialized_, "EPBackend already initialized");
+
+  group_config_ = group_config;
+
+  // Each value is cast to the type of the field it is stored in.
+  ncclEpGroupConfig_t cfg = NCCL_EP_GROUP_CONFIG_INIT;
+  cfg.algorithm = NCCL_EP_ALGO_HIGH_THROUGHPUT;
+  cfg.num_experts = static_cast<decltype(cfg.num_experts)>(group_config.num_experts);
+  cfg.max_dispatch_tokens_per_rank =
+      static_cast<decltype(cfg.max_dispatch_tokens_per_rank)>(group_config.max_tokens_per_rank);
+  // Token size is computed for bf16 elements (hidden_dim * sizeof(nv_bfloat16)).
+  cfg.max_token_bytes =
+      static_cast<decltype(cfg.max_token_bytes)>(group_config.hidden_dim * sizeof(nv_bfloat16));
+  cfg.rdma_buffer_size = NCCL_EP_AUTO;
+  cfg.num_qp_per_rank = NCCL_EP_AUTO;
+  cfg.num_channels = NCCL_EP_AUTO;
+  cfg.max_num_sms = group_config.max_num_sms > 0
+                        ? static_cast<decltype(cfg.max_num_sms)>(group_config.max_num_sms)
+                        : NCCL_EP_AUTO;
+  // Must be > 0; NCCL EP errors out on 0.
+  cfg.max_recv_tokens_per_rank =
+      static_cast<decltype(cfg.max_recv_tokens_per_rank)>(group_config.max_recv_tokens_per_rank);
+
+  NVTE_CHECK_NCCL(ncclEpCreateGroup(&ep_group_, ep_comm, &cfg));
+
+  ep_comm_ = ep_comm;
+
+  initialized_ = true;
+}
+
+// ---------------------------------------------------------------------------
+// Per-handle_id config cache
+// ---------------------------------------------------------------------------
+
+// Store a new config entry and return its freshly allocated id. The cap is read
+// once from NVTE_EP_HANDLE_CACHE_SIZE (minimum 1, default 8192); a full cache is
+// reported through NVTE_CHECK. The only caller in this file takes mutex_ first.
+uint64_t EPBackend::insert_new_entry(size_t handle_mem_size, int top_k, size_t alignment) {
+  if (handle_cache_cap_ == 0) {
+    const char* cap_env = std::getenv("NVTE_EP_HANDLE_CACHE_SIZE");
+    handle_cache_cap_ = (cap_env != nullptr) ? std::max(1, std::atoi(cap_env)) : 8192;
+  }
+  NVTE_CHECK(handles_.size() < handle_cache_cap_, "EP handle cache full (", handle_cache_cap_,
+             " entries). Raise via NVTE_EP_HANDLE_CACHE_SIZE.");
+  uint64_t id = next_handle_id_.fetch_add(1, std::memory_order_relaxed);
+  handles_.emplace(id, HandleEntry{handle_mem_size, alignment, top_k});
+  return id;
+}
+
+// Return the cached config for handle_id. Entries are only ever created by
+// register_layer(), so an unknown id means the layer was never registered (or
+// the cache was cleared by shutdown()).
+EPBackend::HandleEntry& EPBackend::lookup_config(uint64_t handle_id) {
+  auto it = handles_.find(handle_id);
+  NVTE_CHECK(it != handles_.end(), "ep op on handle_id=", handle_id,
+             " with no cached config — call nvte_ep_register_layer first.");
+  return it->second;
+}
+
+// ---------------------------------------------------------------------------
+// Per-step operations
+// ---------------------------------------------------------------------------
+
+// Query the handle_mem size NCCL EP needs for this layer config, report it via
+// *handle_mem_size, and cache (size, top_k, alignment) under a new handle id.
+uint64_t EPBackend::register_layer(NVTEEpLayerConfig layer_config, size_t* handle_mem_size) {
+  NVTE_CHECK(initialized_, "EPBackend not initialized");
+  NVTE_CHECK(layer_config.top_k > 0, "NVTEEpLayerConfig.top_k must be > 0");
+  NVTE_CHECK(handle_mem_size != nullptr, "handle_mem_size must not be null");
+  ncclEpHandleConfig_t hcfg = NCCL_EP_HANDLE_CONFIG_INIT;
+  hcfg.dispatch_output_per_expert_alignment = layer_config.dispatch_output_per_expert_alignment;
+  size_t hm_size = 0;
+  NVTE_CHECK_NCCL(ncclEpHandleMemSize(ep_group_, NCCL_EP_LAYOUT_EXPERT_MAJOR, &hcfg, &hm_size,
+                                      layer_config.top_k));
+  *handle_mem_size = hm_size;
+  std::lock_guard<std::mutex> lock(mutex_);
+  return insert_new_entry(hm_size, layer_config.top_k,
+                          layer_config.dispatch_output_per_expert_alignment);
+}
+
+// Feed the routing indices to ncclEpUpdateHandle on `stream`. topk_idx is
+// described as a 2-D int64 tensor [T, top_k] (top_k = 1 for a 1-D input);
+// token_counts is optional and, when it has data, is described as a 1-D int32
+// tensor of num_experts / ep_size entries.
+void EPBackend::prepare(uint64_t handle_id, const NVTETensor topk_idx, NVTETensor token_counts,
+                        void* handle_mem, size_t dispatch_output_per_expert_alignment,
+                        cudaStream_t stream) {
+  NVTE_CHECK(initialized_, "EPBackend not initialized");
+  NVTE_CHECK(handle_mem != nullptr, "handle_mem must not be null");
+
+  NVTEShape idx_shape = nvte_tensor_shape(topk_idx);
+  void* idx_data = nvte_tensor_data(topk_idx);
+  NVTE_CHECK(idx_data != nullptr, "topk_idx data must not be null");
+  // data[0] is read below, so a rank-0 shape must be rejected first.
+  NVTE_CHECK(idx_shape.ndim >= 1, "topk_idx must be [T] or [T, top_k]");
+
+  const size_t num_tokens = idx_shape.data[0];
+  const size_t top_k = idx_shape.ndim > 1 ? idx_shape.data[1] : 1;
+  const size_t num_local_experts =
+      static_cast<size_t>(group_config_.num_experts / group_config_.ep_size);
+
+  size_t idx_sizes[2] = {num_tokens, top_k};
+  ncclEpTensor_t nccl_topk_idx = make_tensor(idx_data, 2, ncclInt64, idx_sizes);
+
+  // ncclEpUpdateHandle writes per-expert counts via expert_counters.
+  size_t cnt_sizes[1] = {num_local_experts};
+  ncclEpTensor_t token_counts_desc = NCCL_EP_TENSOR_INIT;
+  void* token_counts_data = (token_counts != nullptr) ? nvte_tensor_data(token_counts) : nullptr;
+  if (token_counts_data != nullptr) {
+    token_counts_desc = make_tensor(token_counts_data, 1, ncclInt32, cnt_sizes);
+  }
+  ncclEpLayoutInfo_t layout_info = NCCL_EP_LAYOUT_INFO_INIT;
+  layout_info.expert_counters = (token_counts_data != nullptr) ? &token_counts_desc : nullptr;
+
+  ScopedEpHandle transient;
+  {
+    // The lock covers only the cache lookup and handle creation, not the enqueue.
+    std::lock_guard<std::mutex> lock(mutex_);
+    HandleEntry& cfg = lookup_config(handle_id);
+    NVTE_CHECK(cfg.alignment == dispatch_output_per_expert_alignment,
+               "ep_prepare: alignment mismatch for handle_id=", handle_id,
+               " (cached=", cfg.alignment, ", got=", dispatch_output_per_expert_alignment, ")");
+    transient =
+        ScopedEpHandle(open_handle(handle_mem, cfg.handle_mem_size, cfg.top_k, cfg.alignment));
+  }
+  NVTE_CHECK_NCCL(ncclEpUpdateHandle(transient, &nccl_topk_idx, &layout_info, stream));
+}
+
+// Enqueue ncclEpDispatch on `stream`. The pass direction is derived from
+// topk_weights: non-null selects the forward pass (topk_idx and
+// recv_topk_weights are then required), null selects the backward pass used by
+// combine_bwd(). tokens and recv_tokens must both be 2-D.
+void EPBackend::dispatch(uint64_t handle_id, void* handle_mem, const NVTETensor topk_idx,
+                         const NVTETensor tokens, const NVTECommWindow& tokens_win,
+                         const NVTETensor topk_weights, const NVTECommWindow& topk_weights_win,
+                         NVTETensor recv_tokens, const NVTECommWindow& recv_tokens_win,
+                         NVTETensor recv_topk_weights, const NVTECommWindow& recv_topk_weights_win,
+                         cudaStream_t stream) {
+  NVTE_CHECK(initialized_, "EPBackend not initialized");
+  NVTE_CHECK(handle_mem != nullptr, "handle_mem must not be null");
+
+  NVTEShape tok_shape = nvte_tensor_shape(tokens);
+  NVTEDType tok_dtype = nvte_tensor_type(tokens);
+  // data[1] is read below; reject anything that is not [T, hidden_dim].
+  NVTE_CHECK(tok_shape.ndim == 2, "tokens must be 2D [T, hidden_dim]");
+
+  const size_t num_tokens = tok_shape.data[0];
+  const size_t hidden_dim = tok_shape.data[1];
+
+  size_t tok_sizes[2] = {num_tokens, hidden_dim};
+  ncclEpTensor_t nccl_tokens_in =
+      make_payload_tensor(tokens, tokens_win, 2, nvte_dtype_to_nccl(tok_dtype), tok_sizes);
+
+  const bool is_forward = (topk_weights != nullptr);
+
+  // Routing is cached in handle_mem by ep_prepare; dispatch only needs
+  // topk_weights to reconstruct the sparse-to-dense prob map.
+  size_t weights_in_sizes[2] = {0, 0};
+  ncclEpTensor_t nccl_topk_weights_in = NCCL_EP_TENSOR_INIT;
+  if (is_forward) {
+    NVTE_CHECK(topk_idx != nullptr, "topk_idx required in forward dispatch");
+    // topk_idx is consulted only for its shape: top_k is its second extent.
+    NVTEShape idx_shape = nvte_tensor_shape(topk_idx);
+    const size_t top_k = idx_shape.ndim > 1 ? idx_shape.data[1] : 1;
+    weights_in_sizes[0] = num_tokens;
+    weights_in_sizes[1] = top_k;
+    nccl_topk_weights_in =
+        make_payload_tensor(topk_weights, topk_weights_win, 2, ncclFloat32, weights_in_sizes);
+  }
+
+  NVTEShape recv_shape = nvte_tensor_shape(recv_tokens);
+  NVTEDType recv_dtype = nvte_tensor_type(recv_tokens);
+  NVTE_CHECK(recv_shape.ndim == 2, "recv_tokens must be 2D [recv_T, hidden_dim]");
+
+  size_t recv_sizes[2] = {recv_shape.data[0], recv_shape.data[1]};
+  ncclEpTensor_t nccl_tokens_out = make_payload_tensor(recv_tokens, recv_tokens_win, 2,
+                                                       nvte_dtype_to_nccl(recv_dtype), recv_sizes);
+
+  // The received-weights descriptor reuses recv_tokens' row count as its length.
+  size_t weights_out_sizes[1] = {recv_shape.data[0]};
+  ncclEpTensor_t nccl_topk_weights_out = NCCL_EP_TENSOR_INIT;
+  if (is_forward) {
+    NVTE_CHECK(recv_topk_weights != nullptr,
+               "recv_topk_weights must not be null in forward dispatch");
+    NVTEShape recv_w_shape = nvte_tensor_shape(recv_topk_weights);
+    NVTE_CHECK(recv_w_shape.ndim == 1, "recv_topk_weights must be 1D [recv_capacity]");
+    nccl_topk_weights_out = make_payload_tensor(recv_topk_weights, recv_topk_weights_win, 1,
+                                                ncclFloat32, weights_out_sizes);
+  }
+
+  ncclEpDispatchInputs_t in_struct = NCCL_EP_DISPATCH_INPUTS_INIT;
+  in_struct.tokens = &nccl_tokens_in;
+  in_struct.topk_weights = is_forward ? &nccl_topk_weights_in : nullptr;
+
+  ncclEpDispatchOutputs_t out_struct = NCCL_EP_DISPATCH_OUTPUTS_INIT;
+  out_struct.tokens = &nccl_tokens_out;
+  out_struct.topk_weights = is_forward ? &nccl_topk_weights_out : nullptr;
+
+  ncclEpDispatchConfig_t dispatch_cfg = NCCL_EP_DISPATCH_CONFIG_INIT;
+  dispatch_cfg.pass_direction = is_forward ? NCCL_EP_FWD_PASS : NCCL_EP_BWD_PASS;
+
+  ScopedEpHandle transient;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    HandleEntry& cfg = lookup_config(handle_id);
+    transient =
+        ScopedEpHandle(open_handle(handle_mem, cfg.handle_mem_size, cfg.top_k, cfg.alignment));
+  }
+  NVTE_CHECK_NCCL(ncclEpDispatch(transient, &in_struct, &out_struct,
+                                 /*layout_info=*/nullptr, &dispatch_cfg, stream));
+}
+
+// Enqueue ncclEpCombine (default config) on `stream`. expert_out may come from
+// a symmem window; result is always addressed through its raw data pointer.
+// Both tensors must be 2-D and each is described with its own dtype.
+void EPBackend::combine(uint64_t handle_id, void* handle_mem, const NVTETensor expert_out,
+                        const NVTECommWindow& expert_out_win, NVTETensor result,
+                        cudaStream_t stream) {
+  NVTE_CHECK(initialized_, "EPBackend not initialized");
+  NVTE_CHECK(handle_mem != nullptr, "handle_mem must not be null");
+
+  NVTEShape exp_shape = nvte_tensor_shape(expert_out);
+  NVTEDType exp_dtype = nvte_tensor_type(expert_out);
+  NVTE_CHECK(exp_shape.ndim == 2, "expert_out must be 2D [recv_T, hidden_dim]");
+
+  size_t exp_sizes[2] = {exp_shape.data[0], exp_shape.data[1]};
+  ncclEpTensor_t nccl_expert_in =
+      make_payload_tensor(expert_out, expert_out_win, 2, nvte_dtype_to_nccl(exp_dtype), exp_sizes);
+
+  NVTEShape res_shape = nvte_tensor_shape(result);
+  void* res_data = nvte_tensor_data(result);
+  NVTEDType res_dtype = nvte_tensor_type(result);
+  NVTE_CHECK(res_data != nullptr, "result data must not be null");
+  NVTE_CHECK(res_shape.ndim == 2, "result must be 2D [T, hidden_dim]");
+
+  size_t res_sizes[2] = {res_shape.data[0], res_shape.data[1]};
+  ncclEpTensor_t nccl_result_out =
+      make_tensor(res_data, 2, nvte_dtype_to_nccl(res_dtype), res_sizes);
+
+  ncclEpCombineInputs_t in_struct = NCCL_EP_COMBINE_INPUTS_INIT;
+  in_struct.tokens = &nccl_expert_in;
+
+  ncclEpCombineOutputs_t out_struct = NCCL_EP_COMBINE_OUTPUTS_INIT;
+  out_struct.tokens = &nccl_result_out;
+
+  ScopedEpHandle transient;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    HandleEntry& cfg = lookup_config(handle_id);
+    transient =
+        ScopedEpHandle(open_handle(handle_mem, cfg.handle_mem_size, cfg.top_k, cfg.alignment));
+  }
+  NVTE_CHECK_NCCL(ncclEpCombine(transient, &in_struct, &out_struct, /*config=*/nullptr, stream));
+}
+
+// Backward of dispatch: enqueue ncclEpCombine with pass_direction set to
+// NCCL_EP_BWD_PASS, carrying both token grads and weight grads. Inputs may come
+// from symmem windows; both outputs are addressed through raw data pointers.
+void EPBackend::dispatch_bwd(uint64_t handle_id, void* handle_mem, const NVTETensor grad,
+                             const NVTECommWindow& grad_win, const NVTETensor g_recv_topk_weights,
+                             const NVTECommWindow& g_recv_topk_weights_win, NVTETensor grad_tokens,
+                             NVTETensor grad_topk_weights, cudaStream_t stream) {
+  NVTE_CHECK(initialized_, "EPBackend not initialized");
+  NVTE_CHECK(handle_mem != nullptr, "handle_mem must not be null");
+
+  NVTEShape g_shape = nvte_tensor_shape(grad);
+  NVTEDType g_dtype = nvte_tensor_type(grad);
+  NVTE_CHECK(g_shape.ndim == 2, "grad must be 2D [recv_capacity, hidden_dim]");
+  size_t g_sizes[2] = {g_shape.data[0], g_shape.data[1]};
+  ncclEpTensor_t nccl_tok_in =
+      make_payload_tensor(grad, grad_win, 2, nvte_dtype_to_nccl(g_dtype), g_sizes);
+
+  // g_recv_topk_weights must be 1D [recv_capacity] — caller flattens.
+  NVTEShape gw_shape = nvte_tensor_shape(g_recv_topk_weights);
+  NVTE_CHECK(gw_shape.ndim == 1,
+             "g_recv_topk_weights must be 1D [recv_capacity]; caller must flatten leading dims");
+  size_t gw_sizes[1] = {gw_shape.data[0]};
+  ncclEpTensor_t nccl_w_in =
+      make_payload_tensor(g_recv_topk_weights, g_recv_topk_weights_win, 1, ncclFloat32, gw_sizes);
+
+  NVTEShape gt_shape = nvte_tensor_shape(grad_tokens);
+  void* gt_data = nvte_tensor_data(grad_tokens);
+  NVTEDType gt_dtype = nvte_tensor_type(grad_tokens);
+  NVTE_CHECK(gt_data != nullptr, "grad_tokens data must not be null");
+  NVTE_CHECK(gt_shape.ndim == 2, "grad_tokens must be 2D [T, hidden_dim]");
+  size_t gt_sizes[2] = {gt_shape.data[0], gt_shape.data[1]};
+  // Describe the output buffer with its own dtype (as combine() does for
+  // `result`) rather than assuming it matches the incoming grad's dtype.
+  ncclEpTensor_t nccl_tok_out = make_tensor(gt_data, 2, nvte_dtype_to_nccl(gt_dtype), gt_sizes);
+
+  NVTEShape gtw_shape = nvte_tensor_shape(grad_topk_weights);
+  void* gtw_data = nvte_tensor_data(grad_topk_weights);
+  NVTE_CHECK(gtw_data != nullptr, "grad_topk_weights data must not be null");
+  NVTE_CHECK(gtw_shape.ndim == 2, "grad_topk_weights must be 2D [T, top_k]");
+  size_t gtw_sizes[2] = {gtw_shape.data[0], gtw_shape.data[1]};
+  ncclEpTensor_t nccl_w_out = make_tensor(gtw_data, 2, ncclFloat32, gtw_sizes);
+
+  ncclEpCombineInputs_t in_struct = NCCL_EP_COMBINE_INPUTS_INIT;
+  in_struct.tokens = &nccl_tok_in;
+  in_struct.topk_weights = &nccl_w_in;
+
+  ncclEpCombineOutputs_t out_struct = NCCL_EP_COMBINE_OUTPUTS_INIT;
+  out_struct.tokens = &nccl_tok_out;
+  out_struct.topk_weights = &nccl_w_out;
+
+  ncclEpCombineConfig_t cfg = NCCL_EP_COMBINE_CONFIG_INIT;
+  cfg.pass_direction = NCCL_EP_BWD_PASS;
+
+  ScopedEpHandle transient;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    HandleEntry& entry = lookup_config(handle_id);
+    transient = ScopedEpHandle(
+        open_handle(handle_mem, entry.handle_mem_size, entry.top_k, entry.alignment));
+  }
+  NVTE_CHECK_NCCL(ncclEpCombine(transient, &in_struct, &out_struct, &cfg, stream));
+}
+
+// Backward of combine. Passing null topk_weights makes dispatch() take its
+// backward-pass branch, so no weight tensors are involved.
+void EPBackend::combine_bwd(uint64_t handle_id, void* handle_mem, const NVTETensor grad,
+                            const NVTECommWindow& grad_win, NVTETensor grad_expert_out,
+                            const NVTECommWindow& grad_expert_out_win, cudaStream_t stream) {
+  // Backward of combine = reverse-direction dispatch.
+  dispatch(handle_id, handle_mem, /*topk_idx=*/nullptr, grad, grad_win, /*topk_weights=*/nullptr,
+           /*topk_weights_win=*/NVTECommWindow{}, grad_expert_out, grad_expert_out_win,
+           /*recv_topk_weights=*/nullptr, /*recv_topk_weights_win=*/NVTECommWindow{}, stream);
+}
+
+}  // namespace ep
+}  // namespace transformer_engine
diff --git a/transformer_engine/common/ep/ep_backend.h b/transformer_engine/common/ep/ep_backend.h
new file mode 100644
index 0000000000..18307ebb4f
--- /dev/null
+++ b/transformer_engine/common/ep/ep_backend.h
@@ -0,0 +1,114 @@
+/*************************************************************************
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See LICENSE for license information.
+ ************************************************************************/
+
+/*! \file ep_backend.h
+ *  \brief Internal NCCL EP singleton; not part of the public API.
+ *
+ *  Per handle_id the cache stores config only (no device pointers), so
+ *  handle_mem may be relocated between ops. Cap: NVTE_EP_HANDLE_CACHE_SIZE
+ *  (default 8192); overflow throws.
+ */
+
+#ifndef TRANSFORMER_ENGINE_COMMON_EP_EP_BACKEND_H_
+#define TRANSFORMER_ENGINE_COMMON_EP_EP_BACKEND_H_
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace transformer_engine {
+namespace ep {
+
+/*! \brief EP backend singleton — owns the NCCL EP group; borrows the comm. */
+class EPBackend {
+ public:
+  /*! \brief Access the singleton. Throws if not initialized. */
+  static EPBackend& get();
+
+  /*! \brief Bootstrap from an existing EP sub-communicator.
+   *  ep_comm is borrowed: the caller keeps it alive until shutdown() returns.
+   *  It must span exactly config.ep_size ranks.
+   */
+  static void initialize(ncclComm_t ep_comm, NVTEEpGroupConfig config);
+
+  /*! \brief Tear down the backend. Idempotent. Does not destroy ep_comm_. */
+  static void shutdown();
+
+  // Host-only: reserve a fresh handle_id, cache the layer config, and report
+  // the handle_mem buffer size the caller must allocate.
+  uint64_t register_layer(NVTEEpLayerConfig layer_config, size_t* handle_mem_size);
+
+  void prepare(uint64_t handle_id, const NVTETensor topk_idx, NVTETensor token_counts,
+               void* handle_mem, size_t dispatch_output_per_expert_alignment, cudaStream_t stream);
+
+  void dispatch(uint64_t handle_id, void* handle_mem, const NVTETensor topk_idx,
+                const NVTETensor tokens, const NVTECommWindow& tokens_win,
+                const NVTETensor topk_weights, const NVTECommWindow& topk_weights_win,
+                NVTETensor recv_tokens, const NVTECommWindow& recv_tokens_win,
+                NVTETensor recv_topk_weights, const NVTECommWindow& recv_topk_weights_win,
+                cudaStream_t stream);
+
+  void combine(uint64_t handle_id, void* handle_mem, const NVTETensor expert_out,
+               const NVTECommWindow& expert_out_win, NVTETensor result, cudaStream_t stream);
+
+  // g_recv_topk_weights: 1D [recv_capacity] f32; grad_topk_weights: 2D [T, top_k] f32.
+  void dispatch_bwd(uint64_t handle_id, void* handle_mem, const NVTETensor grad,
+                    const NVTECommWindow& grad_win, const NVTETensor g_recv_topk_weights,
+                    const NVTECommWindow& g_recv_topk_weights_win, NVTETensor grad_tokens,
+                    NVTETensor grad_topk_weights, cudaStream_t stream);
+
+  void combine_bwd(uint64_t handle_id, void* handle_mem, const NVTETensor grad,
+                   const NVTECommWindow& grad_win, NVTETensor grad_expert_out,
+                   const NVTECommWindow& grad_expert_out_win, cudaStream_t stream);
+
+ private:
+  EPBackend() = default;
+  ~EPBackend();
+  EPBackend(const EPBackend&) = delete;
+  EPBackend& operator=(const EPBackend&) = delete;
+
+  // ep_comm is borrowed — caller retains ownership across the backend lifetime.
+  void init(ncclComm_t ep_comm, NVTEEpGroupConfig config);
+
+  static EPBackend& instance();  // Meyers singleton accessor
+  static void validate_config(const NVTEEpGroupConfig& config);
+
+  static ncclDataType_t nvte_dtype_to_nccl(NVTEDType dtype);
+  // Open a transient ncclEpHandle over handle_mem. num_topk=-1 for paths
+  // that don't carry per-token weights.
+  ncclEpHandle_t open_handle(void* handle_mem, size_t handle_mem_size, int num_topk,
+                             size_t dispatch_output_per_expert_alignment);
+
+  ncclEpGroup_t ep_group_{nullptr};
+  ncclComm_t ep_comm_{nullptr};
+  NVTEEpGroupConfig group_config_{};
+  bool initialized_{false};
+  std::mutex mutex_;
+  struct HandleEntry {
+    size_t handle_mem_size;
+    size_t alignment;
+    int top_k;
+  };
+  std::unordered_map<uint64_t, HandleEntry> handles_;
+  std::atomic<uint64_t> next_handle_id_{1};  // 0 reserved as "no id"
+  size_t handle_cache_cap_{0};               // set lazily from NVTE_EP_HANDLE_CACHE_SIZE
+
+  // Caller must hold mutex_. Throws on cap overflow.
+  uint64_t insert_new_entry(size_t handle_mem_size, int top_k, size_t alignment);
+  HandleEntry& lookup_config(uint64_t handle_id);
+};
+
+}  // namespace ep
+}  // namespace transformer_engine
+
+#endif  // TRANSFORMER_ENGINE_COMMON_EP_EP_BACKEND_H_
diff --git a/transformer_engine/common/include/transformer_engine/comm_window.h b/transformer_engine/common/include/transformer_engine/comm_window.h
new file mode 100644
index 0000000000..088ea7f0c3
--- /dev/null
+++ b/transformer_engine/common/include/transformer_engine/comm_window.h
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See LICENSE for license information.
+ ************************************************************************/
+
+/*! \file comm_window.h
+ *  \brief Borrowed symmetric-memory window + offset for zero-copy one-sided ops.
+ *         Pass ``{NULL, 0}`` to use the raw-pointer path.
+ */
+
+#ifndef TRANSFORMER_ENGINE_COMM_WINDOW_H_
+#define TRANSFORMER_ENGINE_COMM_WINDOW_H_
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! \brief NCCL window + byte offset for a zero-copy payload tensor. */
+typedef struct {
+  ncclWindow_t window; /*!< NCCL window, or NULL to use the raw data pointer. */
+  uint64_t offset;     /*!< Byte offset of the payload within ``window``. */
+} NVTECommWindow;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // TRANSFORMER_ENGINE_COMM_WINDOW_H_
diff --git a/transformer_engine/common/include/transformer_engine/ep.h b/transformer_engine/common/include/transformer_engine/ep.h
new file mode 100644
index 0000000000..8c3a06b5f0
--- /dev/null
+++ b/transformer_engine/common/include/transformer_engine/ep.h
@@ -0,0 +1,161 @@
+/*************************************************************************
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See LICENSE for license information.
+ ************************************************************************/ + +/*! \file ep.h + * \brief Public C API for Expert Parallelism. Per-step ops are allocation-free + * and CUDA graph-capturable. + */ + +#ifndef TRANSFORMER_ENGINE_EP_H_ +#define TRANSFORMER_ENGINE_EP_H_ + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* ── Config structs ─────────────────────────────────────────────────────── */ + +/*! \brief Group-level EP configuration (fixed for the EP group lifetime). */ +typedef struct { + int ep_size; /*!< EP world size. */ + int num_experts; /*!< Total experts across all ranks. */ + int max_tokens_per_rank; /*!< Upper bound on tokens this rank sends per dispatch. */ + /*! Upper bound on tokens received per dispatch (worst-case top_k fan-out; must be > 0). */ + int max_recv_tokens_per_rank; + int hidden_dim; /*!< Token hidden dimension. */ + int max_num_sms; /*!< Max SMs for EP kernels. 0 = auto. */ + /*! 0 (default): throw on relocated handle_mem for a cached handle_id. 1: silently rebuild. */ + int allow_handle_mem_reloc; +} NVTEEpGroupConfig; + +/*! \brief Per-layer EP configuration. */ +typedef struct { + int num_local_experts; /*!< Reserved for ABI stability (derived from group config). */ + int top_k; /*!< Per-token expert fan-out. Required. */ + size_t dispatch_output_per_expert_alignment; + /*!< Per-expert zone alignment in tokens (pow2; 0/1 = no padding). Must match + * between nvte_ep_register_layer and nvte_ep_prepare. */ +} NVTEEpLayerConfig; + +/* ── Bootstrap ──────────────────────────────────────────────────────────── */ + +/*! \brief Bootstrap from an existing NCCL EP sub-communicator. Requires SM>=90. + * + * ep_comm is borrowed and must span exactly group_config.ep_size ranks. + * Re-init after shutdown is allowed; double-init throws. + * + * \param[in] ep_comm Opaque ncclComm_t for the EP sub-group. + * \param[in] group_config Group-level EP configuration. 
+ */ +void nvte_ep_initialize(void* ep_comm, NVTEEpGroupConfig group_config); + +/*! \brief Tear down the EP backend. Idempotent. Does not destroy ep_comm. */ +void nvte_ep_shutdown(void); + +/* ── Layer registration (host-only, eager) ───────────────────────────────── */ + +/*! \brief Reserve a handle_id for a layer config and report the handle_mem buffer + * size the caller must allocate. Host-only. + * + * \param[in] layer_config Per-layer EP configuration. + * \param[out] handle_mem_size Bytes the caller must allocate for handle_mem. + * \return uint64_t handle_id (non-zero). + */ +uint64_t nvte_ep_register_layer(NVTEEpLayerConfig layer_config, size_t* handle_mem_size); + +/*! \brief Per-step handle: the registered handle_id paired with its handle_mem buffer. */ +typedef struct { + uint64_t id; /*!< Handle id from nvte_ep_register_layer. */ + NVTETensor mem; /*!< Caller-allocated handle_mem buffer (size from nvte_ep_register_layer). */ +} NVTEEpHandle; + +/* ── Per-step ops (all allocation-free, CUDA graph-capturable) ──────────── */ + +/*! \brief AllGather the routing map; write per-expert counts and cache routing + * metadata in handle.mem for the subsequent dispatch/combine. + * + * \param[in] handle EP handle (id + mem buffer). + * \param[in] topk_idx [T, top_k] int64 routing indices. + * \param[out] token_counts [num_local_experts] int32 counts. + * \param[in] dispatch_output_per_expert_alignment Must match the handle_mem sizing. + * \param[in] stream CUDA stream. + */ +void nvte_ep_prepare(NVTEEpHandle handle, NVTETensor topk_idx, NVTETensor token_counts, + size_t dispatch_output_per_expert_alignment, cudaStream_t stream); + +/*! \brief Dispatch tokens (and routing weights) to expert ranks. + * + * \param[in] handle EP handle (id + mem buffer). + * \param[in] topk_idx [T, top_k] int64 sparse routing indices. + * \param[in] tokens [T, hidden_dim] input tokens. + * \param[in] tokens_win Optional symmem window for ``tokens``. 
+ * \param[in] topk_weights [T, top_k] float32 weights, or null in backward. + * \param[in] topk_weights_win Optional symmem window for ``topk_weights``. + * \param[out] recv_tokens [recv_T, hidden_dim] received tokens. + * \param[in] recv_tokens_win Optional symmem window for ``recv_tokens``. + * \param[out] recv_topk_weights [recv_T] float32 per-slot weights, or null in backward. + * \param[in] recv_topk_weights_win Optional symmem window for ``recv_topk_weights``. + * \param[in] stream CUDA stream. + */ +void nvte_ep_dispatch(NVTEEpHandle handle, NVTETensor topk_idx, NVTETensor tokens, + NVTECommWindow tokens_win, NVTETensor topk_weights, + NVTECommWindow topk_weights_win, NVTETensor recv_tokens, + NVTECommWindow recv_tokens_win, NVTETensor recv_topk_weights, + NVTECommWindow recv_topk_weights_win, cudaStream_t stream); + +/*! \brief Scatter-sum expert outputs back to originating ranks. Unweighted — + * caller must pre-multiply expert_out by recv_topk_weights (and the + * valid-slot mask) before calling. + * + * \param[in] handle EP handle (id + mem buffer). + * \param[in] expert_out [recv_T, hidden_dim] pre-weighted expert outputs. + * \param[in] expert_out_win Optional symmem window for ``expert_out``. + * \param[out] result [T, hidden_dim] combined output. + * \param[in] stream CUDA stream. + */ +void nvte_ep_combine(NVTEEpHandle handle, NVTETensor expert_out, NVTECommWindow expert_out_win, + NVTETensor result, cudaStream_t stream); + +/*! \brief Backward of dispatch — routes token and weight grads back to source. + * + * \param[in] handle EP handle (id + mem buffer). + * \param[in] grad [recv_capacity, hidden_dim] grad w.r.t. recv_tokens. + * \param[in] grad_win Optional symmem window for ``grad``. + * \param[in] g_recv_topk_weights [recv_capacity] f32 grad w.r.t. recv_topk_weights. + * \param[in] g_recv_topk_weights_win Optional symmem window for ``g_recv_topk_weights``. + * \param[out] grad_tokens [T, hidden_dim] grad w.r.t. tokens. 
+ * \param[out] grad_topk_weights [T, top_k] f32 grad w.r.t. topk_weights. + * \param[in] stream CUDA stream. + */ +void nvte_ep_dispatch_bwd(NVTEEpHandle handle, NVTETensor grad, NVTECommWindow grad_win, + NVTETensor g_recv_topk_weights, NVTECommWindow g_recv_topk_weights_win, + NVTETensor grad_tokens, NVTETensor grad_topk_weights, + cudaStream_t stream); + +/*! \brief Backward of combine. Padded slots in grad_expert_out are zeroed. + * + * \param[in] handle EP handle (id + mem buffer). + * \param[in] grad [T, hidden_dim] grad w.r.t. result. + * \param[in] grad_win Optional symmem window for ``grad``. + * \param[out] grad_expert_out [recv_capacity, hidden_dim] grad w.r.t. expert_out. + * \param[in] grad_expert_out_win Optional symmem window for ``grad_expert_out``. + * \param[in] stream CUDA stream. + */ +void nvte_ep_combine_bwd(NVTEEpHandle handle, NVTETensor grad, NVTECommWindow grad_win, + NVTETensor grad_expert_out, NVTECommWindow grad_expert_out_win, + cudaStream_t stream); + +#ifdef __cplusplus +} +#endif + +#endif // TRANSFORMER_ENGINE_EP_H_ From cef4b334c23b79031f91ecd6b2517df140db57f0 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Sat, 23 May 2026 19:36:55 +0000 Subject: [PATCH 02/21] Expert Parallelism: persistent ncclEpHandle cache with allow_handle_mem_reloc gating Signed-off-by: Phuong Nguyen --- tests/cpp_distributed/test_ep_coverage.cu | 183 ++++++++++++++++++++ transformer_engine/common/ep/ep_backend.cpp | 109 +++++------- transformer_engine/common/ep/ep_backend.h | 8 + 3 files changed, 238 insertions(+), 62 deletions(-) diff --git a/tests/cpp_distributed/test_ep_coverage.cu b/tests/cpp_distributed/test_ep_coverage.cu index ef7941905d..e9e532386c 100644 --- a/tests/cpp_distributed/test_ep_coverage.cu +++ b/tests/cpp_distributed/test_ep_coverage.cu @@ -369,6 +369,189 @@ TEST_F(NegativeTests, NullHandleMemThrows) { CHECK_CUDA(cudaStreamDestroy(stream)); } +// ============================================================================= +// 
HandleCacheTest: persistent ncclEpHandle is reused across ops on the same +// handle_mem ptr; relocation triggers throw by default and rebuild when +// NVTEEpGroupConfig.allow_handle_mem_reloc=1. +// ============================================================================= + +class HandleCacheTest : public EpCoverageBase {}; + +// Run prepare → dispatch → combine on bundle b. handle_mem_data overrides the +// device ptr used for handle_mem (must be the buffer owned by b unless +// reloc-allowed mode is active). Templated on Bundle because EpCoverageBase:: +// Bundle is declared in a protected section. +template +static void run_round_trip(B& b, void* handle_mem_data, + int num_tokens, int top_k, int num_local_experts, + int hidden_dim, size_t alignment, + cudaStream_t stream) { + auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); + auto topk_weights_t = make_nvte_tensor(b.topk_weights.get(), + {(size_t)num_tokens, (size_t)top_k}, kNVTEFloat32); + auto token_counts_t = make_nvte_tensor(b.token_counts.get(), + {(size_t)num_local_experts}, kNVTEInt32); + auto handle_mem_t = make_nvte_tensor(handle_mem_data, + {b.handle_mem_size}, kNVTEByte); + auto tokens_t = make_nvte_tensor(b.tokens.get(), + {(size_t)num_tokens, (size_t)hidden_dim}, kNVTEBFloat16); + auto recv_tokens_t = make_nvte_tensor(b.recv_tokens.get(), + {b.recv_capacity, (size_t)hidden_dim}, kNVTEBFloat16); + auto recv_w_t = make_nvte_tensor(b.recv_topk_weights.get(), + {b.recv_capacity}, kNVTEFloat32); + auto result_t = make_nvte_tensor(b.result.get(), + {(size_t)num_tokens, (size_t)hidden_dim}, kNVTEBFloat16); + + NVTEEpHandle h{b.handle_id, handle_mem_t.tensor}; + nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, alignment, stream); + nvte_ep_dispatch(h, topk_idx_t.tensor, tokens_t.tensor, NVTECommWindow{}, + topk_weights_t.tensor, NVTECommWindow{}, + recv_tokens_t.tensor, NVTECommWindow{}, + recv_w_t.tensor, NVTECommWindow{}, stream); + 
nvte_ep_combine(h, recv_tokens_t.tensor, NVTECommWindow{}, result_t.tensor, stream); +} + +// Re-bootstrap EP backend with a different allow_handle_mem_reloc setting. +// Reuses the existing g_ep_comm; caller is responsible for restoring defaults. +static void reinit_ep_with_reloc(int allow_reloc) { + nvte_ep_shutdown(); + NVTEEpGroupConfig cfg{}; + cfg.ep_size = g_ep_size; + cfg.num_experts = g_num_experts; + cfg.max_tokens_per_rank = g_max_tokens_per_rank; + cfg.max_recv_tokens_per_rank = g_ep_size * g_max_tokens_per_rank * 2; + cfg.hidden_dim = g_hidden_dim; + cfg.allow_handle_mem_reloc = allow_reloc; + nvte_ep_initialize(static_cast(g_ep_comm), cfg); +} + +TEST_F(HandleCacheTest, ReuseSameMemSucceeds) { + const int num_tokens = 16, top_k = 2; + Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); + + auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, + num_experts_, num_local_experts_); + std::vector h_w(num_tokens * top_k, 1.0f / top_k); + auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.5f); + CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), + h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), + h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + // Two consecutive round-trips on the same handle_mem ptr: first opens the + // cached handle, second hits the cache. Both must succeed and be correct. 
+ for (int iter = 0; iter < 2; ++iter) { + ASSERT_NO_THROW(run_round_trip(b, b.handle_mem.get(), num_tokens, top_k, + num_local_experts_, hidden_dim_, + /*alignment=*/0, stream)); + } + CHECK_CUDA(cudaStreamSynchronize(stream)); + + std::vector h_res(num_tokens * hidden_dim_); + CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), + h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; + for (int t = 0; t < num_tokens; ++t) + for (int p : probes) + EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), + static_cast(top_k) * 0.5f, 1e-2f); + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +TEST_F(HandleCacheTest, RelocDefaultThrows) { + // Default bootstrap has allow_handle_mem_reloc=0: a second prepare call on + // the same handle_id with a different handle_mem ptr must throw. + const int num_tokens = 8, top_k = 2; + Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); + DevBuf second_hm(b.handle_mem_size); // distinct device buffer + ASSERT_NE(b.handle_mem.get(), second_hm.get()); + + auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, + num_experts_, num_local_experts_); + CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); + + auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); + auto token_counts_t = make_nvte_tensor(b.token_counts.get(), + {(size_t)num_local_experts_}, kNVTEInt32); + auto hm1_t = make_nvte_tensor(b.handle_mem.get(), + {b.handle_mem_size}, kNVTEByte); + auto hm2_t = make_nvte_tensor(second_hm.get(), + {b.handle_mem_size}, kNVTEByte); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + // First prepare seeds the cache. 
+ NVTEEpHandle h1{b.handle_id, hm1_t.tensor}; + ASSERT_NO_THROW(nvte_ep_prepare(h1, topk_idx_t.tensor, token_counts_t.tensor, + /*alignment=*/0, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + // Same handle_id with a different handle_mem ptr must throw. + NVTEEpHandle h2{b.handle_id, hm2_t.tensor}; + EXPECT_THROW(nvte_ep_prepare(h2, topk_idx_t.tensor, token_counts_t.tensor, + /*alignment=*/0, stream), + std::exception); + CHECK_CUDA(cudaStreamDestroy(stream)); +} + +TEST_F(HandleCacheTest, RelocAllowedRebuilds) { + // Re-init EP backend with allow_handle_mem_reloc=1, run two round-trips with + // distinct handle_mem buffers, verify both succeed numerically, restore. + reinit_ep_with_reloc(/*allow_reloc=*/1); + + struct Restore { ~Restore() { reinit_ep_with_reloc(/*allow_reloc=*/0); } } restore; + + const int num_tokens = 16, top_k = 2; + Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); + DevBuf alt_hm(b.handle_mem_size); + ASSERT_NE(b.handle_mem.get(), alt_hm.get()); + + auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, + num_experts_, num_local_experts_); + std::vector h_w(num_tokens * top_k, 1.0f / top_k); + auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.5f); + CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), + h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), + h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); + + cudaStream_t stream; + CHECK_CUDA(cudaStreamCreate(&stream)); + + // First on the original handle_mem. + ASSERT_NO_THROW(run_round_trip(b, b.handle_mem.get(), num_tokens, top_k, + num_local_experts_, hidden_dim_, + /*alignment=*/0, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + // Then on the relocated handle_mem — must trigger silent rebuild, not throw. 
+ ASSERT_NO_THROW(run_round_trip(b, alt_hm.get(), num_tokens, top_k, + num_local_experts_, hidden_dim_, + /*alignment=*/0, stream)); + CHECK_CUDA(cudaStreamSynchronize(stream)); + + std::vector h_res(num_tokens * hidden_dim_); + CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), + h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); + const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; + for (int t = 0; t < num_tokens; ++t) + for (int p : probes) + EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), + static_cast(top_k) * 0.5f, 1e-2f); + + CHECK_CUDA(cudaStreamDestroy(stream)); +} + // ── main ────────────────────────────────────────────────────────────────────── int main(int argc, char* argv[]) { diff --git a/transformer_engine/common/ep/ep_backend.cpp b/transformer_engine/common/ep/ep_backend.cpp index ae0f3ab888..6494a86817 100644 --- a/transformer_engine/common/ep/ep_backend.cpp +++ b/transformer_engine/common/ep/ep_backend.cpp @@ -57,32 +57,6 @@ inline ncclEpTensor_t make_payload_tensor(const NVTETensor t, const NVTECommWind return desc; } -// RAII guard for ncclEpHandle_t — destroys on scope exit, leak-free on throw. 
-class ScopedEpHandle { - public: - ScopedEpHandle() = default; - explicit ScopedEpHandle(ncclEpHandle_t h) : h_(h) {} - ~ScopedEpHandle() { - if (h_ != nullptr) ncclEpHandleDestroy(h_); - } - ScopedEpHandle(const ScopedEpHandle&) = delete; - ScopedEpHandle& operator=(const ScopedEpHandle&) = delete; - ScopedEpHandle(ScopedEpHandle&& other) noexcept : h_(other.h_) { other.h_ = nullptr; } - ScopedEpHandle& operator=(ScopedEpHandle&& other) noexcept { - if (this != &other) { - if (h_ != nullptr) ncclEpHandleDestroy(h_); - h_ = other.h_; - other.h_ = nullptr; - } - return *this; - } - operator ncclEpHandle_t() const { return h_; } - ncclEpHandle_t get() const { return h_; } - - private: - ncclEpHandle_t h_ = nullptr; -}; - } // namespace // --------------------------------------------------------------------------- @@ -158,6 +132,13 @@ void EPBackend::shutdown() { EPBackend& inst = instance(); std::lock_guard lock(inst.mutex_); if (!inst.initialized_) return; + for (auto& kv : inst.handles_) { + if (kv.second.cached_handle != nullptr) { + ncclEpHandleDestroy(kv.second.cached_handle); + kv.second.cached_handle = nullptr; + kv.second.cached_handle_mem = nullptr; + } + } inst.handles_.clear(); // ncclEpGroupDestroy reads from ep_comm_; destroy group while comm is still alive. if (inst.ep_group_ != nullptr) { @@ -196,7 +177,7 @@ ncclDataType_t EPBackend::nvte_dtype_to_nccl(NVTEDType dtype) { return ncclFloat32; // unreachable } -// Open a transient ncclEpHandle over handle_mem. Caller owns the result. +// Open a fresh ncclEpHandle over handle_mem. Caller (or cache) owns the result. 
ncclEpHandle_t EPBackend::open_handle(void* handle_mem, size_t handle_mem_size, int num_topk, size_t dispatch_output_per_expert_alignment) { size_t hm_sizes[1] = {handle_mem_size}; @@ -273,6 +254,26 @@ EPBackend::HandleEntry& EPBackend::lookup_config(uint64_t handle_id) { return it->second; } +ncclEpHandle_t EPBackend::get_or_open_handle(HandleEntry& cfg, void* handle_mem) { + if (cfg.cached_handle != nullptr && cfg.cached_handle_mem == handle_mem) { + return cfg.cached_handle; + } + if (cfg.cached_handle != nullptr) { + NVTE_CHECK(group_config_.allow_handle_mem_reloc != 0, + "EP handle_mem relocated for cached handle (old=", + reinterpret_cast(cfg.cached_handle_mem), + ", new=", reinterpret_cast(handle_mem), + "). Set NVTEEpGroupConfig.allow_handle_mem_reloc=1 to allow rebuild."); + ncclEpHandleDestroy(cfg.cached_handle); + cfg.cached_handle = nullptr; + cfg.cached_handle_mem = nullptr; + } + ncclEpHandle_t h = open_handle(handle_mem, cfg.handle_mem_size, cfg.top_k, cfg.alignment); + cfg.cached_handle = h; + cfg.cached_handle_mem = handle_mem; + return h; +} + // --------------------------------------------------------------------------- // Per-step operations // --------------------------------------------------------------------------- @@ -320,17 +321,13 @@ void EPBackend::prepare(uint64_t handle_id, const NVTETensor topk_idx, NVTETenso ncclEpLayoutInfo_t layout_info = NCCL_EP_LAYOUT_INFO_INIT; layout_info.expert_counters = (token_counts_data != nullptr) ? 
&token_counts_desc : nullptr; - ScopedEpHandle transient; - { - std::lock_guard lock(mutex_); - HandleEntry& cfg = lookup_config(handle_id); - NVTE_CHECK(cfg.alignment == dispatch_output_per_expert_alignment, - "ep_prepare: alignment mismatch for handle_id=", handle_id, - " (cached=", cfg.alignment, ", got=", dispatch_output_per_expert_alignment, ")"); - transient = - ScopedEpHandle(open_handle(handle_mem, cfg.handle_mem_size, cfg.top_k, cfg.alignment)); - } - NVTE_CHECK_NCCL(ncclEpUpdateHandle(transient, &nccl_topk_idx, &layout_info, stream)); + std::lock_guard lock(mutex_); + HandleEntry& cfg = lookup_config(handle_id); + NVTE_CHECK(cfg.alignment == dispatch_output_per_expert_alignment, + "ep_prepare: alignment mismatch for handle_id=", handle_id, + " (cached=", cfg.alignment, ", got=", dispatch_output_per_expert_alignment, ")"); + ncclEpHandle_t h = get_or_open_handle(cfg, handle_mem); + NVTE_CHECK_NCCL(ncclEpUpdateHandle(h, &nccl_topk_idx, &layout_info, stream)); } void EPBackend::dispatch(uint64_t handle_id, void* handle_mem, const NVTETensor topk_idx, @@ -397,14 +394,10 @@ void EPBackend::dispatch(uint64_t handle_id, void* handle_mem, const NVTETensor ncclEpDispatchConfig_t dispatch_cfg = NCCL_EP_DISPATCH_CONFIG_INIT; dispatch_cfg.pass_direction = is_forward ? 
NCCL_EP_FWD_PASS : NCCL_EP_BWD_PASS; - ScopedEpHandle transient; - { - std::lock_guard lock(mutex_); - HandleEntry& cfg = lookup_config(handle_id); - transient = - ScopedEpHandle(open_handle(handle_mem, cfg.handle_mem_size, cfg.top_k, cfg.alignment)); - } - NVTE_CHECK_NCCL(ncclEpDispatch(transient, &in_struct, &out_struct, + std::lock_guard lock(mutex_); + HandleEntry& cfg = lookup_config(handle_id); + ncclEpHandle_t h = get_or_open_handle(cfg, handle_mem); + NVTE_CHECK_NCCL(ncclEpDispatch(h, &in_struct, &out_struct, /*layout_info=*/nullptr, &dispatch_cfg, stream)); } @@ -436,14 +429,10 @@ void EPBackend::combine(uint64_t handle_id, void* handle_mem, const NVTETensor e ncclEpCombineOutputs_t out_struct = NCCL_EP_COMBINE_OUTPUTS_INIT; out_struct.tokens = &nccl_result_out; - ScopedEpHandle transient; - { - std::lock_guard lock(mutex_); - HandleEntry& cfg = lookup_config(handle_id); - transient = - ScopedEpHandle(open_handle(handle_mem, cfg.handle_mem_size, cfg.top_k, cfg.alignment)); - } - NVTE_CHECK_NCCL(ncclEpCombine(transient, &in_struct, &out_struct, /*config=*/nullptr, stream)); + std::lock_guard lock(mutex_); + HandleEntry& cfg = lookup_config(handle_id); + ncclEpHandle_t h = get_or_open_handle(cfg, handle_mem); + NVTE_CHECK_NCCL(ncclEpCombine(h, &in_struct, &out_struct, /*config=*/nullptr, stream)); } void EPBackend::dispatch_bwd(uint64_t handle_id, void* handle_mem, const NVTETensor grad, @@ -491,14 +480,10 @@ void EPBackend::dispatch_bwd(uint64_t handle_id, void* handle_mem, const NVTETen ncclEpCombineConfig_t cfg = NCCL_EP_COMBINE_CONFIG_INIT; cfg.pass_direction = NCCL_EP_BWD_PASS; - ScopedEpHandle transient; - { - std::lock_guard lock(mutex_); - HandleEntry& entry = lookup_config(handle_id); - transient = ScopedEpHandle( - open_handle(handle_mem, entry.handle_mem_size, entry.top_k, entry.alignment)); - } - NVTE_CHECK_NCCL(ncclEpCombine(transient, &in_struct, &out_struct, &cfg, stream)); + std::lock_guard lock(mutex_); + HandleEntry& entry = 
lookup_config(handle_id); + ncclEpHandle_t h = get_or_open_handle(entry, handle_mem); + NVTE_CHECK_NCCL(ncclEpCombine(h, &in_struct, &out_struct, &cfg, stream)); } void EPBackend::combine_bwd(uint64_t handle_id, void* handle_mem, const NVTETensor grad, diff --git a/transformer_engine/common/ep/ep_backend.h b/transformer_engine/common/ep/ep_backend.h index 18307ebb4f..e82c974c3f 100644 --- a/transformer_engine/common/ep/ep_backend.h +++ b/transformer_engine/common/ep/ep_backend.h @@ -98,6 +98,10 @@ class EPBackend { size_t handle_mem_size; size_t alignment; int top_k; + // Persistent ncclEpHandle bound to cached_handle_mem. Lazily opened on first + // op; reused while handle_mem ptr is unchanged. Destroyed in shutdown(). + ncclEpHandle_t cached_handle{nullptr}; + void* cached_handle_mem{nullptr}; }; std::unordered_map handles_; std::atomic next_handle_id_{1}; // 0 reserved as "no id" @@ -106,6 +110,10 @@ class EPBackend { // Caller must hold mutex_. Throws on cap overflow. uint64_t insert_new_entry(size_t handle_mem_size, int top_k, size_t alignment); HandleEntry& lookup_config(uint64_t handle_id); + // Caller must hold mutex_. Returns the cached handle if handle_mem matches. + // On mismatch: if group_config_.allow_handle_mem_reloc != 0, destroys the + // stale handle and opens a fresh one; otherwise throws. 
+ ncclEpHandle_t get_or_open_handle(HandleEntry& cfg, void* handle_mem); }; } // namespace ep From 0086be410bbb4448526e672b1a55b8472c1dba83 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 23 May 2026 23:09:15 +0000 Subject: [PATCH 03/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/ep/ep_backend.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/common/ep/ep_backend.cpp b/transformer_engine/common/ep/ep_backend.cpp index 6494a86817..83657943a4 100644 --- a/transformer_engine/common/ep/ep_backend.cpp +++ b/transformer_engine/common/ep/ep_backend.cpp @@ -324,8 +324,8 @@ void EPBackend::prepare(uint64_t handle_id, const NVTETensor topk_idx, NVTETenso std::lock_guard lock(mutex_); HandleEntry& cfg = lookup_config(handle_id); NVTE_CHECK(cfg.alignment == dispatch_output_per_expert_alignment, - "ep_prepare: alignment mismatch for handle_id=", handle_id, - " (cached=", cfg.alignment, ", got=", dispatch_output_per_expert_alignment, ")"); + "ep_prepare: alignment mismatch for handle_id=", handle_id, " (cached=", cfg.alignment, + ", got=", dispatch_output_per_expert_alignment, ")"); ncclEpHandle_t h = get_or_open_handle(cfg, handle_mem); NVTE_CHECK_NCCL(ncclEpUpdateHandle(h, &nccl_topk_idx, &layout_info, stream)); } From 2dc9fe755bd89f2f327cd5a7d47037337d099098 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Wed, 27 May 2026 14:12:53 -0700 Subject: [PATCH 04/21] Build: NCCL_HOME discovery supports Debian/Ubuntu multiarch lib paths Signed-off-by: Phuong Nguyen --- setup.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index db360c8a29..34a3abfd99 100644 --- a/setup.py +++ b/setup.py @@ -167,11 +167,13 @@ def _discover_nccl_home() -> str: f"'{env_home}/include/nccl.h' was not found; falling back to system probes." 
) + lib_names = ("libnccl.so", "libnccl.so.2") + # Include Debian/Ubuntu multiarch subdirs (e.g. lib/aarch64-linux-gnu). + lib_subdirs = ("lib", "lib64", "lib/aarch64-linux-gnu", "lib/x86_64-linux-gnu") for cand in ("/opt/nvidia/nccl", "/usr/local/nccl", "/usr"): p = Path(cand) if (p / "include" / "nccl.h").exists() and any( - (p / "lib" / name).exists() or (p / "lib64" / name).exists() - for name in ("libnccl.so", "libnccl.so.2") + (p / sub / name).exists() for sub in lib_subdirs for name in lib_names ): return str(p) @@ -180,9 +182,11 @@ def _discover_nccl_home() -> str: for line in out.splitlines(): if "libnccl.so" in line and "=>" in line: lib_path = Path(line.split("=>")[-1].strip()) - root = lib_path.parent.parent - if (root / "include" / "nccl.h").exists(): - return str(root) + # Walk upward so multiarch layouts (.../lib//libnccl.so) + # resolve to the prefix that contains include/nccl.h. + for root in (lib_path.parent.parent, lib_path.parent.parent.parent): + if (root / "include" / "nccl.h").exists(): + return str(root) except (subprocess.CalledProcessError, FileNotFoundError): pass From c93387ccfaf13b4eca9071011f94c6b16fefcb80 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Wed, 27 May 2026 14:26:39 -0700 Subject: [PATCH 05/21] bump NCCL Signed-off-by: Phuong Nguyen --- 3rdparty/nccl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/nccl b/3rdparty/nccl index 6a9bc953ac..146496ac88 160000 --- a/3rdparty/nccl +++ b/3rdparty/nccl @@ -1 +1 @@ -Subproject commit 6a9bc953ac1c4eef92d5adbe3092d4c2cb0a4c98 +Subproject commit 146496ac881bc504ed1a52be0ae7b707ce41e706 From ead4344f5e2ba375005764382cafe04cea4956f3 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 15:25:16 -0700 Subject: [PATCH 06/21] Expert Parallelism: require token_dtype in NVTEEpGroupConfig and enforce at dispatch Signed-off-by: Phuong Nguyen --- tests/cpp_distributed/test_ep_common.h | 4 ++++ transformer_engine/common/ep/ep_backend.cpp | 21 
+++++++++++++++---- .../common/include/transformer_engine/ep.h | 3 +++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/tests/cpp_distributed/test_ep_common.h b/tests/cpp_distributed/test_ep_common.h index 77baa92b0c..ccb20ee3a0 100644 --- a/tests/cpp_distributed/test_ep_common.h +++ b/tests/cpp_distributed/test_ep_common.h @@ -74,6 +74,7 @@ static int g_ep_size = -1; static int g_num_experts = -1; static int g_hidden_dim = 256; static int g_max_tokens_per_rank = 64; +static NVTEDType g_token_dtype = kNVTEBFloat16; static bool g_ep_initialized = false; static ncclComm_t g_ep_comm = nullptr; // owned by harness, destroyed in ep_teardown @@ -224,6 +225,8 @@ static void ep_parse_args(int argc, char* argv[]) { else if (a.rfind("--num-processes=",0)==0) g_num_processes = std::stoi(a.substr(16)); else if (a.rfind("--nranks=", 0) == 0) g_num_processes = std::stoi(a.substr(9)); else if (a.rfind("--uid-file=", 0) == 0) g_uid_file = a.substr(11); + else if (a.rfind("--token-dtype=", 0) == 0) + g_token_dtype = static_cast(std::stoi(a.substr(14))); } if (g_process_id < 0 || g_num_processes <= 0) { @@ -279,6 +282,7 @@ static bool ep_bootstrap(int argc, char* argv[]) { // Worst-case for top_k fan-out: ep_size * max_tokens_per_rank * 2. 
group_config.max_recv_tokens_per_rank = g_ep_size * g_max_tokens_per_rank * 2; group_config.hidden_dim = g_hidden_dim; + group_config.token_dtype = g_token_dtype; ASSERT_NCCL_OK(ncclCommInitRank(&g_ep_comm, g_num_processes, uid, g_process_id)); nvte_ep_initialize(static_cast(g_ep_comm), group_config); diff --git a/transformer_engine/common/ep/ep_backend.cpp b/transformer_engine/common/ep/ep_backend.cpp index 83657943a4..1e08cb55df 100644 --- a/transformer_engine/common/ep/ep_backend.cpp +++ b/transformer_engine/common/ep/ep_backend.cpp @@ -82,9 +82,13 @@ void EPBackend::validate_config(const NVTEEpGroupConfig& config) { NVTE_CHECK(config.max_recv_tokens_per_rank > 0, "max_recv_tokens_per_rank must be positive, got ", config.max_recv_tokens_per_rank); NVTE_CHECK(config.hidden_dim > 0, "hidden_dim must be positive, got ", config.hidden_dim); - NVTE_CHECK(config.hidden_dim * sizeof(nv_bfloat16) >= 16, - "hidden_dim * 2 must be >= 16 (NCCL EP 16B row alignment); got hidden_dim=", - config.hidden_dim); + NVTE_CHECK(config.token_dtype >= 0 && config.token_dtype < kNVTENumTypes, + "token_dtype out of range, got ", static_cast(config.token_dtype)); + const size_t elem_bytes = typeToSize(static_cast(config.token_dtype)); + NVTE_CHECK(config.hidden_dim * elem_bytes >= 16, + "hidden_dim * sizeof(token_dtype) must be >= 16 (NCCL EP 16B row alignment); " + "got hidden_dim=", + config.hidden_dim, ", element_bytes=", elem_bytes); NVTE_CHECK(config.num_experts % config.ep_size == 0, "num_experts (", config.num_experts, ") must be divisible by ep_size (", config.ep_size, ")"); NVTE_CHECK(config.max_num_sms >= 0, "max_num_sms must be >= 0 (0 = auto), got ", @@ -214,7 +218,8 @@ void EPBackend::init(ncclComm_t ep_comm, NVTEEpGroupConfig group_config) { cfg.algorithm = NCCL_EP_ALGO_HIGH_THROUGHPUT; cfg.num_experts = static_cast(group_config.num_experts); cfg.max_dispatch_tokens_per_rank = static_cast(group_config.max_tokens_per_rank); - cfg.max_token_bytes = 
static_cast(group_config.hidden_dim * sizeof(nv_bfloat16)); + const size_t elem_bytes = typeToSize(static_cast(group_config.token_dtype)); + cfg.max_token_bytes = static_cast(group_config.hidden_dim * elem_bytes); cfg.rdma_buffer_size = NCCL_EP_AUTO; cfg.num_qp_per_rank = NCCL_EP_AUTO; cfg.num_channels = NCCL_EP_AUTO; @@ -341,6 +346,10 @@ void EPBackend::dispatch(uint64_t handle_id, void* handle_mem, const NVTETensor NVTEShape tok_shape = nvte_tensor_shape(tokens); NVTEDType tok_dtype = nvte_tensor_type(tokens); + NVTE_CHECK(tok_dtype == group_config_.token_dtype, + "tokens dtype (", static_cast(tok_dtype), + ") does not match group token_dtype (", + static_cast(group_config_.token_dtype), ")"); const size_t num_tokens = tok_shape.data[0]; const size_t hidden_dim = tok_shape.data[1]; @@ -367,6 +376,10 @@ void EPBackend::dispatch(uint64_t handle_id, void* handle_mem, const NVTETensor NVTEShape recv_shape = nvte_tensor_shape(recv_tokens); NVTEDType recv_dtype = nvte_tensor_type(recv_tokens); + NVTE_CHECK(recv_dtype == group_config_.token_dtype, + "recv_tokens dtype (", static_cast(recv_dtype), + ") does not match group token_dtype (", + static_cast(group_config_.token_dtype), ")"); size_t recv_sizes[2] = {recv_shape.data[0], recv_shape.data[1]}; ncclEpTensor_t nccl_tokens_out = make_payload_tensor(recv_tokens, recv_tokens_win, 2, diff --git a/transformer_engine/common/include/transformer_engine/ep.h b/transformer_engine/common/include/transformer_engine/ep.h index 8c3a06b5f0..ac7f1dbf07 100644 --- a/transformer_engine/common/include/transformer_engine/ep.h +++ b/transformer_engine/common/include/transformer_engine/ep.h @@ -35,6 +35,9 @@ typedef struct { int max_num_sms; /*!< Max SMs for EP kernels. 0 = auto. */ /*! 0 (default): throw on relocated handle_mem for a cached handle_id. 1: silently rebuild. */ int allow_handle_mem_reloc; + /*! Token dtype for this EP group. 
Sizes NCCL EP staging buffers at group + * create and is enforced against tensors passed to nvte_ep_dispatch. */ + NVTEDType token_dtype; } NVTEEpGroupConfig; /*! \brief Per-layer EP configuration. */ From eb833421a5f79c7699347ac244ac6cbb338d2210 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 15:31:47 -0700 Subject: [PATCH 07/21] Expert Parallelism: document ep_comm lifetime, v0.1 single-GPU scope, static layer registration Signed-off-by: Phuong Nguyen --- .../common/include/transformer_engine/ep.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/transformer_engine/common/include/transformer_engine/ep.h b/transformer_engine/common/include/transformer_engine/ep.h index ac7f1dbf07..a1c9305e9b 100644 --- a/transformer_engine/common/include/transformer_engine/ep.h +++ b/transformer_engine/common/include/transformer_engine/ep.h @@ -54,8 +54,13 @@ typedef struct { /*! \brief Bootstrap from an existing NCCL EP sub-communicator. Requires SM>=90. * * ep_comm is borrowed and must span exactly group_config.ep_size ranks. + * The caller retains ownership and must keep ep_comm alive until + * nvte_ep_shutdown() returns; destroying it earlier is undefined behavior. * Re-init after shutdown is allowed; double-init throws. * + * v0.1 scope: one EP group per process, bound to the current CUDA device at + * initialize time. Multiple GPUs per process are not supported. + * * \param[in] ep_comm Opaque ncclComm_t for the EP sub-group. * \param[in] group_config Group-level EP configuration. */ @@ -69,6 +74,11 @@ void nvte_ep_shutdown(void); /*! \brief Reserve a handle_id for a layer config and report the handle_mem buffer * size the caller must allocate. Host-only. * + * Registration is intended to be static (once per layer at model init). There is + * no per-layer unregister API; all registrations are released by nvte_ep_shutdown. 
+ * Re-registering the same layer config each step is not supported and will + * eventually exhaust the handle cache (NVTE_EP_HANDLE_CACHE_SIZE, default 8192). + * * \param[in] layer_config Per-layer EP configuration. * \param[out] handle_mem_size Bytes the caller must allocate for handle_mem. * \return uint64_t handle_id (non-zero). From 20b32f4038eac7fffe6a2bce7b31be32601a25df Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 15:32:48 -0700 Subject: [PATCH 08/21] Expert Parallelism: drop version label from initialize scope note Signed-off-by: Phuong Nguyen --- transformer_engine/common/include/transformer_engine/ep.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/common/include/transformer_engine/ep.h b/transformer_engine/common/include/transformer_engine/ep.h index a1c9305e9b..a3a306a5bc 100644 --- a/transformer_engine/common/include/transformer_engine/ep.h +++ b/transformer_engine/common/include/transformer_engine/ep.h @@ -58,8 +58,8 @@ typedef struct { * nvte_ep_shutdown() returns; destroying it earlier is undefined behavior. * Re-init after shutdown is allowed; double-init throws. * - * v0.1 scope: one EP group per process, bound to the current CUDA device at - * initialize time. Multiple GPUs per process are not supported. + * One EP group per process, bound to the current CUDA device at initialize + * time. Multiple GPUs per process are not supported. * * \param[in] ep_comm Opaque ncclComm_t for the EP sub-group. * \param[in] group_config Group-level EP configuration. 
From 78e17dc8443f3dba516143c6f940445a3e200534 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 15:39:39 -0700 Subject: [PATCH 09/21] Common: add NVTE_CHECK_NCCL macro and use it in EP tests Signed-off-by: Phuong Nguyen --- tests/cpp_distributed/test_ep_common.h | 39 +----- tests/cpp_distributed/test_ep_coverage.cu | 92 +++++++------- tests/cpp_distributed/test_ep_pipeline.cu | 144 +++++++++++----------- transformer_engine/common/util/logging.h | 8 ++ 4 files changed, 129 insertions(+), 154 deletions(-) diff --git a/tests/cpp_distributed/test_ep_common.h b/tests/cpp_distributed/test_ep_common.h index ccb20ee3a0..fa3b3cc1c4 100644 --- a/tests/cpp_distributed/test_ep_common.h +++ b/tests/cpp_distributed/test_ep_common.h @@ -28,40 +28,7 @@ #include #include #include - -// ── Error-checking macros ───────────────────────────────────────────────────── - -#define CHECK_NCCL(expr) \ - do { \ - ncclResult_t _err = (expr); \ - if (_err != ncclSuccess) \ - FAIL() << "NCCL error " << _err << ": " << ncclGetErrorString(_err); \ - } while (false) - -#define CHECK_CUDA(expr) \ - do { \ - cudaError_t _err = (expr); \ - if (_err != cudaSuccess) \ - FAIL() << "CUDA error " << _err << ": " << cudaGetErrorString(_err); \ - } while (false) - -#define ASSERT_CUDA_OK(expr) \ - do { \ - cudaError_t _err = (expr); \ - if (_err != cudaSuccess) { \ - fprintf(stderr, "CUDA error %d: %s\n", _err, cudaGetErrorString(_err)); \ - exit(EXIT_FAILURE); \ - } \ - } while (false) - -#define ASSERT_NCCL_OK(expr) \ - do { \ - ncclResult_t _err = (expr); \ - if (_err != ncclSuccess) { \ - fprintf(stderr, "NCCL error %d: %s\n", _err, ncclGetErrorString(_err)); \ - exit(EXIT_FAILURE); \ - } \ - } while (false) +#include "util/logging.h" // ── Process-level state ─────────────────────────────────────────────────────── @@ -186,7 +153,7 @@ static void exchange_unique_id(ncclUniqueId* uid) { const size_t sz = sizeof(ncclUniqueId); if (g_process_id == 0) { - 
ASSERT_NCCL_OK(ncclGetUniqueId(uid)); + NVTE_CHECK_NCCL(ncclGetUniqueId(uid)); FILE* f = fopen(g_uid_file.c_str(), "wb"); if (!f) { fprintf(stderr, "Cannot open uid file: %s\n", g_uid_file.c_str()); exit(EXIT_FAILURE); } fwrite(uid, 1, sz, f); @@ -284,7 +251,7 @@ static bool ep_bootstrap(int argc, char* argv[]) { group_config.hidden_dim = g_hidden_dim; group_config.token_dtype = g_token_dtype; - ASSERT_NCCL_OK(ncclCommInitRank(&g_ep_comm, g_num_processes, uid, g_process_id)); + NVTE_CHECK_NCCL(ncclCommInitRank(&g_ep_comm, g_num_processes, uid, g_process_id)); nvte_ep_initialize(static_cast(g_ep_comm), group_config); if (g_process_id == 0) { diff --git a/tests/cpp_distributed/test_ep_coverage.cu b/tests/cpp_distributed/test_ep_coverage.cu index e9e532386c..2a67bbae0f 100644 --- a/tests/cpp_distributed/test_ep_coverage.cu +++ b/tests/cpp_distributed/test_ep_coverage.cu @@ -122,16 +122,16 @@ TEST_F(MultiHandleAllocTest, TwoHandlesCoexist) { std::vector h_w(num_tokens * top_k, 1.0f / top_k); auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.5f); for (Bundle* x : {&a, &b}) { - CHECK_CUDA(cudaMemcpy(x->topk_idx.get(), h_idx.data(), + NVTE_CHECK_CUDA(cudaMemcpy(x->topk_idx.get(), h_idx.data(), h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(x->topk_weights.get(), h_w.data(), + NVTE_CHECK_CUDA(cudaMemcpy(x->topk_weights.get(), h_w.data(), h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(x->tokens.get(), h_tok.data(), + NVTE_CHECK_CUDA(cudaMemcpy(x->tokens.get(), h_tok.data(), h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); } cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); ASSERT_NE(a.handle_id, b.handle_id); @@ -156,12 +156,12 @@ TEST_F(MultiHandleAllocTest, TwoHandlesCoexist) { }; run_one(a); run_one(b); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); // Both round-trips must produce 
result == top_k * 0.5 = 1.0. for (Bundle* x : {&a, &b}) { std::vector h_res(num_tokens * hidden_dim_); - CHECK_CUDA(cudaMemcpy(h_res.data(), x->result.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_res.data(), x->result.get(), h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; for (int t = 0; t < num_tokens; ++t) @@ -169,7 +169,7 @@ TEST_F(MultiHandleAllocTest, TwoHandlesCoexist) { EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), static_cast(top_k) * 0.5f, 1e-2f); } - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // ============================================================================= @@ -186,11 +186,11 @@ TEST_F(TopK1Test, RoundTrip) { num_experts_, num_local_experts_); std::vector h_w(num_tokens * top_k, 1.0f); // top_k=1: weight is unity auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.25f); - CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), @@ -211,7 +211,7 @@ TEST_F(TopK1Test, RoundTrip) { {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); NVTEEpHandle h{b.handle_id, handle_mem_t.tensor}; ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, @@ -222,11 +222,11 @@ TEST_F(TopK1Test, RoundTrip) { recv_w_t.tensor, NVTECommWindow{}, stream)); ASSERT_NO_THROW(nvte_ep_combine(h, 
recv_tokens_t.tensor, NVTECommWindow{}, result_t.tensor, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); // top_k=1: combine is unweighted gather, so result[t] == tokens[t]. std::vector h_res(num_tokens * hidden_dim_); - CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; for (int t = 0; t < num_tokens; ++t) @@ -234,7 +234,7 @@ TEST_F(TopK1Test, RoundTrip) { EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), 0.25f, 1e-2f) << "tok " << t << " hidden " << p; - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // ============================================================================= @@ -259,11 +259,11 @@ TEST_P(EmptyExpertsTest, RoundTripCorrect) { std::vector h_w(num_tokens * top_k, 1.0f / top_k); auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.3f); - CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), @@ -284,7 +284,7 @@ TEST_P(EmptyExpertsTest, RoundTripCorrect) { {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); NVTEEpHandle h{b.handle_id, handle_mem_t.tensor}; ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, @@ 
-295,11 +295,11 @@ TEST_P(EmptyExpertsTest, RoundTripCorrect) { recv_w_t.tensor, NVTECommWindow{}, stream)); ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens_t.tensor, NVTECommWindow{}, result_t.tensor, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); // Identity expert + uniform weights: result[t] == top_k * tokens[t]. std::vector h_res(num_tokens * hidden_dim_); - CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); const float expected = static_cast(top_k) * 0.3f; const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; @@ -308,7 +308,7 @@ TEST_P(EmptyExpertsTest, RoundTripCorrect) { EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), expected, 1e-2f) << "alignment=" << alignment << " tok=" << t << " hidden=" << p; - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } INSTANTIATE_TEST_SUITE_P(Alignments, EmptyExpertsTest, @@ -326,7 +326,7 @@ TEST_F(NegativeTests, AlignmentMismatchThrows) { Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, num_experts_, num_local_experts_); - CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), @@ -337,12 +337,12 @@ TEST_F(NegativeTests, AlignmentMismatchThrows) { {b.handle_mem_size}, kNVTEByte); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); NVTEEpHandle h{b.handle_id, handle_mem_t.tensor}; EXPECT_THROW(nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, /*alignment=*/16, stream), std::exception); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } 
TEST_F(NegativeTests, NullHandleMemThrows) { @@ -350,7 +350,7 @@ TEST_F(NegativeTests, NullHandleMemThrows) { Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, num_experts_, num_local_experts_); - CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), @@ -361,12 +361,12 @@ TEST_F(NegativeTests, NullHandleMemThrows) { auto null_hm_t = make_nvte_tensor(nullptr, {b.handle_mem_size}, kNVTEByte); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); NVTEEpHandle h{b.handle_id, null_hm_t.tensor}; EXPECT_THROW(nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, /*alignment=*/0, stream), std::exception); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // ============================================================================= @@ -434,15 +434,15 @@ TEST_F(HandleCacheTest, ReuseSameMemSucceeds) { num_experts_, num_local_experts_); std::vector h_w(num_tokens * top_k, 1.0f / top_k); auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.5f); - CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); // Two consecutive round-trips on the same 
handle_mem ptr: first opens the // cached handle, second hits the cache. Both must succeed and be correct. @@ -451,10 +451,10 @@ TEST_F(HandleCacheTest, ReuseSameMemSucceeds) { num_local_experts_, hidden_dim_, /*alignment=*/0, stream)); } - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector h_res(num_tokens * hidden_dim_); - CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; for (int t = 0; t < num_tokens; ++t) @@ -462,7 +462,7 @@ TEST_F(HandleCacheTest, ReuseSameMemSucceeds) { EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), static_cast(top_k) * 0.5f, 1e-2f); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } TEST_F(HandleCacheTest, RelocDefaultThrows) { @@ -475,7 +475,7 @@ TEST_F(HandleCacheTest, RelocDefaultThrows) { auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, num_experts_, num_local_experts_); - CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), @@ -488,19 +488,19 @@ TEST_F(HandleCacheTest, RelocDefaultThrows) { {b.handle_mem_size}, kNVTEByte); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); // First prepare seeds the cache. NVTEEpHandle h1{b.handle_id, hm1_t.tensor}; ASSERT_NO_THROW(nvte_ep_prepare(h1, topk_idx_t.tensor, token_counts_t.tensor, /*alignment=*/0, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); // Same handle_id with a different handle_mem ptr must throw. 
NVTEEpHandle h2{b.handle_id, hm2_t.tensor}; EXPECT_THROW(nvte_ep_prepare(h2, topk_idx_t.tensor, token_counts_t.tensor, /*alignment=*/0, stream), std::exception); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } TEST_F(HandleCacheTest, RelocAllowedRebuilds) { @@ -519,29 +519,29 @@ TEST_F(HandleCacheTest, RelocAllowedRebuilds) { num_experts_, num_local_experts_); std::vector h_w(num_tokens * top_k, 1.0f / top_k); auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.5f); - CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), + NVTE_CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); // First on the original handle_mem. ASSERT_NO_THROW(run_round_trip(b, b.handle_mem.get(), num_tokens, top_k, num_local_experts_, hidden_dim_, /*alignment=*/0, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); // Then on the relocated handle_mem — must trigger silent rebuild, not throw. 
ASSERT_NO_THROW(run_round_trip(b, alt_hm.get(), num_tokens, top_k, num_local_experts_, hidden_dim_, /*alignment=*/0, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector h_res(num_tokens * hidden_dim_); - CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; for (int t = 0; t < num_tokens; ++t) @@ -549,7 +549,7 @@ TEST_F(HandleCacheTest, RelocAllowedRebuilds) { EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), static_cast(top_k) * 0.5f, 1e-2f); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // ── main ────────────────────────────────────────────────────────────────────── diff --git a/tests/cpp_distributed/test_ep_pipeline.cu b/tests/cpp_distributed/test_ep_pipeline.cu index 41f83a6d11..299441997a 100644 --- a/tests/cpp_distributed/test_ep_pipeline.cu +++ b/tests/cpp_distributed/test_ep_pipeline.cu @@ -222,11 +222,11 @@ class EpOpTestBase : public ::testing::Test { std::vector h_w(num_tokens_ * top_k_, 1.0f / top_k_); auto h_tok = generate_tokens(rank, num_tokens_, hidden_dim_); - CHECK_CUDA(cudaMemcpy(buf.topk_idx.get(), h_idx.data(), + NVTE_CHECK_CUDA(cudaMemcpy(buf.topk_idx.get(), h_idx.data(), h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(buf.topk_weights.get(), h_w.data(), + NVTE_CHECK_CUDA(cudaMemcpy(buf.topk_weights.get(), h_w.data(), h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); - CHECK_CUDA(cudaMemcpy(buf.tokens.get(), h_tok.data(), + NVTE_CHECK_CUDA(cudaMemcpy(buf.tokens.get(), h_tok.data(), h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); } @@ -234,10 +234,10 @@ class EpOpTestBase : public ::testing::Test { return NVTEEpLayerConfig{num_local_experts_, top_k_, alignment}; } - // ASSERT_CUDA_OK (fprintf+exit) so 
this non-void helper stays legal. + // NVTE_CHECK_CUDA throws on error (no gtest FAIL() return) so this non-void helper stays legal. int read_total_recv(const EPBuffers& buf) const { std::vector cnt(num_local_experts_); - ASSERT_CUDA_OK(cudaMemcpy(cnt.data(), buf.token_counts.get(), + NVTE_CHECK_CUDA(cudaMemcpy(cnt.data(), buf.token_counts.get(), num_local_experts_ * sizeof(int32_t), cudaMemcpyDeviceToHost)); int total = 0; for (int c : cnt) total += c; @@ -258,10 +258,10 @@ TEST_F(EPDispatchTest, PrepareAndDispatch) { upload_inputs(buf); EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); - CHECK_CUDA(cudaMemset(buf.recv_tokens.get(), 0, buf.recv_tokens.bytes())); + NVTE_CHECK_CUDA(cudaMemset(buf.recv_tokens.get(), 0, buf.recv_tokens.bytes())); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); @@ -269,11 +269,11 @@ TEST_F(EPDispatchTest, PrepareAndDispatch) { t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, NVTECommWindow{}, t.recv_tokens.tensor, NVTECommWindow{}, t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); // 1. Per-expert counts. std::vector got_counts(num_local_experts_); - CHECK_CUDA(cudaMemcpy(got_counts.data(), buf.token_counts.get(), + NVTE_CHECK_CUDA(cudaMemcpy(got_counts.data(), buf.token_counts.get(), num_local_experts_ * sizeof(int32_t), cudaMemcpyDeviceToHost)); auto exp_counts = expected_token_counts(g_process_id, g_num_processes, num_tokens_, top_k_, num_experts_, num_local_experts_); @@ -288,7 +288,7 @@ TEST_F(EPDispatchTest, PrepareAndDispatch) { // 2. Recv values: read only the filled prefix per local-expert zone, not the // whole recv buffer — avoids false positives from legitimate-zero token values.
std::vector h_recv(buf.recv_capacity * hidden_dim_); - CHECK_CUDA(cudaMemcpy(h_recv.data(), buf.recv_tokens.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_recv.data(), buf.recv_tokens.get(), h_recv.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); std::vector got_vals; @@ -312,7 +312,7 @@ TEST_F(EPDispatchTest, PrepareAndDispatch) { // 3. recv_topk_weights: every filled slot must equal the per-token weight (1/top_k). std::vector h_w(buf.recv_capacity); - CHECK_CUDA(cudaMemcpy(h_w.data(), buf.recv_topk_weights.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_w.data(), buf.recv_topk_weights.get(), h_w.size() * sizeof(float), cudaMemcpyDeviceToHost)); const float exp_w = 1.0f / static_cast(top_k_); for (int i = 0; i < total_recv; ++i) @@ -321,7 +321,7 @@ TEST_F(EPDispatchTest, PrepareAndDispatch) { if (g_process_id == 0) printf(" PrepareAndDispatch: passed (recv=%d, values + weights exact)\n", total_recv); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // ============================================================================= @@ -338,7 +338,7 @@ TEST_F(EPCombineTest, Combine) { EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); @@ -348,10 +348,10 @@ TEST_F(EPCombineTest, Combine) { t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, NVTECommWindow{}, t.result.tensor, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector h_result(num_tokens_ * hidden_dim_); - CHECK_CUDA(cudaMemcpy(h_result.data(), buf.result.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_result.data(), 
buf.result.get(), h_result.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); auto h_tok = generate_tokens(g_process_id, num_tokens_, hidden_dim_); // Spot-check 3 hidden-dim positions per token to catch partial-row writes. @@ -368,7 +368,7 @@ TEST_F(EPCombineTest, Combine) { if (g_process_id == 0) printf(" Combine: passed (result == top_k * tokens)\n"); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // ============================================================================= @@ -385,7 +385,7 @@ TEST_F(EPCombineBwdTest, CombineBwdCheck) { EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); @@ -397,22 +397,22 @@ TEST_F(EPCombineBwdTest, CombineBwdCheck) { t.result.tensor, stream)); std::vector h_grad_r(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); - CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad_r.data(), + NVTE_CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad_r.data(), h_grad_r.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); + NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_result.tensor, NVTECommWindow{}, t.grad_expert.tensor, NVTECommWindow{}, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); int total_recv = read_total_recv(buf); std::vector cnt(num_local_experts_); - CHECK_CUDA(cudaMemcpy(cnt.data(), buf.token_counts.get(), + NVTE_CHECK_CUDA(cudaMemcpy(cnt.data(), buf.token_counts.get(), 
num_local_experts_ * sizeof(int32_t), cudaMemcpyDeviceToHost)); std::vector h_ge(buf.recv_capacity * hidden_dim_); - CHECK_CUDA(cudaMemcpy(h_ge.data(), buf.grad_expert.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_ge.data(), buf.grad_expert.get(), h_ge.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); // Walk filled slots by per-expert zone (no v != 0 heuristic). @@ -432,7 +432,7 @@ TEST_F(EPCombineBwdTest, CombineBwdCheck) { if (g_process_id == 0) printf(" CombineBwdCheck: passed (filled=%d)\n", filled); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // ============================================================================= @@ -449,7 +449,7 @@ TEST_F(EPDispatchBwdTest, DispatchBwdCheck) { EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); @@ -461,22 +461,22 @@ TEST_F(EPDispatchBwdTest, DispatchBwdCheck) { t.result.tensor, stream)); std::vector h_grad(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); - CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad.data(), + NVTE_CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad.data(), h_grad.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); - CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); - CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); + NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); + NVTE_CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); + 
NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_result.tensor, NVTECommWindow{}, t.grad_expert.tensor, NVTECommWindow{}, stream)); ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_expert.tensor, NVTECommWindow{}, t.g_recv_topk_weights.tensor, NVTECommWindow{}, t.grad_tokens.tensor, t.grad_topk_weights.tensor, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector h_gt(num_tokens_ * hidden_dim_); - CHECK_CUDA(cudaMemcpy(h_gt.data(), buf.grad_tokens.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_gt.data(), buf.grad_tokens.get(), h_gt.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); const float kExpGrad = static_cast(top_k_) * 0.1f; for (int tok = 0; tok < num_tokens_; ++tok) @@ -486,7 +486,7 @@ TEST_F(EPDispatchBwdTest, DispatchBwdCheck) { if (g_process_id == 0) printf(" DispatchBwdCheck: passed (grad_tokens == %.2f)\n", kExpGrad); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // ============================================================================= @@ -508,15 +508,15 @@ TEST_F(EPDispatchBwdGradWeightsTest, RoundTrip) { for (int k = 0; k < top_k_; ++k) h_w[tok * top_k_ + k] = 0.1f + 0.01f * tok + 0.001f * k + 0.0001f * (g_process_id + 1); - CHECK_CUDA(cudaMemcpy(buf.topk_weights.get(), h_w.data(), + NVTE_CHECK_CUDA(cudaMemcpy(buf.topk_weights.get(), h_w.data(), h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); - CHECK_CUDA(cudaMemsetAsync(buf.recv_topk_weights.get(), 0, + 
NVTE_CHECK_CUDA(cudaMemsetAsync(buf.recv_topk_weights.get(), 0, buf.recv_topk_weights.bytes(), stream)); ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, @@ -526,10 +526,10 @@ TEST_F(EPDispatchBwdGradWeightsTest, RoundTrip) { // Sentinel: NaN so any (t, k) the bwd kernel fails to write is immediately visible. std::vector h_nan(num_tokens_ * top_k_, std::numeric_limits::quiet_NaN()); - CHECK_CUDA(cudaMemcpyAsync(buf.grad_topk_weights.get(), h_nan.data(), + NVTE_CHECK_CUDA(cudaMemcpyAsync(buf.grad_topk_weights.get(), h_nan.data(), h_nan.size() * sizeof(float), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); + NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); // g_recv_topk_weights := recv_topk_weights (the round-trip input). auto g_recv_t = make_nvte_tensor(buf.recv_topk_weights.get(), @@ -537,10 +537,10 @@ TEST_F(EPDispatchBwdGradWeightsTest, RoundTrip) { ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_expert.tensor, NVTECommWindow{}, g_recv_t.tensor, NVTECommWindow{}, t.grad_tokens.tensor, t.grad_topk_weights.tensor, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector h_grad_w(num_tokens_ * top_k_); - CHECK_CUDA(cudaMemcpy(h_grad_w.data(), buf.grad_topk_weights.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_grad_w.data(), buf.grad_topk_weights.get(), h_grad_w.size() * sizeof(float), cudaMemcpyDeviceToHost)); const float kTol = 1e-5f; @@ -566,7 +566,7 @@ TEST_F(EPDispatchBwdGradWeightsTest, RoundTrip) { if (g_process_id == 0 && errs == 0 && k0_eq_k1 == 0) printf(" RoundTrip: passed (%d (t, k) gradients)\n", num_tokens_ * top_k_); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // 
============================================================================= @@ -583,7 +583,7 @@ TEST_F(EPPipelineTest, FullForwardBackward) { EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); @@ -595,26 +595,26 @@ TEST_F(EPPipelineTest, FullForwardBackward) { t.result.tensor, stream)); std::vector h_grad(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); - CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad.data(), + NVTE_CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad.data(), h_grad.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); - CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); - CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); + NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); + NVTE_CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); + NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_result.tensor, NVTECommWindow{}, t.grad_expert.tensor, NVTECommWindow{}, stream)); ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_expert.tensor, NVTECommWindow{}, t.g_recv_topk_weights.tensor, NVTECommWindow{}, t.grad_tokens.tensor, t.grad_topk_weights.tensor, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); 
ASSERT_TRUE(check_no_nan_inf(buf.result.get(), num_tokens_ * hidden_dim_, "result")); ASSERT_TRUE(check_no_nan_inf(buf.grad_tokens.get(), num_tokens_ * hidden_dim_, "grad_tokens")); if (g_process_id == 0) printf(" FullForwardBackward: passed\n"); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // ============================================================================= @@ -646,9 +646,9 @@ struct SymmBuf { void alloc(size_t n_bytes) { bytes = n_bytes; - ASSERT_NCCL_OK(ncclMemAlloc(&ptr, bytes)); - CHECK_CUDA(cudaMemset(ptr, 0, bytes)); - ASSERT_NCCL_OK(ncclCommWindowRegister(g_ep_comm, ptr, bytes, &win, + NVTE_CHECK_NCCL(ncclMemAlloc(&ptr, bytes)); + NVTE_CHECK_CUDA(cudaMemset(ptr, 0, bytes)); + NVTE_CHECK_NCCL(ncclCommWindowRegister(g_ep_comm, ptr, bytes, &win, NCCL_WIN_COLL_SYMMETRIC)); } }; @@ -673,7 +673,7 @@ TEST_F(EPZeroCopyTest, IdentityAllSymm) { EPTensors ref_t(ref_buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t ref_hid = ref_buf.handle_id; ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{ref_hid, ref_t.handle_mem.tensor}, ref_t.topk_idx.tensor, ref_t.token_counts.tensor, /*alignment=*/0, stream)); @@ -683,13 +683,13 @@ TEST_F(EPZeroCopyTest, IdentityAllSymm) { ref_t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{ref_hid, ref_t.handle_mem.tensor}, ref_t.recv_tokens.tensor, NVTECommWindow{}, ref_t.result.tensor, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector ref_recv(ref_buf.recv_capacity * hidden_dim_); std::vector ref_result(num_tokens_ * hidden_dim_); - CHECK_CUDA(cudaMemcpy(ref_recv.data(), ref_buf.recv_tokens.get(), + NVTE_CHECK_CUDA(cudaMemcpy(ref_recv.data(), ref_buf.recv_tokens.get(), ref_recv.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); - 
CHECK_CUDA(cudaMemcpy(ref_result.data(), ref_buf.result.get(), + NVTE_CHECK_CUDA(cudaMemcpy(ref_result.data(), ref_buf.result.get(), ref_result.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); // Symm-mem run: tokens, recv_tokens, combine_input (== recv_tokens) all symm. @@ -704,7 +704,7 @@ TEST_F(EPZeroCopyTest, IdentityAllSymm) { // Stage same tokens into the symm-mem input. auto h_tok = generate_tokens(g_process_id, num_tokens_, hidden_dim_); - CHECK_CUDA(cudaMemcpy(sym_tokens.ptr, h_tok.data(), + NVTE_CHECK_CUDA(cudaMemcpy(sym_tokens.ptr, h_tok.data(), h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); EPTensors sym_t(sym_buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); @@ -723,13 +723,13 @@ TEST_F(EPZeroCopyTest, IdentityAllSymm) { sym_t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{sym_hid, sym_t.handle_mem.tensor}, sym_t.recv_tokens.tensor, symm_window(sym_recv), sym_t.result.tensor, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector sym_recv_host(sym_buf.recv_capacity * hidden_dim_); std::vector sym_result(num_tokens_ * hidden_dim_); - CHECK_CUDA(cudaMemcpy(sym_recv_host.data(), sym_recv.ptr, + NVTE_CHECK_CUDA(cudaMemcpy(sym_recv_host.data(), sym_recv.ptr, sym_recv_host.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); - CHECK_CUDA(cudaMemcpy(sym_result.data(), sym_buf.result.get(), + NVTE_CHECK_CUDA(cudaMemcpy(sym_result.data(), sym_buf.result.get(), sym_result.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); // Compare per filled recv slot (HBM ref vs symm) and full result. 
@@ -744,7 +744,7 @@ TEST_F(EPZeroCopyTest, IdentityAllSymm) { if (g_process_id == 0) printf(" IdentityAllSymm: passed (recv_slots=%d, bit-exact vs HBM)\n", total_recv); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // Same buffers, 2 iterations — catches window-lifecycle regressions where the @@ -759,7 +759,7 @@ TEST_F(EPZeroCopyTest, IdentityAllSymmRepeated) { sym_tokens.alloc(num_tokens_ * hidden_dim_ * sizeof(nv_bfloat16)); sym_recv .alloc(buf.recv_capacity * hidden_dim_ * sizeof(nv_bfloat16)); auto h_tok = generate_tokens(g_process_id, num_tokens_, hidden_dim_); - CHECK_CUDA(cudaMemcpy(sym_tokens.ptr, h_tok.data(), + NVTE_CHECK_CUDA(cudaMemcpy(sym_tokens.ptr, h_tok.data(), h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); @@ -769,7 +769,7 @@ TEST_F(EPZeroCopyTest, IdentityAllSymmRepeated) { {buf.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; for (int iter = 0; iter < 2; ++iter) { @@ -781,10 +781,10 @@ TEST_F(EPZeroCopyTest, IdentityAllSymmRepeated) { t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, symm_window(sym_recv), t.result.tensor, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector h_res(num_tokens_ * hidden_dim_); - CHECK_CUDA(cudaMemcpy(h_res.data(), buf.result.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_res.data(), buf.result.get(), h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); for (int tok = 0; tok < num_tokens_; ++tok) { float exp = __bfloat162float(h_tok[tok * hidden_dim_]) * static_cast(top_k_); @@ -796,7 +796,7 @@ TEST_F(EPZeroCopyTest, IdentityAllSymmRepeated) { if (g_process_id == 0) printf(" 
IdentityAllSymmRepeated: passed (2 iters)\n"); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // Full forward+backward with symm-mem on every spec-mandated buffer: @@ -821,7 +821,7 @@ TEST_F(EPZeroCopyTest, DISABLED_FullPipelineSymm) { sym_grad_expert.alloc(buf.recv_capacity * hidden_dim_ * sizeof(nv_bfloat16)); auto h_tok = generate_tokens(g_process_id, num_tokens_, hidden_dim_); - CHECK_CUDA(cudaMemcpy(sym_tokens.ptr, h_tok.data(), + NVTE_CHECK_CUDA(cudaMemcpy(sym_tokens.ptr, h_tok.data(), h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); @@ -835,7 +835,7 @@ TEST_F(EPZeroCopyTest, DISABLED_FullPipelineSymm) { {buf.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); cudaStream_t stream; - CHECK_CUDA(cudaStreamCreate(&stream)); + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); @@ -848,12 +848,12 @@ TEST_F(EPZeroCopyTest, DISABLED_FullPipelineSymm) { symm_window(sym_recv), t.result.tensor, stream)); std::vector h_grad(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); - CHECK_CUDA(cudaMemcpyAsync(sym_grad_result.ptr, h_grad.data(), + NVTE_CHECK_CUDA(cudaMemcpyAsync(sym_grad_result.ptr, h_grad.data(), h_grad.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice, stream)); - CHECK_CUDA(cudaMemsetAsync(sym_grad_expert.ptr, 0, sym_grad_expert.bytes, stream)); - CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); - CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); + NVTE_CHECK_CUDA(cudaMemsetAsync(sym_grad_expert.ptr, 0, sym_grad_expert.bytes, stream)); + NVTE_CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); + 
NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_result.tensor, symm_window(sym_grad_result), t.grad_expert.tensor, @@ -862,13 +862,13 @@ TEST_F(EPZeroCopyTest, DISABLED_FullPipelineSymm) { symm_window(sym_grad_expert), t.g_recv_topk_weights.tensor, NVTECommWindow{}, t.grad_tokens.tensor, t.grad_topk_weights.tensor, stream)); - CHECK_CUDA(cudaStreamSynchronize(stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); ASSERT_TRUE(check_no_nan_inf(buf.result.get(), num_tokens_ * hidden_dim_, "result")); ASSERT_TRUE(check_no_nan_inf(buf.grad_tokens.get(), num_tokens_ * hidden_dim_, "grad_tokens")); std::vector h_gt(num_tokens_ * hidden_dim_); - CHECK_CUDA(cudaMemcpy(h_gt.data(), buf.grad_tokens.get(), + NVTE_CHECK_CUDA(cudaMemcpy(h_gt.data(), buf.grad_tokens.get(), h_gt.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); const float kExpGrad = static_cast(top_k_) * 0.1f; for (int tok = 0; tok < num_tokens_; ++tok) @@ -877,7 +877,7 @@ TEST_F(EPZeroCopyTest, DISABLED_FullPipelineSymm) { if (g_process_id == 0) printf(" FullPipelineSymm: passed\n"); - CHECK_CUDA(cudaStreamDestroy(stream)); + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } // ── main ────────────────────────────────────────────────────────────────────── diff --git a/transformer_engine/common/util/logging.h b/transformer_engine/common/util/logging.h index da8b9b377d..3308bd22e4 100644 --- a/transformer_engine/common/util/logging.h +++ b/transformer_engine/common/util/logging.h @@ -98,6 +98,14 @@ } \ } while (false) +#define NVTE_CHECK_NCCL(expr) \ + do { \ + const ncclResult_t status_NVTE_CHECK_NCCL = (expr); \ + if (status_NVTE_CHECK_NCCL != ncclSuccess) { \ + NVTE_ERROR("NCCL Error: ", ncclGetErrorString(status_NVTE_CHECK_NCCL)); \ + } \ + } while (false) + #ifdef NVTE_WITH_CUBLASMP #define NVTE_CHECK_CUBLASMP(expr) \ From 
11c9a1095e5eea6e7f7e9431e15b304e005528b4 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 15:44:25 -0700 Subject: [PATCH 10/21] Expert Parallelism tests: replace TensorHandle shim with TensorWrapper Signed-off-by: Phuong Nguyen --- tests/cpp_distributed/test_ep_common.h | 51 +--- tests/cpp_distributed/test_ep_coverage.cu | 228 +++++++++--------- tests/cpp_distributed/test_ep_pipeline.cu | 274 +++++++++++----------- 3 files changed, 254 insertions(+), 299 deletions(-) diff --git a/tests/cpp_distributed/test_ep_common.h b/tests/cpp_distributed/test_ep_common.h index fa3b3cc1c4..6b6b56ad51 100644 --- a/tests/cpp_distributed/test_ep_common.h +++ b/tests/cpp_distributed/test_ep_common.h @@ -30,6 +30,9 @@ #include #include "util/logging.h" +using transformer_engine::DType; +using transformer_engine::TensorWrapper; + // ── Process-level state ─────────────────────────────────────────────────────── static int g_process_id = -1; @@ -45,54 +48,6 @@ static NVTEDType g_token_dtype = kNVTEBFloat16; static bool g_ep_initialized = false; static ncclComm_t g_ep_comm = nullptr; // owned by harness, destroyed in ep_teardown -// ── TensorHandle RAII wrapper ───────────────────────────────────────────────── - -// View over a caller-owned device buffer; owns NVTETensor metadata only. Move-only. 
-struct TensorHandle { - NVTETensor tensor = nullptr; - void* dev_ptr = nullptr; - - ~TensorHandle() { - if (tensor) nvte_destroy_tensor(tensor); - } - - TensorHandle() = default; - TensorHandle(const TensorHandle&) = delete; - TensorHandle& operator=(const TensorHandle&) = delete; - - TensorHandle(TensorHandle&& o) noexcept : tensor(o.tensor), dev_ptr(o.dev_ptr) { - o.tensor = nullptr; o.dev_ptr = nullptr; - } - TensorHandle& operator=(TensorHandle&& o) noexcept { - if (this != &o) { - if (tensor) nvte_destroy_tensor(tensor); - tensor = o.tensor; dev_ptr = o.dev_ptr; - o.tensor = nullptr; o.dev_ptr = nullptr; - } - return *this; - } -}; - -static TensorHandle make_nvte_tensor(void* dev_ptr, - const std::vector& shape, - NVTEDType dtype) { - TensorHandle h; - h.dev_ptr = dev_ptr; - h.tensor = nvte_create_tensor(NVTE_DELAYED_TENSOR_SCALING); - - NVTEShape s; - s.ndim = shape.size(); - for (size_t i = 0; i < shape.size(); ++i) s.data[i] = shape[i]; - - NVTEBasicTensor bt; - bt.data_ptr = dev_ptr; - bt.dtype = dtype; - bt.shape = s; - nvte_set_tensor_param_v2(h.tensor, kNVTERowwiseData, &bt, sizeof(bt)); - - return h; -} - // RAII owner for a cudaMalloc'd device buffer; frees on destruction. 
template struct DevBuf { diff --git a/tests/cpp_distributed/test_ep_coverage.cu b/tests/cpp_distributed/test_ep_coverage.cu index 2a67bbae0f..2c21620142 100644 --- a/tests/cpp_distributed/test_ep_coverage.cu +++ b/tests/cpp_distributed/test_ep_coverage.cu @@ -136,23 +136,23 @@ TEST_F(MultiHandleAllocTest, TwoHandlesCoexist) { ASSERT_NE(a.handle_id, b.handle_id); auto run_one = [&](Bundle& x) { - auto topk_idx = make_nvte_tensor(x.topk_idx.get(), {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); - auto topk_weights = make_nvte_tensor(x.topk_weights.get(), {(size_t)num_tokens, (size_t)top_k}, kNVTEFloat32); - auto token_counts = make_nvte_tensor(x.token_counts.get(), {(size_t)num_local_experts_}, kNVTEInt32); - auto handle_mem = make_nvte_tensor(x.handle_mem.get(), {x.handle_mem_size}, kNVTEByte); - auto tokens = make_nvte_tensor(x.tokens.get(), {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); - auto recv_tokens = make_nvte_tensor(x.recv_tokens.get(), {x.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); - auto recv_w = make_nvte_tensor(x.recv_topk_weights.get(), {x.recv_capacity}, kNVTEFloat32); - auto result = make_nvte_tensor(x.result.get(), {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); - NVTEEpHandle h{x.handle_id, handle_mem.tensor}; - ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx.tensor, token_counts.tensor, + auto topk_idx = TensorWrapper(x.topk_idx.get(), {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); + auto topk_weights = TensorWrapper(x.topk_weights.get(), {(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); + auto token_counts = TensorWrapper(x.token_counts.get(), {(size_t)num_local_experts_}, DType::kInt32); + auto handle_mem = TensorWrapper(x.handle_mem.get(), {x.handle_mem_size}, DType::kByte); + auto tokens = TensorWrapper(x.tokens.get(), {(size_t)num_tokens, (size_t)hidden_dim_}, DType::kBFloat16); + auto recv_tokens = TensorWrapper(x.recv_tokens.get(), {x.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); + auto 
recv_w = TensorWrapper(x.recv_topk_weights.get(), {x.recv_capacity}, DType::kFloat32); + auto result = TensorWrapper(x.result.get(), {(size_t)num_tokens, (size_t)hidden_dim_}, DType::kBFloat16); + NVTEEpHandle h{x.handle_id, handle_mem.data()}; + ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx.data(), token_counts.data(), /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(h, topk_idx.tensor, tokens.tensor, - NVTECommWindow{}, topk_weights.tensor, NVTECommWindow{}, - recv_tokens.tensor, NVTECommWindow{}, recv_w.tensor, + ASSERT_NO_THROW(nvte_ep_dispatch(h, topk_idx.data(), tokens.data(), + NVTECommWindow{}, topk_weights.data(), NVTECommWindow{}, + recv_tokens.data(), NVTECommWindow{}, recv_w.data(), NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens.tensor, NVTECommWindow{}, - result.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens.data(), NVTECommWindow{}, + result.data(), stream)); }; run_one(a); run_one(b); @@ -193,35 +193,35 @@ TEST_F(TopK1Test, RoundTrip) { NVTE_CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); - auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); - auto topk_weights_t = make_nvte_tensor(b.topk_weights.get(), - {(size_t)num_tokens, (size_t)top_k}, kNVTEFloat32); - auto token_counts_t = make_nvte_tensor(b.token_counts.get(), - {(size_t)num_local_experts_}, kNVTEInt32); - auto handle_mem_t = make_nvte_tensor(b.handle_mem.get(), - {b.handle_mem_size}, kNVTEByte); - auto tokens_t = make_nvte_tensor(b.tokens.get(), - {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); - auto recv_tokens_t = make_nvte_tensor(b.recv_tokens.get(), - {b.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); - auto recv_w_t = make_nvte_tensor(b.recv_topk_weights.get(), - {b.recv_capacity}, kNVTEFloat32); - auto result_t = make_nvte_tensor(b.result.get(), - {(size_t)num_tokens, (size_t)hidden_dim_}, 
kNVTEBFloat16); + auto topk_idx_t = TensorWrapper(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); + auto topk_weights_t = TensorWrapper(b.topk_weights.get(), + {(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); + auto token_counts_t = TensorWrapper(b.token_counts.get(), + {(size_t)num_local_experts_}, DType::kInt32); + auto handle_mem_t = TensorWrapper(b.handle_mem.get(), + {b.handle_mem_size}, DType::kByte); + auto tokens_t = TensorWrapper(b.tokens.get(), + {(size_t)num_tokens, (size_t)hidden_dim_}, DType::kBFloat16); + auto recv_tokens_t = TensorWrapper(b.recv_tokens.get(), + {b.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); + auto recv_w_t = TensorWrapper(b.recv_topk_weights.get(), + {b.recv_capacity}, DType::kFloat32); + auto result_t = TensorWrapper(b.result.get(), + {(size_t)num_tokens, (size_t)hidden_dim_}, DType::kBFloat16); cudaStream_t stream; NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - NVTEEpHandle h{b.handle_id, handle_mem_t.tensor}; - ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, + NVTEEpHandle h{b.handle_id, handle_mem_t.data()}; + ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx_t.data(), token_counts_t.data(), /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(h, topk_idx_t.tensor, - tokens_t.tensor, NVTECommWindow{}, topk_weights_t.tensor, - NVTECommWindow{}, recv_tokens_t.tensor, NVTECommWindow{}, - recv_w_t.tensor, NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens_t.tensor, - NVTECommWindow{}, result_t.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(h, topk_idx_t.data(), + tokens_t.data(), NVTECommWindow{}, topk_weights_t.data(), + NVTECommWindow{}, recv_tokens_t.data(), NVTECommWindow{}, + recv_w_t.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens_t.data(), + NVTECommWindow{}, result_t.data(), stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); // top_k=1: combine is unweighted gather, so result[t] 
== tokens[t]. @@ -266,35 +266,35 @@ TEST_P(EmptyExpertsTest, RoundTripCorrect) { NVTE_CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); - auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); - auto topk_weights_t = make_nvte_tensor(b.topk_weights.get(), - {(size_t)num_tokens, (size_t)top_k}, kNVTEFloat32); - auto token_counts_t = make_nvte_tensor(b.token_counts.get(), - {(size_t)num_local_experts_}, kNVTEInt32); - auto handle_mem_t = make_nvte_tensor(b.handle_mem.get(), - {b.handle_mem_size}, kNVTEByte); - auto tokens_t = make_nvte_tensor(b.tokens.get(), - {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); - auto recv_tokens_t = make_nvte_tensor(b.recv_tokens.get(), - {b.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); - auto recv_w_t = make_nvte_tensor(b.recv_topk_weights.get(), - {b.recv_capacity}, kNVTEFloat32); - auto result_t = make_nvte_tensor(b.result.get(), - {(size_t)num_tokens, (size_t)hidden_dim_}, kNVTEBFloat16); + auto topk_idx_t = TensorWrapper(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); + auto topk_weights_t = TensorWrapper(b.topk_weights.get(), + {(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); + auto token_counts_t = TensorWrapper(b.token_counts.get(), + {(size_t)num_local_experts_}, DType::kInt32); + auto handle_mem_t = TensorWrapper(b.handle_mem.get(), + {b.handle_mem_size}, DType::kByte); + auto tokens_t = TensorWrapper(b.tokens.get(), + {(size_t)num_tokens, (size_t)hidden_dim_}, DType::kBFloat16); + auto recv_tokens_t = TensorWrapper(b.recv_tokens.get(), + {b.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); + auto recv_w_t = TensorWrapper(b.recv_topk_weights.get(), + {b.recv_capacity}, DType::kFloat32); + auto result_t = TensorWrapper(b.result.get(), + {(size_t)num_tokens, (size_t)hidden_dim_}, DType::kBFloat16); cudaStream_t stream; NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); 
- NVTEEpHandle h{b.handle_id, handle_mem_t.tensor}; - ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, + NVTEEpHandle h{b.handle_id, handle_mem_t.data()}; + ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx_t.data(), token_counts_t.data(), alignment, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(h, topk_idx_t.tensor, - tokens_t.tensor, NVTECommWindow{}, topk_weights_t.tensor, - NVTECommWindow{}, recv_tokens_t.tensor, NVTECommWindow{}, - recv_w_t.tensor, NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens_t.tensor, - NVTECommWindow{}, result_t.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(h, topk_idx_t.data(), + tokens_t.data(), NVTECommWindow{}, topk_weights_t.data(), + NVTECommWindow{}, recv_tokens_t.data(), NVTECommWindow{}, + recv_w_t.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens_t.data(), + NVTECommWindow{}, result_t.data(), stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); // Identity expert + uniform weights: result[t] == top_k * tokens[t]. 
@@ -329,17 +329,17 @@ TEST_F(NegativeTests, AlignmentMismatchThrows) { NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); - auto token_counts_t = make_nvte_tensor(b.token_counts.get(), - {(size_t)num_local_experts_}, kNVTEInt32); - auto handle_mem_t = make_nvte_tensor(b.handle_mem.get(), - {b.handle_mem_size}, kNVTEByte); + auto topk_idx_t = TensorWrapper(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); + auto token_counts_t = TensorWrapper(b.token_counts.get(), + {(size_t)num_local_experts_}, DType::kInt32); + auto handle_mem_t = TensorWrapper(b.handle_mem.get(), + {b.handle_mem_size}, DType::kByte); cudaStream_t stream; NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - NVTEEpHandle h{b.handle_id, handle_mem_t.tensor}; - EXPECT_THROW(nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, + NVTEEpHandle h{b.handle_id, handle_mem_t.data()}; + EXPECT_THROW(nvte_ep_prepare(h, topk_idx_t.data(), token_counts_t.data(), /*alignment=*/16, stream), std::exception); NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); @@ -353,17 +353,17 @@ TEST_F(NegativeTests, NullHandleMemThrows) { NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); - auto token_counts_t = make_nvte_tensor(b.token_counts.get(), - {(size_t)num_local_experts_}, kNVTEInt32); + auto topk_idx_t = TensorWrapper(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); + auto token_counts_t = TensorWrapper(b.token_counts.get(), + {(size_t)num_local_experts_}, DType::kInt32); // Construct a tensor view backed by a null device pointer. 
- auto null_hm_t = make_nvte_tensor(nullptr, {b.handle_mem_size}, kNVTEByte); + auto null_hm_t = TensorWrapper(nullptr, {b.handle_mem_size}, DType::kByte); cudaStream_t stream; NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - NVTEEpHandle h{b.handle_id, null_hm_t.tensor}; - EXPECT_THROW(nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, + NVTEEpHandle h{b.handle_id, null_hm_t.data()}; + EXPECT_THROW(nvte_ep_prepare(h, topk_idx_t.data(), token_counts_t.data(), /*alignment=*/0, stream), std::exception); NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); @@ -386,30 +386,30 @@ static void run_round_trip(B& b, void* handle_mem_data, int num_tokens, int top_k, int num_local_experts, int hidden_dim, size_t alignment, cudaStream_t stream) { - auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); - auto topk_weights_t = make_nvte_tensor(b.topk_weights.get(), - {(size_t)num_tokens, (size_t)top_k}, kNVTEFloat32); - auto token_counts_t = make_nvte_tensor(b.token_counts.get(), - {(size_t)num_local_experts}, kNVTEInt32); - auto handle_mem_t = make_nvte_tensor(handle_mem_data, - {b.handle_mem_size}, kNVTEByte); - auto tokens_t = make_nvte_tensor(b.tokens.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, kNVTEBFloat16); - auto recv_tokens_t = make_nvte_tensor(b.recv_tokens.get(), - {b.recv_capacity, (size_t)hidden_dim}, kNVTEBFloat16); - auto recv_w_t = make_nvte_tensor(b.recv_topk_weights.get(), - {b.recv_capacity}, kNVTEFloat32); - auto result_t = make_nvte_tensor(b.result.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, kNVTEBFloat16); - - NVTEEpHandle h{b.handle_id, handle_mem_t.tensor}; - nvte_ep_prepare(h, topk_idx_t.tensor, token_counts_t.tensor, alignment, stream); - nvte_ep_dispatch(h, topk_idx_t.tensor, tokens_t.tensor, NVTECommWindow{}, - topk_weights_t.tensor, NVTECommWindow{}, - recv_tokens_t.tensor, NVTECommWindow{}, - recv_w_t.tensor, NVTECommWindow{}, stream); - nvte_ep_combine(h, recv_tokens_t.tensor, 
NVTECommWindow{}, result_t.tensor, stream); + auto topk_idx_t = TensorWrapper(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); + auto topk_weights_t = TensorWrapper(b.topk_weights.get(), + {(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); + auto token_counts_t = TensorWrapper(b.token_counts.get(), + {(size_t)num_local_experts}, DType::kInt32); + auto handle_mem_t = TensorWrapper(handle_mem_data, + {b.handle_mem_size}, DType::kByte); + auto tokens_t = TensorWrapper(b.tokens.get(), + {(size_t)num_tokens, (size_t)hidden_dim}, DType::kBFloat16); + auto recv_tokens_t = TensorWrapper(b.recv_tokens.get(), + {b.recv_capacity, (size_t)hidden_dim}, DType::kBFloat16); + auto recv_w_t = TensorWrapper(b.recv_topk_weights.get(), + {b.recv_capacity}, DType::kFloat32); + auto result_t = TensorWrapper(b.result.get(), + {(size_t)num_tokens, (size_t)hidden_dim}, DType::kBFloat16); + + NVTEEpHandle h{b.handle_id, handle_mem_t.data()}; + nvte_ep_prepare(h, topk_idx_t.data(), token_counts_t.data(), alignment, stream); + nvte_ep_dispatch(h, topk_idx_t.data(), tokens_t.data(), NVTECommWindow{}, + topk_weights_t.data(), NVTECommWindow{}, + recv_tokens_t.data(), NVTECommWindow{}, + recv_w_t.data(), NVTECommWindow{}, stream); + nvte_ep_combine(h, recv_tokens_t.data(), NVTECommWindow{}, result_t.data(), stream); } // Re-bootstrap EP backend with a different allow_handle_mem_reloc setting. 
@@ -478,26 +478,26 @@ TEST_F(HandleCacheTest, RelocDefaultThrows) { NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - auto topk_idx_t = make_nvte_tensor(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); - auto token_counts_t = make_nvte_tensor(b.token_counts.get(), - {(size_t)num_local_experts_}, kNVTEInt32); - auto hm1_t = make_nvte_tensor(b.handle_mem.get(), - {b.handle_mem_size}, kNVTEByte); - auto hm2_t = make_nvte_tensor(second_hm.get(), - {b.handle_mem_size}, kNVTEByte); + auto topk_idx_t = TensorWrapper(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); + auto token_counts_t = TensorWrapper(b.token_counts.get(), + {(size_t)num_local_experts_}, DType::kInt32); + auto hm1_t = TensorWrapper(b.handle_mem.get(), + {b.handle_mem_size}, DType::kByte); + auto hm2_t = TensorWrapper(second_hm.get(), + {b.handle_mem_size}, DType::kByte); cudaStream_t stream; NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); // First prepare seeds the cache. - NVTEEpHandle h1{b.handle_id, hm1_t.tensor}; - ASSERT_NO_THROW(nvte_ep_prepare(h1, topk_idx_t.tensor, token_counts_t.tensor, + NVTEEpHandle h1{b.handle_id, hm1_t.data()}; + ASSERT_NO_THROW(nvte_ep_prepare(h1, topk_idx_t.data(), token_counts_t.data(), /*alignment=*/0, stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); // Same handle_id with a different handle_mem ptr must throw. 
- NVTEEpHandle h2{b.handle_id, hm2_t.tensor}; - EXPECT_THROW(nvte_ep_prepare(h2, topk_idx_t.tensor, token_counts_t.tensor, + NVTEEpHandle h2{b.handle_id, hm2_t.data()}; + EXPECT_THROW(nvte_ep_prepare(h2, topk_idx_t.data(), token_counts_t.data(), /*alignment=*/0, stream), std::exception); NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); diff --git a/tests/cpp_distributed/test_ep_pipeline.cu b/tests/cpp_distributed/test_ep_pipeline.cu index 299441997a..91771a6817 100644 --- a/tests/cpp_distributed/test_ep_pipeline.cu +++ b/tests/cpp_distributed/test_ep_pipeline.cu @@ -157,39 +157,39 @@ struct EPBuffers { // Bundled NVTETensor views over an EPBuffers — one place to update the shape // conventions when the C-API evolves. struct EPTensors { - TensorHandle topk_idx, topk_weights, token_counts, handle_mem, tokens; - TensorHandle recv_tokens, recv_topk_weights, result; - TensorHandle grad_result, grad_expert, grad_tokens; - TensorHandle g_recv_topk_weights, grad_topk_weights; + TensorWrapper topk_idx, topk_weights, token_counts, handle_mem, tokens; + TensorWrapper recv_tokens, recv_topk_weights, result; + TensorWrapper grad_result, grad_expert, grad_tokens; + TensorWrapper g_recv_topk_weights, grad_topk_weights; EPTensors(EPBuffers& b, int num_tokens, int top_k, int hidden_dim, int num_local_experts) { - topk_idx = make_nvte_tensor(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, kNVTEInt64); - topk_weights = make_nvte_tensor(b.topk_weights.get(), - {(size_t)num_tokens, (size_t)top_k}, kNVTEFloat32); - token_counts = make_nvte_tensor(b.token_counts.get(), - {(size_t)num_local_experts}, kNVTEInt32); - handle_mem = make_nvte_tensor(b.handle_mem.get(), - {b.handle_mem_size}, kNVTEByte); - tokens = make_nvte_tensor(b.tokens.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, kNVTEBFloat16); - recv_tokens = make_nvte_tensor(b.recv_tokens.get(), - {b.recv_capacity, (size_t)hidden_dim}, kNVTEBFloat16); - recv_topk_weights = make_nvte_tensor(b.recv_topk_weights.get(), - 
{b.recv_capacity}, kNVTEFloat32); - result = make_nvte_tensor(b.result.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, kNVTEBFloat16); - grad_result = make_nvte_tensor(b.grad_result.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, kNVTEBFloat16); - grad_expert = make_nvte_tensor(b.grad_expert.get(), - {b.recv_capacity, (size_t)hidden_dim}, kNVTEBFloat16); - grad_tokens = make_nvte_tensor(b.grad_tokens.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, kNVTEBFloat16); - g_recv_topk_weights = make_nvte_tensor(b.g_recv_topk_weights.get(), - {b.recv_capacity}, kNVTEFloat32); - grad_topk_weights = make_nvte_tensor(b.grad_topk_weights.get(), - {(size_t)num_tokens, (size_t)top_k}, kNVTEFloat32); + topk_idx = TensorWrapper(b.topk_idx.get(), + {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); + topk_weights = TensorWrapper(b.topk_weights.get(), + {(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); + token_counts = TensorWrapper(b.token_counts.get(), + {(size_t)num_local_experts}, DType::kInt32); + handle_mem = TensorWrapper(b.handle_mem.get(), + {b.handle_mem_size}, DType::kByte); + tokens = TensorWrapper(b.tokens.get(), + {(size_t)num_tokens, (size_t)hidden_dim}, DType::kBFloat16); + recv_tokens = TensorWrapper(b.recv_tokens.get(), + {b.recv_capacity, (size_t)hidden_dim}, DType::kBFloat16); + recv_topk_weights = TensorWrapper(b.recv_topk_weights.get(), + {b.recv_capacity}, DType::kFloat32); + result = TensorWrapper(b.result.get(), + {(size_t)num_tokens, (size_t)hidden_dim}, DType::kBFloat16); + grad_result = TensorWrapper(b.grad_result.get(), + {(size_t)num_tokens, (size_t)hidden_dim}, DType::kBFloat16); + grad_expert = TensorWrapper(b.grad_expert.get(), + {b.recv_capacity, (size_t)hidden_dim}, DType::kBFloat16); + grad_tokens = TensorWrapper(b.grad_tokens.get(), + {(size_t)num_tokens, (size_t)hidden_dim}, DType::kBFloat16); + g_recv_topk_weights = TensorWrapper(b.g_recv_topk_weights.get(), + {b.recv_capacity}, DType::kFloat32); + grad_topk_weights = 
TensorWrapper(b.grad_topk_weights.get(), + {(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); } }; @@ -264,11 +264,11 @@ TEST_F(EPDispatchTest, PrepareAndDispatch) { NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, - t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, - NVTECommWindow{}, t.recv_tokens.tensor, NVTECommWindow{}, - t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), t.token_counts.data(), /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), + t.tokens.data(), NVTECommWindow{}, t.topk_weights.data(), + NVTECommWindow{}, t.recv_tokens.data(), NVTECommWindow{}, + t.recv_topk_weights.data(), NVTECommWindow{}, stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); // 1. Per-expert counts. 
@@ -341,13 +341,13 @@ TEST_F(EPCombineTest, Combine) { NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, - t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, - NVTECommWindow{}, t.recv_tokens.tensor, NVTECommWindow{}, - t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, NVTECommWindow{}, - t.result.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), t.token_counts.data(), /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), + t.tokens.data(), NVTECommWindow{}, t.topk_weights.data(), + NVTECommWindow{}, t.recv_tokens.data(), NVTECommWindow{}, + t.recv_topk_weights.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.recv_tokens.data(), NVTECommWindow{}, + t.result.data(), stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector h_result(num_tokens_ * hidden_dim_); @@ -388,13 +388,13 @@ TEST_F(EPCombineBwdTest, CombineBwdCheck) { NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, - t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, - NVTECommWindow{}, t.recv_tokens.tensor, NVTECommWindow{}, - t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); - 
ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, NVTECommWindow{}, - t.result.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), t.token_counts.data(), /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), + t.tokens.data(), NVTECommWindow{}, t.topk_weights.data(), + NVTECommWindow{}, t.recv_tokens.data(), NVTECommWindow{}, + t.recv_topk_weights.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.recv_tokens.data(), NVTECommWindow{}, + t.result.data(), stream)); std::vector h_grad_r(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); NVTE_CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad_r.data(), @@ -402,8 +402,8 @@ TEST_F(EPCombineBwdTest, CombineBwdCheck) { cudaMemcpyHostToDevice, stream)); NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); - ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_result.tensor, NVTECommWindow{}, - t.grad_expert.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_result.data(), NVTECommWindow{}, + t.grad_expert.data(), NVTECommWindow{}, stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); int total_recv = read_total_recv(buf); @@ -452,13 +452,13 @@ TEST_F(EPDispatchBwdTest, DispatchBwdCheck) { NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, - t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, - NVTECommWindow{}, t.recv_tokens.tensor, 
NVTECommWindow{}, - t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, NVTECommWindow{}, - t.result.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), t.token_counts.data(), /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), + t.tokens.data(), NVTECommWindow{}, t.topk_weights.data(), + NVTECommWindow{}, t.recv_tokens.data(), NVTECommWindow{}, + t.recv_topk_weights.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.recv_tokens.data(), NVTECommWindow{}, + t.result.data(), stream)); std::vector h_grad(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); NVTE_CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad.data(), @@ -468,11 +468,11 @@ TEST_F(EPDispatchBwdTest, DispatchBwdCheck) { NVTE_CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); - ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_result.tensor, NVTECommWindow{}, - t.grad_expert.tensor, NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_expert.tensor, NVTECommWindow{}, - t.g_recv_topk_weights.tensor, NVTECommWindow{}, - t.grad_tokens.tensor, t.grad_topk_weights.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_result.data(), NVTECommWindow{}, + t.grad_expert.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_expert.data(), NVTECommWindow{}, + t.g_recv_topk_weights.data(), NVTECommWindow{}, + 
t.grad_tokens.data(), t.grad_topk_weights.data(), stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector h_gt(num_tokens_ * hidden_dim_); @@ -515,13 +515,13 @@ TEST_F(EPDispatchBwdGradWeightsTest, RoundTrip) { NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), t.token_counts.data(), /*alignment=*/0, stream)); NVTE_CHECK_CUDA(cudaMemsetAsync(buf.recv_topk_weights.get(), 0, buf.recv_topk_weights.bytes(), stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, - t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, - NVTECommWindow{}, t.recv_tokens.tensor, NVTECommWindow{}, - t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), + t.tokens.data(), NVTECommWindow{}, t.topk_weights.data(), + NVTECommWindow{}, t.recv_tokens.data(), NVTECommWindow{}, + t.recv_topk_weights.data(), NVTECommWindow{}, stream)); // Sentinel: NaN so any (t, k) the bwd kernel fails to write is immediately visible. std::vector h_nan(num_tokens_ * top_k_, @@ -532,11 +532,11 @@ TEST_F(EPDispatchBwdGradWeightsTest, RoundTrip) { NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); // g_recv_topk_weights := recv_topk_weights (the round-trip input). 
- auto g_recv_t = make_nvte_tensor(buf.recv_topk_weights.get(), - {buf.recv_capacity}, kNVTEFloat32); - ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_expert.tensor, - NVTECommWindow{}, g_recv_t.tensor, NVTECommWindow{}, - t.grad_tokens.tensor, t.grad_topk_weights.tensor, stream)); + auto g_recv_t = TensorWrapper(buf.recv_topk_weights.get(), + {buf.recv_capacity}, DType::kFloat32); + ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_expert.data(), + NVTECommWindow{}, g_recv_t.data(), NVTECommWindow{}, + t.grad_tokens.data(), t.grad_topk_weights.data(), stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector h_grad_w(num_tokens_ * top_k_); @@ -586,13 +586,13 @@ TEST_F(EPPipelineTest, FullForwardBackward) { NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, - t.tokens.tensor, NVTECommWindow{}, t.topk_weights.tensor, - NVTECommWindow{}, t.recv_tokens.tensor, NVTECommWindow{}, - t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, NVTECommWindow{}, - t.result.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), t.token_counts.data(), /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), + t.tokens.data(), NVTECommWindow{}, t.topk_weights.data(), + NVTECommWindow{}, t.recv_tokens.data(), NVTECommWindow{}, + t.recv_topk_weights.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.recv_tokens.data(), 
NVTECommWindow{}, + t.result.data(), stream)); std::vector h_grad(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); NVTE_CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad.data(), @@ -602,11 +602,11 @@ TEST_F(EPPipelineTest, FullForwardBackward) { NVTE_CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); - ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_result.tensor, NVTECommWindow{}, - t.grad_expert.tensor, NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_expert.tensor, NVTECommWindow{}, - t.g_recv_topk_weights.tensor, NVTECommWindow{}, - t.grad_tokens.tensor, t.grad_topk_weights.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_result.data(), NVTECommWindow{}, + t.grad_expert.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_expert.data(), NVTECommWindow{}, + t.g_recv_topk_weights.data(), NVTECommWindow{}, + t.grad_tokens.data(), t.grad_topk_weights.data(), stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); ASSERT_TRUE(check_no_nan_inf(buf.result.get(), num_tokens_ * hidden_dim_, "result")); @@ -676,13 +676,13 @@ TEST_F(EPZeroCopyTest, IdentityAllSymm) { NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t ref_hid = ref_buf.handle_id; - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{ref_hid, ref_t.handle_mem.tensor}, ref_t.topk_idx.tensor, ref_t.token_counts.tensor, /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{ref_hid, ref_t.handle_mem.tensor}, ref_t.topk_idx.tensor, - ref_t.tokens.tensor, NVTECommWindow{}, ref_t.topk_weights.tensor, - NVTECommWindow{}, ref_t.recv_tokens.tensor, NVTECommWindow{}, - 
ref_t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{ref_hid, ref_t.handle_mem.tensor}, ref_t.recv_tokens.tensor, NVTECommWindow{}, - ref_t.result.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{ref_hid, ref_t.handle_mem.data()}, ref_t.topk_idx.data(), ref_t.token_counts.data(), /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{ref_hid, ref_t.handle_mem.data()}, ref_t.topk_idx.data(), + ref_t.tokens.data(), NVTECommWindow{}, ref_t.topk_weights.data(), + NVTECommWindow{}, ref_t.recv_tokens.data(), NVTECommWindow{}, + ref_t.recv_topk_weights.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{ref_hid, ref_t.handle_mem.data()}, ref_t.recv_tokens.data(), NVTECommWindow{}, + ref_t.result.data(), stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector ref_recv(ref_buf.recv_capacity * hidden_dim_); @@ -709,20 +709,20 @@ TEST_F(EPZeroCopyTest, IdentityAllSymm) { EPTensors sym_t(sym_buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); // Replace the tokens/recv_tokens views with ones pointing at the symm buffers. 
- sym_t.tokens = make_nvte_tensor(sym_tokens.ptr, - {(size_t)num_tokens_, (size_t)hidden_dim_}, kNVTEBFloat16); - sym_t.recv_tokens = make_nvte_tensor(sym_recv.ptr, - {sym_buf.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); + sym_t.tokens = TensorWrapper(sym_tokens.ptr, + {(size_t)num_tokens_, (size_t)hidden_dim_}, DType::kBFloat16); + sym_t.recv_tokens = TensorWrapper(sym_recv.ptr, + {sym_buf.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); uint64_t sym_hid = sym_buf.handle_id; - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{sym_hid, sym_t.handle_mem.tensor}, sym_t.topk_idx.tensor, sym_t.token_counts.tensor, /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{sym_hid, sym_t.handle_mem.tensor}, sym_t.topk_idx.tensor, - sym_t.tokens.tensor, symm_window(sym_tokens), - sym_t.topk_weights.tensor, NVTECommWindow{}, - sym_t.recv_tokens.tensor, symm_window(sym_recv), - sym_t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{sym_hid, sym_t.handle_mem.tensor}, sym_t.recv_tokens.tensor, - symm_window(sym_recv), sym_t.result.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{sym_hid, sym_t.handle_mem.data()}, sym_t.topk_idx.data(), sym_t.token_counts.data(), /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{sym_hid, sym_t.handle_mem.data()}, sym_t.topk_idx.data(), + sym_t.tokens.data(), symm_window(sym_tokens), + sym_t.topk_weights.data(), NVTECommWindow{}, + sym_t.recv_tokens.data(), symm_window(sym_recv), + sym_t.recv_topk_weights.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{sym_hid, sym_t.handle_mem.data()}, sym_t.recv_tokens.data(), + symm_window(sym_recv), sym_t.result.data(), stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector sym_recv_host(sym_buf.recv_capacity * hidden_dim_); @@ -763,24 +763,24 @@ TEST_F(EPZeroCopyTest, IdentityAllSymmRepeated) { h_tok.size() * sizeof(nv_bfloat16), 
cudaMemcpyHostToDevice)); EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); - t.tokens = make_nvte_tensor(sym_tokens.ptr, - {(size_t)num_tokens_, (size_t)hidden_dim_}, kNVTEBFloat16); - t.recv_tokens = make_nvte_tensor(sym_recv.ptr, - {buf.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); + t.tokens = TensorWrapper(sym_tokens.ptr, + {(size_t)num_tokens_, (size_t)hidden_dim_}, DType::kBFloat16); + t.recv_tokens = TensorWrapper(sym_recv.ptr, + {buf.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); cudaStream_t stream; NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; for (int iter = 0; iter < 2; ++iter) { - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, - t.tokens.tensor, symm_window(sym_tokens), - t.topk_weights.tensor, NVTECommWindow{}, - t.recv_tokens.tensor, symm_window(sym_recv), - t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, - symm_window(sym_recv), t.result.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), t.token_counts.data(), /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), + t.tokens.data(), symm_window(sym_tokens), + t.topk_weights.data(), NVTECommWindow{}, + t.recv_tokens.data(), symm_window(sym_recv), + t.recv_topk_weights.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.recv_tokens.data(), + symm_window(sym_recv), t.result.data(), stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); std::vector h_res(num_tokens_ * hidden_dim_); @@ -825,27 +825,27 @@ 
TEST_F(EPZeroCopyTest, DISABLED_FullPipelineSymm) { h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); - t.tokens = make_nvte_tensor(sym_tokens.ptr, - {(size_t)num_tokens_, (size_t)hidden_dim_}, kNVTEBFloat16); - t.recv_tokens = make_nvte_tensor(sym_recv.ptr, - {buf.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); - t.grad_result = make_nvte_tensor(sym_grad_result.ptr, - {(size_t)num_tokens_, (size_t)hidden_dim_}, kNVTEBFloat16); - t.grad_expert = make_nvte_tensor(sym_grad_expert.ptr, - {buf.recv_capacity, (size_t)hidden_dim_}, kNVTEBFloat16); + t.tokens = TensorWrapper(sym_tokens.ptr, + {(size_t)num_tokens_, (size_t)hidden_dim_}, DType::kBFloat16); + t.recv_tokens = TensorWrapper(sym_recv.ptr, + {buf.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); + t.grad_result = TensorWrapper(sym_grad_result.ptr, + {(size_t)num_tokens_, (size_t)hidden_dim_}, DType::kBFloat16); + t.grad_expert = TensorWrapper(sym_grad_expert.ptr, + {buf.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); cudaStream_t stream; NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); uint64_t handle_id = buf.handle_id; - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, t.token_counts.tensor, /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.topk_idx.tensor, - t.tokens.tensor, symm_window(sym_tokens), - t.topk_weights.tensor, NVTECommWindow{}, - t.recv_tokens.tensor, symm_window(sym_recv), - t.recv_topk_weights.tensor, NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.recv_tokens.tensor, - symm_window(sym_recv), t.result.tensor, stream)); + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), t.token_counts.data(), /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, 
t.handle_mem.data()}, t.topk_idx.data(), + t.tokens.data(), symm_window(sym_tokens), + t.topk_weights.data(), NVTECommWindow{}, + t.recv_tokens.data(), symm_window(sym_recv), + t.recv_topk_weights.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.recv_tokens.data(), + symm_window(sym_recv), t.result.data(), stream)); std::vector h_grad(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); NVTE_CHECK_CUDA(cudaMemcpyAsync(sym_grad_result.ptr, h_grad.data(), @@ -855,13 +855,13 @@ TEST_F(EPZeroCopyTest, DISABLED_FullPipelineSymm) { NVTE_CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); - ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_result.tensor, - symm_window(sym_grad_result), t.grad_expert.tensor, + ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_result.data(), + symm_window(sym_grad_result), t.grad_expert.data(), symm_window(sym_grad_expert), stream)); - ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.tensor}, t.grad_expert.tensor, + ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_expert.data(), symm_window(sym_grad_expert), - t.g_recv_topk_weights.tensor, NVTECommWindow{}, - t.grad_tokens.tensor, t.grad_topk_weights.tensor, stream)); + t.g_recv_topk_weights.data(), NVTECommWindow{}, + t.grad_tokens.data(), t.grad_topk_weights.data(), stream)); NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); ASSERT_TRUE(check_no_nan_inf(buf.result.get(), num_tokens_ * hidden_dim_, "result")); From 799b3bc2649bce4c97e04d1a402d86b84610656b Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 15:55:10 -0700 Subject: [PATCH 11/21] Expert Parallelism tests: consolidate into single test_ep.cu with 
essential per-op and pipeline tests Signed-off-by: Phuong Nguyen --- tests/cpp_distributed/CMakeLists.txt | 22 +- tests/cpp_distributed/run_test_ep.sh | 4 +- .../{test_ep_pipeline.cu => test_ep.cu} | 132 ---- tests/cpp_distributed/test_ep_coverage.cu | 562 ------------------ tests/cpp_distributed/test_ep_init.cu | 64 -- 5 files changed, 6 insertions(+), 778 deletions(-) rename tests/cpp_distributed/{test_ep_pipeline.cu => test_ep.cu} (82%) delete mode 100644 tests/cpp_distributed/test_ep_coverage.cu delete mode 100644 tests/cpp_distributed/test_ep_init.cu diff --git a/tests/cpp_distributed/CMakeLists.txt b/tests/cpp_distributed/CMakeLists.txt index 3870f57911..531f0d19e5 100644 --- a/tests/cpp_distributed/CMakeLists.txt +++ b/tests/cpp_distributed/CMakeLists.txt @@ -121,23 +121,11 @@ set(EP_TEST_COMMON_LIBS # explicitly with --no-as-needed so the linker keeps the dependency. set(EP_TEST_LINK_OPTS "LINKER:--no-as-needed") -# ── EP init tests (InitPath, HandleMemSizeQuery) ───────────────────────────── -add_executable(test_ep_init test_ep_init.cu) -target_include_directories(test_ep_init PRIVATE ${EP_TEST_COMMON_INCLUDES}) -target_link_libraries(test_ep_init PUBLIC ${EP_TEST_COMMON_LIBS}) -target_link_options(test_ep_init PUBLIC ${EP_TEST_LINK_OPTS}) - -# ── EP pipeline tests (dispatch, combine, bwd, integrated) ─────────────────── -add_executable(test_ep_pipeline test_ep_pipeline.cu) -target_include_directories(test_ep_pipeline PRIVATE ${EP_TEST_COMMON_INCLUDES}) -target_link_libraries(test_ep_pipeline PUBLIC ${EP_TEST_COMMON_LIBS}) -target_link_options(test_ep_pipeline PUBLIC ${EP_TEST_LINK_OPTS}) - -# ── EP coverage tests (multi-handle, top_k=1, empty experts, negatives, threading) ── -add_executable(test_ep_coverage test_ep_coverage.cu) -target_include_directories(test_ep_coverage PRIVATE ${EP_TEST_COMMON_INCLUDES}) -target_link_libraries(test_ep_coverage PUBLIC ${EP_TEST_COMMON_LIBS}) -target_link_options(test_ep_coverage PUBLIC ${EP_TEST_LINK_OPTS}) +# ── EP 
distributed tests (per-op + full pipeline + zero-copy symm) ─────────── +add_executable(test_ep test_ep.cu) +target_include_directories(test_ep PRIVATE ${EP_TEST_COMMON_INCLUDES}) +target_link_libraries(test_ep PUBLIC ${EP_TEST_COMMON_LIBS}) +target_link_options(test_ep PUBLIC ${EP_TEST_LINK_OPTS}) # Do NOT use gtest_discover_tests — these binaries require multi-process # launch via run_test_ep.sh, not direct single-process execution. diff --git a/tests/cpp_distributed/run_test_ep.sh b/tests/cpp_distributed/run_test_ep.sh index 017d3f807b..8c22edd389 100755 --- a/tests/cpp_distributed/run_test_ep.sh +++ b/tests/cpp_distributed/run_test_ep.sh @@ -123,9 +123,7 @@ trap cleanup EXIT INT TERM # --------------------------------------------------------------------------- # Run all suites # --------------------------------------------------------------------------- -run_suite "test_ep_init" "EP Init Tests" 2 -run_suite "test_ep_pipeline" "EP Pipeline Tests" 2 -run_suite "test_ep_coverage" "EP Coverage Tests" 2 +run_suite "test_ep" "EP Tests" 2 echo if (( OVERALL_FAIL )); then diff --git a/tests/cpp_distributed/test_ep_pipeline.cu b/tests/cpp_distributed/test_ep.cu similarity index 82% rename from tests/cpp_distributed/test_ep_pipeline.cu rename to tests/cpp_distributed/test_ep.cu index 91771a6817..f82e8b9024 100644 --- a/tests/cpp_distributed/test_ep_pipeline.cu +++ b/tests/cpp_distributed/test_ep.cu @@ -747,138 +747,6 @@ TEST_F(EPZeroCopyTest, IdentityAllSymm) { NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); } -// Same buffers, 2 iterations — catches window-lifecycle regressions where the -// symm-mem registration goes stale between calls. 
-TEST_F(EPZeroCopyTest, IdentityAllSymmRepeated) { - EPBuffers buf; - buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, - ep_size_, max_tokens_per_rank_); - upload_inputs(buf); - - SymmBuf sym_tokens, sym_recv; - sym_tokens.alloc(num_tokens_ * hidden_dim_ * sizeof(nv_bfloat16)); - sym_recv .alloc(buf.recv_capacity * hidden_dim_ * sizeof(nv_bfloat16)); - auto h_tok = generate_tokens(g_process_id, num_tokens_, hidden_dim_); - NVTE_CHECK_CUDA(cudaMemcpy(sym_tokens.ptr, h_tok.data(), - h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); - - EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); - t.tokens = TensorWrapper(sym_tokens.ptr, - {(size_t)num_tokens_, (size_t)hidden_dim_}, DType::kBFloat16); - t.recv_tokens = TensorWrapper(sym_recv.ptr, - {buf.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); - - cudaStream_t stream; - NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - - uint64_t handle_id = buf.handle_id; - for (int iter = 0; iter < 2; ++iter) { - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), t.token_counts.data(), /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), - t.tokens.data(), symm_window(sym_tokens), - t.topk_weights.data(), NVTECommWindow{}, - t.recv_tokens.data(), symm_window(sym_recv), - t.recv_topk_weights.data(), NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.recv_tokens.data(), - symm_window(sym_recv), t.result.data(), stream)); - NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); - - std::vector h_res(num_tokens_ * hidden_dim_); - NVTE_CHECK_CUDA(cudaMemcpy(h_res.data(), buf.result.get(), - h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); - for (int tok = 0; tok < num_tokens_; ++tok) { - float exp = __bfloat162float(h_tok[tok * hidden_dim_]) * static_cast(top_k_); - float got = __bfloat162float(h_res[tok * 
hidden_dim_]); - ASSERT_NEAR(got, exp, bf16_tol(exp)) << "iter " << iter << " tok " << tok; - } - } - - if (g_process_id == 0) - printf(" IdentityAllSymmRepeated: passed (2 iters)\n"); - - NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); -} - -// Full forward+backward with symm-mem on every spec-mandated buffer: -// dispatch i/o, combine input, combine_bwd i/o, dispatch_bwd input. -// TODO: flaky on rank 0 (grad_tokens partial-zero) when run after the prior -// EPZeroCopyTest cases in the same binary; passes in isolation. Re-enable once -// the root cause (likely NCCL EP NVLS write→read coherence on grad_expert) is -// understood. Tracked separately. -TEST_F(EPZeroCopyTest, DISABLED_FullPipelineSymm) { - EPBuffers buf; - buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, - ep_size_, max_tokens_per_rank_); - upload_inputs(buf); - - // Symm-mem: tokens (dispatch input), recv_tokens (dispatch output AND - // combine input), grad_result (combine_bwd input), grad_expert - // (combine_bwd output AND dispatch_bwd input). 
- SymmBuf sym_tokens, sym_recv, sym_grad_result, sym_grad_expert; - sym_tokens .alloc(num_tokens_ * hidden_dim_ * sizeof(nv_bfloat16)); - sym_recv .alloc(buf.recv_capacity * hidden_dim_ * sizeof(nv_bfloat16)); - sym_grad_result.alloc(num_tokens_ * hidden_dim_ * sizeof(nv_bfloat16)); - sym_grad_expert.alloc(buf.recv_capacity * hidden_dim_ * sizeof(nv_bfloat16)); - - auto h_tok = generate_tokens(g_process_id, num_tokens_, hidden_dim_); - NVTE_CHECK_CUDA(cudaMemcpy(sym_tokens.ptr, h_tok.data(), - h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); - - EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); - t.tokens = TensorWrapper(sym_tokens.ptr, - {(size_t)num_tokens_, (size_t)hidden_dim_}, DType::kBFloat16); - t.recv_tokens = TensorWrapper(sym_recv.ptr, - {buf.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); - t.grad_result = TensorWrapper(sym_grad_result.ptr, - {(size_t)num_tokens_, (size_t)hidden_dim_}, DType::kBFloat16); - t.grad_expert = TensorWrapper(sym_grad_expert.ptr, - {buf.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); - - cudaStream_t stream; - NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - - uint64_t handle_id = buf.handle_id; - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), t.token_counts.data(), /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), - t.tokens.data(), symm_window(sym_tokens), - t.topk_weights.data(), NVTECommWindow{}, - t.recv_tokens.data(), symm_window(sym_recv), - t.recv_topk_weights.data(), NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.recv_tokens.data(), - symm_window(sym_recv), t.result.data(), stream)); - - std::vector h_grad(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); - NVTE_CHECK_CUDA(cudaMemcpyAsync(sym_grad_result.ptr, h_grad.data(), - h_grad.size() * sizeof(nv_bfloat16), - 
cudaMemcpyHostToDevice, stream)); - NVTE_CHECK_CUDA(cudaMemsetAsync(sym_grad_expert.ptr, 0, sym_grad_expert.bytes, stream)); - NVTE_CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); - NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); - - ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_result.data(), - symm_window(sym_grad_result), t.grad_expert.data(), - symm_window(sym_grad_expert), stream)); - ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_expert.data(), - symm_window(sym_grad_expert), - t.g_recv_topk_weights.data(), NVTECommWindow{}, - t.grad_tokens.data(), t.grad_topk_weights.data(), stream)); - NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); - - ASSERT_TRUE(check_no_nan_inf(buf.result.get(), num_tokens_ * hidden_dim_, "result")); - ASSERT_TRUE(check_no_nan_inf(buf.grad_tokens.get(), num_tokens_ * hidden_dim_, "grad_tokens")); - - std::vector h_gt(num_tokens_ * hidden_dim_); - NVTE_CHECK_CUDA(cudaMemcpy(h_gt.data(), buf.grad_tokens.get(), - h_gt.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); - const float kExpGrad = static_cast(top_k_) * 0.1f; - for (int tok = 0; tok < num_tokens_; ++tok) - EXPECT_NEAR(__bfloat162float(h_gt[tok * hidden_dim_]), kExpGrad, bf16_tol(kExpGrad)) - << "grad_tokens token " << tok; - - if (g_process_id == 0) printf(" FullPipelineSymm: passed\n"); - - NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); -} // ── main ────────────────────────────────────────────────────────────────────── diff --git a/tests/cpp_distributed/test_ep_coverage.cu b/tests/cpp_distributed/test_ep_coverage.cu deleted file mode 100644 index 2c21620142..0000000000 --- a/tests/cpp_distributed/test_ep_coverage.cu +++ /dev/null @@ -1,562 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022-2026, NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. - * - * See LICENSE for license information. - ************************************************************************/ - -/* - * EP C-API coverage tests (paths not exercised by the pipeline suite). - * - * MultiHandleAllocTest — distinct handle ids; each works end-to-end. - * TopK1Test — top_k=1 dispatch/combine/bwd round-trip. - * EmptyExpertsTest — alignment ∈ {0, 2, 8, 16} with experts receiving 0 tokens. - * NegativeTests — alignment mismatch and null handle_mem must throw. - */ - -#include "test_ep_common.h" - -#include -#include - -// top1 -> expert 0, top2 -> expert 2; leaves local-expert 1 empty between two -// full experts. Requires top_k >= 2 and num_experts >= 3. -static std::vector routing_skip_middle(int num_tokens, int top_k) { - std::vector idx(num_tokens * top_k); - for (int t = 0; t < num_tokens; ++t) { - idx[t * top_k + 0] = 0; - if (top_k >= 2) idx[t * top_k + 1] = 2; - for (int k = 2; k < top_k; ++k) idx[t * top_k + k] = 2 + k; // distinct stragglers - } - return idx; -} - -static std::vector tokens_constant(int num_tokens, int hidden_dim, float val) { - std::vector v(num_tokens * hidden_dim); - nv_bfloat16 b = __float2bfloat16(val); - std::fill(v.begin(), v.end(), b); - return v; -} - -namespace { - -class EpCoverageBase : public ::testing::Test { - protected: - int ep_size_, num_experts_, num_local_experts_, hidden_dim_; - int max_tokens_per_rank_; - - void SetUp() override { - if (g_sm_major < 9) - GTEST_SKIP() << "EP requires SM_90+ (device is SM_" << g_sm_major << "0)"; - ASSERT_GE(g_num_processes, 2); - ASSERT_TRUE(g_ep_initialized); - ep_size_ = g_ep_size; - num_experts_ = g_num_experts; - num_local_experts_ = num_experts_ / ep_size_; - hidden_dim_ = g_hidden_dim; - max_tokens_per_rank_ = g_max_tokens_per_rank; - } - - // Helper: allocate buffers + tensor views for a single dispatch+combine. 
- struct Bundle { - DevBuf topk_idx; - DevBuf topk_weights; - DevBuf tokens; - DevBuf token_counts; - DevBuf handle_mem; - DevBuf recv_tokens; - DevBuf recv_topk_weights; - DevBuf result; - uint64_t handle_id = 0; - size_t handle_mem_size = 0; - size_t recv_capacity = 0; - }; - - Bundle make_bundle(int num_tokens, int top_k, int num_local_experts, - size_t alignment) { - Bundle b; - b.recv_capacity = static_cast(ep_size_) * max_tokens_per_rank_ * 2; - b.topk_idx.alloc(num_tokens * top_k); - b.topk_weights.alloc(num_tokens * top_k); - b.tokens.alloc(num_tokens * hidden_dim_); - b.token_counts.alloc(num_local_experts); - b.recv_tokens.alloc(b.recv_capacity * hidden_dim_); - b.recv_topk_weights.alloc(b.recv_capacity); - b.result.alloc(num_tokens * hidden_dim_); - NVTEEpLayerConfig cfg{num_local_experts, top_k, alignment}; - b.handle_id = nvte_ep_register_layer(cfg, &b.handle_mem_size); - b.handle_mem.alloc(b.handle_mem_size); - return b; - } -}; - -} // namespace - -// ============================================================================= -// MultiHandleAllocTest: ids are distinct and each is independently usable. 
-// ============================================================================= - -class MultiHandleAllocTest : public EpCoverageBase {}; - -TEST_F(MultiHandleAllocTest, IdsAreDistinct) { - NVTEEpLayerConfig cfg{num_local_experts_, /*top_k=*/2, /*alignment=*/0}; - const int kN = 8; - std::vector ids(kN); - for (int i = 0; i < kN; ++i) { - size_t sz = 0; - ids[i] = nvte_ep_register_layer(cfg, &sz); - } - for (int i = 0; i < kN; ++i) { - EXPECT_NE(ids[i], 0u) << "handle_id 0 is reserved as \"no id\""; - for (int j = i + 1; j < kN; ++j) - EXPECT_NE(ids[i], ids[j]) << "duplicate id " << ids[i] << " at indices " << i << ", " << j; - } -} - -TEST_F(MultiHandleAllocTest, TwoHandlesCoexist) { - const int num_tokens = 16, top_k = 2; - Bundle a = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); - Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); - - auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, - num_experts_, num_local_experts_); - std::vector h_w(num_tokens * top_k, 1.0f / top_k); - auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.5f); - for (Bundle* x : {&a, &b}) { - NVTE_CHECK_CUDA(cudaMemcpy(x->topk_idx.get(), h_idx.data(), - h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - NVTE_CHECK_CUDA(cudaMemcpy(x->topk_weights.get(), h_w.data(), - h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); - NVTE_CHECK_CUDA(cudaMemcpy(x->tokens.get(), h_tok.data(), - h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); - } - - cudaStream_t stream; - NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - - ASSERT_NE(a.handle_id, b.handle_id); - - auto run_one = [&](Bundle& x) { - auto topk_idx = TensorWrapper(x.topk_idx.get(), {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); - auto topk_weights = TensorWrapper(x.topk_weights.get(), {(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); - auto token_counts = TensorWrapper(x.token_counts.get(), {(size_t)num_local_experts_}, DType::kInt32); - auto 
handle_mem = TensorWrapper(x.handle_mem.get(), {x.handle_mem_size}, DType::kByte); - auto tokens = TensorWrapper(x.tokens.get(), {(size_t)num_tokens, (size_t)hidden_dim_}, DType::kBFloat16); - auto recv_tokens = TensorWrapper(x.recv_tokens.get(), {x.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); - auto recv_w = TensorWrapper(x.recv_topk_weights.get(), {x.recv_capacity}, DType::kFloat32); - auto result = TensorWrapper(x.result.get(), {(size_t)num_tokens, (size_t)hidden_dim_}, DType::kBFloat16); - NVTEEpHandle h{x.handle_id, handle_mem.data()}; - ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx.data(), token_counts.data(), - /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(h, topk_idx.data(), tokens.data(), - NVTECommWindow{}, topk_weights.data(), NVTECommWindow{}, - recv_tokens.data(), NVTECommWindow{}, recv_w.data(), - NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens.data(), NVTECommWindow{}, - result.data(), stream)); - }; - run_one(a); - run_one(b); - NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); - - // Both round-trips must produce result == top_k * 0.5 = 1.0. - for (Bundle* x : {&a, &b}) { - std::vector h_res(num_tokens * hidden_dim_); - NVTE_CHECK_CUDA(cudaMemcpy(h_res.data(), x->result.get(), - h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); - const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; - for (int t = 0; t < num_tokens; ++t) - for (int p : probes) - EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), - static_cast(top_k) * 0.5f, 1e-2f); - } - NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); -} - -// ============================================================================= -// TopK1Test: top_k=1 dispatch/combine round-trip, including dispatch_bwd. 
-// ============================================================================= - -class TopK1Test : public EpCoverageBase {}; - -TEST_F(TopK1Test, RoundTrip) { - const int num_tokens = 16, top_k = 1; - Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); - - auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, - num_experts_, num_local_experts_); - std::vector h_w(num_tokens * top_k, 1.0f); // top_k=1: weight is unity - auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.25f); - NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), - h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - NVTE_CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), - h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); - NVTE_CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), - h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); - - auto topk_idx_t = TensorWrapper(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); - auto topk_weights_t = TensorWrapper(b.topk_weights.get(), - {(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); - auto token_counts_t = TensorWrapper(b.token_counts.get(), - {(size_t)num_local_experts_}, DType::kInt32); - auto handle_mem_t = TensorWrapper(b.handle_mem.get(), - {b.handle_mem_size}, DType::kByte); - auto tokens_t = TensorWrapper(b.tokens.get(), - {(size_t)num_tokens, (size_t)hidden_dim_}, DType::kBFloat16); - auto recv_tokens_t = TensorWrapper(b.recv_tokens.get(), - {b.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); - auto recv_w_t = TensorWrapper(b.recv_topk_weights.get(), - {b.recv_capacity}, DType::kFloat32); - auto result_t = TensorWrapper(b.result.get(), - {(size_t)num_tokens, (size_t)hidden_dim_}, DType::kBFloat16); - - cudaStream_t stream; - NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - - NVTEEpHandle h{b.handle_id, handle_mem_t.data()}; - ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx_t.data(), token_counts_t.data(), - /*alignment=*/0, stream)); - 
ASSERT_NO_THROW(nvte_ep_dispatch(h, topk_idx_t.data(), - tokens_t.data(), NVTECommWindow{}, topk_weights_t.data(), - NVTECommWindow{}, recv_tokens_t.data(), NVTECommWindow{}, - recv_w_t.data(), NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens_t.data(), - NVTECommWindow{}, result_t.data(), stream)); - NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); - - // top_k=1: combine is unweighted gather, so result[t] == tokens[t]. - std::vector h_res(num_tokens * hidden_dim_); - NVTE_CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), - h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); - const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; - for (int t = 0; t < num_tokens; ++t) - for (int p : probes) - EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), 0.25f, 1e-2f) - << "tok " << t << " hidden " << p; - - NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); -} - -// ============================================================================= -// EmptyExpertsTest: alignment ∈ {0, 2, 8, 16}, only local-expert 0 receives -// tokens. Round-trip must produce result == top_k * tokens regardless of the -// per-expert padding choice. -// ============================================================================= - -class EmptyExpertsTest : public EpCoverageBase, - public ::testing::WithParamInterface {}; - -TEST_P(EmptyExpertsTest, RoundTripCorrect) { - // routing_skip_middle needs experts {0, 2, ...}; smallest viable num_experts is 3. - ASSERT_GE(num_experts_, 3); - const size_t alignment = GetParam(); - const int num_tokens = 16, top_k = 2; - Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, alignment); - - // top1 -> expert 0, top2 -> expert 2; rank 0's local-expert 1 receives 0 - // tokens between two non-empty experts. 
- std::vector h_idx = routing_skip_middle(num_tokens, top_k); - std::vector h_w(num_tokens * top_k, 1.0f / top_k); - auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.3f); - - NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), - h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - NVTE_CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), - h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); - NVTE_CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), - h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); - - auto topk_idx_t = TensorWrapper(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); - auto topk_weights_t = TensorWrapper(b.topk_weights.get(), - {(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); - auto token_counts_t = TensorWrapper(b.token_counts.get(), - {(size_t)num_local_experts_}, DType::kInt32); - auto handle_mem_t = TensorWrapper(b.handle_mem.get(), - {b.handle_mem_size}, DType::kByte); - auto tokens_t = TensorWrapper(b.tokens.get(), - {(size_t)num_tokens, (size_t)hidden_dim_}, DType::kBFloat16); - auto recv_tokens_t = TensorWrapper(b.recv_tokens.get(), - {b.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); - auto recv_w_t = TensorWrapper(b.recv_topk_weights.get(), - {b.recv_capacity}, DType::kFloat32); - auto result_t = TensorWrapper(b.result.get(), - {(size_t)num_tokens, (size_t)hidden_dim_}, DType::kBFloat16); - - cudaStream_t stream; - NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - - NVTEEpHandle h{b.handle_id, handle_mem_t.data()}; - ASSERT_NO_THROW(nvte_ep_prepare(h, topk_idx_t.data(), token_counts_t.data(), - alignment, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(h, topk_idx_t.data(), - tokens_t.data(), NVTECommWindow{}, topk_weights_t.data(), - NVTECommWindow{}, recv_tokens_t.data(), NVTECommWindow{}, - recv_w_t.data(), NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(h, recv_tokens_t.data(), - NVTECommWindow{}, result_t.data(), stream)); - 
NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); - - // Identity expert + uniform weights: result[t] == top_k * tokens[t]. - std::vector h_res(num_tokens * hidden_dim_); - NVTE_CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), - h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); - const float expected = static_cast(top_k) * 0.3f; - const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; - for (int t = 0; t < num_tokens; ++t) - for (int p : probes) - EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), expected, 1e-2f) - << "alignment=" << alignment << " tok=" << t << " hidden=" << p; - - NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); -} - -INSTANTIATE_TEST_SUITE_P(Alignments, EmptyExpertsTest, - ::testing::Values(0, 2, 8, 16)); - -// ============================================================================= -// NegativeTests: prepare/dispatch must surface bad inputs as exceptions. -// ============================================================================= - -class NegativeTests : public EpCoverageBase {}; - -TEST_F(NegativeTests, AlignmentMismatchThrows) { - const int num_tokens = 8, top_k = 2; - // Allocate handle for alignment=0, then call prepare with alignment=16. 
- Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); - auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, - num_experts_, num_local_experts_); - NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), - h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - - auto topk_idx_t = TensorWrapper(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); - auto token_counts_t = TensorWrapper(b.token_counts.get(), - {(size_t)num_local_experts_}, DType::kInt32); - auto handle_mem_t = TensorWrapper(b.handle_mem.get(), - {b.handle_mem_size}, DType::kByte); - - cudaStream_t stream; - NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - NVTEEpHandle h{b.handle_id, handle_mem_t.data()}; - EXPECT_THROW(nvte_ep_prepare(h, topk_idx_t.data(), token_counts_t.data(), - /*alignment=*/16, stream), - std::exception); - NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); -} - -TEST_F(NegativeTests, NullHandleMemThrows) { - const int num_tokens = 8, top_k = 2; - Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); - auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, - num_experts_, num_local_experts_); - NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), - h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - - auto topk_idx_t = TensorWrapper(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); - auto token_counts_t = TensorWrapper(b.token_counts.get(), - {(size_t)num_local_experts_}, DType::kInt32); - // Construct a tensor view backed by a null device pointer. 
- auto null_hm_t = TensorWrapper(nullptr, {b.handle_mem_size}, DType::kByte); - - cudaStream_t stream; - NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - NVTEEpHandle h{b.handle_id, null_hm_t.data()}; - EXPECT_THROW(nvte_ep_prepare(h, topk_idx_t.data(), token_counts_t.data(), - /*alignment=*/0, stream), - std::exception); - NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); -} - -// ============================================================================= -// HandleCacheTest: persistent ncclEpHandle is reused across ops on the same -// handle_mem ptr; relocation triggers throw by default and rebuild when -// NVTEEpGroupConfig.allow_handle_mem_reloc=1. -// ============================================================================= - -class HandleCacheTest : public EpCoverageBase {}; - -// Run prepare → dispatch → combine on bundle b. handle_mem_data overrides the -// device ptr used for handle_mem (must be the buffer owned by b unless -// reloc-allowed mode is active). Templated on Bundle because EpCoverageBase:: -// Bundle is declared in a protected section. 
-template -static void run_round_trip(B& b, void* handle_mem_data, - int num_tokens, int top_k, int num_local_experts, - int hidden_dim, size_t alignment, - cudaStream_t stream) { - auto topk_idx_t = TensorWrapper(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); - auto topk_weights_t = TensorWrapper(b.topk_weights.get(), - {(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); - auto token_counts_t = TensorWrapper(b.token_counts.get(), - {(size_t)num_local_experts}, DType::kInt32); - auto handle_mem_t = TensorWrapper(handle_mem_data, - {b.handle_mem_size}, DType::kByte); - auto tokens_t = TensorWrapper(b.tokens.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, DType::kBFloat16); - auto recv_tokens_t = TensorWrapper(b.recv_tokens.get(), - {b.recv_capacity, (size_t)hidden_dim}, DType::kBFloat16); - auto recv_w_t = TensorWrapper(b.recv_topk_weights.get(), - {b.recv_capacity}, DType::kFloat32); - auto result_t = TensorWrapper(b.result.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, DType::kBFloat16); - - NVTEEpHandle h{b.handle_id, handle_mem_t.data()}; - nvte_ep_prepare(h, topk_idx_t.data(), token_counts_t.data(), alignment, stream); - nvte_ep_dispatch(h, topk_idx_t.data(), tokens_t.data(), NVTECommWindow{}, - topk_weights_t.data(), NVTECommWindow{}, - recv_tokens_t.data(), NVTECommWindow{}, - recv_w_t.data(), NVTECommWindow{}, stream); - nvte_ep_combine(h, recv_tokens_t.data(), NVTECommWindow{}, result_t.data(), stream); -} - -// Re-bootstrap EP backend with a different allow_handle_mem_reloc setting. -// Reuses the existing g_ep_comm; caller is responsible for restoring defaults. 
-static void reinit_ep_with_reloc(int allow_reloc) { - nvte_ep_shutdown(); - NVTEEpGroupConfig cfg{}; - cfg.ep_size = g_ep_size; - cfg.num_experts = g_num_experts; - cfg.max_tokens_per_rank = g_max_tokens_per_rank; - cfg.max_recv_tokens_per_rank = g_ep_size * g_max_tokens_per_rank * 2; - cfg.hidden_dim = g_hidden_dim; - cfg.allow_handle_mem_reloc = allow_reloc; - nvte_ep_initialize(static_cast(g_ep_comm), cfg); -} - -TEST_F(HandleCacheTest, ReuseSameMemSucceeds) { - const int num_tokens = 16, top_k = 2; - Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); - - auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, - num_experts_, num_local_experts_); - std::vector h_w(num_tokens * top_k, 1.0f / top_k); - auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.5f); - NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), - h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - NVTE_CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), - h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); - NVTE_CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), - h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); - - cudaStream_t stream; - NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - - // Two consecutive round-trips on the same handle_mem ptr: first opens the - // cached handle, second hits the cache. Both must succeed and be correct. 
- for (int iter = 0; iter < 2; ++iter) { - ASSERT_NO_THROW(run_round_trip(b, b.handle_mem.get(), num_tokens, top_k, - num_local_experts_, hidden_dim_, - /*alignment=*/0, stream)); - } - NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); - - std::vector h_res(num_tokens * hidden_dim_); - NVTE_CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), - h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); - const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; - for (int t = 0; t < num_tokens; ++t) - for (int p : probes) - EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), - static_cast(top_k) * 0.5f, 1e-2f); - - NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); -} - -TEST_F(HandleCacheTest, RelocDefaultThrows) { - // Default bootstrap has allow_handle_mem_reloc=0: a second prepare call on - // the same handle_id with a different handle_mem ptr must throw. - const int num_tokens = 8, top_k = 2; - Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); - DevBuf second_hm(b.handle_mem_size); // distinct device buffer - ASSERT_NE(b.handle_mem.get(), second_hm.get()); - - auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, - num_experts_, num_local_experts_); - NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), - h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - - auto topk_idx_t = TensorWrapper(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); - auto token_counts_t = TensorWrapper(b.token_counts.get(), - {(size_t)num_local_experts_}, DType::kInt32); - auto hm1_t = TensorWrapper(b.handle_mem.get(), - {b.handle_mem_size}, DType::kByte); - auto hm2_t = TensorWrapper(second_hm.get(), - {b.handle_mem_size}, DType::kByte); - - cudaStream_t stream; - NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - - // First prepare seeds the cache. 
- NVTEEpHandle h1{b.handle_id, hm1_t.data()}; - ASSERT_NO_THROW(nvte_ep_prepare(h1, topk_idx_t.data(), token_counts_t.data(), - /*alignment=*/0, stream)); - NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); - // Same handle_id with a different handle_mem ptr must throw. - NVTEEpHandle h2{b.handle_id, hm2_t.data()}; - EXPECT_THROW(nvte_ep_prepare(h2, topk_idx_t.data(), token_counts_t.data(), - /*alignment=*/0, stream), - std::exception); - NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); -} - -TEST_F(HandleCacheTest, RelocAllowedRebuilds) { - // Re-init EP backend with allow_handle_mem_reloc=1, run two round-trips with - // distinct handle_mem buffers, verify both succeed numerically, restore. - reinit_ep_with_reloc(/*allow_reloc=*/1); - - struct Restore { ~Restore() { reinit_ep_with_reloc(/*allow_reloc=*/0); } } restore; - - const int num_tokens = 16, top_k = 2; - Bundle b = make_bundle(num_tokens, top_k, num_local_experts_, /*alignment=*/0); - DevBuf alt_hm(b.handle_mem_size); - ASSERT_NE(b.handle_mem.get(), alt_hm.get()); - - auto h_idx = routing_balanced(g_process_id, num_tokens, top_k, - num_experts_, num_local_experts_); - std::vector h_w(num_tokens * top_k, 1.0f / top_k); - auto h_tok = tokens_constant(num_tokens, hidden_dim_, 0.5f); - NVTE_CHECK_CUDA(cudaMemcpy(b.topk_idx.get(), h_idx.data(), - h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); - NVTE_CHECK_CUDA(cudaMemcpy(b.topk_weights.get(), h_w.data(), - h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); - NVTE_CHECK_CUDA(cudaMemcpy(b.tokens.get(), h_tok.data(), - h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); - - cudaStream_t stream; - NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - - // First on the original handle_mem. - ASSERT_NO_THROW(run_round_trip(b, b.handle_mem.get(), num_tokens, top_k, - num_local_experts_, hidden_dim_, - /*alignment=*/0, stream)); - NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); - // Then on the relocated handle_mem — must trigger silent rebuild, not throw. 
- ASSERT_NO_THROW(run_round_trip(b, alt_hm.get(), num_tokens, top_k, - num_local_experts_, hidden_dim_, - /*alignment=*/0, stream)); - NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); - - std::vector h_res(num_tokens * hidden_dim_); - NVTE_CHECK_CUDA(cudaMemcpy(h_res.data(), b.result.get(), - h_res.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); - const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; - for (int t = 0; t < num_tokens; ++t) - for (int p : probes) - EXPECT_NEAR(__bfloat162float(h_res[t * hidden_dim_ + p]), - static_cast(top_k) * 0.5f, 1e-2f); - - NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); -} - -// ── main ────────────────────────────────────────────────────────────────────── - -int main(int argc, char* argv[]) { - if (!ep_bootstrap(argc, argv)) return 0; - int ret = RUN_ALL_TESTS(); - ep_teardown(); - return ret; -} diff --git a/tests/cpp_distributed/test_ep_init.cu b/tests/cpp_distributed/test_ep_init.cu deleted file mode 100644 index 08744dfee5..0000000000 --- a/tests/cpp_distributed/test_ep_init.cu +++ /dev/null @@ -1,64 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * - * See LICENSE for license information. - ************************************************************************/ - -/* - * Unit tests for EP initialization paths. - * - * Tests: - * EPInitTest/InitPath — backend is live after init, handle_mem_size > 0 - * EPInitTest/NumLocalExperts — handle_mem_size is consistent across num_local_experts values - * - * Run via run_test_ep.sh (both uid and comm init paths are tested by the script). 
- */ - -#include "test_ep_common.h" - -// ── Fixture ─────────────────────────────────────────────────────────────────── - -class EPInitTest : public ::testing::Test { - protected: - void SetUp() override { - if (g_sm_major < 9) - GTEST_SKIP() << "EP requires SM_90+ (device is SM_" << g_sm_major << "0)"; - ASSERT_GE(g_num_processes, 2) << "EP tests require at least 2 processes"; - ASSERT_TRUE(g_ep_initialized) << "EP not initialized"; - } -}; - -// ── Tests ───────────────────────────────────────────────────────────────────── - -TEST_F(EPInitTest, InitPath) { - int nle = g_num_experts / g_ep_size; - NVTEEpLayerConfig cfg{nle, /*top_k=*/2}; - size_t sz = 0; - (void)nvte_ep_register_layer(cfg, &sz); - ASSERT_GT(sz, 0u) << "handle_mem_size must be > 0 after init"; - - if (g_process_id == 0) { - printf(" handle_mem : %zu bytes\n", sz); - } -} - -TEST_F(EPInitTest, NumLocalExperts) { - // handle_mem_size should be > 0 for any valid num_local_experts value. - for (int nle : {1, g_num_experts / g_ep_size}) { - NVTEEpLayerConfig cfg{nle, /*top_k=*/2}; - size_t sz = 0; - (void)nvte_ep_register_layer(cfg, &sz); - ASSERT_GT(sz, 0u) << "num_local_experts=" << nle; - if (g_process_id == 0) - printf(" nle=%-3d handle_mem_size=%zu bytes\n", nle, sz); - } -} - -// ── main ────────────────────────────────────────────────────────────────────── - -int main(int argc, char* argv[]) { - if (!ep_bootstrap(argc, argv)) return 0; - int ret = RUN_ALL_TESTS(); - ep_teardown(); - return ret; -} From 2b5cd817a7687f8581e262580a2c7d6384624752 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 16:01:39 -0700 Subject: [PATCH 12/21] tests: use test::CudaPtr in DevBuf, check full token rows, simplify bf16 tolerance Signed-off-by: Phuong Nguyen --- tests/cpp_distributed/CMakeLists.txt | 2 +- tests/cpp_distributed/test_ep.cu | 27 +++++++++++++--------- tests/cpp_distributed/test_ep_common.h | 31 ++++++-------------------- 3 files changed, 24 insertions(+), 36 deletions(-) diff --git 
a/tests/cpp_distributed/CMakeLists.txt b/tests/cpp_distributed/CMakeLists.txt index 531f0d19e5..31c5eeb064 100644 --- a/tests/cpp_distributed/CMakeLists.txt +++ b/tests/cpp_distributed/CMakeLists.txt @@ -122,7 +122,7 @@ set(EP_TEST_COMMON_LIBS set(EP_TEST_LINK_OPTS "LINKER:--no-as-needed") # ── EP distributed tests (per-op + full pipeline + zero-copy symm) ─────────── -add_executable(test_ep test_ep.cu) +add_executable(test_ep test_ep.cu ../cpp/test_common.cu) target_include_directories(test_ep PRIVATE ${EP_TEST_COMMON_INCLUDES}) target_link_libraries(test_ep PUBLIC ${EP_TEST_COMMON_LIBS}) target_link_options(test_ep PUBLIC ${EP_TEST_LINK_OPTS}) diff --git a/tests/cpp_distributed/test_ep.cu b/tests/cpp_distributed/test_ep.cu index f82e8b9024..2809f076a2 100644 --- a/tests/cpp_distributed/test_ep.cu +++ b/tests/cpp_distributed/test_ep.cu @@ -85,10 +85,12 @@ static std::vector expected_recv_values_sorted( return vals; } -// BF16 has 7 mantissa bits; relative ULP ≈ 2^-7. Use 4× headroom for -// accumulation noise inside dispatch/combine. +// 2^-5 relative tolerance for BF16 (matches mantissa precision with margin), +// plus a small atol floor for near-zero expected values. +static constexpr float kBf16Rtol = 1.0f / 32.0f; +static constexpr float kBf16Atol = 1e-3f; static float bf16_tol(float magnitude) { - return 4.f * std::ldexp(std::fabs(magnitude) + 1e-3f, -7); + return kBf16Atol + kBf16Rtol * std::fabs(magnitude); } static bool check_no_nan_inf(const nv_bfloat16* dev, int count, const char* name) { @@ -354,11 +356,9 @@ TEST_F(EPCombineTest, Combine) { NVTE_CHECK_CUDA(cudaMemcpy(h_result.data(), buf.result.get(), h_result.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); auto h_tok = generate_tokens(g_process_id, num_tokens_, hidden_dim_); - // Spot-check 3 hidden-dim positions per token to catch partial-row writes. 
- const int probes[3] = {0, hidden_dim_ / 2, hidden_dim_ - 1}; for (int tok = 0; tok < num_tokens_; ++tok) { float exp = __bfloat162float(h_tok[tok * hidden_dim_]) * static_cast(top_k_); - for (int p : probes) { + for (int p = 0; p < hidden_dim_; ++p) { float got = __bfloat162float(h_result[tok * hidden_dim_ + p]); EXPECT_NEAR(got, exp, bf16_tol(exp)) << "token " << tok << " rank " << g_process_id << " hidden " << p; @@ -421,9 +421,12 @@ TEST_F(EPCombineBwdTest, CombineBwdCheck) { int filled = 0; for (int e = 0; e < num_local_experts_; ++e) { for (int i = 0; i < cnt[e]; ++i) { - float v = __bfloat162float(h_ge[slot * hidden_dim_]); - EXPECT_NEAR(v, kExpGrad, bf16_tol(kExpGrad)) - << "grad_expert expert " << e << " slot " << i << " (linear " << slot << ")"; + for (int p = 0; p < hidden_dim_; ++p) { + float v = __bfloat162float(h_ge[slot * hidden_dim_ + p]); + EXPECT_NEAR(v, kExpGrad, bf16_tol(kExpGrad)) + << "grad_expert expert " << e << " slot " << i + << " (linear " << slot << ") hidden " << p; + } ++filled; ++slot; } } @@ -480,8 +483,10 @@ TEST_F(EPDispatchBwdTest, DispatchBwdCheck) { h_gt.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); const float kExpGrad = static_cast(top_k_) * 0.1f; for (int tok = 0; tok < num_tokens_; ++tok) - EXPECT_NEAR(__bfloat162float(h_gt[tok * hidden_dim_]), kExpGrad, bf16_tol(kExpGrad)) - << "grad_tokens token " << tok; + for (int p = 0; p < hidden_dim_; ++p) + EXPECT_NEAR(__bfloat162float(h_gt[tok * hidden_dim_ + p]), kExpGrad, + bf16_tol(kExpGrad)) + << "grad_tokens token " << tok << " hidden " << p; if (g_process_id == 0) printf(" DispatchBwdCheck: passed (grad_tokens == %.2f)\n", kExpGrad); diff --git a/tests/cpp_distributed/test_ep_common.h b/tests/cpp_distributed/test_ep_common.h index 6b6b56ad51..8958f7d8ab 100644 --- a/tests/cpp_distributed/test_ep_common.h +++ b/tests/cpp_distributed/test_ep_common.h @@ -28,6 +28,7 @@ #include #include #include +#include "../cpp/test_common.h" #include "util/logging.h" using 
transformer_engine::DType; @@ -48,44 +49,26 @@ static NVTEDType g_token_dtype = kNVTEBFloat16; static bool g_ep_initialized = false; static ncclComm_t g_ep_comm = nullptr; // owned by harness, destroyed in ep_teardown -// RAII owner for a cudaMalloc'd device buffer; frees on destruction. +// RAII owner for a cudaMalloc'd device buffer; element-count API on top of +// test::CudaPtr. template struct DevBuf { - T* ptr = nullptr; + test::CudaPtr ptr; size_t count = 0; DevBuf() = default; explicit DevBuf(size_t n) { alloc(n); } - ~DevBuf() { reset(); } - - DevBuf(const DevBuf&) = delete; - DevBuf& operator=(const DevBuf&) = delete; - DevBuf(DevBuf&& o) noexcept : ptr(o.ptr), count(o.count) { o.ptr = nullptr; o.count = 0; } - DevBuf& operator=(DevBuf&& o) noexcept { - if (this != &o) { reset(); ptr = o.ptr; count = o.count; o.ptr = nullptr; o.count = 0; } - return *this; - } void alloc(size_t n) { - reset(); count = n; - if (n > 0) { - cudaError_t e = cudaMalloc(&ptr, n * sizeof(T)); - if (e != cudaSuccess) { - fprintf(stderr, "DevBuf cudaMalloc(%zu) failed: %s\n", n * sizeof(T), - cudaGetErrorString(e)); - ptr = nullptr; - count = 0; - } - } + ptr = (n > 0) ? 
test::cuda_alloc(n * sizeof(T)) : test::CudaPtr{}; } - void reset() { - if (ptr) { cudaFree(ptr); ptr = nullptr; } + ptr.reset(); count = 0; } - T* get() const { return ptr; } + T* get() const { return ptr.get(); } size_t bytes() const { return count * sizeof(T); } }; From 2873ac0318c62385edcec7920b54b410ccdff666 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 16:03:34 -0700 Subject: [PATCH 13/21] tests: reword EPTensors comment to not imply C-API churn Signed-off-by: Phuong Nguyen --- tests/cpp_distributed/test_ep.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cpp_distributed/test_ep.cu b/tests/cpp_distributed/test_ep.cu index 2809f076a2..fa267040dd 100644 --- a/tests/cpp_distributed/test_ep.cu +++ b/tests/cpp_distributed/test_ep.cu @@ -156,8 +156,8 @@ struct EPBuffers { } }; -// Bundled NVTETensor views over an EPBuffers — one place to update the shape -// conventions when the C-API evolves. +// Bundled NVTETensor views over an EPBuffers, with the shapes the EP C API +// expects. 
struct EPTensors { TensorWrapper topk_idx, topk_weights, token_counts, handle_mem, tokens; TensorWrapper recv_tokens, recv_topk_weights, result; From 319937f906b6c1901ad685d47f203eee7852cdae Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 16:16:48 -0700 Subject: [PATCH 14/21] EP group config: rename token_dtype to max_token_dtype and allow narrower per-dispatch dtypes Signed-off-by: Phuong Nguyen --- memory/MEMORY.md | 1 + ...k_commit_message_no_te_subsystem_prefix.md | 12 ++++++++++ tests/cpp_distributed/test_ep_common.h | 8 +++---- transformer_engine/common/ep/ep_backend.cpp | 24 ++++++++++--------- .../common/include/transformer_engine/ep.h | 7 +++--- 5 files changed, 34 insertions(+), 18 deletions(-) create mode 100644 memory/MEMORY.md create mode 100644 memory/feedback_commit_message_no_te_subsystem_prefix.md diff --git a/memory/MEMORY.md b/memory/MEMORY.md new file mode 100644 index 0000000000..81a9edfd29 --- /dev/null +++ b/memory/MEMORY.md @@ -0,0 +1 @@ +- [Commit message: no TE subsystem prefix](feedback_commit_message_no_te_subsystem_prefix.md) — don't prefix commit subjects with "Expert Parallelism:" or "EP:" in this repo diff --git a/memory/feedback_commit_message_no_te_subsystem_prefix.md b/memory/feedback_commit_message_no_te_subsystem_prefix.md new file mode 100644 index 0000000000..16aa7abc94 --- /dev/null +++ b/memory/feedback_commit_message_no_te_subsystem_prefix.md @@ -0,0 +1,12 @@ +--- +name: feedback-commit-message-no-te-subsystem-prefix +description: For this TE repo, commit messages should not prefix the subject with "Expert Parallelism:" or "EP:" — go straight to the change itself. +metadata: + type: feedback +--- + +When writing commit messages in this Transformer Engine repo, do not prefix the subject with subsystem labels like "Expert Parallelism:" or "EP:". + +**Why:** User feedback during the EP reviewer-feedback session ("in your commit message, don't need to mention 'expert parallelism or EP'"). 
Subject lines should describe the change itself. + +**How to apply:** Lead with the actual action (e.g. "require token_dtype in NVTEEpGroupConfig and enforce at dispatch", "consolidate EP tests into single test_ep.cu"). Path/file context implicitly identifies the subsystem. diff --git a/tests/cpp_distributed/test_ep_common.h b/tests/cpp_distributed/test_ep_common.h index 8958f7d8ab..66f11e92fb 100644 --- a/tests/cpp_distributed/test_ep_common.h +++ b/tests/cpp_distributed/test_ep_common.h @@ -45,7 +45,7 @@ static int g_ep_size = -1; static int g_num_experts = -1; static int g_hidden_dim = 256; static int g_max_tokens_per_rank = 64; -static NVTEDType g_token_dtype = kNVTEBFloat16; +static NVTEDType g_max_token_dtype = kNVTEFloat32; // staging-buffer sizing static bool g_ep_initialized = false; static ncclComm_t g_ep_comm = nullptr; // owned by harness, destroyed in ep_teardown @@ -130,8 +130,8 @@ static void ep_parse_args(int argc, char* argv[]) { else if (a.rfind("--num-processes=",0)==0) g_num_processes = std::stoi(a.substr(16)); else if (a.rfind("--nranks=", 0) == 0) g_num_processes = std::stoi(a.substr(9)); else if (a.rfind("--uid-file=", 0) == 0) g_uid_file = a.substr(11); - else if (a.rfind("--token-dtype=", 0) == 0) - g_token_dtype = static_cast(std::stoi(a.substr(14))); + else if (a.rfind("--max-token-dtype=", 0) == 0) + g_max_token_dtype = static_cast(std::stoi(a.substr(18))); } if (g_process_id < 0 || g_num_processes <= 0) { @@ -187,7 +187,7 @@ static bool ep_bootstrap(int argc, char* argv[]) { // Worst-case for top_k fan-out: ep_size * max_tokens_per_rank * 2. 
group_config.max_recv_tokens_per_rank = g_ep_size * g_max_tokens_per_rank * 2; group_config.hidden_dim = g_hidden_dim; - group_config.token_dtype = g_token_dtype; + group_config.max_token_dtype = g_max_token_dtype; NVTE_CHECK_NCCL(ncclCommInitRank(&g_ep_comm, g_num_processes, uid, g_process_id)); nvte_ep_initialize(static_cast(g_ep_comm), group_config); diff --git a/transformer_engine/common/ep/ep_backend.cpp b/transformer_engine/common/ep/ep_backend.cpp index 1e08cb55df..5e11db6f48 100644 --- a/transformer_engine/common/ep/ep_backend.cpp +++ b/transformer_engine/common/ep/ep_backend.cpp @@ -82,11 +82,11 @@ void EPBackend::validate_config(const NVTEEpGroupConfig& config) { NVTE_CHECK(config.max_recv_tokens_per_rank > 0, "max_recv_tokens_per_rank must be positive, got ", config.max_recv_tokens_per_rank); NVTE_CHECK(config.hidden_dim > 0, "hidden_dim must be positive, got ", config.hidden_dim); - NVTE_CHECK(config.token_dtype >= 0 && config.token_dtype < kNVTENumTypes, - "token_dtype out of range, got ", static_cast(config.token_dtype)); - const size_t elem_bytes = typeToSize(static_cast(config.token_dtype)); + NVTE_CHECK(config.max_token_dtype >= 0 && config.max_token_dtype < kNVTENumTypes, + "max_token_dtype out of range, got ", static_cast(config.max_token_dtype)); + const size_t elem_bytes = typeToSize(static_cast(config.max_token_dtype)); NVTE_CHECK(config.hidden_dim * elem_bytes >= 16, - "hidden_dim * sizeof(token_dtype) must be >= 16 (NCCL EP 16B row alignment); " + "hidden_dim * sizeof(max_token_dtype) must be >= 16 (NCCL EP 16B row alignment); " "got hidden_dim=", config.hidden_dim, ", element_bytes=", elem_bytes); NVTE_CHECK(config.num_experts % config.ep_size == 0, "num_experts (", config.num_experts, @@ -218,7 +218,7 @@ void EPBackend::init(ncclComm_t ep_comm, NVTEEpGroupConfig group_config) { cfg.algorithm = NCCL_EP_ALGO_HIGH_THROUGHPUT; cfg.num_experts = static_cast(group_config.num_experts); cfg.max_dispatch_tokens_per_rank = 
static_cast(group_config.max_tokens_per_rank); - const size_t elem_bytes = typeToSize(static_cast(group_config.token_dtype)); + const size_t elem_bytes = typeToSize(static_cast(group_config.max_token_dtype)); cfg.max_token_bytes = static_cast(group_config.hidden_dim * elem_bytes); cfg.rdma_buffer_size = NCCL_EP_AUTO; cfg.num_qp_per_rank = NCCL_EP_AUTO; @@ -346,10 +346,11 @@ void EPBackend::dispatch(uint64_t handle_id, void* handle_mem, const NVTETensor NVTEShape tok_shape = nvte_tensor_shape(tokens); NVTEDType tok_dtype = nvte_tensor_type(tokens); - NVTE_CHECK(tok_dtype == group_config_.token_dtype, + NVTE_CHECK(typeToSize(static_cast(tok_dtype)) <= + typeToSize(static_cast(group_config_.max_token_dtype)), "tokens dtype (", static_cast(tok_dtype), - ") does not match group token_dtype (", - static_cast(group_config_.token_dtype), ")"); + ") wider than group max_token_dtype (", + static_cast(group_config_.max_token_dtype), ")"); const size_t num_tokens = tok_shape.data[0]; const size_t hidden_dim = tok_shape.data[1]; @@ -376,10 +377,11 @@ void EPBackend::dispatch(uint64_t handle_id, void* handle_mem, const NVTETensor NVTEShape recv_shape = nvte_tensor_shape(recv_tokens); NVTEDType recv_dtype = nvte_tensor_type(recv_tokens); - NVTE_CHECK(recv_dtype == group_config_.token_dtype, + NVTE_CHECK(typeToSize(static_cast(recv_dtype)) <= + typeToSize(static_cast(group_config_.max_token_dtype)), "recv_tokens dtype (", static_cast(recv_dtype), - ") does not match group token_dtype (", - static_cast(group_config_.token_dtype), ")"); + ") wider than group max_token_dtype (", + static_cast(group_config_.max_token_dtype), ")"); size_t recv_sizes[2] = {recv_shape.data[0], recv_shape.data[1]}; ncclEpTensor_t nccl_tokens_out = make_payload_tensor(recv_tokens, recv_tokens_win, 2, diff --git a/transformer_engine/common/include/transformer_engine/ep.h b/transformer_engine/common/include/transformer_engine/ep.h index a3a306a5bc..d426f8845a 100644 --- 
a/transformer_engine/common/include/transformer_engine/ep.h +++ b/transformer_engine/common/include/transformer_engine/ep.h @@ -35,9 +35,10 @@ typedef struct { int max_num_sms; /*!< Max SMs for EP kernels. 0 = auto. */ /*! 0 (default): throw on relocated handle_mem for a cached handle_id. 1: silently rebuild. */ int allow_handle_mem_reloc; - /*! Token dtype for this EP group. Sizes NCCL EP staging buffers at group - * create and is enforced against tensors passed to nvte_ep_dispatch. */ - NVTEDType token_dtype; + /*! Widest token dtype the group will dispatch. Sizes NCCL EP staging buffers + * at group create. Tensors passed to nvte_ep_dispatch may use any dtype whose + * element size is <= sizeof(max_token_dtype). */ + NVTEDType max_token_dtype; } NVTEEpGroupConfig; /*! \brief Per-layer EP configuration. */ From 4b39d0b1cb33cd4e040f7c59134187d1df8f1e70 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 16:29:04 -0700 Subject: [PATCH 15/21] tests: parameterize FullForwardBackward over bf16, fp16, fp32 Signed-off-by: Phuong Nguyen --- tests/cpp_distributed/test_ep.cu | 198 +++++++++++++++---------- tests/cpp_distributed/test_ep_common.h | 1 + 2 files changed, 118 insertions(+), 81 deletions(-) diff --git a/tests/cpp_distributed/test_ep.cu b/tests/cpp_distributed/test_ep.cu index fa267040dd..aff8d04a9e 100644 --- a/tests/cpp_distributed/test_ep.cu +++ b/tests/cpp_distributed/test_ep.cu @@ -39,10 +39,11 @@ static inline float token_value(int rank, int t, int num_tokens) { return static_cast(rank * num_tokens + t + 1) * (1.0f / 256.0f); } -static std::vector generate_tokens(int rank, int num_tokens, int hidden_dim) { - std::vector v(num_tokens * hidden_dim); +template +static std::vector generate_tokens(int rank, int num_tokens, int hidden_dim) { + std::vector v(num_tokens * hidden_dim); for (int t = 0; t < num_tokens; ++t) { - nv_bfloat16 val = __float2bfloat16(token_value(rank, t, num_tokens)); + T val = tok_from_float(token_value(rank, t, 
num_tokens)); for (int h = 0; h < hidden_dim; ++h) v[t * hidden_dim + h] = val; } @@ -93,11 +94,22 @@ static float bf16_tol(float magnitude) { return kBf16Atol + kBf16Rtol * std::fabs(magnitude); } -static bool check_no_nan_inf(const nv_bfloat16* dev, int count, const char* name) { - std::vector h(count); - cudaMemcpy(h.data(), dev, count * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost); +// Per-element host-side conversion helpers used by templated test code. +inline float tok_to_float(nv_bfloat16 v) { return __bfloat162float(v); } +inline float tok_to_float(__half v) { return __half2float(v); } +inline float tok_to_float(float v) { return v; } + +template T tok_from_float(float v); +template <> inline nv_bfloat16 tok_from_float(float v) { return __float2bfloat16(v); } +template <> inline __half tok_from_float<__half> (float v) { return __float2half(v); } +template <> inline float tok_from_float (float v) { return v; } + +template +static bool check_no_nan_inf(const T* dev, int count, const char* name) { + std::vector h(count); + cudaMemcpy(h.data(), dev, count * sizeof(T), cudaMemcpyDeviceToHost); for (int i = 0; i < count; ++i) { - float v = __bfloat162float(h[i]); + float v = tok_to_float(h[i]); if (std::isnan(v) || std::isinf(v)) { fprintf(stderr, "Rank %d: %s in %s[%d]\n", g_process_id, std::isnan(v) ? 
"NaN" : "Inf", name, i); @@ -109,20 +121,21 @@ static bool check_no_nan_inf(const nv_bfloat16* dev, int count, const char* name // ── Forward buffer set with RAII ────────────────────────────────────────────── +template struct EPBuffers { // Forward DevBuf topk_idx; DevBuf topk_weights; - DevBuf tokens; + DevBuf tokens; DevBuf token_counts; DevBuf handle_mem; - DevBuf recv_tokens; + DevBuf recv_tokens; DevBuf recv_topk_weights; - DevBuf result; + DevBuf result; // Backward - DevBuf grad_result; - DevBuf grad_expert; - DevBuf grad_tokens; + DevBuf grad_result; + DevBuf grad_expert; + DevBuf grad_tokens; DevBuf g_recv_topk_weights; DevBuf grad_topk_weights; @@ -158,14 +171,16 @@ struct EPBuffers { // Bundled NVTETensor views over an EPBuffers, with the shapes the EP C API // expects. +template struct EPTensors { TensorWrapper topk_idx, topk_weights, token_counts, handle_mem, tokens; TensorWrapper recv_tokens, recv_topk_weights, result; TensorWrapper grad_result, grad_expert, grad_tokens; TensorWrapper g_recv_topk_weights, grad_topk_weights; - EPTensors(EPBuffers& b, int num_tokens, int top_k, int hidden_dim, + EPTensors(EPBuffers& b, int num_tokens, int top_k, int hidden_dim, int num_local_experts) { + constexpr DType kTokDType = test::TypeInfo::dtype; topk_idx = TensorWrapper(b.topk_idx.get(), {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); topk_weights = TensorWrapper(b.topk_weights.get(), @@ -175,19 +190,19 @@ struct EPTensors { handle_mem = TensorWrapper(b.handle_mem.get(), {b.handle_mem_size}, DType::kByte); tokens = TensorWrapper(b.tokens.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, DType::kBFloat16); + {(size_t)num_tokens, (size_t)hidden_dim}, kTokDType); recv_tokens = TensorWrapper(b.recv_tokens.get(), - {b.recv_capacity, (size_t)hidden_dim}, DType::kBFloat16); + {b.recv_capacity, (size_t)hidden_dim}, kTokDType); recv_topk_weights = TensorWrapper(b.recv_topk_weights.get(), {b.recv_capacity}, DType::kFloat32); result = TensorWrapper(b.result.get(), 
- {(size_t)num_tokens, (size_t)hidden_dim}, DType::kBFloat16); + {(size_t)num_tokens, (size_t)hidden_dim}, kTokDType); grad_result = TensorWrapper(b.grad_result.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, DType::kBFloat16); + {(size_t)num_tokens, (size_t)hidden_dim}, kTokDType); grad_expert = TensorWrapper(b.grad_expert.get(), - {b.recv_capacity, (size_t)hidden_dim}, DType::kBFloat16); + {b.recv_capacity, (size_t)hidden_dim}, kTokDType); grad_tokens = TensorWrapper(b.grad_tokens.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, DType::kBFloat16); + {(size_t)num_tokens, (size_t)hidden_dim}, kTokDType); g_recv_topk_weights = TensorWrapper(b.g_recv_topk_weights.get(), {b.recv_capacity}, DType::kFloat32); grad_topk_weights = TensorWrapper(b.grad_topk_weights.get(), @@ -217,19 +232,20 @@ class EpOpTestBase : public ::testing::Test { num_tokens_ = 32; } - void upload_inputs(EPBuffers& buf, int rank = -1) { + template + void upload_inputs(EPBuffers& buf, int rank = -1) { if (rank < 0) rank = g_process_id; auto h_idx = routing_balanced(rank, num_tokens_, top_k_, num_experts_, num_local_experts_); std::vector h_w(num_tokens_ * top_k_, 1.0f / top_k_); - auto h_tok = generate_tokens(rank, num_tokens_, hidden_dim_); + auto h_tok = generate_tokens(rank, num_tokens_, hidden_dim_); NVTE_CHECK_CUDA(cudaMemcpy(buf.topk_idx.get(), h_idx.data(), - h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); + h_idx.size() * sizeof(int64_t), cudaMemcpyHostToDevice)); NVTE_CHECK_CUDA(cudaMemcpy(buf.topk_weights.get(), h_w.data(), - h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); + h_w.size() * sizeof(float), cudaMemcpyHostToDevice)); NVTE_CHECK_CUDA(cudaMemcpy(buf.tokens.get(), h_tok.data(), - h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); + h_tok.size() * sizeof(T), cudaMemcpyHostToDevice)); } NVTEEpLayerConfig layer_config(size_t alignment = 0) const { @@ -237,7 +253,8 @@ class EpOpTestBase : public ::testing::Test { } // NVTE_CHECK_CUDA (fprintf+exit) so 
this non-void helper stays legal. - int read_total_recv(const EPBuffers& buf) const { + template + int read_total_recv(const EPBuffers& buf) const { std::vector cnt(num_local_experts_); NVTE_CHECK_CUDA(cudaMemcpy(cnt.data(), buf.token_counts.get(), num_local_experts_ * sizeof(int32_t), cudaMemcpyDeviceToHost)); @@ -254,11 +271,11 @@ class EpOpTestBase : public ::testing::Test { class EPDispatchTest : public EpOpTestBase {}; TEST_F(EPDispatchTest, PrepareAndDispatch) { - EPBuffers buf; + EPBuffers<> buf; buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, ep_size_, max_tokens_per_rank_); upload_inputs(buf); - EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + EPTensors<> t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); NVTE_CHECK_CUDA(cudaMemset(buf.recv_tokens.get(), 0, buf.recv_tokens.bytes())); @@ -333,11 +350,11 @@ TEST_F(EPDispatchTest, PrepareAndDispatch) { class EPCombineTest : public EpOpTestBase {}; TEST_F(EPCombineTest, Combine) { - EPBuffers buf; + EPBuffers<> buf; buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, ep_size_, max_tokens_per_rank_); upload_inputs(buf); - EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + EPTensors<> t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); cudaStream_t stream; NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); @@ -378,11 +395,11 @@ TEST_F(EPCombineTest, Combine) { class EPCombineBwdTest : public EpOpTestBase {}; TEST_F(EPCombineBwdTest, CombineBwdCheck) { - EPBuffers buf; + EPBuffers<> buf; buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, ep_size_, max_tokens_per_rank_); upload_inputs(buf); - EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + EPTensors<> t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); cudaStream_t stream; NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); @@ -445,11 +462,11 @@ TEST_F(EPCombineBwdTest, CombineBwdCheck) { class EPDispatchBwdTest : public EpOpTestBase 
{}; TEST_F(EPDispatchBwdTest, DispatchBwdCheck) { - EPBuffers buf; + EPBuffers<> buf; buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, ep_size_, max_tokens_per_rank_); upload_inputs(buf); - EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + EPTensors<> t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); cudaStream_t stream; NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); @@ -501,11 +518,11 @@ TEST_F(EPDispatchBwdTest, DispatchBwdCheck) { class EPDispatchBwdGradWeightsTest : public EpOpTestBase {}; TEST_F(EPDispatchBwdGradWeightsTest, RoundTrip) { - EPBuffers buf; + EPBuffers<> buf; buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, ep_size_, max_tokens_per_rank_); upload_inputs(buf); - EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + EPTensors<> t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); // Distinct per-(rank, t, k) weights so each slot carries a unique value. std::vector h_w(num_tokens_ * top_k_); @@ -578,50 +595,69 @@ TEST_F(EPDispatchBwdGradWeightsTest, RoundTrip) { // Integrated FwdBwd: NaN/Inf check end-to-end. 
// ============================================================================= -class EPPipelineTest : public EpOpTestBase {}; - -TEST_F(EPPipelineTest, FullForwardBackward) { - EPBuffers buf; - buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, - ep_size_, max_tokens_per_rank_); - upload_inputs(buf); - EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); - - cudaStream_t stream; - NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); - - uint64_t handle_id = buf.handle_id; - ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), t.token_counts.data(), /*alignment=*/0, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), - t.tokens.data(), NVTECommWindow{}, t.topk_weights.data(), - NVTECommWindow{}, t.recv_tokens.data(), NVTECommWindow{}, - t.recv_topk_weights.data(), NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.recv_tokens.data(), NVTECommWindow{}, - t.result.data(), stream)); - - std::vector h_grad(num_tokens_ * hidden_dim_, __float2bfloat16(0.1f)); - NVTE_CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad.data(), - h_grad.size() * sizeof(nv_bfloat16), - cudaMemcpyHostToDevice, stream)); - NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); - NVTE_CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); - NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); - - ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_result.data(), NVTECommWindow{}, - t.grad_expert.data(), NVTECommWindow{}, stream)); - ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_expert.data(), NVTECommWindow{}, - t.g_recv_topk_weights.data(), NVTECommWindow{}, - t.grad_tokens.data(), 
t.grad_topk_weights.data(), stream)); - NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); - - ASSERT_TRUE(check_no_nan_inf(buf.result.get(), num_tokens_ * hidden_dim_, "result")); - ASSERT_TRUE(check_no_nan_inf(buf.grad_tokens.get(), num_tokens_ * hidden_dim_, "grad_tokens")); - - if (g_process_id == 0) printf(" FullForwardBackward: passed\n"); +class EPPipelineTest : public EpOpTestBase, public ::testing::WithParamInterface { + protected: + template + void run_full_forward_backward() { + EPBuffers buf; + buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, + ep_size_, max_tokens_per_rank_); + upload_inputs(buf); + EPTensors t(buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + + cudaStream_t stream; + NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); + + uint64_t handle_id = buf.handle_id; + ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), t.token_counts.data(), /*alignment=*/0, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.topk_idx.data(), + t.tokens.data(), NVTECommWindow{}, t.topk_weights.data(), + NVTECommWindow{}, t.recv_tokens.data(), NVTECommWindow{}, + t.recv_topk_weights.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_combine(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.recv_tokens.data(), NVTECommWindow{}, + t.result.data(), stream)); + + std::vector h_grad(num_tokens_ * hidden_dim_, tok_from_float(0.1f)); + NVTE_CHECK_CUDA(cudaMemcpyAsync(buf.grad_result.get(), h_grad.data(), + h_grad.size() * sizeof(Tok), + cudaMemcpyHostToDevice, stream)); + NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_expert.get(), 0, buf.grad_expert.bytes(), stream)); + NVTE_CHECK_CUDA(cudaMemsetAsync(buf.g_recv_topk_weights.get(), 0, buf.g_recv_topk_weights.bytes(), stream)); + NVTE_CHECK_CUDA(cudaMemsetAsync(buf.grad_topk_weights.get(), 0, buf.grad_topk_weights.bytes(), stream)); + + ASSERT_NO_THROW(nvte_ep_combine_bwd(NVTEEpHandle{handle_id, 
t.handle_mem.data()}, t.grad_result.data(), NVTECommWindow{}, + t.grad_expert.data(), NVTECommWindow{}, stream)); + ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_expert.data(), NVTECommWindow{}, + t.g_recv_topk_weights.data(), NVTECommWindow{}, + t.grad_tokens.data(), t.grad_topk_weights.data(), stream)); + NVTE_CHECK_CUDA(cudaStreamSynchronize(stream)); + + ASSERT_TRUE(check_no_nan_inf(buf.result.get(), num_tokens_ * hidden_dim_, "result")); + ASSERT_TRUE(check_no_nan_inf(buf.grad_tokens.get(), num_tokens_ * hidden_dim_, "grad_tokens")); + + NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); + } +}; - NVTE_CHECK_CUDA(cudaStreamDestroy(stream)); +TEST_P(EPPipelineTest, FullForwardBackward) { + const DType dtype = GetParam(); + switch (dtype) { + case DType::kBFloat16: run_full_forward_backward(); break; + case DType::kFloat16: run_full_forward_backward<__half> (); break; + case DType::kFloat32: run_full_forward_backward (); break; + default: FAIL() << "unsupported token dtype " << static_cast(dtype); + } + if (g_process_id == 0) + printf(" FullForwardBackward[%s]: passed\n", test::typeName(dtype).c_str()); } +INSTANTIATE_TEST_SUITE_P( + Dtypes, EPPipelineTest, + ::testing::Values(DType::kBFloat16, DType::kFloat16, DType::kFloat32), + [](const ::testing::TestParamInfo& info) { + return test::typeName(info.param); + }); + // ============================================================================= // EPZeroCopyTest: dispatch/combine with NCCL symmetric-memory windows attached // to payload tensors (zero-copy fast path via ncclEpTensorCreateFromWindow). @@ -671,11 +707,11 @@ class EPZeroCopyTest : public EpOpTestBase {}; // vs HBM reference (same routing, same input). TEST_F(EPZeroCopyTest, IdentityAllSymm) { // HBM reference run. 
- EPBuffers ref_buf; + EPBuffers<> ref_buf; ref_buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, ep_size_, max_tokens_per_rank_); upload_inputs(ref_buf); - EPTensors ref_t(ref_buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + EPTensors<> ref_t(ref_buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); cudaStream_t stream; NVTE_CHECK_CUDA(cudaStreamCreate(&stream)); @@ -698,7 +734,7 @@ TEST_F(EPZeroCopyTest, IdentityAllSymm) { ref_result.size() * sizeof(nv_bfloat16), cudaMemcpyDeviceToHost)); // Symm-mem run: tokens, recv_tokens, combine_input (== recv_tokens) all symm. - EPBuffers sym_buf; // alloc all buffers except the symm ones. + EPBuffers<> sym_buf; // alloc all buffers except the symm ones. sym_buf.alloc(num_tokens_, top_k_, hidden_dim_, num_local_experts_, ep_size_, max_tokens_per_rank_); upload_inputs(sym_buf); @@ -712,7 +748,7 @@ TEST_F(EPZeroCopyTest, IdentityAllSymm) { NVTE_CHECK_CUDA(cudaMemcpy(sym_tokens.ptr, h_tok.data(), h_tok.size() * sizeof(nv_bfloat16), cudaMemcpyHostToDevice)); - EPTensors sym_t(sym_buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); + EPTensors<> sym_t(sym_buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); // Replace the tokens/recv_tokens views with ones pointing at the symm buffers. 
sym_t.tokens = TensorWrapper(sym_tokens.ptr, {(size_t)num_tokens_, (size_t)hidden_dim_}, DType::kBFloat16); diff --git a/tests/cpp_distributed/test_ep_common.h b/tests/cpp_distributed/test_ep_common.h index 66f11e92fb..971375e131 100644 --- a/tests/cpp_distributed/test_ep_common.h +++ b/tests/cpp_distributed/test_ep_common.h @@ -13,6 +13,7 @@ #include #include +#include #include #include #include From edf871dda217febb922b7b4c43f8abd924c47f37 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 16:30:51 -0700 Subject: [PATCH 16/21] cmake: drop NO_CMAKE_SYSTEM_PATH on TE_LIB lookup and order nvrtc after TE_LIB to remove --no-as-needed Signed-off-by: Phuong Nguyen --- tests/cpp_distributed/CMakeLists.txt | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/cpp_distributed/CMakeLists.txt b/tests/cpp_distributed/CMakeLists.txt index 31c5eeb064..c68934e6e6 100644 --- a/tests/cpp_distributed/CMakeLists.txt +++ b/tests/cpp_distributed/CMakeLists.txt @@ -30,7 +30,7 @@ if(NOT DEFINED TE_LIB_PATH) get_filename_component(TE_LIB_PATH ${TE_LIB_FILE} DIRECTORY) endif() -find_library(TE_LIB NAMES transformer_engine PATHS "${TE_LIB_PATH}/.." ${TE_LIB_PATH} ENV TE_LIB_PATH REQUIRED NO_CMAKE_SYSTEM_PATH) +find_library(TE_LIB NAMES transformer_engine PATHS "${TE_LIB_PATH}/.." ${TE_LIB_PATH} ENV TE_LIB_PATH REQUIRED) message(STATUS "Found transformer_engine library: ${TE_LIB}") include_directories(../../transformer_engine/common/include) @@ -107,25 +107,21 @@ set(EP_TEST_COMMON_INCLUDES ../../transformer_engine/common ${CMAKE_CURRENT_SOURCE_DIR}) +# Link nvrtc after TE_LIB: libtransformer_engine.so references nvrtc symbols but +# omits nvrtc from DT_NEEDED (Python loads it via dlopen), so tests must supply it.
set(EP_TEST_COMMON_LIBS CUDA::cuda_driver CUDA::cudart - CUDA::nvrtc GTest::gtest ${TE_LIB} + CUDA::nvrtc ${NCCL_LIB} ${NCCL_EP_LIB}) -# nvrtc symbols are referenced from libtransformer_engine.so but not in its -# DT_NEEDED list (loaded via dlopen in Python). For cpp tests we link nvrtc -# explicitly with --no-as-needed so the linker keeps the dependency. -set(EP_TEST_LINK_OPTS "LINKER:--no-as-needed") - # ── EP distributed tests (per-op + full pipeline + zero-copy symm) ─────────── add_executable(test_ep test_ep.cu ../cpp/test_common.cu) target_include_directories(test_ep PRIVATE ${EP_TEST_COMMON_INCLUDES}) target_link_libraries(test_ep PUBLIC ${EP_TEST_COMMON_LIBS}) -target_link_options(test_ep PUBLIC ${EP_TEST_LINK_OPTS}) # Do NOT use gtest_discover_tests — these binaries require multi-process # launch via run_test_ep.sh, not direct single-process execution. From c596afa1c879aea1f3ea23df9219c1dc4cc81d40 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 16:33:15 -0700 Subject: [PATCH 17/21] tests: use MPI for EP distributed tests (bootstrap, build, run script) Signed-off-by: Phuong Nguyen --- tests/cpp_distributed/CMakeLists.txt | 10 +- tests/cpp_distributed/run_test_ep.sh | 121 ++++--------------------- tests/cpp_distributed/test_ep_common.h | 74 ++++----------- 3 files changed, 45 insertions(+), 160 deletions(-) diff --git a/tests/cpp_distributed/CMakeLists.txt b/tests/cpp_distributed/CMakeLists.txt index c68934e6e6..fa7eb4a7f1 100644 --- a/tests/cpp_distributed/CMakeLists.txt +++ b/tests/cpp_distributed/CMakeLists.txt @@ -73,10 +73,8 @@ target_link_libraries(test_comm_gemm PUBLIC CUDA::cuda_driver CUDA::cudart GTest include(GoogleTest) gtest_discover_tests(test_comm_gemm DISCOVERY_TIMEOUT 600) -# ── EP distributed tests (HT mode) ───────────────────────────────────────── -# No MPI dependency — processes are spawned by run_test_ep.sh with -# --rank / --nranks flags. 
ncclUniqueId exchange uses a -# shared temp file (see test_ep_common.h for details). +# ── EP distributed tests ────────────────────────────────────────────────────── +# Launched via mpirun; ncclUniqueId exchange uses MPI_Bcast (see test_ep_common.h). # Headers + libs come from the in-tree 3rdparty/nccl submodule build. set(NCCL_EP_SUBMODULE_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/nccl") @@ -103,6 +101,7 @@ endif() set(EP_TEST_COMMON_INCLUDES ${EP_TEST_NCCL_INCLUDES} + ${MPI_CXX_INCLUDE_PATH} ../../transformer_engine/common/include ../../transformer_engine/common ${CMAKE_CURRENT_SOURCE_DIR}) @@ -116,7 +115,8 @@ set(EP_TEST_COMMON_LIBS ${TE_LIB} CUDA::nvrtc ${NCCL_LIB} - ${NCCL_EP_LIB}) + ${NCCL_EP_LIB} + MPI::MPI_CXX) # ── EP distributed tests (per-op + full pipeline + zero-copy symm) ─────────── add_executable(test_ep test_ep.cu ../cpp/test_common.cu) diff --git a/tests/cpp_distributed/run_test_ep.sh b/tests/cpp_distributed/run_test_ep.sh index 8c22edd389..13e86fa02d 100755 --- a/tests/cpp_distributed/run_test_ep.sh +++ b/tests/cpp_distributed/run_test_ep.sh @@ -3,12 +3,8 @@ # # See LICENSE for license information. # -# Run TE EP distributed unit tests across multiple GPUs. -# -# Spawns one background bash process per GPU (no MPI dependency), matching the -# JAX multi-process launcher style. ncclUniqueId is exchanged via a shared -# temp file (see test_ep_common.h). Each rank builds its own ncclComm_t and -# passes it to nvte_ep_initialize. +# Run TE EP distributed unit tests via mpirun. Each MPI rank pins to one GPU +# (rank % device_count) and exchanges ncclUniqueId through MPI_Bcast. 
# # Usage: # bash run_test_ep.sh [num_gpus] [build_dir] @@ -18,15 +14,16 @@ # build_dir = /build # # Environment variables: -# GTEST_FILTER — forwarded to all processes (e.g., "EPDispatchTest.*") -# TEST_TIMEOUT_S — per-process timeout in seconds (default: 180) +# GTEST_FILTER — forwarded to all processes (e.g., "EPPipelineTest.*") +# MPIRUN — override the mpirun binary (default: mpirun) +# MPIRUN_EXTRA — extra flags forwarded to mpirun set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" BUILD_DIR="${2:-${SCRIPT_DIR}/build}" NUM_GPUS="${1:-$(nvidia-smi -L 2>/dev/null | wc -l)}" -TEST_TIMEOUT_S="${TEST_TIMEOUT_S:-180}" +MPIRUN="${MPIRUN:-mpirun}" # Skip cleanly on pre-Hopper: NCCL EP requires SM>=90. MIN_SM=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null \ @@ -36,100 +33,22 @@ if (( MIN_SM > 0 && MIN_SM < 90 )); then exit 0 fi -GTEST_ARGS="${GTEST_FILTER:+--gtest_filter=${GTEST_FILTER}}" -OVERALL_FAIL=0 - -# --------------------------------------------------------------------------- -# run_suite BINARY SUITE_NAME MIN_GPUS -# --------------------------------------------------------------------------- -run_suite() { - local BINARY="$1" - local SUITE_NAME="$2" - local MIN_GPUS="${3:-2}" - - local TEST_BIN="${BUILD_DIR}/${BINARY}" - - if [[ ! -x "${TEST_BIN}" ]]; then - echo "ERROR: binary not found: ${TEST_BIN}" - echo "Build: cd ${SCRIPT_DIR} && mkdir -p build && cd build && cmake .. && make" - OVERALL_FAIL=1 - return - fi - - if (( NUM_GPUS < MIN_GPUS )); then - echo "${SUITE_NAME}: requires ${MIN_GPUS} GPUs, found ${NUM_GPUS}. Skipping." - return - fi - - local TMPDIR_L="${TMPDIR:-/tmp}" - local UID_FILE="${TMPDIR_L}/te_ep_uid_${BINARY}_$$" - rm -f "${UID_FILE}" - - local LOG_DIR - LOG_DIR=$(mktemp -d) - local FAIL=0 - - echo "=== ${SUITE_NAME} ===" - echo " GPUs: ${NUM_GPUS} Binary: ${TEST_BIN}" - echo - - # Spawn one background process per GPU. ncclUniqueId is exchanged via the - # shared UID_FILE. 
Each process is wrapped in `timeout` to detect hangs early. - local PIDS=() - for i in $(seq 0 $((NUM_GPUS - 1))); do - timeout --foreground --signal=KILL "${TEST_TIMEOUT_S}" \ - "${TEST_BIN}" \ - --rank="${i}" \ - --nranks="${NUM_GPUS}" \ - --uid-file="${UID_FILE}" \ - ${GTEST_ARGS} \ - > "${LOG_DIR}/rank_${i}.log" 2>&1 & - PIDS+=($!) - done - for i in $(seq 0 $((NUM_GPUS - 1))); do - if ! wait "${PIDS[$i]}"; then - local rc=$? - FAIL=1 - if [[ $rc -eq 137 || $rc -eq 124 ]]; then - echo " rank ${i}: TIMEOUT after ${TEST_TIMEOUT_S}s (rc=${rc})" - fi - fi - done - - echo "--- Rank 0 output ---" - cat "${LOG_DIR}/rank_0.log" - - if (( FAIL )); then - for i in $(seq 1 $((NUM_GPUS - 1))); do - echo "--- Rank ${i} output ---" - cat "${LOG_DIR}/rank_${i}.log" - done - echo "=== ${SUITE_NAME}: FAILED ===" - OVERALL_FAIL=1 - else - echo "=== ${SUITE_NAME}: ALL PASSED ===" - fi - - rm -rf "${LOG_DIR}" - rm -f "${UID_FILE}" -} +TEST_BIN="${BUILD_DIR}/test_ep" +if [[ ! -x "${TEST_BIN}" ]]; then + echo "ERROR: binary not found: ${TEST_BIN}" + echo "Build: cd ${SCRIPT_DIR} && mkdir -p build && cd build && cmake .. && make" + exit 1 +fi -# --------------------------------------------------------------------------- -# Cleanup on abort -# --------------------------------------------------------------------------- -cleanup() { rm -f "${TMPDIR:-/tmp}"/te_ep_uid_*_"$$" 2>/dev/null || true; } -trap cleanup EXIT INT TERM +if (( NUM_GPUS < 2 )); then + echo "EP Tests: requires at least 2 GPUs, found ${NUM_GPUS}. Skipping." 
+ exit 0 +fi -# --------------------------------------------------------------------------- -# Run all suites -# --------------------------------------------------------------------------- -run_suite "test_ep" "EP Tests" 2 +GTEST_ARGS="${GTEST_FILTER:+--gtest_filter=${GTEST_FILTER}}" +echo "=== EP Tests ===" +echo " GPUs: ${NUM_GPUS} Binary: ${TEST_BIN}" echo -if (( OVERALL_FAIL )); then - echo "=== SOME SUITES FAILED ===" -else - echo "=== ALL SUITES PASSED ===" -fi -exit "${OVERALL_FAIL}" +"${MPIRUN}" -n "${NUM_GPUS}" ${MPIRUN_EXTRA:-} "${TEST_BIN}" ${GTEST_ARGS} diff --git a/tests/cpp_distributed/test_ep_common.h b/tests/cpp_distributed/test_ep_common.h index 971375e131..135a39416e 100644 --- a/tests/cpp_distributed/test_ep_common.h +++ b/tests/cpp_distributed/test_ep_common.h @@ -16,14 +16,13 @@ #include #include #include +#include #include -#include #include #include #include #include -#include #include #include @@ -35,11 +34,16 @@ using transformer_engine::DType; using transformer_engine::TensorWrapper; +#define CHECK_MPI(expr) \ + do { \ + int _err_mpi = (expr); \ + NVTE_CHECK(_err_mpi == MPI_SUCCESS, "MPI error: ", _err_mpi); \ + } while (false) + // ── Process-level state ─────────────────────────────────────────────────────── static int g_process_id = -1; static int g_num_processes = -1; -static std::string g_uid_file; static int g_sm_major = -1; // set by ep_bootstrap; -1 until then static int g_ep_size = -1; @@ -86,39 +90,11 @@ static inline std::vector routing_balanced( return idx; } -// ── File-based ncclUniqueId exchange ───────────────────────────────────────── +// ── ncclUniqueId exchange via MPI ───────────────────────────────────────────── static void exchange_unique_id(ncclUniqueId* uid) { - const size_t sz = sizeof(ncclUniqueId); - - if (g_process_id == 0) { - NVTE_CHECK_NCCL(ncclGetUniqueId(uid)); - FILE* f = fopen(g_uid_file.c_str(), "wb"); - if (!f) { fprintf(stderr, "Cannot open uid file: %s\n", g_uid_file.c_str()); exit(EXIT_FAILURE); } - 
fwrite(uid, 1, sz, f); - fclose(f); - } else { - auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(60); - while (true) { - FILE* f = fopen(g_uid_file.c_str(), "rb"); - if (f) { - fseek(f, 0, SEEK_END); - if (static_cast(ftell(f)) >= sz) { - fseek(f, 0, SEEK_SET); - size_t n = fread(uid, 1, sz, f); - fclose(f); - if (n == sz) break; - } else { - fclose(f); - } - } - if (std::chrono::steady_clock::now() > deadline) { - fprintf(stderr, "Process %d: timed out waiting for uid file\n", g_process_id); - exit(EXIT_FAILURE); - } - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - } - } + if (g_process_id == 0) NVTE_CHECK_NCCL(ncclGetUniqueId(uid)); + CHECK_MPI(MPI_Bcast(uid, sizeof(*uid), MPI_BYTE, 0, MPI_COMM_WORLD)); } // ── CLI parsing ─────────────────────────────────────────────────────────────── @@ -126,33 +102,21 @@ static void exchange_unique_id(ncclUniqueId* uid) { static void ep_parse_args(int argc, char* argv[]) { for (int i = 1; i < argc; ++i) { std::string a(argv[i]); - if (a.rfind("--process-id=", 0) == 0) g_process_id = std::stoi(a.substr(13)); - else if (a.rfind("--rank=", 0) == 0) g_process_id = std::stoi(a.substr(7)); - else if (a.rfind("--num-processes=",0)==0) g_num_processes = std::stoi(a.substr(16)); - else if (a.rfind("--nranks=", 0) == 0) g_num_processes = std::stoi(a.substr(9)); - else if (a.rfind("--uid-file=", 0) == 0) g_uid_file = a.substr(11); - else if (a.rfind("--max-token-dtype=", 0) == 0) + if (a.rfind("--max-token-dtype=", 0) == 0) g_max_token_dtype = static_cast(std::stoi(a.substr(18))); } - - if (g_process_id < 0 || g_num_processes <= 0) { - fprintf(stderr, - "Usage: %s --rank=N --nranks=N [--uid-file=path] [gtest flags]\n" - " Aliases: --process-id=N, --num-processes=N\n", - argc > 0 ? 
argv[0] : "test_ep"); - exit(EXIT_FAILURE); - } - - if (g_uid_file.empty()) { - const char* t = getenv("TMPDIR"); if (!t) t = "/tmp"; - g_uid_file = std::string(t) + "/te_ep_uid_" + std::to_string(g_process_id); - } } // ── Bootstrap / teardown ────────────────────────────────────────────────────── // Returns false if the binary should exit without running tests (wrong SM, etc.). static bool ep_bootstrap(int argc, char* argv[]) { + int mpi_initialized = 0; + MPI_Initialized(&mpi_initialized); + if (!mpi_initialized) CHECK_MPI(MPI_Init(&argc, &argv)); + CHECK_MPI(MPI_Comm_rank(MPI_COMM_WORLD, &g_process_id)); + CHECK_MPI(MPI_Comm_size(MPI_COMM_WORLD, &g_num_processes)); + ep_parse_args(argc, argv); ::testing::InitGoogleTest(&argc, argv); @@ -214,5 +178,7 @@ static void ep_teardown() { } g_ep_initialized = false; } - if (g_process_id == 0) remove(g_uid_file.c_str()); + int finalized = 0; + MPI_Finalized(&finalized); + if (!finalized) MPI_Finalize(); } From 370e6a42b5a04747644293f8eb476f2c10423088 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 16:34:46 -0700 Subject: [PATCH 18/21] ep.h: add TODO note about struct versioning Signed-off-by: Phuong Nguyen --- transformer_engine/common/include/transformer_engine/ep.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transformer_engine/common/include/transformer_engine/ep.h b/transformer_engine/common/include/transformer_engine/ep.h index d426f8845a..22e7ec48ac 100644 --- a/transformer_engine/common/include/transformer_engine/ep.h +++ b/transformer_engine/common/include/transformer_engine/ep.h @@ -23,6 +23,8 @@ extern "C" { #endif /* ── Config structs ─────────────────────────────────────────────────────── */ +/* TODO: add a struct_size/version field to these configs (and align with other + * TE public structs) once a TE-wide convention for ABI versioning lands. */ /*! \brief Group-level EP configuration (fixed for the EP group lifetime). 
*/ typedef struct { From dbd1ef5e538cdcd643669734bfe883d5d6c6b01e Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 17:09:12 -0700 Subject: [PATCH 19/21] tests/cpp_distributed: fix test_ep build (helper ordering, TensorWrapper disambiguation, OpenMP link) and gate FullForwardBackward to bf16 Signed-off-by: Phuong Nguyen --- tests/cpp_distributed/CMakeLists.txt | 3 +- tests/cpp_distributed/test_ep.cu | 57 +++++++++++++++------------- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/tests/cpp_distributed/CMakeLists.txt b/tests/cpp_distributed/CMakeLists.txt index fa7eb4a7f1..7dd8ea33e7 100644 --- a/tests/cpp_distributed/CMakeLists.txt +++ b/tests/cpp_distributed/CMakeLists.txt @@ -116,7 +116,8 @@ set(EP_TEST_COMMON_LIBS CUDA::nvrtc ${NCCL_LIB} ${NCCL_EP_LIB} - MPI::MPI_CXX) + MPI::MPI_CXX + OpenMP::OpenMP_CXX) # ── EP distributed tests (per-op + full pipeline + zero-copy symm) ─────────── add_executable(test_ep test_ep.cu ../cpp/test_common.cu) diff --git a/tests/cpp_distributed/test_ep.cu b/tests/cpp_distributed/test_ep.cu index aff8d04a9e..f46279b5f0 100644 --- a/tests/cpp_distributed/test_ep.cu +++ b/tests/cpp_distributed/test_ep.cu @@ -39,6 +39,16 @@ static inline float token_value(int rank, int t, int num_tokens) { return static_cast(rank * num_tokens + t + 1) * (1.0f / 256.0f); } +// Per-element host-side conversion helpers used by templated test code. 
+inline float tok_to_float(nv_bfloat16 v) { return __bfloat162float(v); } +inline float tok_to_float(__half v) { return __half2float(v); } +inline float tok_to_float(float v) { return v; } + +template T tok_from_float(float v); +template <> inline nv_bfloat16 tok_from_float(float v) { return __float2bfloat16(v); } +template <> inline __half tok_from_float<__half> (float v) { return __float2half(v); } +template <> inline float tok_from_float (float v) { return v; } + template static std::vector generate_tokens(int rank, int num_tokens, int hidden_dim) { std::vector v(num_tokens * hidden_dim); @@ -94,16 +104,6 @@ static float bf16_tol(float magnitude) { return kBf16Atol + kBf16Rtol * std::fabs(magnitude); } -// Per-element host-side conversion helpers used by templated test code. -inline float tok_to_float(nv_bfloat16 v) { return __bfloat162float(v); } -inline float tok_to_float(__half v) { return __half2float(v); } -inline float tok_to_float(float v) { return v; } - -template T tok_from_float(float v); -template <> inline nv_bfloat16 tok_from_float(float v) { return __float2bfloat16(v); } -template <> inline __half tok_from_float<__half> (float v) { return __float2half(v); } -template <> inline float tok_from_float (float v) { return v; } - template static bool check_no_nan_inf(const T* dev, int count, const char* name) { std::vector h(count); @@ -181,32 +181,33 @@ struct EPTensors { EPTensors(EPBuffers& b, int num_tokens, int top_k, int hidden_dim, int num_local_experts) { constexpr DType kTokDType = test::TypeInfo::dtype; + using Shape = std::vector; topk_idx = TensorWrapper(b.topk_idx.get(), - {(size_t)num_tokens, (size_t)top_k}, DType::kInt64); + Shape{(size_t)num_tokens, (size_t)top_k}, DType::kInt64); topk_weights = TensorWrapper(b.topk_weights.get(), - {(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); + Shape{(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); token_counts = TensorWrapper(b.token_counts.get(), - {(size_t)num_local_experts}, 
DType::kInt32); + Shape{(size_t)num_local_experts}, DType::kInt32); handle_mem = TensorWrapper(b.handle_mem.get(), - {b.handle_mem_size}, DType::kByte); + Shape{b.handle_mem_size}, DType::kByte); tokens = TensorWrapper(b.tokens.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, kTokDType); + Shape{(size_t)num_tokens, (size_t)hidden_dim}, kTokDType); recv_tokens = TensorWrapper(b.recv_tokens.get(), - {b.recv_capacity, (size_t)hidden_dim}, kTokDType); + Shape{b.recv_capacity, (size_t)hidden_dim}, kTokDType); recv_topk_weights = TensorWrapper(b.recv_topk_weights.get(), - {b.recv_capacity}, DType::kFloat32); + Shape{b.recv_capacity}, DType::kFloat32); result = TensorWrapper(b.result.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, kTokDType); + Shape{(size_t)num_tokens, (size_t)hidden_dim}, kTokDType); grad_result = TensorWrapper(b.grad_result.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, kTokDType); + Shape{(size_t)num_tokens, (size_t)hidden_dim}, kTokDType); grad_expert = TensorWrapper(b.grad_expert.get(), - {b.recv_capacity, (size_t)hidden_dim}, kTokDType); + Shape{b.recv_capacity, (size_t)hidden_dim}, kTokDType); grad_tokens = TensorWrapper(b.grad_tokens.get(), - {(size_t)num_tokens, (size_t)hidden_dim}, kTokDType); + Shape{(size_t)num_tokens, (size_t)hidden_dim}, kTokDType); g_recv_topk_weights = TensorWrapper(b.g_recv_topk_weights.get(), - {b.recv_capacity}, DType::kFloat32); + Shape{b.recv_capacity}, DType::kFloat32); grad_topk_weights = TensorWrapper(b.grad_topk_weights.get(), - {(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); + Shape{(size_t)num_tokens, (size_t)top_k}, DType::kFloat32); } }; @@ -555,7 +556,7 @@ TEST_F(EPDispatchBwdGradWeightsTest, RoundTrip) { // g_recv_topk_weights := recv_topk_weights (the round-trip input). 
auto g_recv_t = TensorWrapper(buf.recv_topk_weights.get(), - {buf.recv_capacity}, DType::kFloat32); + std::vector{buf.recv_capacity}, DType::kFloat32); ASSERT_NO_THROW(nvte_ep_dispatch_bwd(NVTEEpHandle{handle_id, t.handle_mem.data()}, t.grad_expert.data(), NVTECommWindow{}, g_recv_t.data(), NVTECommWindow{}, t.grad_tokens.data(), t.grad_topk_weights.data(), stream)); @@ -651,9 +652,11 @@ TEST_P(EPPipelineTest, FullForwardBackward) { printf(" FullForwardBackward[%s]: passed\n", test::typeName(dtype).c_str()); } +// NCCL EP backend currently asserts ncclBfloat16 at dispatch; FP16/FP32 are +// reserved for when the backend adds support. INSTANTIATE_TEST_SUITE_P( Dtypes, EPPipelineTest, - ::testing::Values(DType::kBFloat16, DType::kFloat16, DType::kFloat32), + ::testing::Values(DType::kBFloat16), [](const ::testing::TestParamInfo& info) { return test::typeName(info.param); }); @@ -751,9 +754,9 @@ TEST_F(EPZeroCopyTest, IdentityAllSymm) { EPTensors<> sym_t(sym_buf, num_tokens_, top_k_, hidden_dim_, num_local_experts_); // Replace the tokens/recv_tokens views with ones pointing at the symm buffers. 
sym_t.tokens = TensorWrapper(sym_tokens.ptr, - {(size_t)num_tokens_, (size_t)hidden_dim_}, DType::kBFloat16); + std::vector{(size_t)num_tokens_, (size_t)hidden_dim_}, DType::kBFloat16); sym_t.recv_tokens = TensorWrapper(sym_recv.ptr, - {sym_buf.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); + std::vector{sym_buf.recv_capacity, (size_t)hidden_dim_}, DType::kBFloat16); uint64_t sym_hid = sym_buf.handle_id; ASSERT_NO_THROW(nvte_ep_prepare(NVTEEpHandle{sym_hid, sym_t.handle_mem.data()}, sym_t.topk_idx.data(), sym_t.token_counts.data(), /*alignment=*/0, stream)); From 28566740aa30256782c96dd8fcd6eb3dec916994 Mon Sep 17 00:00:00 2001 From: Phuong Nguyen Date: Thu, 28 May 2026 17:11:01 -0700 Subject: [PATCH 20/21] tests/cpp_distributed: skip FP16/FP32 FullForwardBackward at runtime instead of dropping them from the parameter list Signed-off-by: Phuong Nguyen --- tests/cpp_distributed/test_ep.cu | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/cpp_distributed/test_ep.cu b/tests/cpp_distributed/test_ep.cu index f46279b5f0..bcf4ca3c98 100644 --- a/tests/cpp_distributed/test_ep.cu +++ b/tests/cpp_distributed/test_ep.cu @@ -642,6 +642,11 @@ class EPPipelineTest : public EpOpTestBase, public ::testing::WithParamInterface TEST_P(EPPipelineTest, FullForwardBackward) { const DType dtype = GetParam(); + // NCCL EP backend currently asserts ncclBfloat16 in ncclEpDispatch + // (contrib/nccl_ep/nccl_ep.cc); skip FP16/FP32 until the backend supports them. 
+ if (dtype != DType::kBFloat16) { + GTEST_SKIP() << test::typeName(dtype) << " not yet supported by NCCL EP backend"; + } switch (dtype) { case DType::kBFloat16: run_full_forward_backward(); break; case DType::kFloat16: run_full_forward_backward<__half> (); break; @@ -652,11 +657,9 @@ TEST_P(EPPipelineTest, FullForwardBackward) { printf(" FullForwardBackward[%s]: passed\n", test::typeName(dtype).c_str()); } -// NCCL EP backend currently asserts ncclBfloat16 at dispatch; FP16/FP32 are -// reserved for when the backend adds support. INSTANTIATE_TEST_SUITE_P( Dtypes, EPPipelineTest, - ::testing::Values(DType::kBFloat16), + ::testing::Values(DType::kBFloat16, DType::kFloat16, DType::kFloat32), [](const ::testing::TestParamInfo& info) { return test::typeName(info.param); }); From 1e74f993c5a48ad8ea5da48a69572e982253169b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 29 May 2026 00:13:43 +0000 Subject: [PATCH 21/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/ep/ep_backend.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/transformer_engine/common/ep/ep_backend.cpp b/transformer_engine/common/ep/ep_backend.cpp index 5e11db6f48..a5ae99b089 100644 --- a/transformer_engine/common/ep/ep_backend.cpp +++ b/transformer_engine/common/ep/ep_backend.cpp @@ -348,8 +348,7 @@ void EPBackend::dispatch(uint64_t handle_id, void* handle_mem, const NVTETensor NVTEDType tok_dtype = nvte_tensor_type(tokens); NVTE_CHECK(typeToSize(static_cast(tok_dtype)) <= typeToSize(static_cast(group_config_.max_token_dtype)), - "tokens dtype (", static_cast(tok_dtype), - ") wider than group max_token_dtype (", + "tokens dtype (", static_cast(tok_dtype), ") wider than group max_token_dtype (", static_cast(group_config_.max_token_dtype), ")"); const size_t num_tokens = tok_shape.data[0];