From 499c93086228d3edca7dc57773c507cb42a6abb8 Mon Sep 17 00:00:00 2001
From: Feodor Kichatov <sozforex@gmail.com>
Date: Tue, 26 May 2026 10:19:50 +0200
Subject: [PATCH 1/2] hip/rocm support, with multiprocessing-safe sweeps

---
 build.sh               | 289 ++++++++++++++++++++++++++++++++++-------
 pufferlib/pufferl.py   |  53 +++++---
 pufferlib/sweep.py     |  94 +++++++-------
 src/bindings.cu        |  43 ++----
 src/curriculum.cu      |  18 ++-
 src/ocean.cu           |   6 +
 src/pufferlib.cu       | 137 ++++++++++++++++---
 src/rocm_cuda_shim.cpp |  57 ++++++++
 src/vecenv.h           |   6 +-
 9 files changed, 533 insertions(+), 170 deletions(-)
 create mode 100644 src/rocm_cuda_shim.cpp

diff --git a/build.sh b/build.sh
index 99a898fc6f..3fa248603a 100755
--- a/build.sh
+++ b/build.sh
@@ -10,10 +10,12 @@ set -e
 #   ./build.sh breakout --fast       # Standalone executable (optimized)
 #   ./build.sh breakout --web        # Emscripten web build
 #   ./build.sh breakout --profile    # Kernel profiling binary
+#   ./build.sh breakout --rocm       # HIP/ROCm training backend
+#   ./build.sh breakout --cuda       # CUDA training backend
 #   ./build.sh all                   # Build all envs with default and --float
 
 if [ -z "$1" ]; then
-    echo "Usage: ./build.sh ENV_NAME [--float] [--debug] [--local|--fast|--web|--profile|--cpu|--all]"
+    echo "Usage: ./build.sh ENV_NAME [--float] [--debug] [--cuda|--rocm] [--local|--fast|--web|--profile|--cpu|--all]"
     exit 1
 fi
 ENV=$1
@@ -28,6 +30,8 @@ for arg in "$@"; do
         --web)   MODE=web ;;
         --profile) MODE=profile ;;
         --cpu)   MODE=cpu; PRECISION="-DPRECISION_FLOAT" ;;
+        --cuda)  BACKEND=cuda ;;
+        --rocm)  BACKEND=rocm ;;
         *) echo "Error: unknown argument '$arg'" && exit 1 ;;
     esac
 done
@@ -205,57 +209,9 @@ elif [ "$MODE" = "web" ]; then
     exit 0
 fi
 
-# Find cuDNN path
-CUDA_HOME=${CUDA_HOME:-${CUDA_PATH:-$(dirname "$(dirname "$(which nvcc)")")}}
-CUDNN_IFLAG=""
-CUDNN_LFLAG=""
-for dir in /usr/local/cuda/include /usr/include; do
-    if [ -f "$dir/cudnn.h" ]; then
-        CUDNN_IFLAG="-I$dir"
-        break
-    fi
-done
-for dir in /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu; do
-    if [ -f "$dir/libcudnn.so" ]; then
-        CUDNN_LFLAG="-L$dir"
-        break
-    fi
-done
-if [ -z "$CUDNN_IFLAG" ]; then
-    CUDNN_IFLAG=$("$PYTHON_BIN" -c "import nvidia.cudnn, os; print('-I' + os.path.join(nvidia.cudnn.__path__[0], 'include'))" 2>/dev/null || echo "")
-fi
-if [ -z "$CUDNN_LFLAG" ]; then
-    CUDNN_LFLAG=$("$PYTHON_BIN" -c "import nvidia.cudnn, os; print('-L' + os.path.join(nvidia.cudnn.__path__[0], 'lib'))" 2>/dev/null || echo "")
-fi
-
-# NCCL include/lib fallback (mirrors the cuDNN fallback above).
-# Needed when NCCL is provided by the nvidia-nccl-cu12 wheel in the active venv.
-NCCL_IFLAG=""
-NCCL_LFLAG=""
-for dir in /usr/include /usr/local/cuda/include; do
-    if [ -f "$dir/nccl.h" ]; then NCCL_IFLAG="-I$dir"; break; fi
-done
-for dir in /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64; do
-    if [ -f "$dir/libnccl.so" ] || [ -f "$dir/libnccl.so.2" ]; then NCCL_LFLAG="-L$dir"; break; fi
-done
-if [ -z "$NCCL_IFLAG" ]; then
-    NCCL_IFLAG=$("$PYTHON_BIN" -c "import nvidia.nccl, os; print('-I' + os.path.join(nvidia.nccl.__path__[0], 'include'))" 2>/dev/null || echo "")
-fi
-if [ -z "$NCCL_LFLAG" ]; then
-    NCCL_LFLAG=$("$PYTHON_BIN" -c "import nvidia.nccl, os; print('-L' + os.path.join(nvidia.nccl.__path__[0], 'lib'))" 2>/dev/null || echo "")
-fi
-
-WHEEL_RPATH_FLAGS=()
-for lib_flag in "$CUDNN_LFLAG" "$NCCL_LFLAG"; do
-    if [[ "$lib_flag" == -L* ]]; then
-        WHEEL_RPATH_FLAGS+=("-Wl,-rpath,${lib_flag#-L}")
-    fi
-done
-
 export CCACHE_DIR="${CCACHE_DIR:-$HOME/.ccache}"
 export CCACHE_BASEDIR="$(pwd)"
 export CCACHE_COMPILERCHECK=content
-NVCC="ccache $CUDA_HOME/bin/nvcc"
 CC="${CC:-$(command -v ccache >/dev/null && echo 'ccache clang' || echo 'clang')}"
 ARCH=${NVCC_ARCH:-native}
 
@@ -275,6 +231,41 @@ if [ ! -f "$BINDING_SRC" ]; then
     exit 1
 fi
 
+if [ -z "$MODE" ]; then
+    if [ -z "$BACKEND" ]; then
+        if "$PYTHON_BIN" -c "from torch.utils.cpp_extension import IS_HIP_EXTENSION; raise SystemExit(0 if IS_HIP_EXTENSION else 1)" 2>/dev/null && ! command -v nvcc >/dev/null 2>&1; then
+            BACKEND=rocm
+        else
+            BACKEND=cuda
+        fi
+    fi
+elif [ -n "$BACKEND" ]; then
+    echo "Error: --cuda/--rocm only apply to the training backend"
+    exit 1
+fi
+
+if [ "$BACKEND" = "rocm" ] && [ "$ENV" = "nmmo3" ]; then
+    echo "Error: NMMO3 native encoder is CUDA-only in build.sh --rocm"
+    exit 1
+fi
+
+CUDA_HOME=${CUDA_HOME:-${CUDA_PATH:-}}
+CUDA_IFLAG=""
+if [ "$BACKEND" != "rocm" ]; then
+    CUDA_HOME=${CUDA_HOME:-${CUDA_PATH:-$(dirname "$(dirname "$(which nvcc)")")}}
+fi
+if [ "$BACKEND" = "cuda" ] || [ "$MODE" = "profile" ]; then
+    if [ -z "$CUDA_HOME" ]; then
+        if command -v nvcc >/dev/null 2>&1; then
+            CUDA_HOME=$(dirname "$(dirname "$(command -v nvcc)")")
+        else
+            echo "Error: nvcc not found. Use --rocm for HIP/ROCm or --cpu for CPU fallback."
+            exit 1
+        fi
+    fi
+    CUDA_IFLAG="-I$CUDA_HOME/include"
+fi
+
 if [ "$MODE" = "cpu" ]; then
     echo "Compiling static library for $ENV..."
     ${CC:-clang} -c "${CLANG_OPT[@]}" $EXTRA_CFLAGS \
@@ -288,7 +279,59 @@ if [ "$MODE" = "cpu" ]; then
     ar rcs "$STATIC_LIB" "$STATIC_OBJ"
 fi
 
-if [ -z "$MODE" ]; then
+if [ -z "$MODE" ] && [ "$BACKEND" = "cuda" ]; then
+    # Find cuDNN path
+    CUDNN_IFLAG=""
+    CUDNN_LFLAG=""
+    for dir in /usr/local/cuda/include /usr/include; do
+        if [ -f "$dir/cudnn.h" ]; then
+            CUDNN_IFLAG="-I$dir"
+            break
+        fi
+    done
+    for dir in /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu; do
+        if [ -f "$dir/libcudnn.so" ]; then
+            CUDNN_LFLAG="-L$dir"
+            break
+        fi
+    done
+    if [ -z "$CUDNN_IFLAG" ]; then
+        CUDNN_IFLAG=$("$PYTHON_BIN" -c "import nvidia.cudnn, os; print('-I' + os.path.join(nvidia.cudnn.__path__[0], 'include'))" 2>/dev/null || echo "")
+    fi
+    if [ -z "$CUDNN_LFLAG" ]; then
+        CUDNN_LFLAG=$("$PYTHON_BIN" -c "import nvidia.cudnn, os; print('-L' + os.path.join(nvidia.cudnn.__path__[0], 'lib'))" 2>/dev/null || echo "")
+    fi
+
+    # NCCL include/lib fallback (mirrors the cuDNN fallback above).
+    # Needed when NCCL is provided by the nvidia-nccl-cu12 wheel in the active venv.
+    NCCL_IFLAG=""
+    NCCL_LFLAG=""
+    for dir in /usr/include /usr/local/cuda/include; do
+        if [ -f "$dir/nccl.h" ]; then NCCL_IFLAG="-I$dir"; break; fi
+    done
+    for dir in /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64; do
+        if [ -f "$dir/libnccl.so" ] || [ -f "$dir/libnccl.so.2" ]; then NCCL_LFLAG="-L$dir"; break; fi
+    done
+    if [ -z "$NCCL_IFLAG" ]; then
+        NCCL_IFLAG=$("$PYTHON_BIN" -c "import nvidia.nccl, os; print('-I' + os.path.join(nvidia.nccl.__path__[0], 'include'))" 2>/dev/null || echo "")
+    fi
+    if [ -z "$NCCL_LFLAG" ]; then
+        NCCL_LFLAG=$("$PYTHON_BIN" -c "import nvidia.nccl, os; print('-L' + os.path.join(nvidia.nccl.__path__[0], 'lib'))" 2>/dev/null || echo "")
+    fi
+
+    WHEEL_RPATH_FLAGS=()
+    for lib_flag in "$CUDNN_LFLAG" "$NCCL_LFLAG"; do
+        if [[ "$lib_flag" == -L* ]]; then
+            WHEEL_RPATH_FLAGS+=("-Wl,-rpath,${lib_flag#-L}")
+        fi
+    done
+
+    if command -v ccache >/dev/null 2>&1; then
+        NVCC="ccache $CUDA_HOME/bin/nvcc"
+    else
+        NVCC="$CUDA_HOME/bin/nvcc"
+    fi
+
     echo "Compiling CUDA ($ARCH) training backend with $ENV binding..."
     $NVCC -c -arch=$ARCH -Xcompiler -fPIC \
         -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=1 \
@@ -320,6 +363,152 @@ if [ -z "$MODE" ]; then
     "${LINK_CMD[@]}"
     echo "Built: $OUTPUT"
 
+elif [ -z "$MODE" ] && [ "$BACKEND" = "rocm" ]; then
+    mapfile -t ROCM_INFO < <("$PYTHON_BIN" - <<'PY'
+import os
+from torch.utils.cpp_extension import ROCM_HOME, library_paths, include_paths
+
+rocm_home = os.environ.get("ROCM_HOME") or ROCM_HOME
+if not rocm_home:
+    raise SystemExit("ROCM_HOME not found. Install/use a ROCm-enabled PyTorch environment.")
+print(rocm_home)
+print(os.environ.get("HIPCC") or os.path.join(rocm_home, "bin", "hipcc"))
+print(os.pathsep.join(include_paths("cuda")))
+print(os.pathsep.join(library_paths("cuda")))
+PY
+)
+    ROCM_HOME=${ROCM_INFO[0]}
+    HIPCC=${ROCM_INFO[1]}
+    ROCM_INCLUDE_PATHS=${ROCM_INFO[2]}
+    ROCM_LIBRARY_PATHS=${ROCM_INFO[3]}
+
+    if [ ! -x "$HIPCC" ]; then
+        if command -v hipcc >/dev/null 2>&1; then
+            HIPCC=$(command -v hipcc)
+        else
+            echo "Error: hipcc not found"
+            exit 1
+        fi
+    fi
+
+    if [ -z "$HIP_CLANG_PATH" ] || [ ! -x "$HIP_CLANG_PATH/clang++" ]; then
+        for dir in "$ROCM_HOME/lib/llvm/bin" /usr/lib/llvm/*/bin; do
+            if [ -x "$dir/clang++" ]; then
+                export HIP_CLANG_PATH="$dir"
+                break
+            fi
+        done
+    fi
+
+    HIPIFY_SRC="build/hip/src"
+    HIPIFY_SRC_ABS="$(pwd)/$HIPIFY_SRC"
+    SRC_ABS="$(pwd)/src"
+    echo "Hipifying CUDA sources into $HIPIFY_SRC..."
+    rm -rf "$HIPIFY_SRC"
+    "$PYTHON_BIN" - <<PY
+from torch.utils.hipify import hipify_python
+hipify_python.hipify(
+    project_directory="$SRC_ABS",
+    output_directory="$HIPIFY_SRC_ABS",
+    includes=["*"],
+    show_progress=False,
+    show_detailed=False,
+    is_pytorch_extension=True,
+)
+PY
+    cp "$HIPIFY_SRC/vecenv_hip.h" "$HIPIFY_SRC/vecenv.h"
+    "$PYTHON_BIN" - <<PY
+path = "$HIPIFY_SRC/pufferlib.hip"
+with open(path) as f:
+    src = f.read()
+src = src.replace('#include "vecenv_hip.h"', '#include "vecenv.h"')
+with open(path, 'w') as f:
+    f.write(src)
+PY
+
+    ROCM_IFLAGS=()
+    IFS=':' read -ra ROCM_INC_ARR <<< "$ROCM_INCLUDE_PATHS"
+    for dir in "${ROCM_INC_ARR[@]}"; do
+        [ -n "$dir" ] && ROCM_IFLAGS+=("-I$dir")
+    done
+    ROCM_LFLAGS=()
+    ROCM_RPATH_FLAGS=()
+    if [ -d /usr/lib64 ]; then
+        ROCM_LFLAGS+=("-L/usr/lib64")
+        ROCM_RPATH_FLAGS+=("-Wl,-rpath,/usr/lib64")
+    fi
+    IFS=':' read -ra ROCM_LIB_ARR <<< "$ROCM_LIBRARY_PATHS"
+    for dir in "${ROCM_LIB_ARR[@]}"; do
+        [ -n "$dir" ] || continue
+        [ "$dir" = "/usr/lib" ] && [ -d /usr/lib64 ] && continue
+        ROCM_LFLAGS+=("-L$dir")
+        ROCM_RPATH_FLAGS+=("-Wl,-rpath,$dir")
+    done
+    ROCM_OMP_LIB=""
+    for dir in /usr/lib64 /usr/lib /usr/local/lib; do
+        if [ -f "$dir/libomp.so" ]; then
+            ROCM_LFLAGS+=("-L$dir")
+            ROCM_RPATH_FLAGS+=("-Wl,-rpath,$dir")
+            ROCM_OMP_LIB="-lomp"
+            break
+        elif [ -f "$dir/libomp5.so" ]; then
+            ROCM_LFLAGS+=("-L$dir")
+            ROCM_RPATH_FLAGS+=("-Wl,-rpath,$dir")
+            ROCM_OMP_LIB="-lomp5"
+            break
+        fi
+    done
+
+    ROCM_ARCH_FLAGS=()
+    if [ -n "$PYTORCH_ROCM_ARCH" ]; then
+        IFS=';' read -ra ROCM_ARCH_ARR <<< "$PYTORCH_ROCM_ARCH"
+        for arch in "${ROCM_ARCH_ARR[@]}"; do
+            [ -n "$arch" ] && ROCM_ARCH_FLAGS+=("--offload-arch=$arch")
+        done
+    fi
+
+    HIPCC_OPT=()
+    if [ -n "$DEBUG" ]; then
+        HIPCC_OPT=(-O0 -g)
+    else
+        HIPCC_OPT=(-O2)
+    fi
+
+    echo "Compiling ROCm/HIP training backend with $ENV binding..."
+    "$HIPCC" "${ROCM_ARCH_FLAGS[@]}" -c -fPIC \
+        -D_GLIBCXX_USE_CXX11_ABI=1 \
+        -DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION \
+        -DPLATFORM_DESKTOP \
+        -DUSE_ROCM \
+        -std=c++17 \
+        -I. -I"$HIPIFY_SRC" -I$SRC_DIR -Ivendor -I$RAYLIB_NAME/include \
+        -I$PYTHON_INCLUDE -I$PYBIND_INCLUDE -I$NUMPY_INCLUDE \
+        "${ROCM_IFLAGS[@]}" \
+        -fopenmp \
+        -Wno-c++11-narrowing \
+        -DENV_BINDING_SRC=\"$BINDING_SRC\" \
+        -DENV_NAME=$ENV \
+        $PRECISION "${HIPCC_OPT[@]}" \
+        "$HIPIFY_SRC/bindings.hip" -o build/bindings.o
+
+    "$HIPCC" -c -fPIC -std=c++17 \
+        "${ROCM_IFLAGS[@]}" \
+        src/rocm_cuda_shim.cpp -o build/rocm_cuda_shim.o
+
+    LINK_CMD=(
+        ${CXX:-g++} -shared -fPIC -fopenmp
+        build/bindings.o build/rocm_cuda_shim.o "$RAYLIB_A"
+        "${ROCM_LFLAGS[@]}"
+        "${ROCM_RPATH_FLAGS[@]}"
+        -lamdhip64 -lhipblas -lhiprand -lrccl -lrocm_smi64
+        $ROCM_OMP_LIB
+        $LINK_OPT
+        "${SHARED_LDFLAGS[@]}"
+        -o "$OUTPUT"
+    )
+    "${LINK_CMD[@]}"
+    echo "Built: $OUTPUT"
+
 elif [ "$MODE" = "cpu" ]; then
     echo "Compiling CPU training backend..."
     ${CXX:-g++} -c -fPIC \
diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py
index 03b3b828c2..4b1d56b3a9 100644
--- a/pufferlib/pufferl.py
+++ b/pufferlib/pufferl.py
@@ -187,10 +187,11 @@ def _train_worker(args):
 
     backend.close(pufferl)
 
-def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
+def _train(env_name, args, sweep_early_stop=None, result_queue=None, verbose=False):
     '''Single-GPU training worker. Process target for both DDP ranks and sweep trials.'''
     backend = _resolve_backend(args)
     rank = args['rank']
+    is_sweep = sweep_early_stop is not None
     run_id = str(int(1000*time.time()))
     if args['wandb']:
         import wandb
@@ -207,7 +208,7 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
 
     # When sweeping, optionally score each trial by winrate vs a fixed enemy
     # checkpoint (match mode) instead of the training-time self-play metric.
-    match_mode = (sweep_obj is not None
+    match_mode = (is_sweep
         and bool(args.get('sweep', {}).get('match_enemy_model_path')))
 
     checkpoint_dir = os.path.join(args['checkpoint_dir'], args['env_name'], run_id)
@@ -254,10 +255,10 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
 
         # In match-sweep mode we need the final checkpoint to feed into match().
         is_final = epoch == train_epochs - 1
-        should_save = (sweep_obj is None
-            and (epoch % args['checkpoint_interval'] == 0 or is_final)
-        ) or (match_mode and is_final)
-        if should_save:
+        regular_checkpoint = not is_sweep and (
+            epoch % args['checkpoint_interval'] == 0 or is_final)
+        match_checkpoint = match_mode and is_final
+        if regular_checkpoint or match_checkpoint:
             model_path = os.path.join(checkpoint_dir, f'{pufferl.global_step:016d}.bin')
             backend.save_weights(pufferl, model_path)
 
@@ -266,6 +267,19 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
             continue
 
         logs = backend.eval_log(pufferl) if epoch >= train_epochs else backend.log(pufferl)
+
+        should_early_stop = False
+        logs.setdefault('early_stop_threshold', -0.5)
+        logs.setdefault('is_loss_nan', False)
+
+        # NaN loss is a failed training run for both sweeps and regular train.
+        if any(np.isnan(v) for v in logs.get('loss', {}).values()):
+            logs['is_loss_nan'] = True
+            should_early_stop = True
+        elif (is_sweep and epoch < train_epochs and pufferl.global_step > min(0.20*total_timesteps, 100_000_000)):
+            # side effect: writes `early_stop_threshold` to logs
+            should_early_stop = sweep_early_stop.early_stop(logs, target_key)
+
         flat_logs = {**flat_logs, **dict(unroll_nested_dict(logs))}
 
         if epoch < train_epochs:
@@ -283,9 +297,7 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
         if epoch < train_epochs:
             all_logs.append(flat_logs)
 
-            if (sweep_obj is not None
-                    and pufferl.global_step > min(0.20*total_timesteps, 100_000_000) and
-                    sweep_obj.early_stop(logs, target_key)):
+            if should_early_stop:
                 break
         elif flat_logs['env/n'] > args['eval_episodes']:
             break
@@ -327,13 +339,14 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
     all_logs.append(flat_logs)
 
     # Downsample results
+    metrics = {k: [[]] for k in all_logs[0] if k != 'is_loss_nan'}
     n = args['sweep']['downsample']
-    metrics = {k: [[]] for k in all_logs[0]}
     logged_timesteps = all_logs[-1]['agent_steps']
     next_bin = logged_timesteps / (n - 1) if n > 1 else np.inf
     for log in all_logs:
         for k, v in log.items():
-            metrics[k][-1].append(v)
+            if k != 'is_loss_nan':
+                metrics[k][-1].append(v)
 
         if log['agent_steps'] < next_bin:
             continue
@@ -346,6 +359,8 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
     for k in metrics:
         metrics[k][-1] = all_logs[-1][k]
 
+    is_loss_nan = any(log.get('is_loss_nan', False) for log in all_logs)
+
     # Match-mode: single observation at final-training cost. Protein's curve
     # fit collapses to one point — we only trust the match winrate, not any
     # training-time proxy. Replicate the scalar across all downsample bins so
@@ -358,10 +373,10 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
     log_dir = os.path.join(args['log_dir'], args['env_name'])
     os.makedirs(log_dir, exist_ok=True)
     with open(os.path.join(log_dir, run_id + '.json'), 'w') as f:
-        json.dump({**args, 'metrics': metrics}, f)
+        json.dump({**args, 'is_loss_nan': is_loss_nan, 'metrics': metrics}, f)
 
     if args['wandb']:
-        if sweep_obj is None and model_path: # Don't spam uploads during sweeps
+        if not is_sweep and model_path: # Don't spam uploads during sweeps
             artifact = wandb.Artifact(run_id, type='model')
             artifact.add_file(model_path)
             wandb.run.log_artifact(artifact)
@@ -379,6 +394,9 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False):
 def train(env_name, args=None, gpus=None, **kwargs):
     args = args or load_config(env_name)
     validate_config(args)
+    sweep_obj = kwargs.pop('sweep_obj', None)
+    if sweep_obj is not None and 'sweep_early_stop' not in kwargs:
+        kwargs['sweep_early_stop'] = sweep_obj.early_stop_state()
 
     subprocess = gpus is not None
     gpus = list(gpus or range(args['train']['gpus']))
@@ -395,11 +413,8 @@ def train(env_name, args=None, gpus=None, **kwargs):
         worker_args['rank'] = rank
         worker_args['gpu_id'] = gpu_id
         if rank == 0 and not subprocess:
-            _train(env_name, worker_args, verbose=True)
+            _train(env_name, worker_args, verbose=True, **kwargs)
         else:
-            # Protein's GP models live on cuda:0 on non-WSL setups; spawn-pickling
-            # them works fine via CUDA IPC. On WSL, sweep.py forces device='cpu'
-            # at construction so there's nothing to move.
             ctx.Process(target=_train, args=(env_name, worker_args),
                 kwargs=kwargs).start()
 
@@ -407,7 +422,7 @@ def sweep(env_name, args=None, pareto=False):
     '''Train entry point. Handles single-GPU, multi-GPU DDP, and sweeps.'''
     args = args or load_config(env_name)
     exp_gpus = args['train']['gpus']
-    sweep_gpus = args['sweep']['gpus'] or len(os.listdir('/proc/driver/nvidia/gpus'))
+    sweep_gpus = args['sweep']['gpus'] or torch.cuda.device_count()
     args['vec']['num_threads'] //= (sweep_gpus // exp_gpus)
     args['no_model_upload'] = True
 
@@ -463,7 +478,7 @@ def sweep(env_name, args=None, pareto=False):
         exp_args = deepcopy(args)
         active[gpu_id] = exp_args
         train(env_name, exp_args, range(gpu_id, gpu_id + exp_gpus),
-            sweep_obj=sweep_obj, result_queue=result_queue)
+            sweep_early_stop=sweep_obj.early_stop_state(), result_queue=result_queue)
 
 def eval(env_name, args=None, load_path=None):
     '''Evaluate a trained policy. Supports both native and --slowly torch backends.'''
diff --git a/pufferlib/sweep.py b/pufferlib/sweep.py
index fc753e5395..ab1ed34fe1 100644
--- a/pufferlib/sweep.py
+++ b/pufferlib/sweep.py
@@ -22,6 +22,48 @@
 from sklearn.linear_model import LogisticRegression
 
 EPSILON = 1e-6
+LOGIT_MIN = -5.0
+LOGIT_MAX = 100.0
+EARLY_STOP_THRESHOLD_FLOOR = LOGIT_MIN
+
+def _has_nan_loss(logs):
+    return any(math.isnan(v) for v in logs.get('loss', {}).values())
+
+def logit_transform(value, epsilon=1e-9):
+    value = np.clip(value, epsilon, 1 - epsilon)
+    return np.clip(np.log(value / (1 - value)), LOGIT_MIN, LOGIT_MAX)
+
+class NanLossEarlyStop:
+    def early_stop(self, logs, target_key):
+        if _has_nan_loss(logs):
+            logs['is_loss_nan'] = True
+            return True
+        return False
+
+class ProteinEarlyStop:
+    def __init__(self, metric_distribution, threshold_model):
+        self.metric_distribution = metric_distribution
+        self.threshold_model = threshold_model
+        self.running_targets = deque(maxlen=30)
+
+    def early_stop(self, logs, target_key):
+        if _has_nan_loss(logs):
+            logs['is_loss_nan'] = True
+            return True
+
+        target_name = target_key.split('/', 1)[1]
+        if 'uptime' not in logs or target_name not in logs.get('env', {}):
+            return False
+
+        metric_val, cost = logs['env'][target_name], logs['uptime']
+        self.running_targets.append(metric_val)
+        score = max(np.mean(self.running_targets), metric_val)
+        if self.metric_distribution == 'percentile':
+            score = logit_transform(score)
+
+        threshold = self.threshold_model.get_threshold(cost)
+        logs['early_stop_threshold'] = max(threshold, EARLY_STOP_THRESHOLD_FLOOR)
+        return score < threshold
 
 def unroll_nested_dict(d):
     if not isinstance(d, dict):
@@ -335,11 +377,8 @@ def observe(self, hypers, score, cost, is_failure=False):
             is_failure=is_failure,
         ))
 
-    def early_stop(self, logs, target_key):
-        if any("loss/" in k and np.isnan(v) for k, v in logs.items()):
-            logs['is_loss_nan'] = True
-            return True
-        return False
+    def early_stop_state(self):
+        return NanLossEarlyStop()
 
 
 class ParetoGenetic:
@@ -392,11 +431,8 @@ def observe(self, hypers, score, cost, is_failure=False):
             is_failure=is_failure,
         ))
 
-    def early_stop(self, logs, target_key):
-        if any("loss/" in k and np.isnan(v) for k, v in logs.items()):
-            logs['is_loss_nan'] = True
-            return True
-        return False
+    def early_stop_state(self):
+        return NanLossEarlyStop()
 
 
 class ExactGPModel(ExactGP):
@@ -902,16 +938,11 @@ def suggest(self, fill, fixed_total_timesteps=None):
         best = suggestions[best_idx]
         return self.hyperparameters.to_dict(best, fill), info
 
-    def logit_transform(self, value, epsilon=1e-9):
-        value = np.clip(value, epsilon, 1 - epsilon)
-        logit = math.log(value / (1 - value))
-        return np.clip(logit, -5, 100)
-
     def observe(self, hypers, score, cost, is_failure=False):
         params = self.hyperparameters.from_dict(hypers)
 
         if self.metric_distribution == 'percentile':
-            score = self.logit_transform(score)
+            score = logit_transform(score)
 
         new_observation = dict(
             input=params,
@@ -948,32 +979,5 @@ def observe(self, hypers, score, cost, is_failure=False):
             self.top_observations.append(new_observation)
             self.top_observations.sort(key=lambda x: x['output'], reverse=True)
 
-    def get_early_stop_threshold(self, cost):
-        return self.stop_threshold_model.get_threshold(cost)
-
-    def should_stop(self, score, cost):
-        threshold = self.get_early_stop_threshold(cost)
-
-        if self.metric_distribution == 'percentile':
-            score = self.logit_transform(score)
-
-        return score < threshold
-
-    def early_stop(self, logs, target_key):
-        for k, v in logs['loss'].items():
-            if np.isnan(v):
-                logs['is_loss_nan'] = True
-                return True
-
-        if 'uptime' not in logs or target_key not in logs:
-            return False
-
-        metric_val, cost = logs['env'][target_key], logs['uptime']
-        self._running_target_buffer.append(metric_val)
-        target_running_mean = np.mean(self._running_target_buffer)
-        threshold = self.get_early_stop_threshold(cost)
-        logs['early_stop_threshold'] = max(threshold, -5)
-        if self.should_stop(max(target_running_mean, metric_val), cost):
-            logs['is_loss_nan'] = False
-            return True
-        return False
+    def early_stop_state(self):
+        return ProteinEarlyStop(self.metric_distribution, self.stop_threshold_model)
diff --git a/src/bindings.cu b/src/bindings.cu
index e118e72168..7f6c82054f 100644
--- a/src/bindings.cu
+++ b/src/bindings.cu
@@ -75,18 +75,11 @@ pybind11::dict puf_log(pybind11::object pufferl_obj) {
 
     // Utilization
     pybind11::dict util_dict;
-    nvmlUtilization_t util;
-    nvmlDeviceGetUtilizationRates(pufferl.nvml_device, &util);
-    util_dict["gpu_percent"] = (float)util.gpu;
-
-    nvmlMemory_t mem;
-    nvmlDeviceGetMemoryInfo(pufferl.nvml_device, &mem);
-    util_dict["gpu_mem"] = 100.0f * (float)mem.used / (float)mem.total;
-
-    size_t cuda_free, cuda_total;
-    cudaMemGetInfo(&cuda_free, &cuda_total);
-    util_dict["vram_used_gb"] = (float)(cuda_total - cuda_free) / (1024.0f * 1024.0f * 1024.0f);
-    util_dict["vram_total_gb"] = (float)cuda_total / (1024.0f * 1024.0f * 1024.0f);
+    GpuUtil util = gpu_get_utilization(pufferl.gpu_device);
+    util_dict["gpu_percent"] = util.gpu_percent;
+    util_dict["gpu_mem"] = util.gpu_mem;
+    util_dict["vram_used_gb"] = util.vram_used_gb;
+    util_dict["vram_total_gb"] = util.vram_total_gb;
 
     long rss_kb = 0;
     FILE* f = fopen("/proc/self/status", "r");
@@ -491,25 +484,15 @@ PYBIND11_MODULE(_C, m) {
     });
     // Standalone utilization monitor (no PuffeRL instance needed)
     m.def("get_utilization", [](int gpu_id) {
-        static bool nvml_inited = false;
-        if (!nvml_inited) { nvmlInit(); nvml_inited = true; }
-
         py::dict util_dict;
-        nvmlDevice_t device;
-        nvmlDeviceGetHandleByIndex(gpu_id, &device);
-
-        nvmlUtilization_t util;
-        nvmlDeviceGetUtilizationRates(device, &util);
-        util_dict["gpu_percent"] = (float)util.gpu;
-
-        nvmlMemory_t mem;
-        nvmlDeviceGetMemoryInfo(device, &mem);
-        util_dict["gpu_mem"] = 100.0f * (float)mem.used / (float)mem.total;
-
-        size_t cuda_free, cuda_total;
-        cudaMemGetInfo(&cuda_free, &cuda_total);
-        util_dict["vram_used_gb"] = (float)(cuda_total - cuda_free) / (1024.0f * 1024.0f * 1024.0f);
-        util_dict["vram_total_gb"] = (float)cuda_total / (1024.0f * 1024.0f * 1024.0f);
+        PufferGpuDevice device;
+        gpu_monitor_init(gpu_id, &device);
+        GpuUtil util = gpu_get_utilization(device);
+        gpu_monitor_shutdown();
+        util_dict["gpu_percent"] = util.gpu_percent;
+        util_dict["gpu_mem"] = util.gpu_mem;
+        util_dict["vram_used_gb"] = util.vram_used_gb;
+        util_dict["vram_total_gb"] = util.vram_total_gb;
 
         long rss_kb = 0;
         FILE* f = fopen("/proc/self/status", "r");
diff --git a/src/curriculum.cu b/src/curriculum.cu
index 4addabd8a2..089a210126 100644
--- a/src/curriculum.cu
+++ b/src/curriculum.cu
@@ -125,10 +125,22 @@ void close_state_buffer(StateBuffer* buf) {
 #ifdef PUFFER_CURRICULUM_IMPL
 
 #define PRIO_WARP_SIZE 32
-#define PRIO_FULL_MASK 0xffffffff
+#ifdef USE_ROCM
+using PufWarpMask = unsigned long long;
+#else
+using PufWarpMask = unsigned int;
+#endif
 #define PRIO_BLOCK_SIZE 256
 #define PRIO_NUM_WARPS (PRIO_BLOCK_SIZE / PRIO_WARP_SIZE)
 
+__device__ __forceinline__ float prio_warp_sum(float val, int width = PRIO_WARP_SIZE) {
+    PufWarpMask mask = (PufWarpMask)__activemask();
+    for (int s = width / 2; s >= 1; s /= 2) {
+        val += __shfl_down_sync(mask, val, s, width);
+    }
+    return val;
+}
+
 __device__ __forceinline__ float priority_power(float value, float alpha) {
     if (alpha == 0.0f) {
         return 1.0f;
@@ -163,9 +175,7 @@ __global__ void compute_prio_abs(
         local_sum += fabsf(to_float(advantages[offset + t]));
     }
 
-    for (int s = PRIO_WARP_SIZE / 2; s >= 1; s /= 2) {
-        local_sum += __shfl_down_sync(PRIO_FULL_MASK, local_sum, s);
-    }
+    local_sum = prio_warp_sum(local_sum);
     if (tx == 0) {
         prio_weights[row] = priority_power(local_sum, prio_alpha) + eps;
     }
diff --git a/src/ocean.cu b/src/ocean.cu
index baaa9b7be6..314c8311be 100644
--- a/src/ocean.cu
+++ b/src/ocean.cu
@@ -1,6 +1,10 @@
 // NMMO3 CUDA encoder: multihot, cuDNN conv, embedding, concat, projection
 // Included by pufferlib.cu — requires precision_t, PrecisionTensor, Allocator, puf_mm, etc.
 
+#ifdef USE_ROCM
+static void create_custom_encoder(const std::string&, Encoder*) {}
+#else
+
 #include "cudnn_conv2d.cu"
 
 // ---- NMMO3 constants ----
@@ -588,3 +592,5 @@ static void create_custom_encoder(const std::string& env_name, Encoder* enc) {
         };
     }
 }
+
+#endif
diff --git a/src/pufferlib.cu b/src/pufferlib.cu
index df6faa57f4..812a400a5a 100644
--- a/src/pufferlib.cu
+++ b/src/pufferlib.cu
@@ -1,7 +1,11 @@
 #include <cuda_runtime.h>
+#ifndef USE_ROCM
 #include <cuda_profiler_api.h>
 #include <nvtx3/nvToolsExt.h>
 #include <nvml.h>
+#else
+#include <rocm_smi/rocm_smi.h>
+#endif
 #include <nccl.h>
 #include <vector>
 
@@ -46,6 +50,110 @@ typedef struct {
     float accum[NUM_PROF];
 } ProfileT;
 
+struct GpuUtil {
+    float gpu_percent;
+    float gpu_mem;
+    float vram_used_gb;
+    float vram_total_gb;
+};
+
+#ifndef USE_ROCM
+using PufferGpuDevice = nvmlDevice_t;
+
+inline void gpu_monitor_init(int gpu_id, PufferGpuDevice* device) {
+    nvmlInit();
+    nvmlDeviceGetHandleByIndex(gpu_id, device);
+}
+
+inline void gpu_monitor_shutdown() {
+    nvmlShutdown();
+}
+
+inline GpuUtil gpu_get_utilization(PufferGpuDevice device) {
+    GpuUtil out = {};
+    nvmlUtilization_t util;
+    if (nvmlDeviceGetUtilizationRates(device, &util) == NVML_SUCCESS) {
+        out.gpu_percent = (float)util.gpu;
+    }
+
+    nvmlMemory_t mem;
+    if (nvmlDeviceGetMemoryInfo(device, &mem) == NVML_SUCCESS && mem.total > 0) {
+        out.gpu_mem = 100.0f * (float)mem.used / (float)mem.total;
+    }
+
+    size_t free_bytes = 0, total_bytes = 0;
+    cudaMemGetInfo(&free_bytes, &total_bytes);
+    if (total_bytes > 0) {
+        out.vram_used_gb = (float)(total_bytes - free_bytes) / (1024.0f * 1024.0f * 1024.0f);
+        out.vram_total_gb = (float)total_bytes / (1024.0f * 1024.0f * 1024.0f);
+    }
+    return out;
+}
+
+inline void profile_begin(const char* tag, bool enable) {
+    if (enable) nvtxRangePushA(tag);
+}
+
+inline void profile_end(bool enable) {
+    if (enable) nvtxRangePop();
+}
+
+inline void gpu_profiler_start(bool enable) {
+    if (enable) cudaProfilerStart();
+}
+
+inline void gpu_profiler_stop(bool enable) {
+    if (enable) cudaProfilerStop();
+}
+#else
+using PufferGpuDevice = uint32_t;
+
+inline void gpu_monitor_init(int gpu_id, PufferGpuDevice* device) {
+    *device = (uint32_t)gpu_id;
+    rsmi_init(RSMI_INIT_FLAG_ALL_GPUS);
+}
+
+inline void gpu_monitor_shutdown() {
+    rsmi_shut_down();
+}
+
+inline GpuUtil gpu_get_utilization(PufferGpuDevice device) {
+    GpuUtil out = {};
+    uint32_t busy = 0;
+    if (rsmi_dev_busy_percent_get(device, &busy) == RSMI_STATUS_SUCCESS) {
+        out.gpu_percent = (float)busy;
+    }
+
+    uint64_t used = 0, total = 0;
+    if (rsmi_dev_memory_usage_get(device, RSMI_MEM_TYPE_VRAM, &used) == RSMI_STATUS_SUCCESS &&
+            rsmi_dev_memory_total_get(device, RSMI_MEM_TYPE_VRAM, &total) == RSMI_STATUS_SUCCESS &&
+            total > 0) {
+        out.gpu_mem = 100.0f * (float)used / (float)total;
+    }
+
+    size_t free_bytes = 0, total_bytes = 0;
+    cudaMemGetInfo(&free_bytes, &total_bytes);
+    if (total_bytes > 0) {
+        out.vram_used_gb = (float)(total_bytes - free_bytes) / (1024.0f * 1024.0f * 1024.0f);
+        out.vram_total_gb = (float)total_bytes / (1024.0f * 1024.0f * 1024.0f);
+    }
+    return out;
+}
+
+inline void profile_begin(const char*, bool) {}
+inline void profile_end(bool) {}
+inline void gpu_profiler_start(bool) {}
+inline void gpu_profiler_stop(bool) {}
+#endif
+
+inline cudaError_t puf_graph_instantiate(cudaGraphExec_t* exec, cudaGraph_t graph) {
+#ifdef USE_ROCM
+    return cudaGraphInstantiate(exec, graph, nullptr, nullptr, 0);
+#else
+    return cudaGraphInstantiate(exec, graph, 0);
+#endif
+}
+
 // Data collected by parallel environment workers. Each worker handles
 // a constant subset of agents 
 struct RolloutBuf {
@@ -355,7 +463,7 @@ typedef struct {
     PrecisionTensor grad_puf;
     LongTensor rng_offset_puf;   // (num_buffers+1,) int64 CUDA device counters
     ProfileT profile;
-    nvmlDevice_t nvml_device;
+    PufferGpuDevice gpu_device;
     long epoch;
     long global_step;
     double start_time;
@@ -390,14 +498,6 @@ Dict* log_environments_impl(PuffeRL& pufferl) {
     return out;
 }
 
-inline void profile_begin(const char* tag, bool enable) {
-    if (enable) nvtxRangePushA(tag);
-}
-
-inline void profile_end(bool enable) {
-    if (enable) nvtxRangePop();
-}
-
 // Thread-local stream for per-buffer threads (set once by thread_init_wrapper)
 static thread_local cudaStream_t tl_stream = 0;
 
@@ -692,7 +792,7 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) {
         cudaGraph_t _graph;
         assert(cudaStreamEndCapture(current_stream, &_graph) == cudaSuccess
                 && "cudaStreamEndCapture failed");
-        assert(cudaGraphInstantiate(&pufferl->fused_rollout_cudagraphs[graph], _graph, 0) == cudaSuccess
+        assert(puf_graph_instantiate(&pufferl->fused_rollout_cudagraphs[graph], _graph) == cudaSuccess
                 && "cudaGraphInstantiate failed");
         assert(cudaGraphDestroy(_graph) == cudaSuccess && "cudaGraphDestroy failed");
         cudaDeviceSynchronize();
@@ -1497,7 +1597,7 @@ void train_impl(PuffeRL& pufferl) {
                 cudaGraph_t _graph;
                 assert(cudaStreamEndCapture(train_stream, &_graph) == cudaSuccess
                         && "cudaStreamEndCapture failed");
-                assert(cudaGraphInstantiate(&pufferl.train_cudagraph, _graph, 0) == cudaSuccess
+                assert(puf_graph_instantiate(&pufferl.train_cudagraph, _graph) == cudaSuccess
                         && "cudaGraphInstantiate failed");
                 assert(cudaGraphDestroy(_graph) == cudaSuccess && "cudaGraphDestroy failed");
                 cudaDeviceSynchronize();
@@ -1860,8 +1960,7 @@ std::unique_ptr<PuffeRL> create_pufferl_impl(HypersT& hypers,
         cudaEventCreate(&pufferl->profile.events[i]);
     }
     memset(pufferl->profile.accum, 0, sizeof(pufferl->profile.accum));
-    nvmlInit();
-    nvmlDeviceGetHandleByIndex(hypers.gpu_id, &pufferl->nvml_device);
+    gpu_monitor_init(hypers.gpu_id, &pufferl->gpu_device);
 
     // Create policy
     int input_size = pufferl->env.obs.shape[1];
@@ -2110,10 +2209,8 @@ std::unique_ptr<PuffeRL> create_pufferl_impl(HypersT& hypers,
         net_callback_wrapper, thread_init_wrapper);
     static_vec_reset(vec);
 
-    if (hypers.profile) {
-        cudaDeviceSynchronize();
-        cudaProfilerStart();
-    }
+    if (hypers.profile) cudaDeviceSynchronize();
+    gpu_profiler_start(hypers.profile);
 
     double now = wall_clock();
     pufferl->start_time = now;
@@ -2125,9 +2222,7 @@ std::unique_ptr<PuffeRL> create_pufferl_impl(HypersT& hypers,
 
 void close_impl(PuffeRL& pufferl) {
     cudaDeviceSynchronize();
-    if (pufferl.hypers.profile) {
-        cudaProfilerStop();
-    }
+    gpu_profiler_stop(pufferl.hypers.profile);
 
     cudaGraphExecDestroy(pufferl.train_cudagraph);
     for (int i = 0; i < pufferl.hypers.horizon * pufferl.hypers.num_buffers; i++) {
@@ -2162,7 +2257,7 @@ void close_impl(PuffeRL& pufferl) {
     for (int i = 0; i < NUM_TRAIN_EVENTS; i++) {
         cudaEventDestroy(pufferl.profile.events[i]);
     }
-    nvmlShutdown();
+    gpu_monitor_shutdown();
 
     static_vec_close(pufferl.vec);
 
diff --git a/src/rocm_cuda_shim.cpp b/src/rocm_cuda_shim.cpp
new file mode 100644
index 0000000000..0feed14b2e
--- /dev/null
+++ b/src/rocm_cuda_shim.cpp
@@ -0,0 +1,57 @@
+#include <hip/hip_runtime.h>
+
+extern "C" {
+
+hipError_t cudaHostAlloc(void** ptr, size_t size, unsigned int flags) {
+    return hipHostMalloc(ptr, size, flags);
+}
+
+hipError_t cudaMalloc(void** ptr, size_t size) {
+    return hipMalloc(ptr, size);
+}
+
+hipError_t cudaMemcpy(void* dst, const void* src, size_t size, int kind) {
+    return hipMemcpy(dst, src, size, (hipMemcpyKind)kind);
+}
+
+hipError_t cudaMemcpyAsync(void* dst, const void* src, size_t size, int kind, hipStream_t stream) {
+    return hipMemcpyAsync(dst, src, size, (hipMemcpyKind)kind, stream);
+}
+
+hipError_t cudaMemset(void* dst, int value, size_t size) {
+    return hipMemset(dst, value, size);
+}
+
+hipError_t cudaFree(void* ptr) {
+    return hipFree(ptr);
+}
+
+hipError_t cudaFreeHost(void* ptr) {
+    return hipHostFree(ptr);
+}
+
+hipError_t cudaSetDevice(int device) {
+    return hipSetDevice(device);
+}
+
+hipError_t cudaDeviceSynchronize(void) {
+    return hipDeviceSynchronize();
+}
+
+hipError_t cudaStreamSynchronize(hipStream_t stream) {
+    return hipStreamSynchronize(stream);
+}
+
+hipError_t cudaStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) {
+    return hipStreamCreateWithFlags(stream, flags);
+}
+
+hipError_t cudaStreamQuery(hipStream_t stream) {
+    return hipStreamQuery(stream);
+}
+
+const char* cudaGetErrorString(hipError_t error) {
+    return hipGetErrorString(error);
+}
+
+}
diff --git a/src/vecenv.h b/src/vecenv.h
index ba5c635074..882c73b62a 100644
--- a/src/vecenv.h
+++ b/src/vecenv.h
@@ -10,7 +10,11 @@
 #include <assert.h>
 
 #include "tensor.h"
+#ifdef USE_ROCM
+#include <hip/hip_runtime_api.h>
+#else
 #include <cuda_runtime_api.h>
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -252,7 +256,7 @@ static void* static_omp_threadmanager(void* arg) {
 
     Env* envs = vec->envs;
 
-    printf("Num workers: %d\n", num_workers);
+    // printf("Num workers: %d\n", num_workers);
     while (true) {
         while (atomic_load(&buffer_states[buf]) != OMP_RUNNING) {
             if (atomic_load(&threading->shutdown)) {

From 1c3bc3d376be05a92c9482f22d06971054abcaad Mon Sep 17 00:00:00 2001
From: Feodor Kichatov <sozforex@gmail.com>
Date: Tue, 2 Jun 2026 19:53:57 +0200
Subject: [PATCH 2/2] rocm_smi is deprecated, migrate to amdsmi

---
 build.sh         |  2 +-
 src/pufferlib.cu | 66 ++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/build.sh b/build.sh
index 3fa248603a..8ca8a6e701 100755
--- a/build.sh
+++ b/build.sh
@@ -500,7 +500,7 @@ PY
         build/bindings.o build/rocm_cuda_shim.o "$RAYLIB_A"
         "${ROCM_LFLAGS[@]}"
         "${ROCM_RPATH_FLAGS[@]}"
-        -lamdhip64 -lhipblas -lhiprand -lrccl -lrocm_smi64
+        -lamdhip64 -lhipblas -lhiprand -lrccl -lamd_smi
         $ROCM_OMP_LIB
         $LINK_OPT
         "${SHARED_LDFLAGS[@]}"
diff --git a/src/pufferlib.cu b/src/pufferlib.cu
index 812a400a5a..23f50aabcb 100644
--- a/src/pufferlib.cu
+++ b/src/pufferlib.cu
@@ -4,7 +4,7 @@
 #include <nvtx3/nvToolsExt.h>
 #include <nvml.h>
 #else
-#include <rocm_smi/rocm_smi.h>
+#include <amd_smi/amdsmi.h>
 #endif
 #include <nccl.h>
 #include <vector>
@@ -106,29 +106,73 @@ inline void gpu_profiler_stop(bool enable) {
     if (enable) cudaProfilerStop();
 }
 #else
-using PufferGpuDevice = uint32_t;
+using PufferGpuDevice = amdsmi_processor_handle;
 
 inline void gpu_monitor_init(int gpu_id, PufferGpuDevice* device) {
-    *device = (uint32_t)gpu_id;
-    rsmi_init(RSMI_INIT_FLAG_ALL_GPUS);
+    *device = nullptr;
+    if (amdsmi_init(AMDSMI_INIT_AMD_GPUS) != AMDSMI_STATUS_SUCCESS) {
+        return;
+    }
+
+    uint32_t socket_count = 0;
+    if (amdsmi_get_socket_handles(&socket_count, nullptr) != AMDSMI_STATUS_SUCCESS) {
+        return;
+    }
+
+    std::vector<amdsmi_socket_handle> sockets(socket_count);
+    if (socket_count > 0 &&
+            amdsmi_get_socket_handles(&socket_count, sockets.data()) != AMDSMI_STATUS_SUCCESS) {
+        return;
+    }
+
+    uint32_t gpu_count = 0;
+    for (uint32_t i = 0; i < socket_count; i++) {
+        uint32_t processor_count = 0;
+        if (amdsmi_get_processor_handles(sockets[i], &processor_count, nullptr) != AMDSMI_STATUS_SUCCESS) {
+            continue;
+        }
+
+        std::vector<amdsmi_processor_handle> processors(processor_count);
+        if (processor_count > 0 &&
+                amdsmi_get_processor_handles(sockets[i], &processor_count, processors.data()) != AMDSMI_STATUS_SUCCESS) {
+            continue;
+        }
+
+        for (uint32_t j = 0; j < processor_count; j++) {
+            processor_type_t processor_type;
+            if (amdsmi_get_processor_type(processors[j], &processor_type) != AMDSMI_STATUS_SUCCESS ||
+                    processor_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) {
+                continue;
+            }
+
+            if ((int)gpu_count == gpu_id) {
+                *device = processors[j];
+                return;
+            }
+            gpu_count++;
+        }
+    }
 }
 
 inline void gpu_monitor_shutdown() {
-    rsmi_shut_down();
+    amdsmi_shut_down();
 }
 
 inline GpuUtil gpu_get_utilization(PufferGpuDevice device) {
     GpuUtil out = {};
+    if (device == nullptr) {
+        return out;
+    }
+
     uint32_t busy = 0;
-    if (rsmi_dev_busy_percent_get(device, &busy) == RSMI_STATUS_SUCCESS) {
+    if (amdsmi_get_gpu_busy_percent(device, &busy) == AMDSMI_STATUS_SUCCESS) {
         out.gpu_percent = (float)busy;
     }
 
-    uint64_t used = 0, total = 0;
-    if (rsmi_dev_memory_usage_get(device, RSMI_MEM_TYPE_VRAM, &used) == RSMI_STATUS_SUCCESS &&
-            rsmi_dev_memory_total_get(device, RSMI_MEM_TYPE_VRAM, &total) == RSMI_STATUS_SUCCESS &&
-            total > 0) {
-        out.gpu_mem = 100.0f * (float)used / (float)total;
+    amdsmi_vram_usage_t vram = {};
+    if (amdsmi_get_gpu_vram_usage(device, &vram) == AMDSMI_STATUS_SUCCESS &&
+            vram.vram_total > 0) {
+        out.gpu_mem = 100.0f * (float)vram.vram_used / (float)vram.vram_total;
     }
 
     size_t free_bytes = 0, total_bytes = 0;