From 499c93086228d3edca7dc57773c507cb42a6abb8 Mon Sep 17 00:00:00 2001 From: Feodor Kichatov Date: Tue, 26 May 2026 10:19:50 +0200 Subject: [PATCH 1/2] hip/rocm support, with multiprocessing-safe sweeps --- build.sh | 289 ++++++++++++++++++++++++++++++++++------- pufferlib/pufferl.py | 53 +++++--- pufferlib/sweep.py | 94 +++++++------- src/bindings.cu | 43 ++---- src/curriculum.cu | 18 ++- src/ocean.cu | 6 + src/pufferlib.cu | 137 ++++++++++++++++--- src/rocm_cuda_shim.cpp | 57 ++++++++ src/vecenv.h | 6 +- 9 files changed, 533 insertions(+), 170 deletions(-) create mode 100644 src/rocm_cuda_shim.cpp diff --git a/build.sh b/build.sh index 99a898fc6f..3fa248603a 100755 --- a/build.sh +++ b/build.sh @@ -10,10 +10,12 @@ set -e # ./build.sh breakout --fast # Standalone executable (optimized) # ./build.sh breakout --web # Emscripten web build # ./build.sh breakout --profile # Kernel profiling binary +# ./build.sh breakout --rocm # HIP/ROCm training backend +# ./build.sh breakout --cuda # CUDA training backend # ./build.sh all # Build all envs with default and --float if [ -z "$1" ]; then - echo "Usage: ./build.sh ENV_NAME [--float] [--debug] [--local|--fast|--web|--profile|--cpu|--all]" + echo "Usage: ./build.sh ENV_NAME [--float] [--debug] [--cuda|--rocm] [--local|--fast|--web|--profile|--cpu|--all]" exit 1 fi ENV=$1 @@ -28,6 +30,8 @@ for arg in "$@"; do --web) MODE=web ;; --profile) MODE=profile ;; --cpu) MODE=cpu; PRECISION="-DPRECISION_FLOAT" ;; + --cuda) BACKEND=cuda ;; + --rocm) BACKEND=rocm ;; *) echo "Error: unknown argument '$arg'" && exit 1 ;; esac done @@ -205,57 +209,9 @@ elif [ "$MODE" = "web" ]; then exit 0 fi -# Find cuDNN path -CUDA_HOME=${CUDA_HOME:-${CUDA_PATH:-$(dirname "$(dirname "$(which nvcc)")")}} -CUDNN_IFLAG="" -CUDNN_LFLAG="" -for dir in /usr/local/cuda/include /usr/include; do - if [ -f "$dir/cudnn.h" ]; then - CUDNN_IFLAG="-I$dir" - break - fi -done -for dir in /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu; do - if [ -f "$dir/libcudnn.so" ]; then - CUDNN_LFLAG="-L$dir" - break - fi -done -if [ -z "$CUDNN_IFLAG" ]; then - CUDNN_IFLAG=$("$PYTHON_BIN" -c "import nvidia.cudnn, os; print('-I' + os.path.join(nvidia.cudnn.__path__[0], 'include'))" 2>/dev/null || echo "") -fi -if [ -z "$CUDNN_LFLAG" ]; then - CUDNN_LFLAG=$("$PYTHON_BIN" -c "import nvidia.cudnn, os; print('-L' + os.path.join(nvidia.cudnn.__path__[0], 'lib'))" 2>/dev/null || echo "") -fi - -# NCCL include/lib fallback (mirrors the cuDNN fallback above). -# Needed when NCCL is provided by the nvidia-nccl-cu12 wheel in the active venv. -NCCL_IFLAG="" -NCCL_LFLAG="" -for dir in /usr/include /usr/local/cuda/include; do - if [ -f "$dir/nccl.h" ]; then NCCL_IFLAG="-I$dir"; break; fi -done -for dir in /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64; do - if [ -f "$dir/libnccl.so" ] || [ -f "$dir/libnccl.so.2" ]; then NCCL_LFLAG="-L$dir"; break; fi -done -if [ -z "$NCCL_IFLAG" ]; then - NCCL_IFLAG=$("$PYTHON_BIN" -c "import nvidia.nccl, os; print('-I' + os.path.join(nvidia.nccl.__path__[0], 'include'))" 2>/dev/null || echo "") -fi -if [ -z "$NCCL_LFLAG" ]; then - NCCL_LFLAG=$("$PYTHON_BIN" -c "import nvidia.nccl, os; print('-L' + os.path.join(nvidia.nccl.__path__[0], 'lib'))" 2>/dev/null || echo "") -fi - -WHEEL_RPATH_FLAGS=() -for lib_flag in "$CUDNN_LFLAG" "$NCCL_LFLAG"; do - if [[ "$lib_flag" == -L* ]]; then - WHEEL_RPATH_FLAGS+=("-Wl,-rpath,${lib_flag#-L}") - fi -done - export CCACHE_DIR="${CCACHE_DIR:-$HOME/.ccache}" export CCACHE_BASEDIR="$(pwd)" export CCACHE_COMPILERCHECK=content -NVCC="ccache $CUDA_HOME/bin/nvcc" CC="${CC:-$(command -v ccache >/dev/null && echo 'ccache clang' || echo 'clang')}" ARCH=${NVCC_ARCH:-native} @@ -275,6 +231,41 @@ if [ ! -f "$BINDING_SRC" ]; then exit 1 fi +if [ -z "$MODE" ]; then + if [ -z "$BACKEND" ]; then + if "$PYTHON_BIN" -c "from torch.utils.cpp_extension import IS_HIP_EXTENSION; raise SystemExit(0 if IS_HIP_EXTENSION else 1)" 2>/dev/null && ! command -v nvcc >/dev/null 2>&1; then + BACKEND=rocm + else + BACKEND=cuda + fi + fi +elif [ -n "$BACKEND" ]; then + echo "Error: --cuda/--rocm only apply to the training backend" + exit 1 +fi + +if [ "$BACKEND" = "rocm" ] && [ "$ENV" = "nmmo3" ]; then + echo "Error: NMMO3 native encoder is CUDA-only in build.sh --rocm" + exit 1 +fi + +CUDA_HOME=${CUDA_HOME:-${CUDA_PATH:-}} +CUDA_IFLAG="" +if [ "$BACKEND" != "rocm" ]; then + CUDA_HOME=${CUDA_HOME:-${CUDA_PATH:-$(dirname "$(dirname "$(which nvcc)")")}} +fi +if [ "$BACKEND" = "cuda" ] || [ "$MODE" = "profile" ]; then + if [ -z "$CUDA_HOME" ]; then + if command -v nvcc >/dev/null 2>&1; then + CUDA_HOME=$(dirname "$(dirname "$(command -v nvcc)")") + else + echo "Error: nvcc not found. Use --rocm for HIP/ROCm or --cpu for CPU fallback." + exit 1 + fi + fi + CUDA_IFLAG="-I$CUDA_HOME/include" +fi + if [ "$MODE" = "cpu" ]; then echo "Compiling static library for $ENV..." ${CC:-clang} -c "${CLANG_OPT[@]}" $EXTRA_CFLAGS \ @@ -288,7 +279,59 @@ if [ "$MODE" = "cpu" ]; then ar rcs "$STATIC_LIB" "$STATIC_OBJ" fi -if [ -z "$MODE" ]; then +if [ -z "$MODE" ] && [ "$BACKEND" = "cuda" ]; then + # Find cuDNN path + CUDNN_IFLAG="" + CUDNN_LFLAG="" + for dir in /usr/local/cuda/include /usr/include; do + if [ -f "$dir/cudnn.h" ]; then + CUDNN_IFLAG="-I$dir" + break + fi + done + for dir in /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu; do + if [ -f "$dir/libcudnn.so" ]; then + CUDNN_LFLAG="-L$dir" + break + fi + done + if [ -z "$CUDNN_IFLAG" ]; then + CUDNN_IFLAG=$("$PYTHON_BIN" -c "import nvidia.cudnn, os; print('-I' + os.path.join(nvidia.cudnn.__path__[0], 'include'))" 2>/dev/null || echo "") + fi + if [ -z "$CUDNN_LFLAG" ]; then + CUDNN_LFLAG=$("$PYTHON_BIN" -c "import nvidia.cudnn, os; print('-L' + os.path.join(nvidia.cudnn.__path__[0], 'lib'))" 2>/dev/null || echo "") + fi + + # NCCL include/lib fallback (mirrors the cuDNN fallback above). + # Needed when NCCL is provided by the nvidia-nccl-cu12 wheel in the active venv. + NCCL_IFLAG="" + NCCL_LFLAG="" + for dir in /usr/include /usr/local/cuda/include; do + if [ -f "$dir/nccl.h" ]; then NCCL_IFLAG="-I$dir"; break; fi + done + for dir in /usr/lib/x86_64-linux-gnu /usr/local/cuda/lib64; do + if [ -f "$dir/libnccl.so" ] || [ -f "$dir/libnccl.so.2" ]; then NCCL_LFLAG="-L$dir"; break; fi + done + if [ -z "$NCCL_IFLAG" ]; then + NCCL_IFLAG=$("$PYTHON_BIN" -c "import nvidia.nccl, os; print('-I' + os.path.join(nvidia.nccl.__path__[0], 'include'))" 2>/dev/null || echo "") + fi + if [ -z "$NCCL_LFLAG" ]; then + NCCL_LFLAG=$("$PYTHON_BIN" -c "import nvidia.nccl, os; print('-L' + os.path.join(nvidia.nccl.__path__[0], 'lib'))" 2>/dev/null || echo "") + fi + + WHEEL_RPATH_FLAGS=() + for lib_flag in "$CUDNN_LFLAG" "$NCCL_LFLAG"; do + if [[ "$lib_flag" == -L* ]]; then + WHEEL_RPATH_FLAGS+=("-Wl,-rpath,${lib_flag#-L}") + fi + done + + if command -v ccache >/dev/null 2>&1; then + NVCC="ccache $CUDA_HOME/bin/nvcc" + else + NVCC="$CUDA_HOME/bin/nvcc" + fi + echo "Compiling CUDA ($ARCH) training backend with $ENV binding..." $NVCC -c -arch=$ARCH -Xcompiler -fPIC \ -Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=1 \ @@ -320,6 +363,152 @@ if [ -z "$MODE" ]; then "${LINK_CMD[@]}" echo "Built: $OUTPUT" +elif [ -z "$MODE" ] && [ "$BACKEND" = "rocm" ]; then + mapfile -t ROCM_INFO < <("$PYTHON_BIN" - <<'PY' +import os +from torch.utils.cpp_extension import ROCM_HOME, library_paths, include_paths + +rocm_home = os.environ.get("ROCM_HOME") or ROCM_HOME +if not rocm_home: + raise SystemExit("ROCM_HOME not found. Install/use a ROCm-enabled PyTorch environment.") +print(rocm_home) +print(os.environ.get("HIPCC") or os.path.join(rocm_home, "bin", "hipcc")) +print(os.pathsep.join(include_paths("cuda"))) +print(os.pathsep.join(library_paths("cuda"))) +PY +) + ROCM_HOME=${ROCM_INFO[0]} + HIPCC=${ROCM_INFO[1]} + ROCM_INCLUDE_PATHS=${ROCM_INFO[2]} + ROCM_LIBRARY_PATHS=${ROCM_INFO[3]} + + if [ ! -x "$HIPCC" ]; then + if command -v hipcc >/dev/null 2>&1; then + HIPCC=$(command -v hipcc) + else + echo "Error: hipcc not found" + exit 1 + fi + fi + + if [ -z "$HIP_CLANG_PATH" ] || [ ! -x "$HIP_CLANG_PATH/clang++" ]; then + for dir in "$ROCM_HOME/lib/llvm/bin" /usr/lib/llvm/*/bin; do + if [ -x "$dir/clang++" ]; then + export HIP_CLANG_PATH="$dir" + break + fi + done + fi + + HIPIFY_SRC="build/hip/src" + HIPIFY_SRC_ABS="$(pwd)/$HIPIFY_SRC" + SRC_ABS="$(pwd)/src" + echo "Hipifying CUDA sources into $HIPIFY_SRC..." + rm -rf "$HIPIFY_SRC" + "$PYTHON_BIN" - <= train_epochs else backend.log(pufferl) + + should_early_stop = False + logs.setdefault('early_stop_threshold', -0.5) + logs.setdefault('is_loss_nan', False) + + # NaN loss is a failed training run for both sweeps and regular train. + if any(np.isnan(v) for v in logs.get('loss', {}).values()): + logs['is_loss_nan'] = True + should_early_stop = True + elif (is_sweep and epoch < train_epochs and pufferl.global_step > min(0.20*total_timesteps, 100_000_000)): + # side effect: writes `early_stop_threshold` to logs + should_early_stop = sweep_early_stop.early_stop(logs, target_key) + flat_logs = {**flat_logs, **dict(unroll_nested_dict(logs))} if epoch < train_epochs: @@ -283,9 +297,7 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False): if epoch < train_epochs: all_logs.append(flat_logs) - if (sweep_obj is not None - and pufferl.global_step > min(0.20*total_timesteps, 100_000_000) and - sweep_obj.early_stop(logs, target_key)): + if should_early_stop: break elif flat_logs['env/n'] > args['eval_episodes']: break @@ -327,13 +339,14 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False): all_logs.append(flat_logs) # Downsample results + metrics = {k: [[]] for k in all_logs[0] if k != 'is_loss_nan'} n = args['sweep']['downsample'] - metrics = {k: [[]] for k in all_logs[0]} logged_timesteps = all_logs[-1]['agent_steps'] next_bin = logged_timesteps / (n - 1) if n > 1 else np.inf for log in all_logs: for k, v in log.items(): - metrics[k][-1].append(v) + if k != 'is_loss_nan': + metrics[k][-1].append(v) if log['agent_steps'] < next_bin: continue @@ -346,6 +359,8 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False): for k in metrics: metrics[k][-1] = all_logs[-1][k] + is_loss_nan = any(log.get('is_loss_nan', False) for log in all_logs) + # Match-mode: single observation at final-training cost. Protein's curve # fit collapses to one point — we only trust the match winrate, not any # training-time proxy. Replicate the scalar across all downsample bins so @@ -358,10 +373,10 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False): log_dir = os.path.join(args['log_dir'], args['env_name']) os.makedirs(log_dir, exist_ok=True) with open(os.path.join(log_dir, run_id + '.json'), 'w') as f: - json.dump({**args, 'metrics': metrics}, f) + json.dump({**args, 'is_loss_nan': is_loss_nan, 'metrics': metrics}, f) if args['wandb']: - if sweep_obj is None and model_path: # Don't spam uploads during sweeps + if not is_sweep and model_path: # Don't spam uploads during sweeps artifact = wandb.Artifact(run_id, type='model') artifact.add_file(model_path) wandb.run.log_artifact(artifact) @@ -379,6 +394,9 @@ def _train(env_name, args, sweep_obj=None, result_queue=None, verbose=False): def train(env_name, args=None, gpus=None, **kwargs): args = args or load_config(env_name) validate_config(args) + sweep_obj = kwargs.pop('sweep_obj', None) + if sweep_obj is not None and 'sweep_early_stop' not in kwargs: + kwargs['sweep_early_stop'] = sweep_obj.early_stop_state() subprocess = gpus is not None gpus = list(gpus or range(args['train']['gpus'])) @@ -395,11 +413,8 @@ def train(env_name, args=None, gpus=None, **kwargs): worker_args['rank'] = rank worker_args['gpu_id'] = gpu_id if rank == 0 and not subprocess: - _train(env_name, worker_args, verbose=True) + _train(env_name, worker_args, verbose=True, **kwargs) else: - # Protein's GP models live on cuda:0 on non-WSL setups; spawn-pickling - # them works fine via CUDA IPC. On WSL, sweep.py forces device='cpu' - # at construction so there's nothing to move. ctx.Process(target=_train, args=(env_name, worker_args), kwargs=kwargs).start() @@ -407,7 +422,7 @@ def sweep(env_name, args=None, pareto=False): '''Train entry point. Handles single-GPU, multi-GPU DDP, and sweeps.''' args = args or load_config(env_name) exp_gpus = args['train']['gpus'] - sweep_gpus = args['sweep']['gpus'] or len(os.listdir('/proc/driver/nvidia/gpus')) + sweep_gpus = args['sweep']['gpus'] or torch.cuda.device_count() args['vec']['num_threads'] //= (sweep_gpus // exp_gpus) args['no_model_upload'] = True @@ -463,7 +478,7 @@ def sweep(env_name, args=None, pareto=False): exp_args = deepcopy(args) active[gpu_id] = exp_args train(env_name, exp_args, range(gpu_id, gpu_id + exp_gpus), - sweep_obj=sweep_obj, result_queue=result_queue) + sweep_early_stop=sweep_obj.early_stop_state(), result_queue=result_queue) def eval(env_name, args=None, load_path=None): '''Evaluate a trained policy. Supports both native and --slowly torch backends.''' diff --git a/pufferlib/sweep.py b/pufferlib/sweep.py index fc753e5395..ab1ed34fe1 100644 --- a/pufferlib/sweep.py +++ b/pufferlib/sweep.py @@ -22,6 +22,48 @@ from sklearn.linear_model import LogisticRegression EPSILON = 1e-6 +LOGIT_MIN = -5.0 +LOGIT_MAX = 100.0 +EARLY_STOP_THRESHOLD_FLOOR = LOGIT_MIN + +def _has_nan_loss(logs): + return any(math.isnan(v) for v in logs.get('loss', {}).values()) + +def logit_transform(value, epsilon=1e-9): + value = np.clip(value, epsilon, 1 - epsilon) + return np.clip(np.log(value / (1 - value)), LOGIT_MIN, LOGIT_MAX) + +class NanLossEarlyStop: + def early_stop(self, logs, target_key): + if _has_nan_loss(logs): + logs['is_loss_nan'] = True + return True + return False + +class ProteinEarlyStop: + def __init__(self, metric_distribution, threshold_model): + self.metric_distribution = metric_distribution + self.threshold_model = threshold_model + self.running_targets = deque(maxlen=30) + + def early_stop(self, logs, target_key): + if _has_nan_loss(logs): + logs['is_loss_nan'] = True + return True + + target_name = target_key.split('/', 1)[1] + if 'uptime' not in logs or target_name not in logs.get('env', {}): + return False + + metric_val, cost = logs['env'][target_name], logs['uptime'] + self.running_targets.append(metric_val) + score = max(np.mean(self.running_targets), metric_val) + if self.metric_distribution == 'percentile': + score = logit_transform(score) + + threshold = self.threshold_model.get_threshold(cost) + logs['early_stop_threshold'] = max(threshold, EARLY_STOP_THRESHOLD_FLOOR) + return score < threshold def unroll_nested_dict(d): if not isinstance(d, dict): @@ -335,11 +377,8 @@ def observe(self, hypers, score, cost, is_failure=False): is_failure=is_failure, )) - def early_stop(self, logs, target_key): - if any("loss/" in k and np.isnan(v) for k, v in logs.items()): - logs['is_loss_nan'] = True - return True - return False + def early_stop_state(self): + return NanLossEarlyStop() class ParetoGenetic: @@ -392,11 +431,8 @@ def observe(self, hypers, score, cost, is_failure=False): is_failure=is_failure, )) - def early_stop(self, logs, target_key): - if any("loss/" in k and np.isnan(v) for k, v in logs.items()): - logs['is_loss_nan'] = True - return True - return False + def early_stop_state(self): + return NanLossEarlyStop() class ExactGPModel(ExactGP): @@ -902,16 +938,11 @@ def suggest(self, fill, fixed_total_timesteps=None): best = suggestions[best_idx] return self.hyperparameters.to_dict(best, fill), info - def logit_transform(self, value, epsilon=1e-9): - value = np.clip(value, epsilon, 1 - epsilon) - logit = math.log(value / (1 - value)) - return np.clip(logit, -5, 100) - def observe(self, hypers, score, cost, is_failure=False): params = self.hyperparameters.from_dict(hypers) if self.metric_distribution == 'percentile': - score = self.logit_transform(score) + score = logit_transform(score) new_observation = dict( input=params, @@ -948,32 +979,5 @@ def observe(self, hypers, score, cost, is_failure=False): self.top_observations.append(new_observation) self.top_observations.sort(key=lambda x: x['output'], reverse=True) - def get_early_stop_threshold(self, cost): - return self.stop_threshold_model.get_threshold(cost) - - def should_stop(self, score, cost): - threshold = self.get_early_stop_threshold(cost) - - if self.metric_distribution == 'percentile': - score = self.logit_transform(score) - - return score < threshold - - def early_stop(self, logs, target_key): - for k, v in logs['loss'].items(): - if np.isnan(v): - logs['is_loss_nan'] = True - return True - - if 'uptime' not in logs or target_key not in logs: - return False - - metric_val, cost = logs['env'][target_key], logs['uptime'] - self._running_target_buffer.append(metric_val) - target_running_mean = np.mean(self._running_target_buffer) - threshold = self.get_early_stop_threshold(cost) - logs['early_stop_threshold'] = max(threshold, -5) - if self.should_stop(max(target_running_mean, metric_val), cost): - logs['is_loss_nan'] = False - return True - return False + def early_stop_state(self): + return ProteinEarlyStop(self.metric_distribution, self.stop_threshold_model) diff --git a/src/bindings.cu b/src/bindings.cu index e118e72168..7f6c82054f 100644 --- a/src/bindings.cu +++ b/src/bindings.cu @@ -75,18 +75,11 @@ pybind11::dict puf_log(pybind11::object pufferl_obj) { // Utilization pybind11::dict util_dict; - nvmlUtilization_t util; - nvmlDeviceGetUtilizationRates(pufferl.nvml_device, &util); - util_dict["gpu_percent"] = (float)util.gpu; - - nvmlMemory_t mem; - nvmlDeviceGetMemoryInfo(pufferl.nvml_device, &mem); - util_dict["gpu_mem"] = 100.0f * (float)mem.used / (float)mem.total; - - size_t cuda_free, cuda_total; - cudaMemGetInfo(&cuda_free, &cuda_total); - util_dict["vram_used_gb"] = (float)(cuda_total - cuda_free) / (1024.0f * 1024.0f * 1024.0f); - util_dict["vram_total_gb"] = (float)cuda_total / (1024.0f * 1024.0f * 1024.0f); + GpuUtil util = gpu_get_utilization(pufferl.gpu_device); + util_dict["gpu_percent"] = util.gpu_percent; + util_dict["gpu_mem"] = util.gpu_mem; + util_dict["vram_used_gb"] = util.vram_used_gb; + util_dict["vram_total_gb"] = util.vram_total_gb; long rss_kb = 0; FILE* f = fopen("/proc/self/status", "r"); @@ -491,25 +484,15 @@ PYBIND11_MODULE(_C, m) { }); // Standalone utilization monitor (no PuffeRL instance needed) m.def("get_utilization", [](int gpu_id) { - static bool nvml_inited = false; - if (!nvml_inited) { nvmlInit(); nvml_inited = true; } - py::dict util_dict; - nvmlDevice_t device; - nvmlDeviceGetHandleByIndex(gpu_id, &device); - - nvmlUtilization_t util; - nvmlDeviceGetUtilizationRates(device, &util); - util_dict["gpu_percent"] = (float)util.gpu; - - nvmlMemory_t mem; - nvmlDeviceGetMemoryInfo(device, &mem); - util_dict["gpu_mem"] = 100.0f * (float)mem.used / (float)mem.total; - - size_t cuda_free, cuda_total; - cudaMemGetInfo(&cuda_free, &cuda_total); - util_dict["vram_used_gb"] = (float)(cuda_total - cuda_free) / (1024.0f * 1024.0f * 1024.0f); - util_dict["vram_total_gb"] = (float)cuda_total / (1024.0f * 1024.0f * 1024.0f); + PufferGpuDevice device; + gpu_monitor_init(gpu_id, &device); + GpuUtil util = gpu_get_utilization(device); + gpu_monitor_shutdown(); + util_dict["gpu_percent"] = util.gpu_percent; + util_dict["gpu_mem"] = util.gpu_mem; + util_dict["vram_used_gb"] = util.vram_used_gb; + util_dict["vram_total_gb"] = util.vram_total_gb; long rss_kb = 0; FILE* f = fopen("/proc/self/status", "r"); diff --git a/src/curriculum.cu b/src/curriculum.cu index 4addabd8a2..089a210126 100644 --- a/src/curriculum.cu +++ b/src/curriculum.cu @@ -125,10 +125,22 @@ void close_state_buffer(StateBuffer* buf) { #ifdef PUFFER_CURRICULUM_IMPL #define PRIO_WARP_SIZE 32 -#define PRIO_FULL_MASK 0xffffffff +#ifdef USE_ROCM +using PufWarpMask = unsigned long long; +#else +using PufWarpMask = unsigned int; +#endif #define PRIO_BLOCK_SIZE 256 #define PRIO_NUM_WARPS (PRIO_BLOCK_SIZE / PRIO_WARP_SIZE) +__device__ __forceinline__ float prio_warp_sum(float val, int width = PRIO_WARP_SIZE) { + PufWarpMask mask = (PufWarpMask)__activemask(); + for (int s = width / 2; s >= 1; s /= 2) { + val += __shfl_down_sync(mask, val, s, width); + } + return val; +} + __device__ __forceinline__ float priority_power(float value, float alpha) { if (alpha == 0.0f) { return 1.0f; @@ -163,9 +175,7 @@ __global__ void compute_prio_abs( local_sum += fabsf(to_float(advantages[offset + t])); } - for (int s = PRIO_WARP_SIZE / 2; s >= 1; s /= 2) { - local_sum += __shfl_down_sync(PRIO_FULL_MASK, local_sum, s); - } + local_sum = prio_warp_sum(local_sum); if (tx == 0) { prio_weights[row] = priority_power(local_sum, prio_alpha) + eps; } diff --git a/src/ocean.cu b/src/ocean.cu index baaa9b7be6..314c8311be 100644 --- a/src/ocean.cu +++ b/src/ocean.cu @@ -1,6 +1,10 @@ // NMMO3 CUDA encoder: multihot, cuDNN conv, embedding, concat, projection // Included by pufferlib.cu — requires precision_t, PrecisionTensor, Allocator, puf_mm, etc. +#ifdef USE_ROCM +static void create_custom_encoder(const std::string&, Encoder*) {} +#else + #include "cudnn_conv2d.cu" // ---- NMMO3 constants ---- @@ -588,3 +592,5 @@ static void create_custom_encoder(const std::string& env_name, Encoder* enc) { }; } } + +#endif diff --git a/src/pufferlib.cu b/src/pufferlib.cu index df6faa57f4..812a400a5a 100644 --- a/src/pufferlib.cu +++ b/src/pufferlib.cu @@ -1,7 +1,11 @@ #include +#ifndef USE_ROCM #include #include #include +#else +#include +#endif #include #include @@ -46,6 +50,110 @@ typedef struct { float accum[NUM_PROF]; } ProfileT; +struct GpuUtil { + float gpu_percent; + float gpu_mem; + float vram_used_gb; + float vram_total_gb; +}; + +#ifndef USE_ROCM +using PufferGpuDevice = nvmlDevice_t; + +inline void gpu_monitor_init(int gpu_id, PufferGpuDevice* device) { + nvmlInit(); + nvmlDeviceGetHandleByIndex(gpu_id, device); +} + +inline void gpu_monitor_shutdown() { + nvmlShutdown(); +} + +inline GpuUtil gpu_get_utilization(PufferGpuDevice device) { + GpuUtil out = {}; + nvmlUtilization_t util; + if (nvmlDeviceGetUtilizationRates(device, &util) == NVML_SUCCESS) { + out.gpu_percent = (float)util.gpu; + } + + nvmlMemory_t mem; + if (nvmlDeviceGetMemoryInfo(device, &mem) == NVML_SUCCESS && mem.total > 0) { + out.gpu_mem = 100.0f * (float)mem.used / (float)mem.total; + } + + size_t free_bytes = 0, total_bytes = 0; + cudaMemGetInfo(&free_bytes, &total_bytes); + if (total_bytes > 0) { + out.vram_used_gb = (float)(total_bytes - free_bytes) / (1024.0f * 1024.0f * 1024.0f); + out.vram_total_gb = (float)total_bytes / (1024.0f * 1024.0f * 1024.0f); + } + return out; +} + +inline void profile_begin(const char* tag, bool enable) { + if (enable) nvtxRangePushA(tag); +} + +inline void profile_end(bool enable) { + if (enable) nvtxRangePop(); +} + +inline void gpu_profiler_start(bool enable) { + if (enable) cudaProfilerStart(); +} + +inline void gpu_profiler_stop(bool enable) { + if (enable) cudaProfilerStop(); +} +#else +using PufferGpuDevice = uint32_t; + +inline void gpu_monitor_init(int gpu_id, PufferGpuDevice* device) { + *device = (uint32_t)gpu_id; + rsmi_init(RSMI_INIT_FLAG_ALL_GPUS); +} + +inline void gpu_monitor_shutdown() { + rsmi_shut_down(); +} + +inline GpuUtil gpu_get_utilization(PufferGpuDevice device) { + GpuUtil out = {}; + uint32_t busy = 0; + if (rsmi_dev_busy_percent_get(device, &busy) == RSMI_STATUS_SUCCESS) { + out.gpu_percent = (float)busy; + } + + uint64_t used = 0, total = 0; + if (rsmi_dev_memory_usage_get(device, RSMI_MEM_TYPE_VRAM, &used) == RSMI_STATUS_SUCCESS && + rsmi_dev_memory_total_get(device, RSMI_MEM_TYPE_VRAM, &total) == RSMI_STATUS_SUCCESS && + total > 0) { + out.gpu_mem = 100.0f * (float)used / (float)total; + } + + size_t free_bytes = 0, total_bytes = 0; + cudaMemGetInfo(&free_bytes, &total_bytes); + if (total_bytes > 0) { + out.vram_used_gb = (float)(total_bytes - free_bytes) / (1024.0f * 1024.0f * 1024.0f); + out.vram_total_gb = (float)total_bytes / (1024.0f * 1024.0f * 1024.0f); + } + return out; +} + +inline void profile_begin(const char*, bool) {} +inline void profile_end(bool) {} +inline void gpu_profiler_start(bool) {} +inline void gpu_profiler_stop(bool) {} +#endif + +inline cudaError_t puf_graph_instantiate(cudaGraphExec_t* exec, cudaGraph_t graph) { +#ifdef USE_ROCM + return cudaGraphInstantiate(exec, graph, nullptr, nullptr, 0); +#else + return cudaGraphInstantiate(exec, graph, 0); +#endif +} + // Data collected by parallel environment workers. Each worker handles // a constant subset of agents struct RolloutBuf { @@ -355,7 +463,7 @@ typedef struct { PrecisionTensor grad_puf; LongTensor rng_offset_puf; // (num_buffers+1,) int64 CUDA device counters ProfileT profile; - nvmlDevice_t nvml_device; + PufferGpuDevice gpu_device; long epoch; long global_step; double start_time; @@ -390,14 +498,6 @@ Dict* log_environments_impl(PuffeRL& pufferl) { return out; } -inline void profile_begin(const char* tag, bool enable) { - if (enable) nvtxRangePushA(tag); -} - -inline void profile_end(bool enable) { - if (enable) nvtxRangePop(); -} - // Thread-local stream for per-buffer threads (set once by thread_init_wrapper) static thread_local cudaStream_t tl_stream = 0; @@ -692,7 +792,7 @@ extern "C" void net_callback_wrapper(void* ctx, int buf, int t) { cudaGraph_t _graph; assert(cudaStreamEndCapture(current_stream, &_graph) == cudaSuccess && "cudaStreamEndCapture failed"); - assert(cudaGraphInstantiate(&pufferl->fused_rollout_cudagraphs[graph], _graph, 0) == cudaSuccess + assert(puf_graph_instantiate(&pufferl->fused_rollout_cudagraphs[graph], _graph) == cudaSuccess && "cudaGraphInstantiate failed"); assert(cudaGraphDestroy(_graph) == cudaSuccess && "cudaGraphDestroy failed"); cudaDeviceSynchronize(); @@ -1497,7 +1597,7 @@ void train_impl(PuffeRL& pufferl) { cudaGraph_t _graph; assert(cudaStreamEndCapture(train_stream, &_graph) == cudaSuccess && "cudaStreamEndCapture failed"); - assert(cudaGraphInstantiate(&pufferl.train_cudagraph, _graph, 0) == cudaSuccess + assert(puf_graph_instantiate(&pufferl.train_cudagraph, _graph) == cudaSuccess && "cudaGraphInstantiate failed"); assert(cudaGraphDestroy(_graph) == cudaSuccess && "cudaGraphDestroy failed"); cudaDeviceSynchronize(); @@ -1860,8 +1960,7 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, cudaEventCreate(&pufferl->profile.events[i]); } memset(pufferl->profile.accum, 0, sizeof(pufferl->profile.accum)); - nvmlInit(); - nvmlDeviceGetHandleByIndex(hypers.gpu_id, &pufferl->nvml_device); + gpu_monitor_init(hypers.gpu_id, &pufferl->gpu_device); // Create policy int input_size = pufferl->env.obs.shape[1]; @@ -2110,10 +2209,8 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, net_callback_wrapper, thread_init_wrapper); static_vec_reset(vec); - if (hypers.profile) { - cudaDeviceSynchronize(); - cudaProfilerStart(); - } + if (hypers.profile) cudaDeviceSynchronize(); + gpu_profiler_start(hypers.profile); double now = wall_clock(); pufferl->start_time = now; @@ -2125,9 +2222,7 @@ std::unique_ptr create_pufferl_impl(HypersT& hypers, void close_impl(PuffeRL& pufferl) { cudaDeviceSynchronize(); - if (pufferl.hypers.profile) { - cudaProfilerStop(); - } + gpu_profiler_stop(pufferl.hypers.profile); cudaGraphExecDestroy(pufferl.train_cudagraph); for (int i = 0; i < pufferl.hypers.horizon * pufferl.hypers.num_buffers; i++) { @@ -2162,7 +2257,7 @@ void close_impl(PuffeRL& pufferl) { for (int i = 0; i < NUM_TRAIN_EVENTS; i++) { cudaEventDestroy(pufferl.profile.events[i]); } - nvmlShutdown(); + gpu_monitor_shutdown(); static_vec_close(pufferl.vec); diff --git a/src/rocm_cuda_shim.cpp b/src/rocm_cuda_shim.cpp new file mode 100644 index 0000000000..0feed14b2e --- /dev/null +++ b/src/rocm_cuda_shim.cpp @@ -0,0 +1,57 @@ +#include + +extern "C" { + +hipError_t cudaHostAlloc(void** ptr, size_t size, unsigned int flags) { + return hipHostMalloc(ptr, size, flags); +} + +hipError_t cudaMalloc(void** ptr, size_t size) { + return hipMalloc(ptr, size); +} + +hipError_t cudaMemcpy(void* dst, const void* src, size_t size, int kind) { + return hipMemcpy(dst, src, size, (hipMemcpyKind)kind); +} + +hipError_t cudaMemcpyAsync(void* dst, const void* src, size_t size, int kind, hipStream_t stream) { + return hipMemcpyAsync(dst, src, size, (hipMemcpyKind)kind, stream); +} + +hipError_t cudaMemset(void* dst, int value, size_t size) { + return hipMemset(dst, value, size); +} + +hipError_t cudaFree(void* ptr) { + return hipFree(ptr); +} + +hipError_t cudaFreeHost(void* ptr) { + return hipHostFree(ptr); +} + +hipError_t cudaSetDevice(int device) { + return hipSetDevice(device); +} + +hipError_t cudaDeviceSynchronize(void) { + return hipDeviceSynchronize(); +} + +hipError_t cudaStreamSynchronize(hipStream_t stream) { + return hipStreamSynchronize(stream); +} + +hipError_t cudaStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) { + return hipStreamCreateWithFlags(stream, flags); +} + +hipError_t cudaStreamQuery(hipStream_t stream) { + return hipStreamQuery(stream); +} + +const char* cudaGetErrorString(hipError_t error) { + return hipGetErrorString(error); +} + +} diff --git a/src/vecenv.h b/src/vecenv.h index ba5c635074..882c73b62a 100644 --- a/src/vecenv.h +++ b/src/vecenv.h @@ -10,7 +10,11 @@ #include #include "tensor.h" +#ifdef USE_ROCM +#include +#else #include +#endif #ifdef __cplusplus extern "C" { @@ -252,7 +256,7 @@ static void* static_omp_threadmanager(void* arg) { Env* envs = vec->envs; - printf("Num workers: %d\n", num_workers); + // printf("Num workers: %d\n", num_workers); while (true) { while (atomic_load(&buffer_states[buf]) != OMP_RUNNING) { if (atomic_load(&threading->shutdown)) { From 1c3bc3d376be05a92c9482f22d06971054abcaad Mon Sep 17 00:00:00 2001 From: Feodor Kichatov Date: Tue, 2 Jun 2026 19:53:57 +0200 Subject: [PATCH 2/2] rocm_smi is deprecated, migrate to amdsmi --- build.sh | 2 +- src/pufferlib.cu | 66 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/build.sh b/build.sh index 3fa248603a..8ca8a6e701 100755 --- a/build.sh +++ b/build.sh @@ -500,7 +500,7 @@ PY build/bindings.o build/rocm_cuda_shim.o "$RAYLIB_A" "${ROCM_LFLAGS[@]}" "${ROCM_RPATH_FLAGS[@]}" - -lamdhip64 -lhipblas -lhiprand -lrccl -lrocm_smi64 + -lamdhip64 -lhipblas -lhiprand -lrccl -lamd_smi $ROCM_OMP_LIB $LINK_OPT "${SHARED_LDFLAGS[@]}" diff --git a/src/pufferlib.cu b/src/pufferlib.cu index 812a400a5a..23f50aabcb 100644 --- a/src/pufferlib.cu +++ b/src/pufferlib.cu @@ -4,7 +4,7 @@ #include #include #else -#include +#include #endif #include #include @@ -106,29 +106,73 @@ inline void gpu_profiler_stop(bool enable) { if (enable) cudaProfilerStop(); } #else -using PufferGpuDevice = uint32_t; +using PufferGpuDevice = amdsmi_processor_handle; inline void gpu_monitor_init(int gpu_id, PufferGpuDevice* device) { - *device = (uint32_t)gpu_id; - rsmi_init(RSMI_INIT_FLAG_ALL_GPUS); + *device = nullptr; + if (amdsmi_init(AMDSMI_INIT_AMD_GPUS) != AMDSMI_STATUS_SUCCESS) { + return; + } + + uint32_t socket_count = 0; + if (amdsmi_get_socket_handles(&socket_count, nullptr) != AMDSMI_STATUS_SUCCESS) { + return; + } + + std::vector sockets(socket_count); + if (socket_count > 0 && + amdsmi_get_socket_handles(&socket_count, sockets.data()) != AMDSMI_STATUS_SUCCESS) { + return; + } + + uint32_t gpu_count = 0; + for (uint32_t i = 0; i < socket_count; i++) { + uint32_t processor_count = 0; + if (amdsmi_get_processor_handles(sockets[i], &processor_count, nullptr) != AMDSMI_STATUS_SUCCESS) { + continue; + } + + std::vector processors(processor_count); + if (processor_count > 0 && + amdsmi_get_processor_handles(sockets[i], &processor_count, processors.data()) != AMDSMI_STATUS_SUCCESS) { + continue; + } + + for (uint32_t j = 0; j < processor_count; j++) { + processor_type_t processor_type; + if (amdsmi_get_processor_type(processors[j], &processor_type) != AMDSMI_STATUS_SUCCESS || + processor_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) { + continue; + } + + if ((int)gpu_count == gpu_id) { + *device = processors[j]; + return; + } + gpu_count++; + } + } } inline void gpu_monitor_shutdown() { - rsmi_shut_down(); + amdsmi_shut_down(); } inline GpuUtil gpu_get_utilization(PufferGpuDevice device) { GpuUtil out = {}; + if (device == nullptr) { + return out; + } + uint32_t busy = 0; - if (rsmi_dev_busy_percent_get(device, &busy) == RSMI_STATUS_SUCCESS) { + if (amdsmi_get_gpu_busy_percent(device, &busy) == AMDSMI_STATUS_SUCCESS) { out.gpu_percent = (float)busy; } - uint64_t used = 0, total = 0; - if (rsmi_dev_memory_usage_get(device, RSMI_MEM_TYPE_VRAM, &used) == RSMI_STATUS_SUCCESS && - rsmi_dev_memory_total_get(device, RSMI_MEM_TYPE_VRAM, &total) == RSMI_STATUS_SUCCESS && - total > 0) { - out.gpu_mem = 100.0f * (float)used / (float)total; + amdsmi_vram_usage_t vram = {}; + if (amdsmi_get_gpu_vram_usage(device, &vram) == AMDSMI_STATUS_SUCCESS && + vram.vram_total > 0) { + out.gpu_mem = 100.0f * (float)vram.vram_used / (float)vram.vram_total; } size_t free_bytes = 0, total_bytes = 0;