Merged

Commits (32)
- `0053b8b` Initial plan (Copilot, Oct 2, 2025)
- `eb5df91` Add Gluon-based Iris implementation and producer-consumer example (Copilot, Oct 2, 2025)
- `77a22b1` Fix Gluon implementation to use Triton language primitives correctly (Copilot, Oct 2, 2025)
- `479eec8` Add Gluon API to main init and create unit test (Copilot, Oct 2, 2025)
- `9bf431a` Add comprehensive documentation for Gluon port (Copilot, Oct 2, 2025)
- `e2f7b2b` Update README with Gluon API documentation and example (Copilot, Oct 2, 2025)
- `0e45b9f` Add final implementation report for Gluon port (Copilot, Oct 2, 2025)
- `7f287a6` Simplify API by removing redundant cur_rank parameter from methods (Copilot, Oct 2, 2025)
- `68bb37a` Rewrite Gluon implementation to properly use @gluon.jit and gl language (Copilot, Oct 3, 2025)
- `44cf831` Fix runtime bugs (mawad-amd, Oct 3, 2025)
- `eb1f434` Fix linter errors (mawad-amd, Oct 3, 2025)
- `1c1eae0` Fix linter errors (mawad-amd, Oct 3, 2025)
- `49be5a6` Fix linter errors (mawad-amd, Oct 3, 2025)
- `e63453a` Apply Ruff auto-fixes (github-actions[bot], Oct 3, 2025)
- `d941d67` Merge branch 'main' into copilot/fix-c90b7802-bcbe-46b5-a825-1b55a3d1… (mawad-amd, Oct 3, 2025)
- `8785f2a` Port 07_gemm_all_scatter example to Gluon (Copilot, Oct 3, 2025)
- `e8f1cf9` Organize Gluon GEMM example in subdirectory with all supporting files (Copilot, Oct 3, 2025)
- `63e0e95` Move Gluon backend to experimental directory (Copilot, Oct 3, 2025)
- `1d61b87` Apply Ruff auto-fixes (github-actions[bot], Oct 3, 2025)
- `2e2b13a` Remove gluon-implementation-summary.md documentation file (Copilot, Oct 3, 2025)
- `5eba090` Clean up documentation and update README with experimental notes (Copilot, Oct 3, 2025)
- `d0f9952` Merge branch 'main' into copilot/fix-c90b7802-bcbe-46b5-a825-1b55a3d1… (mawad-amd, Oct 11, 2025)
- `13cd3f2` Add copy method to Gluon IrisDeviceCtx (Copilot, Oct 11, 2025)
- `ed53945` Merge branch 'main' into copilot/fix-c90b7802-bcbe-46b5-a825-1b55a3d1… (mawad-amd, Oct 15, 2025)
- `07fa0e1` Clean up Gluon implementation: remove docs and GEMM example, make REA… (Copilot, Oct 15, 2025)
- `b28539c` Fix linting error: remove whitespace from blank line in iris_gluon.py (Copilot, Oct 15, 2025)
- `45ef21e` Include experimental (mawad-amd, Oct 15, 2025)
- `f60fe12` Fix logging path (mawad-amd, Oct 15, 2025)
- `75cce7a` Run in latest Triton (mawad-amd, Oct 15, 2025)
- `c86f2ef` Remove API Comparison link from README documentation section (Copilot, Oct 15, 2025)
- `64f2f52` Remove test_iris_gluon.py unit test file (Copilot, Oct 15, 2025)
- `2b7c4a3` Merge branch 'main' into copilot/fix-c90b7802-bcbe-46b5-a825-1b55a3d1… (mawad-amd, Oct 15, 2025)
15 changes: 11 additions & 4 deletions .github/scripts/container_build.sh
```diff
@@ -35,10 +35,17 @@ if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then
     fi

 elif [ "$CONTAINER_RUNTIME" = "docker" ]; then
-    echo "[INFO] Building with Docker..."
-    IMAGE_NAME=${1:-"iris-dev"}
-    # We don't want to build a docker container for now.
-    # bash docker/build.sh "$IMAGE_NAME"
+    echo "[INFO] Checking Docker images..."
+    IMAGE_NAME="iris-dev-triton-aafec41"
+
+    # Check if the triton image exists
+    if docker image inspect "$IMAGE_NAME" &> /dev/null; then
+        echo "[INFO] Using existing Docker image: $IMAGE_NAME"
+    else
+        echo "[WARNING] Docker image $IMAGE_NAME not found"
+        echo "[INFO] Please build it using: ./build_triton_image.sh"
+        echo "[INFO] Or pull it if available from registry"
+    fi
 fi

 echo "[INFO] Container build completed successfully with $CONTAINER_RUNTIME"
```
2 changes: 1 addition & 1 deletion .github/scripts/container_exec.sh
```diff
@@ -80,7 +80,7 @@ if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then
     $EXEC_CMD "$IMAGE" bash -c "$COMMAND"

 elif [ "$CONTAINER_RUNTIME" = "docker" ]; then
-    IMAGE_NAME=${CUSTOM_IMAGE:-${DOCKER_IMAGE_NAME:-"iris-dev"}}
+    IMAGE_NAME=${CUSTOM_IMAGE:-${DOCKER_IMAGE_NAME:-"iris-dev-triton-aafec41"}}

     if ! docker image inspect "$IMAGE_NAME" &> /dev/null; then
         echo "[ERROR] Docker image $IMAGE_NAME not found"
```
2 changes: 1 addition & 1 deletion .github/scripts/container_run.sh
```diff
@@ -25,7 +25,7 @@ if [ "$CONTAINER_RUNTIME" = "apptainer" ]; then
     bash apptainer/run.sh "$@"
 elif [ "$CONTAINER_RUNTIME" = "docker" ]; then
     echo "[INFO] Running with Docker..."
-    IMAGE_NAME=${1:-"iris-dev"}
+    IMAGE_NAME=${1:-"iris-dev-triton-aafec41"}
     WORKSPACE_DIR=${2:-"$(pwd)"}
     bash docker/run.sh "$IMAGE_NAME" "$WORKSPACE_DIR"
 fi
```
31 changes: 30 additions & 1 deletion .github/workflows/iris-external-validation-test.yml
```diff
@@ -63,4 +63,33 @@ jobs:
           "
           echo "::endgroup::"

-          echo "✅ External validation test passed!"
+          echo "✅ External validation test passed!"
+
+  external-gluon-validation-test:
+    name: External Gluon Validation Test
+    needs: build-container-image
+    runs-on: [self-hosted, mi3008x]
+    timeout-minutes: 30
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Cleanup lingering ports before tests
+        run: |
+          bash .github/scripts/cleanup_ports.sh
+
+      - name: Run External Gluon Validation Test
+        run: |
+          set -e
+
+          echo "::group::Running external gluon validation test"
+          bash .github/scripts/container_exec.sh --gpus "0,1" "
+            set -e
+            pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
+            wget -O test_iris_gluon_distributed.py https://gist.githubusercontent.com/mawad-amd/2666dde8ebe2755eb0c4f2108709fcd5/raw/aa567ef3185c37a80d25bc9724ae9589548261b4/test_iris_gluon_distributed.py
+            python test_iris_gluon_distributed.py
+          "
+          echo "::endgroup::"
+
+          echo "✅ External gluon validation test passed!"
```
68 changes: 68 additions & 0 deletions README.md
````diff
@@ -18,6 +18,7 @@
 - **SHMEM-like RMA**: Iris provides SHMEM-like RMA support in Triton.
 - **Simple and Intuitive API**: Iris provides simple and intuitive RMA APIs. Writing multi-GPU programs is as easy as writing single-GPU programs.
 - **Triton-based**: Iris is built on top of Triton and inherits Triton's performance and capabilities.
+- **Gluon-style Aggregate API (Experimental)**: Optional cleaner API using Triton's `@aggregate` decorator for better encapsulation.

 ## Documentation

@@ -98,6 +99,73 @@
 if __name__ == "__main__":
     mp.spawn(_worker, args=(world_size,), nprocs=world_size, join=True)
 ```
+
+### Alternative: Gluon-style Aggregate API (Experimental)
+
+Iris also provides an experimental, cleaner API built on Triton's Gluon and the `@gluon.jit` decorator:
+
+```python
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from triton.experimental import gluon
+from triton.experimental.gluon import language as gl
+import iris.experimental.iris_gluon as iris_gl
+
+# Device-side APIs - the context encapsulates heap_bases
+@gluon.jit
+def kernel(IrisDeviceCtx: gl.constexpr, context_tensor,
+           buffer, buffer_size: gl.constexpr, block_size: gl.constexpr):
+    # Initialize the device context from the encoded tensor
+    ctx = IrisDeviceCtx.initialize(context_tensor)
+
+    pid = gl.program_id(0)
+    block_start = pid * block_size
+    layout: gl.constexpr = gl.BlockedLayout([1], [64], [1], [0])
+    offsets = block_start + gl.arange(0, block_size, layout=layout)
+    mask = offsets < buffer_size
+
+    # Store 1 in the target buffer - no need to pass heap_bases separately!
+    target_rank = 1
+    ctx.store(buffer + offsets, 1, target_rank, mask=mask)
+
+def _worker(rank, world_size):
+    # Torch distributed initialization
+    device_id = rank % torch.cuda.device_count()
+    dist.init_process_group(
+        backend="nccl",
+        rank=rank,
+        world_size=world_size,
+        init_method="tcp://127.0.0.1:29500",
+        device_id=torch.device(f"cuda:{device_id}"),
+    )
+
+    # Iris initialization
+    heap_size = 2**30  # 1 GiB symmetric heap
+    iris_ctx = iris_gl.iris(heap_size)
+    context_tensor = iris_ctx.get_device_context()  # Get the encoded context
+    cur_rank = iris_ctx.get_rank()
+
+    # Iris tensor allocation
+    buffer_size = 4096  # 4K-element buffer
+    buffer = iris_ctx.zeros(buffer_size, device="cuda", dtype=torch.float32)
+
+    # Launch the kernel on rank 0 only
+    block_size = 1024
+    grid = (buffer_size + block_size - 1) // block_size
+    source_rank = 0
+    if cur_rank == source_rank:
+        kernel[(grid,)](iris_gl.IrisDeviceCtx, context_tensor,
+                        buffer, buffer_size, block_size, num_warps=1)
+
+    # Synchronize all ranks
+    iris_ctx.barrier()
+    dist.destroy_process_group()
+
+if __name__ == "__main__":
+    world_size = 2  # Using two ranks
+    mp.spawn(_worker, args=(world_size,), nprocs=world_size, join=True)
+```
+
 ## Quick Start Guide

 ### Quick Installation
````
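A note on what `ctx.store(buffer + offsets, 1, target_rank, mask=mask)` does conceptually: with a symmetric heap, every rank allocates its buffers at the same offset from its own heap base, so a remote store only has to rebase the local pointer onto the target rank's heap base (the bookkeeping the device context hides by encapsulating `heap_bases`). The sketch below models that translation in plain Python; it is not Iris source, and the `translate` function and addresses are illustrative.

```python
# Conceptual sketch of symmetric-heap pointer translation (illustrative, not Iris source).

def translate(ptr: int, cur_rank: int, target_rank: int, heap_bases: list[int]) -> int:
    """Rebase a pointer from cur_rank's heap into target_rank's heap.

    Because each rank allocates objects at the same offset within its own
    symmetric heap, offset = ptr - heap_bases[cur_rank] is valid on any rank.
    """
    offset = ptr - heap_bases[cur_rank]
    return heap_bases[target_rank] + offset

# Two ranks whose heaps start at different (hypothetical) base addresses.
heap_bases = [0x1000, 0x8000]
local_ptr = 0x1040  # an address 0x40 bytes into rank 0's heap
remote_ptr = translate(local_ptr, cur_rank=0, target_rank=1, heap_bases=heap_bases)
assert remote_ptr == 0x8040  # same 0x40 offset, but from rank 1's base
```

This is why the Gluon example only passes `target_rank` to `ctx.store`: the context already knows every rank's heap base, and the offset of `buffer` is the same on all ranks.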