pytorch
diff --git a/‎helion/_compiler/device_ir.py‎
Lines changed: 13 additions & 4 deletions b/‎helion/_compiler/device_ir.py‎
Lines changed: 13 additions & 4 deletions
diff --git a/‎helion/_compiler/indexing_strategy.py‎
Lines changed: 153 additions & 0 deletions b/‎helion/_compiler/indexing_strategy.py‎
Lines changed: 153 additions & 0 deletions
diff --git a/‎helion/_compiler/roll_reduction.py‎
Lines changed: 30 additions & 0 deletions b/‎helion/_compiler/roll_reduction.py‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎helion/_compiler/type_propagation.py‎
Lines changed: 82 additions & 1 deletion b/‎helion/_compiler/type_propagation.py‎
Lines changed: 82 additions & 1 deletion
@@ -52,6 +52,7 @@
 from .type_propagation import GridIndexType
 from .type_propagation import IterType
 from .type_propagation import LiteralType
+from .type_propagation import MulticastTensorType
 from .type_propagation import NumericType
 from .type_propagation import SequenceType
 from .type_propagation import TensorType
@@ -321,12 +322,14 @@ def build_rolled_reductions(self) -> None:
             graph_to_info = {}
             allow_loop = False
 
-            # First, check if any graph contains matmul with rdim
+            # First, check if any graph contains matmul or dev_ptrs multicast with rdim
             # If so, we can't roll any graphs in this reduction dimension
             can_roll_graphs = True
             for graph_info in self.graphs:
                 roller = ReductionRoller(self, rdim, {})
-                if roller.has_matmul_with_rdim(graph_info.graph):
+                if roller.has_matmul_with_rdim(
+                    graph_info.graph
+                ) or roller.has_multicast_tensor_with_rdim(graph_info.graph):
                     can_roll_graphs = False
                     break
 
@@ -783,7 +786,9 @@ def visit_Assign(self, node: ast.Assign) -> None:
         assert isinstance(target.value, ExtendedAST)
         assert target.value._type_info is not None
         target_origin = target.value._type_info.origin  # pyright: ignore[reportOptionalMemberAccess]
-        if not target_origin.is_host():
+        if not target_origin.is_host() and not isinstance(
+            target.value._type_info, MulticastTensorType
+        ):
             # Get the variable name for the error message
             var_name = (
                 target.value.id
@@ -808,7 +813,9 @@ def _assign_subscript(self, target: ast.Subscript, val: object) -> None:
         assert isinstance(target.value, ExtendedAST)
         assert target.value._type_info is not None
         target_origin = target.value._type_info.origin
-        assert target_origin.is_host()
+        assert target_origin.is_host() or isinstance(
+            target.value._type_info, MulticastTensorType
+        )
 
         return hl.store(
             self.visit(target.value),  # pyright: ignore[reportArgumentType]
@@ -841,6 +848,8 @@ def visit_Subscript(self, node: ast.Subscript) -> object:
             if isinstance(node.slice, ast.Constant):
                 return self.visit(value)[self.visit(node.slice)]  # pyright: ignore[reportIndexIssue]
             raise exc.InvalidSequenceSubscription(node.slice)
+        if isinstance(type_info, MulticastTensorType):
+            return hl.load(self.visit(value), self._subscript_slice_proxy(node.slice))  # pyright: ignore[reportArgumentType]
         if type_info is not None and type_info.origin.is_host():
             return hl.load(self.visit(value), self._subscript_slice_proxy(node.slice))  # pyright: ignore[reportArgumentType]
         return hl.subscript(self.visit(value), self._subscript_slice_proxy(node.slice))  # pyright: ignore[reportArgumentType]
 
@@ -8,6 +8,7 @@
 
 import sympy
 import torch
+from torch._inductor.utils import triton_type
 
 from .. import exc
 from .._compat import get_tensor_descriptor_fn_name
@@ -19,10 +20,15 @@
 from .variable_origin import BlockSizeOrigin
 
 if TYPE_CHECKING:
+    from collections.abc import Sequence
+
     from ..runtime.config import Config
     from .device_function import TensorDescriptorArg
     from .inductor_lowering import CodegenState
 
+    SymIntLike = torch.SymInt | int
+    ShapeLike = Sequence[SymIntLike]
+
 
 class IndexingStrategy:
     def codegen_load(
@@ -289,6 +295,153 @@ def codegen_store(
         )
 
 
+class MulticastIndexingStrategy:
+    """
+    Generate pointer math for multicast loads/stores that target several device
+    memory pointers sharing the same indexing.
+
+    The offset and mask are computed once for the ``tensor_like`` template tensor
+    and then broadcast against every entry of ``dev_ptrs``; the ``dev_ptrs``
+    dimensions come first, followed by the subscript dimensions.
+
+    e.g. for a 1D offset tensor and a 1D dev_ptr array, the multicasted offset is:
+    multicast_offset = dev_ptrs[:, None] + offset[None, :]
+    """
+
+    @staticmethod
+    def get_broadcast_str(
+        multicast_shape: ShapeLike,
+        subscript_shape: ShapeLike,
+    ) -> tuple[str, str]:
+        """
+        Build the two subscript suffixes that align pointers with offsets.
+
+        Args:
+            multicast_shape: shape of the dev_ptr tensor.
+            subscript_shape: shape of subscription for each individual tensor.
+
+        Returns:
+            ``(dev_ptrs_suffix, tensor_suffix)``: the first keeps the multicast
+            dims (``:``) and adds ``None`` for every subscript dim, the second
+            does the opposite.
+        """
+        ptr_rank = len(multicast_shape)
+        elem_rank = len(subscript_shape)
+
+        ptr_keys = [":"] * ptr_rank + ["None"] * elem_rank
+        elem_keys = ["None"] * ptr_rank + [":"] * elem_rank
+
+        return f"[{', '.join(ptr_keys)}]", f"[{', '.join(elem_keys)}]"
+
+    @staticmethod
+    def get_mask_expr(
+        state: CodegenState,
+        indexing: SubscriptIndexing,
+        multicast_shape: ShapeLike,
+        subscript_shape: ShapeLike,
+    ) -> ast.AST | None:
+        """
+        Combine the dev_ptrs mask and the per-tensor mask into one expression.
+
+        Args:
+            state: codegen state providing mask variables and tile-strategy helpers.
+            indexing: indexing computed for the template tensor; its mask (if
+                any) is spliced in through the ``tensor_mask`` placeholder.
+            multicast_shape: shape of the dev_ptr tensor.
+            subscript_shape: shape of subscription for each individual tensor.
+
+        Returns:
+            The ``&``-joined mask expression, or ``None`` when neither side
+            contributes a mask.
+        """
+        ptr_bcast, elem_bcast = MulticastIndexingStrategy.get_broadcast_str(
+            multicast_shape, subscript_shape
+        )
+
+        # One mask term per dev_ptrs dimension that maps to a block with a mask var.
+        per_dim_masks: list[str] = []
+        for axis, extent in enumerate(multicast_shape):
+            block_id = CompileEnvironment.current().get_block_id(extent)
+            if block_id is None:
+                continue
+            mask_name = state.codegen.mask_var(block_id)
+            if mask_name is None:
+                continue
+            suffix = state.tile_strategy.expand_str(multicast_shape, axis)
+            per_dim_masks.append(f"({mask_name}{suffix})")
+
+        terms: list[str] = []
+        if per_dim_masks:
+            combined = f"({'&'.join(per_dim_masks)})"
+            if len(per_dim_masks) < len(multicast_shape):
+                # Some dims had no mask: broadcast up to the full dev_ptrs shape.
+                shape_text = state.tile_strategy.shape_str(multicast_shape)
+                combined = f"tl.broadcast_to({combined}, {shape_text})"
+            terms.append(f"({combined}){ptr_bcast}")
+
+        if indexing.has_mask():
+            terms.append(f"(tensor_mask){elem_bcast}")
+            return expr_from_string("&".join(terms), tensor_mask=indexing.mask_expr)
+        return expr_from_string("&".join(terms)) if terms else None
+
+    @staticmethod
+    def _plan_access(
+        state: CodegenState,
+        multicast_tensor: tuple[torch.Tensor, torch.Tensor],
+        subscript: list[object],
+        extra_mask: ast.AST | None,
+    ) -> tuple[SubscriptIndexing, ast.AST | None, str, str, str]:
+        """Compute the pieces shared by ``codegen_load`` and ``codegen_store``.
+
+        Returns:
+            ``(indexing, mask_or_None, dev_ptrs_suffix, tensor_suffix, dtype)``
+            where ``dtype`` is the Triton type string of the template tensor.
+        """
+        template, ptr_table = multicast_tensor
+        indexing = SubscriptIndexing.create(state, template, subscript, extra_mask)
+        element_shape = SubscriptIndexing.compute_shape(template, subscript)
+        ptr_shape = [*ptr_table.size()]
+
+        mask = MulticastIndexingStrategy.get_mask_expr(
+            state, indexing, ptr_shape, element_shape
+        )
+        ptr_bcast, elem_bcast = MulticastIndexingStrategy.get_broadcast_str(
+            ptr_shape, element_shape
+        )
+        return indexing, mask, ptr_bcast, elem_bcast, triton_type(template.dtype)
+
+    @staticmethod
+    def codegen_load(
+        state: CodegenState,
+        multicast_tensor: tuple[torch.Tensor, torch.Tensor],
+        dev_ptrs_ast: ast.AST,
+        subscript: list[object],
+        extra_mask: ast.AST | None,
+    ) -> ast.AST:
+        """
+        Emit a ``tl.load`` that reads the same subscript from every dev_ptr.
+
+        Args:
+            state: current codegen state.
+            multicast_tensor: ``(tensor_like, dev_ptrs)`` pair.
+            dev_ptrs_ast: expression substituted for the pointer table.
+            subscript: index applied to the template tensor.
+            extra_mask: additional mask forwarded to ``SubscriptIndexing.create``.
+
+        Returns:
+            The ``tl.load(...)`` expression; ``other=0`` is only passed when a
+            mask is present.
+        """
+        indexing, mask, ptr_bcast, elem_bcast, dtype = (
+            MulticastIndexingStrategy._plan_access(
+                state, multicast_tensor, subscript, extra_mask
+            )
+        )
+
+        if mask is None:
+            mask = expr_from_string("None")
+            fill = ""
+        else:
+            fill = ", other=0"
+
+        return expr_from_string(
+            f"tl.load((base.to(tl.pointer_type({dtype}))){ptr_bcast} + (offset){elem_bcast}, mask{fill})",
+            base=dev_ptrs_ast,
+            offset=indexing.index_expr,
+            mask=mask,
+        )
+
+    @staticmethod
+    def codegen_store(
+        state: CodegenState,
+        multicast_tensor: tuple[torch.Tensor, torch.Tensor],
+        dev_ptrs_ast: ast.AST,
+        subscript: list[object],
+        value: ast.AST,
+        extra_mask: ast.AST | None,
+    ) -> ast.AST:
+        """
+        Emit a ``tl.store`` that writes ``value`` through every dev_ptr.
+
+        Args:
+            state: current codegen state.
+            multicast_tensor: ``(tensor_like, dev_ptrs)`` pair.
+            dev_ptrs_ast: expression substituted for the pointer table.
+            subscript: index applied to the template tensor.
+            value: expression producing the data to store.
+            extra_mask: additional mask forwarded to ``SubscriptIndexing.create``.
+
+        Returns:
+            The ``tl.store(...)`` expression.
+        """
+        indexing, mask, ptr_bcast, elem_bcast, dtype = (
+            MulticastIndexingStrategy._plan_access(
+                state, multicast_tensor, subscript, extra_mask
+            )
+        )
+
+        if mask is None:
+            mask = expr_from_string("None")
+
+        return expr_from_string(
+            f"tl.store(base.to(tl.pointer_type({dtype})){ptr_bcast} + (offset){elem_bcast}, value, mask)",
+            base=dev_ptrs_ast,
+            value=value,
+            offset=indexing.index_expr,
+            mask=mask,
+        )
+
+
 class SubscriptIndexing(NamedTuple):
     index_expr: ast.AST
     mask_expr: ast.AST
 
@@ -6,6 +6,7 @@
 import torch
 from torch.fx import map_arg
 
+from ..language import _MEMORY_OPS
 from ..language._tracing_ops import _for_loop
 from ..language._tracing_ops import _get_symnode
 from ..language._tracing_ops import _host_tensor
@@ -277,6 +278,35 @@ def is_matmul_with_rdim(node: torch.fx.Node) -> bool:
 
         return any(is_matmul_with_rdim(node) for node in graph.nodes)
 
+    def has_multicast_tensor_with_rdim(self, graph: torch.fx.Graph) -> bool:
+        """Return True if any memory op in ``graph`` multicasts over ``self.rdim``.
+
+        A node qualifies when it is a ``call_function`` whose target is in
+        ``_MEMORY_OPS``, its first argument is a 2-tuple, and the fake tensor
+        stored under ``meta["val"]`` of the tuple's second element has a
+        dimension whose block id equals ``self.rdim.block_id``.
+        """
+
+        def touches_rdim(node: torch.fx.Node) -> bool:
+            """Check a single node for a multicast dev_ptr sized by rdim."""
+            if node.op != "call_function" or node.target not in _MEMORY_OPS:
+                return False
+
+            target_arg = node.args[0]
+            # Only the (tensor_like, dev_ptrs) pair form is a multicast access.
+            if not isinstance(target_arg, tuple) or len(target_arg) != 2:
+                return False
+
+            dev_ptrs_node = target_arg[1]
+            assert isinstance(dev_ptrs_node, torch.fx.Node)
+            fake = dev_ptrs_node.meta.get("val", None)
+            if not isinstance(fake, torch.Tensor):
+                return False
+
+            return any(
+                CompileEnvironment.current().get_block_id(extent)
+                == self.rdim.block_id
+                for extent in fake.size()
+            )
+
+        for node in graph.nodes:
+            if touches_rdim(node):
+                return True
+        return False
+
     def process(self, graph: torch.fx.Graph) -> torch.fx.Graph:
         for node in graph.nodes:
             if self.should_go_in_inner_graph(node):
 
@@ -27,6 +27,7 @@
 from ..autotuner.config_spec import BlockSizeSpec
 from ..language._decorators import get_device_func_replacement
 from ..language._decorators import is_api_func
+from ..language.multicast_tensor import MulticastTensor
 from ..language.tile_proxy import Tile
 from ..language.tile_proxy import _CheckForIndexCalls
 from .ast_extension import ExtendedAST
@@ -1289,6 +1290,86 @@ def propagate_attribute(self, attr: str, origin: AttributeOrigin) -> TypeInfo:
         return self.element_types[attr]
 
 
+class MulticastTensorType(ClassType):
+    """Type info for a ``MulticastTensor``.
+
+    ``element_types`` is expected to hold two ``TensorType`` entries:
+    ``"tensor_like"`` (the template tensor) and ``"dev_ptrs"`` (the pointer
+    table). Indexing results have the ``dev_ptrs`` dimensions followed by the
+    indexed ``tensor_like`` dimensions.
+    """
+
+    element_types: dict[str, TypeInfo]  # pyright: ignore[reportIncompatibleVariableOverride]
+
+    def proxy(self) -> MulticastTensor:  # pyright: ignore[reportIncompatibleMethodOverride]
+        """Build a ``MulticastTensor`` from the element proxies.
+
+        The fake dispatch mode is popped while the object is constructed and
+        restored afterwards, with proxy-mode tracing disabled throughout.
+
+        Raises:
+            AssertionError: if no fake dispatch mode is active, or if either
+                element is not a ``TensorType``.
+        """
+        with proxy_tensor.disable_proxy_modes_tracing():
+            fake_mode = torch._C._unset_dispatch_mode(  # pyright: ignore[reportAttributeAccessIssue]
+                torch._C._TorchDispatchModeKey.FAKE  # pyright: ignore[reportAttributeAccessIssue]
+            )
+            # Check the invariant before entering ``try``: an assertion raised
+            # from ``finally`` would replace any exception coming from the body.
+            assert fake_mode is not None
+            try:
+                tensor_like_type = self.element_types["tensor_like"]
+                dev_ptrs_type = self.element_types["dev_ptrs"]
+                assert isinstance(tensor_like_type, TensorType)
+                assert isinstance(dev_ptrs_type, TensorType)
+                return MulticastTensor(
+                    tensor_like_type.proxy(),
+                    dev_ptrs_type.proxy(),
+                )
+            finally:
+                torch._C._set_dispatch_mode(fake_mode)  # pyright: ignore[reportAttributeAccessIssue]
+
+    def merge(self, other: TypeInfo) -> TypeInfo:
+        """Merge element-wise with another ``MulticastTensorType`` that has the
+        same keys; otherwise defer to the base-class merge."""
+        if not isinstance(other, MulticastTensorType):
+            return super().merge(other)
+
+        mine = self.element_types
+        theirs = other.element_types
+        if mine.keys() != theirs.keys():
+            return super().merge(other)
+
+        return MulticastTensorType(
+            origin=other.origin,
+            element_types={key: mine[key].merge(theirs[key]) for key in mine},
+        )
+
+    def _device_indexing_size(self, key: TypeInfo) -> list[int | torch.SymInt]:
+        """Return the ``dev_ptrs`` shape followed by the shape of
+        ``tensor_like[key]``."""
+        tensor_like_type = self.element_types["tensor_like"]
+        assert isinstance(tensor_like_type, TensorType)
+        dev_ptrs_type = self.element_types["dev_ptrs"]
+        assert isinstance(dev_ptrs_type, TensorType)
+
+        indexed_size = tensor_like_type._device_indexing_size(key)
+        multicast_size = list(dev_ptrs_type.fake_value.size())
+        return multicast_size + indexed_size
+
+    def propagate_setitem(
+        self, key: TypeInfo, value: TypeInfo, origin: Origin
+    ) -> TypeInfo:
+        """Type-check ``self[key] = value`` and return ``self``.
+
+        On the host only a warning is emitted. On device, a tensor value must
+        match the rank of the indexed shape, and scalars are accepted.
+
+        Raises:
+            exc.RankMismatch: tensor value whose rank differs from the target.
+            exc.RequiresTensorInAssignment: value is neither tensor nor scalar.
+        """
+        if origin.is_host():
+            warning(exc.TensorOperationInWrapper)
+            return self
+
+        lhs_shape = self._device_indexing_size(key)
+        if isinstance(value, TensorType):
+            lhs_rank = len(lhs_shape)
+            rhs_rank = value.fake_value.ndim
+            if lhs_rank != rhs_rank:
+                raise exc.RankMismatch(
+                    lhs_rank,
+                    rhs_rank,
+                    f"LHS shape: {tuple(lhs_shape)}, RHS shape: {tuple(value.fake_value.shape)}",
+                )
+        elif not isinstance(value, (NumericType, LiteralType)):
+            # Scalars (NumericType / LiteralType) are allowed and broadcast to
+            # the tensor shape; anything else is rejected.
+            raise exc.RequiresTensorInAssignment(value)
+        return self
+
+    def propagate_getitem(self, key: TypeInfo, origin: Origin) -> TypeInfo:
+        """Return the ``TensorType`` produced by ``self[key]``.
+
+        A warning is emitted for host origins; the result type is computed
+        either way.
+        """
+        if origin.is_host():
+            warning(exc.TensorOperationInWrapper)
+
+        tensor_like_type = self.element_types["tensor_like"]
+        assert isinstance(tensor_like_type, TensorType)
+        return TensorType(
+            origin,
+            tensor_like_type.proxy().new_empty(self._device_indexing_size(key)),
+        )
+
+
 class SliceType(CollectionType):
     element_types: slice  # pyright: ignore[reportIncompatibleVariableOverride]
 
@@ -1614,7 +1695,7 @@ def _assign(self, lhs: ast.AST, rhs: TypeInfo) -> None:
         if isinstance(lhs, ast.Subscript):
             # TODO(jansel): test different types of subscript
             lhs_base_type = self.visit(lhs.value)
-            if isinstance(lhs_base_type, TensorType):
+            if isinstance(lhs_base_type, (TensorType, MulticastTensorType)):
                 self.visit(lhs)  # need to populate shape info
             lhs_base_type = lhs_base_type.propagate_setitem(
                 self.visit(lhs.slice), rhs, self.origin()