pytorch
diff --git a/‎helion/_compat.py‎
Lines changed: 9 additions & 0 deletions b/‎helion/_compat.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎helion/_compiler/compile_environment.py‎
Lines changed: 2 additions & 0 deletions b/‎helion/_compiler/compile_environment.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎helion/_compiler/device_function.py‎
Lines changed: 6 additions & 0 deletions b/‎helion/_compiler/device_function.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎helion/autotuner/config_spec.py‎
Lines changed: 11 additions & 0 deletions b/‎helion/autotuner/config_spec.py‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎helion/runtime/__init__.py‎
Lines changed: 10 additions & 3 deletions b/‎helion/runtime/__init__.py‎
Lines changed: 10 additions & 3 deletions
diff --git a/‎helion/runtime/config.py‎
Lines changed: 6 additions & 0 deletions b/‎helion/runtime/config.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎helion/runtime/ptxas_configs/__init__.py‎
Lines changed: 56 additions & 0 deletions b/‎helion/runtime/ptxas_configs/__init__.py‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎helion/runtime/ptxas_configs/fp8fatt_0.bin‎
9.75 KB b/‎helion/runtime/ptxas_configs/fp8fatt_0.bin‎
9.75 KB
diff --git a/‎helion/runtime/ptxas_configs/fp8fatt_1.bin‎
2.44 KB b/‎helion/runtime/ptxas_configs/fp8fatt_1.bin‎
2.44 KB
diff --git a/‎helion/runtime/ptxas_configs/fp8fatt_2.bin‎
2.38 KB b/‎helion/runtime/ptxas_configs/fp8fatt_2.bin‎
2.38 KB
@@ -105,3 +105,12 @@ def warps_to_threads(num_warps: int) -> int:
         )
         return num_warps * (props.warp_size or 32)
     return num_warps * 32
+
+
+def supports_ptxas(device: torch.device) -> bool:
+    """Return True if PTXAS options are available for the given device.
+
+    The answer is False unless ``device.type`` is ``"cuda"`` and
+    ``torch.version.hip`` is None; when both hold, the result is whatever
+    ``supports_tensor_descriptor()`` reports.
+    """
+    cuda_without_hip = device.type == "cuda" and torch.version.hip is None
+    # Short-circuit so the tensor-descriptor probe only runs on eligible devices.
+    return cuda_without_hip and supports_tensor_descriptor()
@@ -20,6 +20,7 @@
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
 
 from .. import exc
+from .._compat import supports_ptxas
 from ..language.constexpr import ConstExpr
 from .loop_dependency_checker import LoopDependencyChecker
 from .source_location import SourceLocation
@@ -90,6 +91,7 @@ def __init__(self, device: torch.device, settings: Settings) -> None:
         self.block_sizes: list[BlockSizeInfo] = []
         self.debug_shape_renames: dict[sympy.Expr, sympy.Expr] = {}
         self.config_spec = ConfigSpec()
+        self.config_spec.ptxas_supported = supports_ptxas(device)
         self.kernel_tensor_sizes: dict[tuple[sympy.Expr, ...], int] = (
             collections.Counter()
         )
 
@@ -574,6 +574,12 @@ def codegen_function_call(self) -> ast.AST:
                 f"num_stages={self.config.num_stages}",
             ]
         )
+        ptxas_config = self.config.ptxas_config
+        if ptxas_config:
+            from ..runtime.ptxas_configs import get_ptxas_option
+
+            ptx_option = get_ptxas_option(ptxas_config)
+            args.append(f"ptx_options={ptx_option!r}")
         pid = self.pid
         assert pid is not None
         # TODO(jansel): we should run CSE this statement
 
@@ -52,6 +52,7 @@
         "pid_type",
         "indexing",
         "load_eviction_policies",
+        "ptxas_config",
     ]
 )
 VALID_PID_TYPES = ("flat", "xyz", "persistent_blocked", "persistent_interleaved")
@@ -105,6 +106,7 @@ class ConfigSpec:
             EnumFragment(choices=VALID_EVICTION_POLICIES), length=0
         )
     )
+    ptxas_supported: bool = False
 
     @staticmethod
     def _valid_indexing_types() -> tuple[IndexingLiteral, ...]:
@@ -238,6 +240,11 @@ def normalize(self, config: helion.Config | dict[str, object]) -> None:
             else:
                 config[name] = values[0]
 
+        if self.ptxas_supported:
+            value = config.get("ptxas_config") or 0
+            if not isinstance(value, int):
+                raise InvalidConfig(f"ptxas_config must be integer, got {value!r}")
+
         # Set default values for grid indices when pid_type is not persistent
         pid_type = config["pid_type"]
         if pid_type in ("flat", "xyz") and self.grid_block_ids:
@@ -280,6 +287,10 @@ def flat_config(self, fn: Callable[[ConfigSpecFragment], object]) -> helion.Conf
             "pid_type": fn(EnumFragment(self.allowed_pid_types)),
             "load_eviction_policies": fn(self.load_eviction_policies),
         }
+        if self.ptxas_supported:
+            from ..runtime.ptxas_configs import search_ptxas_configs
+
+            config["ptxas_config"] = fn(EnumFragment((0, *search_ptxas_configs())))
         # Add tunable parameters
         config.update(
             {key: fn(fragment) for key, fragment in self.user_defined_tunables.items()}
 
@@ -61,8 +61,15 @@ def default_launcher(
     *args: object,
     num_warps: int,
     num_stages: int,
+    ptx_options: str | None = None,
 ) -> object:
     """Default launcher function that executes the kernel immediately."""
-    return triton_kernel.run(
-        *args, grid=grid, warmup=False, num_warps=num_warps, num_stages=num_stages
-    )
+    run_kwargs = {
+        "grid": grid,
+        "warmup": False,
+        "num_warps": num_warps,
+        "num_stages": num_stages,
+    }
+    if ptx_options:
+        run_kwargs["ptx_options"] = ptx_options
+    return triton_kernel.run(*args, **run_kwargs)
@@ -39,6 +39,7 @@ def __init__(
         num_stages: int | None = None,
         pid_type: PidTypeLiteral | None = None,
         indexing: IndexingLiteral | None = None,
+        ptxas_config: int | None = None,
         # For user-defined properties
         **kwargs: object,
     ) -> None:
@@ -81,6 +82,7 @@ def __init__(
             "num_stages": num_stages,
             "indexing": indexing,
             "pid_type": pid_type,
+            "ptxas_config": ptxas_config,
         }
         for key, value in core_props.items():
             if value is not None:
@@ -178,6 +180,10 @@ def pid_type(self) -> PidTypeLiteral:
     def range_unroll_factors(self) -> list[int]:
         return cast("list[int]", self.config.get("range_unroll_factors", []))
 
+    @property
+    def ptxas_config(self) -> int:
+        """PTXAS config ID read from ``self.config``.
+
+        Falls back to 0 when the ``"ptxas_config"`` key is absent.
+        """
+        stored = self.config.get("ptxas_config", 0)
+        return cast("int", stored)
+
     @property
     def range_warp_specializes(self) -> list[bool | None]:
         return cast("list[bool | None]", self.config.get("range_warp_specializes", []))
 
@@ -0,0 +1,56 @@
+"""Utilities for working with packaged PTXAS control files."""
+
+from __future__ import annotations
+
+from functools import cache
+from pathlib import Path
+
+# Maps a PTXAS config ID to the filename of its control file; filenames are
+# joined onto ``_config_root()`` when a path is requested.
+# ID 0 has no entry: ``get_ptxas_option`` returns None for 0 before this table
+# is consulted.
+# NOTE(review): entry 1 is commented out (see its trailing note); the other
+# IDs were presumably left unrenumbered so saved configs stay valid - confirm.
+# NOTE(review): confirm every filename listed here ships alongside this module.
+_ADVANCED_COMPILER_CONFIGURATIONS: dict[int, str] = {
+    # 1: "fp8fatt_0.bin",  # caused timeouts
+    2: "fp8fatt_1.bin",
+    3: "fp8fatt_2.bin",
+    4: "fp8fatt_3.bin",
+    5: "matmul_0.bin",
+    6: "matmul_1.bin",
+    7: "matmul_2.bin",
+    8: "matmul_3.bin",
+    9: "matmul_4.bin",
+    10: "matmul_5.bin",
+}
+
+
+def _config_root() -> Path:
+    """Return the resolved directory that contains this module file."""
+    module_file = Path(__file__).resolve()
+    return module_file.parent
+
+
+@cache
+def search_ptxas_configs() -> tuple[int, ...]:
+    """Return the sorted tuple of available PTXAS config IDs.
+
+    An ID counts as available only when its control file exists next to this
+    module. IDs whose file is missing are left out, so callers that enumerate
+    this tuple never select an ID that
+    ``_advanced_compiler_configuration_path`` would reject with
+    ``FileNotFoundError``. The tuple is computed once and then served from the
+    ``functools.cache`` wrapper.
+    """
+
+    root = _config_root()
+    return tuple(
+        config_id
+        for config_id, filename in sorted(_ADVANCED_COMPILER_CONFIGURATIONS.items())
+        if (root / filename).is_file()
+    )
+
+
+def _advanced_compiler_configuration_path(config_id: int) -> str:
+    """Resolve ``config_id`` to the absolute path of its control file.
+
+    Raises:
+        ValueError: ``config_id`` is not a key of the configuration table.
+        FileNotFoundError: the mapped file does not exist on disk.
+    """
+
+    try:
+        filename = _ADVANCED_COMPILER_CONFIGURATIONS[config_id]
+    except KeyError as missing:  # pragma: no cover - defensive
+        message = f"Unknown advanced compiler configuration id: {config_id}"
+        raise ValueError(message) from missing
+    control_file = (_config_root() / filename).resolve()
+    if control_file.is_file():
+        return str(control_file)
+    raise FileNotFoundError(
+        f"Missing advanced compiler configuration file: {control_file}"
+    )
+
+
+@cache
+def get_ptxas_option(config_value: int) -> str | None:
+    """Translate a config enum value into a PTXAS option string.
+
+    ``0`` yields None. Any other value is resolved through
+    ``_advanced_compiler_configuration_path`` (whose ``ValueError`` /
+    ``FileNotFoundError`` propagate) and returned as
+    ``--apply-controls <path>``.
+    """
+
+    if config_value != 0:
+        control_file = _advanced_compiler_configuration_path(config_value)
+        # NOTE(review): the path is interpolated unquoted - confirm the consumer
+        # of this string copes with install locations that contain spaces.
+        return f"--apply-controls {control_file}"
+    return None
Original file line number	Diff line number	Diff line change
`@@ -574,6 +574,12 @@ def codegen_function_call(self) -> ast.AST:`
`574`	`574`	`f"num_stages={self.config.num_stages}",`
`575`	`575`	`]`
`576`	`576`	`)`
	`577`	`+ ptxas_config = self.config.ptxas_config`
	`578`	`+ if ptxas_config:`
	`579`	`+ from ..runtime.ptxas_configs import get_ptxas_option`
	`580`	`+`
	`581`	`+ ptx_option = get_ptxas_option(ptxas_config)`
	`582`	`+ args.append(f"ptx_options={ptx_option!r}")`
`577`	`583`	`pid = self.pid`
`578`	`584`	`assert pid is not None`
`579`	`585`	`# TODO(jansel): we should run CSE this statement`