pytorch
diff --git a/‎docs/api/settings.md‎
Lines changed: 5 additions & 0 deletions b/‎docs/api/settings.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎helion/_compat.py‎
Lines changed: 9 additions & 0 deletions b/‎helion/_compat.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎helion/_compiler/compile_environment.py‎
Lines changed: 2 additions & 0 deletions b/‎helion/_compiler/compile_environment.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎helion/_compiler/device_function.py‎
Lines changed: 6 additions & 0 deletions b/‎helion/_compiler/device_function.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎helion/autotuner/base_search.py‎
Lines changed: 1 addition & 0 deletions b/‎helion/autotuner/base_search.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎helion/autotuner/config_generation.py‎
Lines changed: 12 additions & 2 deletions b/‎helion/autotuner/config_generation.py‎
Lines changed: 12 additions & 2 deletions
diff --git a/‎helion/autotuner/config_spec.py‎
Lines changed: 31 additions & 1 deletion b/‎helion/autotuner/config_spec.py‎
Lines changed: 31 additions & 1 deletion
diff --git a/‎helion/autotuner/random_search.py‎
Lines changed: 1 addition & 0 deletions b/‎helion/autotuner/random_search.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎helion/runtime/__init__.py‎
Lines changed: 10 additions & 3 deletions b/‎helion/runtime/__init__.py‎
Lines changed: 10 additions & 3 deletions
diff --git a/‎helion/runtime/config.py‎
Lines changed: 7 additions & 0 deletions b/‎helion/runtime/config.py‎
Lines changed: 7 additions & 0 deletions
@@ -154,6 +154,10 @@ with helion.set_default_settings(
 
    Validate each candidate configuration against a baseline output before accepting it. Default is ``True``. Controlled by ``HELION_AUTOTUNE_ACCURACY_CHECK``.
 
+.. autoattribute:: Settings.autotune_search_acc
+
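+   Enable searching packaged PTXAS advanced compiler configurations ("ACC") during autotuning. Default is ``True``. Controlled by ``HELION_AUTOTUNE_SEARCH_ACC``. The search is only performed when PTXAS options are supported for the target device; otherwise this setting has no effect.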
+
 .. autoattribute:: Settings.autotune_rebenchmark_threshold
 
    Controls how aggressively Helion re-runs promising configs to avoid outliers. Default is ``1.5`` (re-benchmark anything within 1.5x of the best).
@@ -233,6 +237,7 @@ Built-in values for ``HELION_AUTOTUNER`` include ``"PatternSearch"``, ``"Differe
 | ``HELION_AUTOTUNE_RANDOM_SEED`` | ``autotune_random_seed`` | Seed used for randomized autotuning searches. |
 | ``HELION_AUTOTUNE_MAX_GENERATIONS`` | ``autotune_max_generations`` | Upper bound on generations for Pattern Search and Differential Evolution. |
 | ``HELION_AUTOTUNE_ACCURACY_CHECK`` | ``autotune_accuracy_check`` | Toggle baseline validation for candidate configs. |
+| ``HELION_AUTOTUNE_SEARCH_ACC`` | ``autotune_search_acc`` | Enable or disable searching packaged PTXAS advanced compiler configurations (ACC) during autotuning. |
 | ``HELION_REBENCHMARK_THRESHOLD`` | ``autotune_rebenchmark_threshold`` | Re-run configs whose performance is within a multiplier of the current best. |
 | ``HELION_AUTOTUNE_PROGRESS_BAR`` | ``autotune_progress_bar`` | Enable or disable the progress bar UI during autotuning. |
 | ``HELION_PRINT_OUTPUT_CODE`` | ``print_output_code`` | Print generated Triton code to stderr for inspection. |
 
@@ -243,3 +243,12 @@ def warps_to_threads(num_warps: int) -> int:
         )
         return num_warps * (props.warp_size or 32)
     return num_warps * 32
+
+
+def supports_ptxas(device: torch.device) -> bool:
+    """Return True if PTXAS options are available for the given device.
+
+    Returns False for any non-``cuda`` device and for builds where
+    ``torch.version.hip`` is set. Otherwise the result is whatever
+    ``supports_tensor_descriptor()`` reports.
+    """
+    if device.type != "cuda":
+        return False
+    # A non-None torch.version.hip indicates a HIP (ROCm) build of PyTorch.
+    if torch.version.hip is not None:
+        return False
+    # NOTE(review): PTXAS availability is gated on tensor-descriptor support;
+    # the rationale is not visible here -- confirm this is the intended check.
+    return supports_tensor_descriptor()
@@ -20,6 +20,7 @@
 from torch.fx.experimental.symbolic_shapes import ShapeEnv
 
 from .. import exc
+from .._compat import supports_ptxas
 from ..language.constexpr import ConstExpr
 from .loop_dependency_checker import LoopDependencyChecker
 from .source_location import SourceLocation
@@ -90,6 +91,7 @@ def __init__(self, device: torch.device, settings: Settings) -> None:
         self.block_sizes: list[BlockSizeInfo] = []
         self.debug_shape_renames: dict[sympy.Expr, sympy.Expr] = {}
         self.config_spec = ConfigSpec()
+        self.config_spec.ptxas_supported = supports_ptxas(device)
         self.kernel_tensor_sizes: dict[tuple[sympy.Expr, ...], int] = (
             collections.Counter()
         )
 
@@ -574,6 +574,12 @@ def codegen_function_call(self) -> ast.AST:
                 f"num_stages={self.config.num_stages}",
             ]
         )
+        advanced_compiler_configuration = self.config.advanced_compiler_configuration
+        if advanced_compiler_configuration:
+            from ..runtime.ptxas_configs import get_ptxas_option
+
+            ptx_option = get_ptxas_option(advanced_compiler_configuration)
+            args.append(f"ptx_options={ptx_option!r}")
         pid = self.pid
         assert pid is not None
         # TODO(jansel): we should run CSE this statement
 
@@ -481,6 +481,7 @@ def __init__(
         self.config_gen: ConfigGeneration = ConfigGeneration(
             self.config_spec,
             overrides=overrides,
+            include_advanced_compiler_configuration=self.settings.autotune_search_acc,
         )
 
     @property
 
@@ -31,6 +31,7 @@ def __init__(
         config_spec: ConfigSpec,
         *,
         overrides: Mapping[str, object] | None = None,
+        include_advanced_compiler_configuration: bool = True,
     ) -> None:
         def _collect_spec(spec: ConfigSpecFragment) -> object:
             """
@@ -47,8 +48,14 @@ def _collect_spec(spec: ConfigSpecFragment) -> object:
 
         super().__init__()
         self.config_spec = config_spec
+        self._include_advanced_compiler_configuration = (
+            include_advanced_compiler_configuration
+        )
         self.flat_spec: list[ConfigSpecFragment] = []
-        config_spec.flat_config(_collect_spec)
+        config_spec.flat_config(
+            _collect_spec,
+            include_advanced_compiler_configuration=include_advanced_compiler_configuration,
+        )
         assert self.flat_spec, "No config values to tune"
         self._override_values = dict(overrides or {})
         self.block_size_indices: list[int] = [
@@ -93,7 +100,10 @@ def get_next_value(spec: ConfigSpecFragment) -> object:
 
         assert len(flat_values) == len(self.flat_spec)
         count: itertools.count[int] = itertools.count()
-        config = self.config_spec.flat_config(get_next_value)
+        config = self.config_spec.flat_config(
+            get_next_value,
+            include_advanced_compiler_configuration=self._include_advanced_compiler_configuration,
+        )
         assert next(count) == len(flat_values)
         return self._apply_overrides(config)
 
 
@@ -52,6 +52,7 @@
         "pid_type",
         "indexing",
         "load_eviction_policies",
+        "advanced_compiler_configuration",
     ]
 )
 VALID_PID_TYPES = ("flat", "xyz", "persistent_blocked", "persistent_interleaved")
@@ -105,6 +106,7 @@ class ConfigSpec:
             EnumFragment(choices=VALID_EVICTION_POLICIES), length=0
         )
     )
+    ptxas_supported: bool = False
 
     @staticmethod
     def _valid_indexing_types() -> tuple[IndexingLiteral, ...]:
@@ -238,6 +240,18 @@ def normalize(self, config: helion.Config | dict[str, object]) -> None:
             else:
                 config[name] = values[0]
 
+        if "advanced_compiler_configuration" in config:
+            value = config.get("advanced_compiler_configuration") or 0
+            if not isinstance(value, int):
+                raise InvalidConfig(
+                    f"advanced_compiler_configuration must be integer, got {value!r}"
+                )
+            if value and not self.ptxas_supported:
+                raise InvalidConfig(
+                    "advanced_compiler_configuration requires PTXAS support"
+                )
+            config["advanced_compiler_configuration"] = value
+
         # Set default values for grid indices when pid_type is not persistent
         pid_type = config["pid_type"]
         if pid_type in ("flat", "xyz") and self.grid_block_ids:
@@ -260,8 +274,18 @@ def normalize(self, config: helion.Config | dict[str, object]) -> None:
     def default_config(self) -> helion.Config:
         return self.flat_config(lambda x: x.default())
 
-    def flat_config(self, fn: Callable[[ConfigSpecFragment], object]) -> helion.Config:
+    def flat_config(
+        self,
+        fn: Callable[[ConfigSpecFragment], object],
+        *,
+        include_advanced_compiler_configuration: bool | None = None,
+    ) -> helion.Config:
         """Map a flattened version of the config using the given function."""
+        include_advanced = self.ptxas_supported
+        if include_advanced_compiler_configuration is not None:
+            include_advanced = (
+                include_advanced and include_advanced_compiler_configuration
+            )
         config = {
             "block_sizes": self.block_sizes._flat_config(self, fn),
             "loop_orders": self.loop_orders._flat_config(self, fn),
@@ -280,6 +304,12 @@ def flat_config(self, fn: Callable[[ConfigSpecFragment], object]) -> helion.Conf
             "pid_type": fn(EnumFragment(self.allowed_pid_types)),
             "load_eviction_policies": fn(self.load_eviction_policies),
         }
+        if include_advanced:
+            from ..runtime.ptxas_configs import search_ptxas_configs
+
+            config["advanced_compiler_configuration"] = fn(
+                EnumFragment((0, *search_ptxas_configs()))
+            )
         # Add tunable parameters
         config.update(
             {key: fn(fragment) for key, fragment in self.user_defined_tunables.items()}
 
@@ -39,5 +39,6 @@ def __init__(
             configs=ConfigGeneration(
                 kernel.config_spec,
                 overrides=kernel.settings.autotune_config_overrides or None,
+                include_advanced_compiler_configuration=kernel.settings.autotune_search_acc,
             ).random_population(count),
         )
@@ -62,8 +62,15 @@ def default_launcher(
     *args: object,
     num_warps: int,
     num_stages: int,
+    ptx_options: str | None = None,
 ) -> object:
     """Default launcher function that executes the kernel immediately."""
-    return triton_kernel.run(
-        *args, grid=grid, warmup=False, num_warps=num_warps, num_stages=num_stages
-    )
+    run_kwargs = {
+        "grid": grid,
+        "warmup": False,
+        "num_warps": num_warps,
+        "num_stages": num_stages,
+    }
+    if ptx_options:
+        run_kwargs["ptx_options"] = ptx_options
+    return triton_kernel.run(*args, **run_kwargs)
@@ -39,6 +39,7 @@ def __init__(
         num_stages: int | None = None,
         pid_type: PidTypeLiteral | None = None,
         indexing: IndexingLiteral | None = None,
+        advanced_compiler_configuration: int | None = None,
         # For user-defined properties
         **kwargs: object,
     ) -> None:
@@ -61,6 +62,7 @@ def __init__(
             num_stages: Number of stages for software pipelining.
             pid_type: Program ID type strategy ("flat", "xyz", "persistent_blocked", "persistent_interleaved").
             indexing: Indexing strategy ("pointer", "tensor_descriptor", "block_ptr").
+            advanced_compiler_configuration: Identifier for packaged PTXAS control files applied during compilation.
             **kwargs: Additional user-defined configuration parameters.
         """
         self.config = {}
@@ -81,6 +83,7 @@ def __init__(
             "num_stages": num_stages,
             "indexing": indexing,
             "pid_type": pid_type,
+            "advanced_compiler_configuration": advanced_compiler_configuration,
         }
         for key, value in core_props.items():
             if value is not None:
@@ -178,6 +181,10 @@ def pid_type(self) -> PidTypeLiteral:
     def range_unroll_factors(self) -> list[int]:
         return cast("list[int]", self.config.get("range_unroll_factors", []))
 
+    @property
+    def advanced_compiler_configuration(self) -> int:
+        """Return the stored ``advanced_compiler_configuration`` value.
+
+        Falls back to 0 when the key is absent from ``self.config``.
+        """
+        return cast("int", self.config.get("advanced_compiler_configuration", 0))
+
     @property
     def range_warp_specializes(self) -> list[bool | None]:
         return cast("list[bool | None]", self.config.get("range_warp_specializes", []))
Original file line number	Diff line number	Diff line change
`@@ -574,6 +574,12 @@ def codegen_function_call(self) -> ast.AST:`
`574`	`574`	`f"num_stages={self.config.num_stages}",`
`575`	`575`	`]`
`576`	`576`	`)`
	`577`	`+ advanced_compiler_configuration = self.config.advanced_compiler_configuration`
	`578`	`+ if advanced_compiler_configuration:`
	`579`	`+ from ..runtime.ptxas_configs import get_ptxas_option`
	`580`	`+`
	`581`	`+ ptx_option = get_ptxas_option(advanced_compiler_configuration)`
	`582`	`+ args.append(f"ptx_options={ptx_option!r}")`
`577`	`583`	`pid = self.pid`
`578`	`584`	`assert pid is not None`
`579`	`585`	`# TODO(jansel): we should run CSE this statement`
Original file line number	Diff line number	Diff line change
`@@ -481,6 +481,7 @@ def __init__(`
`481`	`481`	`self.config_gen: ConfigGeneration = ConfigGeneration(`
`482`	`482`	`self.config_spec,`
`483`	`483`	`overrides=overrides,`
	`484`	`+ include_advanced_compiler_configuration=self.settings.autotune_search_acc,`
`484`	`485`	`)`
`485`	`486`
`486`	`487`	`@property`
Original file line number	Diff line number	Diff line change
`@@ -39,5 +39,6 @@ def __init__(`
`39`	`39`	`configs=ConfigGeneration(`
`40`	`40`	`kernel.config_spec,`
`41`	`41`	`overrides=kernel.settings.autotune_config_overrides or None,`
	`42`	`+ include_advanced_compiler_configuration=kernel.settings.autotune_search_acc,`
`42`	`43`	`).random_population(count),`
`43`	`44`	`)`