This file is automatically generated by assertExpectedJournal calls in test_signal_wait.py.
Update expected outputs by running tests with the EXPECTTEST_ACCEPT=1 environment variable set.

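For example, a minimal regeneration sketch (not part of the generated journal; it assumes pytest is the test runner and that the command is run from the directory containing test_signal_wait.py):

import os
import subprocess

# Re-run the signal/wait tests with EXPECTTEST_ACCEPT=1 set so that the
# assertExpectedJournal calls rewrite this file with freshly generated output.
subprocess.run(
    ["python", "-m", "pytest", "test_signal_wait.py"],
    env={**os.environ, "EXPECTTEST_ACCEPT": "1"},
    check=True,
)
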
+--- assertExpectedJournal(TestWait.test_global_sync)
+from __future__ import annotations
+
+import torch
+import helion
+import triton
+import triton.language as tl
+
+@triton.jit
+def _gmem_multi_bar_sync_kernel_kernel(signal_pad, signal_pad_stride_0, signal_pad_stride_1, N, _BLOCK_SIZE_1: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0
+    for offset_1 in tl.range(0, N.to(tl.int32), step=_BLOCK_SIZE_1):
+        indices_1 = offset_1 + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32)
+        helion.runtime.triton_send_signal(addr=signal_pad + (indices_1 * signal_pad_stride_0 + offset_0 * signal_pad_stride_1), update=1, sem='release', scope='gpu', op='atomic_xchg', skip_sync=True)
+        helion.runtime.triton_wait_multiple_signal(addr=signal_pad + (offset_0 * signal_pad_stride_0 + indices_1 * signal_pad_stride_1), expect=1, update=0, sem='acquire', scope='gpu', op='ld', skip_sync=False)
+
+def gmem_multi_bar_sync_kernel(signal_pad: torch.Tensor):
+    M, N = signal_pad.shape
+    assert M == N
+    _BLOCK_SIZE_1 = N
+    _gmem_multi_bar_sync_kernel_kernel[N,](signal_pad, signal_pad.stride(0), signal_pad.stride(1), N, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    return signal_pad
+
+def _gmem_multi_bar_sync_kernel_make_precompiler(signal_pad: torch.Tensor):
+    M, N = signal_pad.shape
+    assert M == N
+    _BLOCK_SIZE_1 = N
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_gmem_multi_bar_sync_kernel_kernel)(signal_pad, signal_pad.stride(0), signal_pad.stride(1), N, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+
--- assertExpectedJournal(TestWait.test_signal_basic)
from __future__ import annotations

@@ -76,6 +107,33 @@ def _gmem_signal_tensor_bar_kernel_make_precompiler(signal_pad: torch.Tensor):
    from helion.runtime.precompile_shim import make_precompiler
    return make_precompiler(_gmem_signal_tensor_bar_kernel_kernel)(signal_pad, signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)

+--- assertExpectedJournal(TestWait.test_signal_multiple_cas)
+from __future__ import annotations
+
+import torch
+import helion
+import triton
+import triton.language as tl
+
+@triton.jit
+def _gmem_signal_tensor_bar_kernel_kernel(signal_pad, signal_pad_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    helion.runtime.triton_wait_multiple_signal(addr=signal_pad + indices_0 * signal_pad_stride_0, expect=0, update=1, sem='release', scope='gpu', op='atomic_cas', skip_sync=True, sync_before=not False)
+
+def gmem_signal_tensor_bar_kernel(signal_pad: torch.Tensor):
+    n, = signal_pad.shape
+    _BLOCK_SIZE_0 = 4
+    _gmem_signal_tensor_bar_kernel_kernel[triton.cdiv(n, _BLOCK_SIZE_0),](signal_pad, signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return signal_pad
+
+def _gmem_signal_tensor_bar_kernel_make_precompiler(signal_pad: torch.Tensor):
+    n, = signal_pad.shape
+    _BLOCK_SIZE_0 = 4
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_gmem_signal_tensor_bar_kernel_kernel)(signal_pad, signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+
--- assertExpectedJournal(TestWait.test_wait_2d_tile)
from __future__ import annotations

@@ -144,3 +202,67 @@ def _gmem_wait_kernel_make_precompiler(signal_pad: torch.Tensor):
    from helion.runtime.precompile_shim import make_precompiler
    return make_precompiler(_gmem_wait_kernel_kernel)(signal_pad, out, out.stride(0), signal_pad.stride(0), num_warps=4, num_stages=3)

+--- assertExpectedJournal(TestWait.test_wait_multi_bar)
+from __future__ import annotations
+
+import torch
+import helion
+import triton
+import triton.language as tl
+
+import __main__ as _source_module
+
+@triton.jit
+def _gmem_wait_multi_bar_kernel_kernel(signal_pad, out, out_stride_0, signal_pad_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    helion.runtime.triton_wait_multiple_signal(addr=signal_pad + indices_0 * signal_pad_stride_0, expect=1, update=0, sem='acquire', scope='gpu', op='ld', skip_sync=False)
+    tile_id = offset_0 // _BLOCK_SIZE_0
+    tl.store(out + tile_id * out_stride_0, tile_id, None)
+
+def gmem_wait_multi_bar_kernel(signal_pad: torch.Tensor):
+    N, = signal_pad.shape
+    n = 4
+    out = torch.empty(n, dtype=torch.int32, device=_source_module.DEVICE)
+    _BLOCK_SIZE_0 = 4
+    _gmem_wait_multi_bar_kernel_kernel[triton.cdiv(N, _BLOCK_SIZE_0),](signal_pad, out, out.stride(0), signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+def _gmem_wait_multi_bar_kernel_make_precompiler(signal_pad: torch.Tensor):
+    N, = signal_pad.shape
+    n = 4
+    out = torch.empty(n, dtype=torch.int32, device=_source_module.DEVICE)
+    _BLOCK_SIZE_0 = 4
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_gmem_wait_multi_bar_kernel_kernel)(signal_pad, out, out.stride(0), signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+
+--- assertExpectedJournal(TestWait.test_wait_multi_bar_cas)
+from __future__ import annotations
+
+import torch
+import helion
+import triton
+import triton.language as tl
+
+@triton.jit
+def _gmem_wait_multi_bar_kernel_cas_kernel(signal_pad, signal_pad_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    helion.runtime.triton_wait_multiple_signal(addr=signal_pad + indices_0 * signal_pad_stride_0, expect=1, update=2, sem='acquire', scope='gpu', op='atomic_cas', skip_sync=False)
+
+def gmem_wait_multi_bar_kernel_cas(signal_pad: torch.Tensor):
+    N, = signal_pad.shape
+    n = 4
+    _BLOCK_SIZE_0 = 4
+    _gmem_wait_multi_bar_kernel_cas_kernel[triton.cdiv(N, _BLOCK_SIZE_0),](signal_pad, signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return signal_pad
+
+def _gmem_wait_multi_bar_kernel_cas_make_precompiler(signal_pad: torch.Tensor):
+    N, = signal_pad.shape
+    n = 4
+    _BLOCK_SIZE_0 = 4
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_gmem_wait_multi_bar_kernel_cas_kernel)(signal_pad, signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+