From cbdb75736910cb613594288f84422a2c6de42a4c Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Mon, 2 Jun 2025 16:12:57 -0500 Subject: [PATCH 01/20] add localized allocation and deallocation --- loopy/target/pyopencl.py | 137 ++++++++++++++++++++++++++------------- 1 file changed, 91 insertions(+), 46 deletions(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index ae923f1fe..ff1bc83b6 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -26,7 +26,7 @@ """ import logging -from typing import TYPE_CHECKING, Any, Sequence, cast +from typing import TYPE_CHECKING, Any, Sequence, cast, Tuple, Mapping from warnings import warn import numpy as np @@ -790,15 +790,6 @@ def get_function_definition( Line(), function_body, Line(), - ] + ([ - For("_tv", "_global_temporaries", - # Free global temporaries. - # Zero-size temporaries allocate as None, tolerate that. - # https://documen.tician.de/pyopencl/tools.html#pyopencl.tools.ImmediateAllocator - S("if _tv is not None: _tv.release()")) - ] if self._get_global_temporaries(codegen_state) else [] - ) + [ - Line(), Return("_lpy_evt"), ])) @@ -818,48 +809,81 @@ def _get_global_temporaries(self, codegen_state): key=lambda tv: tv.name) def get_temporary_decls(self, codegen_state, schedule_index): - from genpy import Assign, Comment, Line - from pymbolic.mapper.stringifier import PREC_NONE - ecm = self.get_expression_to_code_mapper(codegen_state) - - global_temporaries = self._get_global_temporaries(codegen_state) - if not global_temporaries: - return [] - - allocated_var_names = [] - code_lines = [] - code_lines.append(Line()) - code_lines.append(Comment("{{{ allocate global temporaries")) - code_lines.append(Line()) - - for tv in global_temporaries: - if not tv.base_storage: - if tv.nbytes: - # NB: This does not prevent all zero-size allocations, - # as sizes are parametric, and allocation size - # could turn out to be zero at runtime. - nbytes_str = ecm(tv.nbytes, PREC_NONE, "i") - allocated_var_names.append(tv.name) - code_lines.append(Assign(tv.name, - f"allocator({nbytes_str})")) - else: - code_lines.append(Assign(tv.name, "None")) - - code_lines.append(Assign("_global_temporaries", "[{tvs}]".format( - tvs=", ".join(tv for tv in allocated_var_names)))) - - code_lines.append(Line()) - code_lines.append(Comment("}}}")) - code_lines.append(Line()) - - return code_lines + return [] + # from genpy import Assign, Comment, Line + # from pymbolic.mapper.stringifier import PREC_NONE + # ecm = self.get_expression_to_code_mapper(codegen_state) + + # global_temporaries = self._get_global_temporaries(codegen_state) + # if not global_temporaries: + # return [] + + # allocated_var_names = [] + # code_lines = [] + # code_lines.append(Line()) + # code_lines.append(Comment("{{{ allocate global temporaries")) + # code_lines.append(Line()) + + # for tv in global_temporaries: + # if not tv.base_storage: + # if tv.nbytes: + # # NB: This does not prevent all zero-size allocations, + # # as sizes are parametric, and allocation size + # # could turn out to be zero at runtime. + # nbytes_str = ecm(tv.nbytes, PREC_NONE, "i") + # allocated_var_names.append(tv.name) + # code_lines.append(Assign(tv.name, + # f"allocator({nbytes_str})")) + # else: + # code_lines.append(Assign(tv.name, "None")) + + # code_lines.append(Assign("_global_temporaries", "[{tvs}]".format( + # tvs=", ".join(tv for tv in allocated_var_names)))) + + # code_lines.append(Line()) + # code_lines.append(Comment("}}}")) + # code_lines.append(Line()) + + # return code_lines + + def get_temporary_decl_locations(self, codegen_state: CodeGenerationState) -> Tuple[Mapping[str, set[str]], Mapping[str, set[str]]]: + from loopy.schedule.tools import ( + temporaries_read_in_subkernel, + temporaries_written_in_subkernel, + ) + # Find sub-kernels + kernel = codegen_state.kernel + sched_index = 0 + subkernel_names = [] + for sched_index in range(0, codegen_state.schedule_index_end): + sched_item = kernel.linearization[sched_index] + if isinstance(sched_item, CallKernel): + subkernel_names.append(sched_item.kernel_name) + + # Forward pass to find first writes + first_accesses = {} + seen_temporary_variables = set() + for subkernel_name in subkernel_names: + new_temporary_variables = temporaries_written_in_subkernel(kernel, subkernel_name).union(temporaries_read_in_subkernel(kernel, subkernel_name)) - seen_temporary_variables + first_accesses[subkernel_name] = new_temporary_variables + seen_temporary_variables = new_temporary_variables.union(seen_temporary_variables) + + # Backwards pass to find last reads + last_accesses = {} + seen_temporary_variables = set() + for subkernel_name in reversed(subkernel_names): + new_temporary_variables = temporaries_written_in_subkernel(kernel, subkernel_name).union(temporaries_read_in_subkernel(kernel, subkernel_name)) - seen_temporary_variables + last_accesses[subkernel_name] = new_temporary_variables + seen_temporary_variables = new_temporary_variables.union(seen_temporary_variables) + return (first_accesses, last_accesses) def get_kernel_call( self, codegen_state: CodeGenerationState, subkernel_name: str, gsize: tuple[Expression, ...], lsize: tuple[Expression, ...] ) -> genpy.Suite: - from genpy import Assert, Assign, Comment, Line, Suite + from genpy import Assert, Assign, Statement, Comment, Line, Suite + from pymbolic.mapper.stringifier import PREC_NONE kernel = codegen_state.kernel @@ -868,6 +892,25 @@ def get_kernel_call( ecm = self.get_expression_to_code_mapper(codegen_state) + start_temporary_variables, end_temporary_variables = self.get_temporary_decl_locations(codegen_state) + allocation_code_lines = [] + for tv_name in start_temporary_variables[subkernel_name]: + tv = kernel.temporary_variables[tv_name] + if not tv.base_storage: + if tv.nbytes: + # NB: This does not prevent all zero-size allocations, + # as sizes are parametric, and allocation size + # could turn out to be zero at runtime. + nbytes_str = ecm(tv.nbytes, PREC_NONE, "i") + allocation_code_lines.append(Assign(tv.name, + f"allocator({nbytes_str})")) + else: + allocation_code_lines.append(Assign(tv.name, "None")) + + deallocation_code_lines = [] + for tv_name in end_temporary_variables[subkernel_name]: + deallocation_code_lines.append(Statement(f"if {tv_name} is not None: {tv_name}.release()")) + if not gsize: gsize = (1,) if not lsize: @@ -943,6 +986,7 @@ def get_kernel_call( # TODO: Generate finer-grained dependency structure return Suite([ + *allocation_code_lines, Comment("{{{ enqueue %s" % subkernel_name), Line(), Assign("_lpy_knl", "_lpy_cl_kernels."+subkernel_name), @@ -971,6 +1015,7 @@ def get_kernel_call( Line(), Comment("}}}"), Line(), + *deallocation_code_lines ]) # }}} From 2fee1586ae9302e7d768a4c8e9dc1334668122fe Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Mon, 2 Jun 2025 16:13:26 -0500 Subject: [PATCH 02/20] delete commented out code --- loopy/target/pyopencl.py | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index ff1bc83b6..5f9e842d5 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -810,41 +810,6 @@ def _get_global_temporaries(self, codegen_state): def get_temporary_decls(self, codegen_state, schedule_index): return [] - # from genpy import Assign, Comment, Line - # from pymbolic.mapper.stringifier import PREC_NONE - # ecm = self.get_expression_to_code_mapper(codegen_state) - - # global_temporaries = self._get_global_temporaries(codegen_state) - # if not global_temporaries: - # return [] - - # allocated_var_names = [] - # code_lines = [] - # code_lines.append(Line()) - # code_lines.append(Comment("{{{ allocate global temporaries")) - # code_lines.append(Line()) - - # for tv in global_temporaries: - # if not tv.base_storage: - # if tv.nbytes: - # # NB: This does not prevent all zero-size allocations, - # # as sizes are parametric, and allocation size - # # could turn out to be zero at runtime. - # nbytes_str = ecm(tv.nbytes, PREC_NONE, "i") - # allocated_var_names.append(tv.name) - # code_lines.append(Assign(tv.name, - # f"allocator({nbytes_str})")) - # else: - # code_lines.append(Assign(tv.name, "None")) - - # code_lines.append(Assign("_global_temporaries", "[{tvs}]".format( - # tvs=", ".join(tv for tv in allocated_var_names)))) - - # code_lines.append(Line()) - # code_lines.append(Comment("}}}")) - # code_lines.append(Line()) - - # return code_lines def get_temporary_decl_locations(self, codegen_state: CodeGenerationState) -> Tuple[Mapping[str, set[str]], Mapping[str, set[str]]]: from loopy.schedule.tools import ( From 8ace895771535fc8aa1c406b4848294d582301a9 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Wed, 4 Jun 2025 13:29:09 -0500 Subject: [PATCH 03/20] deal with base storage --- loopy/target/pyopencl.py | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 5f9e842d5..f1144f8ba 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -816,6 +816,7 @@ def get_temporary_decl_locations(self, codegen_state: CodeGenerationState) -> Tu temporaries_read_in_subkernel, temporaries_written_in_subkernel, ) + from collections import defaultdict # Find sub-kernels kernel = codegen_state.kernel sched_index = 0 @@ -825,21 +826,40 @@ def get_temporary_decl_locations(self, codegen_state: CodeGenerationState) -> Tu if isinstance(sched_item, CallKernel): subkernel_names.append(sched_item.kernel_name) - # Forward pass to find first writes + # deal with base storage + global_temporaries = self._get_global_temporaries(codegen_state) + storage_variables = defaultdict(set) + for tv in global_temporaries: + if tv.base_storage: + storage_variables[tv.base_storage].add(tv.name) + else: + storage_variables[tv.name].add(tv.name) + + + # Forward pass to find first accesses first_accesses = {} - seen_temporary_variables = set() + unseen_storage_variables = set(storage_variables.keys()) for subkernel_name in subkernel_names: - new_temporary_variables = temporaries_written_in_subkernel(kernel, subkernel_name).union(temporaries_read_in_subkernel(kernel, subkernel_name)) - seen_temporary_variables - first_accesses[subkernel_name] = new_temporary_variables - seen_temporary_variables = new_temporary_variables.union(seen_temporary_variables) + new_temporary_variables = temporaries_written_in_subkernel(kernel, subkernel_name).union(temporaries_read_in_subkernel(kernel, subkernel_name)) + new_storage_variables = set() + for sv in unseen_storage_variables: + if not storage_variables[sv].isdisjoint(new_temporary_variables): + new_storage_variables.add(sv) + unseen_storage_variables = unseen_storage_variables - new_storage_variables + first_accesses[subkernel_name] = new_storage_variables - # Backwards pass to find last reads + # Backwards pass to find last accesses last_accesses = {} - seen_temporary_variables = set() + unseen_storage_variables = set(storage_variables.keys()) for subkernel_name in reversed(subkernel_names): - new_temporary_variables = temporaries_written_in_subkernel(kernel, subkernel_name).union(temporaries_read_in_subkernel(kernel, subkernel_name)) - seen_temporary_variables - last_accesses[subkernel_name] = new_temporary_variables - seen_temporary_variables = new_temporary_variables.union(seen_temporary_variables) + new_temporary_variables = temporaries_written_in_subkernel(kernel, subkernel_name).union(temporaries_read_in_subkernel(kernel, subkernel_name)) + new_storage_variables = set() + for sv in unseen_storage_variables: + if not storage_variables[sv].isdisjoint(new_temporary_variables): + new_storage_variables.add(sv) + unseen_storage_variables = unseen_storage_variables - new_storage_variables + last_accesses[subkernel_name] = new_storage_variables + return (first_accesses, last_accesses) def get_kernel_call( From c4e635cc651a5756087e9e72179d937a015ae2c2 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Thu, 5 Jun 2025 14:51:56 -0500 Subject: [PATCH 04/20] ruff check fixes --- loopy/target/pyopencl.py | 41 ++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index f1144f8ba..bb0799ea7 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -26,7 +26,7 @@ """ import logging -from typing import TYPE_CHECKING, Any, Sequence, cast, Tuple, Mapping +from typing import TYPE_CHECKING, Any, Mapping, Sequence, Tuple, cast from warnings import warn import numpy as np @@ -780,13 +780,12 @@ def get_function_definition( ["_lpy_cl_kernels", "queue", *kai.passed_arg_names, "wait_for=None", "allocator=None"]) - from genpy import For, Function, Line, Return, Statement as S, Suite + from genpy import Function, Line, Return, Suite return Function( codegen_result.current_program(codegen_state).name, args, Suite([ Line(), - ] + [ Line(), function_body, Line(), @@ -811,12 +810,15 @@ def _get_global_temporaries(self, codegen_state): def get_temporary_decls(self, codegen_state, schedule_index): return [] - def get_temporary_decl_locations(self, codegen_state: CodeGenerationState) -> Tuple[Mapping[str, set[str]], Mapping[str, set[str]]]: + def get_temporary_decl_locations( + self, codegen_state: CodeGenerationState + ) -> Tuple[Mapping[str, set[str]], Mapping[str, set[str]]]: + from collections import defaultdict + from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel, ) - from collections import defaultdict # Find sub-kernels kernel = codegen_state.kernel sched_index = 0 @@ -835,31 +837,36 @@ def get_temporary_decl_locations(self, codegen_state: CodeGenerationState) -> Tu else: storage_variables[tv.name].add(tv.name) - # Forward pass to find first accesses first_accesses = {} unseen_storage_variables = set(storage_variables.keys()) for subkernel_name in subkernel_names: - new_temporary_variables = temporaries_written_in_subkernel(kernel, subkernel_name).union(temporaries_read_in_subkernel(kernel, subkernel_name)) + new_temporary_variables = ( + temporaries_written_in_subkernel(kernel, subkernel_name) + .union(temporaries_read_in_subkernel(kernel, subkernel_name)) + ) new_storage_variables = set() for sv in unseen_storage_variables: if not storage_variables[sv].isdisjoint(new_temporary_variables): new_storage_variables.add(sv) unseen_storage_variables = unseen_storage_variables - new_storage_variables first_accesses[subkernel_name] = new_storage_variables - + # Backwards pass to find last accesses last_accesses = {} unseen_storage_variables = set(storage_variables.keys()) for subkernel_name in reversed(subkernel_names): - new_temporary_variables = temporaries_written_in_subkernel(kernel, subkernel_name).union(temporaries_read_in_subkernel(kernel, subkernel_name)) + new_temporary_variables = ( + temporaries_written_in_subkernel(kernel, subkernel_name) + .union(temporaries_read_in_subkernel(kernel, subkernel_name)) + ) new_storage_variables = set() for sv in unseen_storage_variables: if not storage_variables[sv].isdisjoint(new_temporary_variables): new_storage_variables.add(sv) unseen_storage_variables = unseen_storage_variables - new_storage_variables last_accesses[subkernel_name] = new_storage_variables - + return (first_accesses, last_accesses) def get_kernel_call( @@ -867,7 +874,7 @@ def get_kernel_call( subkernel_name: str, gsize: tuple[Expression, ...], lsize: tuple[Expression, ...] ) -> genpy.Suite: - from genpy import Assert, Assign, Statement, Comment, Line, Suite + from genpy import Assert, Assign, Comment, Line, Statement, Suite from pymbolic.mapper.stringifier import PREC_NONE kernel = codegen_state.kernel @@ -877,9 +884,9 @@ def get_kernel_call( ecm = self.get_expression_to_code_mapper(codegen_state) - start_temporary_variables, end_temporary_variables = self.get_temporary_decl_locations(codegen_state) + start_tvs, end_tvs = self.get_temporary_decl_locations(codegen_state) allocation_code_lines = [] - for tv_name in start_temporary_variables[subkernel_name]: + for tv_name in start_tvs[subkernel_name]: tv = kernel.temporary_variables[tv_name] if not tv.base_storage: if tv.nbytes: @@ -891,10 +898,12 @@ def get_kernel_call( f"allocator({nbytes_str})")) else: allocation_code_lines.append(Assign(tv.name, "None")) - + deallocation_code_lines = [] - for tv_name in end_temporary_variables[subkernel_name]: - deallocation_code_lines.append(Statement(f"if {tv_name} is not None: {tv_name}.release()")) + for tv_name in end_tvs[subkernel_name]: + deallocation_code_lines.append( + Statement(f"if {tv_name} is not None: {tv_name}.release()") + ) if not gsize: gsize = (1,) From 24b1a47afc9c8b64cfb43b4ed4f812db43453462 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Sun, 8 Jun 2025 17:36:51 -0500 Subject: [PATCH 05/20] rework to push allocations outside of loops --- loopy/codegen/control.py | 27 ++++++ loopy/target/pyopencl.py | 193 +++++++++++++++++++++++++++++---------- 2 files changed, 170 insertions(+), 50 deletions(-) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 044f03bea..4f4401bef 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -66,6 +66,24 @@ def generate_code_for_sched_index(codegen_state, sched_index): glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.linearization, sched_index), codegen_state.callables_table) + + from loopy.target.pyopencl import PyOpenCLPythonASTBuilder + if isinstance(codegen_state.ast_builder, PyOpenCLPythonASTBuilder): + prefix, postfix = ( + codegen_state.ast_builder + .get_temporary_decl_at_index(codegen_state, sched_index) + ) + results = [ + prefix, + codegen_result, + codegen_state.ast_builder.get_kernel_call( + codegen_state, + sched_item.kernel_name, + glob_grid, loc_grid), + postfix + ] + return merge_codegen_results(codegen_state, results) + return merge_codegen_results(codegen_state, [ codegen_result, @@ -117,6 +135,15 @@ def generate_code_for_sched_index(codegen_state, sched_index): "for '%s', tagged '%s'" % (sched_item.iname, ", ".join(str(tag) for tag in tags))) + from loopy.target.pyopencl import PyOpenCLPythonASTBuilder + if isinstance(codegen_state.ast_builder, PyOpenCLPythonASTBuilder): + prefix, postfix = ( + codegen_state.ast_builder + .get_temporary_decl_at_index(codegen_state, sched_index) + ) + results = [prefix, func(codegen_state, sched_index), postfix] + return merge_codegen_results(codegen_state, results) + return func(codegen_state, sched_index) elif isinstance(sched_item, Barrier): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index bb0799ea7..62ec0f97c 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -26,7 +26,7 @@ """ import logging -from typing import TYPE_CHECKING, Any, Mapping, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, Mapping, Sequence, Tuple, cast, Iterable from warnings import warn import numpy as np @@ -55,7 +55,7 @@ ValueArg, ) from loopy.kernel.function_interface import ScalarCallable -from loopy.schedule import CallKernel +from loopy.schedule import CallKernel, EnterLoop, LeaveLoop, ReturnFromKernel from loopy.target.opencl import ( ExpressionToOpenCLCExpressionMapper, OpenCLCASTBuilder, @@ -812,7 +812,7 @@ def get_temporary_decls(self, codegen_state, schedule_index): def get_temporary_decl_locations( self, codegen_state: CodeGenerationState - ) -> Tuple[Mapping[str, set[str]], Mapping[str, set[str]]]: + ) -> Tuple[Mapping[int, set[str]], Mapping[int, set[str]]]: from collections import defaultdict from loopy.schedule.tools import ( @@ -821,89 +821,185 @@ def get_temporary_decl_locations( ) # Find sub-kernels kernel = codegen_state.kernel + assert kernel.linearization is not None sched_index = 0 - subkernel_names = [] - for sched_index in range(0, codegen_state.schedule_index_end): - sched_item = kernel.linearization[sched_index] - if isinstance(sched_item, CallKernel): - subkernel_names.append(sched_item.kernel_name) # deal with base storage - global_temporaries = self._get_global_temporaries(codegen_state) storage_variables = defaultdict(set) + global_temporaries = self._get_global_temporaries(codegen_state) for tv in global_temporaries: if tv.base_storage: storage_variables[tv.base_storage].add(tv.name) else: storage_variables[tv.name].add(tv.name) - # Forward pass to find first accesses - first_accesses = {} + # Collapse into blocks + def get_temporaries_in_bounds(linearization, lower_bound, upper_bound): + temporaries: frozenset[str] = frozenset() + for sched_index in range(lower_bound, upper_bound+1): + sched_item = linearization[sched_index] + if isinstance(sched_item, CallKernel): + temporaries = ( + temporaries_written_in_subkernel(kernel, sched_item.kernel_name) + .union(temporaries_read_in_subkernel( + kernel, sched_item.kernel_name + )) + .union(temporaries) + ) + return temporaries + + def get_leave_loop_index(linearization, iname, starting_index): + for sched_index in range(starting_index, len(linearization)): + sched_item = linearization[sched_index] + if isinstance(sched_item, LeaveLoop) and sched_item.iname == iname: + return sched_index + raise LoopyError("LeaveLoop for iname '%s' not found" % iname) + + def get_return_from_kernel_index(linearization, kernel_name, starting_index): + for sched_index in range(starting_index, len(linearization)): + sched_item = linearization[sched_index] + if ( + isinstance(sched_item, ReturnFromKernel) + and sched_item.kernel_name == kernel_name + ): + return sched_index + raise LoopyError("ReturnFromKernel for subkernel" + "'%s' not found" % kernel_name) + + bounds = {} + sched_index = 0 + while sched_index < codegen_state.schedule_index_end: + sched_item = kernel.linearization[sched_index] + if isinstance(sched_item, EnterLoop) or isinstance(sched_item, CallKernel): + if isinstance(sched_item, CallKernel): + block_end = get_return_from_kernel_index( + kernel.linearization, sched_item.kernel_name, sched_index + ) + accessed_temporaries = ( + temporaries_written_in_subkernel(kernel, sched_item.kernel_name) + .union(temporaries_read_in_subkernel( + kernel, sched_item.kernel_name) + ) + ) + else: + block_end = get_leave_loop_index( + kernel.linearization, sched_item.iname, sched_index + ) + accessed_temporaries = get_temporaries_in_bounds( + kernel.linearization, sched_index, block_end + ) + bounds[sched_index] = accessed_temporaries + sched_index = block_end + 1 + else: + sched_index += 1 + + # forward pass for first accesses + first_accesses: dict[int, set[str]] = {} unseen_storage_variables = set(storage_variables.keys()) - for subkernel_name in subkernel_names: - new_temporary_variables = ( - temporaries_written_in_subkernel(kernel, subkernel_name) - .union(temporaries_read_in_subkernel(kernel, subkernel_name)) - ) - new_storage_variables = set() + for sched_index in range(0, codegen_state.schedule_index_end): + if (sched_index not in bounds): + continue + sched_item = kernel.linearization[sched_index] + new_temporary_variables = bounds[sched_index] + new_storage_variables: set[str] = set() for sv in unseen_storage_variables: if not storage_variables[sv].isdisjoint(new_temporary_variables): new_storage_variables.add(sv) unseen_storage_variables = unseen_storage_variables - new_storage_variables - first_accesses[subkernel_name] = new_storage_variables + if (len(new_storage_variables) > 0): + target_index = sched_index + if target_index in first_accesses: + first_accesses[target_index] = ( + first_accesses[target_index].union(new_storage_variables) + ) + else: + first_accesses[target_index] = new_storage_variables - # Backwards pass to find last accesses - last_accesses = {} + last_accesses: dict[int, set[str]] = {} unseen_storage_variables = set(storage_variables.keys()) - for subkernel_name in reversed(subkernel_names): - new_temporary_variables = ( - temporaries_written_in_subkernel(kernel, subkernel_name) - .union(temporaries_read_in_subkernel(kernel, subkernel_name)) - ) - new_storage_variables = set() + for sched_index in range(codegen_state.schedule_index_end-1, -1, -1): + if (sched_index not in bounds): + continue + sched_item = kernel.linearization[sched_index] + new_temporary_variables = bounds[sched_index] + new_storage_variables: set[str] = set() for sv in unseen_storage_variables: if not storage_variables[sv].isdisjoint(new_temporary_variables): new_storage_variables.add(sv) unseen_storage_variables = unseen_storage_variables - new_storage_variables - last_accesses[subkernel_name] = new_storage_variables - + if (len(new_storage_variables) > 0): + target_index = sched_index + if target_index in last_accesses: + last_accesses[target_index] = ( + last_accesses[target_index].union(new_storage_variables) + ) + else: + last_accesses[target_index] = new_storage_variables return (first_accesses, last_accesses) - def get_kernel_call( - self, codegen_state: CodeGenerationState, - subkernel_name: str, - gsize: tuple[Expression, ...], lsize: tuple[Expression, ...] - ) -> genpy.Suite: - from genpy import Assert, Assign, Comment, Line, Statement, Suite + def get_temporary_allocation( + self, + codegen_state: CodeGenerationState, + temporary_variable_names: Iterable[str] + ) -> genpy.Suite: + from genpy import Assign, Suite from pymbolic.mapper.stringifier import PREC_NONE - kernel = codegen_state.kernel - - from loopy.schedule.tools import get_subkernel_arg_info - skai = get_subkernel_arg_info(kernel, subkernel_name) - ecm = self.get_expression_to_code_mapper(codegen_state) - - start_tvs, end_tvs = self.get_temporary_decl_locations(codegen_state) - allocation_code_lines = [] - for tv_name in start_tvs[subkernel_name]: + allocation_code_lines: list[Assign] = [] + for tv_name in temporary_variable_names: tv = kernel.temporary_variables[tv_name] if not tv.base_storage: if tv.nbytes: - # NB: This does not prevent all zero-size allocations, - # as sizes are parametric, and allocation size - # could turn out to be zero at runtime. nbytes_str = ecm(tv.nbytes, PREC_NONE, "i") allocation_code_lines.append(Assign(tv.name, f"allocator({nbytes_str})")) else: allocation_code_lines.append(Assign(tv.name, "None")) + return Suite(allocation_code_lines) + def get_temporary_deallocation( + self, + codegen_state: CodeGenerationState, + temporary_variable_names: Iterable[str] + ) -> genpy.Suite: + from genpy import Statement, Suite deallocation_code_lines = [] - for tv_name in end_tvs[subkernel_name]: + for tv_name in temporary_variable_names: deallocation_code_lines.append( Statement(f"if {tv_name} is not None: {tv_name}.release()") ) + return Suite(deallocation_code_lines) + + def get_temporary_decl_at_index( + self, codegen_state: CodeGenerationState, sched_index: int + ) -> Tuple[genpy.Suite, genpy.Suite]: + from genpy import Suite + first_accesses, last_accesses = self.get_temporary_decl_locations(codegen_state) + prefixes, suffixes = Suite(), Suite() + if sched_index in first_accesses: + prefixes = self.get_temporary_allocation( + codegen_state, first_accesses[sched_index] + ) + if sched_index in last_accesses: + suffixes = self.get_temporary_deallocation( + codegen_state, last_accesses[sched_index] + ) + return (prefixes, suffixes) + + def get_kernel_call( + self, codegen_state: CodeGenerationState, + subkernel_name: str, + gsize: tuple[Expression, ...], lsize: tuple[Expression, ...] + ) -> genpy.Suite: + from genpy import Assert, Assign, Comment, Line, Suite + from pymbolic.mapper.stringifier import PREC_NONE + + kernel = codegen_state.kernel + ecm = self.get_expression_to_code_mapper(codegen_state) + + from loopy.schedule.tools import get_subkernel_arg_info + skai = get_subkernel_arg_info(kernel, subkernel_name) if not gsize: gsize = (1,) @@ -971,7 +1067,6 @@ def get_kernel_call( overflow_args_code = Suite([]) import pyopencl.version as cl_ver - from pymbolic.mapper.stringifier import PREC_NONE if cl_ver.VERSION < (2020, 2): from warnings import warn warn("Your kernel invocation will likely fail because your " @@ -980,7 +1075,6 @@ def get_kernel_call( # TODO: Generate finer-grained dependency structure return Suite([ - *allocation_code_lines, Comment("{{{ enqueue %s" % subkernel_name), Line(), Assign("_lpy_knl", "_lpy_cl_kernels."+subkernel_name), @@ -1009,7 +1103,6 @@ def get_kernel_call( Line(), Comment("}}}"), Line(), - *deallocation_code_lines ]) # }}} From be78797f7a1c654f37695fecac264562cc148305 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Mon, 9 Jun 2025 14:08:47 -0500 Subject: [PATCH 06/20] add types, fix ruff --- loopy/target/pyopencl.py | 71 +++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 62ec0f97c..1afd692b4 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -26,7 +26,7 @@ """ import logging -from typing import TYPE_CHECKING, Any, Mapping, Sequence, Tuple, cast, Iterable +from typing import TYPE_CHECKING, Any, cast from warnings import warn import numpy as np @@ -55,7 +55,13 @@ ValueArg, ) from loopy.kernel.function_interface import ScalarCallable -from loopy.schedule import CallKernel, EnterLoop, LeaveLoop, ReturnFromKernel +from loopy.schedule import ( + CallKernel, + EnterLoop, + LeaveLoop, + ReturnFromKernel, + ScheduleItem, +) from loopy.target.opencl import ( ExpressionToOpenCLCExpressionMapper, OpenCLCASTBuilder, @@ -68,19 +74,22 @@ logger = logging.getLogger(__name__) if TYPE_CHECKING: + from collections.abc import Iterable, Mapping, Sequence + import genpy import pyopencl as cl from loopy.codegen import CodeGenerationState from loopy.codegen.result import CodeGenerationResult from loopy.kernel import LoopKernel + from loopy.schedule import ScheduleItem from loopy.target.pyopencl_execution import PyOpenCLExecutor from loopy.translation_unit import FunctionIdT, TranslationUnit from loopy.typing import Expression - # {{{ pyopencl function scopers + class PyOpenCLCallable(ScalarCallable): """ Records information about the callables which are not covered by @@ -812,7 +821,7 @@ def get_temporary_decls(self, codegen_state, schedule_index): def get_temporary_decl_locations( self, codegen_state: CodeGenerationState - ) -> Tuple[Mapping[int, set[str]], Mapping[int, set[str]]]: + ) -> tuple[Mapping[int, set[str]], Mapping[int, set[str]]]: from collections import defaultdict from loopy.schedule.tools import ( @@ -825,7 +834,7 @@ def get_temporary_decl_locations( sched_index = 0 # deal with base storage - storage_variables = defaultdict(set) + storage_variables: defaultdict[str, set[str]] = defaultdict(set) global_temporaries = self._get_global_temporaries(codegen_state) for tv in global_temporaries: if tv.base_storage: @@ -834,7 +843,11 @@ def get_temporary_decl_locations( storage_variables[tv.name].add(tv.name) # Collapse into blocks - def get_temporaries_in_bounds(linearization, lower_bound, upper_bound): + def get_temporaries_in_bounds( + linearization: Sequence[ScheduleItem], + lower_bound: int, + upper_bound: int + ) -> frozenset[str]: temporaries: frozenset[str] = frozenset() for sched_index in range(lower_bound, upper_bound+1): sched_item = linearization[sched_index] @@ -848,14 +861,22 @@ def get_temporaries_in_bounds(linearization, lower_bound, upper_bound): ) return temporaries - def get_leave_loop_index(linearization, iname, starting_index): + def get_leave_loop_index( + linearization: Sequence[ScheduleItem], + iname: str, + starting_index: int + ) -> int: for sched_index in range(starting_index, len(linearization)): sched_item = linearization[sched_index] if isinstance(sched_item, LeaveLoop) and sched_item.iname == iname: return sched_index raise LoopyError("LeaveLoop for iname '%s' not found" % iname) - def get_return_from_kernel_index(linearization, kernel_name, starting_index): + def get_return_from_kernel_index( + linearization: Sequence[ScheduleItem], + kernel_name: str, + starting_index: int + ) -> int: for sched_index in range(starting_index, len(linearization)): sched_item = linearization[sched_index] if ( @@ -866,7 +887,7 @@ def get_return_from_kernel_index(linearization, kernel_name, starting_index): raise LoopyError("ReturnFromKernel for subkernel" "'%s' not found" % kernel_name) - bounds = {} + bounds: dict[int, frozenset[str]] = {} sched_index = 0 while sched_index < codegen_state.schedule_index_end: sched_item = kernel.linearization[sched_index] @@ -901,19 +922,21 @@ def get_return_from_kernel_index(linearization, kernel_name, starting_index): continue sched_item = kernel.linearization[sched_index] new_temporary_variables = bounds[sched_index] - new_storage_variables: set[str] = set() + fwd_new_storage_variables: set[str] = set() for sv in unseen_storage_variables: if not storage_variables[sv].isdisjoint(new_temporary_variables): - new_storage_variables.add(sv) - unseen_storage_variables = unseen_storage_variables - new_storage_variables - if (len(new_storage_variables) > 0): + fwd_new_storage_variables.add(sv) + unseen_storage_variables = ( + unseen_storage_variables - fwd_new_storage_variables + ) + if (len(fwd_new_storage_variables) > 0): target_index = sched_index if target_index in first_accesses: first_accesses[target_index] = ( - first_accesses[target_index].union(new_storage_variables) + first_accesses[target_index].union(fwd_new_storage_variables) ) else: - first_accesses[target_index] = new_storage_variables + first_accesses[target_index] = fwd_new_storage_variables last_accesses: dict[int, set[str]] = {} unseen_storage_variables = set(storage_variables.keys()) @@ -922,19 +945,21 @@ def get_return_from_kernel_index(linearization, kernel_name, starting_index): continue sched_item = kernel.linearization[sched_index] new_temporary_variables = bounds[sched_index] - new_storage_variables: set[str] = set() + back_new_storage_variables: set[str] = set() for sv in unseen_storage_variables: if not storage_variables[sv].isdisjoint(new_temporary_variables): - new_storage_variables.add(sv) - unseen_storage_variables = unseen_storage_variables - new_storage_variables - if (len(new_storage_variables) > 0): + back_new_storage_variables.add(sv) + unseen_storage_variables = ( + unseen_storage_variables - back_new_storage_variables + ) + if (len(back_new_storage_variables) > 0): target_index = sched_index if target_index in last_accesses: last_accesses[target_index] = ( - last_accesses[target_index].union(new_storage_variables) + last_accesses[target_index].union(back_new_storage_variables) ) else: - last_accesses[target_index] = new_storage_variables + last_accesses[target_index] = back_new_storage_variables return (first_accesses, last_accesses) def get_temporary_allocation( @@ -964,7 +989,7 @@ def get_temporary_deallocation( temporary_variable_names: Iterable[str] ) -> genpy.Suite: from genpy import Statement, Suite - deallocation_code_lines = [] + deallocation_code_lines: list[Statement] = [] for tv_name in temporary_variable_names: deallocation_code_lines.append( Statement(f"if {tv_name} is not None: {tv_name}.release()") @@ -973,7 +998,7 @@ def get_temporary_deallocation( def get_temporary_decl_at_index( self, codegen_state: CodeGenerationState, sched_index: int - ) -> Tuple[genpy.Suite, genpy.Suite]: + ) -> tuple[genpy.Suite, genpy.Suite]: from genpy import Suite first_accesses, last_accesses = self.get_temporary_decl_locations(codegen_state) prefixes, suffixes = Suite(), Suite() From 0b6abdd7829b65c023b271f4aca9d79e13602c6a Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Tue, 17 Jun 2025 16:05:31 -0500 Subject: [PATCH 07/20] refactor to make more target-generic --- loopy/codegen/control.py | 165 ++++++++++++++++++++++------ loopy/target/__init__.py | 18 +++ loopy/target/c/__init__.py | 9 ++ loopy/target/pyopencl.py | 219 +++++++------------------------------ 4 files changed, 202 insertions(+), 209 deletions(-) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 84a2d8497..004c9c87d 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -49,12 +49,130 @@ from pymbolic import Expression from loopy.codegen import CodeGenerationState - from loopy.kernel import LoopKernel + from loopy.kernel import LoopKernel, ScheduleItem from loopy.typing import InameStr _EMPTY_INT_FROZENSET: frozenset[int] = frozenset() +def get_temporary_decl_locations( + codegen_state: CodeGenerationState + ) -> tuple[map[int, set[str]], map[int, set[str]]]: + from loopy.kernel.data import AddressSpace + from loopy.schedule.tools import ( + get_block_boundaries, + temporaries_read_in_subkernel, + temporaries_written_in_subkernel, + ) + + kernel = codegen_state.kernel + assert kernel.linearization is not None + sched_index = 0 + + global_temporaries = ( + tv for tv in codegen_state.kernel.temporary_variables.values() + if tv.address_space == AddressSpace.GLOBAL + ) + + # Collapse into blocks + def get_temporaries_in_bounds( + linearization: Sequence[ScheduleItem], + lower_bound: int, + upper_bound: int + ) -> frozenset[str]: + temporaries: frozenset[str] = frozenset() + for sched_index in range(lower_bound, upper_bound+1): + sched_item = linearization[sched_index] + if isinstance(sched_item, CallKernel): + temporaries = ( + temporaries_written_in_subkernel(kernel, sched_item.kernel_name) + | temporaries_read_in_subkernel( + kernel, sched_item.kernel_name + ) + | (temporaries) + ) + return temporaries & global_temporaries + + block_boundaries = get_block_boundaries(kernel.linearization) + + bounds: dict[int, frozenset[str]] = {} + sched_index = 0 + while sched_index < codegen_state.schedule_index_end: + sched_item = kernel.linearization[sched_index] + if isinstance(sched_item, EnterLoop) or isinstance(sched_item, CallKernel): + if isinstance(sched_item, CallKernel): + block_end = block_boundaries[sched_index] + accessed_temporaries = ( + temporaries_written_in_subkernel(kernel, sched_item.kernel_name) + | temporaries_read_in_subkernel( + kernel, sched_item.kernel_name + ) + ) + else: + block_end = block_boundaries[sched_index] + accessed_temporaries = get_temporaries_in_bounds( + kernel.linearization, sched_index, block_end + ) + bounds[sched_index] = accessed_temporaries + sched_index = block_end + 1 + else: + sched_index += 1 + + def update_seen_storage_vars(seen_sv, new_temp_variables): + new_storage_variables = set() + for new_tv_name in new_temp_variables: + new_tv = kernel.temporary_variables[new_tv_name] + storage_var = new_tv_name if new_tv.base_storage == None else new_tv.base_storage + new_storage_variables.add(storage_var) + + return (seen_sv | new_storage_variables, new_storage_variables - seen_sv) + # forward pass for first accesses + first_accesses: dict[int, set[str]] = {} + seen_storage_variables = set() + for sched_index in range(0, codegen_state.schedule_index_end): + if (sched_index not in bounds): + continue + sched_item = kernel.linearization[sched_index] + new_temporary_variables = bounds[sched_index] + seen_storage_variables, new_storage_variables = update_seen_storage_vars( + seen_storage_variables, new_temporary_variables + ) + + if (len(new_storage_variables) > 0): + first_accesses[sched_index] = new_storage_variables + + last_accesses: dict[int, set[str]] = {} + seen_storage_variables = set() + for sched_index in range(codegen_state.schedule_index_end-1, -1, -1): + if (sched_index not in bounds): + continue + sched_item = kernel.linearization[sched_index] + new_temporary_variables = bounds[sched_index] + seen_storage_variables, new_storage_variables = update_seen_storage_vars( + seen_storage_variables, new_temporary_variables + ) + + if (len(new_storage_variables) > 0): + last_accesses[sched_index] = new_storage_variables + return (first_accesses, last_accesses) + + +def get_temporary_decl_at_index( + codegen_state: CodeGenerationState, sched_index: int + ) -> tuple[Any, Any]: + first_accesses, last_accesses = get_temporary_decl_locations(codegen_state) + prefixes, suffixes = None, None + if sched_index in first_accesses: + prefixes = codegen_state.ast_builder.target.get_temporary_allocation( + codegen_state, first_accesses[sched_index] + ) + if sched_index in last_accesses: + suffixes = codegen_state.ast_builder.target.get_temporary_deallocation( + codegen_state, last_accesses[sched_index] + ) + return (prefixes, suffixes) + + def generate_code_for_sched_index( codegen_state: CodeGenerationState, sched_index: int @@ -86,31 +204,19 @@ def generate_code_for_sched_index( get_insn_ids_for_block_at(kernel.linearization, sched_index), codegen_state.callables_table) - from loopy.target.pyopencl import PyOpenCLPythonASTBuilder - if isinstance(codegen_state.ast_builder, PyOpenCLPythonASTBuilder): - prefix, postfix = ( - codegen_state.ast_builder - .get_temporary_decl_at_index(codegen_state, sched_index) - ) - results = [ - prefix, - codegen_result, - codegen_state.ast_builder.get_kernel_call( - codegen_state, - sched_item.kernel_name, - glob_grid, loc_grid), - postfix - ] - return merge_codegen_results(codegen_state, results) - - return merge_codegen_results(codegen_state, [ + prefixes, suffixes = ( + get_temporary_decl_at_index(codegen_state, sched_index) + ) + results = [ + prefixes, codegen_result, - codegen_state.ast_builder.get_kernel_call( codegen_state, sched_item.kernel_name, - glob_grid, loc_grid) - ]) + glob_grid, loc_grid), + suffixes + ] + return merge_codegen_results(codegen_state, [r for r in results if r is not None]) else: # do not generate host code for non-entrypoint kernels return codegen_result @@ -154,16 +260,11 @@ def generate_code_for_sched_index( "for '%s', tagged '%s'" % (sched_item.iname, ", ".join(str(tag) for tag in tags))) - from loopy.target.pyopencl import PyOpenCLPythonASTBuilder - if isinstance(codegen_state.ast_builder, PyOpenCLPythonASTBuilder): - prefix, postfix = ( - codegen_state.ast_builder - .get_temporary_decl_at_index(codegen_state, sched_index) - ) - results = [prefix, func(codegen_state, sched_index), postfix] - return merge_codegen_results(codegen_state, results) - - return func(codegen_state, sched_index) + prefixes, suffixes = ( + get_temporary_decl_at_index(codegen_state, sched_index) + ) + results = [prefixes, func(codegen_state, sched_index), suffixes] + return merge_codegen_results(codegen_state, [r for r in results if r is not None]) elif isinstance(sched_item, Barrier): # {{{ emit barrier code diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 673a46d4d..e459407f7 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -190,6 +190,24 @@ def get_kernel_executor( """ raise NotImplementedError() + def get_temporary_allocation( + self, codegen_state: CodeGenerationState, + temporary_variables: frozenset[str] + ) -> Any: + """ + :returns: code that will allocate the specified temporary variables + """ + raise NotImplementedError() + + def get_temporary_deallocation( + self, codegen_state: CodeGenerationState, + temporary_variables: frozenset[str] + ) -> Any: + """ + :returns: code that will free the specified temporary variables + """ + raise NotImplementedError() + @dataclass(frozen=True) class ASTBuilderBase(Generic[ASTType], ABC): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index cdceb4121..89c55fe98 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -538,6 +538,15 @@ def dtype_to_typename(self, dtype): # These kind of shouldn't be here. return self.get_dtype_registry().dtype_to_ctype(dtype) + @override + def get_temporary_allocation(self, codegen_state, temporary_variables): + from cgen import Comment + return Comment("Do nothing") + + @override + def get_temporary_deallocation(self, codegen_state, temporary_variables): + from cgen import Comment + return Comment("Do nothing") # }}} diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 199926f81..573b182fb 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -70,7 +70,7 @@ logger = logging.getLogger(__name__) if TYPE_CHECKING: - from collections.abc import Iterable, Mapping, Sequence + from collections.abc import Mapping, Sequence import genpy import pyopencl as cl @@ -79,10 +79,12 @@ from loopy.codegen import CodeGenerationState from loopy.codegen.result import CodeGenerationResult from loopy.kernel import LoopKernel - from loopy.schedule import ScheduleItem from loopy.target.pyopencl_execution import PyOpenCLExecutor - from loopy.translation_unit import FunctionIdT, TranslationUnit - from loopy.typing import Expression + from loopy.translation_unit import ( + CallableId, + CallablesInferenceContext, + TranslationUnit, + ) # {{{ pyopencl function scopers @@ -661,6 +663,41 @@ def get_kernel_executor(self, t_unit: TranslationUnit, # type: ignore[override] from loopy.target.pyopencl_execution import PyOpenCLExecutor return PyOpenCLExecutor(context, t_unit, entrypoint=entrypoint) + @override + def get_temporary_allocation( + self, codegen_state: CodeGenerationState, + temporary_variables: frozenset[str] + ) -> genpy.Suite: + from genpy import Assign, Suite + from pymbolic.mapper.stringifier import PREC_NONE + + from loopy.target.python import ExpressionToPythonMapper + ecm = ExpressionToPythonMapper(codegen_state) + allocation_code_lines: list[Assign] = [] + for tv_name in temporary_variables: + tv = codegen_state.kernel.temporary_variables[tv_name] + if not tv.base_storage: + if tv.nbytes: + nbytes_str = ecm(tv.nbytes, PREC_NONE, "i") + allocation_code_lines.append(Assign(tv.name, + f"allocator({nbytes_str})")) + else: + allocation_code_lines.append(Assign(tv.name, "None")) + return Suite(allocation_code_lines) + + @override + def get_temporary_deallocation( + self, codegen_state: CodeGenerationState, + temporary_variables: frozenset[str] + ) -> genpy.Suite: + from genpy import Statement, Suite + deallocation_code_lines: list[Statement] = [] + for tv_name in temporary_variables: + deallocation_code_lines.append( + Statement(f"if {tv_name} is not None: {tv_name}.release()") + ) + return Suite(deallocation_code_lines) + # }}} @@ -840,153 +877,10 @@ def _get_global_temporaries(self, codegen_state: CodeGenerationState): def get_temporary_decls(self, codegen_state, schedule_index): return [] - def get_temporary_decl_locations( - self, codegen_state: CodeGenerationState - ) -> tuple[Mapping[int, set[str]], Mapping[int, set[str]]]: - from collections import defaultdict - - from loopy.schedule.tools import ( - temporaries_read_in_subkernel, - temporaries_written_in_subkernel, - ) - # Find sub-kernels - kernel = codegen_state.kernel - assert kernel.linearization is not None - sched_index = 0 - - # deal with base storage - storage_variables: defaultdict[str, set[str]] = defaultdict(set) - global_temporaries = self._get_global_temporaries(codegen_state) - for tv in global_temporaries: - if tv.base_storage: - storage_variables[tv.base_storage].add(tv.name) - else: - storage_variables[tv.name].add(tv.name) - - # Collapse into blocks - def get_temporaries_in_bounds( - linearization: Sequence[ScheduleItem], - lower_bound: int, - upper_bound: int - ) -> frozenset[str]: - temporaries: frozenset[str] = frozenset() - for sched_index in range(lower_bound, upper_bound+1): - sched_item = linearization[sched_index] - if isinstance(sched_item, CallKernel): - temporaries = ( - temporaries_written_in_subkernel(kernel, sched_item.kernel_name) - .union(temporaries_read_in_subkernel( - kernel, sched_item.kernel_name - )) - .union(temporaries) - ) - return temporaries - - def get_leave_loop_index( - linearization: Sequence[ScheduleItem], - iname: str, - starting_index: int - ) -> int: - for sched_index in range(starting_index, len(linearization)): - sched_item = linearization[sched_index] - if isinstance(sched_item, LeaveLoop) and sched_item.iname == iname: - return sched_index - raise LoopyError("LeaveLoop for iname '%s' not found" % iname) - - def get_return_from_kernel_index( - linearization: Sequence[ScheduleItem], - kernel_name: str, - starting_index: int - ) -> int: - for sched_index in range(starting_index, len(linearization)): - sched_item = linearization[sched_index] - if ( - isinstance(sched_item, ReturnFromKernel) - and sched_item.kernel_name == kernel_name - ): - return sched_index - raise LoopyError("ReturnFromKernel for subkernel" - "'%s' not found" % kernel_name) - - bounds: dict[int, frozenset[str]] = {} - sched_index = 0 - while sched_index < codegen_state.schedule_index_end: - sched_item = kernel.linearization[sched_index] - if isinstance(sched_item, EnterLoop) or isinstance(sched_item, CallKernel): - if isinstance(sched_item, CallKernel): - block_end = get_return_from_kernel_index( - kernel.linearization, sched_item.kernel_name, sched_index - ) - accessed_temporaries = ( - temporaries_written_in_subkernel(kernel, sched_item.kernel_name) - .union(temporaries_read_in_subkernel( - kernel, sched_item.kernel_name) - ) - ) - else: - block_end = get_leave_loop_index( - kernel.linearization, sched_item.iname, sched_index - ) - accessed_temporaries = get_temporaries_in_bounds( - kernel.linearization, sched_index, block_end - ) - bounds[sched_index] = accessed_temporaries - sched_index = block_end + 1 - else: - sched_index += 1 - - # forward pass for first accesses - first_accesses: dict[int, set[str]] = {} - unseen_storage_variables = set(storage_variables.keys()) - for sched_index in range(0, codegen_state.schedule_index_end): - if (sched_index not in bounds): - continue - sched_item = kernel.linearization[sched_index] - new_temporary_variables = bounds[sched_index] - fwd_new_storage_variables: set[str] = set() - for sv in unseen_storage_variables: - if not storage_variables[sv].isdisjoint(new_temporary_variables): - fwd_new_storage_variables.add(sv) - unseen_storage_variables = ( - unseen_storage_variables - fwd_new_storage_variables - ) - if (len(fwd_new_storage_variables) > 0): - target_index = sched_index - if target_index in first_accesses: - first_accesses[target_index] = ( - first_accesses[target_index].union(fwd_new_storage_variables) - ) - else: - first_accesses[target_index] = fwd_new_storage_variables - - last_accesses: dict[int, set[str]] = {} - unseen_storage_variables = set(storage_variables.keys()) - for sched_index in range(codegen_state.schedule_index_end-1, -1, -1): - if (sched_index not in bounds): - continue - sched_item = kernel.linearization[sched_index] - new_temporary_variables = bounds[sched_index] - back_new_storage_variables: set[str] = set() - for sv in unseen_storage_variables: - if not storage_variables[sv].isdisjoint(new_temporary_variables): - back_new_storage_variables.add(sv) - unseen_storage_variables = ( - unseen_storage_variables - back_new_storage_variables - ) - if (len(back_new_storage_variables) > 0): - target_index = sched_index - if target_index in last_accesses: - last_accesses[target_index] = ( - last_accesses[target_index].union(back_new_storage_variables) - ) - else: - last_accesses[target_index] = back_new_storage_variables - return (first_accesses, last_accesses) - def get_temporary_allocation( self, codegen_state: CodeGenerationState, - temporary_variable_names: Iterable[str] + temporary_variable_names: frozenset[str] ) -> genpy.Suite: from genpy import Assign, Suite from pymbolic.mapper.stringifier import PREC_NONE @@ -1004,35 +898,6 @@ def get_temporary_allocation( allocation_code_lines.append(Assign(tv.name, "None")) return Suite(allocation_code_lines) - def get_temporary_deallocation( - self, - codegen_state: CodeGenerationState, - temporary_variable_names: Iterable[str] - ) -> genpy.Suite: - from genpy import Statement, Suite - deallocation_code_lines: list[Statement] = [] - for tv_name in temporary_variable_names: - deallocation_code_lines.append( - Statement(f"if {tv_name} is not None: {tv_name}.release()") - ) - return Suite(deallocation_code_lines) - - def get_temporary_decl_at_index( - self, codegen_state: CodeGenerationState, sched_index: int - ) -> tuple[genpy.Suite, genpy.Suite]: - from genpy import Suite - first_accesses, last_accesses = self.get_temporary_decl_locations(codegen_state) - prefixes, suffixes = Suite(), Suite() - if sched_index in first_accesses: - prefixes = self.get_temporary_allocation( - codegen_state, first_accesses[sched_index] - ) - if sched_index in last_accesses: - suffixes = self.get_temporary_deallocation( - codegen_state, last_accesses[sched_index] - ) - return (prefixes, suffixes) - def get_kernel_call( self, codegen_state: CodeGenerationState, subkernel_name: str, From 4f95a6b15bd280326ef50c7bf25f440ad3beccc7 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Tue, 17 Jun 2025 16:21:42 -0500 Subject: [PATCH 08/20] resolve lingering merge issues --- loopy/codegen/control.py | 16 ++++++++++++---- loopy/target/pyopencl.py | 23 +---------------------- 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 004c9c87d..10c0d57c3 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -57,7 +57,7 @@ def get_temporary_decl_locations( codegen_state: CodeGenerationState - ) -> tuple[map[int, set[str]], map[int, set[str]]]: + ) -> tuple[dict[int, set[str]], dict[int, set[str]]]: from loopy.kernel.data import AddressSpace from loopy.schedule.tools import ( get_block_boundaries, @@ -122,7 +122,10 @@ def update_seen_storage_vars(seen_sv, new_temp_variables): new_storage_variables = set() for new_tv_name in new_temp_variables: new_tv = kernel.temporary_variables[new_tv_name] - storage_var = new_tv_name if new_tv.base_storage == None else new_tv.base_storage + if new_tv.base_storage is None: + storage_var = new_tv_name + else: + storage_var = new_tv.base_storage new_storage_variables.add(storage_var) return (seen_sv | new_storage_variables, new_storage_variables - seen_sv) @@ -216,7 +219,10 @@ def generate_code_for_sched_index( glob_grid, loc_grid), suffixes ] - return merge_codegen_results(codegen_state, [r for r in results if r is not None]) + return merge_codegen_results( + codegen_state, + [r for r in results if r is not None] + ) else: # do not generate host code for non-entrypoint kernels return codegen_result @@ -264,7 +270,9 @@ def generate_code_for_sched_index( get_temporary_decl_at_index(codegen_state, sched_index) ) results = [prefixes, func(codegen_state, sched_index), suffixes] - return merge_codegen_results(codegen_state, [r for r in results if r is not None]) + return merge_codegen_results( + codegen_state, [r for r in results if r is not None] + ) elif isinstance(sched_item, Barrier): # {{{ emit barrier code diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 573b182fb..d93acc56c 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -678,7 +678,7 @@ def get_temporary_allocation( tv = codegen_state.kernel.temporary_variables[tv_name] if not tv.base_storage: if tv.nbytes: - nbytes_str = ecm(tv.nbytes, PREC_NONE, "i") + nbytes_str = ecm(tv.nbytes, PREC_NONE, type_context="i") allocation_code_lines.append(Assign(tv.name, f"allocator({nbytes_str})")) else: @@ -877,27 +877,6 @@ def _get_global_temporaries(self, codegen_state: CodeGenerationState): def get_temporary_decls(self, codegen_state, schedule_index): return [] - def get_temporary_allocation( - self, - codegen_state: CodeGenerationState, - temporary_variable_names: frozenset[str] - ) -> genpy.Suite: - from genpy import Assign, Suite - from pymbolic.mapper.stringifier import PREC_NONE - kernel = codegen_state.kernel - ecm = self.get_expression_to_code_mapper(codegen_state) - allocation_code_lines: list[Assign] = [] - for tv_name in temporary_variable_names: - tv = kernel.temporary_variables[tv_name] - if not tv.base_storage: - if tv.nbytes: - nbytes_str = ecm(tv.nbytes, PREC_NONE, "i") - allocation_code_lines.append(Assign(tv.name, - f"allocator({nbytes_str})")) - else: - allocation_code_lines.append(Assign(tv.name, "None")) - return Suite(allocation_code_lines) - def get_kernel_call( self, codegen_state: CodeGenerationState, subkernel_name: str, From bd986369d9633386d79874fee51207ed21e56b12 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Tue, 17 Jun 2025 17:16:55 -0500 Subject: [PATCH 09/20] fix to only allocate global temporaries --- loopy/codegen/control.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 10c0d57c3..ac7a54d8c 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -69,8 +69,8 @@ def get_temporary_decl_locations( assert kernel.linearization is not None sched_index = 0 - global_temporaries = ( - tv for tv in codegen_state.kernel.temporary_variables.values() + global_temporaries = frozenset( + tv.name for tv in codegen_state.kernel.temporary_variables.values() if tv.address_space == AddressSpace.GLOBAL ) From 47dda68cf361f740155ddd3de95e47cc1ac46539 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Thu, 19 Jun 2025 14:48:30 -0500 Subject: [PATCH 10/20] move temp declarations to ASTBuilder --- loopy/codegen/control.py | 24 +++++++++---- loopy/target/__init__.py | 35 ++++++++++--------- loopy/target/c/__init__.py | 14 +++----- loopy/target/pyopencl.py | 69 +++++++++++++++++++------------------- 4 files changed, 74 insertions(+), 68 deletions(-) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index ac7a54d8c..9b73510d2 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -166,13 +166,25 @@ def get_temporary_decl_at_index( first_accesses, last_accesses = get_temporary_decl_locations(codegen_state) prefixes, suffixes = None, None if sched_index in first_accesses: - prefixes = codegen_state.ast_builder.target.get_temporary_allocation( - codegen_state, first_accesses[sched_index] - ) + prefix_lines = [] + for tv_name in first_accesses[sched_index]: + prefix_lines.append( + codegen_state.ast_builder.get_temporary_var_declarator( + codegen_state, + codegen_state.kernel.temporary_variables[tv_name] + ) + ) + prefixes = codegen_state.ast_builder.ast_block_class(prefix_lines) if sched_index in last_accesses: - suffixes = codegen_state.ast_builder.target.get_temporary_deallocation( - codegen_state, last_accesses[sched_index] - ) + suffix_lines = [] + for tv_name in last_accesses[sched_index]: + suffix_lines.append( + codegen_state.ast_builder.get_temporary_var_deallocator( + codegen_state, + codegen_state.kernel.temporary_variables[tv_name] + ) + ) + suffixes = codegen_state.ast_builder.ast_block_class(suffix_lines) return (prefixes, suffixes) diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index e459407f7..47a4c42a2 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -64,6 +64,7 @@ from loopy.translation_unit import CallableId, CallablesTable, TranslationUnit from loopy.types import LoopyType from loopy.typing import InameStr + from loopy.kernel.tools import TemporaryVariable ASTType = TypeVar("ASTType") @@ -190,24 +191,6 @@ def get_kernel_executor( """ raise NotImplementedError() - def get_temporary_allocation( - self, codegen_state: CodeGenerationState, - temporary_variables: frozenset[str] - ) -> Any: - """ - :returns: code that will allocate the specified temporary variables - """ - raise NotImplementedError() - - def get_temporary_deallocation( - self, codegen_state: CodeGenerationState, - temporary_variables: frozenset[str] - ) -> Any: - """ - :returns: code that will free the specified temporary variables - """ - raise NotImplementedError() - @dataclass(frozen=True) class ASTBuilderBase(Generic[ASTType], ABC): @@ -269,6 +252,16 @@ def get_temporary_decls(self, codegen_state: CodeGenerationState, schedule_index: int) -> ASTType: raise NotImplementedError + def get_temporary_var_declarator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable) -> ASTType: + raise NotImplementedError() + + def get_temporary_var_deallocator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable) -> ASTType: + raise NotImplementedError() + def get_kernel_call(self, codegen_state: CodeGenerationState, subkernel_name: str, gsize: tuple[Expression, ...], @@ -382,6 +375,12 @@ def get_expression_to_code_mapper(self, codegen_state): def get_kernel_call(self, codegen_state, name, gsize, lsize): return None + + def get_temporary_var_declarator(self, codegen_state, temp_var): + return None + + def get_temporary_var_deallocator(self, codegen_state, temp_var): + return None @property def ast_block_class(self): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 89c55fe98..3b3be75b9 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -43,6 +43,7 @@ Initializer, NestedDeclarator, Pointer, + Comment ) from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic import Expression @@ -538,15 +539,6 @@ def dtype_to_typename(self, dtype): # These kind of shouldn't be here. return self.get_dtype_registry().dtype_to_ctype(dtype) - @override - def get_temporary_allocation(self, codegen_state, temporary_variables): - from cgen import Comment - return Comment("Do nothing") - - @override - def get_temporary_deallocation(self, codegen_state, temporary_variables): - from cgen import Comment - return Comment("Do nothing") # }}} @@ -1249,6 +1241,7 @@ def arg_to_cgen_declarator( raise ValueError(f"unexpected type of argument '{passed_name}': " f"'{type(var_descr)}'") + @override def get_temporary_var_declarator(self, codegen_state: CodeGenerationState, temp_var: TemporaryVariable) -> Declarator: @@ -1281,6 +1274,9 @@ def get_temporary_var_declarator(self, return self.wrap_decl_for_address_space(temp_var_decl, temp_var.address_space) + @override + def get_temporary_var_deallocator(self, codegen_state, temp_var): + return Comment("Dynamic freeing of temp_var not supported") # }}} @override diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index d93acc56c..6506b8c61 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -663,40 +663,6 @@ def get_kernel_executor(self, t_unit: TranslationUnit, # type: ignore[override] from loopy.target.pyopencl_execution import PyOpenCLExecutor return PyOpenCLExecutor(context, t_unit, entrypoint=entrypoint) - @override - def get_temporary_allocation( - self, codegen_state: CodeGenerationState, - temporary_variables: frozenset[str] - ) -> genpy.Suite: - from genpy import Assign, Suite - from pymbolic.mapper.stringifier import PREC_NONE - - from loopy.target.python import ExpressionToPythonMapper - ecm = ExpressionToPythonMapper(codegen_state) - allocation_code_lines: list[Assign] = [] - for tv_name in temporary_variables: - tv = codegen_state.kernel.temporary_variables[tv_name] - if not tv.base_storage: - if tv.nbytes: - nbytes_str = ecm(tv.nbytes, PREC_NONE, type_context="i") - allocation_code_lines.append(Assign(tv.name, - f"allocator({nbytes_str})")) - else: - allocation_code_lines.append(Assign(tv.name, "None")) - return Suite(allocation_code_lines) - - @override - def get_temporary_deallocation( - self, codegen_state: CodeGenerationState, - temporary_variables: frozenset[str] - ) -> genpy.Suite: - from genpy import Statement, Suite - deallocation_code_lines: list[Statement] = [] - for tv_name in temporary_variables: - deallocation_code_lines.append( - Statement(f"if {tv_name} is not None: {tv_name}.release()") - ) - return Suite(deallocation_code_lines) # }}} @@ -874,9 +840,42 @@ def _get_global_temporaries(self, codegen_state: CodeGenerationState): if tv.address_space == AddressSpace.GLOBAL), key=lambda tv: tv.name) - def get_temporary_decls(self, codegen_state, schedule_index): + @override + def get_temporary_decls(self, + codegen_state: CodeGenerationState, + schedule_index: int + ): return [] + @override + def get_temporary_var_declarator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> Generable: + from pymbolic.mapper.stringifier import PREC_NONE + from genpy import Assign, Suite + ecm = self.get_expression_to_code_mapper(codegen_state) + + if not temp_var.base_storage: + if temp_var.nbytes: + # NB: This does not prevent all zero-size allocations, + # as sizes are parametric, and allocation size + # could turn out to be zero at runtime. + nbytes_str = ecm(temp_var.nbytes, PREC_NONE, type_context="i") + return Assign(temp_var.name, f"allocator({nbytes_str})") + else: + return Assign(temp_var.name, "None") + + return Suite() + + @override + def get_temporary_var_deallocator( + self, codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> Generable: + from genpy import Statement, Suite + return Statement(f"if {temp_var.name} is not None: {temp_var.name}.release()") + def get_kernel_call( self, codegen_state: CodeGenerationState, subkernel_name: str, From 88c436fd77ef8135c2a1175d5e2bd31f0c6f835c Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Thu, 19 Jun 2025 14:56:05 -0500 Subject: [PATCH 11/20] fix typing --- loopy/target/__init__.py | 8 ++++---- loopy/target/c/__init__.py | 9 ++++++--- loopy/target/pyopencl.py | 12 ++++++------ 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 47a4c42a2..fde463d2b 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -59,12 +59,12 @@ from loopy.codegen import CodeGenerationState, PreambleInfo from loopy.codegen.result import CodeGenerationResult from loopy.kernel import LoopKernel + from loopy.kernel.data import TemporaryVariable from loopy.target.c import DTypeRegistry from loopy.target.execution import ExecutorBase from loopy.translation_unit import CallableId, CallablesTable, TranslationUnit from loopy.types import LoopyType from loopy.typing import InameStr - from loopy.kernel.tools import TemporaryVariable ASTType = TypeVar("ASTType") @@ -256,7 +256,7 @@ def get_temporary_var_declarator(self, codegen_state: CodeGenerationState, temp_var: TemporaryVariable) -> ASTType: raise NotImplementedError() - + def get_temporary_var_deallocator(self, codegen_state: CodeGenerationState, temp_var: TemporaryVariable) -> ASTType: @@ -375,10 +375,10 @@ def get_expression_to_code_mapper(self, codegen_state): def get_kernel_call(self, codegen_state, name, gsize, lsize): return None - + def get_temporary_var_declarator(self, codegen_state, temp_var): return None - + def get_temporary_var_deallocator(self, codegen_state, temp_var): return None diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 3b3be75b9..bd2239c30 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -37,13 +37,13 @@ from cgen import ( Block, Collection, + Comment, Const, Declarator, Generable, Initializer, NestedDeclarator, Pointer, - Comment ) from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic import Expression @@ -1275,8 +1275,11 @@ def get_temporary_var_declarator(self, temp_var.address_space) @override - def get_temporary_var_deallocator(self, codegen_state, temp_var): - return Comment("Dynamic freeing of temp_var not supported") + def get_temporary_var_deallocator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> Generable: + return Comment("Dynamic freeing of temp vars not supported") # }}} @override diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 6506b8c61..c62faab5a 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -851,9 +851,9 @@ def get_temporary_decls(self, def get_temporary_var_declarator(self, codegen_state: CodeGenerationState, temp_var: TemporaryVariable - ) -> Generable: - from pymbolic.mapper.stringifier import PREC_NONE + ) -> genpy.Generable: from genpy import Assign, Suite + from pymbolic.mapper.stringifier import PREC_NONE ecm = self.get_expression_to_code_mapper(codegen_state) if not temp_var.base_storage: @@ -867,15 +867,15 @@ def get_temporary_var_declarator(self, return Assign(temp_var.name, "None") return Suite() - + @override def get_temporary_var_deallocator( self, codegen_state: CodeGenerationState, temp_var: TemporaryVariable - ) -> Generable: - from genpy import Statement, Suite + ) -> genpy.Generable: + from genpy import Statement return Statement(f"if {temp_var.name} is not None: {temp_var.name}.release()") - + def get_kernel_call( self, codegen_state: CodeGenerationState, subkernel_name: str, From dae91e2790b01773e40e4e8d448c5a1926e2728b Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Mon, 23 Jun 2025 14:14:52 -0500 Subject: [PATCH 12/20] fix typing hopefully --- loopy/codegen/control.py | 152 +++---------------------------------- loopy/schedule/tools.py | 101 ++++++++++++++++++++++++ loopy/target/__init__.py | 10 +++ loopy/target/c/__init__.py | 6 ++ loopy/target/pyopencl.py | 29 +++++++ 5 files changed, 155 insertions(+), 143 deletions(-) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 9b73510d2..c08c32818 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -49,145 +49,12 @@ from pymbolic import Expression from loopy.codegen import CodeGenerationState - from loopy.kernel import LoopKernel, ScheduleItem + from loopy.kernel import LoopKernel from loopy.typing import InameStr _EMPTY_INT_FROZENSET: frozenset[int] = frozenset() -def get_temporary_decl_locations( - codegen_state: CodeGenerationState - ) -> tuple[dict[int, set[str]], dict[int, set[str]]]: - from loopy.kernel.data import AddressSpace - from loopy.schedule.tools import ( - get_block_boundaries, - temporaries_read_in_subkernel, - temporaries_written_in_subkernel, - ) - - kernel = codegen_state.kernel - assert kernel.linearization is not None - sched_index = 0 - - global_temporaries = frozenset( - tv.name for tv in codegen_state.kernel.temporary_variables.values() - if tv.address_space == AddressSpace.GLOBAL - ) - - # Collapse into blocks - def get_temporaries_in_bounds( - linearization: Sequence[ScheduleItem], - lower_bound: int, - upper_bound: int - ) -> frozenset[str]: - temporaries: frozenset[str] = frozenset() - for sched_index in range(lower_bound, upper_bound+1): - sched_item = linearization[sched_index] - if isinstance(sched_item, CallKernel): - temporaries = ( - temporaries_written_in_subkernel(kernel, sched_item.kernel_name) - | temporaries_read_in_subkernel( - kernel, sched_item.kernel_name - ) - | (temporaries) - ) - return temporaries & global_temporaries - - block_boundaries = get_block_boundaries(kernel.linearization) - - bounds: dict[int, frozenset[str]] = {} - sched_index = 0 - while sched_index < codegen_state.schedule_index_end: - sched_item = kernel.linearization[sched_index] - if isinstance(sched_item, EnterLoop) or isinstance(sched_item, CallKernel): - if isinstance(sched_item, CallKernel): - block_end = block_boundaries[sched_index] - accessed_temporaries = ( - temporaries_written_in_subkernel(kernel, sched_item.kernel_name) - | temporaries_read_in_subkernel( - kernel, sched_item.kernel_name - ) - ) - else: - block_end = block_boundaries[sched_index] - accessed_temporaries = get_temporaries_in_bounds( - kernel.linearization, sched_index, block_end - ) - bounds[sched_index] = accessed_temporaries - sched_index = block_end + 1 - else: - sched_index += 1 - - def update_seen_storage_vars(seen_sv, new_temp_variables): - new_storage_variables = set() - for new_tv_name in new_temp_variables: - new_tv = kernel.temporary_variables[new_tv_name] - if new_tv.base_storage is None: - storage_var = new_tv_name - else: - storage_var = new_tv.base_storage - new_storage_variables.add(storage_var) - - return (seen_sv | new_storage_variables, new_storage_variables - seen_sv) - # forward pass for first accesses - first_accesses: dict[int, set[str]] = {} - seen_storage_variables = set() - for sched_index in range(0, codegen_state.schedule_index_end): - if (sched_index not in bounds): - continue - sched_item = kernel.linearization[sched_index] - new_temporary_variables = bounds[sched_index] - seen_storage_variables, new_storage_variables = update_seen_storage_vars( - seen_storage_variables, new_temporary_variables - ) - - if (len(new_storage_variables) > 0): - first_accesses[sched_index] = new_storage_variables - - last_accesses: dict[int, set[str]] = {} - seen_storage_variables = set() - for sched_index in range(codegen_state.schedule_index_end-1, -1, -1): - if (sched_index not in bounds): - continue - sched_item = kernel.linearization[sched_index] - new_temporary_variables = bounds[sched_index] - seen_storage_variables, new_storage_variables = update_seen_storage_vars( - seen_storage_variables, new_temporary_variables - ) - - if (len(new_storage_variables) > 0): - last_accesses[sched_index] = new_storage_variables - return (first_accesses, last_accesses) - - -def get_temporary_decl_at_index( - codegen_state: CodeGenerationState, sched_index: int - ) -> tuple[Any, Any]: - first_accesses, last_accesses = get_temporary_decl_locations(codegen_state) - prefixes, suffixes = None, None - if sched_index in first_accesses: - prefix_lines = [] - for tv_name in first_accesses[sched_index]: - prefix_lines.append( - codegen_state.ast_builder.get_temporary_var_declarator( - codegen_state, - codegen_state.kernel.temporary_variables[tv_name] - ) - ) - prefixes = codegen_state.ast_builder.ast_block_class(prefix_lines) - if sched_index in last_accesses: - suffix_lines = [] - for tv_name in last_accesses[sched_index]: - suffix_lines.append( - codegen_state.ast_builder.get_temporary_var_deallocator( - codegen_state, - codegen_state.kernel.temporary_variables[tv_name] - ) - ) - suffixes = codegen_state.ast_builder.ast_block_class(suffix_lines) - return (prefixes, suffixes) - - def generate_code_for_sched_index( codegen_state: CodeGenerationState, sched_index: int @@ -220,7 +87,9 @@ def generate_code_for_sched_index( codegen_state.callables_table) prefixes, suffixes = ( - get_temporary_decl_at_index(codegen_state, sched_index) + codegen_state.ast_builder.get_temporary_decl_at_index( + codegen_state, sched_index + ) ) results = [ prefixes, @@ -231,10 +100,7 @@ def generate_code_for_sched_index( glob_grid, loc_grid), suffixes ] - return merge_codegen_results( - codegen_state, - [r for r in results if r is not None] - ) + return merge_codegen_results(codegen_state, results) else: # do not generate host code for non-entrypoint kernels return codegen_result @@ -279,12 +145,12 @@ def generate_code_for_sched_index( % (sched_item.iname, ", ".join(str(tag) for tag in tags))) prefixes, suffixes = ( - get_temporary_decl_at_index(codegen_state, sched_index) + codegen_state.ast_builder.get_temporary_decl_at_index( + codegen_state, sched_index + ) ) results = [prefixes, func(codegen_state, sched_index), suffixes] - return merge_codegen_results( - codegen_state, [r for r in results if r is not None] - ) + return merge_codegen_results(codegen_state, results) elif isinstance(sched_item, Barrier): # {{{ emit barrier code diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 5b4420be1..096c02dd1 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -176,6 +176,107 @@ def supporting_temporary_names( return frozenset(result) + +def get_temporary_decl_blocks( + kernel: LoopKernel + ) -> tuple[dict[int, set[str]], dict[int, set[str]]]: + from loopy.kernel.data import AddressSpace + from loopy.schedule import CallKernel, EnterLoop + + assert kernel.linearization is not None + + global_temporaries = frozenset( + tv.name for tv in kernel.temporary_variables.values() + if tv.address_space == AddressSpace.GLOBAL + ) + + # Collapse into blocks + def get_temporaries_in_bounds( + linearization: Sequence[ScheduleItem], + lower_bound: int, + upper_bound: int + ) -> frozenset[str]: + temporaries: frozenset[str] = frozenset() + for sched_index in range(lower_bound, upper_bound+1): + sched_item = linearization[sched_index] + if isinstance(sched_item, CallKernel): + temporaries = ( + temporaries_written_in_subkernel(kernel, sched_item.kernel_name) + | temporaries_read_in_subkernel( + kernel, sched_item.kernel_name + ) + | (temporaries) + ) + return temporaries & global_temporaries + + block_boundaries = get_block_boundaries(kernel.linearization) + + bounds: dict[int, frozenset[str]] = {} + sched_index = 0 + while sched_index < len(kernel.linearization): + sched_item = kernel.linearization[sched_index] + if isinstance(sched_item, EnterLoop) or isinstance(sched_item, CallKernel): + if isinstance(sched_item, CallKernel): + block_end = block_boundaries[sched_index] + accessed_temporaries = ( + temporaries_written_in_subkernel(kernel, sched_item.kernel_name) + | temporaries_read_in_subkernel( + kernel, sched_item.kernel_name + ) + ) + else: + block_end = block_boundaries[sched_index] + accessed_temporaries = get_temporaries_in_bounds( + kernel.linearization, sched_index, block_end + ) + bounds[sched_index] = accessed_temporaries + sched_index = block_end + 1 + else: + sched_index += 1 + + def update_seen_storage_vars( + seen_sv: frozenset[str], + new_temp_variables: frozenset[str] + ) -> tuple[frozenset[str], frozenset[str]]: + new_storage_variables: set[str] = set() + for new_tv_name in new_temp_variables: + new_tv = kernel.temporary_variables[new_tv_name] + if new_tv.base_storage is None: + storage_var = new_tv_name + else: + storage_var = new_tv.base_storage + new_storage_variables.add(storage_var) + new_sv = frozenset(new_storage_variables) + return (seen_sv | new_sv, new_sv - seen_sv) + # forward pass for first accesses + first_accesses: dict[int, frozenset[str]] = {} + seen_storage_variables: frozenset[str] = frozenset() + for sched_index in range(0, len(kernel.linearization)): + if (sched_index not in bounds): + continue + sched_item = kernel.linearization[sched_index] + new_temporary_variables = bounds[sched_index] + seen_storage_variables, new_storage_variables = update_seen_storage_vars( + seen_storage_variables, new_temporary_variables + ) + + if (len(new_storage_variables) > 0): + first_accesses[sched_index] = new_storage_variables + + last_accesses: dict[int, frozenset[str]] = {} + seen_storage_variables: frozenset[str] = frozenset() + for sched_index in range(len(kernel.linearization)-1, -1, -1): + if (sched_index not in bounds): + continue + sched_item = kernel.linearization[sched_index] + new_temporary_variables = bounds[sched_index] + seen_storage_variables, new_storage_variables = update_seen_storage_vars( + seen_storage_variables, new_temporary_variables + ) + + if (len(new_storage_variables) > 0): + last_accesses[sched_index] = new_storage_variables + return (first_accesses, last_accesses) # }}} diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index fde463d2b..447000415 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -262,6 +262,11 @@ def get_temporary_var_deallocator(self, temp_var: TemporaryVariable) -> ASTType: raise NotImplementedError() + def get_temporary_decl_at_index( + self, codegen_state: CodeGenerationState, + sched_index: int) -> tuple[ASTType, ASTType]: + raise NotImplementedError() + def get_kernel_call(self, codegen_state: CodeGenerationState, subkernel_name: str, gsize: tuple[Expression, ...], @@ -382,6 +387,11 @@ def get_temporary_var_declarator(self, codegen_state, temp_var): def get_temporary_var_deallocator(self, codegen_state, temp_var): return None + def get_temporary_decl_at_index( + self, codegen_state, + sched_index): + return (None, None) + @property def ast_block_class(self): return _DummyASTBlock diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index bd2239c30..aeedea15b 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1108,6 +1108,12 @@ def get_temporary_decls(self, codegen_state, schedule_index): return result + @override + def get_temporary_decl_at_index( + self, codegen_state: CodeGenerationState, sched_index: int + ): + return (self.ast_block_class(), self.ast_block_class()) + @property @override def ast_block_class(self): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index c62faab5a..ce9bdc632 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -847,6 +847,35 @@ def get_temporary_decls(self, ): return [] + @override + def get_temporary_decl_at_index( + self, codegen_state: CodeGenerationState, sched_index: int + ): + from loopy.schedule.tools import get_temporary_decl_blocks + first_accesses, last_accesses = get_temporary_decl_blocks(codegen_state.kernel) + prefixes, suffixes = self.ast_block_class(), self.ast_block_class() + if sched_index in first_accesses: + prefix_lines: list[genpy.Generable] = [] + for tv_name in first_accesses[sched_index]: + prefix_lines.append( + self.get_temporary_var_declarator( + codegen_state, + codegen_state.kernel.temporary_variables[tv_name] + ) + ) + prefixes = self.ast_block_class(prefix_lines) + if sched_index in last_accesses: + suffix_lines: list[genpy.Generable] = [] + for tv_name in last_accesses[sched_index]: + suffix_lines.append( + self.get_temporary_var_deallocator( + codegen_state, + codegen_state.kernel.temporary_variables[tv_name] + ) + ) + suffixes = self.ast_block_class(suffix_lines) + return (prefixes, suffixes) + @override def get_temporary_var_declarator(self, codegen_state: CodeGenerationState, From 1cfe83ae63525453ad5705b4477105c446070bc3 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Mon, 23 Jun 2025 14:30:06 -0500 Subject: [PATCH 13/20] add basic test --- test/test_loopy.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/test/test_loopy.py b/test/test_loopy.py index 2f66bf377..42b4f4870 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -3692,6 +3692,56 @@ def test_long_kernel(): t_unit = lp.preprocess_kernel(t_unit) lp.get_one_linearized_kernel(t_unit.default_entrypoint, t_unit.callables_table) +def test_temporary_memory_allocation(ctx_factory: cl.CtxFactory): + from pyopencl.tools import MemoryPool, ImmediateAllocator + + ctx = ctx_factory() + cq = cl.CommandQueue(ctx) + n = 16 + + knl = lp.make_kernel( + "{ [i]: 0<=i b[i] = a[i] + ... gbarrier + <> c[i] = b[i] + 1 + ... gbarrier + <> d[i] = c[i] + 1 + ... gbarrier + <> e[i] = d[i] + 1 + ... gbarrier + <> f[i] = e[i] + 1 + ... gbarrier + <> g[i] = f[i] + 1 + ... gbarrier + <> h[i] = g[i] + 1 + ... gbarrier + <> j[i] = h[i] + 1 + ... gbarrier + <> k[i] = j[i] + 1 + ... gbarrier + <> l[i] = k[i] + 1 + ... gbarrier + <> m[i] = l[i] + 1 + ... gbarrier + + out[i] = m[i] + end + """, seq_dependencies=True) + + knl = lp.add_and_infer_dtypes(knl, + {"a": np.float32, "out": np.float32, "n": np.int32}) + + temp_vars = ['b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm'] + knl = lp.set_temporary_address_space(knl, temp_vars, "global") + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + + mem_pool_alloc = MemoryPool(ImmediateAllocator(cq)) + + knl(cq, a=np.arange(n, dtype=np.float32), allocator=mem_pool_alloc) + assert mem_pool_alloc.managed_bytes < (len(temp_vars) * 4 * n) @pytest.mark.filterwarnings("error:.*:loopy.LoopyWarning") def test_loop_imperfect_nest_priorities_in_v2_scheduler(): From 3ef324c795584bfbf1d92d111b340dd8503213dd Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Mon, 30 Jun 2025 13:54:29 -0500 Subject: [PATCH 14/20] more typing/ruff fixes --- loopy/schedule/tools.py | 18 ++++++++++-------- loopy/target/__init__.py | 18 ++++++++++++++---- loopy/target/c/__init__.py | 2 +- loopy/target/pyopencl.py | 2 +- test/test_loopy.py | 7 ++++--- 5 files changed, 30 insertions(+), 17 deletions(-) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 096c02dd1..0d4117ceb 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -179,7 +179,7 @@ def supporting_temporary_names( def get_temporary_decl_blocks( kernel: LoopKernel - ) -> tuple[dict[int, set[str]], dict[int, set[str]]]: + ) -> tuple[dict[int, frozenset[str]], dict[int, frozenset[str]]]: from loopy.kernel.data import AddressSpace from loopy.schedule import CallKernel, EnterLoop @@ -235,10 +235,11 @@ def get_temporaries_in_bounds( sched_index += 1 def update_seen_storage_vars( - seen_sv: frozenset[str], + seen_sv: set[str], new_temp_variables: frozenset[str] - ) -> tuple[frozenset[str], frozenset[str]]: + ) -> frozenset[str]: new_storage_variables: set[str] = set() + past_sv = frozenset(seen_sv) for new_tv_name in new_temp_variables: new_tv = kernel.temporary_variables[new_tv_name] if new_tv.base_storage is None: @@ -246,17 +247,18 @@ def update_seen_storage_vars( else: storage_var = new_tv.base_storage new_storage_variables.add(storage_var) + seen_sv.add(storage_var) new_sv = frozenset(new_storage_variables) - return (seen_sv | new_sv, new_sv - seen_sv) + return new_sv - past_sv # forward pass for first accesses first_accesses: dict[int, frozenset[str]] = {} - seen_storage_variables: frozenset[str] = frozenset() + seen_storage_variables: set[str] = set() for sched_index in range(0, len(kernel.linearization)): if (sched_index not in bounds): continue sched_item = kernel.linearization[sched_index] new_temporary_variables = bounds[sched_index] - seen_storage_variables, new_storage_variables = update_seen_storage_vars( + new_storage_variables = update_seen_storage_vars( seen_storage_variables, new_temporary_variables ) @@ -264,13 +266,13 @@ def update_seen_storage_vars( first_accesses[sched_index] = new_storage_variables last_accesses: dict[int, frozenset[str]] = {} - seen_storage_variables: frozenset[str] = frozenset() + seen_storage_variables.clear() for sched_index in range(len(kernel.linearization)-1, -1, -1): if (sched_index not in bounds): continue sched_item = kernel.linearization[sched_index] new_temporary_variables = bounds[sched_index] - seen_storage_variables, new_storage_variables = update_seen_storage_vars( + new_storage_variables = update_seen_storage_vars( seen_storage_variables, new_temporary_variables ) diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 447000415..291f9073e 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -381,15 +381,25 @@ def get_expression_to_code_mapper(self, codegen_state): def get_kernel_call(self, codegen_state, name, gsize, lsize): return None - def get_temporary_var_declarator(self, codegen_state, temp_var): + @override + def get_temporary_var_declarator( + self, codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> None: return None - def get_temporary_var_deallocator(self, codegen_state, temp_var): + @override + def get_temporary_var_deallocator( + self, codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> None: return None + @override def get_temporary_decl_at_index( - self, codegen_state, - sched_index): + self, codegen_state: CodeGenerationState, + sched_index: int + ) -> tuple[None, None]: return (None, None) @property diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index aeedea15b..9b573e689 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1111,7 +1111,7 @@ def get_temporary_decls(self, codegen_state, schedule_index): @override def get_temporary_decl_at_index( self, codegen_state: CodeGenerationState, sched_index: int - ): + ) -> tuple[Generable, Generable]: return (self.ast_block_class(), self.ast_block_class()) @property diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index ce9bdc632..c03160e4d 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -850,7 +850,7 @@ def get_temporary_decls(self, @override def get_temporary_decl_at_index( self, codegen_state: CodeGenerationState, sched_index: int - ): + ) -> tuple[genpy.Generable, genpy.Generable]: from loopy.schedule.tools import get_temporary_decl_blocks first_accesses, last_accesses = get_temporary_decl_blocks(codegen_state.kernel) prefixes, suffixes = self.ast_block_class(), self.ast_block_class() diff --git a/test/test_loopy.py b/test/test_loopy.py index 42b4f4870..fdd74a2f0 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -3692,8 +3692,9 @@ def test_long_kernel(): t_unit = lp.preprocess_kernel(t_unit) lp.get_one_linearized_kernel(t_unit.default_entrypoint, t_unit.callables_table) + def test_temporary_memory_allocation(ctx_factory: cl.CtxFactory): - from pyopencl.tools import MemoryPool, ImmediateAllocator + from pyopencl.tools import ImmediateAllocator, MemoryPool ctx = ctx_factory() cq = cl.CommandQueue(ctx) @@ -3725,7 +3726,6 @@ def test_temporary_memory_allocation(ctx_factory: cl.CtxFactory): ... gbarrier <> m[i] = l[i] + 1 ... gbarrier - out[i] = m[i] end """, seq_dependencies=True) @@ -3733,7 +3733,7 @@ def test_temporary_memory_allocation(ctx_factory: cl.CtxFactory): knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "out": np.float32, "n": np.int32}) - temp_vars = ['b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'l', 'm'] + temp_vars = ["b", "c", "d", "e", "f", "g", "h", "j", "k", "l", "m"] knl = lp.set_temporary_address_space(knl, temp_vars, "global") knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") @@ -3743,6 +3743,7 @@ def test_temporary_memory_allocation(ctx_factory: cl.CtxFactory): knl(cq, a=np.arange(n, dtype=np.float32), allocator=mem_pool_alloc) assert mem_pool_alloc.managed_bytes < (len(temp_vars) * 4 * n) + @pytest.mark.filterwarnings("error:.*:loopy.LoopyWarning") def test_loop_imperfect_nest_priorities_in_v2_scheduler(): # Reported by Connor Ward. See . From f708b660f80bb956b5fa4fa442f8c95eeb1288f7 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Mon, 30 Jun 2025 16:33:50 -0500 Subject: [PATCH 15/20] fix tutorial.rst and add to baseline --- .basedpyright/baseline.json | 12 ++++++++++-- loopy/codegen/control.py | 2 ++ loopy/target/__init__.py | 2 +- loopy/target/c/__init__.py | 4 ++-- loopy/target/pyopencl.py | 4 ++-- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.basedpyright/baseline.json b/.basedpyright/baseline.json index 8ae478dee..5b05ff79e 100644 --- a/.basedpyright/baseline.json +++ b/.basedpyright/baseline.json @@ -8533,8 +8533,8 @@ "code": "reportUnknownArgumentType", "range": { "startColumn": 56, - "endColumn": 17, - "lineCount": 8 + "endColumn": 63, + "lineCount": 1 } }, { @@ -8545,6 +8545,14 @@ "lineCount": 1 } }, + { + "code": "reportUnknownArgumentType", + "range": { + "startColumn": 52, + "endColumn": 59, + "lineCount": 1 + } + }, { "code": "reportUnknownMemberType", "range": { diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index c08c32818..2328a1ff2 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -100,6 +100,7 @@ def generate_code_for_sched_index( glob_grid, loc_grid), suffixes ] + results = [r for r in results if r is not None] return merge_codegen_results(codegen_state, results) else: # do not generate host code for non-entrypoint kernels @@ -150,6 +151,7 @@ def generate_code_for_sched_index( ) ) results = [prefixes, func(codegen_state, sched_index), suffixes] + results = [r for r in results if r is not None] return merge_codegen_results(codegen_state, results) elif isinstance(sched_item, Barrier): diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 291f9073e..6be2f7e9b 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -264,7 +264,7 @@ def get_temporary_var_deallocator(self, def get_temporary_decl_at_index( self, codegen_state: CodeGenerationState, - sched_index: int) -> tuple[ASTType, ASTType]: + sched_index: int) -> tuple[ASTType | None, ASTType | None]: raise NotImplementedError() def get_kernel_call(self, codegen_state: CodeGenerationState, diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 9b573e689..bdf51be49 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -1111,8 +1111,8 @@ def get_temporary_decls(self, codegen_state, schedule_index): @override def get_temporary_decl_at_index( self, codegen_state: CodeGenerationState, sched_index: int - ) -> tuple[Generable, Generable]: - return (self.ast_block_class(), self.ast_block_class()) + ) -> tuple[Generable | None, Generable | None]: + return (None, None) @property @override diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index c03160e4d..d8c2b79af 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -850,10 +850,10 @@ def get_temporary_decls(self, @override def get_temporary_decl_at_index( self, codegen_state: CodeGenerationState, sched_index: int - ) -> tuple[genpy.Generable, genpy.Generable]: + ) -> tuple[genpy.Generable | None, genpy.Generable | None]: from loopy.schedule.tools import get_temporary_decl_blocks first_accesses, last_accesses = get_temporary_decl_blocks(codegen_state.kernel) - prefixes, suffixes = self.ast_block_class(), self.ast_block_class() + prefixes, suffixes = None, None if sched_index in first_accesses: prefix_lines: list[genpy.Generable] = [] for tv_name in first_accesses[sched_index]: From 39855765e36a6ca72bba020edcae10ac4a77592d Mon Sep 17 00:00:00 2001 From: dsding2 <90988235+dsding2@users.noreply.github.com> Date: Fri, 11 Jul 2025 10:41:12 -0500 Subject: [PATCH 16/20] Update loopy/schedule/tools.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Andreas Klöckner --- loopy/schedule/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index e4c5a9672..7d80e7013 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -277,7 +277,7 @@ def update_seen_storage_vars( seen_storage_variables, new_temporary_variables ) - if (len(new_storage_variables) > 0): + if new_storage_variables: last_accesses[sched_index] = new_storage_variables return (first_accesses, last_accesses) # }}} From 95e119e06f223d5351ca3e9333860428afc681b3 Mon Sep 17 00:00:00 2001 From: dsding2 <90988235+dsding2@users.noreply.github.com> Date: Fri, 11 Jul 2025 10:48:45 -0500 Subject: [PATCH 17/20] Apply suggested test changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Andreas Klöckner --- test/test_loopy.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index fdd74a2f0..1b2d44884 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -3731,17 +3731,20 @@ def test_temporary_memory_allocation(ctx_factory: cl.CtxFactory): """, seq_dependencies=True) knl = lp.add_and_infer_dtypes(knl, - {"a": np.float32, "out": np.float32, "n": np.int32}) + {"a": np.float32}) - temp_vars = ["b", "c", "d", "e", "f", "g", "h", "j", "k", "l", "m"] + temp_vars = list(knl.default_entrypoint.temporary_variables) knl = lp.set_temporary_address_space(knl, temp_vars, "global") knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") mem_pool_alloc = MemoryPool(ImmediateAllocator(cq)) - knl(cq, a=np.arange(n, dtype=np.float32), allocator=mem_pool_alloc) - assert mem_pool_alloc.managed_bytes < (len(temp_vars) * 4 * n) + a = np.arange(n, dtype=np.float32) + knl(cq, a=a, allocator=mem_pool_alloc) + + # FIXME This relies on the memory pool not freeing any memory it allocates + assert mem_pool_alloc.managed_bytes < len(temp_vars) * a.nbytes @pytest.mark.filterwarnings("error:.*:loopy.LoopyWarning") From 5cbfbf111ea6a9b4bfcf61376842d7997ecaaee0 Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Fri, 11 Jul 2025 10:53:32 -0500 Subject: [PATCH 18/20] implement rename and documentation suggestions --- loopy/schedule/tools.py | 7 ++++++- loopy/target/pyopencl.py | 8 ++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 7d80e7013..b4c7103f0 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -178,9 +178,14 @@ def supporting_temporary_names( return frozenset(result) -def get_temporary_decl_blocks( +def get_sched_index_to_first_and_last_used( kernel: LoopKernel ) -> tuple[dict[int, frozenset[str]], dict[int, frozenset[str]]]: + """ + Returns the tuple (first_used, last_used), where first_used is a dict such that + first_used[sched_index] is the set of all temporary variable names first used at sched_index. + Likewise, last_used[sched_index] is the set of all temporary variable names last used at sched_index. + """ from loopy.kernel.data import AddressSpace from loopy.schedule import CallKernel, EnterLoop diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 4bfb7283e..93786dffd 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -848,8 +848,8 @@ def get_temporary_decls(self, def get_temporary_decl_at_index( self, codegen_state: CodeGenerationState, sched_index: int ) -> tuple[genpy.Generable | None, genpy.Generable | None]: - from loopy.schedule.tools import get_temporary_decl_blocks - first_accesses, last_accesses = get_temporary_decl_blocks(codegen_state.kernel) + from loopy.schedule.tools import get_sched_index_to_first_and_last_used + first_accesses, last_accesses = get_sched_index_to_first_and_last_used(codegen_state.kernel) prefixes, suffixes = None, None if sched_index in first_accesses: prefix_lines: list[genpy.Generable] = [] @@ -900,6 +900,8 @@ def get_temporary_var_deallocator( temp_var: TemporaryVariable ) -> genpy.Generable: from genpy import Statement + # Zero-size temporaries allocate as None, tolerate that. + # https://documen.tician.de/pyopencl/tools.html#pyopencl.tools.ImmediateAllocator return Statement(f"if {temp_var.name} is not None: {temp_var.name}.release()") def get_kernel_call( @@ -908,10 +910,8 @@ def get_kernel_call( gsize: tuple[Expression, ...], lsize: tuple[Expression, ...] ) -> genpy.Suite: from genpy import Assert, Assign, Comment, Line, Suite - from pymbolic.mapper.stringifier import PREC_NONE kernel = codegen_state.kernel - ecm = self.get_expression_to_code_mapper(codegen_state) from loopy.schedule.tools import get_subkernel_arg_info skai = get_subkernel_arg_info(kernel, subkernel_name) From 612b2381c648b05baf433457298b4721f497a37e Mon Sep 17 00:00:00 2001 From: Daniel Ding Date: Sat, 12 Jul 2025 16:21:16 -0500 Subject: [PATCH 19/20] ruff fixes, revert broken change --- loopy/schedule/tools.py | 11 +++++++---- loopy/target/pyopencl.py | 6 +++++- test/test_loopy.py | 2 +- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index b4c7103f0..cffeb3f84 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -182,9 +182,12 @@ def get_sched_index_to_first_and_last_used( kernel: LoopKernel ) -> tuple[dict[int, frozenset[str]], dict[int, frozenset[str]]]: """ - Returns the tuple (first_used, last_used), where first_used is a dict such that - first_used[sched_index] is the set of all temporary variable names first used at sched_index. - Likewise, last_used[sched_index] is the set of all temporary variable names last used at sched_index. + Returns the tuple (first_used, last_used), where first_used is + a dict such that first_used[sched_index] is the set of all temporary + variable names first used at sched_index. + + Likewise, last_used[sched_index] is the set of all temporary variable names + last used at sched_index. """ from loopy.kernel.data import AddressSpace from loopy.schedule import CallKernel, EnterLoop @@ -221,7 +224,7 @@ def get_temporaries_in_bounds( sched_index = 0 while sched_index < len(kernel.linearization): sched_item = kernel.linearization[sched_index] - if isinstance(sched_item, EnterLoop) or isinstance(sched_item, CallKernel): + if isinstance(sched_item, (EnterLoop, CallKernel)): if isinstance(sched_item, CallKernel): block_end = block_boundaries[sched_index] accessed_temporaries = ( diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 93786dffd..5e2c54fda 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -849,7 +849,9 @@ def get_temporary_decl_at_index( self, codegen_state: CodeGenerationState, sched_index: int ) -> tuple[genpy.Generable | None, genpy.Generable | None]: from loopy.schedule.tools import get_sched_index_to_first_and_last_used - first_accesses, last_accesses = get_sched_index_to_first_and_last_used(codegen_state.kernel) + first_accesses, last_accesses = get_sched_index_to_first_and_last_used( + codegen_state.kernel + ) prefixes, suffixes = None, None if sched_index in first_accesses: prefix_lines: list[genpy.Generable] = [] @@ -910,8 +912,10 @@ def get_kernel_call( gsize: tuple[Expression, ...], lsize: tuple[Expression, ...] ) -> genpy.Suite: from genpy import Assert, Assign, Comment, Line, Suite + from pymbolic.mapper.stringifier import PREC_NONE kernel = codegen_state.kernel + ecm = self.get_expression_to_code_mapper(codegen_state) from loopy.schedule.tools import get_subkernel_arg_info skai = get_subkernel_arg_info(kernel, subkernel_name) diff --git a/test/test_loopy.py b/test/test_loopy.py index 1b2d44884..bd41ef92a 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -3742,7 +3742,7 @@ def test_temporary_memory_allocation(ctx_factory: cl.CtxFactory): a = np.arange(n, dtype=np.float32) knl(cq, a=a, allocator=mem_pool_alloc) - + # FIXME This relies on the memory pool not freeing any memory it allocates assert mem_pool_alloc.managed_bytes < len(temp_vars) * a.nbytes From b24fe997c5ce1dab5918289ea334e590091bf457 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 28 Aug 2025 17:47:59 -0500 Subject: [PATCH 20/20] Improvements --- loopy/schedule/tools.py | 178 ++++++++++++++++++++++----------------- loopy/target/__init__.py | 24 ++++-- loopy/target/pyopencl.py | 6 +- loopy/target/python.py | 23 ++++- 4 files changed, 138 insertions(+), 93 deletions(-) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index d92d6f652..7d69aec9d 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -178,19 +178,66 @@ def supporting_temporary_names( return frozenset(result) +def _get_temporaries_accessed_in_schedule( + kernel: LoopKernel, + sched_idx_lower_bound: int, + sched_idx_upper_bound: int + ) -> frozenset[str]: + from loopy.schedule import CallKernel, EnterLoop, LeaveLoop + + linearization = kernel.linearization + assert linearization is not None + + temporaries: frozenset[str] = frozenset() + for sched_index in range(sched_idx_lower_bound, sched_idx_upper_bound): + sched_item = linearization[sched_index] + if isinstance(sched_item, CallKernel): + temporaries = ( + temporaries_written_in_subkernel(kernel, sched_item.kernel_name) + | temporaries_read_in_subkernel( + kernel, sched_item.kernel_name + ) + | (temporaries) + ) + elif isinstance(sched_item, (EnterLoop, LeaveLoop)): + # ignore further outside-kernel loops + pass + + else: + raise NotImplementedError("kernel with non-CallKernel outermost") + + return temporaries + + +def _map_to_base_storage(kernel: LoopKernel, tv_names: Set[str]) -> Set[str]: + result: set[str] = set() + for tv_name in tv_names: + while True: + tv = kernel.temporary_variables[tv_name] + if tv.base_storage is not None: + tv_name = tv.base_storage + else: + break + + result.add(tv_name) + + return result + + +@memoize_on_first_arg def get_sched_index_to_first_and_last_used( kernel: LoopKernel - ) -> tuple[dict[int, frozenset[str]], dict[int, frozenset[str]]]: + ) -> tuple[Mapping[int, Set[str]], Mapping[int, Set[str]]]: """ Returns the tuple (first_used, last_used), where first_used is - a dict such that first_used[sched_index] is the set of all temporary + a dict such that first_used[sched_index] is the set of all global temporary variable names first used at sched_index. - Likewise, last_used[sched_index] is the set of all temporary variable names - last used at sched_index. + Likewise, last_used[sched_index] is the set of all global temporary + variable names last used at sched_index. """ from loopy.kernel.data import AddressSpace - from loopy.schedule import CallKernel, EnterLoop + from loopy.schedule import CallKernel, EnterLoop, Barrier assert kernel.linearization is not None @@ -200,94 +247,67 @@ def get_sched_index_to_first_and_last_used( ) # Collapse into blocks - def get_temporaries_in_bounds( - linearization: Sequence[ScheduleItem], - lower_bound: int, - upper_bound: int - ) -> frozenset[str]: - temporaries: frozenset[str] = frozenset() - for sched_index in range(lower_bound, upper_bound+1): - sched_item = linearization[sched_index] - if isinstance(sched_item, CallKernel): - temporaries = ( - temporaries_written_in_subkernel(kernel, sched_item.kernel_name) - | temporaries_read_in_subkernel( - kernel, sched_item.kernel_name - ) - | (temporaries) - ) - return temporaries & global_temporaries - block_boundaries = get_block_boundaries(kernel.linearization) - bounds: dict[int, frozenset[str]] = {} + tvs_accessed_at: dict[int, frozenset[str]] = {} sched_index = 0 while sched_index < len(kernel.linearization): sched_item = kernel.linearization[sched_index] - if isinstance(sched_item, (EnterLoop, CallKernel)): - if isinstance(sched_item, CallKernel): - block_end = block_boundaries[sched_index] - accessed_temporaries = ( - temporaries_written_in_subkernel(kernel, sched_item.kernel_name) - | temporaries_read_in_subkernel( - kernel, sched_item.kernel_name - ) - ) - else: - block_end = block_boundaries[sched_index] - accessed_temporaries = get_temporaries_in_bounds( - kernel.linearization, sched_index, block_end + if isinstance(sched_item, CallKernel): + block_end = block_boundaries[sched_index] + tvs_accessed_at[sched_index] = ( + temporaries_written_in_subkernel(kernel, sched_item.kernel_name) + | temporaries_read_in_subkernel( + kernel, sched_item.kernel_name ) - bounds[sched_index] = accessed_temporaries + ) & global_temporaries + sched_index = block_end + 1 - else: - sched_index += 1 - def update_seen_storage_vars( - seen_sv: set[str], - new_temp_variables: frozenset[str] - ) -> frozenset[str]: - new_storage_variables: set[str] = set() - past_sv = frozenset(seen_sv) - for new_tv_name in new_temp_variables: - new_tv = kernel.temporary_variables[new_tv_name] - if new_tv.base_storage is None: - storage_var = new_tv_name - else: - storage_var = new_tv.base_storage - new_storage_variables.add(storage_var) - seen_sv.add(storage_var) - new_sv = frozenset(new_storage_variables) - return new_sv - past_sv - # forward pass for first accesses - first_accesses: dict[int, frozenset[str]] = {} - seen_storage_variables: set[str] = set() + elif isinstance(sched_item, EnterLoop): + block_end = block_boundaries[sched_index] + tvs_accessed_at[sched_index] = _get_temporaries_accessed_in_schedule( + kernel, sched_index, block_end+1 + ) & global_temporaries + + sched_index = block_end + 1 + + elif isinstance(sched_item, Barrier): + sched_index += 1 + else: + raise ValueError( + f"unexpected schedule item at outermost level: {type(sched_item)}") + + storage_vars_accessed_at = { + sched_index: _map_to_base_storage(kernel, accessed) + for sched_index, accessed in tvs_accessed_at.items() + } + del tvs_accessed_at + + # forward pass for first_accesses + first_accesses: dict[int, Set[str]] = {} + seen_storage_vars: set[str] = set() for sched_index in range(0, len(kernel.linearization)): - if (sched_index not in bounds): - continue - sched_item = kernel.linearization[sched_index] - new_temporary_variables = bounds[sched_index] - new_storage_variables = update_seen_storage_vars( - seen_storage_variables, new_temporary_variables - ) + accessed = storage_vars_accessed_at.get(sched_index, set()) + new_storage_vars = accessed - seen_storage_vars + seen_storage_vars.update(accessed) - if (len(new_storage_variables) > 0): - first_accesses[sched_index] = new_storage_variables + if new_storage_vars: + first_accesses[sched_index] = new_storage_vars - last_accesses: dict[int, frozenset[str]] = {} - seen_storage_variables.clear() + # backward pass for last_accesses + last_accesses: dict[int, Set[str]] = {} + seen_storage_vars = set() for sched_index in range(len(kernel.linearization)-1, -1, -1): - if (sched_index not in bounds): - continue - sched_item = kernel.linearization[sched_index] - new_temporary_variables = bounds[sched_index] - new_storage_variables = update_seen_storage_vars( - seen_storage_variables, new_temporary_variables - ) + accessed = storage_vars_accessed_at.get(sched_index, set()) + new_storage_vars = accessed - seen_storage_vars + seen_storage_vars.update(accessed) + + if new_storage_vars: + last_accesses[sched_index] = new_storage_vars - if new_storage_variables: - last_accesses[sched_index] = new_storage_variables return (first_accesses, last_accesses) + # }}} diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 6be2f7e9b..feb03a4d0 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -252,20 +252,26 @@ def get_temporary_decls(self, codegen_state: CodeGenerationState, schedule_index: int) -> ASTType: raise NotImplementedError + @abstractmethod def get_temporary_var_declarator(self, - codegen_state: CodeGenerationState, - temp_var: TemporaryVariable) -> ASTType: - raise NotImplementedError() + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> ASTType | None: + ... + @abstractmethod def get_temporary_var_deallocator(self, - codegen_state: CodeGenerationState, - temp_var: TemporaryVariable) -> ASTType: - raise NotImplementedError() + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> ASTType | None: + ... + @abstractmethod def get_temporary_decl_at_index( - self, codegen_state: CodeGenerationState, - sched_index: int) -> tuple[ASTType | None, ASTType | None]: - raise NotImplementedError() + self, codegen_state: CodeGenerationState, + sched_index: int + ) -> tuple[ASTType | None, ASTType | None]: + ... def get_kernel_call(self, codegen_state: CodeGenerationState, subkernel_name: str, diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 5e2c54fda..acca1fb53 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -89,7 +89,6 @@ # {{{ pyopencl function scopers - class PyOpenCLCallable(ScalarCallable): """ Records information about the callables which are not covered by @@ -912,14 +911,14 @@ def get_kernel_call( gsize: tuple[Expression, ...], lsize: tuple[Expression, ...] ) -> genpy.Suite: from genpy import Assert, Assign, Comment, Line, Suite - from pymbolic.mapper.stringifier import PREC_NONE kernel = codegen_state.kernel - ecm = self.get_expression_to_code_mapper(codegen_state) from loopy.schedule.tools import get_subkernel_arg_info skai = get_subkernel_arg_info(kernel, subkernel_name) + ecm = self.get_expression_to_code_mapper(codegen_state) + if not gsize: gsize = (1,) if not lsize: @@ -986,6 +985,7 @@ def get_kernel_call( overflow_args_code = Suite([]) import pyopencl.version as cl_ver + from pymbolic.mapper.stringifier import PREC_NONE if cl_ver.VERSION < (2020, 2): from warnings import warn warn("Your kernel invocation will likely fail because your " diff --git a/loopy/target/python.py b/loopy/target/python.py index d1bc51f56..51a575871 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -34,7 +34,7 @@ from pymbolic.mapper.stringifier import PREC_NONE, StringifyMapper from loopy.diagnostic import LoopyError -from loopy.kernel.data import ValueArg +from loopy.kernel.data import TemporaryVariable, ValueArg from loopy.kernel.function_interface import ScalarCallable from loopy.target import ASTBuilderBase from loopy.type_inference import TypeReader @@ -339,7 +339,26 @@ def emit_assignment(self, codegen_state: CodeGenerationState, insn: Assignment): ecm(insn.assignee, prec=PREC_NONE, type_context=None), ecm(insn.expression, prec=PREC_NONE, type_context=None)) - # }}} + @override + def get_temporary_var_declarator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> Generable | None: + return None + + @override + def get_temporary_var_deallocator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> Generable | None: + return None + + @override + def get_temporary_decl_at_index( + self, codegen_state: CodeGenerationState, + sched_index: int + ) -> tuple[Generable | None, Generable | None]: + return None, None # }}}