diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index b3cfbc5c4..ed995676e 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -141,4 +141,10 @@ TODO: Matching instruction tags .. automodule:: loopy.match + +Fusing Loops +------------ + +.. automodule:: loopy.transform.loop_fusion + .. vim: tw=75:spell diff --git a/doc/tutorial.rst b/doc/tutorial.rst index ec5bf5396..00c4f99f9 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -610,7 +610,7 @@ commonly called 'loop tiling': ... assumptions="n mod 16 = 0 and n >= 1") >>> knl = lp.split_iname(knl, "i", 16) >>> knl = lp.split_iname(knl, "j", 16) - >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner") + >>> knl = lp.prioritize_loops(knl, "i_outer,j_outer,i_inner,j_inner") >>> knl = lp.set_options(knl, "write_code") >>> evt, (out,) = knl(queue, a=a_mat_dev) #define lid(N) ((int) get_local_id(N)) @@ -1029,8 +1029,8 @@ transformation exists in :func:`loopy.add_prefetch`: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - acc_k = 0.0f; a_fetch = a[16 * gid(0) + lid(0)]; + acc_k = 0.0f; for (int k = 0; k <= 15; ++k) acc_k = acc_k + a_fetch; out[16 * gid(0) + lid(0)] = acc_k; @@ -1053,10 +1053,10 @@ earlier: >>> evt, (out,) = knl_pf(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... 
- if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) - acc_k = 0.0f; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) a_fetch[lid(0)] = a[16 * gid(0) + lid(0)]; + if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) + acc_k = 0.0f; barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { @@ -1908,18 +1908,16 @@ Now to make things more interesting, we'll create a kernel with barriers: { __local int c[50 * 10 * 99]; - { - int const k_outer = 0; - + for (int i = 0; i <= 49; ++i) for (int j = 0; j <= 9; ++j) - for (int i = 0; i <= 49; ++i) - { - barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; - c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; - barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; - e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; - } - } + { + int const k_outer = 0; + + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn rev-depends on insn_0) */; + c[990 * i + 99 * j + lid(0) + 1] = 2 * a[980 * i + 98 * j + lid(0) + 1]; + barrier(CLK_LOCAL_MEM_FENCE) /* for c (insn_0 depends on insn) */; + e[980 * i + 98 * j + lid(0) + 1] = c[990 * i + 99 * j + 1 + lid(0) + 1] + c[990 * i + 99 * j + -1 + lid(0) + 1]; + } } In this kernel, when a work-item performs the second instruction it uses data diff --git a/loopy/__init__.py b/loopy/__init__.py index a279b268c..dcd668b2e 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -161,6 +161,9 @@ from loopy.tools import Optional, t_unit_to_python, memoize_on_disk +from loopy.transform.loop_fusion import (get_kennedy_unweighted_fusion_candidates, + rename_inames_in_batch) + __all__ = [ "TaggedVariable", "Reduction", "LinearSubscript", "TypeCast", @@ -253,6 +256,9 @@ "pack_and_unpack_args_for_call", + "rename_inames_in_batch", + "get_kennedy_unweighted_fusion_candidates", + # }}} "get_dot_dependency_graph", @@ -320,6 +326,15 @@ # }}} ] + 
+try: + import loopy.relations as relations +except ImportError: + # catching ImportErrors to avoid making minikanren a hard-dep + pass +else: + __all__ += ["relations"] + # }}} diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 7523c11d7..cdd232854 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -195,6 +195,28 @@ def with_new_ast(self, codegen_state, new_ast): self.current_program(codegen_state).copy( ast=new_ast)) + def get_idis_for_subkernel(self, kernel, name): + """ + Returns a :class:`list` of :class:`~loopy.codegen.ImplementedDataInfo` for + the subkernel named *name*. + + :arg kernel: An instance of :class:`loopy.LoopKernel`. + """ + from loopy.schedule.tools import get_callkernel_dependencies + from loopy.kernel.data import InameArg + name2idi = {idi.name: idi for idi in self.implemented_data_info} + subknl_deps = get_callkernel_dependencies(kernel, name) + return [idi + for idi in self.implemented_data_info + if (idi.name in subknl_deps + or idi.arg_class is InameArg + or idi.base_name in subknl_deps + or (idi.offset_for_name is not None + and name2idi[idi.offset_for_name].base_name in subknl_deps) + or (idi.stride_for_name_and_axis is not None + and idi.stride_for_name_and_axis[0] in subknl_deps))] + + # }}} diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index edf5f0333..31dc46e1d 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -2070,4 +2070,67 @@ def get_outer_params(domains): # }}} +# {{{ get access map from an instruction + +class _IndexCollector(CombineMapper): + def __init__(self, var): + self.var = var + super().__init__() + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_subscript(self, expr): + if expr.aggregate.name == self.var: + return (super().map_subscript(expr) | frozenset([expr.index_tuple])) + else: + return super().map_subscript(expr) + + def map_algebraic_leaf(self, expr): + return frozenset() + + 
map_constant = map_algebraic_leaf + + +def _project_out_inames_from_maps(amaps, inames_to_project_out): + new_amaps = [] + for amap in amaps: + for iname in inames_to_project_out: + dt, pos = amap.get_var_dict()[iname] + amap = amap.project_out(dt, pos, 1) + + new_amaps.append(amap) + + return new_amaps + + +def _union_amaps(amaps): + import islpy as isl + return reduce(isl.Map.union, amaps[1:], amaps[0]) + + +def get_insn_access_map(kernel, insn_id, var, inner_inames): + from loopy.transform.subst import expand_subst + from loopy.match import Id + from loopy.symbolic import get_access_map + + insn = kernel.id_to_insn[insn_id] + + kernel = expand_subst(kernel, within=Id(insn_id)) + indices = list(_IndexCollector(var)((insn.expression, + insn.assignees, + tuple(insn.predicates)))) + + amaps = _project_out_inames_from_maps( + [get_access_map(kernel.get_inames_domain(insn.within_inames), + idx, kernel.assumptions) + + for idx in indices], + inner_inames) + + return _union_amaps(amaps) + +# }}} + # vim: foldmethod=marker diff --git a/loopy/relations.py b/loopy/relations.py new file mode 100644 index 000000000..5d47bfa1d --- /dev/null +++ b/loopy/relations.py @@ -0,0 +1,122 @@ +from kanren import Relation, facts + + +def get_inameo(kernel): + inameo = Relation() + for iname in kernel.all_inames(): + facts(inameo, (iname,)) + return inameo + + +def get_argo(kernel): + argo = Relation() + for arg in kernel.args: + facts(argo, (arg.name,)) + + return argo + + +def get_tempo(kernel): + tempo = Relation() + for tv in kernel.temporary_variables: + facts(tempo, (tv,)) + + return tempo + + +def get_insno(kernel): + insno = Relation() + for insn in kernel.instructions: + facts(insno, (insn.id,)) + + return insno + + +def get_taggedo(kernel): + taggedo = Relation() + + for arg_name, arg in kernel.arg_dict.items(): + for tag in arg.tags: + facts(taggedo, (arg_name, tag)) + + for iname_name, iname in kernel.inames.items(): + for tag in iname.tags: + facts(taggedo, (iname_name, 
tag)) + + for insn in kernel.instructions: + for tag in insn.tags: + facts(taggedo, (insn.id, tag)) + + return taggedo + + +def get_taggedo_of_type(kernel, tag_type): + taggedo = Relation() + + for arg_name, arg in kernel.arg_dict.items(): + for tag in arg.tags_of_type(tag_type): + facts(taggedo, (arg_name, tag)) + + for iname_name, iname in kernel.inames.items(): + for tag in iname.tags_of_type(tag_type): + facts(taggedo, (iname_name, tag)) + + for insn in kernel.instructions: + for tag in insn.tags_of_type(tag_type): + facts(taggedo, (insn.id, tag)) + + return taggedo + + +def get_producero(kernel): + producero = Relation() + + for insn in kernel.instructions: + for var in insn.assignee_var_names(): + facts(producero, (insn.id, var)) + + return producero + + +def get_consumero(kernel): + consumero = Relation() + + for insn in kernel.instructions: + for var in insn.read_dependency_names(): + facts(consumero, (insn.id, var)) + + return consumero + + +def get_withino(kernel): + withino = Relation() + + for insn in kernel.instructions: + facts(withino, (insn.id, insn.within_inames)) + + return withino + + +def get_reduce_insno(kernel): + reduce_insno = Relation() + + for insn in kernel.instructions: + if insn.reduction_inames(): + facts(reduce_insno, (insn.id,)) + + return reduce_insno + + +def get_reduce_inameo(kernel): + from functools import reduce + reduce_inameo = Relation() + + for iname in reduce(frozenset.union, + (insn.reduction_inames() + for insn in kernel.instructions), + frozenset()): + facts(reduce_inameo, (iname,)) + + return reduce_inameo + +# vim: fdm=marker diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index b46f04826..98c3050b4 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -809,8 +809,161 @@ def is_similar_to_template(insn): # {{{ scheduling algorithm -def generate_loop_schedules_internal( - sched_state, debug=None): +def _get_outermost_diverging_inames(tree, within1, within2): + """ + For loop 
nestings *within1* and *within2*, returns the first inames at which + the loops nests diverge in the loop nesting tree *tree*. + + :arg tree: A :class:`loopy.tools.Tree` of inames, denoting a loop nesting. + :arg within1: A :class:`frozenset` of inames. + :arg within2: A :class:`frozenset` of inames. + """ + common_ancestors = (within1 & within2) | {""} + + innermost_parent = max(common_ancestors, + key=lambda k: tree.depth(k)) + iname1, = tree.children(innermost_parent) & within1 + iname2, = tree.children(innermost_parent) & within2 + + return iname1, iname2 + + +class V2SchedulerNotImplementedException(RuntimeError): + pass + + +def generate_loop_schedules_v2(kernel): + from loopy.schedule.tools import get_loop_nest_tree + from functools import reduce + from pytools.graph import compute_topological_order + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + + concurrent_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, ConcurrentTag)} + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + vec_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, VectorizeTag)} + parallel_inames = (concurrent_inames - ilp_inames - vec_inames) + + # {{{ can v2 scheduler handle?? 
+ + if any(len(insn.conflicts_with_groups) != 0 for insn in kernel.instructions): + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " kernels with instruction having conflicts with groups.") + + if any(insn.priority != 0 for insn in kernel.instructions): + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " kernels with instruction priorities set.") + + if kernel.linearization is not None: + # cannnot handle preschedule yet + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " prescheduled kernels.") + + if ilp_inames or vec_inames: + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " loops tagged with 'ilp'/'vec' as they are not guaranteed to" + " be single entry loops.") + + # }}} + + loop_nest_tree = get_loop_nest_tree(kernel) + + # loop_inames: inames that are realized as loops. Concurrent inames aren't + # realized as a loop in the generated code for a loopy.TargetBase. + loop_inames = (reduce(frozenset.union, (insn.within_inames + for insn in kernel.instructions), + frozenset()) + - parallel_inames) + + # The idea here is to build a DAG, where nodes are schedule items and if + # there exists an edge from schedule item A to schedule item B in the DAG => + # B *must* come after A in the linearized result. 
+ + dag = {} + + # LeaveLoop(i) *must* follow EnterLoop(i) + dag.update({EnterLoop(iname=iname): frozenset({LeaveLoop(iname=iname)}) + for iname in loop_inames}) + dag.update({LeaveLoop(iname=iname): frozenset() + for iname in loop_inames}) + dag.update({RunInstruction(insn_id=insn.id): frozenset() + for insn in kernel.instructions}) + + # {{{ add constraints imposed by the loop nesting + + for outer_loop in loop_nest_tree.nodes(): + if outer_loop == "": + continue + + for child in loop_nest_tree.children(outer_loop): + inner_loop = child + dag[EnterLoop(iname=outer_loop)] |= {EnterLoop(iname=inner_loop)} + dag[LeaveLoop(iname=inner_loop)] |= {LeaveLoop(iname=outer_loop)} + + # }}} + + # {{{ add deps. b/w schedule items coming from insn. depepdencies + + for insn in kernel.instructions: + insn_loop_inames = insn.within_inames & loop_inames + for dep_id in insn.depends_on: + dep = kernel.id_to_insn[dep_id] + dep_loop_inames = dep.within_inames & loop_inames + # Enforce instruction dep: + dag[RunInstruction(insn_id=dep_id)] |= {RunInstruction(insn_id=insn.id)} + + # {{{ register deps on loop entry/leave because of insn. 
deps + + if dep_loop_inames < insn_loop_inames: + for iname in insn_loop_inames - dep_loop_inames: + dag[RunInstruction(insn_id=dep.id)] |= {EnterLoop(iname=iname)} + elif insn_loop_inames < dep_loop_inames: + for iname in dep_loop_inames - insn_loop_inames: + dag[LeaveLoop(iname=iname)] |= {RunInstruction(insn_id=insn.id)} + elif dep_loop_inames != insn_loop_inames: + insn_iname, dep_iname = _get_outermost_diverging_inames( + loop_nest_tree, insn_loop_inames, dep_loop_inames) + dag[LeaveLoop(iname=dep_iname)] |= {EnterLoop(iname=insn_iname)} + else: + pass + + # }}} + + for iname in insn_loop_inames: + # For an insn within a loop nest 'i' + # for i + # insn + # end i + # 'insn' *must* come b/w 'for i' and 'end i' + dag[EnterLoop(iname=iname)] |= {RunInstruction(insn_id=insn.id)} + dag[RunInstruction(insn_id=insn.id)] |= {LeaveLoop(iname=iname)} + + # }}} + + def iname_key(iname): + all_ancestors = sorted(loop_nest_tree.ancestors(iname), + key=lambda x: loop_nest_tree.depth(x)) + return ",".join(all_ancestors+[iname]) + + def key(x): + if isinstance(x, RunInstruction): + iname = max((kernel.id_to_insn[x.insn_id].within_inames & loop_inames), + key=lambda k: loop_nest_tree.depth(k), + default="") + result = (iname_key(iname), x.insn_id) + elif isinstance(x, (EnterLoop, LeaveLoop)): + result = (iname_key(x.iname),) + else: + raise NotImplementedError + + return result + + return compute_topological_order(dag, key=key) + + +def generate_loop_schedules_internal(sched_state, debug=None): # allow_insn is set to False initially and after entering each loop # to give loops containing high-priority instructions a chance. 
kernel = sched_state.kernel @@ -1955,6 +2108,42 @@ def generate_loop_schedules(kernel, callables_table, debug_args=None): callables_table, debug_args=debug_args) +def postprocess_schedule(kernel, callables_table, gen_sched): + + from loopy.kernel import KernelState + gen_sched = convert_barrier_instructions_to_barriers( + kernel, gen_sched) + + gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table, + return_dict=True) + + if (gsize or lsize): + if not kernel.options.disable_global_barriers: + logger.debug("%s: barrier insertion: global" % kernel.name) + gen_sched = insert_barriers(kernel, callables_table, gen_sched, + synchronization_kind="global", + verify_only=(not + kernel.options.insert_gbarriers)) + + logger.debug("%s: barrier insertion: local" % kernel.name) + gen_sched = insert_barriers(kernel, callables_table, gen_sched, + synchronization_kind="local", verify_only=False) + logger.debug("%s: barrier insertion: done" % kernel.name) + + new_kernel = kernel.copy( + linearization=gen_sched, + state=KernelState.LINEARIZED) + + from loopy.schedule.device_mapping import \ + map_schedule_onto_host_or_device + if kernel.state != KernelState.LINEARIZED: + # Device mapper only gets run once. 
+ new_kernel = map_schedule_onto_host_or_device(new_kernel) + + from loopy.schedule.tools import add_extra_args_to_schedule + return add_extra_args_to_schedule(new_kernel) + + def generate_loop_schedules_inner(kernel, callables_table, debug_args=None): if debug_args is None: debug_args = {} @@ -1964,6 +2153,14 @@ def generate_loop_schedules_inner(kernel, callables_table, debug_args=None): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") + try: + gen_sched = generate_loop_schedules_v2(kernel) + yield postprocess_schedule(kernel, callables_table, gen_sched) + return + except V2SchedulerNotImplementedException as e: + from warnings import warn + warn(f"Falling back to a slow scheduler implementation due to: {e}") + schedule_count = 0 debug = ScheduleDebugger(**debug_args) @@ -2073,37 +2270,7 @@ def print_longest_dead_end(): sched_state, debug=debug, **schedule_gen_kwargs): debug.stop() - gen_sched = convert_barrier_instructions_to_barriers( - kernel, gen_sched) - - gsize, lsize = kernel.get_grid_size_upper_bounds(callables_table, - return_dict=True) - - if (gsize or lsize): - if not kernel.options.disable_global_barriers: - logger.debug("%s: barrier insertion: global" % kernel.name) - gen_sched = insert_barriers(kernel, callables_table, gen_sched, - synchronization_kind="global", - verify_only=(not - kernel.options.insert_gbarriers)) - - logger.debug("%s: barrier insertion: local" % kernel.name) - gen_sched = insert_barriers(kernel, callables_table, gen_sched, - synchronization_kind="local", verify_only=False) - logger.debug("%s: barrier insertion: done" % kernel.name) - - new_kernel = kernel.copy( - linearization=gen_sched, - state=KernelState.LINEARIZED) - - from loopy.schedule.device_mapping import \ - map_schedule_onto_host_or_device - if kernel.state != KernelState.LINEARIZED: - # Device mapper only gets run once. 
- new_kernel = map_schedule_onto_host_or_device(new_kernel) - - from loopy.schedule.tools import add_extra_args_to_schedule - new_kernel = add_extra_args_to_schedule(new_kernel) + new_kernel = postprocess_schedule(kernel, callables_table, gen_sched) yield new_kernel debug.start() diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index d021b0aa4..46b308bea 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -20,10 +20,15 @@ THE SOFTWARE. """ +import islpy as isl from loopy.kernel.data import AddressSpace -from pytools import memoize_method import islpy as isl import enum +from loopy.diagnostic import LoopyError +from loopy.tools import Tree +from functools import reduce +from pytools import memoize_method, memoize_on_first_arg +from pyrsistent import pmap # {{{ block boundary finder @@ -54,10 +59,18 @@ def get_block_boundaries(schedule): def temporaries_read_in_subkernel(kernel, subkernel): from loopy.kernel.tools import get_subkernel_to_insn_id_map insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] - return frozenset(tv - for insn_id in insn_ids - for tv in kernel.id_to_insn[insn_id].read_dependency_names() - if tv in kernel.temporary_variables) + inames = frozenset().union(*(kernel.insn_inames(insn_id) + for insn_id in insn_ids)) + domain_idxs = {kernel.get_home_domain_index(iname) for iname in inames} + params = frozenset().union(*( + kernel.domains[dom_idx].get_var_names(isl.dim_type.param) + for dom_idx in domain_idxs)) + + return (frozenset(tv + for insn_id in insn_ids + for tv in kernel.id_to_insn[insn_id].read_dependency_names() + if tv in kernel.temporary_variables) + | (params & frozenset(kernel.temporary_variables))) def temporaries_written_in_subkernel(kernel, subkernel): @@ -68,6 +81,63 @@ def temporaries_written_in_subkernel(kernel, subkernel): for tv in kernel.id_to_insn[insn_id].write_dependency_names() if tv in kernel.temporary_variables) + +def args_read_in_subkernel(kernel, subkernel): + from 
loopy.kernel.tools import get_subkernel_to_insn_id_map + insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] + inames = frozenset().union(*(kernel.insn_inames(insn_id) + for insn_id in insn_ids)) + domain_idxs = {kernel.get_home_domain_index(iname) for iname in inames} + params = frozenset().union(*( + kernel.domains[dom_idx].get_var_names(isl.dim_type.param) + for dom_idx in domain_idxs)) + return (frozenset(arg + for insn_id in insn_ids + for arg in kernel.id_to_insn[insn_id].read_dependency_names() + if arg in kernel.arg_dict) + | (params & frozenset(kernel.arg_dict))) + + +def args_written_in_subkernel(kernel, subkernel): + from loopy.kernel.tools import get_subkernel_to_insn_id_map + insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel] + return frozenset(arg + for insn_id in insn_ids + for arg in kernel.id_to_insn[insn_id].write_dependency_names() + if arg in kernel.arg_dict) + + +def get_callkernel_dependencies(kernel, subkernel): + """ + Returns variable names referenced by :class:`~loopy.schedule.CallKernel` + named *subkernel*. 
+ """ + from loopy.symbolic import IdentityMapper + from loopy.kernel.array import ArrayBase + + class VariableNoter(IdentityMapper): + def __init__(self): + self.deps = set() + super().__init__() + + def map_variable(self, expr): + self.deps.add(expr.name) + return super().map_variable(expr) + + var_names = (temporaries_read_in_subkernel(kernel, subkernel) + | temporaries_written_in_subkernel(kernel, subkernel) + | args_read_in_subkernel(kernel, subkernel) + | args_written_in_subkernel(kernel, subkernel)) + + noter = VariableNoter() + + for var_name in var_names: + var = kernel.arg_dict.get(var_name, kernel.temporary_variables.get(var_name)) + if isinstance(var, ArrayBase): + var.map_exprs(noter) + + return var_names | frozenset(noter.deps) + # }}} @@ -364,3 +434,413 @@ def do_accesses_result_in_races(self, insn1, insn1_dir, insn2, insn2_dir, self.kernel, self.callables_table) # }}} + + +def _pull_out_loop_nest(tree, loop_nests, inames_to_pull_out): + """ + Returns a copy of *tree* that realizes *inames_to_pull_out* as loop + nesting. + + :arg tree: A :class:`loopy.tools.Tree`, where each node is + :class:`frozenset` of inames representing a loop nest. For example a + tree might look like: + + :arg loop_nests: A collection of nodes in *tree* that cover + *inames_to_pull_out*. + + :returns: a :class:`tuple` ``(new_tree, outer_loop_nest, inner_loop_nest)``, + where outer_loop_nest is the identifier for the new outer and inner + loop nests so that *inames_to_pull_out* is a valid nesting. + + .. note:: + + We could compute *loop_nests* within this routine's implementation, but + computing would be expensive and hence we ask the caller for this info. 
+ + Example:: + *tree*: frozenset() + └── frozenset({'j', 'i'}) + └── frozenset({'k', 'l'}) + + *inames_to_pull_out*: frozenset({'k', 'i', 'j'}) + *loop_nests*: {frozenset({'j', 'i'}), frozenset({'k', 'l'})} + + Returns: + + *new_tree*: frozenset() + └── frozenset({'j', 'i'}) + └── frozenset({'k'}) + └── frozenset({'l'}) + + *outer_loop_nest*: frozenset({'k'}) + *inner_loop_nest*: frozenset({'l'}) + """ + assert all(isinstance(loop_nest, frozenset) for loop_nest in loop_nests) + assert inames_to_pull_out <= reduce(frozenset.union, loop_nests, frozenset()) + + # {{{ sanity check to ensure the loop nest *inames_to_pull_out* is possible + + loop_nests = sorted(loop_nests, key=lambda nest: tree.depth(nest)) + + for outer, inner in zip(loop_nests[:-1], loop_nests[1:]): + if outer != tree.parent(inner): + raise LoopyError(f"Cannot schedule loop nest {inames_to_pull_out} " + f" in the nesting tree:\n{tree}") + + assert tree.depth(loop_nests[0]) == 0 + + # }}} + + innermost_loop_nest = loop_nests[-1] + new_outer_loop_nest = inames_to_pull_out - reduce(frozenset.union, + loop_nests[:-1], + frozenset()) + new_inner_loop_nest = innermost_loop_nest - inames_to_pull_out + + if new_outer_loop_nest == innermost_loop_nest: + # such a loop nesting already exists => do nothing + return tree, new_outer_loop_nest, None + + # add the outer loop to our loop nest tree + tree = tree.add_node(new_outer_loop_nest, + parent=tree.parent(innermost_loop_nest)) + + # rename the old loop to the inner loop + tree = tree.rename_node(innermost_loop_nest, + new_id=new_inner_loop_nest) + + # set the parent of inner loop to be the outer loop + tree = tree.move_node(new_inner_loop_nest, new_parent=new_outer_loop_nest) + + return tree, new_outer_loop_nest, new_inner_loop_nest + + +def _add_inner_loops(tree, outer_loop_nest, inner_loop_nest): + """ + Returns a copy of *tree* that nests *inner_loop_nest* inside *outer_loop_nest*. 
+ """ + # add the outer loop to our loop nest tree + return tree.add_node(inner_loop_nest, parent=outer_loop_nest) + + +def _order_loop_nests(loop_nest_tree, + strict_priorities, + relaxed_priorities, + iname_to_tree_node_id): + """ + Returns a loop nest where all nodes in the tree are instances of + :class:`str` denoting inames. Unlike *loop_nest_tree* which corresponds to + multiple loop nesting, this routine returns a unique loop nest that is + obtained after constraining *loop_nest_tree* with the constraints enforced + by *priorities*. + + :arg strict_priorities: Expresses strict nesting constraints similar to + :attr:`loopy.LoopKernel.loop_priorities`. These priorities are imposed + strictly i.e. if these conditions cannot be met a + :class:`loopy.diagnostic.LoopyError` is raised. + + :arg relaxed_priorities: Expresses strict nesting constraints similar to + :attr:`loopy.LoopKernel.loop_priorities`. These nesting constraints are + treated as options. + + :arg iname_to_tree_node_id: A mapping from iname to the loop nesting its a + part of. + """ + from pytools.graph import compute_topological_order as toposort + from warnings import warn + + loop_nests = set(iname_to_tree_node_id.values()) + + # flow_requirements: A mapping from the loop nest level to the nesting + # constraints applicable to it. + # Each nesting constraint is represented as a DAG. In the DAG, if there + # exists an edge from from iname 'i' -> iname 'j' => 'j' should be nested + # inside 'i'. + flow_requirements = {loop_nest: {iname: frozenset() + for iname in loop_nest} + for loop_nest in loop_nests} + + # The plan here is populate DAGs in *flow_requirements* and then perform a + # toposort for each loop nest. + + def _update_flow_requirements(priorities, cannot_satisfy_callback): + """ + Records *priorities* in *flow_requirements* and calls + *cannot_satisfy_callback* with an appropriate error message if the + priorities cannot be met. 
+ """ + for priority in priorities: + for outer_iname, inner_iname in zip(priority[:-1], priority[1:]): + if inner_iname not in iname_to_tree_node_id: + cannot_satisfy_callback(f"Cannot enforce the constraint:" + f" {inner_iname} to be nested within" + f" {outer_iname}, as {inner_iname}" + f" is either a parallel loop or" + f" not an iname.") + continue + + if outer_iname not in iname_to_tree_node_id: + cannot_satisfy_callback(f"Cannot enforce the constraint:" + f" {inner_iname} to be nested within" + f" {outer_iname}, as {outer_iname}" + f" is either a parallel loop or" + f" not an iname.") + continue + + inner_iname_nest = iname_to_tree_node_id[inner_iname] + outer_iname_nest = iname_to_tree_node_id[outer_iname] + + if inner_iname_nest == outer_iname_nest: + flow_requirements[inner_iname_nest][outer_iname] |= {inner_iname} + else: + ancestors_of_inner_iname = (loop_nest_tree + .ancestors(inner_iname_nest)) + ancestors_of_outer_iname = (loop_nest_tree + .ancestors(outer_iname_nest)) + if outer_iname in ancestors_of_inner_iname: + # nesting constraint already satisfied => do nothing + pass + elif inner_iname in ancestors_of_outer_iname: + cannot_satisfy_callback("Cannot satisfy constraint that" + f" iname '{inner_iname}' must be" + f" nested within '{outer_iname}''.") + else: + # inner iname and outer iname are indirect family members + # => must be realized via dependencies in the linearization + # phase, not implemented in v2-scheduler yet. + from loopy.schedule import V2SchedulerNotImplementedException + raise V2SchedulerNotImplementedException("cannot" + " schedule kernels with priority dependencies" + " between sibling loop nests") + + def _raise_loopy_err(x): + raise LoopyError(x) + + # record strict priorities + _update_flow_requirements(strict_priorities, _raise_loopy_err) + # record relaxed priorities + _update_flow_requirements(relaxed_priorities, warn) + + # ordered_loop_nests: A mapping from the unordered loop nests to their + # ordered couterparts. 
For example. If we had only one loop nest + # `frozenset({"i", "j", "k"})`, and the prioirities said added the + # constraint that "i" must be nested within "k", then `ordered_loop_nests` + # would be: `{frozenset({"i", "j", "k"}): ["j", "k", "i"]}` i.e. the loop + # nests would now have an order. + ordered_loop_nests = {unordered_nest: toposort(flow, + key=lambda x: x) + for unordered_nest, flow in flow_requirements.items()} + + # {{{ combine 'loop_nest_tree' along with 'ordered_loop_nest_tree' + + assert loop_nest_tree.root == frozenset() + + new_tree = Tree.from_root("") + + old_to_new_parent = {} + + old_to_new_parent[loop_nest_tree.root] = "" + + # traversing 'tree' in an BFS fashion to create 'new_tree' + queue = list(loop_nest_tree.children(loop_nest_tree.root)) + + while queue: + current_nest = queue.pop(0) + + ordered_nest = ordered_loop_nests[current_nest] + new_tree = new_tree.add_node(ordered_nest[0], + parent=old_to_new_parent[loop_nest_tree + .parent(current_nest)]) + for new_parent, new_child in zip(ordered_nest[:-1], ordered_nest[1:]): + new_tree = new_tree.add_node(node=new_child, parent=new_parent) + + old_to_new_parent[current_nest] = ordered_nest[-1] + + queue.extend(list(loop_nest_tree.children(current_nest))) + + # }}} + + return new_tree + + +@memoize_on_first_arg +def _get_parallel_inames(kernel): + from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag + + concurrent_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, ConcurrentTag)} + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + vec_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, VectorizeTag)} + return (concurrent_inames - ilp_inames - vec_inames) + + +def _get_partial_loop_nest_tree(kernel): + """ + Returns :class:`loopy.Tree` representing the *kernel*'s loop-nests. + + Each node of the returned tree has a :class:`frozenset` of inames. 
+ All the inames in the identifier of a parent node of a loop nest in the + tree must be nested outside all the iname in identifier of the loop nest. + + .. note:: + + This routine only takes into account the nesting dependency + constraints of :attr:`loopy.InstructionBase.within_inames` of all the + *kernel*'s instructions and the iname tags. This routine does *NOT* + include the nesting constraints imposed by the dependencies between the + instructions and the dependencies imposed by the kernel's domain tree. + """ + from loopy.kernel.data import IlpBaseTag + + # figuring the possible loop nestings minus the concurrent_inames as they + # are never realized as actual loops + iname_chains = {insn.within_inames - _get_parallel_inames(kernel) + for insn in kernel.instructions} + + root = frozenset() + tree = Tree.from_root(root) + + # mapping from iname to the innermost loop nest they are part of in *tree*. + iname_to_tree_node_id = {} + + # if there were any loop with no inames, those have been already account + # for as the root. + iname_chains = iname_chains - {root} + + for iname_chain in iname_chains: + not_seen_inames = frozenset(iname for iname in iname_chain + if iname not in iname_to_tree_node_id) + seen_inames = iname_chain - not_seen_inames + + all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames} + + tree, outer_loop, inner_loop = _pull_out_loop_nest(tree, + (all_nests + | {frozenset()}), + seen_inames) + if not_seen_inames: + # make '_not_seen_inames' nest inside the seen ones. + # example: if there is already a loop nesting "i,j,k" + # and the current iname chain is "i,j,l". Only way this is possible + # is if "l" is nested within "i,j"-loops. 
+ tree = _add_inner_loops(tree, outer_loop, not_seen_inames) + + # {{{ update iname to node id + + for iname in outer_loop: + iname_to_tree_node_id[iname] = outer_loop + + if inner_loop is not None: + for iname in inner_loop: + iname_to_tree_node_id[iname] = inner_loop + + for iname in not_seen_inames: + iname_to_tree_node_id[iname] = not_seen_inames + + # }}} + + # {{{ make ILP tagged inames innermost + + ilp_inames = {iname for iname in kernel.all_inames() + if kernel.iname_tags_of_type(iname, IlpBaseTag)} + + for iname_chain in iname_chains: + for ilp_iname in (ilp_inames & iname_chains): + # pull out other loops so that ilp_iname is the innermost + all_nests = {iname_to_tree_node_id[iname] for iname in seen_inames} + tree, outer_loop, inner_loop = _pull_out_loop_nest(tree, + (all_nests + | {frozenset()}), + (iname_chain + - {ilp_iname})) + + for iname in outer_loop: + iname_to_tree_node_id[iname] = outer_loop + + if inner_loop is not None: + for iname in inner_loop: + iname_to_tree_node_id[iname] = inner_loop + + # }}} + + return tree + + +def _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree): + """ + Returns the mapping from the iname to the *tree*'s node that it was a part + of. + + :arg tree: A partial loop nest tree. + """ + iname_to_tree_node_id = {} + for node in tree.nodes(): + assert isinstance(node, frozenset) + for iname in node: + iname_to_tree_node_id[iname] = node + + return pmap(iname_to_tree_node_id) + + +def get_loop_nest_tree(kernel): + """ + Returns ```tree``` (an instance of :class:`Tree`) representing the loop + nesting for *kernel*. Each node of ``tree`` is an instance of :class:`str` + corresponding to the inames of *kernel* that are realized as concrete + ``for-loops``. A parent node in `tree` is always nested outside all its + children. + + .. note:: + + Multiple loop nestings might exist for *kernel*, but this routine returns + one valid loop nesting. 
+ """ + from islpy import dim_type + + tree = _get_partial_loop_nest_tree(kernel) + iname_to_tree_node_id = ( + _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree)) + + strict_loop_priorities = frozenset() + + # {{{ impose constraints by the domain tree + + loop_inames = (reduce(frozenset.union, + (insn.within_inames + for insn in kernel.instructions), + frozenset()) + - _get_parallel_inames(kernel)) + + for dom in kernel.domains: + for outer_iname in set(dom.get_var_names(dim_type.param)): + if outer_iname not in loop_inames: + continue + + for inner_iname in dom.get_var_names(dim_type.set): + if inner_iname not in loop_inames: + continue + + # either outer_iname and inner_iname should belong to the same + # loop nest level or outer should be strictly outside inner + # iname + inner_iname_nest = iname_to_tree_node_id[inner_iname] + outer_iname_nest = iname_to_tree_node_id[outer_iname] + + if inner_iname_nest == outer_iname_nest: + strict_loop_priorities |= {(outer_iname, inner_iname)} + else: + ancestors_of_inner_iname = tree.ancestors(inner_iname_nest) + if outer_iname_nest not in ancestors_of_inner_iname: + raise LoopyError(f"Loop '{outer_iname}' cannot be nested" + f" outside '{inner_iname}'.") + + # }}} + + return _order_loop_nests(tree, + strict_loop_priorities, + kernel.loop_priority, + iname_to_tree_node_id) + +# vim: fdm=marker diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 5cd0931de..4cdcb2fde 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -852,6 +852,9 @@ def idi_to_cgen_declarator(self, kernel, idi): def get_function_declaration(self, codegen_state, codegen_result, schedule_index): + kernel = codegen_state.kernel + subkernel = codegen_state.kernel.schedule[schedule_index].kernel_name + from cgen import FunctionDeclaration, Value name = codegen_result.current_program(codegen_state).name @@ -865,8 +868,10 @@ def get_function_declaration(self, codegen_state, codegen_result, return 
FunctionDeclarationWrapper( FunctionDeclaration( name, - [self.idi_to_cgen_declarator(codegen_state.kernel, idi) - for idi in codegen_state.implemented_data_info])) + [self.idi_to_cgen_declarator(kernel, idi) + for idi in codegen_result.get_idis_for_subkernel(kernel, + subkernel)] + )) def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): return None diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 81f1e0c11..2cbe76baa 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -446,6 +446,7 @@ def program_info(self, entrypoint, arg_to_dtype_set=frozenset(), entrypoint, arg_to_dtype_set) from loopy.codegen import generate_code_v2 + from loopy.schedule.tools import get_callkernel_dependencies codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() @@ -472,9 +473,14 @@ def program_info(self, entrypoint, arg_to_dtype_set=frozenset(), c_kernels = [] for dp in codegen_result.device_programs: - c_kernels.append(CompiledCKernel(dp, - codegen_result.implemented_data_infos[entrypoint], all_code, - self.program.target, self.compiler)) + all_args = [ + arg + for arg in codegen_result.implemented_data_infos[entrypoint] + if arg.name in get_callkernel_dependencies(program[entrypoint], + dp.name)] + c_kernels.append(CompiledCKernel(dp, all_args, all_code, + self.program.target, + self.compiler)) return _KernelInfo( program=program, diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 37951d474..05c6b3b92 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -578,8 +578,7 @@ def map_nan(self, expr, type_context): else: if isinstance(expr.data_type(float("nan")), np.float32): return p.Variable("NAN") - elif isinstance(expr.data_type(float("nan")), (np.float64, - np.float128)): + elif isinstance(expr.data_type(float("nan")), np.floating): registry = 
self.codegen_state.ast_builder.target.get_dtype_registry() lpy_type = NumpyType(np.dtype(expr.data_type)) cast = var("(%s)" % registry.dtype_to_ctype(lpy_type)) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 8b36dd339..79ed1bbc5 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -766,6 +766,11 @@ def alloc_nbytes(tv): return code_lines def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + from loopy.schedule.tools import get_callkernel_dependencies + from loopy.kernel.data import InameArg + name2idi = {idi.name: idi for idi in (codegen_state.implemented_data_info + + extra_args)} + subknl_deps = get_callkernel_dependencies(codegen_state.kernel, name) ecm = self.get_expression_to_code_mapper(codegen_state) if not gsize: @@ -773,7 +778,17 @@ def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): if not lsize: lsize = (1,) - all_args = codegen_state.implemented_data_info + extra_args + all_args = [arg + for arg in (codegen_state.implemented_data_info + extra_args) + if (arg.name in subknl_deps + or arg.arg_class is InameArg + or arg.base_name in subknl_deps + or (arg.offset_for_name is not None + and (name2idi[arg.offset_for_name].base_name + in subknl_deps)) + or arg.offset_for_name in subknl_deps + or (arg.stride_for_name_and_axis is not None + and arg.stride_for_name_and_axis[0] in subknl_deps))] value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \ generate_value_arg_setup( diff --git a/loopy/tools.py b/loopy/tools.py index 50c9e4bd0..e0fe9bc5f 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -30,6 +30,9 @@ from pymbolic.mapper.persistent_hash import ( PersistentHashWalkMapper as PersistentHashWalkMapperBase) from sys import intern +from typing import FrozenSet, Generic, TypeVar, Iterator +from pyrsistent import PMap, pmap +from dataclasses import dataclass import logging logger = logging.getLogger(__name__) @@ -920,6 +923,225 @@ def _get_persistent_hashable_arg(arg): return 
wrapper + +# {{{ tree data structure + +T = TypeVar("T") + + +@dataclass(frozen=True) +class Tree(Generic[T]): + """ + An immutable tree implementation. + + .. automethod:: ancestors + .. automethod:: parent + .. automethod:: children + .. automethod:: depth + .. automethod:: rename_node + .. automethod:: move_node + + .. note:: + + Almost all the operations are implemented recursively. NOT suitable for + deep trees. At the very least if the Python implementation is CPython + this allocates a new stack frame for each iteration of the operation. + """ + _parent_to_children: "PMap[T, FrozenSet[T]]" + _child_to_parent: "PMap[T, Optional[T]]" + + @staticmethod + def from_root(root: T): + return Tree(pmap({root: frozenset()}), + pmap({root: None})) + + @property + def root(self) -> T: + guess = set(self._child_to_parent).pop() + while self.parent(guess) is not None: + guess = self.parent(guess) + + return guess + + def ancestors(self, node: T) -> "FrozenSet[T]": + """ + Returns a :class:`frozenset` of nodes that are ancestors of *node*. 
+ """ + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + if self.is_root(node): + # => root + return frozenset() + + parent = self._child_to_parent[node] + + return frozenset([parent]) | self.ancestors(parent) + + def parent(self, node: T) -> "Optional[T]": + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + return self._child_to_parent[node] + + def children(self, node: T) -> "FrozenSet[T]": + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + return self._parent_to_children[node] + + def depth(self, node: T) -> int: + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + if self.is_root(node): + # => None + return 0 + + return 1 + self.depth(self.parent(node)) + + def is_root(self, node: T) -> bool: + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + return self.parent(node) is None + + def is_leaf(self, node: T) -> bool: + if not self.is_a_node(node): + raise ValueError(f"'{node}' not in tree.") + + return len(self.children(node)) == 0 + + def is_a_node(self, node: T) -> bool: + return node in self._child_to_parent + + def add_node(self, node: T, parent: T) -> "Tree[T]": + """ + Returns a :class:`Tree` with added node *node* having a parent + *parent*. + """ + if self.is_a_node(node): + raise ValueError(f"'{node}' already present in tree.") + + siblings = self._parent_to_children[parent] + + return Tree((self._parent_to_children + .set(parent, siblings | frozenset([node])) + .set(node, frozenset())), + self._child_to_parent.set(node, parent)) + + def rename_node(self, node: T, new_id: T) -> "Tree[T]": + """ + Returns a copy of *self* with *node* renamed to *new_id*. 
+ """ + if not self.is_a_node(node): + raise ValueError(f"'{node}' not present in tree.") + + if self.is_a_node(new_id): + raise ValueError(f"cannot rename to '{new_id}', as its already a part" + " of the tree.") + + parent = self.parent(node) + children = self.children(node) + + # {{{ update child to parent + + new_child_to_parent = (self._child_to_parent.discard(node) + .set(new_id, parent)) + + for child in children: + new_child_to_parent = (new_child_to_parent + .set(child, new_id)) + + # }}} + + # {{{ update parent_to_children + + new_parent_to_children = (self._parent_to_children + .discard(node) + .set(new_id, self.children(node))) + + if parent is not None: + # update the child's name in the parent's children + new_parent_to_children = (new_parent_to_children + .discard(parent) + .set(parent, ((self.children(parent) + - frozenset([node])) + | frozenset([new_id])))) + + # }}} + + return Tree(new_parent_to_children, + new_child_to_parent) + + def move_node(self, node: T, new_parent: "Optional[T]") -> "Tree[T]": + """ + Returns a copy of *self* with node *node* as a child of *new_parent*. + """ + if self.is_root(node) and new_parent is not None: + raise ValueError("Moving root not allowed.") + + if not self.is_a_node(node): + raise ValueError(f"'{node}' not a part of the tree => cannot move.") + + if not self.is_a_node(new_parent): + raise ValueError(f"Cannot move to '{new_parent}' as it's not in tree.") + + parent = self.parent(node) + siblings = self.children(parent) + parents_new_children = siblings - frozenset([node]) + new_parents_children = self.children(new_parent) | frozenset([node]) + + new_child_to_parent = self._child_to_parent.set(node, new_parent) + new_parent_to_children = (self._parent_to_children + .set(parent, parents_new_children) + .set(new_parent, new_parents_children)) + + return Tree(new_parent_to_children, + new_child_to_parent) + + def __str__(self): + """ + Stringifies the tree by using the box-drawing unicode characters. 
+ + :: + + >>> from loopy.tools import Tree + >>> tree = (Tree.from_root("Root") + ... .add_node("A", "Root") + ... .add_node("B", "Root") + ... .add_node("D", "B") + ... .add_node("E", "B") + ... .add_node("C", "A")) + + >>> print(tree) + Root + ├── A + │ └── C + └── B + ├── D + └── E + """ + def rec(node): + children_result = [rec(c) for c in self.children(node)] + + def post_process_non_last_child(child): + return ["├── " + child[0]] + [f"│ {c}" for c in child[1:]] + + def post_process_last_child(child): + return ["└── " + child[0]] + [f" {c}" for c in child[1:]] + + children_result = ([post_process_non_last_child(c) + for c in children_result[:-1]] + + [post_process_last_child(c) + for c in children_result[-1:]]) + return [str(node)] + sum(children_result, start=[]) + + return "\n".join(rec(self.root)) + + def nodes(self) -> "Iterator[T]": + return iter(self._child_to_parent.keys()) + # }}} # vim: fdm=marker diff --git a/loopy/transform/loop_fusion.py b/loopy/transform/loop_fusion.py new file mode 100644 index 000000000..2cb4be7b5 --- /dev/null +++ b/loopy/transform/loop_fusion.py @@ -0,0 +1,770 @@ +__copyright__ = """ +Copyright (C) 2021 Kaushik Kulkarni +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from loopy.diagnostic import LoopyError +from loopy.symbolic import RuleAwareIdentityMapper +from loopy.kernel import LoopKernel +from typing import FrozenSet, Mapping, Tuple, Dict, Set +from functools import reduce +from dataclasses import dataclass + +__doc__ = """ +.. autofunction:: rename_inames_in_batch +.. autofunction:: get_kennedy_unweighted_fusion_candidates +""" + + +# {{{ Loop Dependence graph class + builder + + +@dataclass(frozen=True, eq=True) +class LoopDependenceGraph: + """ + .. attribute:: successors + + A mapping from iname (``i``) to the collection of inames that can be + scheduled only after the loop corresponding to ``i`` has been exited. + + .. attribute:: predecessors + + A mapping from iname (``i``) to the collection of inames that must have + been exited before entering ``i``. + + .. attribute:: is_infusible + + A mapping from the edges in the loop dependence graph to their + fusibility criterion. An edge in this mapping is represented by a pair + of inames ``(iname_i, iname_j)`` such that the edge ``iname_i -> + iname_j`` is present in the graph. + + .. note:: + + Both :attr:`successors` and :attr:`predecessors` are maintained to + reduce the complexity of graph primitive operations (like remove node, + add edge, etc.).
+ """ + successors: Mapping[str, FrozenSet[str]] + predecessors: Mapping[str, FrozenSet[str]] + is_infusible: Mapping[Tuple[str, str], bool] + + @classmethod + def new(cls, successors, is_infusible): + predecessors = {node: set() + for node in successors} + for node, succs in successors.items(): + for succ in succs: + predecessors[succ].add(node) + + predecessors = {node: frozenset(preds) + for node, preds in predecessors.items()} + successors = {node: frozenset(succs) + for node, succs in successors.items()} + + return LoopDependenceGraph(successors, predecessors, is_infusible) + + def is_empty(self): + """ + Returns *True* only if the loop dependence graph contains no nodes. + """ + return (len(self.successors) == 0) + + def get_loops_with_no_predecessors(self): + return {loop + for loop, preds in self.predecessors.items() + if len(preds) == 0} + + def remove_nodes(self, nodes_to_remove): + """ + Returns a copy of *self* after removing *nodes_to_remove* in the graph. + This routine adds necessary edges after removing *nodes_to_remove* to + conserve the scheduling constraints present in the graph. + """ + # {{{ Step 1. Remove the nodes + + new_successors = {node: succs + for node, succs in self.successors.items() + if node not in nodes_to_remove} + new_predecessors = {node: preds + for node, preds in self.predecessors.items() + if node not in nodes_to_remove} + + new_is_infusible = {(from_, to): v + for (from_, to), v in self.is_infusible.items() + if (from_ not in nodes_to_remove + and to not in nodes_to_remove)} + + # }}} + + # {{{ Step 2. Propagate dependencies + + # For every Node 'R' to be removed and every pair (S, P) such that + # 1. there exists an edge 'P' -> 'R' in the original graph, and, + # 2. there exits an edge 'R' -> 'S' in the original graph. + # add the edge 'P' -> 'S' in the new graph. 
+ + for node_to_remove in nodes_to_remove: + for succ in (self.successors[node_to_remove] + - nodes_to_remove): + new_predecessors[succ] = (new_predecessors[succ] + - frozenset([node_to_remove])) + + for pred in (self.predecessors[node_to_remove] + - nodes_to_remove): + new_successors[pred] = (new_successors[pred] + - frozenset([node_to_remove])) + + # }}} + + return LoopDependenceGraph(new_successors, + new_predecessors, + new_is_infusible) + + +@dataclass +class LoopDependenceGraphBuilder: + """ + A mutable type to act as a helper to instantiate a + :class:`LoopDependenceGraphBuilder`. + """ + _dag: Dict[str, Set[str]] + _is_infusible: Mapping[Tuple[str, str], bool] + + @classmethod + def new(cls, candidates): + return LoopDependenceGraphBuilder({iname: set() + for iname in candidates}, + {}) + + def add_edge(self, from_: str, to: str, is_infusible: bool): + self._dag[from_].add(to) + self._is_infusible[(from_, to)] = (is_infusible + or self._is_infusible.get((from_, to), + False)) + + def done(self): + """ + Returns the built :class:`LoopDependenceGraph`. + """ + return LoopDependenceGraph.new(self._dag, self._is_infusible) + +# }}} + + +# {{{ _build_ldg + +@dataclass(frozen=True, eq=True, repr=True) +class PreLDGNode: + """ + A node in the graph representing the dependencies before building + :class:`LoopDependenceGraph`. 
+ """ + + +@dataclass(frozen=True, eq=True, repr=True) +class CandidateLoop(PreLDGNode): + iname: str + + +@dataclass(frozen=True, eq=True, repr=True) +class NonCandidateLoop(PreLDGNode): + loop_nest: FrozenSet[str] + + +@dataclass(frozen=True, eq=True, repr=True) +class OuterLoopNestStatement(PreLDGNode): + insn_id: str + + +def _remove_non_candidate_pre_ldg_nodes(kernel, + predecessors: Mapping[PreLDGNode, + PreLDGNode], + successors: Mapping[PreLDGNode, + PreLDGNode], + candidates: FrozenSet[str]): + """ + Returns ``(new_successors, new_predecessors, inufusible_edge)`` where + ``(new_successors, new_predecessors)`` is the graph describing the + dependencies between the *candidates* loops that has been obtained by + removing instances of :class:`NonCandidateLoop` and + :class:`OuterLoopNestStatement` from the graph described by *predecessors*, + *succcessors*. + + New dependency edges are added in the new graph to preserve the transitive + dependencies that exists in the original graph. 
+ """ + # {{{ input validation + + assert set(predecessors) == set(successors) + assert all(isinstance(val, frozenset) for val in predecessors.values()) + assert all(isinstance(val, frozenset) for val in successors.values()) + + # }}} + + nodes_to_remove = {node + for node in predecessors + if isinstance(node, (NonCandidateLoop, + OuterLoopNestStatement)) + } + new_predecessors = predecessors.copy() + new_successors = successors.copy() + infusible_edges_in_statement_dag = set() + + for node_to_remove in nodes_to_remove: + for pred in new_predecessors[node_to_remove]: + new_successors[pred] = ((new_successors[pred] + - frozenset([node_to_remove])) + | new_successors[node_to_remove]) + + for succ in new_successors[node_to_remove]: + new_predecessors[succ] = ((new_predecessors[succ] + - frozenset([node_to_remove])) + | new_predecessors[node_to_remove]) + + for pred in new_predecessors[node_to_remove]: + for succ in new_successors[node_to_remove]: + # now mark the edge from pred -> succ infusible iff both 'pred' and + # 'succ' are *not* in insns_to_remove + if ((pred not in nodes_to_remove) and (succ not in nodes_to_remove)): + assert isinstance(pred, CandidateLoop) + assert isinstance(succ, CandidateLoop) + infusible_edges_in_statement_dag.add((pred.iname, succ.iname)) + + del new_predecessors[node_to_remove] + del new_successors[node_to_remove] + + return ({key.iname: frozenset({n.iname for n in value}) + for key, value in new_predecessors.items()}, + {key.iname: frozenset({n.iname for n in value}) + for key, value in new_successors.items()}, + infusible_edges_in_statement_dag) + + +def _get_ldg_nodes_from_loopy_insn(kernel, insn, candidates, non_candidates, + just_outer_loop_nest): + """ + Helper used in :func:`_build_ldg`. + + :arg just_outer_inames: A :class:`frozenset` of the loop nest that appears + just outer to the *candidates* in the partial loop nest tree. 
+ """ + if (insn.within_inames | insn.reduction_inames()) & candidates: + # => the statement containing + return [CandidateLoop(candidate) + for candidate in ((insn.within_inames + | insn.reduction_inames()) + & candidates)] + elif {loop_nest + for loop_nest in non_candidates + if (loop_nest & insn.within_inames)}: + non_candidate, = {loop_nest + for loop_nest in non_candidates + if (loop_nest & insn.within_inames)} + + return [NonCandidateLoop(non_candidate)] + else: + assert ((insn.within_inames & just_outer_loop_nest) + or (insn.within_inames == just_outer_loop_nest)) + return [OuterLoopNestStatement(insn.id)] + + +def _compute_isinfusible_via_access_map(kernel, + insn_pred, candidate_pred, + insn_succ, candidate_succ, + outer_inames, + var): + """ + Returns *True* if the inames *candidate_pred* and *candidate_succ* are fused then + that might lead to a loop carried dependency for *var*. + + Helper used in :func:`_build_ldg`. + """ + import islpy as isl + from loopy.kernel.tools import get_insn_access_map + import pymbolic.primitives as prim + from loopy.symbolic import isl_set_from_expr + from loopy.diagnostic import UnableToDetermineAccessRangeError + + inner_inames_pred = (kernel.insn_inames(insn_pred) + - (frozenset([candidate_pred]) + | outer_inames)) + + inner_inames_succ = (kernel.insn_inames(insn_succ) + - (frozenset([candidate_succ]) + | outer_inames)) + + try: + amap_pred = get_insn_access_map(kernel, insn_pred, var, inner_inames_pred) + amap_succ = get_insn_access_map(kernel, insn_succ, var, inner_inames_succ) + except UnableToDetermineAccessRangeError: + # either predecessors or successors has a non-affine access i.e. + # fallback to the safer option => infusible + return True + + # since both ranges denote the same variable they must be subscripted with + # the same number of indices. 
+ assert amap_pred.dim(isl.dim_type.out) == amap_succ.dim(isl.dim_type.out) + + ndim = amap_pred.dim(isl.dim_type.out) + + # {{{ set the out dim names as `amap_a_dim0`, `amap_a_dim1`, ... + + for idim in range(ndim): + amap_pred = amap_pred.set_dim_name(isl.dim_type.out, + idim, + f"_lpy_amap_a_dim{idim}") + amap_succ = amap_succ.set_dim_name(isl.dim_type.out, + idim, + f"_lpy_amap_b_dim{idim}") + + # }}} + + # {{{ amap_pred -> set_pred, amap_succ -> set_succ + + amap_pred = amap_pred.move_dims(isl.dim_type.in_, + amap_pred.dim(isl.dim_type.in_), + isl.dim_type.out, + 0, amap_pred.dim(isl.dim_type.out)) + + amap_succ = amap_succ.move_dims(isl.dim_type.in_, + amap_succ.dim(isl.dim_type.in_), + isl.dim_type.out, + 0, amap_succ.dim(isl.dim_type.out)) + + set_pred, set_succ = amap_pred.domain(), amap_succ.domain() + set_pred, set_succ = isl.align_two(set_pred, set_succ) + + # }}} + + # {{{ build the bset, both accesses access the same element + + accesses_same_index_set = isl.BasicSet.universe(set_pred.space) + for idim in range(ndim): + cnstrnt = isl.Constraint.eq_from_names(set_pred.space, + {f"_lpy_amap_a_dim{idim}": 1, + f"_lpy_amap_b_dim{idim}": -1}) + accesses_same_index_set = accesses_same_index_set.add_constraint(cnstrnt) + + # }}} + + candidates_not_equal = isl_set_from_expr(set_pred.space, + prim.Comparison( + prim.Variable(candidate_pred), + ">", + prim.Variable(candidate_succ))) + return (not (set_pred + & set_succ + & accesses_same_index_set & candidates_not_equal).is_empty()) + + +def _build_ldg(kernel: LoopKernel, + candidates: FrozenSet[str], + outer_inames: FrozenSet[str]): + """ + Returns an instance of :class:`LoopDependenceGraph` needed while fusing + *candidates*. Invoked as a helper function in + :func:`get_kennedy_unweighted_fusion_candidates`. 
+ """ + + from pytools.graph import compute_topological_order + + loop_nest_tree = _get_partial_loop_nest_tree_for_fusion(kernel) + + non_candidate_loop_nests = { + child_loop_nest + for child_loop_nest in loop_nest_tree.children(outer_inames) + if len(child_loop_nest & candidates) == 0} + + insns = reduce(frozenset.intersection, + (frozenset(kernel.iname_to_insns()[iname]) + for iname in outer_inames), + frozenset(kernel.id_to_insn)) + predecessors = {} + successors = {} + + for insn in insns: + for successor in _get_ldg_nodes_from_loopy_insn(kernel, + kernel.id_to_insn[insn], + candidates, + non_candidate_loop_nests, + outer_inames): + predecessors.setdefault(successor, set()) + successors.setdefault(successor, set()) + for dep in kernel.id_to_insn[insn].depends_on: + if ((kernel.id_to_insn[dep].within_inames & outer_inames) + != outer_inames): + # this is not an instruction in 'outer_inames' => bogus dep. + continue + for predecessor in _get_ldg_nodes_from_loopy_insn( + kernel, + kernel.id_to_insn[dep], + candidates, + non_candidate_loop_nests, + outer_inames): + if predecessor != successor: + predecessors.setdefault(successor, set()).add(predecessor) + successors.setdefault(predecessor, set()).add(successor) + + predecessors, successors, infusible_edges = ( + _remove_non_candidate_pre_ldg_nodes( + kernel, + {key: frozenset(value) + for key, value in predecessors.items()}, + {key: frozenset(value) + for key, value in successors.items()}, + candidates)) + del predecessors + + builder = LoopDependenceGraphBuilder.new(candidates) + + # Interpret the statement DAG as LDG + for pred, succs in successors.items(): + for succ in succs: + builder.add_edge(pred, succ, + (pred, succ) in infusible_edges) + + # {{{ add infusible edges to the LDG depending on memory deps. 
+ + all_candidate_insns = reduce(frozenset.union, + (kernel.iname_to_insns()[iname] + for iname in candidates), + frozenset()) + + dep_inducing_vars = reduce(frozenset.union, + (frozenset(kernel + .id_to_insn[insn] + .assignee_var_names()) + for insn in all_candidate_insns), + frozenset()) + wmap = kernel.writer_map() + rmap = kernel.reader_map() + + topo_order = {el: i + for i, el in enumerate(compute_topological_order(successors))} + + for var in dep_inducing_vars: + for writer_id in (wmap.get(var, frozenset()) + & all_candidate_insns): + for access_id in ((rmap.get(var, frozenset()) + | wmap.get(var, frozenset())) + & all_candidate_insns): + if writer_id == access_id: + # no need to add self dependence + continue + + writer_candidate, = (kernel.id_to_insn[writer_id].within_inames + & candidates) + access_candidate, = (kernel.id_to_insn[access_id].within_inames + & candidates) + (pred_candidate, pred), (succ_candidate, succ) = sorted( + [(writer_candidate, writer_id), + (access_candidate, access_id)], + key=lambda x: topo_order[x[0]]) + + is_infusible = _compute_isinfusible_via_access_map(kernel, + pred, + pred_candidate, + succ, + succ_candidate, + outer_inames, + var) + + builder.add_edge(pred_candidate, succ_candidate, is_infusible) + + # }}} + + return builder.done() + +# }}} + + +def _fuse_sequential_loops_with_outer_loops(kernel: LoopKernel, + candidates: FrozenSet[str], + outer_inames: FrozenSet[str], + name_gen, prefix): + from collections import deque + ldg = _build_ldg(kernel, candidates, outer_inames) + + fused_chunks = {} + + while not ldg.is_empty(): + + # sorting to have a deterministic order. + # prefer 'deque' over list, as popping elements off the queue would be + # O(1). 
+ queue = deque(sorted(ldg.get_loops_with_no_predecessors())) + loops_to_be_fused = set() + non_fusible_loops = set() + while queue: + next_loop_in_queue = queue.popleft() + if not (ldg.predecessors[next_loop_in_queue] <= loops_to_be_fused): + # this loop still needs some other loops to be scheduled + # before we can reach this. + # Bye bye 'next_loop_in_queue' :'( , see you when all your + # predecessors have been scheduled. + continue + + if next_loop_in_queue in non_fusible_loops: + # had a non-fusible edge with an already scheduled loop. + # Sorry 'next_loop_in_queue', until next time :'(. + continue + + loops_to_be_fused.add(next_loop_in_queue) + + for succ in ldg.successors[next_loop_in_queue]: + if ldg.is_infusible.get((next_loop_in_queue, succ), False): + non_fusible_loops.add(succ) + else: + queue.append(succ) + + ldg = ldg.remove_nodes(loops_to_be_fused) + fused_chunks[name_gen(prefix)] = loops_to_be_fused + + assert reduce(frozenset.union, fused_chunks.values(), frozenset()) == candidates + assert sum(len(val) for val in fused_chunks.values()) == len(candidates) + + return fused_chunks + + +class ReductionLoopInserter(RuleAwareIdentityMapper): + """ + Main mapper used by :func:`_add_reduction_loops_in_partial_loop_nest_tree`.
+ """ + def __init__(self, rule_mapping_context, tree): + super().__init__(rule_mapping_context) + self.tree = tree + from loopy.schedule.tools import ( + _get_iname_to_tree_node_id_from_partial_loop_nest_tree) + self.iname_to_tree_node_id = ( + _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree)) + + def map_reduction(self, expr, expn_state, *, outer_redn_inames=frozenset()): + redn_inames = frozenset(expr.inames) + iname_chain = (expn_state.instruction.within_inames + | outer_redn_inames + | redn_inames) + not_seen_inames = frozenset(iname for iname in iname_chain + if iname not in self.iname_to_tree_node_id) + seen_inames = iname_chain - not_seen_inames + + # {{{ verbatim copied from loopy/schedule/tools.py + + from loopy.schedule.tools import (_pull_out_loop_nest, + _add_inner_loops) + + all_nests = {self.iname_to_tree_node_id[iname] + for iname in seen_inames} + + self.tree, outer_loop, inner_loop = _pull_out_loop_nest(self.tree, + (all_nests + | {frozenset()}), + seen_inames) + if not_seen_inames: + # make '_not_seen_inames' nest inside the seen ones. + # example: if there is already a loop nesting "i,j,k" + # and the current iname chain is "i,j,l". Only way this is possible + # is if "l" is nested within "i,j"-loops. 
+ self.tree = _add_inner_loops(self.tree, outer_loop, not_seen_inames) + + # {{{ update iname to node id + + for iname in outer_loop: + self.iname_to_tree_node_id = self.iname_to_tree_node_id.set(iname, + outer_loop) + + if inner_loop is not None: + for iname in inner_loop: + self.iname_to_tree_node_id = self.iname_to_tree_node_id.set( + iname, inner_loop) + + for iname in not_seen_inames: + self.iname_to_tree_node_id = self.iname_to_tree_node_id.set( + iname, not_seen_inames) + + # }}} + + # }}} + + assert not (outer_redn_inames & redn_inames) + return super().map_reduction( + expr, + expn_state, + outer_redn_inames=(outer_redn_inames | redn_inames)) + + +def _add_reduction_loops_in_partial_loop_nest_tree(kernel, tree): + """ + Returns a partial loop nest tree with the loop nests corresponding to the + reduction inames added to *tree*. + """ + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + reduction_loop_inserter = ReductionLoopInserter(rule_mapping_context, tree) + reduction_loop_inserter.map_kernel(kernel) + return reduction_loop_inserter.tree + + +def _get_partial_loop_nest_tree_for_fusion(kernel): + from loopy.schedule.tools import _get_partial_loop_nest_tree + tree = _get_partial_loop_nest_tree(kernel) + tree = _add_reduction_loops_in_partial_loop_nest_tree(kernel, tree) + return tree + + +def get_kennedy_unweighted_fusion_candidates(kernel: LoopKernel, + candidates: FrozenSet[str], + prefix="ifused"): + """ + Returns the fusion candidates mapping that could be fed to + :func:`rename_inames_in_batch` similar to Ken Kennedy's Unweighted + Loop-Fusion Algorithm. + + .. attribute:: prefix + + Prefix for the fused inames. 
+ """ + from loopy.kernel.data import ConcurrentTag + from loopy.schedule.tools import ( + _get_iname_to_tree_node_id_from_partial_loop_nest_tree) + from collections.abc import Collection + assert not isinstance(candidates, str) + assert isinstance(candidates, Collection) + assert isinstance(kernel, LoopKernel) + + candidates = frozenset(candidates) + vng = kernel.get_var_name_generator() + fused_chunks = {} + + # {{{ implementation scope + + # All of the candidates must be either "pure" reduction loops or + # pure-within_inames loops. + # Reason: otherwise _compute_isinfusible_via_access_map might result in + # spurious results. + # One option is to simply perform 'realize_reduction' before implementing + # this algorithm, but that seems like an unnecessary cost to pay. + if any(candidates & insn.reduction_inames() + for insn in kernel.instructions): + if any(candidates & insn.within_inames + for insn in kernel.instructions): + raise NotImplementedError("Some candidates are reduction" + " inames while some of them are not. Such" + " cases are not yet supported.") + + # }}} + + # {{{ handle concurrent inames + + # filter out concurrent loops. + all_concurrent_tags = reduce(frozenset.union, + (kernel.inames[iname].tags_of_type(ConcurrentTag) + for iname in candidates), + frozenset()) + + concurrent_tag_to_inames = {tag: set() + for tag in all_concurrent_tags} + + for iname in candidates: + if kernel.inames[iname].tags_of_type(ConcurrentTag): + # since ConcurrentTag is a UniqueTag there must be exactly one of + # it. 
+            tag, = kernel.inames[iname].tags_of_type(ConcurrentTag)
+            concurrent_tag_to_inames[tag].add(iname)
+
+    for inames in concurrent_tag_to_inames.values():
+        fused_chunks[vng(prefix)] = inames
+        candidates = candidates - inames
+
+    # }}}
+
+    tree = _get_partial_loop_nest_tree_for_fusion(kernel)
+
+    iname_to_tree_node_id = (
+        _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree))
+
+    # {{{ sanity checks
+
+    _nest_tree_id_to_candidate = {}
+
+    for iname in candidates:
+        loop_nest_tree_node_id = iname_to_tree_node_id[iname]
+        if loop_nest_tree_node_id not in _nest_tree_id_to_candidate:
+            _nest_tree_id_to_candidate[loop_nest_tree_node_id] = iname
+        else:
+            conflict_iname = _nest_tree_id_to_candidate[loop_nest_tree_node_id]
+            raise LoopyError(f"'{iname}' and '{conflict_iname}' "
+                             "cannot be fused as they can be nested "
+                             "within one another.")
+
+    for iname in candidates:
+        outer_loops = reduce(frozenset.union,
+                             tree.ancestors(iname_to_tree_node_id[iname]),
+                             frozenset())
+        if outer_loops & candidates:
+            raise LoopyError(f"Cannot fuse '{iname}' with"
+                             f" '{outer_loops & candidates}' as they"
+                             " may be nesting within one another.")
+
+    del _nest_tree_id_to_candidate
+
+    # }}}
+
+    # just_outer_loop_nest: mapping from loop nest to the candidates they
+    # contain
+    just_outer_loop_nest = {tree.parent(iname_to_tree_node_id[iname]): set()
+                            for iname in candidates}
+
+    for iname in candidates:
+        just_outer_loop_nest[tree.parent(iname_to_tree_node_id[iname])].add(iname)
+
+    for outer_inames, inames in just_outer_loop_nest.items():
+        fused_chunks.update(_fuse_sequential_loops_with_outer_loops(kernel,
+                                                                    inames,
+                                                                    outer_inames,
+                                                                    vng, prefix))
+
+    return fused_chunks
+
+
+def rename_inames_in_batch(kernel, batches: Mapping[str, FrozenSet[str]]):
+    """
+    Returns a copy of *kernel* with inames renamed according to *batches*.
+
+    :arg kernel: An instance of :class:`loopy.LoopKernel`.
+ :arg batches: A mapping from ``new_iname`` to a :class:`frozenset` of + inames that are to be renamed to ``new_iname``. + """ + from loopy.transform.iname import rename_inames, remove_unused_inames + for new_iname, candidates in batches.items(): + # pylint:disable=unexpected-keyword-arg + kernel = rename_inames( + kernel, candidates, new_iname, + remove_newly_unused_inames=False + ) + + return remove_unused_inames(kernel, reduce(frozenset.union, + batches.values(), + frozenset())) + +# vim: foldmethod=marker diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index 8b755cca9..012ba0186 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -25,16 +25,18 @@ import islpy as isl from loopy.symbolic import (get_dependencies, RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, - SubstitutionRuleMappingContext) + SubstitutionRuleMappingContext, CombineMapper) from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func from loopy.translation_unit import TranslationUnit +from loopy.kernel.instruction import MultiAssignmentBase from loopy.kernel.function_interface import CallableKernel, ScalarCallable from loopy.kernel.tools import (kernel_has_global_barriers, find_most_recent_global_barrier) from loopy.kernel.data import AddressSpace from pymbolic import var +from pytools import memoize_on_first_arg from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, AccessDescriptor) @@ -245,6 +247,13 @@ def map_kernel(self, kernel): for insn in kernel.instructions: self.replaced_something = False + if (isinstance(insn, MultiAssignmentBase) + and not (get_all_deps_of_insn(insn) + & set(kernel.substitutions))): + # 'insn' does not have a call 'subst' => do not process + new_insns.append(insn) + continue + insn = insn.with_transformed_expressions( lambda expr: self(expr, kernel, insn)) @@ -263,8 +272,7 @@ def map_kernel(self, kernel): dep_insn = kernel.id_to_insn[dep] 
if (frozenset(dep_insn.assignee_var_names()) & self.compute_read_variables): - self.compute_insn_depends_on.update( - insn.depends_on - excluded_insn_ids) + self.compute_insn_depends_on.add(dep) new_insns.append(insn) @@ -280,6 +288,56 @@ class _not_provided: # noqa: N801 pass +class FunctionNameCollector(CombineMapper): + def combine(self, values): + from functools import reduce + return reduce(frozenset.union, values, frozenset()) + + def map_call(self, expr): + return self.combine([frozenset([expr.function.name])] + + [self.rec(arg) for arg in expr.parameters]) + + def map_call_with_kwargs(self, expr): + return self.combine([frozenset([expr.function.name])] + + [self.rec(arg) for arg in expr.parameters] + + [self.rec(arg) for arg in expr.kw_parameters.values()]) + + def map_algebraic_leaf(self, expr): + return frozenset() + + def map_constant(self, expr): + return frozenset() + + +@memoize_on_first_arg +def _get_calls_in_expr(expr): + return FunctionNameCollector()(expr) + + +def get_all_deps_of_insn(insn): + """ + Returns a :class:`frozenset` of all dependency of insns (including the + function names). 
+ """ + assert isinstance(insn, MultiAssignmentBase) + from pymbolic.primitives import Expression + from functools import reduce + return ((_get_calls_in_expr(insn.expression) + if isinstance(insn.expression, Expression) + else frozenset()) + | reduce(frozenset.union, + (_get_calls_in_expr(pred) + for pred in insn.predicates + if isinstance(pred, Expression)), + frozenset()) + | reduce(frozenset.union, + (_get_calls_in_expr(assignee) + for assignee in insn.assignees + if isinstance(assignee, Expression)), + frozenset()) + ) | insn.read_dependency_names() + + def precompute_for_single_kernel(kernel, callables_table, subst_use, sweep_inames=None, within=None, storage_axes=None, temporary_name=None, precompute_inames=None, precompute_outer_inames=None, @@ -540,7 +598,9 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, import loopy as lp for insn in kernel.instructions: - if isinstance(insn, lp.MultiAssignmentBase): + if (isinstance(insn, lp.MultiAssignmentBase) + and (get_all_deps_of_insn(insn) + & set(kernel.substitutions))): for assignee in insn.assignees: invg(assignee, kernel, insn) invg(insn.expression, kernel, insn) diff --git a/requirements.txt b/requirements.txt index c44f010c3..3d8f1c38d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,5 @@ ply>=3.6 # Optional, for testing special math function scipy +# Optional, kanren-style relation helpers +git+https://github.com/pythological/kanren.git#egg=miniKanren diff --git a/test/test_loop_fusion.py b/test/test_loop_fusion.py new file mode 100644 index 000000000..678718295 --- /dev/null +++ b/test/test_loop_fusion.py @@ -0,0 +1,422 @@ +__copyright__ = "Copyright (C) 2021 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa + +__all__ = [ + "pytest_generate_tests", + "cl" # "cl.create_some_context" + ] + + +def test_loop_fusion_vanilla(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i0, i1, j0, j1]: 0 <= i0, i1, j0, j1 < 10}", + """ + a[i0] = 1 + b[i1, j0] = 2 {id=write_b} + c[j1] = 3 {id=write_c} + """) + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + assert len(ref_knl["loopy_kernel"].all_inames()) == 4 + assert len(knl["loopy_kernel"].all_inames()) == 3 + assert len(knl["loopy_kernel"].id_to_insn["write_b"].within_inames + & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 1 + + 
lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_outer_iname_preventing_fusion(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i0, j0, j1]: 0 <= i0, j0, j1 < 10}", + """ + a[i0] = 1 + b[i0, j0] = 2 {id=write_b} + c[j1] = 3 {id=write_c} + """) + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(knl["loopy_kernel"].all_inames()) == 3 + assert len(knl["loopy_kernel"].all_inames()) == 3 + assert len(knl["loopy_kernel"].id_to_insn["write_b"].within_inames + & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 0 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_loop_independent_deps(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[j0, j1]: 0 <= j0, j1 < 10}", + """ + a[j0] = 1 + b[j1] = 2 * a[j1] + """, seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 1 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_constrained_by_outer_loop_deps(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[j0, j1]: 0 <= j0, j1 < 10}", + """ + a[j0] = 1 {id=write_a} + b = 2 {id=write_b} + c[j1] = 2 * a[j1] {id=write_c} + """, seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["j0", "j1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 2 + assert 
len(knl["loopy_kernel"].id_to_insn["write_a"].within_inames + & knl["loopy_kernel"].id_to_insn["write_c"].within_inames) == 0 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_loop_carried_deps1(ctx_factory): + + ctx = ctx_factory() + knl = lp.make_kernel( + "{[i0, i1]: 1<=i0, i1<10}", + """ + x[i0] = i0 {id=first_write} + x[i1-1] = i1 ** 2 {id=second_write} + """, + seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["i0", + "i1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 1 + assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames + & knl["loopy_kernel"].id_to_insn["second_write"].within_inames) == 1 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_loop_carried_deps2(ctx_factory): + ctx = ctx_factory() + knl = lp.make_kernel( + "{[i0, i1]: 1<=i0, i1<10}", + """ + x[i0-1] = i0 {id=first_write} + x[i1] = i1 ** 2 {id=second_write} + """, + seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["i0", + "i1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames + & knl["loopy_kernel"].id_to_insn["second_write"].within_inames) == 0 + + lp.auto_test_vs_ref(ref_knl, ctx, knl) + + +def test_loop_fusion_with_indirection(ctx_factory): + ctx = ctx_factory() + map_ = np.random.permutation(10) + cq = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i0, i1]: 0<=i0, i1<10}", + """ + x[i0] = i0 {id=first_write} + x[map[i1]] = i1 ** 2 {id=second_write} + """, + 
seq_dependencies=True) + + ref_knl = knl + + fused_chunks = lp.get_kennedy_unweighted_fusion_candidates(knl["loopy_kernel"], + frozenset(["i0", + "i1"])) + + knl = knl.with_kernel(lp.rename_inames_in_batch(knl["loopy_kernel"], + fused_chunks)) + + assert len(ref_knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].all_inames()) == 2 + assert len(knl["loopy_kernel"].id_to_insn["first_write"].within_inames + & knl["loopy_kernel"].id_to_insn["second_write"].within_inames) == 0 + + _, (out1,) = ref_knl(cq, map=map_) + _, (out2,) = knl(cq, map=map_) + np.testing.assert_allclose(out1, out2) + + +def test_loop_fusion_with_induced_dependencies_from_sibling_nests(ctx_factory): + ctx = ctx_factory() + t_unit = lp.make_kernel( + "{[i0, j, i1, i2]: 0<=i0, j, i1, i2<10}", + """ + <> tmp0[i0] = i0 + <> tmp1[j] = tmp0[j] + <> tmp2[j] = j + out1[i1] = tmp2[i1] + out2[i2] = 2 * tmp1[i2] + """) + ref_t_unit = t_unit + knl = t_unit.default_entrypoint + knl = lp.rename_inames_in_batch( + knl, + lp.get_kennedy_unweighted_fusion_candidates( + knl, frozenset(["i0", "i1"]))) + t_unit = t_unit.with_kernel(knl) + + # 'i1', 'i2' should not be fused. If fused that would lead to an + # unshcedulable kernel. Making sure that the kernel 'runs' suffices that + # the transformation was successful. 
+ lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) + + +def test_loop_fusion_on_reduction_inames(ctx_factory): + ctx = ctx_factory() + + t_unit = lp.make_kernel( + "{[i, j0, j1, j2]: 0<=i, j0, j1, j2<10}", + """ + y0[i] = sum(j0, sum([j1], 2*A[i, j0, j1])) + y1[i] = sum(j0, sum([j2], 3*A[i, j0, j2])) + """, [lp.GlobalArg("A", + dtype=np.float64, + shape=lp.auto), ...]) + ref_t_unit = t_unit + knl = t_unit.default_entrypoint + knl = lp.rename_inames_in_batch( + knl, + lp.get_kennedy_unweighted_fusion_candidates( + knl, frozenset(["j1", "j2"]))) + assert (knl.id_to_insn["insn"].reduction_inames() + == knl.id_to_insn["insn_0"].reduction_inames()) + + t_unit = t_unit.with_kernel(knl) + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) + + +def test_loop_fusion_on_reduction_inames_with_depth_mismatch(ctx_factory): + ctx = ctx_factory() + + t_unit = lp.make_kernel( + "{[i, j0, j1, j2, j3]: 0<=i, j0, j1, j2, j3<10}", + """ + y0[i] = sum(j0, sum([j1], 2*A[i, j0, j1])) + y1[i] = sum(j2, sum([j3], 3*A[i, j3, j2])) + """, [lp.GlobalArg("A", + dtype=np.float64, + shape=lp.auto), + ...]) + ref_t_unit = t_unit + knl = t_unit.default_entrypoint + knl = lp.rename_inames_in_batch( + knl, + lp.get_kennedy_unweighted_fusion_candidates( + knl, frozenset(["j1", "j3"]))) + + # cannot fuse 'j1', 'j3' because they are not nested within the same outer + # inames. 
+ assert (knl.id_to_insn["insn"].reduction_inames() + != knl.id_to_insn["insn_0"].reduction_inames()) + + t_unit = t_unit.with_kernel(knl) + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) + + +def test_loop_fusion_on_outer_reduction_inames(ctx_factory): + ctx = ctx_factory() + + t_unit = lp.make_kernel( + "{[i, j0, j1, j2, j3]: 0<=i, j0, j1, j2, j3<10}", + """ + y0[i] = sum(j0, sum([j1], 2*A[i, j0, j1])) + y1[i] = sum(j2, sum([j3], 3*A[i, j3, j2])) + """, [lp.GlobalArg("A", + dtype=np.float64, + shape=lp.auto), + ...]) + ref_t_unit = t_unit + knl = t_unit.default_entrypoint + knl = lp.rename_inames_in_batch( + knl, + lp.get_kennedy_unweighted_fusion_candidates( + knl, frozenset(["j0", "j2"]))) + + assert len(knl.id_to_insn["insn"].reduction_inames() + & knl.id_to_insn["insn_0"].reduction_inames()) == 1 + + t_unit = t_unit.with_kernel(knl) + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) + + +def test_loop_fusion_reduction_inames_simple(ctx_factory): + ctx = ctx_factory() + + t_unit = lp.make_kernel( + "{[i, j0, j1]: 0<=i, j0, j1<10}", + """ + y0[i] = sum(j0, 2*A[i, j0]) + y1[i] = sum(j1, 3*A[i, j1]) + """, [lp.GlobalArg("A", + dtype=np.float64, + shape=lp.auto), + ...]) + ref_t_unit = t_unit + knl = t_unit.default_entrypoint + knl = lp.rename_inames_in_batch( + knl, + lp.get_kennedy_unweighted_fusion_candidates( + knl, frozenset(["j0", "j1"]))) + + assert (knl.id_to_insn["insn"].reduction_inames() + == knl.id_to_insn["insn_0"].reduction_inames()) + + t_unit = t_unit.with_kernel(knl) + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) + + +def test_redn_loop_fusion_with_non_candidates_loops_in_nest(ctx_factory): + ctx = ctx_factory() + t_unit = lp.make_kernel( + "{[i, j1, j2, d]: 0<=i, j1, j2, d<10}", + """ + for i + for d + out1[i, d] = sum(j1, 2 * j1*i) + end + out2[i] = sum(j2, 2 * j2) + end + """, seq_dependencies=True) + ref_t_unit = t_unit + + knl = t_unit.default_entrypoint + knl = lp.rename_inames_in_batch( + knl, + lp.get_kennedy_unweighted_fusion_candidates( + knl, 
frozenset(["j1", "j2"]))) + + assert not (knl.id_to_insn["insn"].reduction_inames() + & knl.id_to_insn["insn_0"].reduction_inames()) + + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit.with_kernel(knl)) + + +def test_reduction_loop_fusion_with_multiple_redn_in_same_insn(ctx_factory): + ctx = ctx_factory() + t_unit = lp.make_kernel( + "{[j1, j2]: 0<=j1, j2<10}", + """ + out = sum(j1, 2*j1) + sum(j2, 2*j2) + """, seq_dependencies=True) + ref_t_unit = t_unit + + knl = t_unit.default_entrypoint + knl = lp.rename_inames_in_batch( + knl, + lp.get_kennedy_unweighted_fusion_candidates( + knl, frozenset(["j1", "j2"]))) + + assert len(knl.id_to_insn["insn"].reduction_inames()) == 1 + + lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit.with_kernel(knl)) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: fdm=marker