From b90adc04c8e2cceeb6d2d03354d38563197c9100 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 17:36:05 -0400 Subject: [PATCH 01/10] llvm: Remove support for running multiple contexts Execute the code per-context in multiple Python threads instead. Signed-off-by: Jan Vesely --- .../nonstateful/optimizationfunctions.py | 2 +- psyneulink/core/llvm/__init__.py | 10 - psyneulink/core/llvm/codegen.py | 50 ---- psyneulink/core/llvm/execution.py | 258 +++++------------- tests/llvm/test_multiple_executions.py | 235 ---------------- 5 files changed, 62 insertions(+), 493 deletions(-) delete mode 100644 tests/llvm/test_multiple_executions.py diff --git a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py index 5ff6359225e..eee98a83d2d 100644 --- a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py +++ b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py @@ -831,7 +831,7 @@ def _is_static(it:SampleIterator): num_evals = np.prod([d._num for d in self.search_space]) # Map allocations to values - comp_exec = pnlvm.execution.CompExecution(ocm.agent_rep, [context.execution_id]) + comp_exec = pnlvm.execution.CompExecution(ocm.agent_rep, context.execution_id) execution_mode = ocm.parameters.comp_execution_mode._get(context) if execution_mode == "PTX": outcomes = comp_exec.cuda_evaluate(inputs, num_inputs_sets, num_evals, get_results) diff --git a/psyneulink/core/llvm/__init__.py b/psyneulink/core/llvm/__init__.py index 568ef7ec910..1a921470690 100644 --- a/psyneulink/core/llvm/__init__.py +++ b/psyneulink/core/llvm/__init__.py @@ -245,16 +245,6 @@ def from_obj(obj, *, tags:frozenset=frozenset(), numpy_args:tuple=()): def get(name: str, *, numpy_args:tuple=()): return LLVMBinaryFunction(name, numpy_args=numpy_args) - def get_multi_run(self, *, numpy_args=()): - try: - multirun_llvm = _find_llvm_function(self.name + "_multirun") - except ValueError: - function = _find_llvm_function(self.name) - with LLVMBuilderContext.get_current() as ctx: - multirun_llvm = codegen.gen_multirun_wrapper(ctx, function) - - return LLVMBinaryFunction.get(multirun_llvm.name, numpy_args=numpy_args) - _cpu_engine = None _ptx_engine = None diff --git a/psyneulink/core/llvm/codegen.py b/psyneulink/core/llvm/codegen.py index df792ce5fe9..cd14fadc52e 100644 --- a/psyneulink/core/llvm/codegen.py +++ b/psyneulink/core/llvm/codegen.py @@ -1119,56 +1119,6 @@ def gen_composition_run(ctx, composition, *, tags:frozenset): return llvm_func -def gen_multirun_wrapper(ctx, function: ir.Function) -> ir.Function: - if function.module is not ctx.module: - function = ir.Function(ctx.module, function.type.pointee, function.name) - assert function.is_declaration - - args = [a.type for a in function.args] - args.append(ctx.int32_ty.as_pointer()) - multirun_ty = ir.FunctionType(function.type.pointee.return_type, args) - multirun_f = ir.Function(ctx.module, multirun_ty, function.name + "_multirun") - block = multirun_f.append_basic_block(name="entry") - builder = ir.IRBuilder(block) - - multi_runs = builder.load(multirun_f.args[-1]) - # Runs need special handling. data_in and data_out are one dimensional, - # but hold entries for all parallel invocations. 
- is_comp_run = len(function.args) == 7 - if is_comp_run: - trials_count = builder.load(multirun_f.args[5]) - input_count = builder.load(multirun_f.args[6]) - - with helpers.for_loop_zero_inc(builder, multi_runs, "multi_run_loop") as (b, index): - # Index all pointer arguments - indexed_args = [] - for i, arg in enumerate(multirun_f.args[:-1]): - # Don't adjust #inputs and #trials - if isinstance(arg.type, ir.PointerType): - offset = index - # #runs and #trials needs to be the same for every invocation - if is_comp_run and i >= 5: - offset = ctx.int32_ty(0) - # Reset trial count for every invocation. - # Previous runs might have finished earlier - if i == 5: - builder.store(trials_count, arg) - # data arrays need special handling - elif is_comp_run and i == 4: # data_out - offset = b.mul(index, trials_count) - elif is_comp_run and i == 3: # data_in - offset = b.mul(index, input_count) - - arg = b.gep(arg, [offset]) - - indexed_args.append(arg) - - b.call(function, indexed_args) - - builder.ret_void() - return multirun_f - - def gen_autodiffcomp_exec(ctx, composition, *, tags:frozenset): """Creates llvm bin execute for autodiffcomp""" assert composition.controller is None diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index f90919b97bc..60a7967e10f 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -69,20 +69,15 @@ def _get_compilation_param(self, name, init_method, arg): if saved is None: struct_ty = self._bin_func.byref_arg_types[arg] init_f = getattr(self._obj, init_method) - if len(self._execution_contexts) > 1: - struct_ty = struct_ty * len(self._execution_contexts) - init_start = time.time() - initializer = (init_f(ex) for ex in self._execution_contexts) - else: - init_start = time.time() - initializer = init_f(self._execution_contexts[0]) + init_start = time.time() + initializer = init_f(self._execution_context) init_end = time.time() struct = struct_ty(*initializer) struct_end = time.time() # numpy "frombuffer" creates a shared memory view of the provided buffer - numpy_struct = np.frombuffer(struct, dtype=self._bin_func.np_params[arg], count=len(self._execution_contexts)) + numpy_struct = np.frombuffer(struct, dtype=self._bin_func.np_params[arg], count=1) assert numpy_struct.nbytes == ctypes.sizeof(struct), \ "Size mismatch ({}), numpy: {} vs. 
ctypes:{}".format(name, numpy_struct.nbytes, ctypes.sizeof(struct)) @@ -101,21 +96,19 @@ def _get_compilation_param(self, name, init_method, arg): _pretty_size(ctypes.sizeof(struct_ty)), ")", "for", self._obj.name) - if len(self._execution_contexts) == 1: - - numpy_struct.shape = () + numpy_struct.shape = () - if name == '_state': - self._copy_params_to_pnl(self._execution_contexts[0], - self._obj, - numpy_struct, - "llvm_state_ids") + if name == '_state': + self._copy_params_to_pnl(self._execution_context, + self._obj, + numpy_struct, + "llvm_state_ids") - elif name == '_param': - self._copy_params_to_pnl(self._execution_contexts[0], - self._obj, - numpy_struct, - "llvm_param_ids") + elif name == '_param': + self._copy_params_to_pnl(self._execution_context, + self._obj, + numpy_struct, + "llvm_param_ids") return saved @@ -228,11 +221,6 @@ def __init__(self, buffers=['param_struct', 'state_struct']): # Initialize GPU buffer map self._gpu_buffers = {"_" + b: None for b in buffers} - @property - def _bin_func_multirun(self): - # CUDA uses the same function for single and multi run - return self._bin_func - def __get_cuda_arg(self, struct_name, arg_handler): gpu_buffer = self._gpu_buffers[struct_name] @@ -269,37 +257,25 @@ def cuda_execute(self, variable): new_var = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) data_in = jit_engine.pycuda.driver.In(new_var) - extra_dims = (len(self._execution_contexts),) if len(self._execution_contexts) > 1 else () - data_out = self._bin_func.np_buffer_for_arg(3, extra_dimensions=extra_dims) + data_out = self._bin_func.np_buffer_for_arg(3) self._bin_func.cuda_call(self._cuda_param_struct, self._cuda_state_struct, data_in, - jit_engine.pycuda.driver.Out(data_out), - threads=len(self._execution_contexts)) + jit_engine.pycuda.driver.Out(data_out)) return self._get_indexable(data_out) class FuncExecution(CUDAExecution): - def __init__(self, component, execution_ids=[None], *, tags=frozenset()): + def __init__(self, component, execution_id=None, *, tags=frozenset()): super().__init__() self._bin_func = pnlvm.LLVMBinaryFunction.from_obj(component, tags=tags, numpy_args=(0, 1, 2, 3)) - self._execution_contexts = [ - Context(execution_id=eid) for eid in execution_ids - ] + self._execution_context = Context(execution_id=execution_id) self._component = component - - if len(execution_ids) > 1: - self._bin_multirun = self._bin_func.get_multi_run() - self._ct_len = ctypes.c_int(len(execution_ids)) - - vo_ty = self._bin_func.byref_arg_types[3] * len(execution_ids) - self._ct_vo = vo_ty() - @property def _obj(self): return self._component @@ -315,21 +291,10 @@ def _state_struct(self): def execute(self, variable): new_variable = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) - if len(self._execution_contexts) > 1: - # wrap_call casts the arguments so we only need contiguous data layout - ct_vi = np.ctypeslib.as_ctypes(new_variable) + data_in = new_variable.reshape(self._bin_func.np_params[2].shape) + data_out = self._bin_func.np_buffer_for_arg(3) - self._bin_multirun.wrap_call(self._param_struct[0], - self._state_struct[0], - ct_vi, - self._ct_vo, - self._ct_len) - return _convert_ctype_to_python(self._ct_vo) - else: - data_out = self._bin_func.np_buffer_for_arg(3) - data_in = new_variable.reshape(self._bin_func.np_params[2].shape) - - self._bin_func(self._param_struct[1], self._state_struct[1], data_in, data_out) + self._bin_func(self._param_struct[1], self._state_struct[1], data_in, data_out) return self._get_indexable(data_out) @@ -342,26 
+307,19 @@ class CompExecution(CUDAExecution): active_executions = weakref.WeakSet() - def __init__(self, composition, execution_ids=[None], *, additional_tags=frozenset()): + def __init__(self, composition, execution_id, *, additional_tags=frozenset()): super().__init__(buffers=['state_struct', 'param_struct', 'data_struct', 'conditions']) self._composition = composition - self._execution_contexts = [ - Context(execution_id=eid) for eid in execution_ids - ] + self._execution_context = Context(execution_id=execution_id) self.__bin_exec_func = None - self.__bin_exec_multi_func = None self.__bin_func = None self.__bin_run_func = None - self.__bin_run_multi_func = None self.__frozen_values = None self.__tags = frozenset(additional_tags) # Scheduling conditions, only used by "execute" self.__conditions = None - if len(execution_ids) > 1: - self._ct_len = ctypes.c_int(len(execution_ids)) - self.active_executions.add(self) def __del__(self): @@ -376,8 +334,7 @@ def get(composition, context, additional_tags=frozenset()): execution = executions.get(additional_tags, None) if execution is None: - execution = pnlvm.CompExecution(composition, [context.execution_id], - additional_tags=additional_tags) + execution = pnlvm.CompExecution(composition, context.execution_id, additional_tags=additional_tags) executions[additional_tags] = execution return execution @@ -389,7 +346,6 @@ def _obj(self): @property def _bin_func(self): if self.__bin_func is not None: - assert len(self._execution_contexts) == 1 return self.__bin_func if self.__bin_exec_func is not None: return self.__bin_exec_func @@ -398,15 +354,6 @@ def _bin_func(self): assert False, "Binary function not set for execution!" - @property - def _bin_func_multirun(self): - if self.__bin_exec_multi_func is not None: - return self.__bin_exec_multi_func - if self.__bin_run_multi_func is not None: - return self.__bin_run_multi_func - - return super()._bin_func_multirun - def _set_bin_node(self, node): assert node in self._composition._all_nodes node_assembly = builder_context.LLVMBuilderContext.get_current().get_node_assembly(self._composition, node) @@ -419,18 +366,13 @@ def _conditions(self): if self.__conditions is None: gen = helpers.ConditionGenerator(None, self._composition) - if len(self._execution_contexts) > 1: - conditions_ctype = self._bin_func_multirun.byref_arg_types[4] * len(self._execution_contexts) - conditions_initializer = (gen.get_condition_initializer() for _ in self._execution_contexts) - else: - conditions_ctype = self._bin_func.byref_arg_types[4] - conditions_initializer = gen.get_condition_initializer() + conditions_ctype = self._bin_func.byref_arg_types[4] + conditions_initializer = gen.get_condition_initializer() ct_conditions = conditions_ctype(*conditions_initializer) - np_conditions = np.frombuffer(ct_conditions, dtype=self._bin_func.np_params[4], count=len(self._execution_contexts)) + np_conditions = np.frombuffer(ct_conditions, dtype=self._bin_func.np_params[4], count=1) - if len(self._execution_contexts) == 1: - np_conditions.shape = () + np_conditions.shape = () self.__conditions = (ct_conditions, np_conditions) @@ -459,23 +401,6 @@ def _data_struct(self): def _data_struct(self, data_struct): self._data = data_struct - def _extract_node_struct_from_ctype(self, node, data): - # state structure consists of a list of node states, - # followed by a list of projection contexts; get the first one - # parameter structure consists of a list of node parameters, - # followed by a list of projection parameters; get the first one - # 
output structure consists of a list of node outputs, - # followed by a list of nested data structures; get the first one - field_name = data._fields_[0][0] - res_struct = getattr(data, field_name) - - # Get the index into the array of all nodes - index = self._composition._get_node_index(node) - field_name = res_struct._fields_[index][0] - res_struct = getattr(res_struct, field_name) - - return _convert_ctype_to_python(res_struct) - def _extract_node_struct_from_numpy(self, node, data): # state structure consists of a list of node states, # followed by a list of projection contexts; get the first one @@ -494,10 +419,7 @@ def _extract_node_struct_from_numpy(self, node, data): return node_struct.copy().tolist() if node_struct.shape == () else node_struct.copy() def extract_node_struct(self, node, struct): - if len(self._execution_contexts) > 1: - return [self._extract_node_struct_from_ctype(node, struct[0][i]) for i, _ in enumerate(self._execution_contexts)] - else: - return self._extract_node_struct_from_numpy(node, struct[1]) + return self._extract_node_struct_from_numpy(node, struct[1]) def extract_frozen_node_output(self, node): return self.extract_node_struct(node, self.__frozen_values) @@ -525,22 +447,11 @@ def _get_input_struct(self, inputs): # Either node or composition execute. # Read provided input data and parse into an array (generator) - if len(self._execution_contexts) > 1: - assert len(self._execution_contexts) == len(inputs) - - # All execute functions expect inputs to be 3rd param. - ct_input_type = self._bin_func.byref_arg_types[2] * len(self._execution_contexts) - - input_data = (([x] for x in self._composition._build_variable_for_input_CIM(inp)) for inp in inputs) - - ct_input = ct_input_type(*_tupleize(input_data)) - np_input = np.ctypeslib.as_array(ct_input) - else: - ct_input = None - data = self._composition._build_variable_for_input_CIM(inputs) + ct_input = None + data = self._composition._build_variable_for_input_CIM(inputs) - np_input = np.asarray(_tupleize(data), dtype=self._bin_func.np_params[2].base) - np_input = np_input.reshape(self._bin_func.np_params[2].shape) + np_input = np.asarray(_tupleize(data), dtype=self._bin_func.np_params[2].base) + np_input = np_input.reshape(self._bin_func.np_params[2].shape) if "stat" in self._debug_env: print("Input struct size:", _pretty_size(np_input.nbytes), "for", self._composition.name) @@ -555,8 +466,7 @@ def freeze_values(self): def execute_node(self, node, inputs=None): # We need to reconstruct the input dictionary here if it was not provided. # This happens during node execution of nested compositions. - assert len(self._execution_contexts) == 1 - context = self._execution_contexts[0] + context = self._execution_context if inputs is None and node is self._composition.input_CIM: @@ -610,29 +520,14 @@ def _bin_exec_func(self): return self.__bin_exec_func - @property - def _bin_exec_multi_func(self): - if self.__bin_exec_multi_func is None: - self.__bin_exec_multi_func = self._bin_exec_func.get_multi_run() - - return self.__bin_exec_multi_func - def execute(self, inputs): # NOTE: Make sure that input struct generation is inlined. # We need the binary function to be setup for it to work correctly. 
- if len(self._execution_contexts) > 1: - self._bin_exec_multi_func.wrap_call(self._state_struct[0], - self._param_struct[0], - self._get_input_struct(inputs)[0], - self._data_struct[0], - self._conditions[0], - self._ct_len) - else: - self._bin_exec_func(self._state_struct[1], - self._param_struct[1], - self._get_input_struct(inputs)[1], - self._data_struct[1], - self._conditions[1]) + self._bin_exec_func(self._state_struct[1], + self._param_struct[1], + self._get_input_struct(inputs)[1], + self._data_struct[1], + self._conditions[1]) def cuda_execute(self, inputs): # NOTE: Make sure that input struct generation is inlined. @@ -641,8 +536,7 @@ def cuda_execute(self, inputs): self._cuda_param_struct, jit_engine.pycuda.driver.In(self._get_input_struct(inputs)[1]), self._cuda_data_struct, - self._cuda_conditions, - threads=len(self._execution_contexts)) + self._cuda_conditions) # Methods used to accelerate "Run" def _get_run_input_struct(self, inputs, num_input_sets, arg=3): @@ -650,11 +544,9 @@ def _get_run_input_struct(self, inputs, num_input_sets, arg=3): bin_f = self._bin_run_func if arg == 3 else self._bin_func input_type = bin_f.byref_arg_types[arg] - c_input_type = (input_type * num_input_sets) * len(self._execution_contexts) - if len(self._execution_contexts) == 1: - inputs = [inputs] + c_input_type = (input_type * num_input_sets) * 1 + inputs = [inputs] - assert len(inputs) == len(self._execution_contexts) # Extract input for each trial and execution id run_inputs = ((([x] for x in self._composition._build_variable_for_input_CIM({k:v[i] for k,v in inp.items()})) for i in range(num_input_sets)) for inp in inputs) c_inputs = c_input_type(*_tupleize(run_inputs)) @@ -668,7 +560,6 @@ def _get_run_input_struct(self, inputs, num_input_sets, arg=3): return c_inputs def _get_generator_run_input_struct(self, inputs, runs): - assert len(self._execution_contexts) == 1 # Extract input for each trial run_inputs = ((np.atleast_2d(x) for x in self._composition._build_variable_for_input_CIM({k:np.atleast_1d(v) for k,v in inp.items()})) for inp in inputs) run_inputs = _tupleize(run_inputs) @@ -685,13 +576,6 @@ def _bin_run_func(self): return self.__bin_run_func - @property - def _bin_run_multi_func(self): - if self.__bin_run_multi_func is None: - self.__bin_run_multi_func = self._bin_run_func.get_multi_run() - - return self.__bin_run_multi_func - def run(self, inputs, runs=0, num_input_sets=0): if isgenerator(inputs): inputs, runs = self._get_generator_run_input_struct(inputs, runs) @@ -701,8 +585,6 @@ def run(self, inputs, runs=0, num_input_sets=0): inputs = self._get_run_input_struct(inputs, num_input_sets) ct_vo = self._bin_run_func.byref_arg_types[4] * runs - if len(self._execution_contexts) > 1: - ct_vo = ct_vo * len(self._execution_contexts) outputs = ct_vo() @@ -714,32 +596,21 @@ def run(self, inputs, runs=0, num_input_sets=0): runs_count = ctypes.c_uint(runs) input_count = ctypes.c_uint(num_input_sets) - if len(self._execution_contexts) > 1: - self._bin_run_multi_func.wrap_call(self._state_struct[0], - self._param_struct[0], - self._data_struct[0], - inputs, - outputs, - runs_count, - input_count, - self._ct_len) - - return _convert_ctype_to_python(outputs) - else: - # This is only needed for non-generator inputs that are wrapped in an extra context dimension - inputs = ctypes.cast(inputs, self._bin_run_func.c_func.argtypes[3]) - self._bin_run_func(self._state_struct[1], - self._param_struct[1], - self._data_struct[1], - inputs, - outputs, - runs_count, - input_count) + # The cast is only 
needed for non-generator inputs that are wrapped in an extra context dimension + inputs = ctypes.cast(inputs, self._bin_run_func.c_func.argtypes[3]) - # Extract only #trials elements in case the run exited early - assert runs_count.value <= runs, "Composition ran more times than allowed!" - return _convert_ctype_to_python(outputs)[0:runs_count.value] + self._bin_run_func(self._state_struct[1], + self._param_struct[1], + self._data_struct[1], + inputs, + outputs, + runs_count, + input_count) + + # Extract only #trials elements in case the run exited early + assert runs_count.value <= runs, "Composition ran more times than allowed!" + return _convert_ctype_to_python(outputs)[0:runs_count.value] def cuda_run(self, inputs, runs, num_input_sets): # Create input buffer @@ -752,13 +623,11 @@ def cuda_run(self, inputs, runs, num_input_sets): # Create output buffer output_type = (self._bin_run_func.byref_arg_types[4] * runs) - if len(self._execution_contexts) > 1: - output_type = output_type * len(self._execution_contexts) ct_out = output_type() # number of trials argument - np_runs = np.full(len(self._execution_contexts), runs, dtype=np.int32) + np_runs = np.asarray(runs, dtype=np.int32).copy() self._bin_run_func.cuda_call(self._cuda_state_struct, self._cuda_param_struct, @@ -766,20 +635,15 @@ def cuda_run(self, inputs, runs, num_input_sets): jit_engine.pycuda.driver.In(np.ctypeslib.as_array(ct_inputs)), # input jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(ct_out)), # output jit_engine.pycuda.driver.InOut(np_runs), # runs - jit_engine.pycuda.driver.In(np.int32(num_input_sets)), # number of inputs - threads=len(self._execution_contexts)) + jit_engine.pycuda.driver.In(np.int32(num_input_sets))) # number of inputs - assert all(np_runs <= runs), "Composition ran more times than allowed: {}".format(runs) + assert np_runs <= runs, "Composition ran more times than allowed: {}".format(runs) - if len(self._execution_contexts) > 1: - return _convert_ctype_to_python(ct_out) - else: - # Extract only #trials elements in case the run exited early - return _convert_ctype_to_python(ct_out)[0:np_runs[0]] + # Extract only #trials elements in case the run exited early + return _convert_ctype_to_python(ct_out)[0:np_runs] def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool): ocm = self._composition.controller - assert len(self._execution_contexts) == 1 eval_type = "evaluate_type_all_results" if all_results else "evaluate_type_objective" tags = {"evaluate", "alloc_range", eval_type} @@ -803,7 +667,7 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results # Output ctype out_el_ty = bin_func.byref_arg_types[4] if all_results: - num_trials = ocm.parameters.num_trials_per_estimate.get(self._execution_contexts[0]) + num_trials = ocm.parameters.num_trials_per_estimate.get(self._execution_context) if num_trials is None: num_trials = num_input_sets out_el_ty *= num_trials diff --git a/tests/llvm/test_multiple_executions.py b/tests/llvm/test_multiple_executions.py deleted file mode 100644 index bda26a1db83..00000000000 --- a/tests/llvm/test_multiple_executions.py +++ /dev/null @@ -1,235 +0,0 @@ -import pytest -import psyneulink.core.llvm as pnlvm - -import numpy as np -import psyneulink.core.components.functions.function as Function -import psyneulink.core.components.functions.nonstateful.objectivefunctions as Functions -from psyneulink.core.components.functions.stateful.integratorfunctions import AdaptiveIntegrator -from 
psyneulink.core.components.functions.nonstateful.transferfunctions import Logistic -from psyneulink.core.components.mechanisms.processing.processingmechanism import ProcessingMechanism -from psyneulink.core.components.mechanisms.processing.transfermechanism import TransferMechanism -from psyneulink.core.compositions.composition import Composition -import psyneulink.core.globals.keywords as kw - -SIZE=10 -# Some metrics (CROSS_ENTROPY) don't like 0s -test_var = [np.random.rand(SIZE) + Function.EPSILON, np.random.rand(SIZE) + Function.EPSILON] -v1 = test_var[0] -v2 = test_var[1] -expected = np.linalg.norm(v1 - v2) - -@pytest.mark.multirun -@pytest.mark.function -@pytest.mark.distance_function -@pytest.mark.benchmark -@pytest.mark.parametrize("executions", [1, 10, 100]) -def test_function(benchmark, executions, func_mode): - f = Functions.Distance(default_variable=test_var, metric=kw.EUCLIDEAN) - benchmark.group = "DistanceFunction multirun {}".format(executions) - var = [test_var for _ in range(executions)] if executions > 1 else test_var - if func_mode == 'Python': - e = f.function if executions == 1 else lambda x: [f.function(xi) for xi in x] - elif func_mode == 'LLVM': - e = pnlvm.execution.FuncExecution(f, [None for _ in range(executions)]).execute - elif func_mode == 'PTX': - e = pnlvm.execution.FuncExecution(f, [None for _ in range(executions)]).cuda_execute - - res = benchmark(e, var) - np.testing.assert_allclose(res, [expected for _ in range(executions)]) - -@pytest.mark.multirun -@pytest.mark.mechanism -@pytest.mark.transfer_mechanism -@pytest.mark.benchmark -@pytest.mark.parametrize("executions", [1, 10, 100]) -def test_mechanism(benchmark, executions, mech_mode): - benchmark.group = "TransferMechanism multirun {}".format(executions) - variable = [0 for _ in range(SIZE)] - T = TransferMechanism( - name='T', - default_variable=variable, - integration_rate=1.0, - noise=-2.0, - integrator_mode=True - ) - var = [[10.0 for _ in range(SIZE)] for _ in range(executions)] - expected = [[8.0 for i in range(SIZE)]] - if mech_mode == 'Python': - e = T.execute if executions == 1 else lambda x : [T.execute(xi) for xi in x] - elif mech_mode == 'LLVM': - e = pnlvm.execution.MechExecution(T, [None for _ in range(executions)]).execute - elif mech_mode == 'PTX': - e = pnlvm.execution.MechExecution(T, [None for _ in range(executions)]).cuda_execute - - if executions > 1: - expected = [expected for _ in range(executions)] - - res = benchmark(e, var) - np.testing.assert_allclose(res, expected) - - -@pytest.mark.multirun -@pytest.mark.nested -@pytest.mark.composition -@pytest.mark.benchmark -@pytest.mark.parametrize("executions", [1, 10, 100]) -@pytest.mark.parametrize("mode", ['Python', - pytest.param('LLVM', marks=pytest.mark.llvm), - pytest.param('PTX', marks=[pytest.mark.llvm, pytest.mark.cuda])]) -def test_nested_composition_execution(benchmark, executions, mode): - benchmark.group = "Nested Composition execution multirun {}".format(executions) - - # mechanisms - A = ProcessingMechanism(name="A", - function=AdaptiveIntegrator(rate=0.1)) - B = ProcessingMechanism(name="B", - function=Logistic) - - inner_comp = Composition(name="inner_comp") - inner_comp.add_linear_processing_pathway([A, B]) - inner_comp._analyze_graph() - - outer_comp = Composition(name="outer_comp") - outer_comp.add_node(inner_comp) - - outer_comp._analyze_graph() - - # The input dict should assign inputs origin nodes (inner_comp in this case) - var = {inner_comp: [[1.0]]} - expected = [[0.52497918747894]] - if executions > 1: - var 
= [var for _ in range(executions)] - - if mode == 'Python': - e = outer_comp.execute if executions == 1 else lambda x : [outer_comp.execute(x[i], context=i) for i in range(executions)] - res = e(var) - benchmark(e, var) - elif mode == 'LLVM': - e = pnlvm.execution.CompExecution(outer_comp, [None for _ in range(executions)]) - e.execute(var) - res = e.extract_node_output(outer_comp.output_CIM) - benchmark(e.execute, var) - elif mode == 'PTX': - e = pnlvm.execution.CompExecution(outer_comp, [None for _ in range(executions)]) - e.cuda_execute(var) - res = e.extract_node_output(outer_comp.output_CIM) - benchmark(e.cuda_execute, var) - else: - assert False, "Unknown mode: {}".format(mode) - - expected = [expected for _ in range(executions)] if executions > 1 else expected - np.testing.assert_allclose(res, expected) - - -@pytest.mark.multirun -@pytest.mark.nested -@pytest.mark.composition -@pytest.mark.benchmark -@pytest.mark.parametrize("executions", [1, 10, 100]) -@pytest.mark.parametrize("mode", ['Python', - pytest.param('LLVM', marks=pytest.mark.llvm), - pytest.param('PTX', marks=[pytest.mark.llvm, pytest.mark.cuda])]) -def test_nested_composition_run(benchmark, executions, mode): - benchmark.group = "Nested Composition multirun {}".format(executions) - - # mechanisms - A = ProcessingMechanism(name="A", - function=AdaptiveIntegrator(rate=0.1)) - B = ProcessingMechanism(name="B", - function=Logistic) - - inner_comp = Composition(name="inner_comp") - inner_comp.add_linear_processing_pathway([A, B]) - inner_comp._analyze_graph() - - outer_comp = Composition(name="outer_comp") - outer_comp.add_node(inner_comp) - - outer_comp._analyze_graph() - - # The input dict should assign inputs origin nodes (inner_comp in this case) - var = {inner_comp: [[[2.0]]]} - expected = [[[0.549833997312478]]] - if executions > 1: - var = [var for _ in range(executions)] - if mode == 'Python': - e = outer_comp.run if executions == 1 else lambda x: [outer_comp.run(x[i], context=i) for i in range(executions)] - res = e(var) - - # Composition.run returns only the result of the last trail, - # unlike results for all trials reported by CompExecution.run below - expected = expected[0] - - benchmark(e, var) - elif mode == 'LLVM': - e = pnlvm.execution.CompExecution(outer_comp, [None for _ in range(executions)]) - res = e.run(var, 1, 1) - benchmark(e.run, var, 1, 1) - elif mode == 'PTX': - e = pnlvm.execution.CompExecution(outer_comp, [None for _ in range(executions)]) - res = e.cuda_run(var, 1, 1) - benchmark(e.cuda_run, var, 1, 1) - else: - assert False, "Unknown mode: {}".format(mode) - - expected = [expected for _ in range(executions)] if executions > 1 else expected - np.testing.assert_allclose(res, expected) - - -@pytest.mark.multirun -@pytest.mark.nested -@pytest.mark.composition -@pytest.mark.benchmark -@pytest.mark.parametrize("executions", [1, 10, 100]) -@pytest.mark.parametrize("mode", [ - 'Python', - pytest.param('LLVM', marks=pytest.mark.llvm), - pytest.param('PTX', marks=[pytest.mark.llvm, pytest.mark.cuda]) -]) -def test_nested_composition_run_trials_inputs(benchmark, executions, mode): - benchmark.group = "Nested Composition mutliple trials/inputs multirun {}".format(executions) - - # mechanisms - A = ProcessingMechanism(name="A", - function=AdaptiveIntegrator(rate=0.1)) - B = ProcessingMechanism(name="B", - function=Logistic) - - inner_comp = Composition(name="inner_comp") - inner_comp.add_linear_processing_pathway([A, B]) - inner_comp._analyze_graph() - - outer_comp = Composition(name="outer_comp") - 
outer_comp.add_node(inner_comp) - - outer_comp._analyze_graph() - - # The input dict should assign inputs origin nodes (inner_comp in this case) - var = {inner_comp: [[[2.0]], [[3.0]]]} - expected = [[[0.549833997312478]], [[0.617747874769249]], [[0.6529428177055896]], [[0.7044959416252289]]] - if executions > 1: - var = [var for _ in range(executions)] - if mode == 'Python': - def f(v, num_trials, copy_results=False): - results = [] - for i in range(executions): - outer_comp.run(v[i], context=i, num_trials=num_trials) - if copy_results: # copy the results immediately, otherwise it's empty - results.append(outer_comp.results.copy()) - return results[0] if len(results) == 1 else results - - res = f(var, 4, True) if executions > 1 else f([var], 4, True) - benchmark(f if executions > 1 else outer_comp.run, var, num_trials=4) - elif mode == 'LLVM': - e = pnlvm.execution.CompExecution(outer_comp, [None for _ in range(executions)]) - res = e.run(var, 4, 2) - benchmark(e.run, var, 4, 2) - elif mode == 'PTX': - e = pnlvm.execution.CompExecution(outer_comp, [None for _ in range(executions)]) - res = e.cuda_run(var, 4, 2) - benchmark(e.cuda_run, var, 4, 2) - else: - assert False, "Unknown mode: {}".format(mode) - - expected = [expected for _ in range(executions)] if executions > 1 else expected - np.testing.assert_allclose(res, expected) From 309730b4ceccfad3fd8aacb557a1923948c6a788 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 18:37:46 -0400 Subject: [PATCH 02/10] llvm/execution: Simplify and remove dead code Simplify run input construction. Do not store references to constructed ctype structures. Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 68 ++++++++++++------------------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index 60a7967e10f..51c9dba203d 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -82,7 +82,7 @@ def _get_compilation_param(self, name, init_method, arg): assert numpy_struct.nbytes == ctypes.sizeof(struct), \ "Size mismatch ({}), numpy: {} vs. 
ctypes:{}".format(name, numpy_struct.nbytes, ctypes.sizeof(struct)) - saved = (struct, numpy_struct) + saved = numpy_struct setattr(self, name, saved) if "time_stat" in self._debug_env: @@ -224,7 +224,7 @@ def __init__(self, buffers=['param_struct', 'state_struct']): def __get_cuda_arg(self, struct_name, arg_handler): gpu_buffer = self._gpu_buffers[struct_name] - np_struct = getattr(self, struct_name)[1] + np_struct = getattr(self, struct_name) # .array is a public member of pycuda's In/Out ArgumentHandler classes if gpu_buffer is None or gpu_buffer.array is not np_struct: @@ -294,7 +294,7 @@ def execute(self, variable): data_in = new_variable.reshape(self._bin_func.np_params[2].shape) data_out = self._bin_func.np_buffer_for_arg(3) - self._bin_func(self._param_struct[1], self._state_struct[1], data_in, data_out) + self._bin_func(self._param_struct, self._state_struct, data_in, data_out) return self._get_indexable(data_out) @@ -374,11 +374,11 @@ def _conditions(self): np_conditions.shape = () - self.__conditions = (ct_conditions, np_conditions) + self.__conditions = np_conditions if "stat" in self._debug_env: print("Instantiated condition struct ( size:" , - _pretty_size(ctypes.sizeof(conditions_ctype)), ")", + _pretty_size(np_conditions.nbytes), ")", "for", self._composition.name) return self.__conditions @@ -401,7 +401,7 @@ def _data_struct(self): def _data_struct(self, data_struct): self._data = data_struct - def _extract_node_struct_from_numpy(self, node, data): + def extract_node_struct(self, node, data): # state structure consists of a list of node states, # followed by a list of projection contexts; get the first one # parameter structure consists of a list of node parameters, @@ -418,9 +418,6 @@ def _extract_node_struct_from_numpy(self, node, data): # returned results in next execution return node_struct.copy().tolist() if node_struct.shape == () else node_struct.copy() - def extract_node_struct(self, node, struct): - return self._extract_node_struct_from_numpy(node, struct[1]) - def extract_frozen_node_output(self, node): return self.extract_node_struct(node, self.__frozen_values) @@ -436,7 +433,7 @@ def extract_node_params(self, node): def insert_node_output(self, node, data): # output structure consists of a list of node outputs, # followed by a list of nested data structures; get the first one - all_nodes = self._data_struct[1][self._data_struct[1].dtype.names[0]] + all_nodes = self._data_struct[self._data_struct.dtype.names[0]] # Get the index into the array of all nodes index = self._composition._get_node_index(node) @@ -447,7 +444,6 @@ def _get_input_struct(self, inputs): # Either node or composition execute. # Read provided input data and parse into an array (generator) - ct_input = None data = self._composition._build_variable_for_input_CIM(inputs) np_input = np.asarray(_tupleize(data), dtype=self._bin_func.np_params[2].base) @@ -456,12 +452,10 @@ def _get_input_struct(self, inputs): if "stat" in self._debug_env: print("Input struct size:", _pretty_size(np_input.nbytes), "for", self._composition.name) - return ct_input, np_input + return np_input def freeze_values(self): - np_copy = self._data_struct[1].copy() - - self.__frozen_values = (None, np_copy) + self.__frozen_values = self._data_struct.copy() def execute_node(self, node, inputs=None): # We need to reconstruct the input dictionary here if it was not provided. @@ -485,7 +479,7 @@ def execute_node(self, node, inputs=None): # Numpy doesn't allow to pass NULL to the called function. 
# Create and pass a dummy buffer filled with NaN instead. if inputs is not None: - inputs = self._get_input_struct(inputs)[1] + inputs = self._get_input_struct(inputs) else: inputs = self._bin_func.np_buffer_for_arg(2) @@ -493,17 +487,13 @@ def execute_node(self, node, inputs=None): # and need frozen values available if node is not self._composition.input_CIM and node is not self._composition.parameter_CIM: assert self.__frozen_values is not None - data_in = self.__frozen_values[1] + data_in = self.__frozen_values else: # The ndarray argument check doesn't allow None for null so just provide # the same structure as outputs. - data_in = self._data_struct[1] + data_in = self._data_struct - self._bin_func(self._state_struct[1], - self._param_struct[1], - inputs, - data_in, - self._data_struct[1]) + self._bin_func(self._state_struct, self._param_struct, inputs, data_in, self._data_struct) if "comp_node_debug" in self._debug_env: print("RAN: {}. State: {}".format(node, self.extract_node_state(node))) @@ -523,18 +513,18 @@ def _bin_exec_func(self): def execute(self, inputs): # NOTE: Make sure that input struct generation is inlined. # We need the binary function to be setup for it to work correctly. - self._bin_exec_func(self._state_struct[1], - self._param_struct[1], - self._get_input_struct(inputs)[1], - self._data_struct[1], - self._conditions[1]) + self._bin_exec_func(self._state_struct, + self._param_struct, + self._get_input_struct(inputs), + self._data_struct, + self._conditions) def cuda_execute(self, inputs): # NOTE: Make sure that input struct generation is inlined. # We need the binary function to be setup for it to work correctly. self._bin_exec_func.cuda_call(self._cuda_state_struct, self._cuda_param_struct, - jit_engine.pycuda.driver.In(self._get_input_struct(inputs)[1]), + jit_engine.pycuda.driver.In(self._get_input_struct(inputs)), self._cuda_data_struct, self._cuda_conditions) @@ -544,11 +534,10 @@ def _get_run_input_struct(self, inputs, num_input_sets, arg=3): bin_f = self._bin_run_func if arg == 3 else self._bin_func input_type = bin_f.byref_arg_types[arg] - c_input_type = (input_type * num_input_sets) * 1 - inputs = [inputs] + c_input_type = (input_type * num_input_sets) # Extract input for each trial and execution id - run_inputs = ((([x] for x in self._composition._build_variable_for_input_CIM({k:v[i] for k,v in inp.items()})) for i in range(num_input_sets)) for inp in inputs) + run_inputs = (([x] for x in self._composition._build_variable_for_input_CIM({k:v[i] for k,v in inputs.items()})) for i in range(num_input_sets)) c_inputs = c_input_type(*_tupleize(run_inputs)) if "stat" in self._debug_env: print("Instantiated struct: input ( size:" , @@ -597,12 +586,9 @@ def run(self, inputs, runs=0, num_input_sets=0): runs_count = ctypes.c_uint(runs) input_count = ctypes.c_uint(num_input_sets) - # The cast is only needed for non-generator inputs that are wrapped in an extra context dimension - inputs = ctypes.cast(inputs, self._bin_run_func.c_func.argtypes[3]) - - self._bin_run_func(self._state_struct[1], - self._param_struct[1], - self._data_struct[1], + self._bin_run_func(self._state_struct, + self._param_struct, + self._data_struct, inputs, outputs, runs_count, @@ -657,9 +643,9 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results # Directly initialized structures assert ocm.agent_rep is self._composition - comp_params = self._get_compilation_param('_eval_param', '_get_param_initializer', 0)[1] - comp_state = 
self._get_compilation_param('_eval_state', '_get_state_initializer', 1)[1] - comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6)[1] + comp_params = self._get_compilation_param('_eval_param', '_get_param_initializer', 0) + comp_state = self._get_compilation_param('_eval_state', '_get_state_initializer', 1) + comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6) # Construct input variable, the 5th parameter of the evaluate function ct_inputs = self._get_run_input_struct(inputs, num_input_sets, 5) From 7ab159935cb8b2a3ebbe989d408b029d42328d14 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 21:08:51 -0400 Subject: [PATCH 03/10] llvm/execution/run: Use numpy structures for input and execution counts Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index 51c9dba203d..f834ac540d3 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -561,7 +561,7 @@ def _get_generator_run_input_struct(self, inputs, runs): def _bin_run_func(self): if self.__bin_run_func is None: self.__bin_run_func = pnlvm.LLVMBinaryFunction.from_obj( - self._composition, tags=self.__tags.union({"run"}), numpy_args=(0, 1, 2)) + self._composition, tags=self.__tags.union({"run"}), numpy_args=(0, 1, 2, 5, 6)) return self.__bin_run_func @@ -583,8 +583,8 @@ def run(self, inputs, runs=0, num_input_sets=0): print("Output struct size:", _pretty_size(ctypes.sizeof(outputs)), "for", self._composition.name) - runs_count = ctypes.c_uint(runs) - input_count = ctypes.c_uint(num_input_sets) + runs_count = np.asarray(runs, dtype=np.uint32).copy() + input_count = np.asarray(num_input_sets, dtype=np.uint32) self._bin_run_func(self._state_struct, self._param_struct, @@ -595,8 +595,8 @@ def run(self, inputs, runs=0, num_input_sets=0): input_count) # Extract only #trials elements in case the run exited early - assert runs_count.value <= runs, "Composition ran more times than allowed!" - return _convert_ctype_to_python(outputs)[0:runs_count.value] + assert runs_count <= runs, "Composition ran more times than allowed!" 
+ return _convert_ctype_to_python(outputs)[0:runs_count] def cuda_run(self, inputs, runs, num_input_sets): # Create input buffer From aa166a6d68ec015453d52fae7c29dd1af87caab2 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 21:22:55 -0400 Subject: [PATCH 04/10] llvm/execution: Consolidate shared code between CPU and GPU 'run' Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 51 ++++++++++++------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index f834ac540d3..cddea113df7 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -565,7 +565,9 @@ def _bin_run_func(self): return self.__bin_run_func - def run(self, inputs, runs=0, num_input_sets=0): + def _prepare_run(self, inputs, runs, num_input_sets): + + # Create input buffer if isgenerator(inputs): inputs, runs = self._get_generator_run_input_struct(inputs, runs) assert num_input_sets == 0 or num_input_sets == sys.maxsize @@ -573,60 +575,47 @@ def run(self, inputs, runs=0, num_input_sets=0): else: inputs = self._get_run_input_struct(inputs, num_input_sets) - ct_vo = self._bin_run_func.byref_arg_types[4] * runs - - outputs = ct_vo() + # Create output buffer + outputs = (self._bin_run_func.byref_arg_types[4] * runs)() if "stat" in self._debug_env: - print("Input struct size:", _pretty_size(ctypes.sizeof(inputs)), - "for", self._composition.name) print("Output struct size:", _pretty_size(ctypes.sizeof(outputs)), "for", self._composition.name) runs_count = np.asarray(runs, dtype=np.uint32).copy() input_count = np.asarray(num_input_sets, dtype=np.uint32) + return inputs, outputs, runs_count, input_count + + def run(self, inputs, runs, num_input_sets): + ct_inputs, ct_outputs, runs_count, input_count = self._prepare_run(inputs, runs, num_input_sets) + self._bin_run_func(self._state_struct, self._param_struct, self._data_struct, - inputs, - outputs, + ct_inputs, + ct_outputs, runs_count, input_count) # Extract only #trials elements in case the run exited early assert runs_count <= runs, "Composition ran more times than allowed!" 
- return _convert_ctype_to_python(outputs)[0:runs_count] + return _convert_ctype_to_python(ct_outputs)[0:runs_count] def cuda_run(self, inputs, runs, num_input_sets): - # Create input buffer - if isgenerator(inputs): - ct_inputs, runs = self._get_generator_run_input_struct(inputs, runs) - assert num_input_sets == 0 or num_input_sets == sys.maxsize - num_input_sets = len(ct_inputs) - else: - ct_inputs = self._get_run_input_struct(inputs, num_input_sets) - - # Create output buffer - output_type = (self._bin_run_func.byref_arg_types[4] * runs) - - ct_out = output_type() - - # number of trials argument - np_runs = np.asarray(runs, dtype=np.int32).copy() + ct_inputs, ct_outputs, runs_count, input_count = self._prepare_run(inputs, runs, num_input_sets) self._bin_run_func.cuda_call(self._cuda_state_struct, self._cuda_param_struct, self._cuda_data_struct, - jit_engine.pycuda.driver.In(np.ctypeslib.as_array(ct_inputs)), # input - jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(ct_out)), # output - jit_engine.pycuda.driver.InOut(np_runs), # runs - jit_engine.pycuda.driver.In(np.int32(num_input_sets))) # number of inputs - - assert np_runs <= runs, "Composition ran more times than allowed: {}".format(runs) + jit_engine.pycuda.driver.In(np.ctypeslib.as_array(ct_inputs)), + jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(ct_outputs)), + jit_engine.pycuda.driver.InOut(runs_count), + jit_engine.pycuda.driver.In(input_count)) # Extract only #trials elements in case the run exited early - return _convert_ctype_to_python(ct_out)[0:np_runs] + assert runs_count <= runs, "Composition ran more times than allowed: {}".format(runs) + return _convert_ctype_to_python(ct_outputs)[0:runs_count] def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool): ocm = self._composition.controller From 286dfcec4738556e970f51c87dcb5e977a0895f1 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 17:44:01 -0400 Subject: [PATCH 05/10] llvm/execution: Consolidate shared code between CPU and GPU 'evaluate' Do not sync back composition state or data. A call to evaluate creates a copy of these structures for each evaluation so the structure content is unchanged. Moreover, the structures are deallocated after the evaluate call anyway. Use Numpy structure for number of inputs Instantiate output buffer in the shared _prepare_evaluate function. 
Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index cddea113df7..8b92e50af1e 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -622,7 +622,7 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results eval_type = "evaluate_type_all_results" if all_results else "evaluate_type_objective" tags = {"evaluate", "alloc_range", eval_type} - bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags), numpy_args=(0, 1, 6)) + bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags), numpy_args=(0, 1, 6, 7)) self.__bin_func = bin_func # There are 8 arguments to evaluate_alloc_range: @@ -648,27 +648,25 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results out_el_ty *= num_trials out_ty = out_el_ty * num_evaluations - ct_num_inputs = bin_func.byref_arg_types[7](num_input_sets) + num_inputs = np.asarray(num_input_sets, dtype=np.uint32) if "stat" in self._debug_env: print("Evaluate result struct type size:", _pretty_size(ctypes.sizeof(out_ty)), "( evaluations:", num_evaluations, "element size:", ctypes.sizeof(out_el_ty), ")", "for", self._obj.name) - return comp_params, comp_state, comp_data, ct_inputs, out_ty, ct_num_inputs + return comp_params, comp_state, comp_data, ct_inputs, out_ty(), num_inputs def cuda_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool=False): - comp_params, comp_state, comp_data, ct_inputs, out_ty, _ = \ + comp_params, comp_state, comp_data, ct_inputs, ct_results, num_inputs = \ self._prepare_evaluate(inputs, num_input_sets, num_evaluations, all_results) - ct_results = out_ty() - cuda_args = (jit_engine.pycuda.driver.In(comp_params), - jit_engine.pycuda.driver.InOut(comp_state), + jit_engine.pycuda.driver.In(comp_state), jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(ct_results)), # results jit_engine.pycuda.driver.In(np.ctypeslib.as_array(ct_inputs)), # inputs - jit_engine.pycuda.driver.InOut(comp_data), # composition data - jit_engine.pycuda.driver.In(np.int32(num_input_sets)), # number of inputs + jit_engine.pycuda.driver.In(comp_data), # composition data + jit_engine.pycuda.driver.In(num_inputs), # number of inputs ) self.__bin_func.cuda_call(*cuda_args, threads=int(num_evaluations)) @@ -676,10 +674,9 @@ def cuda_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:boo return ct_results def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool=False): - comp_params, comp_state, comp_data, ct_inputs, out_ty, ct_num_inputs = \ + comp_params, comp_state, comp_data, ct_inputs, ct_results, num_inputs = \ self._prepare_evaluate(inputs, num_input_sets, num_evaluations, all_results) - ct_results = out_ty() jobs = min(os.cpu_count(), num_evaluations) evals_per_job = (num_evaluations + jobs - 1) // jobs @@ -688,11 +685,11 @@ def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:b # Create input and result typed casts once, they are the same # for every submitted job. 
- input_arg = ctypes.cast(ct_inputs, self.__bin_func.c_func.argtypes[5]) results_arg = ctypes.cast(ct_results, self.__bin_func.c_func.argtypes[4]) + input_arg = ctypes.cast(ct_inputs, self.__bin_func.c_func.argtypes[5]) - # There are 7 arguments to evaluate_alloc_range: - # comp_param, comp_state, from, to, results, input, comp_data + # There are 8 arguments to evaluate_alloc_range: + # comp_param, comp_state, from, to, results, input, comp_data, input length results = [ex.submit(self.__bin_func, comp_params, comp_state, @@ -701,7 +698,7 @@ def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:b results_arg, input_arg, comp_data, - ct_num_inputs) + num_inputs) for i in range(jobs)] parallel_stop = time.time() From 4533f621b2ca601f4d541bb6a20eba26a47824c1 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 21:40:45 -0400 Subject: [PATCH 06/10] llvm/execute: Move cuda_execute for Function and Mechanism to FuncExecution There's an overriding implementation in CompExecution. Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index 8b92e50af1e..67100583b5a 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -252,20 +252,6 @@ def _cuda_data_struct(self): def _cuda_conditions(self): return self.__get_cuda_arg("_conditions", jit_engine.pycuda.driver.InOut) - def cuda_execute(self, variable): - # Create input argument, PyCUDA doesn't care about shape - new_var = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) - data_in = jit_engine.pycuda.driver.In(new_var) - - data_out = self._bin_func.np_buffer_for_arg(3) - - self._bin_func.cuda_call(self._cuda_param_struct, - self._cuda_state_struct, - data_in, - jit_engine.pycuda.driver.Out(data_out)) - - return self._get_indexable(data_out) - class FuncExecution(CUDAExecution): @@ -290,14 +276,26 @@ def _state_struct(self): def execute(self, variable): new_variable = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) - data_in = new_variable.reshape(self._bin_func.np_params[2].shape) + data_out = self._bin_func.np_buffer_for_arg(3) self._bin_func(self._param_struct, self._state_struct, data_in, data_out) return self._get_indexable(data_out) + def cuda_execute(self, variable): + # Create input argument, PyCUDA doesn't care about shape + data_in = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) + data_out = self._bin_func.np_buffer_for_arg(3) + + self._bin_func.cuda_call(self._cuda_param_struct, + self._cuda_state_struct, + jit_engine.pycuda.driver.In(data_in), + jit_engine.pycuda.driver.Out(data_out)) + + return self._get_indexable(data_out) + class MechExecution(FuncExecution): pass From 55cb895c3f98cc7db82f59dfbd2969bdd48ade50 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 22:35:30 -0400 Subject: [PATCH 07/10] llvm/execution: Use Context instance in the CompExecution constructor Signed-off-by: Jan Vesely --- .../functions/nonstateful/optimizationfunctions.py | 2 +- psyneulink/core/llvm/execution.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py index eee98a83d2d..dfdce982a52 100644 --- a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py +++ 
b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py @@ -831,7 +831,7 @@ def _is_static(it:SampleIterator): num_evals = np.prod([d._num for d in self.search_space]) # Map allocations to values - comp_exec = pnlvm.execution.CompExecution(ocm.agent_rep, context.execution_id) + comp_exec = pnlvm.execution.CompExecution(ocm.agent_rep, context) execution_mode = ocm.parameters.comp_execution_mode._get(context) if execution_mode == "PTX": outcomes = comp_exec.cuda_evaluate(inputs, num_inputs_sets, num_evals, get_results) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index 67100583b5a..5ead80a2731 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -305,10 +305,10 @@ class CompExecution(CUDAExecution): active_executions = weakref.WeakSet() - def __init__(self, composition, execution_id, *, additional_tags=frozenset()): + def __init__(self, composition, context:Context, *, additional_tags=frozenset()): super().__init__(buffers=['state_struct', 'param_struct', 'data_struct', 'conditions']) self._composition = composition - self._execution_context = Context(execution_id=execution_id) + self._execution_context = context self.__bin_exec_func = None self.__bin_func = None self.__bin_run_func = None @@ -324,7 +324,7 @@ def __del__(self): self.active_executions.discard(self) @staticmethod - def get(composition, context, additional_tags=frozenset()): + def get(composition, context:Context, additional_tags=frozenset()): executions = composition._compilation_data.execution._get(context) if executions is None: executions = dict() @@ -332,7 +332,7 @@ def get(composition, context, additional_tags=frozenset()): execution = executions.get(additional_tags, None) if execution is None: - execution = pnlvm.CompExecution(composition, context.execution_id, additional_tags=additional_tags) + execution = pnlvm.CompExecution(composition, context, additional_tags=additional_tags) executions[additional_tags] = execution return execution From dd1170c0fadf1bc0cc183fdd2f46f19033b243a2 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 22:50:32 -0400 Subject: [PATCH 08/10] llvm: Remove 'wrap_call' No longer used. 
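A minimal standalone sketch of the direct-call convention that replaced
it (hypothetical example, not the PsyNeuLink API; LLVMBinaryFunction
builds its CFUNCTYPE the same way from numpy_args):

    import ctypes
    import numpy as np

    # Pointer arguments declared as ndpointer accept NumPy arrays
    # directly, so no byref()/cast() wrapping is needed at call time.
    arg_ty = np.ctypeslib.ndpointer(dtype=np.float64, shape=(4,))
    func_ty = ctypes.CFUNCTYPE(None, arg_ty, arg_ty)

    data_in = np.arange(4, dtype=np.float64)
    data_out = np.full(4, np.nan, dtype=np.float64)
    # bin_f = func_ty(jit_address)   # address resolved by the JIT engine
    # bin_f(data_in, data_out)       # direct call on NumPy buffers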
Signed-off-by: Jan Vesely
---
 psyneulink/core/llvm/__init__.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/psyneulink/core/llvm/__init__.py b/psyneulink/core/llvm/__init__.py
index 1a921470690..93b287cd748 100644
--- a/psyneulink/core/llvm/__init__.py
+++ b/psyneulink/core/llvm/__init__.py
@@ -176,11 +176,6 @@ def c_func(self):
     def __call__(self, *args, **kwargs):
         return self.c_func(*args, **kwargs)
 
-    def wrap_call(self, *pargs):
-        cpargs = (ctypes.byref(p) if p is not None else None for p in pargs)
-        args = zip(cpargs, self.c_func.argtypes)
-        self(*(ctypes.cast(p, t) for p, t in args))
-
     @property
     def _cuda_kernel(self):
         if self.__cuda_kernel is None:

From cc8c381432a43185fa69c73ba8155e2e5ea6020c Mon Sep 17 00:00:00 2001
From: Jan Vesely
Date: Tue, 6 Aug 2024 01:00:19 -0400
Subject: [PATCH 09/10] llvm: Rename np_params -> np_arg_dtypes

Signed-off-by: Jan Vesely
---
 psyneulink/core/llvm/__init__.py          |  8 ++++----
 psyneulink/core/llvm/execution.py         | 14 +++++++-------
 tests/llvm/test_builtins_intrinsics.py    |  2 +-
 tests/llvm/test_builtins_matrix.py        |  6 +++---
 tests/llvm/test_builtins_mt_random.py     |  8 ++++----
 tests/llvm/test_builtins_philox_random.py |  8 ++++----
 tests/llvm/test_builtins_vector.py        |  8 ++++----
 tests/llvm/test_compile.py                |  6 +++---
 tests/llvm/test_helpers.py                | 14 +++++++-------
 9 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/psyneulink/core/llvm/__init__.py b/psyneulink/core/llvm/__init__.py
index 93b287cd748..7976f1505ed 100644
--- a/psyneulink/core/llvm/__init__.py
+++ b/psyneulink/core/llvm/__init__.py
@@ -148,11 +148,11 @@ def __init__(self, name: str, *, numpy_args=()):
         # '_type_' special attribute stores pointee type for pointers
         # https://docs.python.org/3/library/ctypes.html#ctypes._Pointer._type_
         self.byref_arg_types = [a._type_ if hasattr(a, "contents") else None for a in args]
-        self.np_params = [_convert_llvm_ir_to_dtype(getattr(a.type, "pointee", a.type)) for a in f.args]
+        self.np_arg_dtypes = [_convert_llvm_ir_to_dtype(getattr(a.type, "pointee", a.type)) for a in f.args]
 
         for a in numpy_args:
             assert self.byref_arg_types[a] is not None
-            args[a] = np.ctypeslib.ndpointer(dtype=self.np_params[a].base, shape=self.np_params[a].shape)
+            args[a] = np.ctypeslib.ndpointer(dtype=self.np_arg_dtypes[a].base, shape=self.np_arg_dtypes[a].shape)
 
         middle = time.perf_counter()
         self.__c_func_type = ctypes.CFUNCTYPE(return_type, *args)
@@ -223,8 +223,8 @@ def cuda_wrap_call(self, *args, **kwargs):
 
     def np_buffer_for_arg(self, arg_num, *, extra_dimensions=(), fill_value=np.nan):
 
-        out_base = self.np_params[arg_num].base
-        out_shape = extra_dimensions + self.np_params[arg_num].shape
+        out_base = self.np_arg_dtypes[arg_num].base
+        out_shape = extra_dimensions + self.np_arg_dtypes[arg_num].shape
 
         # fill the buffer with NaN poison
         return np.full(out_shape, fill_value, dtype=out_base)
diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py
index 5ead80a2731..786f2feb6bc 100644
--- a/psyneulink/core/llvm/execution.py
+++ b/psyneulink/core/llvm/execution.py
@@ -77,7 +77,7 @@
             struct_end = time.time()
 
             # numpy "frombuffer" creates a shared memory view of the provided buffer
-            numpy_struct = np.frombuffer(struct, dtype=self._bin_func.np_params[arg], count=1)
+            numpy_struct = np.frombuffer(struct, dtype=self._bin_func.np_arg_dtypes[arg], count=1)
 
             assert numpy_struct.nbytes == ctypes.sizeof(struct), \
                 "Size mismatch ({}), numpy: {} vs. ctypes:{}".format(name, numpy_struct.nbytes, ctypes.sizeof(struct))
@@ -275,8 +275,8 @@ def _state_struct(self):
         return self._get_compilation_param('_state', '_get_state_initializer', 1)
 
     def execute(self, variable):
-        new_variable = np.asfarray(variable, dtype=self._bin_func.np_params[2].base)
-        data_in = new_variable.reshape(self._bin_func.np_params[2].shape)
+        new_variable = np.asfarray(variable, dtype=self._bin_func.np_arg_dtypes[2].base)
+        data_in = new_variable.reshape(self._bin_func.np_arg_dtypes[2].shape)
 
         data_out = self._bin_func.np_buffer_for_arg(3)
 
@@ -286,7 +286,7 @@ def execute(self, variable):
 
     def cuda_execute(self, variable):
         # Create input argument, PyCUDA doesn't care about shape
-        data_in = np.asfarray(variable, dtype=self._bin_func.np_params[2].base)
+        data_in = np.asfarray(variable, dtype=self._bin_func.np_arg_dtypes[2].base)
         data_out = self._bin_func.np_buffer_for_arg(3)
 
         self._bin_func.cuda_call(self._cuda_param_struct,
@@ -368,7 +368,7 @@ def _conditions(self):
 
             conditions_initializer = gen.get_condition_initializer()
             ct_conditions = conditions_ctype(*conditions_initializer)
-            np_conditions = np.frombuffer(ct_conditions, dtype=self._bin_func.np_params[4], count=1)
+            np_conditions = np.frombuffer(ct_conditions, dtype=self._bin_func.np_arg_dtypes[4], count=1)
 
             np_conditions.shape = ()
 
@@ -444,8 +444,8 @@ def _get_input_struct(self, inputs):
         # Read provided input data and parse into an array (generator)
         data = self._composition._build_variable_for_input_CIM(inputs)
 
-        np_input = np.asarray(_tupleize(data), dtype=self._bin_func.np_params[2].base)
-        np_input = np_input.reshape(self._bin_func.np_params[2].shape)
+        np_input = np.asarray(_tupleize(data), dtype=self._bin_func.np_arg_dtypes[2].base)
+        np_input = np_input.reshape(self._bin_func.np_arg_dtypes[2].shape)
 
         if "stat" in self._debug_env:
             print("Input struct size:", _pretty_size(np_input.nbytes), "for", self._composition.name)
diff --git a/tests/llvm/test_builtins_intrinsics.py b/tests/llvm/test_builtins_intrinsics.py
index 22cc3d2df8d..5195fcee73b 100644
--- a/tests/llvm/test_builtins_intrinsics.py
+++ b/tests/llvm/test_builtins_intrinsics.py
@@ -52,7 +52,7 @@ def test_builtin_op(benchmark, op, args, builtin, result, func_mode):
 
         # The result argument is a pointer, use it to derive
        # the right argument type
-        dty = bin_f.np_params[1].base
+        dty = bin_f.np_arg_dtypes[1].base
 
         ptx_res = np.empty_like(result, dtype=dty)
         ptx_res_arg = pnlvm.jit_engine.pycuda.driver.Out(ptx_res)
diff --git a/tests/llvm/test_builtins_matrix.py b/tests/llvm/test_builtins_matrix.py
index 1cad00e1565..f2c50bf576f 100644
--- a/tests/llvm/test_builtins_matrix.py
+++ b/tests/llvm/test_builtins_matrix.py
@@ -64,9 +64,9 @@ def _get_const_dim_func(builtin, *dims):
 def test_matrix_op(benchmark, op, x, y, builtin, result, func_mode, dims):
 
     def _numpy_args(bin_f):
-        np_x = x.astype(bin_f.np_params[0])
-        np_y = bin_f.np_params[1].type(y) if np.isscalar(y) else y.astype(bin_f.np_params[1])
-        np_res = np.empty_like(result, dtype=bin_f.np_params[-1])
+        np_x = x.astype(bin_f.np_arg_dtypes[0])
+        np_y = bin_f.np_arg_dtypes[1].type(y) if np.isscalar(y) else y.astype(bin_f.np_arg_dtypes[1])
+        np_res = np.empty_like(result, dtype=bin_f.np_arg_dtypes[-1])
 
         return np_x, np_y, np_res
 
diff --git a/tests/llvm/test_builtins_mt_random.py b/tests/llvm/test_builtins_mt_random.py
index 2ff7cff0ea2..28082e2d7e8 100644
--- a/tests/llvm/test_builtins_mt_random.py
+++ b/tests/llvm/test_builtins_mt_random.py
@@ -196,8 +196,8 @@ def f():
         init_fun(state, SEED)
 
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial', numpy_args=(0, 1, 2, 3))
-        n = np.asarray(n, dtype=gen_fun.np_params[1])
-        p = np.asarray(p, dtype=gen_fun.np_params[2])
+        n = np.asarray(n, dtype=gen_fun.np_arg_dtypes[1])
+        p = np.asarray(p, dtype=gen_fun.np_arg_dtypes[2])
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -214,8 +214,8 @@ def f():
 
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial')
-        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.asarray(n, dtype=gen_fun.np_params[1]))
-        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.asarray(p, dtype=gen_fun.np_params[2]))
+        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.asarray(n, dtype=gen_fun.np_arg_dtypes[1]))
+        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.asarray(p, dtype=gen_fun.np_arg_dtypes[2]))
 
         out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
diff --git a/tests/llvm/test_builtins_philox_random.py b/tests/llvm/test_builtins_philox_random.py
index 0c6e289a700..2466ea4f6d2 100644
--- a/tests/llvm/test_builtins_philox_random.py
+++ b/tests/llvm/test_builtins_philox_random.py
@@ -327,8 +327,8 @@ def f():
         init_fun(state, SEED)
 
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial', numpy_args=(0, 1, 2, 3))
-        n = np.asarray(n, dtype=gen_fun.np_params[1])
-        p = np.asarray(p, dtype=gen_fun.np_params[2])
+        n = np.asarray(n, dtype=gen_fun.np_arg_dtypes[1])
+        p = np.asarray(p, dtype=gen_fun.np_arg_dtypes[2])
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -342,8 +342,8 @@ def f():
         init_fun.cuda_call(gpu_state, np.int64(SEED))
 
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial')
-        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.asarray(n, dtype=gen_fun.np_params[1]))
-        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.asarray(p, dtype=gen_fun.np_params[2]))
+        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.asarray(n, dtype=gen_fun.np_arg_dtypes[1]))
+        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.asarray(p, dtype=gen_fun.np_arg_dtypes[2]))
 
         out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
diff --git a/tests/llvm/test_builtins_vector.py b/tests/llvm/test_builtins_vector.py
index 999a7e42696..9a806bde911 100644
--- a/tests/llvm/test_builtins_vector.py
+++ b/tests/llvm/test_builtins_vector.py
@@ -29,8 +29,8 @@ def test_vector_op(benchmark, op, v, builtin, result, func_mode):
 
     def _numpy_args(bin_f):
-        np_u = u.astype(bin_f.np_params[0])
-        np_v = bin_f.np_params[1].type(v) if np.isscalar(v) else v.astype(bin_f.np_params[1])
+        np_u = u.astype(bin_f.np_arg_dtypes[0])
+        np_v = bin_f.np_arg_dtypes[1].type(v) if np.isscalar(v) else v.astype(bin_f.np_arg_dtypes[1])
         np_res = np.empty_like(np_u)
 
         return np_u, np_v, np_res
@@ -77,7 +77,7 @@ def ex():
 
     elif func_mode == 'LLVM':
         bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))
-        np_u = u.astype(bin_f.np_params[0])
+        np_u = u.astype(bin_f.np_arg_dtypes[0])
         np_res = bin_f.np_buffer_for_arg(2)
 
         ct_u = np_u.ctypes.data_as(bin_f.c_func.argtypes[0])
@@ -89,7 +89,7 @@
 
     elif func_mode == 'PTX':
         bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))
-        np_u = u.astype(bin_f.np_params[0])
+        np_u = u.astype(bin_f.np_arg_dtypes[0])
         np_res = bin_f.np_buffer_for_arg(2)
 
         cuda_u = pnlvm.jit_engine.pycuda.driver.In(np_u)
diff --git a/tests/llvm/test_compile.py b/tests/llvm/test_compile.py
index c396cba594f..71c8526e2bc 100644
--- a/tests/llvm/test_compile.py
+++ b/tests/llvm/test_compile.py
@@ -12,9 +12,9 @@ def test_recompile():
     # The original builtin mxv function
     bin_f = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
 
-    vector = np.random.rand(DIM_X).astype(bin_f.np_params[0].base)
-    matrix = np.random.rand(DIM_X, DIM_Y).astype(bin_f.np_params[1].base)
-    llvm_res = np.empty(DIM_Y, dtype=bin_f.np_params[4].base)
+    vector = np.random.rand(DIM_X).astype(bin_f.np_arg_dtypes[0].base)
+    matrix = np.random.rand(DIM_X, DIM_Y).astype(bin_f.np_arg_dtypes[1].base)
+    llvm_res = np.empty(DIM_Y, dtype=bin_f.np_arg_dtypes[4].base)
 
     x, y = matrix.shape
diff --git a/tests/llvm/test_helpers.py b/tests/llvm/test_helpers.py
index e692bd62f37..9f1c9bad29a 100644
--- a/tests/llvm/test_helpers.py
+++ b/tests/llvm/test_helpers.py
@@ -144,8 +144,8 @@ def test_helper_is_close(mode, var1, var2, rtol, atol, fp_type):
 
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
 
-    vec1 = np.atleast_1d(np.asfarray(var1, dtype=bin_f.np_params[0].base))
-    vec2 = np.atleast_1d(np.asfarray(var2, dtype=bin_f.np_params[1].base))
+    vec1 = np.atleast_1d(np.asfarray(var1, dtype=bin_f.np_arg_dtypes[0].base))
+    vec2 = np.atleast_1d(np.asfarray(var2, dtype=bin_f.np_arg_dtypes[1].base))
     assert len(vec1) == len(vec2)
 
     res = np.empty_like(vec2)
@@ -442,7 +442,7 @@ def test_helper_numerical(mode, op, var, expected, fp_type):
 
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0,))
 
-    res = np.asfarray(var, dtype=bin_f.np_params[0])
+    res = np.asfarray(var, dtype=bin_f.np_arg_dtypes[0])
 
     if mode == 'CPU':
         bin_f(res)
@@ -475,7 +475,7 @@ def test_helper_elementwise_op(mode, var, expected):
 
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1))
 
-    vec = np.asfarray(var, dtype=bin_f.np_params[0].base)
+    vec = np.asfarray(var, dtype=bin_f.np_arg_dtypes[0].base)
     res = bin_f.np_buffer_for_arg(1)
 
     if mode == 'CPU':
@@ -521,8 +521,8 @@ def test_helper_recursive_iterate_arrays(mode, var1, var2, expected):
 
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1, 2))
 
-    vec1 = np.asfarray(var1, dtype=bin_f.np_params[0].base)
-    vec2 = np.asfarray(var2, dtype=bin_f.np_params[0].base)
+    vec1 = np.asfarray(var1, dtype=bin_f.np_arg_dtypes[0].base)
+    vec2 = np.asfarray(var2, dtype=bin_f.np_arg_dtypes[0].base)
     res = bin_f.np_buffer_for_arg(1)
 
     if mode == 'CPU':
         bin_f(vec1, vec2, res)
@@ -558,7 +558,7 @@ def test_helper_convert_fp_type(t1, t2, mode, val):
 
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1))
 
     # Get the argument numpy dtype
-    np_dt1, np_dt2 = (np.dtype(bin_f.np_params[i]) for i in (0, 1))
+    np_dt1, np_dt2 = (np.dtype(bin_f.np_arg_dtypes[i]) for i in (0, 1))
 
     # instantiate value, result and reference
     x = np.asfarray(val, dtype=np_dt1)

From 45d8ccd2469358e489f5e9bd4e426250e0f2ebc1 Mon Sep 17 00:00:00 2001
From: Jan Vesely
Date: Tue, 6 Aug 2024 11:34:38 -0400
Subject: [PATCH 10/10] llvm: Use NumPy ndpointer by default

The ctype_ptr_args argument can be used to force the use of ctypes
pointers for dynamically sized arguments.

Signed-off-by: Jan Vesely
---
 .../nonstateful/optimizationfunctions.py  |  2 +-
 psyneulink/core/llvm/__init__.py          | 20 ++++++------
 psyneulink/core/llvm/execution.py         | 16 +++++-----
 tests/llvm/test_builtins_matrix.py        |  2 +-
 tests/llvm/test_builtins_mt_random.py     | 16 +++++-----
 tests/llvm/test_builtins_philox_random.py | 24 +++++++-------
 tests/llvm/test_builtins_vector.py        |  6 ++--
 tests/llvm/test_compile.py                | 10 +++---
 tests/llvm/test_custom_func.py            | 10 +++---
 tests/llvm/test_helpers.py                | 31 +++++++++-----------
 10 files changed, 66 insertions(+), 71 deletions(-)
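A note on the resulting calling convention (a minimal sketch, not part of
the patch itself; it mirrors tests/llvm/test_builtins_vector.py and assumes
__pnl_builtin_vec_sum takes an input pointer, an element count, and an
output pointer):

    import numpy as np
    import psyneulink.core.llvm as pnlvm

    # The input vector is dynamically sized (its length is the second
    # argument), so it is forced back to a raw ctypes pointer; the scalar
    # output keeps the new default ndpointer treatment and accepts a plain
    # numpy buffer.
    bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", ctype_ptr_args=(0,))

    u = np.random.rand(8).astype(bin_f.np_arg_dtypes[0].base)
    res = bin_f.np_buffer_for_arg(2)

    ct_u = u.ctypes.data_as(bin_f.c_func.argtypes[0])
    bin_f(ct_u, np.uint32(len(u)), res)

Statically sized pointer arguments get an ndpointer with a fixed dtype and
shape, so ctypes can validate numpy buffers at call time; only dynamically
sized buffers need the ctypes escape hatch.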
diff --git a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py
index dfdce982a52..bc4d323c606 100644
--- a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py
+++ b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py
@@ -2103,7 +2103,7 @@ def _function(self,
         # select_min params are:
         # params, state, min_sample_ptr, sample_ptr, min_value_ptr, value_ptr, opt_count_ptr, count
         min_tags = frozenset({"select_min", "evaluate_type_objective"})
-        bin_func = pnlvm.LLVMBinaryFunction.from_obj(self, tags=min_tags, numpy_args=(2, 4, 6))
+        bin_func = pnlvm.LLVMBinaryFunction.from_obj(self, tags=min_tags, ctype_ptr_args=(0, 1, 3, 5))
 
         ct_param = bin_func.byref_arg_types[0](*self._get_param_initializer(context))
         ct_state = bin_func.byref_arg_types[1](*self._get_state_initializer(context))
diff --git a/psyneulink/core/llvm/__init__.py b/psyneulink/core/llvm/__init__.py
index 7976f1505ed..5a9788102f4 100644
--- a/psyneulink/core/llvm/__init__.py
+++ b/psyneulink/core/llvm/__init__.py
@@ -123,7 +123,7 @@ def _llvm_build(target_generation=_binary_generation + 1):
 
 class LLVMBinaryFunction:
-    def __init__(self, name: str, *, numpy_args=()):
+    def __init__(self, name: str, *, ctype_ptr_args=()):
         self.name = name
 
         self.__c_func = None
@@ -143,16 +143,18 @@ def __init__(self, name: str, *, numpy_args=()):
         # Create ctype function instance
         start = time.perf_counter()
         return_type = _convert_llvm_ir_to_ctype(f.return_value.type)
+
+        self.np_arg_dtypes = [_convert_llvm_ir_to_dtype(getattr(a.type, "pointee", a.type)) for a in f.args]
+
         args = [_convert_llvm_ir_to_ctype(a.type) for a in f.args]
 
         # '_type_' special attribute stores pointee type for pointers
         # https://docs.python.org/3/library/ctypes.html#ctypes._Pointer._type_
         self.byref_arg_types = [a._type_ if hasattr(a, "contents") else None for a in args]
-        self.np_arg_dtypes = [_convert_llvm_ir_to_dtype(getattr(a.type, "pointee", a.type)) for a in f.args]
 
-        for a in numpy_args:
-            assert self.byref_arg_types[a] is not None
-            args[a] = np.ctypeslib.ndpointer(dtype=self.np_arg_dtypes[a].base, shape=self.np_arg_dtypes[a].shape)
+        for i, arg in enumerate(self.np_arg_dtypes):
+            if i not in ctype_ptr_args and self.byref_arg_types[i] is not None:
+                args[i] = np.ctypeslib.ndpointer(dtype=arg.base, shape=arg.shape)
 
         middle = time.perf_counter()
         self.__c_func_type = ctypes.CFUNCTYPE(return_type, *args)
@@ -231,14 +233,14 @@ def np_buffer_for_arg(self, arg_num, *, extra_dimensions=(), fill_value=np.nan):
 
     @staticmethod
     @functools.lru_cache(maxsize=32)
-    def from_obj(obj, *, tags:frozenset=frozenset(), numpy_args:tuple=()):
+    def from_obj(obj, *, tags:frozenset=frozenset(), ctype_ptr_args:tuple=()):
         name = LLVMBuilderContext.get_current().gen_llvm_function(obj, tags=tags).name
-        return LLVMBinaryFunction.get(name, numpy_args=numpy_args)
+        return LLVMBinaryFunction.get(name, ctype_ptr_args=ctype_ptr_args)
 
     @staticmethod
     @functools.lru_cache(maxsize=32)
-    def get(name: str, *, numpy_args:tuple=()):
-        return LLVMBinaryFunction(name, numpy_args=numpy_args)
+    def get(name: str, *, ctype_ptr_args:tuple=()):
+        return LLVMBinaryFunction(name, ctype_ptr_args=ctype_ptr_args)
 
 
 _cpu_engine = None
diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py
index 786f2feb6bc..c49c801f0b0 100644
--- a/psyneulink/core/llvm/execution.py
+++ b/psyneulink/core/llvm/execution.py
@@ -258,7 +258,7 @@ class FuncExecution(CUDAExecution):
 
     def __init__(self, component, execution_id=None, *, tags=frozenset()):
         super().__init__()
-        self._bin_func = pnlvm.LLVMBinaryFunction.from_obj(component, tags=tags, numpy_args=(0, 1, 2, 3))
+        self._bin_func = pnlvm.LLVMBinaryFunction.from_obj(component, tags=tags)
         self._execution_context = Context(execution_id=execution_id)
         self._component = component
 
@@ -355,9 +355,7 @@ def _set_bin_node(self, node):
         assert node in self._composition._all_nodes
         node_assembly = builder_context.LLVMBuilderContext.get_current().get_node_assembly(self._composition, node)
-        self.__bin_func = pnlvm.LLVMBinaryFunction.from_obj(node_assembly,
-                                                            tags=self.__tags.union({"node_assembly"}),
-                                                            numpy_args=(0, 1, 2, 3, 4))
+        self.__bin_func = pnlvm.LLVMBinaryFunction.from_obj(node_assembly, tags=self.__tags.union({"node_assembly"}))
 
     @property
     def _conditions(self):
@@ -503,8 +501,7 @@ def execute_node(self, node, inputs=None):
     @property
     def _bin_exec_func(self):
         if self.__bin_exec_func is None:
-            self.__bin_exec_func = pnlvm.LLVMBinaryFunction.from_obj(
-                self._composition, tags=self.__tags, numpy_args=(0, 1, 2, 3, 4))
+            self.__bin_exec_func = pnlvm.LLVMBinaryFunction.from_obj(self._composition, tags=self.__tags)
 
         return self.__bin_exec_func
 
@@ -558,8 +555,9 @@ def _get_generator_run_input_struct(self, inputs, runs):
     @property
     def _bin_run_func(self):
         if self.__bin_run_func is None:
-            self.__bin_run_func = pnlvm.LLVMBinaryFunction.from_obj(
-                self._composition, tags=self.__tags.union({"run"}), numpy_args=(0, 1, 2, 5, 6))
+            self.__bin_run_func = pnlvm.LLVMBinaryFunction.from_obj(self._composition,
+                                                                    tags=self.__tags.union({"run"}),
+                                                                    ctype_ptr_args=(3, 4))
 
         return self.__bin_run_func
 
@@ -620,7 +618,7 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results):
         eval_type = "evaluate_type_all_results" if all_results else "evaluate_type_objective"
         tags = {"evaluate", "alloc_range", eval_type}
 
-        bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags), numpy_args=(0, 1, 6, 7))
+        bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags), ctype_ptr_args=(4, 5))
         self.__bin_func = bin_func
 
         # There are 8 arguments to evaluate_alloc_range:
diff --git a/tests/llvm/test_builtins_matrix.py b/tests/llvm/test_builtins_matrix.py
index f2c50bf576f..9280eb0db98 100644
--- a/tests/llvm/test_builtins_matrix.py
+++ b/tests/llvm/test_builtins_matrix.py
@@ -80,7 +80,7 @@ def ex():
         else:
             func_name = builtin
 
-        bin_f = pnlvm.LLVMBinaryFunction.get(func_name)
+        bin_f = pnlvm.LLVMBinaryFunction.get(func_name, ctype_ptr_args=(0, 1, 2, 3, 4))
         lx, ly, lres = _numpy_args(bin_f)
 
         ct_x = lx.ctypes.data_as(bin_f.c_func.argtypes[0])
diff --git a/tests/llvm/test_builtins_mt_random.py b/tests/llvm/test_builtins_mt_random.py
index 28082e2d7e8..d8c0f51d1ce 100644
--- a/tests/llvm/test_builtins_mt_random.py
+++ b/tests/llvm/test_builtins_mt_random.py
@@ -27,12 +27,12 @@ def f():
             return state.randint(0xffffffff, dtype=np.int64)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_int32', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_int32')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -84,11 +84,11 @@ def f():
             return state.random_sample()
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_double', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_double')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -133,11 +133,11 @@ def f():
             return state.normal()
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_normal', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_normal')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -191,11 +191,11 @@ def f():
             return state.binomial(n, p)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial', numpy_args=(0, 1, 2, 3))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial')
         n = np.asarray(n, dtype=gen_fun.np_arg_dtypes[1])
         p = np.asarray(p, dtype=gen_fun.np_arg_dtypes[2])
diff --git a/tests/llvm/test_builtins_philox_random.py b/tests/llvm/test_builtins_philox_random.py
index 2466ea4f6d2..af9f4228d71 100644
--- a/tests/llvm/test_builtins_philox_random.py
+++ b/tests/llvm/test_builtins_philox_random.py
@@ -26,11 +26,11 @@ def f():
             return prng.integers(0xffffffffffffffff, dtype=np.uint64, endpoint=True)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, seed)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int64', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int64')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -76,11 +76,11 @@ def f():
             return prng.integers(0xffffffff, dtype=np.uint32, endpoint=True)
 
    elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int32', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int32')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -124,11 +124,11 @@ def f():
             return prng.random(dtype=np.float64)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_double', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_double')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -171,11 +171,11 @@ def f():
             return prng.random(dtype=np.float32)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_float', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_float')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -224,11 +224,11 @@ def f():
             return prng.standard_normal(dtype=dtype)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_normal', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_normal')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -322,11 +322,11 @@ def f():
             return prng.binomial(n, p)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial', numpy_args=(0, 1, 2, 3))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial')
         n = np.asarray(n, dtype=gen_fun.np_arg_dtypes[1])
         p = np.asarray(p, dtype=gen_fun.np_arg_dtypes[2])
diff --git a/tests/llvm/test_builtins_vector.py b/tests/llvm/test_builtins_vector.py
index 9a806bde911..70ced0e8864 100644
--- a/tests/llvm/test_builtins_vector.py
+++ b/tests/llvm/test_builtins_vector.py
@@ -40,7 +40,7 @@ def ex():
             return op(u, v)
 
     elif func_mode == 'LLVM':
-        bin_f = pnlvm.LLVMBinaryFunction.get(builtin)
+        bin_f = pnlvm.LLVMBinaryFunction.get(builtin, ctype_ptr_args=(0, 1, 3))
         lu, lv, lres = _numpy_args(bin_f)
 
         ct_u = lu.ctypes.data_as(bin_f.c_func.argtypes[0])
@@ -75,7 +75,7 @@ def ex():
             return np.sum(u)
 
     elif func_mode == 'LLVM':
-        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))
+        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", ctype_ptr_args=(0,))
 
         np_u = u.astype(bin_f.np_arg_dtypes[0])
         np_res = bin_f.np_buffer_for_arg(2)
@@ -87,7 +87,7 @@
 
     elif func_mode == 'PTX':
-        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))
+        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum")
 
         np_u = u.astype(bin_f.np_arg_dtypes[0])
         np_res = bin_f.np_buffer_for_arg(2)
 
         cuda_u = pnlvm.jit_engine.pycuda.driver.In(np_u)
diff --git a/tests/llvm/test_compile.py b/tests/llvm/test_compile.py
index 71c8526e2bc..4a1cff96317 100644
--- a/tests/llvm/test_compile.py
+++ b/tests/llvm/test_compile.py
@@ -10,7 +10,7 @@
 @pytest.mark.llvm
 def test_recompile():
     # The original builtin mxv function
-    bin_f = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
+    bin_f = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm', ctype_ptr_args=(0, 1, 4))
 
     vector = np.random.rand(DIM_X).astype(bin_f.np_arg_dtypes[0].base)
     matrix = np.random.rand(DIM_X, DIM_Y).astype(bin_f.np_arg_dtypes[1].base)
@@ -24,7 +24,7 @@ def test_recompile():
     orig_res = np.empty_like(llvm_res)
     ct_res = orig_res.ctypes.data_as(bin_f.c_func.argtypes[4])
 
-    bin_f.c_func(ct_vec, ct_mat, x, y, ct_res)
+    bin_f(ct_vec, ct_mat, x, y, ct_res)
 
     # Rebuild and try again
     # This is not a public API
@@ -33,15 +33,15 @@ def test_recompile():
     rebuild_res = np.empty_like(llvm_res)
     ct_res = rebuild_res.ctypes.data_as(bin_f.c_func.argtypes[4])
 
-    bin_f.c_func(ct_vec, ct_mat, x, y, ct_res)
+    bin_f(ct_vec, ct_mat, x, y, ct_res)
 
     assert np.array_equal(orig_res, rebuild_res)
 
     # Get a new pointer
-    bin_f2 = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
+    bin_f2 = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm', ctype_ptr_args=(0, 1, 4))
 
     new_res = np.empty_like(llvm_res)
     ct_res = new_res.ctypes.data_as(bin_f2.c_func.argtypes[4])
 
-    bin_f2.c_func(ct_vec, ct_mat, x, y, ct_res)
+    bin_f2(ct_vec, ct_mat, x, y, ct_res)
 
     assert np.array_equal(rebuild_res, new_res)
 
     callable_res = np.empty_like(llvm_res)
diff --git a/tests/llvm/test_custom_func.py b/tests/llvm/test_custom_func.py
index d15e65146ce..87936eb54e0 100644
--- a/tests/llvm/test_custom_func.py
+++ b/tests/llvm/test_custom_func.py
@@ -35,14 +35,12 @@ def test_integer_broadcast(mode, val):
         builder.ret_void()
 
     binf = pnlvm.LLVMBinaryFunction.get(custom_name)
-    res = np.zeros(8, dtype=val.dtype)
+    val = np.asarray(val)
+    res = binf.np_buffer_for_arg(1)
     if mode == 'CPU':
-        ct_res = np.ctypeslib.as_ctypes(res)
-        ct_in = np.ctypeslib.as_ctypes(val)
-
-        binf(ctypes.byref(ct_in), ctypes.byref(ct_res))
+        binf(val, res)
     else:
-        binf.cuda_wrap_call(np.asarray(val), res)
+        binf.cuda_wrap_call(val, res)
 
     assert all(res == np.broadcast_to(val + 1, 8))
diff --git a/tests/llvm/test_helpers.py b/tests/llvm/test_helpers.py
index 9f1c9bad29a..00696744eb1 100644
--- a/tests/llvm/test_helpers.py
+++ b/tests/llvm/test_helpers.py
@@ -45,7 +45,7 @@ def test_helper_fclamp(mode):
     ref = np.clip(VECTOR, TST_MIN, TST_MAX)
     bounds = np.asfarray([TST_MIN, TST_MAX])
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, ctype_ptr_args=(0, 2))
     local_vec = VECTOR.copy()
 
     if mode == 'CPU':
@@ -86,7 +86,7 @@ def test_helper_fclamp_const(mode):
     local_vec = VECTOR.copy()
     ref = np.clip(VECTOR, TST_MIN, TST_MAX)
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, ctype_ptr_args=(0,))
 
     if mode == 'CPU':
         ct_vec = local_vec.ctypes.data_as(bin_f.c_func.argtypes[0])
@@ -118,8 +118,7 @@ def test_helper_is_close(mode, var1, var2, rtol, atol, fp_type):
 
     with pnlvm.LLVMBuilderContext.get_current() as ctx:
         float_ptr_ty = ctx.float_ty.as_pointer()
-        func_ty = ir.FunctionType(ir.VoidType(), [float_ptr_ty, float_ptr_ty,
-                                                  float_ptr_ty, ctx.int32_ty])
+        func_ty = ir.FunctionType(ir.VoidType(), [float_ptr_ty, float_ptr_ty, float_ptr_ty, ctx.int32_ty])
 
         custom_name = ctx.get_unique_name("is_close")
         function = ir.Function(ctx.module, func_ty, name=custom_name)
@@ -135,14 +134,12 @@
             val2 = b1.load(val2_ptr)
             close = pnlvm.helpers.is_close(ctx, b1, val1, val2, **tolerance)
             out_ptr = b1.gep(out, [index])
-            out_val = b1.select(close, val1.type(1), val1.type(0))
-            res = b1.select(close, out_ptr.type.pointee(1),
-                            out_ptr.type.pointee(0))
+            out_val = b1.select(close, out_ptr.type.pointee(1), out_ptr.type.pointee(0))
             b1.store(out_val, out_ptr)
 
         builder.ret_void()
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, ctype_ptr_args=(0, 1, 2))
 
     vec1 = np.atleast_1d(np.asfarray(var1, dtype=bin_f.np_arg_dtypes[0].base))
     vec2 = np.atleast_1d(np.asfarray(var2, dtype=bin_f.np_arg_dtypes[1].base))
@@ -200,7 +197,7 @@ def test_helper_all_close(mode, var1, var2, atol, rtol):
     ref = np.allclose(vec1, vec2, **tolerance)
     res = np.array(5, dtype=np.uint32)
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1, 2))
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
 
     if mode == 'CPU':
         bin_f(vec1, vec2, res)
@@ -440,7 +437,7 @@ def test_helper_numerical(mode, op, var, expected, fp_type):
 
     builder.ret_void()
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0,))
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
 
     res = np.asfarray(var, dtype=bin_f.np_arg_dtypes[0])
 
@@ -473,7 +470,7 @@ def test_helper_elementwise_op(mode, var, expected):
                                          lambda ctx, builder, x: builder.fadd(x.type(1.0), x), out)
     builder.ret_void()
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1))
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
 
     vec = np.asfarray(var, dtype=bin_f.np_arg_dtypes[0].base)
     res = bin_f.np_buffer_for_arg(1)
@@ -519,11 +516,11 @@ def test_helper_recursive_iterate_arrays(mode, var1, var2, expected):
 
     builder.ret_void()
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1, 2))
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
 
     vec1 = np.asfarray(var1, dtype=bin_f.np_arg_dtypes[0].base)
-    vec2 = np.asfarray(var2, dtype=bin_f.np_arg_dtypes[0].base)
-    res = bin_f.np_buffer_for_arg(1)
+    vec2 = np.asfarray(var2, dtype=bin_f.np_arg_dtypes[1].base)
+    res = bin_f.np_buffer_for_arg(2)
 
     if mode == 'CPU':
         bin_f(vec1, vec2, res)
@@ -555,14 +552,14 @@ def test_helper_convert_fp_type(t1, t2, mode, val):
         builder.store(conv_x, y)
         builder.ret_void()
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1))
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
 
     # Get the argument numpy dtype
     np_dt1, np_dt2 = (np.dtype(bin_f.np_arg_dtypes[i]) for i in (0, 1))
 
     # instantiate value, result and reference
-    x = np.asfarray(val, dtype=np_dt1)
-    y = np.asfarray(0, dtype=np_dt2)
+    x = np.asfarray(val, dtype=bin_f.np_arg_dtypes[0])
+    y = bin_f.np_buffer_for_arg(1)
     ref = x.astype(np_dt2)
 
     if mode == 'CPU':