From b90adc04c8e2cceeb6d2d03354d38563197c9100 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 17:36:05 -0400 Subject: [PATCH 01/10] llvm: Remove support for running multiple contexts Execute the code per-context in multiple Python threads instead. Signed-off-by: Jan Vesely --- .../nonstateful/optimizationfunctions.py | 2 +- psyneulink/core/llvm/__init__.py | 10 - psyneulink/core/llvm/codegen.py | 50 ---- psyneulink/core/llvm/execution.py | 258 +++++------------- tests/llvm/test_multiple_executions.py | 235 ---------------- 5 files changed, 62 insertions(+), 493 deletions(-) delete mode 100644 tests/llvm/test_multiple_executions.py diff --git a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py index 5ff6359225e..eee98a83d2d 100644 --- a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py +++ b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py @@ -831,7 +831,7 @@ def _is_static(it:SampleIterator): num_evals = np.prod([d._num for d in self.search_space]) # Map allocations to values - comp_exec = pnlvm.execution.CompExecution(ocm.agent_rep, [context.execution_id]) + comp_exec = pnlvm.execution.CompExecution(ocm.agent_rep, context.execution_id) execution_mode = ocm.parameters.comp_execution_mode._get(context) if execution_mode == "PTX": outcomes = comp_exec.cuda_evaluate(inputs, num_inputs_sets, num_evals, get_results) diff --git a/psyneulink/core/llvm/__init__.py b/psyneulink/core/llvm/__init__.py index 568ef7ec910..1a921470690 100644 --- a/psyneulink/core/llvm/__init__.py +++ b/psyneulink/core/llvm/__init__.py @@ -245,16 +245,6 @@ def from_obj(obj, *, tags:frozenset=frozenset(), numpy_args:tuple=()): def get(name: str, *, numpy_args:tuple=()): return LLVMBinaryFunction(name, numpy_args=numpy_args) - def get_multi_run(self, *, numpy_args=()): - try: - multirun_llvm = _find_llvm_function(self.name + "_multirun") - except ValueError: - function = _find_llvm_function(self.name) - with LLVMBuilderContext.get_current() as ctx: - multirun_llvm = codegen.gen_multirun_wrapper(ctx, function) - - return LLVMBinaryFunction.get(multirun_llvm.name, numpy_args=numpy_args) - _cpu_engine = None _ptx_engine = None diff --git a/psyneulink/core/llvm/codegen.py b/psyneulink/core/llvm/codegen.py index df792ce5fe9..cd14fadc52e 100644 --- a/psyneulink/core/llvm/codegen.py +++ b/psyneulink/core/llvm/codegen.py @@ -1119,56 +1119,6 @@ def gen_composition_run(ctx, composition, *, tags:frozenset): return llvm_func -def gen_multirun_wrapper(ctx, function: ir.Function) -> ir.Function: - if function.module is not ctx.module: - function = ir.Function(ctx.module, function.type.pointee, function.name) - assert function.is_declaration - - args = [a.type for a in function.args] - args.append(ctx.int32_ty.as_pointer()) - multirun_ty = ir.FunctionType(function.type.pointee.return_type, args) - multirun_f = ir.Function(ctx.module, multirun_ty, function.name + "_multirun") - block = multirun_f.append_basic_block(name="entry") - builder = ir.IRBuilder(block) - - multi_runs = builder.load(multirun_f.args[-1]) - # Runs need special handling. data_in and data_out are one dimensional, - # but hold entries for all parallel invocations. 
- is_comp_run = len(function.args) == 7 - if is_comp_run: - trials_count = builder.load(multirun_f.args[5]) - input_count = builder.load(multirun_f.args[6]) - - with helpers.for_loop_zero_inc(builder, multi_runs, "multi_run_loop") as (b, index): - # Index all pointer arguments - indexed_args = [] - for i, arg in enumerate(multirun_f.args[:-1]): - # Don't adjust #inputs and #trials - if isinstance(arg.type, ir.PointerType): - offset = index - # #runs and #trials needs to be the same for every invocation - if is_comp_run and i >= 5: - offset = ctx.int32_ty(0) - # Reset trial count for every invocation. - # Previous runs might have finished earlier - if i == 5: - builder.store(trials_count, arg) - # data arrays need special handling - elif is_comp_run and i == 4: # data_out - offset = b.mul(index, trials_count) - elif is_comp_run and i == 3: # data_in - offset = b.mul(index, input_count) - - arg = b.gep(arg, [offset]) - - indexed_args.append(arg) - - b.call(function, indexed_args) - - builder.ret_void() - return multirun_f - - def gen_autodiffcomp_exec(ctx, composition, *, tags:frozenset): """Creates llvm bin execute for autodiffcomp""" assert composition.controller is None diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index f90919b97bc..60a7967e10f 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -69,20 +69,15 @@ def _get_compilation_param(self, name, init_method, arg): if saved is None: struct_ty = self._bin_func.byref_arg_types[arg] init_f = getattr(self._obj, init_method) - if len(self._execution_contexts) > 1: - struct_ty = struct_ty * len(self._execution_contexts) - init_start = time.time() - initializer = (init_f(ex) for ex in self._execution_contexts) - else: - init_start = time.time() - initializer = init_f(self._execution_contexts[0]) + init_start = time.time() + initializer = init_f(self._execution_context) init_end = time.time() struct = struct_ty(*initializer) struct_end = time.time() # numpy "frombuffer" creates a shared memory view of the provided buffer - numpy_struct = np.frombuffer(struct, dtype=self._bin_func.np_params[arg], count=len(self._execution_contexts)) + numpy_struct = np.frombuffer(struct, dtype=self._bin_func.np_params[arg], count=1) assert numpy_struct.nbytes == ctypes.sizeof(struct), \ "Size mismatch ({}), numpy: {} vs. 
ctypes:{}".format(name, numpy_struct.nbytes, ctypes.sizeof(struct)) @@ -101,21 +96,19 @@ def _get_compilation_param(self, name, init_method, arg): _pretty_size(ctypes.sizeof(struct_ty)), ")", "for", self._obj.name) - if len(self._execution_contexts) == 1: - - numpy_struct.shape = () + numpy_struct.shape = () - if name == '_state': - self._copy_params_to_pnl(self._execution_contexts[0], - self._obj, - numpy_struct, - "llvm_state_ids") + if name == '_state': + self._copy_params_to_pnl(self._execution_context, + self._obj, + numpy_struct, + "llvm_state_ids") - elif name == '_param': - self._copy_params_to_pnl(self._execution_contexts[0], - self._obj, - numpy_struct, - "llvm_param_ids") + elif name == '_param': + self._copy_params_to_pnl(self._execution_context, + self._obj, + numpy_struct, + "llvm_param_ids") return saved @@ -228,11 +221,6 @@ def __init__(self, buffers=['param_struct', 'state_struct']): # Initialize GPU buffer map self._gpu_buffers = {"_" + b: None for b in buffers} - @property - def _bin_func_multirun(self): - # CUDA uses the same function for single and multi run - return self._bin_func - def __get_cuda_arg(self, struct_name, arg_handler): gpu_buffer = self._gpu_buffers[struct_name] @@ -269,37 +257,25 @@ def cuda_execute(self, variable): new_var = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) data_in = jit_engine.pycuda.driver.In(new_var) - extra_dims = (len(self._execution_contexts),) if len(self._execution_contexts) > 1 else () - data_out = self._bin_func.np_buffer_for_arg(3, extra_dimensions=extra_dims) + data_out = self._bin_func.np_buffer_for_arg(3) self._bin_func.cuda_call(self._cuda_param_struct, self._cuda_state_struct, data_in, - jit_engine.pycuda.driver.Out(data_out), - threads=len(self._execution_contexts)) + jit_engine.pycuda.driver.Out(data_out)) return self._get_indexable(data_out) class FuncExecution(CUDAExecution): - def __init__(self, component, execution_ids=[None], *, tags=frozenset()): + def __init__(self, component, execution_id=None, *, tags=frozenset()): super().__init__() self._bin_func = pnlvm.LLVMBinaryFunction.from_obj(component, tags=tags, numpy_args=(0, 1, 2, 3)) - self._execution_contexts = [ - Context(execution_id=eid) for eid in execution_ids - ] + self._execution_context = Context(execution_id=execution_id) self._component = component - - if len(execution_ids) > 1: - self._bin_multirun = self._bin_func.get_multi_run() - self._ct_len = ctypes.c_int(len(execution_ids)) - - vo_ty = self._bin_func.byref_arg_types[3] * len(execution_ids) - self._ct_vo = vo_ty() - @property def _obj(self): return self._component @@ -315,21 +291,10 @@ def _state_struct(self): def execute(self, variable): new_variable = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) - if len(self._execution_contexts) > 1: - # wrap_call casts the arguments so we only need contiguous data layout - ct_vi = np.ctypeslib.as_ctypes(new_variable) + data_in = new_variable.reshape(self._bin_func.np_params[2].shape) + data_out = self._bin_func.np_buffer_for_arg(3) - self._bin_multirun.wrap_call(self._param_struct[0], - self._state_struct[0], - ct_vi, - self._ct_vo, - self._ct_len) - return _convert_ctype_to_python(self._ct_vo) - else: - data_out = self._bin_func.np_buffer_for_arg(3) - data_in = new_variable.reshape(self._bin_func.np_params[2].shape) - - self._bin_func(self._param_struct[1], self._state_struct[1], data_in, data_out) + self._bin_func(self._param_struct[1], self._state_struct[1], data_in, data_out) return self._get_indexable(data_out) @@ -342,26 
+307,19 @@ class CompExecution(CUDAExecution): active_executions = weakref.WeakSet() - def __init__(self, composition, execution_ids=[None], *, additional_tags=frozenset()): + def __init__(self, composition, execution_id, *, additional_tags=frozenset()): super().__init__(buffers=['state_struct', 'param_struct', 'data_struct', 'conditions']) self._composition = composition - self._execution_contexts = [ - Context(execution_id=eid) for eid in execution_ids - ] + self._execution_context = Context(execution_id=execution_id) self.__bin_exec_func = None - self.__bin_exec_multi_func = None self.__bin_func = None self.__bin_run_func = None - self.__bin_run_multi_func = None self.__frozen_values = None self.__tags = frozenset(additional_tags) # Scheduling conditions, only used by "execute" self.__conditions = None - if len(execution_ids) > 1: - self._ct_len = ctypes.c_int(len(execution_ids)) - self.active_executions.add(self) def __del__(self): @@ -376,8 +334,7 @@ def get(composition, context, additional_tags=frozenset()): execution = executions.get(additional_tags, None) if execution is None: - execution = pnlvm.CompExecution(composition, [context.execution_id], - additional_tags=additional_tags) + execution = pnlvm.CompExecution(composition, context.execution_id, additional_tags=additional_tags) executions[additional_tags] = execution return execution @@ -389,7 +346,6 @@ def _obj(self): @property def _bin_func(self): if self.__bin_func is not None: - assert len(self._execution_contexts) == 1 return self.__bin_func if self.__bin_exec_func is not None: return self.__bin_exec_func @@ -398,15 +354,6 @@ def _bin_func(self): assert False, "Binary function not set for execution!" - @property - def _bin_func_multirun(self): - if self.__bin_exec_multi_func is not None: - return self.__bin_exec_multi_func - if self.__bin_run_multi_func is not None: - return self.__bin_run_multi_func - - return super()._bin_func_multirun - def _set_bin_node(self, node): assert node in self._composition._all_nodes node_assembly = builder_context.LLVMBuilderContext.get_current().get_node_assembly(self._composition, node) @@ -419,18 +366,13 @@ def _conditions(self): if self.__conditions is None: gen = helpers.ConditionGenerator(None, self._composition) - if len(self._execution_contexts) > 1: - conditions_ctype = self._bin_func_multirun.byref_arg_types[4] * len(self._execution_contexts) - conditions_initializer = (gen.get_condition_initializer() for _ in self._execution_contexts) - else: - conditions_ctype = self._bin_func.byref_arg_types[4] - conditions_initializer = gen.get_condition_initializer() + conditions_ctype = self._bin_func.byref_arg_types[4] + conditions_initializer = gen.get_condition_initializer() ct_conditions = conditions_ctype(*conditions_initializer) - np_conditions = np.frombuffer(ct_conditions, dtype=self._bin_func.np_params[4], count=len(self._execution_contexts)) + np_conditions = np.frombuffer(ct_conditions, dtype=self._bin_func.np_params[4], count=1) - if len(self._execution_contexts) == 1: - np_conditions.shape = () + np_conditions.shape = () self.__conditions = (ct_conditions, np_conditions) @@ -459,23 +401,6 @@ def _data_struct(self): def _data_struct(self, data_struct): self._data = data_struct - def _extract_node_struct_from_ctype(self, node, data): - # state structure consists of a list of node states, - # followed by a list of projection contexts; get the first one - # parameter structure consists of a list of node parameters, - # followed by a list of projection parameters; get the first one - # 
output structure consists of a list of node outputs, - # followed by a list of nested data structures; get the first one - field_name = data._fields_[0][0] - res_struct = getattr(data, field_name) - - # Get the index into the array of all nodes - index = self._composition._get_node_index(node) - field_name = res_struct._fields_[index][0] - res_struct = getattr(res_struct, field_name) - - return _convert_ctype_to_python(res_struct) - def _extract_node_struct_from_numpy(self, node, data): # state structure consists of a list of node states, # followed by a list of projection contexts; get the first one @@ -494,10 +419,7 @@ def _extract_node_struct_from_numpy(self, node, data): return node_struct.copy().tolist() if node_struct.shape == () else node_struct.copy() def extract_node_struct(self, node, struct): - if len(self._execution_contexts) > 1: - return [self._extract_node_struct_from_ctype(node, struct[0][i]) for i, _ in enumerate(self._execution_contexts)] - else: - return self._extract_node_struct_from_numpy(node, struct[1]) + return self._extract_node_struct_from_numpy(node, struct[1]) def extract_frozen_node_output(self, node): return self.extract_node_struct(node, self.__frozen_values) @@ -525,22 +447,11 @@ def _get_input_struct(self, inputs): # Either node or composition execute. # Read provided input data and parse into an array (generator) - if len(self._execution_contexts) > 1: - assert len(self._execution_contexts) == len(inputs) - - # All execute functions expect inputs to be 3rd param. - ct_input_type = self._bin_func.byref_arg_types[2] * len(self._execution_contexts) - - input_data = (([x] for x in self._composition._build_variable_for_input_CIM(inp)) for inp in inputs) - - ct_input = ct_input_type(*_tupleize(input_data)) - np_input = np.ctypeslib.as_array(ct_input) - else: - ct_input = None - data = self._composition._build_variable_for_input_CIM(inputs) + ct_input = None + data = self._composition._build_variable_for_input_CIM(inputs) - np_input = np.asarray(_tupleize(data), dtype=self._bin_func.np_params[2].base) - np_input = np_input.reshape(self._bin_func.np_params[2].shape) + np_input = np.asarray(_tupleize(data), dtype=self._bin_func.np_params[2].base) + np_input = np_input.reshape(self._bin_func.np_params[2].shape) if "stat" in self._debug_env: print("Input struct size:", _pretty_size(np_input.nbytes), "for", self._composition.name) @@ -555,8 +466,7 @@ def freeze_values(self): def execute_node(self, node, inputs=None): # We need to reconstruct the input dictionary here if it was not provided. # This happens during node execution of nested compositions. - assert len(self._execution_contexts) == 1 - context = self._execution_contexts[0] + context = self._execution_context if inputs is None and node is self._composition.input_CIM: @@ -610,29 +520,14 @@ def _bin_exec_func(self): return self.__bin_exec_func - @property - def _bin_exec_multi_func(self): - if self.__bin_exec_multi_func is None: - self.__bin_exec_multi_func = self._bin_exec_func.get_multi_run() - - return self.__bin_exec_multi_func - def execute(self, inputs): # NOTE: Make sure that input struct generation is inlined. # We need the binary function to be setup for it to work correctly. 
- if len(self._execution_contexts) > 1: - self._bin_exec_multi_func.wrap_call(self._state_struct[0], - self._param_struct[0], - self._get_input_struct(inputs)[0], - self._data_struct[0], - self._conditions[0], - self._ct_len) - else: - self._bin_exec_func(self._state_struct[1], - self._param_struct[1], - self._get_input_struct(inputs)[1], - self._data_struct[1], - self._conditions[1]) + self._bin_exec_func(self._state_struct[1], + self._param_struct[1], + self._get_input_struct(inputs)[1], + self._data_struct[1], + self._conditions[1]) def cuda_execute(self, inputs): # NOTE: Make sure that input struct generation is inlined. @@ -641,8 +536,7 @@ def cuda_execute(self, inputs): self._cuda_param_struct, jit_engine.pycuda.driver.In(self._get_input_struct(inputs)[1]), self._cuda_data_struct, - self._cuda_conditions, - threads=len(self._execution_contexts)) + self._cuda_conditions) # Methods used to accelerate "Run" def _get_run_input_struct(self, inputs, num_input_sets, arg=3): @@ -650,11 +544,9 @@ def _get_run_input_struct(self, inputs, num_input_sets, arg=3): bin_f = self._bin_run_func if arg == 3 else self._bin_func input_type = bin_f.byref_arg_types[arg] - c_input_type = (input_type * num_input_sets) * len(self._execution_contexts) - if len(self._execution_contexts) == 1: - inputs = [inputs] + c_input_type = (input_type * num_input_sets) * 1 + inputs = [inputs] - assert len(inputs) == len(self._execution_contexts) # Extract input for each trial and execution id run_inputs = ((([x] for x in self._composition._build_variable_for_input_CIM({k:v[i] for k,v in inp.items()})) for i in range(num_input_sets)) for inp in inputs) c_inputs = c_input_type(*_tupleize(run_inputs)) @@ -668,7 +560,6 @@ def _get_run_input_struct(self, inputs, num_input_sets, arg=3): return c_inputs def _get_generator_run_input_struct(self, inputs, runs): - assert len(self._execution_contexts) == 1 # Extract input for each trial run_inputs = ((np.atleast_2d(x) for x in self._composition._build_variable_for_input_CIM({k:np.atleast_1d(v) for k,v in inp.items()})) for inp in inputs) run_inputs = _tupleize(run_inputs) @@ -685,13 +576,6 @@ def _bin_run_func(self): return self.__bin_run_func - @property - def _bin_run_multi_func(self): - if self.__bin_run_multi_func is None: - self.__bin_run_multi_func = self._bin_run_func.get_multi_run() - - return self.__bin_run_multi_func - def run(self, inputs, runs=0, num_input_sets=0): if isgenerator(inputs): inputs, runs = self._get_generator_run_input_struct(inputs, runs) @@ -701,8 +585,6 @@ def run(self, inputs, runs=0, num_input_sets=0): inputs = self._get_run_input_struct(inputs, num_input_sets) ct_vo = self._bin_run_func.byref_arg_types[4] * runs - if len(self._execution_contexts) > 1: - ct_vo = ct_vo * len(self._execution_contexts) outputs = ct_vo() @@ -714,32 +596,21 @@ def run(self, inputs, runs=0, num_input_sets=0): runs_count = ctypes.c_uint(runs) input_count = ctypes.c_uint(num_input_sets) - if len(self._execution_contexts) > 1: - self._bin_run_multi_func.wrap_call(self._state_struct[0], - self._param_struct[0], - self._data_struct[0], - inputs, - outputs, - runs_count, - input_count, - self._ct_len) - - return _convert_ctype_to_python(outputs) - else: - # This is only needed for non-generator inputs that are wrapped in an extra context dimension - inputs = ctypes.cast(inputs, self._bin_run_func.c_func.argtypes[3]) - self._bin_run_func(self._state_struct[1], - self._param_struct[1], - self._data_struct[1], - inputs, - outputs, - runs_count, - input_count) + # The cast is only 
needed for non-generator inputs that are wrapped in an extra context dimension + inputs = ctypes.cast(inputs, self._bin_run_func.c_func.argtypes[3]) - # Extract only #trials elements in case the run exited early - assert runs_count.value <= runs, "Composition ran more times than allowed!" - return _convert_ctype_to_python(outputs)[0:runs_count.value] + self._bin_run_func(self._state_struct[1], + self._param_struct[1], + self._data_struct[1], + inputs, + outputs, + runs_count, + input_count) + + # Extract only #trials elements in case the run exited early + assert runs_count.value <= runs, "Composition ran more times than allowed!" + return _convert_ctype_to_python(outputs)[0:runs_count.value] def cuda_run(self, inputs, runs, num_input_sets): # Create input buffer @@ -752,13 +623,11 @@ def cuda_run(self, inputs, runs, num_input_sets): # Create output buffer output_type = (self._bin_run_func.byref_arg_types[4] * runs) - if len(self._execution_contexts) > 1: - output_type = output_type * len(self._execution_contexts) ct_out = output_type() # number of trials argument - np_runs = np.full(len(self._execution_contexts), runs, dtype=np.int32) + np_runs = np.asarray(runs, dtype=np.int32).copy() self._bin_run_func.cuda_call(self._cuda_state_struct, self._cuda_param_struct, @@ -766,20 +635,15 @@ def cuda_run(self, inputs, runs, num_input_sets): jit_engine.pycuda.driver.In(np.ctypeslib.as_array(ct_inputs)), # input jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(ct_out)), # output jit_engine.pycuda.driver.InOut(np_runs), # runs - jit_engine.pycuda.driver.In(np.int32(num_input_sets)), # number of inputs - threads=len(self._execution_contexts)) + jit_engine.pycuda.driver.In(np.int32(num_input_sets))) # number of inputs - assert all(np_runs <= runs), "Composition ran more times than allowed: {}".format(runs) + assert np_runs <= runs, "Composition ran more times than allowed: {}".format(runs) - if len(self._execution_contexts) > 1: - return _convert_ctype_to_python(ct_out) - else: - # Extract only #trials elements in case the run exited early - return _convert_ctype_to_python(ct_out)[0:np_runs[0]] + # Extract only #trials elements in case the run exited early + return _convert_ctype_to_python(ct_out)[0:np_runs] def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool): ocm = self._composition.controller - assert len(self._execution_contexts) == 1 eval_type = "evaluate_type_all_results" if all_results else "evaluate_type_objective" tags = {"evaluate", "alloc_range", eval_type} @@ -803,7 +667,7 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results # Output ctype out_el_ty = bin_func.byref_arg_types[4] if all_results: - num_trials = ocm.parameters.num_trials_per_estimate.get(self._execution_contexts[0]) + num_trials = ocm.parameters.num_trials_per_estimate.get(self._execution_context) if num_trials is None: num_trials = num_input_sets out_el_ty *= num_trials diff --git a/tests/llvm/test_multiple_executions.py b/tests/llvm/test_multiple_executions.py deleted file mode 100644 index bda26a1db83..00000000000 --- a/tests/llvm/test_multiple_executions.py +++ /dev/null @@ -1,235 +0,0 @@ -import pytest -import psyneulink.core.llvm as pnlvm - -import numpy as np -import psyneulink.core.components.functions.function as Function -import psyneulink.core.components.functions.nonstateful.objectivefunctions as Functions -from psyneulink.core.components.functions.stateful.integratorfunctions import AdaptiveIntegrator -from 
psyneulink.core.components.functions.nonstateful.transferfunctions import Logistic -from psyneulink.core.components.mechanisms.processing.processingmechanism import ProcessingMechanism -from psyneulink.core.components.mechanisms.processing.transfermechanism import TransferMechanism -from psyneulink.core.compositions.composition import Composition -import psyneulink.core.globals.keywords as kw - -SIZE=10 -# Some metrics (CROSS_ENTROPY) don't like 0s -test_var = [np.random.rand(SIZE) + Function.EPSILON, np.random.rand(SIZE) + Function.EPSILON] -v1 = test_var[0] -v2 = test_var[1] -expected = np.linalg.norm(v1 - v2) - -@pytest.mark.multirun -@pytest.mark.function -@pytest.mark.distance_function -@pytest.mark.benchmark -@pytest.mark.parametrize("executions", [1, 10, 100]) -def test_function(benchmark, executions, func_mode): - f = Functions.Distance(default_variable=test_var, metric=kw.EUCLIDEAN) - benchmark.group = "DistanceFunction multirun {}".format(executions) - var = [test_var for _ in range(executions)] if executions > 1 else test_var - if func_mode == 'Python': - e = f.function if executions == 1 else lambda x: [f.function(xi) for xi in x] - elif func_mode == 'LLVM': - e = pnlvm.execution.FuncExecution(f, [None for _ in range(executions)]).execute - elif func_mode == 'PTX': - e = pnlvm.execution.FuncExecution(f, [None for _ in range(executions)]).cuda_execute - - res = benchmark(e, var) - np.testing.assert_allclose(res, [expected for _ in range(executions)]) - -@pytest.mark.multirun -@pytest.mark.mechanism -@pytest.mark.transfer_mechanism -@pytest.mark.benchmark -@pytest.mark.parametrize("executions", [1, 10, 100]) -def test_mechanism(benchmark, executions, mech_mode): - benchmark.group = "TransferMechanism multirun {}".format(executions) - variable = [0 for _ in range(SIZE)] - T = TransferMechanism( - name='T', - default_variable=variable, - integration_rate=1.0, - noise=-2.0, - integrator_mode=True - ) - var = [[10.0 for _ in range(SIZE)] for _ in range(executions)] - expected = [[8.0 for i in range(SIZE)]] - if mech_mode == 'Python': - e = T.execute if executions == 1 else lambda x : [T.execute(xi) for xi in x] - elif mech_mode == 'LLVM': - e = pnlvm.execution.MechExecution(T, [None for _ in range(executions)]).execute - elif mech_mode == 'PTX': - e = pnlvm.execution.MechExecution(T, [None for _ in range(executions)]).cuda_execute - - if executions > 1: - expected = [expected for _ in range(executions)] - - res = benchmark(e, var) - np.testing.assert_allclose(res, expected) - - -@pytest.mark.multirun -@pytest.mark.nested -@pytest.mark.composition -@pytest.mark.benchmark -@pytest.mark.parametrize("executions", [1, 10, 100]) -@pytest.mark.parametrize("mode", ['Python', - pytest.param('LLVM', marks=pytest.mark.llvm), - pytest.param('PTX', marks=[pytest.mark.llvm, pytest.mark.cuda])]) -def test_nested_composition_execution(benchmark, executions, mode): - benchmark.group = "Nested Composition execution multirun {}".format(executions) - - # mechanisms - A = ProcessingMechanism(name="A", - function=AdaptiveIntegrator(rate=0.1)) - B = ProcessingMechanism(name="B", - function=Logistic) - - inner_comp = Composition(name="inner_comp") - inner_comp.add_linear_processing_pathway([A, B]) - inner_comp._analyze_graph() - - outer_comp = Composition(name="outer_comp") - outer_comp.add_node(inner_comp) - - outer_comp._analyze_graph() - - # The input dict should assign inputs origin nodes (inner_comp in this case) - var = {inner_comp: [[1.0]]} - expected = [[0.52497918747894]] - if executions > 1: - var 
= [var for _ in range(executions)] - - if mode == 'Python': - e = outer_comp.execute if executions == 1 else lambda x : [outer_comp.execute(x[i], context=i) for i in range(executions)] - res = e(var) - benchmark(e, var) - elif mode == 'LLVM': - e = pnlvm.execution.CompExecution(outer_comp, [None for _ in range(executions)]) - e.execute(var) - res = e.extract_node_output(outer_comp.output_CIM) - benchmark(e.execute, var) - elif mode == 'PTX': - e = pnlvm.execution.CompExecution(outer_comp, [None for _ in range(executions)]) - e.cuda_execute(var) - res = e.extract_node_output(outer_comp.output_CIM) - benchmark(e.cuda_execute, var) - else: - assert False, "Unknown mode: {}".format(mode) - - expected = [expected for _ in range(executions)] if executions > 1 else expected - np.testing.assert_allclose(res, expected) - - -@pytest.mark.multirun -@pytest.mark.nested -@pytest.mark.composition -@pytest.mark.benchmark -@pytest.mark.parametrize("executions", [1, 10, 100]) -@pytest.mark.parametrize("mode", ['Python', - pytest.param('LLVM', marks=pytest.mark.llvm), - pytest.param('PTX', marks=[pytest.mark.llvm, pytest.mark.cuda])]) -def test_nested_composition_run(benchmark, executions, mode): - benchmark.group = "Nested Composition multirun {}".format(executions) - - # mechanisms - A = ProcessingMechanism(name="A", - function=AdaptiveIntegrator(rate=0.1)) - B = ProcessingMechanism(name="B", - function=Logistic) - - inner_comp = Composition(name="inner_comp") - inner_comp.add_linear_processing_pathway([A, B]) - inner_comp._analyze_graph() - - outer_comp = Composition(name="outer_comp") - outer_comp.add_node(inner_comp) - - outer_comp._analyze_graph() - - # The input dict should assign inputs origin nodes (inner_comp in this case) - var = {inner_comp: [[[2.0]]]} - expected = [[[0.549833997312478]]] - if executions > 1: - var = [var for _ in range(executions)] - if mode == 'Python': - e = outer_comp.run if executions == 1 else lambda x: [outer_comp.run(x[i], context=i) for i in range(executions)] - res = e(var) - - # Composition.run returns only the result of the last trail, - # unlike results for all trials reported by CompExecution.run below - expected = expected[0] - - benchmark(e, var) - elif mode == 'LLVM': - e = pnlvm.execution.CompExecution(outer_comp, [None for _ in range(executions)]) - res = e.run(var, 1, 1) - benchmark(e.run, var, 1, 1) - elif mode == 'PTX': - e = pnlvm.execution.CompExecution(outer_comp, [None for _ in range(executions)]) - res = e.cuda_run(var, 1, 1) - benchmark(e.cuda_run, var, 1, 1) - else: - assert False, "Unknown mode: {}".format(mode) - - expected = [expected for _ in range(executions)] if executions > 1 else expected - np.testing.assert_allclose(res, expected) - - -@pytest.mark.multirun -@pytest.mark.nested -@pytest.mark.composition -@pytest.mark.benchmark -@pytest.mark.parametrize("executions", [1, 10, 100]) -@pytest.mark.parametrize("mode", [ - 'Python', - pytest.param('LLVM', marks=pytest.mark.llvm), - pytest.param('PTX', marks=[pytest.mark.llvm, pytest.mark.cuda]) -]) -def test_nested_composition_run_trials_inputs(benchmark, executions, mode): - benchmark.group = "Nested Composition mutliple trials/inputs multirun {}".format(executions) - - # mechanisms - A = ProcessingMechanism(name="A", - function=AdaptiveIntegrator(rate=0.1)) - B = ProcessingMechanism(name="B", - function=Logistic) - - inner_comp = Composition(name="inner_comp") - inner_comp.add_linear_processing_pathway([A, B]) - inner_comp._analyze_graph() - - outer_comp = Composition(name="outer_comp") - 
outer_comp.add_node(inner_comp) - - outer_comp._analyze_graph() - - # The input dict should assign inputs origin nodes (inner_comp in this case) - var = {inner_comp: [[[2.0]], [[3.0]]]} - expected = [[[0.549833997312478]], [[0.617747874769249]], [[0.6529428177055896]], [[0.7044959416252289]]] - if executions > 1: - var = [var for _ in range(executions)] - if mode == 'Python': - def f(v, num_trials, copy_results=False): - results = [] - for i in range(executions): - outer_comp.run(v[i], context=i, num_trials=num_trials) - if copy_results: # copy the results immediately, otherwise it's empty - results.append(outer_comp.results.copy()) - return results[0] if len(results) == 1 else results - - res = f(var, 4, True) if executions > 1 else f([var], 4, True) - benchmark(f if executions > 1 else outer_comp.run, var, num_trials=4) - elif mode == 'LLVM': - e = pnlvm.execution.CompExecution(outer_comp, [None for _ in range(executions)]) - res = e.run(var, 4, 2) - benchmark(e.run, var, 4, 2) - elif mode == 'PTX': - e = pnlvm.execution.CompExecution(outer_comp, [None for _ in range(executions)]) - res = e.cuda_run(var, 4, 2) - benchmark(e.cuda_run, var, 4, 2) - else: - assert False, "Unknown mode: {}".format(mode) - - expected = [expected for _ in range(executions)] if executions > 1 else expected - np.testing.assert_allclose(res, expected) From 309730b4ceccfad3fd8aacb557a1923948c6a788 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 18:37:46 -0400 Subject: [PATCH 02/10] llvm/execution: Simplify and remove dead code Simplify run input construction. Do not store references to constructed ctype structures. Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 68 ++++++++++++------------------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index 60a7967e10f..51c9dba203d 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -82,7 +82,7 @@ def _get_compilation_param(self, name, init_method, arg): assert numpy_struct.nbytes == ctypes.sizeof(struct), \ "Size mismatch ({}), numpy: {} vs. 
ctypes:{}".format(name, numpy_struct.nbytes, ctypes.sizeof(struct)) - saved = (struct, numpy_struct) + saved = numpy_struct setattr(self, name, saved) if "time_stat" in self._debug_env: @@ -224,7 +224,7 @@ def __init__(self, buffers=['param_struct', 'state_struct']): def __get_cuda_arg(self, struct_name, arg_handler): gpu_buffer = self._gpu_buffers[struct_name] - np_struct = getattr(self, struct_name)[1] + np_struct = getattr(self, struct_name) # .array is a public member of pycuda's In/Out ArgumentHandler classes if gpu_buffer is None or gpu_buffer.array is not np_struct: @@ -294,7 +294,7 @@ def execute(self, variable): data_in = new_variable.reshape(self._bin_func.np_params[2].shape) data_out = self._bin_func.np_buffer_for_arg(3) - self._bin_func(self._param_struct[1], self._state_struct[1], data_in, data_out) + self._bin_func(self._param_struct, self._state_struct, data_in, data_out) return self._get_indexable(data_out) @@ -374,11 +374,11 @@ def _conditions(self): np_conditions.shape = () - self.__conditions = (ct_conditions, np_conditions) + self.__conditions = np_conditions if "stat" in self._debug_env: print("Instantiated condition struct ( size:" , - _pretty_size(ctypes.sizeof(conditions_ctype)), ")", + _pretty_size(np_conditions.nbytes), ")", "for", self._composition.name) return self.__conditions @@ -401,7 +401,7 @@ def _data_struct(self): def _data_struct(self, data_struct): self._data = data_struct - def _extract_node_struct_from_numpy(self, node, data): + def extract_node_struct(self, node, data): # state structure consists of a list of node states, # followed by a list of projection contexts; get the first one # parameter structure consists of a list of node parameters, @@ -418,9 +418,6 @@ def _extract_node_struct_from_numpy(self, node, data): # returned results in next execution return node_struct.copy().tolist() if node_struct.shape == () else node_struct.copy() - def extract_node_struct(self, node, struct): - return self._extract_node_struct_from_numpy(node, struct[1]) - def extract_frozen_node_output(self, node): return self.extract_node_struct(node, self.__frozen_values) @@ -436,7 +433,7 @@ def extract_node_params(self, node): def insert_node_output(self, node, data): # output structure consists of a list of node outputs, # followed by a list of nested data structures; get the first one - all_nodes = self._data_struct[1][self._data_struct[1].dtype.names[0]] + all_nodes = self._data_struct[self._data_struct.dtype.names[0]] # Get the index into the array of all nodes index = self._composition._get_node_index(node) @@ -447,7 +444,6 @@ def _get_input_struct(self, inputs): # Either node or composition execute. # Read provided input data and parse into an array (generator) - ct_input = None data = self._composition._build_variable_for_input_CIM(inputs) np_input = np.asarray(_tupleize(data), dtype=self._bin_func.np_params[2].base) @@ -456,12 +452,10 @@ def _get_input_struct(self, inputs): if "stat" in self._debug_env: print("Input struct size:", _pretty_size(np_input.nbytes), "for", self._composition.name) - return ct_input, np_input + return np_input def freeze_values(self): - np_copy = self._data_struct[1].copy() - - self.__frozen_values = (None, np_copy) + self.__frozen_values = self._data_struct.copy() def execute_node(self, node, inputs=None): # We need to reconstruct the input dictionary here if it was not provided. @@ -485,7 +479,7 @@ def execute_node(self, node, inputs=None): # Numpy doesn't allow to pass NULL to the called function. 
# Create and pass a dummy buffer filled with NaN instead. if inputs is not None: - inputs = self._get_input_struct(inputs)[1] + inputs = self._get_input_struct(inputs) else: inputs = self._bin_func.np_buffer_for_arg(2) @@ -493,17 +487,13 @@ def execute_node(self, node, inputs=None): # and need frozen values available if node is not self._composition.input_CIM and node is not self._composition.parameter_CIM: assert self.__frozen_values is not None - data_in = self.__frozen_values[1] + data_in = self.__frozen_values else: # The ndarray argument check doesn't allow None for null so just provide # the same structure as outputs. - data_in = self._data_struct[1] + data_in = self._data_struct - self._bin_func(self._state_struct[1], - self._param_struct[1], - inputs, - data_in, - self._data_struct[1]) + self._bin_func(self._state_struct, self._param_struct, inputs, data_in, self._data_struct) if "comp_node_debug" in self._debug_env: print("RAN: {}. State: {}".format(node, self.extract_node_state(node))) @@ -523,18 +513,18 @@ def _bin_exec_func(self): def execute(self, inputs): # NOTE: Make sure that input struct generation is inlined. # We need the binary function to be setup for it to work correctly. - self._bin_exec_func(self._state_struct[1], - self._param_struct[1], - self._get_input_struct(inputs)[1], - self._data_struct[1], - self._conditions[1]) + self._bin_exec_func(self._state_struct, + self._param_struct, + self._get_input_struct(inputs), + self._data_struct, + self._conditions) def cuda_execute(self, inputs): # NOTE: Make sure that input struct generation is inlined. # We need the binary function to be setup for it to work correctly. self._bin_exec_func.cuda_call(self._cuda_state_struct, self._cuda_param_struct, - jit_engine.pycuda.driver.In(self._get_input_struct(inputs)[1]), + jit_engine.pycuda.driver.In(self._get_input_struct(inputs)), self._cuda_data_struct, self._cuda_conditions) @@ -544,11 +534,10 @@ def _get_run_input_struct(self, inputs, num_input_sets, arg=3): bin_f = self._bin_run_func if arg == 3 else self._bin_func input_type = bin_f.byref_arg_types[arg] - c_input_type = (input_type * num_input_sets) * 1 - inputs = [inputs] + c_input_type = (input_type * num_input_sets) # Extract input for each trial and execution id - run_inputs = ((([x] for x in self._composition._build_variable_for_input_CIM({k:v[i] for k,v in inp.items()})) for i in range(num_input_sets)) for inp in inputs) + run_inputs = (([x] for x in self._composition._build_variable_for_input_CIM({k:v[i] for k,v in inputs.items()})) for i in range(num_input_sets)) c_inputs = c_input_type(*_tupleize(run_inputs)) if "stat" in self._debug_env: print("Instantiated struct: input ( size:" , @@ -597,12 +586,9 @@ def run(self, inputs, runs=0, num_input_sets=0): runs_count = ctypes.c_uint(runs) input_count = ctypes.c_uint(num_input_sets) - # The cast is only needed for non-generator inputs that are wrapped in an extra context dimension - inputs = ctypes.cast(inputs, self._bin_run_func.c_func.argtypes[3]) - - self._bin_run_func(self._state_struct[1], - self._param_struct[1], - self._data_struct[1], + self._bin_run_func(self._state_struct, + self._param_struct, + self._data_struct, inputs, outputs, runs_count, @@ -657,9 +643,9 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results # Directly initialized structures assert ocm.agent_rep is self._composition - comp_params = self._get_compilation_param('_eval_param', '_get_param_initializer', 0)[1] - comp_state = 
self._get_compilation_param('_eval_state', '_get_state_initializer', 1)[1] - comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6)[1] + comp_params = self._get_compilation_param('_eval_param', '_get_param_initializer', 0) + comp_state = self._get_compilation_param('_eval_state', '_get_state_initializer', 1) + comp_data = self._get_compilation_param('_eval_data', '_get_data_initializer', 6) # Construct input variable, the 5th parameter of the evaluate function ct_inputs = self._get_run_input_struct(inputs, num_input_sets, 5) From 7ab159935cb8b2a3ebbe989d408b029d42328d14 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 21:08:51 -0400 Subject: [PATCH 03/10] llvm/execution/run: Use numpy structures for input and execution counts Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index 51c9dba203d..f834ac540d3 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -561,7 +561,7 @@ def _get_generator_run_input_struct(self, inputs, runs): def _bin_run_func(self): if self.__bin_run_func is None: self.__bin_run_func = pnlvm.LLVMBinaryFunction.from_obj( - self._composition, tags=self.__tags.union({"run"}), numpy_args=(0, 1, 2)) + self._composition, tags=self.__tags.union({"run"}), numpy_args=(0, 1, 2, 5, 6)) return self.__bin_run_func @@ -583,8 +583,8 @@ def run(self, inputs, runs=0, num_input_sets=0): print("Output struct size:", _pretty_size(ctypes.sizeof(outputs)), "for", self._composition.name) - runs_count = ctypes.c_uint(runs) - input_count = ctypes.c_uint(num_input_sets) + runs_count = np.asarray(runs, dtype=np.uint32).copy() + input_count = np.asarray(num_input_sets, dtype=np.uint32) self._bin_run_func(self._state_struct, self._param_struct, @@ -595,8 +595,8 @@ def run(self, inputs, runs=0, num_input_sets=0): input_count) # Extract only #trials elements in case the run exited early - assert runs_count.value <= runs, "Composition ran more times than allowed!" - return _convert_ctype_to_python(outputs)[0:runs_count.value] + assert runs_count <= runs, "Composition ran more times than allowed!" 
+ return _convert_ctype_to_python(outputs)[0:runs_count] def cuda_run(self, inputs, runs, num_input_sets): # Create input buffer From aa166a6d68ec015453d52fae7c29dd1af87caab2 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 21:22:55 -0400 Subject: [PATCH 04/10] llvm/execution: Consolidate shared code between CPU and GPU 'run' Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 51 ++++++++++++------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index f834ac540d3..cddea113df7 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -565,7 +565,9 @@ def _bin_run_func(self): return self.__bin_run_func - def run(self, inputs, runs=0, num_input_sets=0): + def _prepare_run(self, inputs, runs, num_input_sets): + + # Create input buffer if isgenerator(inputs): inputs, runs = self._get_generator_run_input_struct(inputs, runs) assert num_input_sets == 0 or num_input_sets == sys.maxsize @@ -573,60 +575,47 @@ def run(self, inputs, runs=0, num_input_sets=0): else: inputs = self._get_run_input_struct(inputs, num_input_sets) - ct_vo = self._bin_run_func.byref_arg_types[4] * runs - - outputs = ct_vo() + # Create output buffer + outputs = (self._bin_run_func.byref_arg_types[4] * runs)() if "stat" in self._debug_env: - print("Input struct size:", _pretty_size(ctypes.sizeof(inputs)), - "for", self._composition.name) print("Output struct size:", _pretty_size(ctypes.sizeof(outputs)), "for", self._composition.name) runs_count = np.asarray(runs, dtype=np.uint32).copy() input_count = np.asarray(num_input_sets, dtype=np.uint32) + return inputs, outputs, runs_count, input_count + + def run(self, inputs, runs, num_input_sets): + ct_inputs, ct_outputs, runs_count, input_count = self._prepare_run(inputs, runs, num_input_sets) + self._bin_run_func(self._state_struct, self._param_struct, self._data_struct, - inputs, - outputs, + ct_inputs, + ct_outputs, runs_count, input_count) # Extract only #trials elements in case the run exited early assert runs_count <= runs, "Composition ran more times than allowed!" 
- return _convert_ctype_to_python(outputs)[0:runs_count] + return _convert_ctype_to_python(ct_outputs)[0:runs_count] def cuda_run(self, inputs, runs, num_input_sets): - # Create input buffer - if isgenerator(inputs): - ct_inputs, runs = self._get_generator_run_input_struct(inputs, runs) - assert num_input_sets == 0 or num_input_sets == sys.maxsize - num_input_sets = len(ct_inputs) - else: - ct_inputs = self._get_run_input_struct(inputs, num_input_sets) - - # Create output buffer - output_type = (self._bin_run_func.byref_arg_types[4] * runs) - - ct_out = output_type() - - # number of trials argument - np_runs = np.asarray(runs, dtype=np.int32).copy() + ct_inputs, ct_outputs, runs_count, input_count = self._prepare_run(inputs, runs, num_input_sets) self._bin_run_func.cuda_call(self._cuda_state_struct, self._cuda_param_struct, self._cuda_data_struct, - jit_engine.pycuda.driver.In(np.ctypeslib.as_array(ct_inputs)), # input - jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(ct_out)), # output - jit_engine.pycuda.driver.InOut(np_runs), # runs - jit_engine.pycuda.driver.In(np.int32(num_input_sets))) # number of inputs - - assert np_runs <= runs, "Composition ran more times than allowed: {}".format(runs) + jit_engine.pycuda.driver.In(np.ctypeslib.as_array(ct_inputs)), + jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(ct_outputs)), + jit_engine.pycuda.driver.InOut(runs_count), + jit_engine.pycuda.driver.In(input_count)) # Extract only #trials elements in case the run exited early - return _convert_ctype_to_python(ct_out)[0:np_runs] + assert runs_count <= runs, "Composition ran more times than allowed: {}".format(runs) + return _convert_ctype_to_python(ct_outputs)[0:runs_count] def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool): ocm = self._composition.controller From 286dfcec4738556e970f51c87dcb5e977a0895f1 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 17:44:01 -0400 Subject: [PATCH 05/10] llvm/execution: Consolidate shared code between CPU and GPU 'evaluate' Do not sync back composition state or data. A call to evaluate creates a copy of these structures for each evaluation so the structure content is unchanged. Moreover, the structures are deallocated after the evaluate call anyway. Use Numpy structure for number of inputs Instantiate output buffer in the shared _prepare_evaluate function. 
Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index cddea113df7..8b92e50af1e 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -622,7 +622,7 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results eval_type = "evaluate_type_all_results" if all_results else "evaluate_type_objective" tags = {"evaluate", "alloc_range", eval_type} - bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags), numpy_args=(0, 1, 6)) + bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags), numpy_args=(0, 1, 6, 7)) self.__bin_func = bin_func # There are 8 arguments to evaluate_alloc_range: @@ -648,27 +648,25 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results out_el_ty *= num_trials out_ty = out_el_ty * num_evaluations - ct_num_inputs = bin_func.byref_arg_types[7](num_input_sets) + num_inputs = np.asarray(num_input_sets, dtype=np.uint32) if "stat" in self._debug_env: print("Evaluate result struct type size:", _pretty_size(ctypes.sizeof(out_ty)), "( evaluations:", num_evaluations, "element size:", ctypes.sizeof(out_el_ty), ")", "for", self._obj.name) - return comp_params, comp_state, comp_data, ct_inputs, out_ty, ct_num_inputs + return comp_params, comp_state, comp_data, ct_inputs, out_ty(), num_inputs def cuda_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool=False): - comp_params, comp_state, comp_data, ct_inputs, out_ty, _ = \ + comp_params, comp_state, comp_data, ct_inputs, ct_results, num_inputs = \ self._prepare_evaluate(inputs, num_input_sets, num_evaluations, all_results) - ct_results = out_ty() - cuda_args = (jit_engine.pycuda.driver.In(comp_params), - jit_engine.pycuda.driver.InOut(comp_state), + jit_engine.pycuda.driver.In(comp_state), jit_engine.pycuda.driver.Out(np.ctypeslib.as_array(ct_results)), # results jit_engine.pycuda.driver.In(np.ctypeslib.as_array(ct_inputs)), # inputs - jit_engine.pycuda.driver.InOut(comp_data), # composition data - jit_engine.pycuda.driver.In(np.int32(num_input_sets)), # number of inputs + jit_engine.pycuda.driver.In(comp_data), # composition data + jit_engine.pycuda.driver.In(num_inputs), # number of inputs ) self.__bin_func.cuda_call(*cuda_args, threads=int(num_evaluations)) @@ -676,10 +674,9 @@ def cuda_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:boo return ct_results def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:bool=False): - comp_params, comp_state, comp_data, ct_inputs, out_ty, ct_num_inputs = \ + comp_params, comp_state, comp_data, ct_inputs, ct_results, num_inputs = \ self._prepare_evaluate(inputs, num_input_sets, num_evaluations, all_results) - ct_results = out_ty() jobs = min(os.cpu_count(), num_evaluations) evals_per_job = (num_evaluations + jobs - 1) // jobs @@ -688,11 +685,11 @@ def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:b # Create input and result typed casts once, they are the same # for every submitted job. 
- input_arg = ctypes.cast(ct_inputs, self.__bin_func.c_func.argtypes[5]) results_arg = ctypes.cast(ct_results, self.__bin_func.c_func.argtypes[4]) + input_arg = ctypes.cast(ct_inputs, self.__bin_func.c_func.argtypes[5]) - # There are 7 arguments to evaluate_alloc_range: - # comp_param, comp_state, from, to, results, input, comp_data + # There are 8 arguments to evaluate_alloc_range: + # comp_param, comp_state, from, to, results, input, comp_data, input length results = [ex.submit(self.__bin_func, comp_params, comp_state, @@ -701,7 +698,7 @@ def thread_evaluate(self, inputs, num_input_sets, num_evaluations, all_results:b results_arg, input_arg, comp_data, - ct_num_inputs) + num_inputs) for i in range(jobs)] parallel_stop = time.time() From 4533f621b2ca601f4d541bb6a20eba26a47824c1 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 21:40:45 -0400 Subject: [PATCH 06/10] llvm/execute: Move cuda_execute for Function and Mechanism to FuncExecution There's an overriding implementation in CompExecution. Signed-off-by: Jan Vesely --- psyneulink/core/llvm/execution.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index 8b92e50af1e..67100583b5a 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -252,20 +252,6 @@ def _cuda_data_struct(self): def _cuda_conditions(self): return self.__get_cuda_arg("_conditions", jit_engine.pycuda.driver.InOut) - def cuda_execute(self, variable): - # Create input argument, PyCUDA doesn't care about shape - new_var = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) - data_in = jit_engine.pycuda.driver.In(new_var) - - data_out = self._bin_func.np_buffer_for_arg(3) - - self._bin_func.cuda_call(self._cuda_param_struct, - self._cuda_state_struct, - data_in, - jit_engine.pycuda.driver.Out(data_out)) - - return self._get_indexable(data_out) - class FuncExecution(CUDAExecution): @@ -290,14 +276,26 @@ def _state_struct(self): def execute(self, variable): new_variable = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) - data_in = new_variable.reshape(self._bin_func.np_params[2].shape) + data_out = self._bin_func.np_buffer_for_arg(3) self._bin_func(self._param_struct, self._state_struct, data_in, data_out) return self._get_indexable(data_out) + def cuda_execute(self, variable): + # Create input argument, PyCUDA doesn't care about shape + data_in = np.asfarray(variable, dtype=self._bin_func.np_params[2].base) + data_out = self._bin_func.np_buffer_for_arg(3) + + self._bin_func.cuda_call(self._cuda_param_struct, + self._cuda_state_struct, + jit_engine.pycuda.driver.In(data_in), + jit_engine.pycuda.driver.Out(data_out)) + + return self._get_indexable(data_out) + class MechExecution(FuncExecution): pass From 55cb895c3f98cc7db82f59dfbd2969bdd48ade50 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 22:35:30 -0400 Subject: [PATCH 07/10] llvm/execution: Use Context instance in the CompExecution constructor Signed-off-by: Jan Vesely --- .../functions/nonstateful/optimizationfunctions.py | 2 +- psyneulink/core/llvm/execution.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py index eee98a83d2d..dfdce982a52 100644 --- a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py +++ 
b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py @@ -831,7 +831,7 @@ def _is_static(it:SampleIterator): num_evals = np.prod([d._num for d in self.search_space]) # Map allocations to values - comp_exec = pnlvm.execution.CompExecution(ocm.agent_rep, context.execution_id) + comp_exec = pnlvm.execution.CompExecution(ocm.agent_rep, context) execution_mode = ocm.parameters.comp_execution_mode._get(context) if execution_mode == "PTX": outcomes = comp_exec.cuda_evaluate(inputs, num_inputs_sets, num_evals, get_results) diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py index 67100583b5a..5ead80a2731 100644 --- a/psyneulink/core/llvm/execution.py +++ b/psyneulink/core/llvm/execution.py @@ -305,10 +305,10 @@ class CompExecution(CUDAExecution): active_executions = weakref.WeakSet() - def __init__(self, composition, execution_id, *, additional_tags=frozenset()): + def __init__(self, composition, context:Context, *, additional_tags=frozenset()): super().__init__(buffers=['state_struct', 'param_struct', 'data_struct', 'conditions']) self._composition = composition - self._execution_context = Context(execution_id=execution_id) + self._execution_context = context self.__bin_exec_func = None self.__bin_func = None self.__bin_run_func = None @@ -324,7 +324,7 @@ def __del__(self): self.active_executions.discard(self) @staticmethod - def get(composition, context, additional_tags=frozenset()): + def get(composition, context:Context, additional_tags=frozenset()): executions = composition._compilation_data.execution._get(context) if executions is None: executions = dict() @@ -332,7 +332,7 @@ def get(composition, context, additional_tags=frozenset()): execution = executions.get(additional_tags, None) if execution is None: - execution = pnlvm.CompExecution(composition, context.execution_id, additional_tags=additional_tags) + execution = pnlvm.CompExecution(composition, context, additional_tags=additional_tags) executions[additional_tags] = execution return execution From dd1170c0fadf1bc0cc183fdd2f46f19033b243a2 Mon Sep 17 00:00:00 2001 From: Jan Vesely Date: Mon, 5 Aug 2024 22:50:32 -0400 Subject: [PATCH 08/10] llvm: Remove 'wrap_call' No longer used. 
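A minimal standalone sketch of the direct-call convention that replaced
it (hypothetical example, not the PsyNeuLink API; LLVMBinaryFunction
builds its CFUNCTYPE the same way from numpy_args):

    import ctypes
    import numpy as np

    # Pointer arguments declared as ndpointer accept NumPy arrays
    # directly, so no byref()/cast() wrapping is needed at call time.
    arg_ty = np.ctypeslib.ndpointer(dtype=np.float64, shape=(4,))
    func_ty = ctypes.CFUNCTYPE(None, arg_ty, arg_ty)

    data_in = np.arange(4, dtype=np.float64)
    data_out = np.full(4, np.nan, dtype=np.float64)
    # bin_f = func_ty(jit_address)   # address resolved by the JIT engine
    # bin_f(data_in, data_out)       # direct call on NumPy buffers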
Signed-off-by: Jan Vesely
---
 psyneulink/core/llvm/__init__.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/psyneulink/core/llvm/__init__.py b/psyneulink/core/llvm/__init__.py
index 1a921470690..93b287cd748 100644
--- a/psyneulink/core/llvm/__init__.py
+++ b/psyneulink/core/llvm/__init__.py
@@ -176,11 +176,6 @@ def c_func(self):
     def __call__(self, *args, **kwargs):
         return self.c_func(*args, **kwargs)
 
-    def wrap_call(self, *pargs):
-        cpargs = (ctypes.byref(p) if p is not None else None for p in pargs)
-        args = zip(cpargs, self.c_func.argtypes)
-        self(*(ctypes.cast(p, t) for p, t in args))
-
     @property
     def _cuda_kernel(self):
         if self.__cuda_kernel is None:

From cc8c381432a43185fa69c73ba8155e2e5ea6020c Mon Sep 17 00:00:00 2001
From: Jan Vesely
Date: Tue, 6 Aug 2024 01:00:19 -0400
Subject: [PATCH 09/10] llvm: Rename np_params -> np_arg_dtypes

Signed-off-by: Jan Vesely
---
 psyneulink/core/llvm/__init__.py          |  8 ++++----
 psyneulink/core/llvm/execution.py         | 14 +++++++-------
 tests/llvm/test_builtins_intrinsics.py    |  2 +-
 tests/llvm/test_builtins_matrix.py        |  6 +++---
 tests/llvm/test_builtins_mt_random.py     |  8 ++++----
 tests/llvm/test_builtins_philox_random.py |  8 ++++----
 tests/llvm/test_builtins_vector.py        |  8 ++++----
 tests/llvm/test_compile.py                |  6 +++---
 tests/llvm/test_helpers.py                | 14 +++++++-------
 9 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/psyneulink/core/llvm/__init__.py b/psyneulink/core/llvm/__init__.py
index 93b287cd748..7976f1505ed 100644
--- a/psyneulink/core/llvm/__init__.py
+++ b/psyneulink/core/llvm/__init__.py
@@ -148,11 +148,11 @@ def __init__(self, name: str, *, numpy_args=()):
         # '_type_' special attribute stores pointee type for pointers
         # https://docs.python.org/3/library/ctypes.html#ctypes._Pointer._type_
         self.byref_arg_types = [a._type_ if hasattr(a, "contents") else None for a in args]
-        self.np_params = [_convert_llvm_ir_to_dtype(getattr(a.type, "pointee", a.type)) for a in f.args]
+        self.np_arg_dtypes = [_convert_llvm_ir_to_dtype(getattr(a.type, "pointee", a.type)) for a in f.args]
 
         for a in numpy_args:
             assert self.byref_arg_types[a] is not None
-            args[a] = np.ctypeslib.ndpointer(dtype=self.np_params[a].base, shape=self.np_params[a].shape)
+            args[a] = np.ctypeslib.ndpointer(dtype=self.np_arg_dtypes[a].base, shape=self.np_arg_dtypes[a].shape)
 
         middle = time.perf_counter()
         self.__c_func_type = ctypes.CFUNCTYPE(return_type, *args)
@@ -223,8 +223,8 @@ def cuda_wrap_call(self, *args, **kwargs):
 
     def np_buffer_for_arg(self, arg_num, *, extra_dimensions=(), fill_value=np.nan):
 
-        out_base = self.np_params[arg_num].base
-        out_shape = extra_dimensions + self.np_params[arg_num].shape
+        out_base = self.np_arg_dtypes[arg_num].base
+        out_shape = extra_dimensions + self.np_arg_dtypes[arg_num].shape
 
         # fill the buffer with NaN poison
         return np.full(out_shape, fill_value, dtype=out_base)
diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py
index 5ead80a2731..786f2feb6bc 100644
--- a/psyneulink/core/llvm/execution.py
+++ b/psyneulink/core/llvm/execution.py
@@ -77,7 +77,7 @@
             struct_end = time.time()
 
             # numpy "frombuffer" creates a shared memory view of the provided buffer
-            numpy_struct = np.frombuffer(struct, dtype=self._bin_func.np_params[arg], count=1)
+            numpy_struct = np.frombuffer(struct, dtype=self._bin_func.np_arg_dtypes[arg], count=1)
 
             assert numpy_struct.nbytes == ctypes.sizeof(struct), \
                 "Size mismatch ({}), numpy: {} vs. ctypes:{}".format(name, numpy_struct.nbytes, ctypes.sizeof(struct))
@@ -275,8 +275,8 @@ def _state_struct(self):
         return self._get_compilation_param('_state', '_get_state_initializer', 1)
 
     def execute(self, variable):
-        new_variable = np.asfarray(variable, dtype=self._bin_func.np_params[2].base)
-        data_in = new_variable.reshape(self._bin_func.np_params[2].shape)
+        new_variable = np.asfarray(variable, dtype=self._bin_func.np_arg_dtypes[2].base)
+        data_in = new_variable.reshape(self._bin_func.np_arg_dtypes[2].shape)
 
         data_out = self._bin_func.np_buffer_for_arg(3)
 
@@ -286,7 +286,7 @@ def execute(self, variable):
 
     def cuda_execute(self, variable):
         # Create input argument, PyCUDA doesn't care about shape
-        data_in = np.asfarray(variable, dtype=self._bin_func.np_params[2].base)
+        data_in = np.asfarray(variable, dtype=self._bin_func.np_arg_dtypes[2].base)
         data_out = self._bin_func.np_buffer_for_arg(3)
 
         self._bin_func.cuda_call(self._cuda_param_struct,
@@ -368,7 +368,7 @@ def _conditions(self):
 
             conditions_initializer = gen.get_condition_initializer()
             ct_conditions = conditions_ctype(*conditions_initializer)
-            np_conditions = np.frombuffer(ct_conditions, dtype=self._bin_func.np_params[4], count=1)
+            np_conditions = np.frombuffer(ct_conditions, dtype=self._bin_func.np_arg_dtypes[4], count=1)
 
             np_conditions.shape = ()
 
@@ -444,8 +444,8 @@ def _get_input_struct(self, inputs):
         # Read provided input data and parse into an array (generator)
         data = self._composition._build_variable_for_input_CIM(inputs)
 
-        np_input = np.asarray(_tupleize(data), dtype=self._bin_func.np_params[2].base)
-        np_input = np_input.reshape(self._bin_func.np_params[2].shape)
+        np_input = np.asarray(_tupleize(data), dtype=self._bin_func.np_arg_dtypes[2].base)
+        np_input = np_input.reshape(self._bin_func.np_arg_dtypes[2].shape)
 
         if "stat" in self._debug_env:
             print("Input struct size:", _pretty_size(np_input.nbytes), "for", self._composition.name)
diff --git a/tests/llvm/test_builtins_intrinsics.py b/tests/llvm/test_builtins_intrinsics.py
index 22cc3d2df8d..5195fcee73b 100644
--- a/tests/llvm/test_builtins_intrinsics.py
+++ b/tests/llvm/test_builtins_intrinsics.py
@@ -52,7 +52,7 @@ def test_builtin_op(benchmark, op, args, builtin, result, func_mode):
 
         # The result argument is a pointer, use it to derive
        # the right argument type
-        dty = bin_f.np_params[1].base
+        dty = bin_f.np_arg_dtypes[1].base
 
         ptx_res = np.empty_like(result, dtype=dty)
         ptx_res_arg = pnlvm.jit_engine.pycuda.driver.Out(ptx_res)
diff --git a/tests/llvm/test_builtins_matrix.py b/tests/llvm/test_builtins_matrix.py
index 1cad00e1565..f2c50bf576f 100644
--- a/tests/llvm/test_builtins_matrix.py
+++ b/tests/llvm/test_builtins_matrix.py
@@ -64,9 +64,9 @@ def _get_const_dim_func(builtin, *dims):
 def test_matrix_op(benchmark, op, x, y, builtin, result, func_mode, dims):
 
     def _numpy_args(bin_f):
-        np_x = x.astype(bin_f.np_params[0])
-        np_y = bin_f.np_params[1].type(y) if np.isscalar(y) else y.astype(bin_f.np_params[1])
-        np_res = np.empty_like(result, dtype=bin_f.np_params[-1])
+        np_x = x.astype(bin_f.np_arg_dtypes[0])
+        np_y = bin_f.np_arg_dtypes[1].type(y) if np.isscalar(y) else y.astype(bin_f.np_arg_dtypes[1])
+        np_res = np.empty_like(result, dtype=bin_f.np_arg_dtypes[-1])
 
         return np_x, np_y, np_res
 
diff --git a/tests/llvm/test_builtins_mt_random.py b/tests/llvm/test_builtins_mt_random.py
index 2ff7cff0ea2..28082e2d7e8 100644
--- a/tests/llvm/test_builtins_mt_random.py
+++ b/tests/llvm/test_builtins_mt_random.py
@@ -196,8 +196,8 @@ def f():
         init_fun(state, SEED)
 
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial', numpy_args=(0, 1, 2, 3))
-        n = np.asarray(n, dtype=gen_fun.np_params[1])
-        p = np.asarray(p, dtype=gen_fun.np_params[2])
+        n = np.asarray(n, dtype=gen_fun.np_arg_dtypes[1])
+        p = np.asarray(p, dtype=gen_fun.np_arg_dtypes[2])
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -214,8 +214,8 @@ def f():
 
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial')
-        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.asarray(n, dtype=gen_fun.np_params[1]))
-        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.asarray(p, dtype=gen_fun.np_params[2]))
+        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.asarray(n, dtype=gen_fun.np_arg_dtypes[1]))
+        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.asarray(p, dtype=gen_fun.np_arg_dtypes[2]))
 
         out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
diff --git a/tests/llvm/test_builtins_philox_random.py b/tests/llvm/test_builtins_philox_random.py
index 0c6e289a700..2466ea4f6d2 100644
--- a/tests/llvm/test_builtins_philox_random.py
+++ b/tests/llvm/test_builtins_philox_random.py
@@ -327,8 +327,8 @@ def f():
         init_fun(state, SEED)
 
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial', numpy_args=(0, 1, 2, 3))
-        n = np.asarray(n, dtype=gen_fun.np_params[1])
-        p = np.asarray(p, dtype=gen_fun.np_params[2])
+        n = np.asarray(n, dtype=gen_fun.np_arg_dtypes[1])
+        p = np.asarray(p, dtype=gen_fun.np_arg_dtypes[2])
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -342,8 +342,8 @@ def f():
         init_fun.cuda_call(gpu_state, np.int64(SEED))
 
         gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial')
-        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.asarray(n, dtype=gen_fun.np_params[1]))
-        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.asarray(p, dtype=gen_fun.np_params[2]))
+        gpu_n = pnlvm.jit_engine.pycuda.driver.In(np.asarray(n, dtype=gen_fun.np_arg_dtypes[1]))
+        gpu_p = pnlvm.jit_engine.pycuda.driver.In(np.asarray(p, dtype=gen_fun.np_arg_dtypes[2]))
 
         out = gen_fun.np_buffer_for_arg(1)
         gpu_out = pnlvm.jit_engine.pycuda.driver.Out(out)
diff --git a/tests/llvm/test_builtins_vector.py b/tests/llvm/test_builtins_vector.py
index 999a7e42696..9a806bde911 100644
--- a/tests/llvm/test_builtins_vector.py
+++ b/tests/llvm/test_builtins_vector.py
@@ -29,8 +29,8 @@ def test_vector_op(benchmark, op, v, builtin, result, func_mode):
 
     def _numpy_args(bin_f):
-        np_u = u.astype(bin_f.np_params[0])
-        np_v = bin_f.np_params[1].type(v) if np.isscalar(v) else v.astype(bin_f.np_params[1])
+        np_u = u.astype(bin_f.np_arg_dtypes[0])
+        np_v = bin_f.np_arg_dtypes[1].type(v) if np.isscalar(v) else v.astype(bin_f.np_arg_dtypes[1])
         np_res = np.empty_like(np_u)
 
         return np_u, np_v, np_res
@@ -77,7 +77,7 @@ def ex():
 
     elif func_mode == 'LLVM':
         bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))
-        np_u = u.astype(bin_f.np_params[0])
+        np_u = u.astype(bin_f.np_arg_dtypes[0])
         np_res = bin_f.np_buffer_for_arg(2)
 
         ct_u = np_u.ctypes.data_as(bin_f.c_func.argtypes[0])
@@ -89,7 +89,7 @@
 
     elif func_mode == 'PTX':
         bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))
-        np_u = u.astype(bin_f.np_params[0])
+        np_u = u.astype(bin_f.np_arg_dtypes[0])
         np_res = bin_f.np_buffer_for_arg(2)
 
         cuda_u = pnlvm.jit_engine.pycuda.driver.In(np_u)
diff --git a/tests/llvm/test_compile.py b/tests/llvm/test_compile.py
index c396cba594f..71c8526e2bc 100644
--- a/tests/llvm/test_compile.py
+++ b/tests/llvm/test_compile.py
@@ -12,9 +12,9 @@ def test_recompile():
     # The original builtin mxv function
     bin_f = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
 
-    vector = np.random.rand(DIM_X).astype(bin_f.np_params[0].base)
-    matrix = np.random.rand(DIM_X, DIM_Y).astype(bin_f.np_params[1].base)
-    llvm_res = np.empty(DIM_Y, dtype=bin_f.np_params[4].base)
+    vector = np.random.rand(DIM_X).astype(bin_f.np_arg_dtypes[0].base)
+    matrix = np.random.rand(DIM_X, DIM_Y).astype(bin_f.np_arg_dtypes[1].base)
+    llvm_res = np.empty(DIM_Y, dtype=bin_f.np_arg_dtypes[4].base)
 
     x, y = matrix.shape
diff --git a/tests/llvm/test_helpers.py b/tests/llvm/test_helpers.py
index e692bd62f37..9f1c9bad29a 100644
--- a/tests/llvm/test_helpers.py
+++ b/tests/llvm/test_helpers.py
@@ -144,8 +144,8 @@ def test_helper_is_close(mode, var1, var2, rtol, atol, fp_type):
 
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
 
-    vec1 = np.atleast_1d(np.asfarray(var1, dtype=bin_f.np_params[0].base))
-    vec2 = np.atleast_1d(np.asfarray(var2, dtype=bin_f.np_params[1].base))
+    vec1 = np.atleast_1d(np.asfarray(var1, dtype=bin_f.np_arg_dtypes[0].base))
+    vec2 = np.atleast_1d(np.asfarray(var2, dtype=bin_f.np_arg_dtypes[1].base))
     assert len(vec1) == len(vec2)
 
     res = np.empty_like(vec2)
@@ -442,7 +442,7 @@ def test_helper_numerical(mode, op, var, expected, fp_type):
 
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0,))
 
-    res = np.asfarray(var, dtype=bin_f.np_params[0])
+    res = np.asfarray(var, dtype=bin_f.np_arg_dtypes[0])
 
     if mode == 'CPU':
         bin_f(res)
@@ -475,7 +475,7 @@ def test_helper_elementwise_op(mode, var, expected):
 
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1))
 
-    vec = np.asfarray(var, dtype=bin_f.np_params[0].base)
+    vec = np.asfarray(var, dtype=bin_f.np_arg_dtypes[0].base)
     res = bin_f.np_buffer_for_arg(1)
 
     if mode == 'CPU':
@@ -521,8 +521,8 @@ def test_helper_recursive_iterate_arrays(mode, var1, var2, expected):
 
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1, 2))
 
-    vec1 = np.asfarray(var1, dtype=bin_f.np_params[0].base)
-    vec2 = np.asfarray(var2, dtype=bin_f.np_params[0].base)
+    vec1 = np.asfarray(var1, dtype=bin_f.np_arg_dtypes[0].base)
+    vec2 = np.asfarray(var2, dtype=bin_f.np_arg_dtypes[0].base)
     res = bin_f.np_buffer_for_arg(1)
 
     if mode == 'CPU':
         bin_f(vec1, vec2, res)
@@ -558,7 +558,7 @@ def test_helper_convert_fp_type(t1, t2, mode, val):
 
     bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1))
 
     # Get the argument numpy dtype
-    np_dt1, np_dt2 = (np.dtype(bin_f.np_params[i]) for i in (0, 1))
+    np_dt1, np_dt2 = (np.dtype(bin_f.np_arg_dtypes[i]) for i in (0, 1))
 
     # instantiate value, result and reference
     x = np.asfarray(val, dtype=np_dt1)

From 45d8ccd2469358e489f5e9bd4e426250e0f2ebc1 Mon Sep 17 00:00:00 2001
From: Jan Vesely
Date: Tue, 6 Aug 2024 11:34:38 -0400
Subject: [PATCH 10/10] llvm: Use NumPy ndpointer by default

The ctype_ptr_args argument can be used to force the use of ctypes
pointers for dynamically sized arguments.

Signed-off-by: Jan Vesely
---
 .../nonstateful/optimizationfunctions.py  |  2 +-
 psyneulink/core/llvm/__init__.py          | 20 ++++++------
 psyneulink/core/llvm/execution.py         | 16 +++++-----
 tests/llvm/test_builtins_matrix.py        |  2 +-
 tests/llvm/test_builtins_mt_random.py     | 16 +++++-----
 tests/llvm/test_builtins_philox_random.py | 24 +++++++-------
 tests/llvm/test_builtins_vector.py        |  6 ++--
 tests/llvm/test_compile.py                | 10 +++---
 tests/llvm/test_custom_func.py            | 10 +++---
 tests/llvm/test_helpers.py                | 31 +++++++++-----------
 10 files changed, 66 insertions(+), 71 deletions(-)
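A note on the resulting calling convention (a minimal sketch, not part of
the patch itself; it mirrors tests/llvm/test_builtins_vector.py and assumes
__pnl_builtin_vec_sum takes an input pointer, an element count, and an
output pointer):

    import numpy as np
    import psyneulink.core.llvm as pnlvm

    # The input vector is dynamically sized (its length is the second
    # argument), so it is forced back to a raw ctypes pointer; the scalar
    # output keeps the new default ndpointer treatment and accepts a plain
    # numpy buffer.
    bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", ctype_ptr_args=(0,))

    u = np.random.rand(8).astype(bin_f.np_arg_dtypes[0].base)
    res = bin_f.np_buffer_for_arg(2)

    ct_u = u.ctypes.data_as(bin_f.c_func.argtypes[0])
    bin_f(ct_u, np.uint32(len(u)), res)

Statically sized pointer arguments get an ndpointer with a fixed dtype and
shape, so ctypes can validate numpy buffers at call time; only dynamically
sized buffers need the ctypes escape hatch.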
diff --git a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py
index dfdce982a52..bc4d323c606 100644
--- a/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py
+++ b/psyneulink/core/components/functions/nonstateful/optimizationfunctions.py
@@ -2103,7 +2103,7 @@ def _function(self,
         # select_min params are:
         # params, state, min_sample_ptr, sample_ptr, min_value_ptr, value_ptr, opt_count_ptr, count
         min_tags = frozenset({"select_min", "evaluate_type_objective"})
-        bin_func = pnlvm.LLVMBinaryFunction.from_obj(self, tags=min_tags, numpy_args=(2, 4, 6))
+        bin_func = pnlvm.LLVMBinaryFunction.from_obj(self, tags=min_tags, ctype_ptr_args=(0, 1, 3, 5))
 
         ct_param = bin_func.byref_arg_types[0](*self._get_param_initializer(context))
         ct_state = bin_func.byref_arg_types[1](*self._get_state_initializer(context))
diff --git a/psyneulink/core/llvm/__init__.py b/psyneulink/core/llvm/__init__.py
index 7976f1505ed..5a9788102f4 100644
--- a/psyneulink/core/llvm/__init__.py
+++ b/psyneulink/core/llvm/__init__.py
@@ -123,7 +123,7 @@ def _llvm_build(target_generation=_binary_generation + 1):
 
 class LLVMBinaryFunction:
-    def __init__(self, name: str, *, numpy_args=()):
+    def __init__(self, name: str, *, ctype_ptr_args=()):
         self.name = name
 
         self.__c_func = None
@@ -143,16 +143,18 @@ def __init__(self, name: str, *, numpy_args=()):
         # Create ctype function instance
         start = time.perf_counter()
         return_type = _convert_llvm_ir_to_ctype(f.return_value.type)
+
+        self.np_arg_dtypes = [_convert_llvm_ir_to_dtype(getattr(a.type, "pointee", a.type)) for a in f.args]
+
         args = [_convert_llvm_ir_to_ctype(a.type) for a in f.args]
 
         # '_type_' special attribute stores pointee type for pointers
         # https://docs.python.org/3/library/ctypes.html#ctypes._Pointer._type_
         self.byref_arg_types = [a._type_ if hasattr(a, "contents") else None for a in args]
-        self.np_arg_dtypes = [_convert_llvm_ir_to_dtype(getattr(a.type, "pointee", a.type)) for a in f.args]
 
-        for a in numpy_args:
-            assert self.byref_arg_types[a] is not None
-            args[a] = np.ctypeslib.ndpointer(dtype=self.np_arg_dtypes[a].base, shape=self.np_arg_dtypes[a].shape)
+        for i, arg in enumerate(self.np_arg_dtypes):
+            if i not in ctype_ptr_args and self.byref_arg_types[i] is not None:
+                args[i] = np.ctypeslib.ndpointer(dtype=arg.base, shape=arg.shape)
 
         middle = time.perf_counter()
         self.__c_func_type = ctypes.CFUNCTYPE(return_type, *args)
@@ -231,14 +233,14 @@ def np_buffer_for_arg(self, arg_num, *, extra_dimensions=(), fill_value=np.nan):
 
     @staticmethod
     @functools.lru_cache(maxsize=32)
-    def from_obj(obj, *, tags:frozenset=frozenset(), numpy_args:tuple=()):
+    def from_obj(obj, *, tags:frozenset=frozenset(), ctype_ptr_args:tuple=()):
         name = LLVMBuilderContext.get_current().gen_llvm_function(obj, tags=tags).name
-        return LLVMBinaryFunction.get(name, numpy_args=numpy_args)
+        return LLVMBinaryFunction.get(name, ctype_ptr_args=ctype_ptr_args)
 
     @staticmethod
     @functools.lru_cache(maxsize=32)
-    def get(name: str, *, numpy_args:tuple=()):
-        return LLVMBinaryFunction(name, numpy_args=numpy_args)
+    def get(name: str, *, ctype_ptr_args:tuple=()):
+        return LLVMBinaryFunction(name, ctype_ptr_args=ctype_ptr_args)
 
 
 _cpu_engine = None
diff --git a/psyneulink/core/llvm/execution.py b/psyneulink/core/llvm/execution.py
index 786f2feb6bc..c49c801f0b0 100644
--- a/psyneulink/core/llvm/execution.py
+++ b/psyneulink/core/llvm/execution.py
@@ -258,7 +258,7 @@ class FuncExecution(CUDAExecution):
 
     def __init__(self, component, execution_id=None, *, tags=frozenset()):
         super().__init__()
-        self._bin_func = pnlvm.LLVMBinaryFunction.from_obj(component, tags=tags, numpy_args=(0, 1, 2, 3))
+        self._bin_func = pnlvm.LLVMBinaryFunction.from_obj(component, tags=tags)
         self._execution_context = Context(execution_id=execution_id)
         self._component = component
 
@@ -355,9 +355,7 @@ def _set_bin_node(self, node):
         assert node in self._composition._all_nodes
         node_assembly = builder_context.LLVMBuilderContext.get_current().get_node_assembly(self._composition, node)
-        self.__bin_func = pnlvm.LLVMBinaryFunction.from_obj(node_assembly,
-                                                            tags=self.__tags.union({"node_assembly"}),
-                                                            numpy_args=(0, 1, 2, 3, 4))
+        self.__bin_func = pnlvm.LLVMBinaryFunction.from_obj(node_assembly, tags=self.__tags.union({"node_assembly"}))
 
     @property
     def _conditions(self):
@@ -503,8 +501,7 @@ def execute_node(self, node, inputs=None):
     @property
     def _bin_exec_func(self):
         if self.__bin_exec_func is None:
-            self.__bin_exec_func = pnlvm.LLVMBinaryFunction.from_obj(
-                self._composition, tags=self.__tags, numpy_args=(0, 1, 2, 3, 4))
+            self.__bin_exec_func = pnlvm.LLVMBinaryFunction.from_obj(self._composition, tags=self.__tags)
 
         return self.__bin_exec_func
 
@@ -558,8 +555,9 @@ def _get_generator_run_input_struct(self, inputs, runs):
     @property
     def _bin_run_func(self):
         if self.__bin_run_func is None:
-            self.__bin_run_func = pnlvm.LLVMBinaryFunction.from_obj(
-                self._composition, tags=self.__tags.union({"run"}), numpy_args=(0, 1, 2, 5, 6))
+            self.__bin_run_func = pnlvm.LLVMBinaryFunction.from_obj(self._composition,
+                                                                    tags=self.__tags.union({"run"}),
+                                                                    ctype_ptr_args=(3, 4))
 
         return self.__bin_run_func
 
@@ -620,7 +618,7 @@ def _prepare_evaluate(self, inputs, num_input_sets, num_evaluations, all_results):
         eval_type = "evaluate_type_all_results" if all_results else "evaluate_type_objective"
         tags = {"evaluate", "alloc_range", eval_type}
 
-        bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags), numpy_args=(0, 1, 6, 7))
+        bin_func = pnlvm.LLVMBinaryFunction.from_obj(ocm, tags=frozenset(tags), ctype_ptr_args=(4, 5))
         self.__bin_func = bin_func
 
         # There are 8 arguments to evaluate_alloc_range:
diff --git a/tests/llvm/test_builtins_matrix.py b/tests/llvm/test_builtins_matrix.py
index f2c50bf576f..9280eb0db98 100644
--- a/tests/llvm/test_builtins_matrix.py
+++ b/tests/llvm/test_builtins_matrix.py
@@ -80,7 +80,7 @@ def ex():
         else:
             func_name = builtin
 
-        bin_f = pnlvm.LLVMBinaryFunction.get(func_name)
+        bin_f = pnlvm.LLVMBinaryFunction.get(func_name, ctype_ptr_args=(0, 1, 2, 3, 4))
         lx, ly, lres = _numpy_args(bin_f)
 
         ct_x = lx.ctypes.data_as(bin_f.c_func.argtypes[0])
diff --git a/tests/llvm/test_builtins_mt_random.py b/tests/llvm/test_builtins_mt_random.py
index 28082e2d7e8..d8c0f51d1ce 100644
--- a/tests/llvm/test_builtins_mt_random.py
+++ b/tests/llvm/test_builtins_mt_random.py
@@ -27,12 +27,12 @@ def f():
             return state.randint(0xffffffff, dtype=np.int64)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_int32', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_int32')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -84,11 +84,11 @@ def f():
             return state.random_sample()
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_double', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_double')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -133,11 +133,11 @@ def f():
             return state.normal()
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_normal', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_normal')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -191,11 +191,11 @@ def f():
             return state.binomial(n, p)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial', numpy_args=(0, 1, 2, 3))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_mt_rand_binomial')
         n = np.asarray(n, dtype=gen_fun.np_arg_dtypes[1])
         p = np.asarray(p, dtype=gen_fun.np_arg_dtypes[2])
diff --git a/tests/llvm/test_builtins_philox_random.py b/tests/llvm/test_builtins_philox_random.py
index 2466ea4f6d2..af9f4228d71 100644
--- a/tests/llvm/test_builtins_philox_random.py
+++ b/tests/llvm/test_builtins_philox_random.py
@@ -26,11 +26,11 @@ def f():
             return prng.integers(0xffffffffffffffff, dtype=np.uint64, endpoint=True)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, seed)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int64', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int64')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -76,11 +76,11 @@ def f():
             return prng.integers(0xffffffff, dtype=np.uint32, endpoint=True)
 
    elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int32', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_int32')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -124,11 +124,11 @@ def f():
             return prng.random(dtype=np.float64)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_double', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_double')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -171,11 +171,11 @@ def f():
             return prng.random(dtype=np.float32)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_float', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_float')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -224,11 +224,11 @@ def f():
             return prng.standard_normal(dtype=dtype)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_normal', numpy_args=(0, 1))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_normal')
 
         def f():
             out = gen_fun.np_buffer_for_arg(1)
@@ -322,11 +322,11 @@ def f():
             return prng.binomial(n, p)
 
     elif mode == 'LLVM':
-        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init', numpy_args=(0,))
+        init_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_init')
         state = init_fun.np_buffer_for_arg(0)
         init_fun(state, SEED)
 
-        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial', numpy_args=(0, 1, 2, 3))
+        gen_fun = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_philox_rand_binomial')
         n = np.asarray(n, dtype=gen_fun.np_arg_dtypes[1])
         p = np.asarray(p, dtype=gen_fun.np_arg_dtypes[2])
diff --git a/tests/llvm/test_builtins_vector.py b/tests/llvm/test_builtins_vector.py
index 9a806bde911..70ced0e8864 100644
--- a/tests/llvm/test_builtins_vector.py
+++ b/tests/llvm/test_builtins_vector.py
@@ -40,7 +40,7 @@ def ex():
             return op(u, v)
 
     elif func_mode == 'LLVM':
-        bin_f = pnlvm.LLVMBinaryFunction.get(builtin)
+        bin_f = pnlvm.LLVMBinaryFunction.get(builtin, ctype_ptr_args=(0, 1, 3))
         lu, lv, lres = _numpy_args(bin_f)
 
         ct_u = lu.ctypes.data_as(bin_f.c_func.argtypes[0])
@@ -75,7 +75,7 @@ def ex():
             return np.sum(u)
 
     elif func_mode == 'LLVM':
-        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))
+        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", ctype_ptr_args=(0,))
 
         np_u = u.astype(bin_f.np_arg_dtypes[0])
         np_res = bin_f.np_buffer_for_arg(2)
@@ -87,7 +87,7 @@
 
     elif func_mode == 'PTX':
-        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum", numpy_args=(2,))
+        bin_f = pnlvm.LLVMBinaryFunction.get("__pnl_builtin_vec_sum")
 
         np_u = u.astype(bin_f.np_arg_dtypes[0])
         np_res = bin_f.np_buffer_for_arg(2)
 
         cuda_u = pnlvm.jit_engine.pycuda.driver.In(np_u)
diff --git a/tests/llvm/test_compile.py b/tests/llvm/test_compile.py
index 71c8526e2bc..4a1cff96317 100644
--- a/tests/llvm/test_compile.py
+++ b/tests/llvm/test_compile.py
@@ -10,7 +10,7 @@
 @pytest.mark.llvm
 def test_recompile():
     # The original builtin mxv function
-    bin_f = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
+    bin_f = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm', ctype_ptr_args=(0, 1, 4))
 
     vector = np.random.rand(DIM_X).astype(bin_f.np_arg_dtypes[0].base)
     matrix = np.random.rand(DIM_X, DIM_Y).astype(bin_f.np_arg_dtypes[1].base)
@@ -24,7 +24,7 @@ def test_recompile():
     orig_res = np.empty_like(llvm_res)
     ct_res = orig_res.ctypes.data_as(bin_f.c_func.argtypes[4])
 
-    bin_f.c_func(ct_vec, ct_mat, x, y, ct_res)
+    bin_f(ct_vec, ct_mat, x, y, ct_res)
 
     # Rebuild and try again
     # This is not a public API
@@ -33,15 +33,15 @@ def test_recompile():
     rebuild_res = np.empty_like(llvm_res)
     ct_res = rebuild_res.ctypes.data_as(bin_f.c_func.argtypes[4])
 
-    bin_f.c_func(ct_vec, ct_mat, x, y, ct_res)
+    bin_f(ct_vec, ct_mat, x, y, ct_res)
 
     assert np.array_equal(orig_res, rebuild_res)
 
     # Get a new pointer
-    bin_f2 = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm')
+    bin_f2 = pnlvm.LLVMBinaryFunction.get('__pnl_builtin_vxm', ctype_ptr_args=(0, 1, 4))
 
     new_res = np.empty_like(llvm_res)
     ct_res = new_res.ctypes.data_as(bin_f2.c_func.argtypes[4])
 
-    bin_f2.c_func(ct_vec, ct_mat, x, y, ct_res)
+    bin_f2(ct_vec, ct_mat, x, y, ct_res)
 
     assert np.array_equal(rebuild_res, new_res)
 
     callable_res = np.empty_like(llvm_res)
diff --git a/tests/llvm/test_custom_func.py b/tests/llvm/test_custom_func.py
index d15e65146ce..87936eb54e0 100644
--- a/tests/llvm/test_custom_func.py
+++ b/tests/llvm/test_custom_func.py
@@ -35,14 +35,12 @@ def test_integer_broadcast(mode, val):
         builder.ret_void()
 
     binf = pnlvm.LLVMBinaryFunction.get(custom_name)
-    res = np.zeros(8, dtype=val.dtype)
+    val = np.asarray(val)
+    res = binf.np_buffer_for_arg(1)
     if mode == 'CPU':
-        ct_res = np.ctypeslib.as_ctypes(res)
-        ct_in = np.ctypeslib.as_ctypes(val)
-
-        binf(ctypes.byref(ct_in), ctypes.byref(ct_res))
+        binf(val, res)
     else:
-        binf.cuda_wrap_call(np.asarray(val), res)
+        binf.cuda_wrap_call(val, res)
 
     assert all(res == np.broadcast_to(val + 1, 8))
diff --git a/tests/llvm/test_helpers.py b/tests/llvm/test_helpers.py
index 9f1c9bad29a..00696744eb1 100644
--- a/tests/llvm/test_helpers.py
+++ b/tests/llvm/test_helpers.py
@@ -45,7 +45,7 @@ def test_helper_fclamp(mode):
     ref = np.clip(VECTOR, TST_MIN, TST_MAX)
     bounds = np.asfarray([TST_MIN, TST_MAX])
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, ctype_ptr_args=(0, 2))
     local_vec = VECTOR.copy()
 
     if mode == 'CPU':
@@ -86,7 +86,7 @@ def test_helper_fclamp_const(mode):
     local_vec = VECTOR.copy()
     ref = np.clip(VECTOR, TST_MIN, TST_MAX)
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, ctype_ptr_args=(0,))
 
     if mode == 'CPU':
         ct_vec = local_vec.ctypes.data_as(bin_f.c_func.argtypes[0])
@@ -118,8 +118,7 @@ def test_helper_is_close(mode, var1, var2, rtol, atol, fp_type):
 
     with pnlvm.LLVMBuilderContext.get_current() as ctx:
         float_ptr_ty = ctx.float_ty.as_pointer()
-        func_ty = ir.FunctionType(ir.VoidType(), [float_ptr_ty, float_ptr_ty,
-                                                  float_ptr_ty, ctx.int32_ty])
+        func_ty = ir.FunctionType(ir.VoidType(), [float_ptr_ty, float_ptr_ty, float_ptr_ty, ctx.int32_ty])
 
         custom_name = ctx.get_unique_name("is_close")
         function = ir.Function(ctx.module, func_ty, name=custom_name)
@@ -135,14 +134,12 @@
             val2 = b1.load(val2_ptr)
             close = pnlvm.helpers.is_close(ctx, b1, val1, val2, **tolerance)
             out_ptr = b1.gep(out, [index])
-            out_val = b1.select(close, val1.type(1), val1.type(0))
-            res = b1.select(close, out_ptr.type.pointee(1),
-                            out_ptr.type.pointee(0))
+            out_val = b1.select(close, out_ptr.type.pointee(1), out_ptr.type.pointee(0))
             b1.store(out_val, out_ptr)
 
         builder.ret_void()
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, ctype_ptr_args=(0, 1, 2))
 
     vec1 = np.atleast_1d(np.asfarray(var1, dtype=bin_f.np_arg_dtypes[0].base))
     vec2 = np.atleast_1d(np.asfarray(var2, dtype=bin_f.np_arg_dtypes[1].base))
@@ -200,7 +197,7 @@ def test_helper_all_close(mode, var1, var2, atol, rtol):
     ref = np.allclose(vec1, vec2, **tolerance)
     res = np.array(5, dtype=np.uint32)
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1, 2))
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
 
     if mode == 'CPU':
         bin_f(vec1, vec2, res)
@@ -440,7 +437,7 @@ def test_helper_numerical(mode, op, var, expected, fp_type):
 
     builder.ret_void()
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0,))
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
 
     res = np.asfarray(var, dtype=bin_f.np_arg_dtypes[0])
 
@@ -473,7 +470,7 @@ def test_helper_elementwise_op(mode, var, expected):
                                          lambda ctx, builder, x: builder.fadd(x.type(1.0), x), out)
     builder.ret_void()
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1))
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
 
     vec = np.asfarray(var, dtype=bin_f.np_arg_dtypes[0].base)
     res = bin_f.np_buffer_for_arg(1)
@@ -519,11 +516,11 @@ def test_helper_recursive_iterate_arrays(mode, var1, var2, expected):
 
     builder.ret_void()
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1, 2))
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
 
     vec1 = np.asfarray(var1, dtype=bin_f.np_arg_dtypes[0].base)
-    vec2 = np.asfarray(var2, dtype=bin_f.np_arg_dtypes[0].base)
-    res = bin_f.np_buffer_for_arg(1)
+    vec2 = np.asfarray(var2, dtype=bin_f.np_arg_dtypes[1].base)
+    res = bin_f.np_buffer_for_arg(2)
 
     if mode == 'CPU':
         bin_f(vec1, vec2, res)
@@ -555,14 +552,14 @@ def test_helper_convert_fp_type(t1, t2, mode, val):
         builder.store(conv_x, y)
         builder.ret_void()
 
-    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name, numpy_args=(0, 1))
+    bin_f = pnlvm.LLVMBinaryFunction.get(custom_name)
 
     # Get the argument numpy dtype
     np_dt1, np_dt2 = (np.dtype(bin_f.np_arg_dtypes[i]) for i in (0, 1))
 
     # instantiate value, result and reference
-    x = np.asfarray(val, dtype=np_dt1)
-    y = np.asfarray(0, dtype=np_dt2)
+    x = np.asfarray(val, dtype=bin_f.np_arg_dtypes[0])
+    y = bin_f.np_buffer_for_arg(1)
     ref = x.astype(np_dt2)
 
     if mode == 'CPU':