diff --git a/GPy_ABCD/Kernels/linearOffsetKernel.py b/GPy_ABCD/Kernels/linearOffsetKernel.py
index 752daa0..7eece4e 100644
--- a/GPy_ABCD/Kernels/linearOffsetKernel.py
+++ b/GPy_ABCD/Kernels/linearOffsetKernel.py
@@ -4,9 +4,11 @@
 from paramz.transformations import Logexp
 from paramz.caching import Cache_this
 
+from GPy_ABCD.Util.numba_types import *
+
 
 class LinearWithOffset(Kern):
-    """
+    '''
     Linear kernel with horizontal offset
 
     .. math::
@@ -25,7 +27,7 @@ class LinearWithOffset(Kern):
     :param name: Name of the kernel for output
     :type String
     :rtype: Kernel object
-    """
+    '''
 
     def __init__(self, input_dim: int, variance: float = 1., offset: float = 0., active_dims: int = None, name: str = 'linear_with_offset') -> None:
         super(LinearWithOffset, self).__init__(input_dim, active_dims, name)
@@ -40,30 +42,27 @@ def __init__(self, input_dim: int, variance: float = 1., offset: float = 0., act
 
         self.link_parameters(self.variance, self.offset)
 
-
     def to_dict(self):
         input_dict = super(LinearWithOffset, self)._save_to_input_dict()
-        input_dict["class"] = "LinearWithOffset"
-        input_dict["variance"] = self.variance.values.tolist()
-        input_dict["offset"] = self.offset
+        input_dict['class'] = 'LinearWithOffset'
+        input_dict['variance'] = self.variance.values.tolist()
+        input_dict['offset'] = self.offset
         return input_dict
 
-
     @staticmethod
     def _build_from_input_dict(kernel_class, input_dict):
         useGPU = input_dict.pop('useGPU', None)
         return LinearWithOffset(**input_dict)
 
-
-    @Cache_this(limit=3)
-    def K(self, X, X2=None):
+    @Cache_this(limit = 3)
+    def K(self, X, X2 = None):
         if X2 is None: X2 = X
         return self.variance * (X - self.offset) * (X2 - self.offset).T
-
+        # return _K(self.variance, self.offset, X, X if X2 is None else X2)
 
     def Kdiag(self, X):
-        return np.sum(self.variance * np.square(X - self.offset), -1)
-
+        # return np.sum(self.variance * np.square(X - self.offset), -1)
+        return _Kdiag(self.variance, self.offset, X)
 
     def update_gradients_full(self, dL_dK, X, X2=None):
         if X2 is None: X2 = X
@@ -71,4 +70,27 @@ def update_gradients_full(self, dL_dK, X, X2=None):
         dK_do = self.variance * (2 * self.offset - X - X2)
 
         self.variance.gradient = np.sum(dL_dK * dK_dV)
-        self.offset.gradient = np.sum(dL_dK * dK_do)
+        self.offset.gradient = np.sum(dL_dK * dK_do)
+        # self.variance.gradient, self.offset.gradient = _update_gradients_full(self.offset, self.variance, dL_dK, X, X if X2 is None else X2)
+
+
+
+## numba versions of some methods
+
+# from GPy_ABCD.Util.benchmarking import numba_comparisons
+
+# # @njit(f8A2(f8A, f8A, f8A2, f8A2)) # The numpy version is faster
+# def _K(variance, offset, X, X2):
+#     return variance * (X - offset) * (X2 - offset).T
+
+@njit(f8A(f8A, f8A, f8A2)) # 2x faster than numpy version
+def _Kdiag(variance, offset, X):
+    return np.sum(variance * np.square(X - offset), -1)
+
+# # @njit(nTup(f8, f8)(f8A, f8A, f8A2, f8A2, f8A2)) # The numpy version is faster
+# def _update_gradients_full(offset, variance, dL_dK, X, X2):
+#     dK_dV = (X - offset) * (X2 - offset).T
+#     dK_do = variance * (2 * offset - X - X2)
+#     return np.sum(dL_dK * dK_dV), np.sum(dL_dK * dK_do) # variance.gradient and offset.gradient
+
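
A quick sanity check (not part of the diff) that the new numba _Kdiag matches the numpy expression it replaces; the (n, 1) shape of X follows GPy's convention, and the 1-element parameter arrays mirror the f8A signature:

    import numpy as np
    from GPy_ABCD.Kernels.linearOffsetKernel import _Kdiag

    variance, offset = np.array([2.0]), np.array([1.5])
    X = np.random.randn(100, 1)
    assert np.allclose(_Kdiag(variance, offset, X), np.sum(variance * np.square(X - offset), -1))
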
diff --git a/GPy_ABCD/Kernels/periodicKernel.py b/GPy_ABCD/Kernels/periodicKernel.py
index c81be21..8a4e80f 100644
--- a/GPy_ABCD/Kernels/periodicKernel.py
+++ b/GPy_ABCD/Kernels/periodicKernel.py
@@ -5,10 +5,30 @@
 from paramz.transformations import Logexp
 from paramz.caching import Cache_this
 
+from GPy_ABCD.Util.numba_types import *
+from GPy_ABCD.Util.benchmarking import compare_implementations, numba_comparisons
+
+
+@njit(f8A(f8A))
+def _embi0(x): # == exp(-x) * besseli(0, x) => 9.8.2 Abramowitz & Stegun (http://people.math.sfu.ca/~cbm/aands/page_378.htm)
+    y = 3.75 / x # The below is an efficient polynomial computation
+    f = 0.39894228 + (0.01328592 + (0.00225319 + (-0.00157565 + (0.00916281 + (-0.02057706 + (0.02635537 + (-0.01647633 + (0.00392377)*y)*y)*y)*y)*y)*y)*y)*y
+    return f / np.sqrt(x)
+@njit(f8A(f8A))
+def _embi1(x): # == exp(-x) * besseli(1, x) => 9.8.4 Abramowitz & Stegun (http://people.math.sfu.ca/~cbm/aands/page_378.htm)
+    y = 3.75 / x # The below is an efficient polynomial computation
+    f = 0.39894228 + (-0.03988024 + (-0.00362018 + (0.00163801 + (-0.01031555 + (0.02282967 + (-0.02895312 + (0.01787654 + (-0.00420059)*y)*y)*y)*y)*y)*y)*y)*y
+    return f / np.sqrt(x)
+@njit(f8A(f8A))
+def _embi0min1(x): # == embi0(x) - embi1(x)
+    y = 3.75 / x # The below is an efficient polynomial computation (with the 0-th power term equal to 0)
+    f = (0.05316616 + (0.00587337 + (-0.00321366 + (0.01947836 + (-0.04340673 + (0.05530849 + (-0.03435287 + (0.00812436)*y)*y)*y)*y)*y)*y)*y)*y
+    return f / np.sqrt(x)
+
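
A hypothetical check (not part of the diff): these polynomials approximate scipy's exponentially scaled Bessel functions, and x >= 3.75 is the only regime in which K and update_gradients_full call them (the Abramowitz & Stegun approximations hold there with absolute error below about 1e-6):

    import numpy as np
    from scipy.special import i0e, i1e
    from GPy_ABCD.Kernels.periodicKernel import _embi0, _embi1

    x = np.linspace(3.75, 100, 500)
    assert np.allclose(_embi0(x), i0e(x), atol = 1e-6)
    assert np.allclose(_embi1(x), i1e(x), atol = 1e-6)
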
 
 # Based on GPy's StdPeriodic kernel and gaussianprocess.org/gpml/code/matlab/cov/covPeriodicNoDC.m
 class PureStdPeriodicKernel(Kern):
-    """
+    '''
     The standard periodic kernel due to MacKay (1998) can be decomposed into a sum of a periodic and a constant component;
     this kernel is the purely periodic component, as mentioned in:
     Lloyd, James Robert; Duvenaud, David Kristjanson; Grosse, Roger Baker; Tenenbaum, Joshua B.; Ghahramani, Zoubin (2014):
@@ -37,7 +57,7 @@ class PureStdPeriodicKernel(Kern):
     :param name: Name of the kernel for output
     :type String
     :rtype: Kernel object
-    """
+    '''
 
     def __init__(self, input_dim: int, variance: float = 1., period: float = 2.*np.pi, lengthscale: float = 2.*np.pi, active_dims: int = None, name: str = 'pure_std_periodic') -> None:
@@ -47,17 +67,17 @@ def __init__(self, input_dim: int, variance: float = 1., period: float = 2.*np.p
 
         if period is not None:
             period = np.asarray(period)
-            assert period.size == input_dim, "bad number of periods"
+            assert period.size == input_dim, 'bad number of periods'
         else:
             period = 2.*np.pi * np.ones(input_dim)
         if lengthscale is not None:
             lengthscale = np.asarray(lengthscale)
-            assert lengthscale.size == input_dim, "bad number of lengthscales"
+            assert lengthscale.size == input_dim, 'bad number of lengthscales'
         else:
             lengthscale = 2.*np.pi * np.ones(input_dim)
 
         self.variance = Param('variance', variance, Logexp())
-        assert self.variance.size == 1, "Variance size must be one"
+        assert self.variance.size == 1, 'Variance size must be one'
         self.period = Param('period', period, Logexp())
         self.lengthscale = Param('lengthscale', lengthscale, Logexp())
@@ -65,30 +85,14 @@ def __init__(self, input_dim: int, variance: float = 1., period: float = 2.*np.p
 
     def to_dict(self):
         input_dict = super(PureStdPeriodicKernel, self)._save_to_input_dict()
-        input_dict["class"] = "BesselShiftedPeriodic"
-        input_dict["variance"] = self.variance.values.tolist()
-        input_dict["period"] = self.period.values.tolist()
-        input_dict["lengthscale"] = self.lengthscale.values.tolist()
+        input_dict['class'] = 'BesselShiftedPeriodic'
+        input_dict['variance'] = self.variance.values.tolist()
+        input_dict['period'] = self.period.values.tolist()
+        input_dict['lengthscale'] = self.lengthscale.values.tolist()
         return input_dict
 
-    @staticmethod
-    def embi0(x): # == exp(-x) * besseli(0, x) => 9.8.2 Abramowitz & Stegun (http://people.math.sfu.ca/~cbm/aands/page_378.htm)
-        y = 3.75 / x # The below is an efficient polynomial computation
-        f = 0.39894228 + (0.01328592 + (0.00225319 + (-0.00157565 + (0.00916281 + (-0.02057706 + (0.02635537 + (-0.01647633 + (0.00392377)*y)*y)*y)*y)*y)*y)*y)*y
-        return f / np.sqrt(x)
-    @staticmethod
-    def embi1(x): # == exp(-x) * besseli(1, x) => 9.8.4 Abramowitz & Stegun (http://people.math.sfu.ca/~cbm/aands/page_378.htm)
-        y = 3.75 / x # The below is an efficient polynomial computation
-        f = 0.39894228 + (-0.03988024 + (-0.00362018 + (0.00163801 + (-0.01031555 + (0.02282967 + (-0.02895312 + (0.01787654 + (-0.00420059)*y)*y)*y)*y)*y)*y)*y)*y
-        return f / np.sqrt(x)
-    @staticmethod
-    def embi0min1(x): # == embi0(x) - embi1(x)
-        y = 3.75 / x # The below is an efficient polynomial computation (with the 0-th power term equal to 0)
-        f = (0.05316616 + (0.00587337 + (-0.00321366 + (0.01947836 + (-0.04340673 + (0.05530849 + (-0.03435287 + (0.00812436)*y)*y)*y)*y)*y)*y)*y)*y
-        return f / np.sqrt(x)
-
     @Cache_this(limit = 3)
-    def K(self, X, X2=None):
+    def K(self, X, X2 = None):
         if X2 is None: X2 = X
         cos_term = np.cos((2 * np.pi / self.period) * (X - X2.T))
         invL2 = 1 / self.lengthscale ** 2
@@ -101,21 +105,11 @@ def K(self, X, X2=None):
             return self.variance * ((exp_term - bessel0) / (np.exp(invL2) - bessel0)) # The brackets prevent an overflow; want division first
         else:
             exp_term = np.exp((cos_term - 1) * invL2)
-            embi0 = self.embi0(invL2)
-            return self.variance * ((exp_term - embi0) / (1 - embi0)) # The brackets prevent an overflow; want division first
-    # @Cache_this(limit = 3)
-    # def K(self, X, X2 = None):
-    #     if X2 is None: X2 = X
-    #     cos_term = np.cos(2 * np.pi * (X - X2.T) / self.period)
-    #
-    #     if np.any(self.lengthscale > 1e4): # Limit for l -> infinity
-    #         return self.variance * cos_term
-    #     else:
-    #         # Overflow on exp and i0 by going backwards from sys.float_info.max (1.7976931348623157e+308): 1/l^2 < 709.782712893384
-    #         invL2 = np.clip(1 / np.where(self.lengthscale < 1e-100, 1e-200, self.lengthscale ** 2), 0, 705)
-    #         exp_term = np.exp(cos_term * invL2)
-    #         bessel0 = i0(invL2)
-    #         return self.variance * ((exp_term - bessel0) / (np.exp(invL2) - bessel0)) # The brackets prevent an overflow; want division first
+            embi0 = _embi0(invL2)
+            return self.variance * ((exp_term - embi0) / (1 - embi0)) # The brackets prevent an overflow; want division first
+        # invL2 = 1 / self.lengthscale ** 2
+        # bessel0 = i0(invL2)
+        # return _K(self.variance, self.period, self.lengthscale, invL2, bessel0, X, X if X2 is None else X2)
 
     def Kdiag(self, X): # Correct; nice and fast
         ret = np.empty(X.shape[0])
@@ -135,7 +129,8 @@ def update_gradients_full(self, dL_dK, X, X2 = None):
             dK_dp = (self.variance / self.period) * trig_arg * sin_term
 
             # This is 0 in the limit, but best to set it to a small non-0 value
-            dK_dl = 1e-4 / self.lengthscale
+            dK_dl = np.empty_like(dL_dK)
+            dK_dl[:, :] = 1e-4 / self.lengthscale[0]
         elif np.any(invL2 < 3.75):
             bessel0 = i0(invL2)
             bessel1 = i1(invL2)
@@ -153,10 +148,10 @@ def update_gradients_full(self, dL_dK, X, X2 = None):
             dK_dl = dInvL2_dl * self.variance * ( (cos_term * exp_term - bessel1) - K_no_Var * (eInvL2 - bessel1) ) / denom
         else:
-            embi0 = self.embi0(invL2)
-            # embi1 = self.embi1(invL2)
+            embi0 = _embi0(invL2)
+            # embi1 = _embi1(invL2)
             # embi0min1 = embi0 - embi1
-            embi0min1 = self.embi0min1(invL2)
+            embi0min1 = _embi0min1(invL2)
             dInvL2_dl = -2 * invL2 / self.lengthscale # == -2 / l^3
 
             denom = 1 - embi0
@@ -173,40 +168,10 @@ def update_gradients_full(self, dL_dK, X, X2 = None):
         self.variance.gradient = np.sum(dL_dK * dK_dV)
         self.period.gradient = np.sum(dL_dK * dK_dp)
         self.lengthscale.gradient = np.sum(dL_dK * dK_dl)
-    # def update_gradients_full(self, dL_dK, X, X2 = None):
-    #     if X2 is None: X2 = X
-    #     trig_arg = 2 * np.pi * (X - X2.T) / self.period
-    #     cos_term = np.cos(trig_arg)
-    #     sin_term = np.sin(trig_arg)
-    #
-    #     if np.any(self.lengthscale > 1e4): # Limit for l -> infinity
-    #         dK_dV = cos_term # K / V
-    #
-    #         dK_dp = self.variance * trig_arg * sin_term / self.period
-    #
-    #         # This is 0 in the limit, but best to set it to a small non-0 value
-    #         dK_dl = 1e-4 / self.lengthscale
-    #     else:
-    #         # Overflow on exp and i0 by going backwards from sys.float_info.max (1.7976931348623157e+308): 1/l^2 < 709.782712893384
-    #         invL2 = np.clip(1 / np.where(self.lengthscale < 1e-100, 1e-200, self.lengthscale ** 2), 0, 705)
-    #         bessel0 = i0(invL2)
-    #         bessel1 = i1(invL2)
-    #         eInvL2 = np.exp(invL2)
-    #         denom = eInvL2 - bessel0
-    #         exp_term = np.exp(cos_term * invL2)
-    #
-    #         dK_dV = (exp_term - bessel0) / denom # = K / V
-    #
-    #         dK_dp = (self.variance * invL2 * trig_arg * sin_term / self.period) * (exp_term / denom) # exp terms division separate because of overflow risk
-    #
-    #         K = self.variance * dK_dV
-    #         dInvL2_dl = -2 / np.where(self.lengthscale < 1e-100, 1e-300, self.lengthscale ** 3)
-    #         dK_dl = dInvL2_dl * ( self.variance * ((cos_term * exp_term - bessel1) / denom) - K * ((eInvL2 - bessel1) / denom) ) # The brackets prevent some overflows; want those divisions first
-    #
-    #     self.variance.gradient = np.sum(dL_dK * dK_dV)
-    #     self.period.gradient = np.sum(dL_dK * dK_dp)
-    #     self.lengthscale.gradient = np.sum(dL_dK * dK_dl)
-
+        # invL2 = 1 / self.lengthscale ** 2
+        # bessel0 = i0(invL2)
+        # bessel1 = i1(invL2)
+        # self.variance.gradient, self.period.gradient, self.lengthscale.gradient = _update_gradients_full(self.variance, self.period, self.lengthscale, invL2, bessel0, bessel1, dL_dK, X, X if X2 is None else X2)
 
 
 # Unused functions for a possible version of this class as a subclass of Stationary
@@ -236,3 +201,81 @@ def update_gradients_full(self, dL_dK, X, X2 = None):
 #     return - self.variance * invL2 * (trig_arg / r) * sin_term * exp_term / (np.exp(invL2) - i0(invL2))
+
+
+## numba versions of some methods
+
+# from GPy_ABCD.Util.benchmarking import numba_comparisons
+# numba_comparisons(_embi0, f8A(f8A), n = 200, args = [x])
+
+
+# # if X2 is None: X2 = X
+# # invL2 = 1 / lengthscale ** 2
+# # bessel0 = i0(invL2)
+# # @njit(f8A2(f8A, f8A, f8A, f8A, f8A, f8A2, f8A2)) # The numpy version is faster
+# def _K(variance, period, lengthscale, invL2, bessel0, X, X2):
+#     cos_term = np.cos((2 * np.pi / period) * (X - X2.T))
+#
+#     if np.any(lengthscale > 1e4): # Limit for l -> infinity
+#         return variance * cos_term
+#     elif np.any(invL2 < 3.75):
+#         exp_term = np.exp(cos_term * invL2)
+#         return variance * ((exp_term - bessel0) / (np.exp(invL2) - bessel0)) # The brackets prevent an overflow; want division first
+#     else:
+#         exp_term = np.exp((cos_term - 1) * invL2)
+#         embi0 = _embi0(invL2)
+#         return variance * ((exp_term - embi0) / (1 - embi0)) # The brackets prevent an overflow; want division first
+
+
+
+# # invL2 = 1 / lengthscale ** 2
+# # bessel0 = i0(invL2)
+# # bessel1 = i1(invL2)
+# # @njit(nTup(f8, f8, f8)(f8A, f8A, f8A, f8A, f8A, f8A, f8A2, f8A2, f8A2)) # The numpy version is faster
+# def _update_gradients_full(variance, period, lengthscale, invL2, bessel0, bessel1, dL_dK, X, X2):
+#     trig_arg = (2 * np.pi / period) * (X - X2.T)
+#     cos_term = np.cos(trig_arg)
+#     sin_term = np.sin(trig_arg)
+#
+#     if np.any(lengthscale > 1e4): # Limit for l -> infinity
+#         dK_dV = cos_term # K / V
+#
+#         dK_dp = (variance / period) * trig_arg * sin_term
+#
+#         # This is 0 in the limit, but best to set it to a small non-0 value
+#         dK_dl = np.empty_like(dL_dK)
+#         dK_dl[:,:] = 1e-4 / lengthscale[0]
+#     elif np.any(invL2 < 3.75):
+#         eInvL2 = np.exp(invL2)
+#         dInvL2_dl = -2 * invL2 / lengthscale # == -2 / l^3
+#
+#         denom = eInvL2 - bessel0
+#         exp_term = np.exp(cos_term * invL2)
+#         K_no_Var = (exp_term - bessel0) / denom # == K / V; here just for clarity of further expressions
+#
+#
+#         dK_dV = K_no_Var
+#
+#         dK_dp = (variance / period) * invL2 * trig_arg * sin_term * exp_term / denom
+#
+#         dK_dl = dInvL2_dl * variance * ( (cos_term * exp_term - bessel1) - K_no_Var * (eInvL2 - bessel1) ) / denom
+#     else:
+#         embi0 = _embi0(invL2)
+#         # embi1 = _embi1(invL2)
+#         # embi0min1 = embi0 - embi1
+#         embi0min1 = _embi0min1(invL2)
+#         dInvL2_dl = -2 * invL2 / lengthscale # == -2 / l^3
+#
+#         denom = 1 - embi0
+#         exp_term = np.exp((cos_term - 1) * invL2)
+#         K_no_Var = (exp_term - embi0) / denom # == K / V; here just for clarity of further expressions
+#
+#
+#         dK_dV = K_no_Var
+#
+#         dK_dp = (variance / period) * invL2 * trig_arg * sin_term * exp_term / denom # I.e. SAME as the above case at this abstraction level
+#
+#         dK_dl = dInvL2_dl * variance * ( (cos_term - 1) * exp_term + embi0min1 - K_no_Var * embi0min1 ) / denom
+#
+#     return np.sum(dL_dK * dK_dV), np.sum(dL_dK * dK_dp), np.sum(dL_dK * dK_dl) # variance.gradient, period.gradient, lengthscale.gradient
+
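
A minimal usage sketch (not part of the diff, and only an assumption about a typical workflow): PureStdPeriodicKernel behaves like any other GPy kernel, so it can be dropped straight into a GPy regression:

    import numpy as np, GPy
    from GPy_ABCD.Kernels.periodicKernel import PureStdPeriodicKernel

    X = np.linspace(0, 10, 101)[:, None]
    Y = np.cos(2 * np.pi * X / 3) + np.random.randn(101, 1) * 0.1
    m = GPy.models.GPRegression(X, Y, PureStdPeriodicKernel(1))
    m.optimize()
    print(m.kern.period) # Should recover a period of roughly 3
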
diff --git a/GPy_ABCD/Kernels/sigmoidalKernels.py b/GPy_ABCD/Kernels/sigmoidalKernels.py
index 803be49..feb5ae0 100644
--- a/GPy_ABCD/Kernels/sigmoidalKernels.py
+++ b/GPy_ABCD/Kernels/sigmoidalKernels.py
@@ -5,12 +5,41 @@
 from paramz.transformations import Logexp
 from paramz.caching import Cache_this
 
+from GPy_ABCD.Util.numba_types import *
+# from GPy_ABCD.Util.benchmarking import numba_comparisons
+
+
+@njit(f8A2(f8A2))
+def _sigmoid_function(arg): return (1 + arg / (1 + np.abs(arg))) / 2
+@njit(f8A2(f8A2))
+def _sigmoid_function_d(arg): return 0.5 / (1 + np.abs(arg)) ** 2
+# Since arg is usually (x - l) / s, derivatives by the constituent terms are just _sigmoid_function_d(arg) * ...
+#   x: * 1/s
+#   l: * -1/s
+#   s: * -(x-l)/s^2, i.e. * -arg/s
+# When arg is (x - l - w) / (-s) [NOTE THE -s], derivatives by the constituent terms are just _sigmoid_function_d(arg) * ...
+#   x: * -1/s
+#   l: * 1/s
+#   w: * 1/s
+#   s: * (x-l-w)/(-s)^2, i.e. * -argW/s
+
+@njit(f8(f8))
+def _core_sigmoid_function(arg): return arg / (1 + np.abs(arg))
+# NOTE: This function can be used to compute the maximum height of the sum of opposite sigmoidals - 1 by feeding it arg = w / (2 * s)
+#   Let arg = (x - l) / s and argW = (x - l - w) / (-s) [NOTE THE -s]
+#   This is the height of sig(arg) + sig(argW) - 1 for midpoint x: l + w/2 in both args (which become w/2s),
+#   so 2*sig(w/2s) - 1 = the core sigmoid ([-1,1]) on +ve values because of +ve w and s: core_sig(w/2s)
+@njit(f8(f8))
+def _inv_core_sigmoid_d(arg): return - 1 / arg ** 2
+# NOTE: This function comes up in computing the derivatives of the scaled (to 1) version of the above sum of sigmoidals
+# Since arg is always w / (2 * s), derivatives are just _inv_core_sigmoid_d(arg) * ...
+#   w: * 1/2s
+#   s: * -w/(2s^2), i.e. * -arg/s
+
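
A quick numerical check (not part of the diff): _sigmoid_function_d should be the derivative of _sigmoid_function, since d/dx [x / (1 + |x|)] = 1 / (1 + |x|)^2 and the (1 + ...) / 2 rescaling to [0, 1] halves it:

    import numpy as np
    from GPy_ABCD.Kernels.sigmoidalKernels import _sigmoid_function, _sigmoid_function_d

    arg, h = np.linspace(-5, 5, 11)[:, None], 1e-6
    fd = (_sigmoid_function(arg + h) - _sigmoid_function(arg - h)) / (2 * h)
    assert np.allclose(fd, _sigmoid_function_d(arg), atol = 1e-8)
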
 
 class SigmoidalKernelBase(BasisFuncKernel):
-    """
-    Abstract superclass for sigmoidal kernels
-    """
+    '''Abstract superclass for sigmoidal kernels'''
 
     def __init__(self, input_dim: int, reverse: bool = False, variance: float = 1., location: float = 0., slope: float = 0.2,
                  active_dims: int = None, name: str = 'sigmoidal_kernel_base', fixed_slope = False) -> None:
@@ -30,33 +59,6 @@ def __init__(self, input_dim: int, reverse: bool = False, variance: float = 1.,
         self.slope = Param('slope', slope, Logexp()) # This +ve constraint makes non-reverse sigmoids only fit (+ve or -ve) curves going away from 0; similarly for other kernels
         self.link_parameter(self.slope)
 
-    @staticmethod
-    def _sigmoid_function(arg): return (1 + arg / (1 + abs(arg))) / 2
-    @staticmethod
-    def _sigmoid_function_d(arg): return 0.5 / (1 + abs(arg)) ** 2
-    # Since arg is usually (x - l) / s, derivatives by the constituent terms are just _sigmoid_function_d(arg) * ...
-    #   x: * 1/s
-    #   l: * -1/s
-    #   s: * -(x-l)/s^2, i.e. * -arg/s
-    # When arg is (x - l - w) / (-s) [NOTE THE -s], derivatives by the constituent terms are just _sigmoid_function_d(arg) * ...
-    #   x: * -1/s
-    #   l: * 1/s
-    #   w: * 1/s
-    #   s: * (x-l-w)/(-s)^2, i.e. * -argW/s
-
-    @staticmethod
-    def _core_sigmoid_function(arg): return arg / (1 + abs(arg))
-    # NOTE: This function can be used to compute the maximum height of the sum of opposite sigmoidals - 1 by feeding it arg = w / (2 * s)
-    #   Let arg = (x - l) / s and argW (x - l - w) / (-s) [NOTE THE -s]
-    #   This is the height of sig(arg) + sig(argW) - 1 for midpoint x: l + w/2 in both args (which become w/2s)
-    #   so 2*sig(w/2s) - 1 = the core sigmoid ([-1,1]) on +ve values because of +ve w and s: core_sig(w/2s)
-    @staticmethod
-    def _inv_core_sigmoid_d(arg): return - 1 / arg ** 2
-    # NOTE: This function comes up in computing the derivatives of the scaled (to 1) version of the above sum of sigmoidals
-    # Since arg is always w / (2 * s), derivatives are just _inv_core_sigmoid_d(arg) * ...
-    #   w: * 1/2s
-    #   s: * -w/(2s^2), i.e. * -arg/s
-
     def update_gradients_diag(self, dL_dKdiag, X):
         raise NotImplementedError
 
@@ -64,68 +66,84 @@ def update_gradients_diag(self, dL_dKdiag, X):
 
 # Based on LogisticBasisFuncKernel
 class SigmoidalKernel(SigmoidalKernelBase):
-    """
-    Sigmoidal kernel
-    (ascending by default; descending by reverse=True)
-    """
+    '''Sigmoidal kernel
+    (ascending by default; descending by reverse = True)'''
 
     def __init__(self, input_dim: int, reverse: bool = False, variance: float = 1., location: float = 0., slope: float = 0.2,
                  active_dims: int = None, name: str = 'sigmoidal', fixed_slope = False) -> None:
         super(SigmoidalKernel, self).__init__(input_dim, reverse, variance, location, slope, active_dims, name, fixed_slope)
 
-    @Cache_this(limit=3, ignore_args=())
+    @Cache_this(limit = 3)
     def _phi(self, X):
-        asc = self._sigmoid_function((X - self.location) / self.slope)
-        return 1 - asc if self.reverse else asc
+        return _phi_S(self.location[0], self.slope if self._fixed_slope else self.slope[0], self.reverse, X)
 
-    def update_gradients_full(self, dL_dK, X, X2=None):
+    def update_gradients_full(self, dL_dK, X, X2 = None):
         super(SigmoidalKernel, self).update_gradients_full(dL_dK, X, X2)
 
-        if X2 is None or X is X2:
-            phi1 = self.phi(X)
-            if phi1.ndim != 2:
-                phi1 = phi1[:, None]
-
-            arg = (X - self.location) / self.slope
-            sig_d = self._sigmoid_function_d(arg)
-
-            dphi1_dl = sig_d / (-self.slope)
-            self.location.gradient = np.sum(self.variance * 2 * (dL_dK * phi1.dot(dphi1_dl.T)).sum())
+        phi1 = self.phi(X)
+        if phi1.ndim != 2: phi1 = phi1[:, None]
 
-            if not self._fixed_slope:
-                dphi1_ds = sig_d * (-arg / self.slope)
-                self.slope.gradient = np.sum(self.variance * 2 * (dL_dK * phi1.dot(dphi1_ds.T)).sum())
+        if X2 is None or X is X2:
+            self.location.gradient, slope_gradient = _update_gradients_full_S_1(self.variance[0], self.location[0],
+                self.slope if self._fixed_slope else self.slope[0], self._fixed_slope, phi1, dL_dK, X)
+            if not self._fixed_slope: self.slope.gradient = slope_gradient
         else:
-            phi1 = self.phi(X)
             phi2 = self.phi(X2)
-            if phi1.ndim != 2:
-                phi1 = phi1[:, None]
-                phi2 = phi2[:, None]
-
-            arg, arg2 = (X - self.location) / self.slope, (X2 - self.location) / self.slope
-            sig_d, sig_d2 = self._sigmoid_function_d(arg), self._sigmoid_function_d(arg2)
+            if phi1.ndim != 2: phi2 = phi2[:, None]
 
-            dphi1_dl, dphi2_dl = sig_d / (-self.slope), sig_d2 / (-self.slope)
-            self.location.gradient = np.sum(self.variance * (dL_dK * (phi1.dot(dphi2_dl.T) + phi2.dot(dphi1_dl.T))).sum())
-            self.location.gradient = np.where(np.isnan(self.location.gradient), 0, self.location.gradient)
-
-            if not self._fixed_slope:
-                dphi1_ds = sig_d * (-arg / self.slope)
-                dphi2_ds = sig_d2 * (-arg2 / self.slope)
-                self.slope.gradient = np.sum(self.variance * (dL_dK * (phi1.dot(dphi2_ds.T) + phi2.dot(dphi1_ds.T))).sum())
-                self.slope.gradient = np.where(np.isnan(self.slope.gradient), 0, self.slope.gradient)
+            self.location.gradient, slope_gradient = _update_gradients_full_S_2(self.variance[0], self.location[0],
+                self.slope if self._fixed_slope else self.slope[0], self._fixed_slope, phi1, phi2, dL_dK, X, X2)
+            if not self._fixed_slope: self.slope.gradient = slope_gradient
 
         if self.reverse: # Works this simply (a single sign flip per product above)
             self.location.gradient = - self.location.gradient
             if not self._fixed_slope: self.slope.gradient = - self.slope.gradient
 
+
+@njit(f8A2(f8, f8, b1, f8A2)) # Almost 2x faster than numpy version
+def _phi_S(location, slope, reverse, X):
+    asc = _sigmoid_function((X - location) / slope)
+    return 1 - asc if reverse else asc
+
+@njit(nTup(f8, f8)(f8, f8, f8, b1, f8A2, f8A2, f8A2)) # 1.5x faster than numpy version
+def _update_gradients_full_S_1(variance, location, slope, _fixed_slope, phi1, dL_dK, X):
+    arg = (X - location) / slope
+    sig_d = _sigmoid_function_d(arg)
+
+    dphi1_dl = sig_d / (-slope)
+    location_gradient = variance * 2 * (dL_dK * phi1.dot(dphi1_dl.T)).sum()
+    if np.isnan(location_gradient): location_gradient = 0.0
+
+    if not _fixed_slope:
+        dphi1_ds = sig_d * (-arg / slope)
+        slope_gradient = variance * 2 * (dL_dK * phi1.dot(dphi1_ds.T)).sum()
+        if np.isnan(slope_gradient): slope_gradient = 0.0
+    else: slope_gradient = 1.0 # Going to discard it anyway
+
+    return location_gradient, slope_gradient
+
+@njit(nTup(f8, f8)(f8, f8, f8, b1, f8A2, f8A2, f8A2, f8A2, f8A2))
+def _update_gradients_full_S_2(variance, location, slope, _fixed_slope, phi1, phi2, dL_dK, X, X2):
+    arg, arg2 = (X - location) / slope, (X2 - location) / slope
+    sig_d, sig_d2 = _sigmoid_function_d(arg), _sigmoid_function_d(arg2)
+
+    dphi1_dl, dphi2_dl = sig_d / (-slope), sig_d2 / (-slope)
+    location_gradient = variance * (dL_dK * (phi1.dot(dphi2_dl.T) + phi2.dot(dphi1_dl.T))).sum()
+    if np.isnan(location_gradient): location_gradient = 0.0
+
+    if not _fixed_slope:
+        dphi1_ds = sig_d * (-arg / slope)
+        dphi2_ds = sig_d2 * (-arg2 / slope)
+        slope_gradient = variance * (dL_dK * (phi1.dot(dphi2_ds.T) + phi2.dot(dphi1_ds.T))).sum()
+        if np.isnan(slope_gradient): slope_gradient = 0.0
+    else: slope_gradient = 1.0 # Going to discard it anyway
+
+    return location_gradient, slope_gradient
+
 
 class SigmoidalIndicatorKernel(SigmoidalKernelBase):
-    """
-    Sigmoidal indicator function kernel with parametrised start location and width:
+    '''Sigmoidal indicator function kernel with parametrised start location and width:
     ascendingSigmoid(location) + (1 - ascendingSigmoid(location + width)) - 1, i.e. ascendingSigmoid(location) - ascendingSigmoid(location + width)
-    (hat if width > 0, and positive-well otherwise; can flip from one to the other by reverse=True)
-    """
+    (hat if width > 0, and positive-well otherwise; can flip from one to the other by reverse=True)'''
 
     def __init__(self, input_dim: int, reverse: bool = False, variance: float = 1., location: float = 0., slope: float = 0.2, width: float = 1.,
                  active_dims: int = None, name: str = 'sigmoidal_indicator', fixed_slope = False) -> None:
@@ -135,13 +153,9 @@ def __init__(self, input_dim: int, reverse: bool = False, variance: float = 1.,
 
     @Cache_this(limit = 3)
     def _phi(self, X):
-        asc = self._sigmoid_function((X - self.location) / self.slope)
-        desc = self._sigmoid_function((X - self.location - self.width) / (-self.slope))
-        height = self._core_sigmoid_function(self.width / (2 * self.slope))
-        hat = (asc + desc - 1) / height
-        return 1 - hat if self.reverse else hat
+        return _phi_SI(self.location[0], self.slope if self._fixed_slope else self.slope[0], self.width[0], self.reverse, X)
 
-    def update_gradients_full(self, dL_dK, X, X2=None):
+    def update_gradients_full(self, dL_dK, X, X2 = None):
         super(SigmoidalIndicatorKernel, self).update_gradients_full(dL_dK, X, X2)
         if X2 is None: X2 = X
 
@@ -151,51 +165,68 @@ def update_gradients_full(self, dL_dK, X, X2=None):
             phi1 = phi1[:, None]
             phi2 = phi2[:, None] if X2 is not X else phi1
 
-        arg, arg2 = (X - self.location) / self.slope, (X2 - self.location) / self.slope
-        argW, argW2 = (X - self.location - self.width) / (-self.slope), (X2 - self.location - self.width) / (-self.slope)
-        sig_d, sig_d2 = self._sigmoid_function_d(arg), self._sigmoid_function_d(arg2)
-        sig_dW, sig_dW2 = self._sigmoid_function_d(argW), self._sigmoid_function_d(argW2)
-
-        numerator1 = self._sigmoid_function(arg) + self._sigmoid_function(argW) - 1 # asc1 + desc1 - 1
-        numerator2 = self._sigmoid_function(arg2) + self._sigmoid_function(argW2) - 1 # asc2 + desc2 - 1
-
-        height_arg = self.width / (2 * self.slope)
-        inv_height = 1 / self._core_sigmoid_function(height_arg)
-        inv_height_d = self._inv_core_sigmoid_d(height_arg)
-        dinvheight_dw = inv_height_d / (2 * self.slope)
-        dinvheight_ds = inv_height_d * (-height_arg / self.slope)
-
-        # Repeated _sigmoid_function_d reference:
-        # Since arg is usually (x - l) / s, derivatives by the constituent terms are just _sigmoid_function_d(arg) * ...
-        #   x: * 1/s
-        #   l: * -1/s
-        #   s: * -(x-l)/s^2, i.e. * -arg/s
-        # When arg is (x - l - w) / (-s) [NOTE THE -s], derivatives by the constituent terms are just _sigmoid_function_d(arg) * ...
-        #   x: * -1/s
-        #   l: * 1/s
-        #   w: * 1/s
-        #   s: * (x-l-w)/(-s)^2, i.e. * -argW/s
-
-        dphi1_dl = ((- sig_d + sig_dW) / self.slope) * inv_height #+ (asc + desc) * dinvheight_dl, which is 0
-        dphi2_dl = ((- sig_d2 + sig_dW2) / self.slope) * inv_height #+ (asc + desc) * dinvheight_dl, which is 0
-        self.location.gradient = np.sum(self.variance * (dL_dK * (phi1.dot(dphi2_dl.T) + phi2.dot(dphi1_dl.T))).sum())
-        self.location.gradient = np.where(np.isnan(self.location.gradient), 0, self.location.gradient)
-
-        if not self._fixed_slope:
-            dphi1_ds = ((sig_d * arg + sig_dW * argW) / (-self.slope)) * inv_height + numerator1 * dinvheight_ds
-            dphi2_ds = ((sig_d2 * arg2 + sig_dW2 * argW2) / (-self.slope)) * inv_height + numerator2 * dinvheight_ds
-            self.slope.gradient = np.sum(self.variance * (dL_dK * (phi1.dot(dphi2_ds.T) + phi2.dot(dphi1_ds.T))).sum())
-            self.slope.gradient = np.where(np.isnan(self.slope.gradient), 0, self.slope.gradient)
-
-        dphi1_dw = (sig_dW / self.slope) * inv_height + numerator1 * dinvheight_dw # d(asc)_dw = 0
-        dphi2_dw = (sig_dW2 / self.slope) * inv_height + numerator2 * dinvheight_dw # d(asc)_dw = 0
-        self.width.gradient = np.sum(self.variance * (dL_dK * (phi1.dot(dphi2_dw.T) + phi2.dot(dphi1_dw.T))).sum())
-        self.width.gradient = np.where(np.isnan(self.width.gradient), 0, self.width.gradient)
-
-        if self.reverse: # Works this simply (a single sign flip per product above)
-            self.location.gradient = - self.location.gradient
-            if not self._fixed_slope: self.slope.gradient = - self.slope.gradient
-            self.width.gradient = - self.width.gradient
+        self.location.gradient, slope_gradient, self.width.gradient = _update_gradients_full_SI(
+            self.variance[0], self.location[0], self.slope if self._fixed_slope else self.slope[0], self.width[0],
+            self.reverse, self._fixed_slope, phi1, phi2, dL_dK, X, X2)
+        if not self._fixed_slope: self.slope.gradient = slope_gradient
+
+
+@njit(f8A2(f8, f8, f8, b1, f8A2)) # 4.6x faster than numpy version
+def _phi_SI(location, slope, width, reverse, X):
+    asc = _sigmoid_function((X - location) / slope)
+    desc = _sigmoid_function((X - location - width) / (-slope))
+    height = _core_sigmoid_function(width / (2 * slope))
+    hat = (asc + desc - 1) / height
+    return 1 - hat if reverse else hat
+
+@njit(nTup(f8, f8, f8)(f8, f8, f8, f8, b1, b1, f8A2, f8A2, f8A2, f8A2, f8A2)) # 1.45x faster than numpy version
+def _update_gradients_full_SI(variance, location, slope, width, reverse, _fixed_slope, phi1, phi2, dL_dK, X, X2):
+    arg, arg2 = (X - location) / slope, (X2 - location) / slope
+    argW, argW2 = (X - location - width) / (-slope), (X2 - location - width) / (-slope)
+    sig_d, sig_d2 = _sigmoid_function_d(arg), _sigmoid_function_d(arg2)
+    sig_dW, sig_dW2 = _sigmoid_function_d(argW), _sigmoid_function_d(argW2)
+
+    numerator1 = _sigmoid_function(arg) + _sigmoid_function(argW) - 1 # asc1 + desc1 - 1
+    numerator2 = _sigmoid_function(arg2) + _sigmoid_function(argW2) - 1 # asc2 + desc2 - 1
+
+    height_arg = width / (2 * slope)
+    inv_height = 1 / _core_sigmoid_function(height_arg)
+    inv_height_d = _inv_core_sigmoid_d(height_arg)
+    dinvheight_dw = inv_height_d / (2 * slope)
+    dinvheight_ds = inv_height_d * (-height_arg / slope)
+
+    # Repeated _sigmoid_function_d reference:
+    # Since arg is usually (x - l) / s, derivatives by the constituent terms are just _sigmoid_function_d(arg) * ...
+    #   x: * 1/s
+    #   l: * -1/s
+    #   s: * -(x-l)/s^2, i.e. * -arg/s
+    # When arg is (x - l - w) / (-s) [NOTE THE -s], derivatives by the constituent terms are just _sigmoid_function_d(arg) * ...
+    #   x: * -1/s
+    #   l: * 1/s
+    #   w: * 1/s
+    #   s: * (x-l-w)/(-s)^2, i.e. * -argW/s
+
+    dphi1_dl = ((- sig_d + sig_dW) / slope) * inv_height #+ (asc + desc) * dinvheight_dl, which is 0
+    dphi2_dl = ((- sig_d2 + sig_dW2) / slope) * inv_height #+ (asc + desc) * dinvheight_dl, which is 0
+    location_gradient = variance * (dL_dK * (phi1.dot(dphi2_dl.T) + phi2.dot(dphi1_dl.T))).sum()
+    if np.isnan(location_gradient): location_gradient = 0.0
+
+    if not _fixed_slope:
+        dphi1_ds = ((sig_d * arg + sig_dW * argW) / (-slope)) * inv_height + numerator1 * dinvheight_ds
+        dphi2_ds = ((sig_d2 * arg2 + sig_dW2 * argW2) / (-slope)) * inv_height + numerator2 * dinvheight_ds
+        slope_gradient = variance * (dL_dK * (phi1.dot(dphi2_ds.T) + phi2.dot(dphi1_ds.T))).sum()
+        if np.isnan(slope_gradient): slope_gradient = 0.0
+    else: slope_gradient = 1.0 # Going to discard it anyway
+
+    dphi1_dw = (sig_dW / slope) * inv_height + numerator1 * dinvheight_dw # d(asc)_dw = 0
+    dphi2_dw = (sig_dW2 / slope) * inv_height + numerator2 * dinvheight_dw # d(asc)_dw = 0
+    width_gradient = variance * (dL_dK * (phi1.dot(dphi2_dw.T) + phi2.dot(dphi1_dw.T))).sum()
+    if np.isnan(width_gradient): width_gradient = 0.0
+
+    if reverse: # Works this simply (a single sign flip per product above)
+        location_gradient = - location_gradient
+        if not _fixed_slope: slope_gradient = - slope_gradient
+        width_gradient = - width_gradient
+
+    return location_gradient, slope_gradient, width_gradient
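
A small numerical check (not part of the diff): since the hat in _phi_SI is rescaled by core_sig(w / (2s)), its peak at the midpoint x = l + w/2 should be exactly 1:

    import numpy as np
    from GPy_ABCD.Kernels.sigmoidalKernels import _phi_SI

    location, slope, width = 2.0, 0.5, 3.0
    midpoint = np.array([[location + width / 2]])
    assert np.isclose(_phi_SI(location, slope, width, False, midpoint)[0, 0], 1.0)
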
diff --git a/GPy_ABCD/Models/modelSearch.py b/GPy_ABCD/Models/modelSearch.py
index effb5c2..047edf8 100644
--- a/GPy_ABCD/Models/modelSearch.py
+++ b/GPy_ABCD/Models/modelSearch.py
@@ -39,8 +39,8 @@ def fit_mods_parallel_processes(X, Y, k_exprs, restarts = 5, optimiser = GPy_opt
 
 
 def explore_model_space(X, Y,
-        start_kernels: List[Union[str, KernelExpression]] = start_kernels['Default'], p_rules: List[Callable] = production_rules['Default'], utility_function: Callable = BIC,
-        rounds: int = 2, beam: Union[int, List[int]] = [3, 2, 1], restarts: int = 5,
+        start_kernels: List[Union[str, KernelExpression]] = start_kernels['Default'], p_rules: List[Callable] = production_rules['Default'],
+        utility_function: Callable = BIC, rounds: int = 2, beam: Union[int, List[int]] = [3, 2, 1], restarts: int = 5,
         model_list_fitter: Callable = fit_mods_parallel_processes, optimiser: str = GPy_optimisers[0], verbose: bool = True) -> Tuple[List[GPModel], List[List[GPModel]], List[KernelExpression], List[GPModel], List[GPModel]]:
     '''Perform `rounds` rounds of kernel expansion followed by model fit starting from the given `start_kernels` and expanding the best `beam` of them with `p_rules` production rules
diff --git a/GPy_ABCD/Util/benchmarking.py b/GPy_ABCD/Util/benchmarking.py
new file mode 100644
index 0000000..5bf37ad
--- /dev/null
+++ b/GPy_ABCD/Util/benchmarking.py
@@ -0,0 +1,100 @@
+'''This file is an old, reduced, Python-3.8-compatible version of the one in https://github.com/T-Flet/Python-Generic-Util'''
+import time
+from pandas import DataFrame, concat
+from numba import njit, vectorize
+
+from typing import Callable, Dict, List, Tuple
+
+
+def timethis(f: Callable, n = 2, *args, **kwargs):
+    '''Return n execution times for the given function with given arguments'''
+    ts = []
+    for i in range(n):
+        ts.append(time.perf_counter())
+        f(*args, **kwargs)
+        ts[-1] = time.perf_counter() - ts[-1]
+    return ts
+
+
+def compare_implementations(fs_with_shared_args: Dict[str, Callable], n = 200, wait = 1, verbose = True,
+                            fs_with_own_args: Dict[str, Tuple[Callable, List, Dict]] = None, args: List = None, kwargs: Dict = None):
+    '''Benchmark multiple implementations of the same function called n times each with the same args and kwargs, with a break between functions.
+    fs_with_own_args is meant for additional functions taking different *args and **kwargs.
+    Recommended output view if not verbose: print(table.to_markdown(index = False)).'''
+    assert n >= 3
+    table = []
+    for name, f in fs_with_shared_args.items():
+        time.sleep(wait)
+        if args:
+            if kwargs: times = timethis(f, n, *args, **kwargs)
+            else: times = timethis(f, n, *args)
+        elif kwargs: times = timethis(f, n, **kwargs)
+        else: times = timethis(f, n)
+        table.append([name, sum(times) / len(times), sum(times[1:]) / (len(times)-1), times[0], times[1], times[2]])
+        if verbose: print(f'Benchmarked {name} - mean {table[-1][1]} and mean excluding 1st run {table[-1][2]}')
+
+    if fs_with_own_args:
+        for name, f_a_k in fs_with_own_args.items():
+            f, args, kwargs = f_a_k
+            time.sleep(wait)
+            if args:
+                if kwargs: times = timethis(f, n, *args, **kwargs)
+                else: times = timethis(f, n, *args)
+            elif kwargs: times = timethis(f, n, **kwargs)
+            else: times = timethis(f, n)
+            table.append([name, sum(times) / len(times), sum(times[1:]) / (len(times)-1), times[0], times[1], times[2]])
+            if verbose: print(f'Benchmarked {name} - mean {table[-1][1]} and mean excluding 1st run {table[-1][2]}')
+
+    table = sorted(table, key = lambda row: row[1])
+
+    last, last1 = table[0][1], table[0][2]
+    table = [(name, m0, m1, m0 / table[0][1], m1 / table[0][2], next_mean_ratio, next_mean_ratio1, t0, t1, t2)
+             for name, m0, m1, t0, t1, t2 in table
+             if (next_mean_ratio := m0 / last) if (next_mean_ratio1 := m1 / last1)
+             if (last := m0) if (last1 := m1)]
+
+    df = DataFrame(table, columns = ['f', 'mean', 'mean excl. 1st', 'best mean ratio', 'best mean1 ratio', 'next mean ratio', 'next mean1 ratio', 't0', 't1', 't2'])
+    if verbose: print('\n', df.to_markdown(index = False), sep = '')
+    return df
+
+
+def numba_comparisons(f: Callable = None, numba_signature = None, separate_numba_signatures = None,
+                      f_scalar: Callable = None, numba_signatures_scalar = None, separate_numba_signaturess_scalar = None, parallel = False,
+                      fs_with_own_args: Dict[str, Tuple[Callable, List, Dict]] = None, n = 200, wait = 1, prefix = None, verbose = True,
+                      args: List = None, kwargs: dict = None):
+    '''Combine available ingredients to benchmark possible numba versions of the given function.
+    numba_signature and parallel are only used if f is given.
+    numba_signatures_scalar is only used if f_scalar is given.
+    separate_numba_signatures and separate_numba_signaturess_scalar are, respectively, a list and a list of lists of
+    additional signatures to be tried in addition to, respectively, numba_signature and numba_signatures_scalar.'''
+    f_dict = dict()
+    if f:
+        f_dict['Base'], f_dict['Lazy'] = f, njit(f)
+        if parallel: f_dict['Parallel'] = njit(parallel = True)(f)
+        if numba_signature:
+            f_dict['Eager'] = njit(numba_signature)(f)
+            if parallel: f_dict['Parallel Eager'] = njit(numba_signature, parallel = True)(f)
+        if separate_numba_signatures:
+            for i, sig in enumerate(separate_numba_signatures):
+                f_dict[f'Eager{i+2}'] = njit(sig)(f)
+                if parallel: f_dict[f'Parallel Eager{i+2}'] = njit(sig, parallel = True)(f)
+    if f_scalar:
+        f_dict['Vec Lazy'] = vectorize(f_scalar)
+        if numba_signatures_scalar: f_dict['Vec Eager'] = vectorize(numba_signatures_scalar)(f_scalar)
+        if separate_numba_signaturess_scalar:
+            for i, sigs in enumerate(separate_numba_signaturess_scalar): f_dict[f'Vec Eager{i+2}'] = vectorize(sigs)(f_scalar)
+    if prefix: f_dict = {(prefix + k): v for k, v in f_dict.items()}
+    return compare_implementations(f_dict, n, wait, verbose, fs_with_own_args, args, kwargs)
+
+
+def merge_bench_tables(*tables, verbose = True):
+    table = concat(tables).sort_values(by = 'mean').reset_index(drop = True)
+    table['best mean ratio'] = table['mean'] / table['mean'].values[0]
+    table['best mean1 ratio'] = table['mean excl. 1st'] / table['mean excl. 1st'].values[0]
+    table['next mean ratio'] = table['mean'] / table['mean'].shift()
+    table['next mean1 ratio'] = table['mean excl. 1st'] / table['mean excl. 1st'].shift()
+    table.loc[0, 'next mean ratio'], table.loc[0, 'next mean1 ratio'] = 1, 1
+    if verbose: print('\n', table.to_markdown(index = False), sep = '')
+    return table
+
diff --git a/GPy_ABCD/Util/modelUtil.py b/GPy_ABCD/Util/modelUtil.py
index 8d5ad03..1df490e 100644
--- a/GPy_ABCD/Util/modelUtil.py
+++ b/GPy_ABCD/Util/modelUtil.py
@@ -56,7 +56,7 @@ def fit_GPy_kern(X, Y, kernel, restarts, score = BIC, **kwargs):
     m.plot()
     print(m.kern)
     print(f'Log-Likelihood: {m.log_likelihood()}')
-    print(f'{score.__name__}: {score(m.log_likelihood(), len(X), m._size_transformed())}')
+    print(f'{score.__name__}: {score(m, m.log_likelihood(), len(X), m._size_transformed())}')
     plt.show()
diff --git a/GPy_ABCD/Util/numba_types.py b/GPy_ABCD/Util/numba_types.py
new file mode 100644
index 0000000..84ab29d
--- /dev/null
+++ b/GPy_ABCD/Util/numba_types.py
@@ -0,0 +1,27 @@
+'''This file is an old, reduced, Python-3.8-compatible version of the one in https://github.com/T-Flet/Python-Generic-Util'''
+from numba import njit, vectorize, guvectorize, stencil, typeof, b1, f8, i8
+from numba.core.types import Array
+from numba.core.types.containers import Tuple
+
+from typing import Union
+
+
+## This script serves to export common numba functions and type signatures easily and without namespace pollution
+__all__ = ['njit', 'typeof',
+           'b1', 'f8', 'i8', # The basic types used in the project
+           'b1A', 'f8A', 'i8A', # Shorthand for their C-consecutive 1d arrays
+           'b1A2', 'f8A2', 'i8A2', # Shorthand for their C-consecutive 2d arrays
+           'nTup', 'Union'] # numba's Tuple (renamed to nTup to avoid clashes) and typing's Union
+
+
+b1A = b1[::1]
+f8A = f8[::1]
+i8A = i8[::1]
+
+b1A2 = b1[:, ::1]
+f8A2 = f8[:, ::1]
+i8A2 = i8[:, ::1]
+
+def nTup(*args): return Tuple(args)
+
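
A hypothetical example (not part of the diff) of how the exported shorthands compose: eagerly compile a function taking a 2d float array and returning a tuple of two floats, mirroring the signatures used in the kernel modules above:

    import numpy as np
    from GPy_ABCD.Util.numba_types import njit, nTup, f8, f8A2

    @njit(nTup(f8, f8)(f8A2))
    def min_max(X): # A toy stand-in for the kernels' gradient helpers
        return X.min(), X.max()

    print(min_max(np.random.randn(5, 3)))
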
"""GPy-ABCD - Basic implementation with GPy of an Automatic Bayesian Covariance Discovery (ABCD) system""" -__version__ = '1.1' # Change it in setup.py too +__version__ = '1.2' # Change it in setup.py too __author__ = 'Thomas Fletcher ' # __all__ = [] diff --git a/README.rst b/README.rst index 7e2407b..57d334e 100644 --- a/README.rst +++ b/README.rst @@ -31,7 +31,7 @@ fits based on the identified functional shapes. See the picture in `Usage` below for an example input/output and read the paper for further details: `Fletcher, T., Bundy, A., & Nuamah, K. . GPy-ABCD: A Configurable Automatic Bayesian Covariance Discovery Implementation. -8th ICML Workshop on Automated Machine Learning (2021) `_ +8th ICML Workshop on Automated Machine Learning (2021) `_ diff --git a/Tests/regressLinearKernels.py b/Tests/regressLinearKernels.py index b353937..e776ec2 100644 --- a/Tests/regressLinearKernels.py +++ b/Tests/regressLinearKernels.py @@ -37,7 +37,11 @@ plt.show() - # VERDICT: LinearWithOffset is just better: better fits (even accounting for extra params) which is interpretable (the offsets are just roots) - # And having a single variance per product is even better + # VERDICT: LinearWithOffset is just better: better fits (even accounting for extra params), is interpretable (the offsets are just roots), + # and having a single variance per product is even better + +# from GPy_ABCD.Util.benchmarking import timethis +# times = timethis(fit_GPy_kern, 5, X, Y, kernel, 20, optimizer = GPy_optimisers[0]) +# print(sum(times) / len(times)) diff --git a/Tests/regressPeriodicKernel.py b/Tests/regressPeriodicKernel.py index 6d56c61..aecff22 100644 --- a/Tests/regressPeriodicKernel.py +++ b/Tests/regressPeriodicKernel.py @@ -37,3 +37,7 @@ plt.show() +# from GPy_ABCD.Util.benchmarking import timethis +# times = timethis(fit_kex, 5, X, Y, correct_k, 10) +# print(sum(times) / len(times)) + diff --git a/Tests/regressSigmoidalKernels.py b/Tests/regressSigmoidalKernels.py index db509a9..a33474f 100644 --- a/Tests/regressSigmoidalKernels.py +++ b/Tests/regressSigmoidalKernels.py @@ -20,19 +20,19 @@ def sigTwoLocIndicatorHeight(location, stop_location, slope): # YS = sig((X - 5) / 10) * 5 + np.random.randn(101, 1) * 0.5 #- 100 # fit_GPy_kern(X, YS, S(), 5) # fit_GPy_kern(X, YS, Sr(), 5) -# fit_GPy_kern(X, YS, S + C, 5) +# fit_GPy_kern(X, YS, S() + C(), 5) # YS = -YS # fit_GPy_kern(X, YS, S(), 5) # fit_GPy_kern(X, YS, Sr(), 5) -# fit_GPy_kern(X, YS, S + C, 5) +# fit_GPy_kern(X, YS, S() + C(), 5) # YS = sig((X - 5) / (-10)) * 5 + np.random.randn(101, 1) * 0.5 #- 100 # fit_GPy_kern(X, YS, S(), 5) # fit_GPy_kern(X, YS, Sr(), 5) -# fit_GPy_kern(X, YS, S + C, 5) +# fit_GPy_kern(X, YS, S() + C(), 5) # YS = -YS # fit_GPy_kern(X, YS, S(), 5) # fit_GPy_kern(X, YS, Sr(), 5) -# fit_GPy_kern(X, YS, S + C, 5) +# fit_GPy_kern(X, YS, S() + C(), 5) # I.e: S only fits functions moving from 0 to +ve or -ve values; Sr ones going TO 0 # Also, adding a constant makes either version fit any vaguely sigmoidal shape diff --git a/Tests/testData.py b/Tests/testData.py index 4e2466c..c5fe708 100644 --- a/Tests/testData.py +++ b/Tests/testData.py @@ -15,6 +15,7 @@ # Only 1, 2, 10 and 11 have published analyses, and their identified formulae are (deciphered from component descriptions): # 1: LIN + PER * LIN + SE + WN * LIN # Default Rules: (PER + C + LIN * (PER + C)) * (PER + PER + WN) * (C + LIN) + # or LIN + PER * LIN * (C + WN) * (PER + PER + C), LIN + PER * (C + WN) * (PER + C) * (PER + LIN) # 2: C + CW_1643_1716(PER + SE + RQ + WN * LIN + WN * 
 #   2: C + CW_1643_1716(PER + SE + RQ + WN * LIN + WN * LIN, C + WN) # Default Rules: C + (PER + C) * (PER + PER + PER + WN)
 #   10: PER + SE + CP_64(PER + WN, CW_69_77(SE, SE) + CP_90(C + WN, WN))
diff --git a/requirements.txt b/requirements.txt
index 57f21ab..c80a5d0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,9 @@
-# paramz>=0.9.5
-plotnine>=0.7.1
-numpy>=1.19.2
-matplotlib>=3.3.2
+numpy==1.23.5 # Because GPy 1.12 uses np.int, which was removed in numpy 1.24
+numba>=0.55.2
+GPy>=1.12.0
 scipy>=1.6.2
 pandas>=1.2.4
-GPy>=1.10.0
+matplotlib>=3.3.2
+plotnine>=0.7.1
 # ipython>=7.25.0
+# paramz>=0.9.5
diff --git a/setup.py b/setup.py
index c0e0e0b..3a658c9 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ def read_requirements():
 
 setup(
     name = 'GPy-ABCD',
-    version = '1.1', # Change it in __init__.py too
+    version = '1.2', # Change it in __init__.py too
 
     url = 'https://github.com/T-Flet/GPy-ABCD',
     license = 'BSD 3-Clause',
@@ -37,6 +37,9 @@ def read_requirements():
         'Development Status :: 4 - Beta', # 'Development Status :: 5 - Production/Stable',
         'Programming Language :: Python',
-        'Programming Language :: Python :: 3.8'
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11'
     ],
 )
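
A hedged sketch (not part of the diff) of how the new benchmarking utilities might be combined to produce speed-up figures like the ones quoted in the kernel comments (e.g. '2x faster than numpy version' for _Kdiag); the two zero-argument wrappers are hypothetical names introduced here for illustration:

    import numpy as np
    from GPy_ABCD.Util.benchmarking import compare_implementations
    from GPy_ABCD.Kernels.linearOffsetKernel import _Kdiag

    variance, offset = np.array([1.0]), np.array([0.5])
    X = np.random.randn(1000, 1)

    def _Kdiag_numpy(): return np.sum(variance * np.square(X - offset), -1)
    def _Kdiag_numba(): return _Kdiag(variance, offset, X)

    compare_implementations({'numpy': _Kdiag_numpy, 'numba': _Kdiag_numba}, n = 200, wait = 1)
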