CArray now preloads the OpenCL GPU context during module initialization (PHP_MINIT).

C1312543 Henrique Saviatto Borba · C1312543 Henrique Saviatto Borba · commit d1cbeddbbb42 · 2020-02-07T17:18:29.000-03:00
diff --git a/config.m4 b/config.m4
@@ -157,6 +157,7 @@ PHP_NEW_EXTENSION(carray,
       kernel/convert_datatype.c \
       kernel/dtype_transfer.c \
       kernel/assign_scalar.c \
+      kernel/gpu.c \
       kernel/common/exceptions.c \
       kernel/item_selection.c \
       kernel/clip.c \
diff --git a/kernel/common/clblas_funcs.c b/kernel/common/clblas_funcs.c
@@ -10,6 +10,7 @@
 
 #include "clblas_funcs.h"
 #include "clBLAS.h"
+#include "../gpu.h"
 
 static MatrixShape
 _select_matrix_shape(CArray *array)
@@ -58,29 +59,22 @@ _bad_strides(CArray * ap)
     return 0;
 }
 
-/*
- * Helper: dispatch to appropriate cblas_?gemm for typenum.
- */
 static void
-clgemm(int typenum, clblasOrder order,
-     clblasTranspose transA, clblasTranspose transB,
-     int m, int n, int k,
-     CArray *A, int lda, CArray *B, int ldb, CArray *R)
-{
-    int i ;
-    const void *Adata = CArray_DATA(A), *Bdata = CArray_DATA(B);
-    void *Rdata = CArray_DATA(R);
-    int ldc = CArray_DIM(R, 1) > 1 ? CArray_DIM(R, 1) : 1;
+cldaxpy(int n_elements, int alpha, double *a, int incX, double *b, int incY) {
+    cl_double alphad = alpha;
+    size_t offsetX = 0, offsetY = 0;
 
     cl_int err;
     cl_platform_id platform = 0;
     cl_device_id device = 0;
     cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
     cl_context ctx = 0;
     cl_command_queue queue = 0;
-    cl_mem bufA, bufB, bufC;
+    cl_mem bufA, bufB;
     cl_event event = NULL;
-    int ret = 0;
+
+    ctx = getCLContext();
+    queue = getCLQueue();
 
     /* Setup OpenCL environment. */
     err = clGetPlatformIDs( 1, &platform, NULL );
@@ -93,6 +87,50 @@ clgemm(int typenum, clblasOrder order,
     /* Setup clBLAS */
     err = clblasSetup( );
 
+    /* Prepare OpenCL memory objects and place matrices inside them. */
+    bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, n_elements * sizeof(double),
+                          NULL, &err );
+
+    php_printf("%d", err);
+
+    bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, sizeof(double),
+                          NULL, &err );
+
+
+
+    err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
+        n_elements * sizeof(double), a, 0, NULL, NULL );
+    err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
+        sizeof(double), b, 0, NULL, NULL );
+
+    err = clblasDaxpy((size_t)n_elements, alphad, bufA, 0, incX, bufB, 0, incY, 1, &queue, 0, NULL, &event);
+
+    return NULL;
+}
+
+/*
+ * Helper: dispatch to appropriate cblas_?gemm for typenum.
+ */
+static void
+clgemm(int typenum, clblasOrder order,
+     clblasTranspose transA, clblasTranspose transB,
+     int m, int n, int k,
+     CArray *A, int lda, CArray *B, int ldb, CArray *R)
+{
+    int i ;
+    const void *Adata = CArray_DATA(A), *Bdata = CArray_DATA(B);
+    void *Rdata = CArray_DATA(R);
+    int ldc = CArray_DIM(R, 1) > 1 ? CArray_DIM(R, 1) : 1;
+
+    cl_int err;
+    cl_context ctx = 0;
+    cl_command_queue queue = 0;
+    cl_mem bufA, bufB, bufC;
+    cl_event event = NULL;
+    int ret = 0;
+
+    ctx = getCLContext();
+    queue = getCLQueue();
 
     /* Prepare OpenCL memory objects and place matrices inside them. */
     bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, m * k * CArray_DESCR(A)->elsize,
@@ -113,6 +151,7 @@ clgemm(int typenum, clblasOrder order,
     cl_float alpha = 1;
     cl_float beta  = 0;
 
+
     switch (typenum) {
         case TYPE_DOUBLE_INT:
             /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
@@ -135,6 +174,7 @@ clgemm(int typenum, clblasOrder order,
                                 m * n * CArray_DESCR(R)->elsize,
                                 Rdata, 0, NULL, NULL );
 
+
     /* Release OpenCL memory objects. */
     clReleaseMemObject( bufC );
     clReleaseMemObject( bufB );
@@ -147,7 +187,6 @@ clgemm(int typenum, clblasOrder order,
     clReleaseCommandQueue( queue );
     clReleaseContext( ctx );
 
-
 }
 
 CArray *
@@ -272,7 +311,7 @@ clblas_matrixproduct(int typenum, CArray * ap1, CArray *ap2, CArray *out, Memory
     }
     else {
         /*
-         * (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2)
+         * (CArray_NDIM(ap1) <= 2 && CArray_NDIM(ap2) <= 2)
          * Both ap1 and ap2 are vectors or matrices
          */
         l = CArray_DIM(ap1, CArray_NDIM(ap1) - 1);
@@ -321,13 +360,9 @@ clblas_matrixproduct(int typenum, CArray * ap1, CArray *ap2, CArray *out, Memory
                                                      *((double *)CArray_DATA(ap1));
             }
             else if (ap1shape != _matrix) {
-                throw_notimplemented_exception();
+                throw_not_implemented_exception();
                 return NULL;
-                /**cblas_daxpy(l,
-                            *((double *)PyArray_DATA(ap2)),
-                            (double *)PyArray_DATA(ap1),
-                            ap1stride/sizeof(double),
-                            (double *)PyArray_DATA(out_buf), 1);*/
+                //cldaxpy(ap1, ap2, out_buf);
             }
             else {
                 int maxind, oind, i, a1s, outs;
@@ -343,12 +378,9 @@ clblas_matrixproduct(int typenum, CArray * ap1, CArray *ap2, CArray *out, Memory
                 a1s = CArray_STRIDE(ap1, maxind) / sizeof(double);
                 outs = CArray_STRIDE(out_buf, maxind) / sizeof(double);
                 for (i = 0; i < CArray_DIM(ap1, oind); i++) {
-                    //cblas_daxpy(l, val, (double *)ptr, a1s,
-                                //(double *)optr, outs);
+                    cldaxpy(l, val, (double *)ptr, a1s, (double *)optr, outs);
                     ptr += CArray_STRIDE(ap1, oind);
                     optr += CArray_STRIDE(out_buf, oind);
-                    throw_notimplemented_exception();
-                    return NULL;
                 }
             }
         }
diff --git a/kernel/gpu.c b/kernel/gpu.c
@@ -0,0 +1,80 @@
+#include "config.h"
+
+#ifdef HAVE_CLBLAS
+#include "gpu.h"
+#include "clBLAS.h"
+
+/*
+ * Process-wide OpenCL handles handed out by getCLContext()/getCLQueue().
+ * They stay NULL until start_clblas_context() completes successfully.
+ */
+cl_context ctx;
+cl_command_queue queue;
+
+/*
+ * Create the shared OpenCL context and command queue on the first GPU
+ * device of the first platform, then initialise clBLAS.
+ *
+ * The function returns nothing, so failure is reported through state: every
+ * OpenCL/clBLAS status is checked, and on the first error anything already
+ * created is released and both handles are left NULL instead of being
+ * built from invalid platform/device/context values.
+ */
+void
+start_clblas_context(void) {
+    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+    cl_platform_id platform = 0;
+    cl_device_id device = 0;
+    cl_int err;
+
+    /* Setup OpenCL environment. */
+    err = clGetPlatformIDs( 1, &platform, NULL );
+    if (err != CL_SUCCESS) {
+        return;
+    }
+
+    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
+    if (err != CL_SUCCESS) {
+        return;
+    }
+
+    props[1] = (cl_context_properties)platform;
+
+    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
+    if (err != CL_SUCCESS) {
+        ctx = NULL;
+        return;
+    }
+
+    queue = clCreateCommandQueue( ctx, device, 0, &err );
+    if (err != CL_SUCCESS) {
+        queue = NULL;
+        clReleaseContext( ctx );
+        ctx = NULL;
+        return;
+    }
+
+    /* Setup clBLAS */
+    err = clblasSetup( );
+    if (err != CL_SUCCESS) {
+        /* clBLAS is unusable: drop the handles so callers see NULL. */
+        clReleaseCommandQueue( queue );
+        clReleaseContext( ctx );
+        queue = NULL;
+        ctx = NULL;
+    }
+}
+
+/* Return the shared context (NULL if initialisation failed or has not run). */
+cl_context
+getCLContext(void) {
+    return ctx;
+}
+
+/* Return the shared command queue (NULL if initialisation failed or has not run). */
+cl_command_queue
+getCLQueue(void) {
+    return queue;
+}
+
+#endif
diff --git a/kernel/gpu.h b/kernel/gpu.h
@@ -0,0 +1,21 @@
+#ifndef PHPSCI_EXT_GPU_H
+#define PHPSCI_EXT_GPU_H
+
+#include "config.h"
+
+#ifdef HAVE_CLBLAS
+#include "clBLAS.h"
+
+/*
+ * Build the shared OpenCL context and command queue and set up clBLAS.
+ * Invoked from PHP_MINIT_FUNCTION(carray) when HAVE_CLBLAS is defined.
+ */
+void start_clblas_context(void);
+
+/* Accessors for the handles created by start_clblas_context(). */
+cl_command_queue getCLQueue(void);
+cl_context getCLContext(void);
+#endif /* HAVE_CLBLAS */
+
+
+#endif //PHPSCI_EXT_GPU_H
diff --git a/kernel/linalg.c b/kernel/linalg.c
@@ -98,10 +98,9 @@ FLOAT_dot(char *ip1, int is1, char *ip2, int is2, char *op, int n)
 void
 DOUBLE_dot(char *ip1, int is1, char *ip2, int is2, char *op, int n)
 {
+#ifdef HAVE_CBLAS
     int is1b = blas_stride(is1, sizeof(double));
     int is2b = blas_stride(is2, sizeof(double));
-
-#ifdef HAVE_CBLAS
     if (is1b && is2b)
     {
         double sum = 0.;
@@ -160,27 +159,27 @@ CArray_Matmul(CArray * ap1, CArray * ap2, CArray * out, MemoryPointer * ptr)
     CArrayIterator * it1, * it2;
     char * op;
 
-    if (CArray_NDIM(ap1) == 0 || CArray_NDIM(ap2) == 0) {
+    /**if (CArray_NDIM(ap1) == 0 || CArray_NDIM(ap2) == 0) {
         throw_valueerror_exception("Scalar operands are not allowed, use '*' instead");
         return NULL;
-    }
+    }**/
     typenum = CArray_ObjectType(ap1, 0);
     typenum = CArray_ObjectType(ap2, typenum);
 
     nd1 = CArray_NDIM(ap1);
     nd2 = CArray_NDIM(ap2);
 
 #ifdef HAVE_BLAS
+#ifndef HAVE_CLBLAS
     if (nd1 <= 2 && nd2 <= 2 && (TYPE_DOUBLE_INT == typenum || TYPE_FLOAT_INT == typenum)) {
         return cblas_matrixproduct(typenum, ap1, ap2, out, ptr);
     }
 #endif
+#endif
 
 #ifdef HAVE_CLBLAS
     if (nd1 <= 2 && nd2 <= 2 && (TYPE_DOUBLE_INT == typenum || TYPE_FLOAT_INT == typenum)) {
         return clblas_matrixproduct(typenum, ap1, ap2, out, ptr);
-        php_printf("FOI");
-        return NULL;
     }
 #endif
 
diff --git a/phpsci.c b/phpsci.c
@@ -59,6 +59,10 @@
 #include "kernel/storage.h"
 #include "kernel/round.h"
 
+#ifdef HAVE_CLBLAS
+#include "kernel/gpu.h"
+#endif
+
 typedef struct _zend_carray_cdata {
     zend_object std;
 } end_carray_cdata;
@@ -2948,6 +2952,9 @@ static PHP_MINIT_FUNCTION(carray)
     carray_object_handlers.compare_objects = carray_compare;
     carray_object_handlers.count_elements = carray_count;
 
+#ifdef HAVE_CLBLAS
+    start_clblas_context();
+#endif
 
     zend_class_implements(carray_sc_entry, 1, zend_ce_arrayaccess);