WIP: memory error in TF backend (separate host and device tensor handles in RAI_ModelRunTF)

lantiga · lantiga · commit 0f86380b6dd0 · 2021-03-11T01:51:41.000+01:00
diff --git a/src/backends/tensorflow.c b/src/backends/tensorflow.c
@@ -28,20 +28,17 @@ int RAI_InitBackendTF(int (*get_api_fn)(const char *, void *)) {
     return REDISMODULE_OK;
 }
 
-// Managing context for the DLManagedTensor, will manage the lifetime of
-// DLManagedTensor. When calling DLManagedTensor::deleter, it will notify the
-// original framework of destruction, and this context will be deleted also.
-struct TfDlManagedTensorCtx {
+struct TFDLManagedTensorCtx {
     TFE_TensorHandle *reference;
     int64_t ndim;
     int64_t *shape;
     int64_t *strides;
     DLManagedTensor tensor;
 };
-typedef struct TfDlManagedTensorCtx TfDlManagedTensorCtx;
+typedef struct TFDLManagedTensorCtx TFDLManagedTensorCtx;
 
-TfDlManagedTensorCtx *TfDlManagedTensorCtx_Create(TFE_TensorHandle *h, TF_Status *status) {
-    TfDlManagedTensorCtx *ctx = RedisModule_Alloc(sizeof(TfDlManagedTensorCtx));
+TFDLManagedTensorCtx *TFDLManagedTensorCtx_Create(TFE_TensorHandle *h, TF_Status *status) {
+    TFDLManagedTensorCtx *ctx = RedisModule_Alloc(sizeof(TFDLManagedTensorCtx));
     ctx->ndim = TFE_TensorHandleNumDims(h, status);
     ctx->shape = RedisModule_Calloc(ctx->ndim, sizeof(int64_t));
     ctx->strides = RedisModule_Calloc(ctx->ndim, sizeof(int64_t));
@@ -55,23 +52,19 @@ TfDlManagedTensorCtx *TfDlManagedTensorCtx_Create(TFE_TensorHandle *h, TF_Status
     return ctx;
 }
 
-void TfDlManagedTensorCtx_Free(TfDlManagedTensorCtx *ctx) {
+void TFDLManagedTensorCtx_Free(TFDLManagedTensorCtx *ctx) {
     RedisModule_Free(ctx->shape);
     RedisModule_Free(ctx->strides);
     RedisModule_Free(ctx);
 }
 
-// Deleter for DLManagedTensor
 void DLManagedTensorDeleter(DLManagedTensor *arg) {
-    TfDlManagedTensorCtx *owner = (TfDlManagedTensorCtx *)(arg->manager_ctx);
-
-    // TODO: check if we need to deleted the actual tensor as well
+    TFDLManagedTensorCtx *owner = (TFDLManagedTensorCtx *)(arg->manager_ctx);
     TFE_DeleteTensorHandle(owner->reference);
-    TfDlManagedTensorCtx_Free(owner);
+    TFDLManagedTensorCtx_Free(owner);
 }
 
-// Converts TF_DATAType to DLPack data type.
-DLDataType GetDlDataType(TF_DataType data_type, TF_Status *status) {
+DLDataType GetDLDataType(TF_DataType data_type, TF_Status *status) {
     DLDataType dtype;
     dtype.lanes = 1;
     dtype.bits = TF_DataTypeSize(data_type) * 8;
@@ -104,8 +97,7 @@ DLDataType GetDlDataType(TF_DataType data_type, TF_Status *status) {
     return dtype;
 }
 
-// Gets DLPack's DLDevice from eager tensor handle.
-DLDevice GetDlDevice(TFE_TensorHandle *h, TF_Status *status) {
+DLDevice GetDLDevice(TFE_TensorHandle *h, TF_Status *status) {
     DLDevice device;
     const char *device_name = TFE_TensorHandleBackingDeviceName(h, status);
 
@@ -135,8 +127,7 @@ DLDevice GetDlDevice(TFE_TensorHandle *h, TF_Status *status) {
     return device;
 }
 
-// Converts DLContext to TF device name.
-int DeviceNameFromDlContext(const DLDevice *device, char device_name[64]) {
+int DeviceNameFromDLContext(const DLDevice *device, char device_name[64]) {
     switch (device->device_type) {
     case kDLCPU:
         strcpy(device_name, "CPU:0");
@@ -148,8 +139,7 @@ int DeviceNameFromDlContext(const DLDevice *device, char device_name[64]) {
     return 1;
 }
 
-// Converts DLPack data type to TF_DATATYPE.
-int TfDataTypeFromDlDataType(const DLDataType *dtype, TF_DataType *tf_dtype) {
+int TFDataTypeFromDLDataType(const DLDataType *dtype, TF_DataType *tf_dtype) {
     switch (dtype->code) {
     case kDLUInt:
         switch (dtype->bits) {
@@ -216,14 +206,10 @@ int TfDataTypeFromDlDataType(const DLDataType *dtype, TF_DataType *tf_dtype) {
     }
 }
 
-// Wraps the deleter function of DLManagedTensor to match the function signature
-// TFE_NewTensorHandleFromDeviceMemory.
 void DeallocatorWrapperFunc(void *data, size_t len, void *dlmt_vptr) {
     TFE_CallDLManagedTensorDeleter(dlmt_vptr);
 }
 
-// Checks whether the stride array matches the layout of compact, row-majored
-// data.
 bool IsValidStrideCompactRowMajorData(int64_t *shape_arr, int64_t *stride_arr, int ndim) {
     if (ndim >= 1 && stride_arr[ndim - 1] != 1) {
         return false;
@@ -244,7 +230,7 @@ void TFE_CallDLManagedTensorDeleter(void *dlm_ptr) {
 }
 
 void *TFE_HandleToDLPack(TFE_TensorHandle *h, TF_Status *status) {
-    DLDevice tf_dlm_device = GetDlDevice(h, status);
+    DLDevice tf_dlm_device = GetDLDevice(h, status);
     if (TF_GetCode(status) != TF_OK) {
         return NULL;
     }
@@ -256,12 +242,12 @@ void *TFE_HandleToDLPack(TFE_TensorHandle *h, TF_Status *status) {
 
     TF_DataType data_type = TFE_TensorHandleDataType(h);
 
-    DLDataType tf_dlm_type = GetDlDataType(data_type, status);
+    DLDataType tf_dlm_type = GetDLDataType(data_type, status);
     if (TF_GetCode(status) != TF_OK) {
         return NULL;
     }
 
-    TfDlManagedTensorCtx *tf_dlm_tensor_ctx = TfDlManagedTensorCtx_Create(h, status);
+    TFDLManagedTensorCtx *tf_dlm_tensor_ctx = TFDLManagedTensorCtx_Create(h, status);
 
     DLManagedTensor *dlm_tensor = &tf_dlm_tensor_ctx->tensor;
     dlm_tensor->manager_ctx = tf_dlm_tensor_ctx;
@@ -287,15 +273,15 @@ TFE_TensorHandle *TFE_HandleFromDLPack(void *dlm, TF_Status *status, TFE_Context
     DLManagedTensor *dlmt = (DLManagedTensor *)dlm;
     DLTensor *dl_tensor = &dlmt->dl_tensor;
     char device_name[64];
-    int ret = DeviceNameFromDlContext(&dl_tensor->device, device_name);
+    int ret = DeviceNameFromDLContext(&dl_tensor->device, device_name);
     if (ret != 0) {
-        // tensorflow::errors::InvalidArgument("Unsupported Device Type");
+        // TODO Unsupported device type
         return NULL;
     }
     TF_DataType dtype;
-    ret = TfDataTypeFromDlDataType(&dl_tensor->dtype, &dtype);
+    ret = TFDataTypeFromDLDataType(&dl_tensor->dtype, &dtype);
     if (ret != 0) {
-        // status->status = std::move(s);
+        // TODO Unsupported data type
         return NULL;
     }
     int num_dims = dl_tensor->ndim;
@@ -421,8 +407,7 @@ RAI_Model *RAI_ModelCreateTF(RAI_Backend backend, const char *devicestr, RAI_Mod
     uint8_t config[4] = {0x32, 0x02, 0x20, 0x01};
     TFE_ContextOptionsSetConfig(context_opts, (void *)config, 4, status);
 
-    // TFE_ContextOptionsSetConfig(context_opts, proto, proto_len, status);
-    // TFE_ContextOptionsSetAsync(context_opts, 0);
+    TFE_ContextOptionsSetAsync(context_opts, 0);
     TFE_ContextOptionsSetDevicePlacementPolicy(context_opts, TFE_DEVICE_PLACEMENT_EXPLICIT);
 
     TFE_Context *context = TFE_NewContext(context_opts, status);
@@ -605,6 +590,8 @@ int RAI_ModelRunTF(RAI_ModelRunCtx **mctxs, RAI_Error *error) {
     const size_t noutputs = array_len(mctxs[0]->outputs);
     TFE_TensorHandle *inputTensorsHandles[ninputs];
     TFE_TensorHandle *outputTensorsHandles[noutputs];
+    TFE_TensorHandle *deviceInputTensorsHandles[ninputs];
+    TFE_TensorHandle *deviceOutputTensorsHandles[noutputs];
 
     size_t batch_sizes[nbatches];
     size_t batch_offsets[nbatches];
@@ -655,7 +642,7 @@ int RAI_ModelRunTF(RAI_ModelRunCtx **mctxs, RAI_Error *error) {
             return 1;
         }
 
-        inputTensorsHandles[i] = TFE_TensorHandleCopyToDevice(
+        deviceInputTensorsHandles[i] = TFE_TensorHandleCopyToDevice(
             inputTensorsHandles[i], mctxs[0]->model->session, tf_devicestr, status);
 
         if (TF_GetCode(status) != TF_OK) {
@@ -676,7 +663,7 @@ int RAI_ModelRunTF(RAI_ModelRunCtx **mctxs, RAI_Error *error) {
         return 1;
     }
 
-    TFE_OpAddInputList(fn_op, inputTensorsHandles, ninputs, status);
+    TFE_OpAddInputList(fn_op, deviceInputTensorsHandles, ninputs, status);
     if (TF_GetCode(status) != TF_OK) {
         char *errorMessage = RedisModule_Strdup(TF_Message(status));
         RAI_SetError(error, RAI_EMODELRUN, errorMessage);
@@ -686,7 +673,7 @@ int RAI_ModelRunTF(RAI_ModelRunCtx **mctxs, RAI_Error *error) {
     }
 
     int noutputs_ = noutputs;
-    TFE_Execute(fn_op, outputTensorsHandles, &noutputs_, status);
+    TFE_Execute(fn_op, deviceOutputTensorsHandles, &noutputs_, status);
     if (TF_GetCode(status) != TF_OK) {
         char *errorMessage = RedisModule_Strdup(TF_Message(status));
         RAI_SetError(error, RAI_EMODELRUN, errorMessage);
@@ -697,6 +684,7 @@ int RAI_ModelRunTF(RAI_ModelRunCtx **mctxs, RAI_Error *error) {
 
     for (size_t i = 0; i < ninputs; ++i) {
         TFE_DeleteTensorHandle(inputTensorsHandles[i]);
+        TFE_DeleteTensorHandle(deviceInputTensorsHandles[i]);
     }
 
     if (TF_GetCode(status) != TF_OK) {
@@ -709,9 +697,8 @@ int RAI_ModelRunTF(RAI_ModelRunCtx **mctxs, RAI_Error *error) {
 
     for (size_t i = 0; i < noutputs; ++i) {
         outputTensorsHandles[i] = TFE_TensorHandleCopyToDevice(
-            outputTensorsHandles[i], mctxs[0]->model->session, "/device:CPU:0", status);
+            deviceOutputTensorsHandles[i], mctxs[0]->model->session, "/device:CPU:0", status);
 
-        // TF_Tensor* outputTensor = TFE_TensorHandleResolve(outputTensorsHandles[i], status);
         RAI_Tensor *outputTensor =
             RAI_TensorCreateFromDLTensor(TFE_HandleToDLPack(outputTensorsHandles[i], status));
 
@@ -728,7 +715,7 @@ int RAI_ModelRunTF(RAI_ModelRunCtx **mctxs, RAI_Error *error) {
                 continue;
             }
             if (RAI_TensorDim(outputTensor, 0) != total_batch_size) {
-                // TF_DeleteTensor(outputTensor);
+                RAI_TensorFree(outputTensor);
                 TF_DeleteStatus(status);
                 RAI_SetError(error, RAI_EMODELRUN,
                              "ERR Model did not generate the expected batch size");
@@ -743,7 +730,7 @@ int RAI_ModelRunTF(RAI_ModelRunCtx **mctxs, RAI_Error *error) {
             mctxs[0]->outputs[i].tensor = RAI_TensorGetShallowCopy(outputTensor);
         }
         RAI_TensorFree(outputTensor);
-        TFE_DeleteTensorHandle(outputTensorsHandles[i]);
+        TFE_DeleteTensorHandle(deviceOutputTensorsHandles[i]);
     }
 
     TF_DeleteStatus(status);
diff --git a/tests/flow/tests_dag.py b/tests/flow/tests_dag.py
@@ -814,12 +814,15 @@ def test_dag_modelrun_financialNet_no_writes(env):
         env)
     model_name = 'financialNet{{hhh}}'
 
-    ret = con.execute_command('AI.MODELSET', model_name, 'TF', "CPU",
+    ret = con.execute_command('AI.MODELSET', model_name, 'TF', "GPU",
                               'INPUTS', 'transaction', 'reference', 'OUTPUTS', 'output', 'BLOB', model_pb)
     env.assertEqual(ret, b'OK')
 
-    for tensor_number in range(1,MAX_TRANSACTIONS):
-        for repetition in range(1,10):
+    MAX_TRANSACTIONS = 2
+
+    for tensor_number in range(1, MAX_TRANSACTIONS):
+        # for repetition in range(1, 10):
+        for repetition in range(1, 2):
             reference_tensor = creditcard_referencedata[tensor_number]
             transaction_tensor = creditcard_transactions[tensor_number]
             result_tensor_keyname = 'resultTensor{{hhh}}{}'.format(tensor_number)
@@ -833,20 +836,24 @@ def test_dag_modelrun_financialNet_no_writes(env):
             ret = con.execute_command("EXISTS {}".format(reference_tensor_keyname))
             env.assertEqual(ret, 1)
 
+            # print(reference_tensor)
+            print(transaction_tensor)
+
             ret = con.execute_command(
                 'AI.DAGRUN', 'LOAD', '1', reference_tensor_keyname, '|>',
                 'AI.TENSORSET', transaction_tensor_keyname, 'FLOAT', 1, 30,'BLOB', transaction_tensor.tobytes(), '|>',
                 'AI.MODELRUN', model_name, 
                             'INPUTS', transaction_tensor_keyname, reference_tensor_keyname,
                             'OUTPUTS', result_tensor_keyname, '|>',
-                'AI.TENSORGET',result_tensor_keyname, 'META',  '|>',
+                'AI.TENSORGET', result_tensor_keyname, 'META',  '|>',
                 'AI.TENSORGET', result_tensor_keyname, 'VALUES'
             )
             env.assertEqual(4, len(ret))
             env.assertEqual([b'OK', b'OK'], ret[:2])
             env.assertEqual([b'dtype', b'FLOAT', b'shape', [1, 2]], ret[2])
             values = ret[3]
             # Assert that resulting classification is within [0,1]
+            print(values)
             env.assertEqual(True, 0 <= float(values[0]) <= 1)
             env.assertEqual(True, 0 <= float(values[1]) <= 1)