Skip to content

Commit

Permalink
Non linear blend Tensor impl - HIP (#131)
Browse files Browse the repository at this point in the history
* Add gpu tensor support for non linear blend

* Add relevant unit tests

* Add relevant perf tests

* Add check on div

* Minor change
  • Loading branch information
r-abishek authored Dec 16, 2022
1 parent 707dffe commit 05b67b5
Show file tree
Hide file tree
Showing 10 changed files with 716 additions and 6 deletions.
3 changes: 3 additions & 0 deletions include/rppt_tensor_effects_augmentations.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ RppStatus rppt_gaussian_noise_gpu(RppPtr_t srcPtr, RpptDescPtr srcDescPtr, RppPt
// *retval RPP_ERROR : Error

RppStatus rppt_non_linear_blend_host(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *stdDevTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#ifdef GPU_SUPPORT
RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1, RppPtr_t srcPtr2, RpptDescPtr srcDescPtr, RppPtr_t dstPtr, RpptDescPtr dstDescPtr, Rpp32f *stdDevTensor, RpptROIPtr roiTensorPtrSrc, RpptRoiType roiType, rppHandle_t rppHandle);
#endif // GPU_SUPPORT

#ifdef __cplusplus
}
Expand Down
1 change: 1 addition & 0 deletions src/modules/hip/hip_tensor_effects_augmentations.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,6 @@ THE SOFTWARE.
#include "kernel/noise_salt_and_pepper.hpp"
#include "kernel/noise_shot.hpp"
#include "kernel/noise_gaussian.hpp"
#include "kernel/non_linear_blend.hpp"

#endif // HIP_TENSOR_EFFECTS_AUGMENTATIONS_HPP
286 changes: 286 additions & 0 deletions src/modules/hip/kernel/non_linear_blend.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
#include <hip/hip_runtime.h>
#include "rpp_hip_common.hpp"

__device__ void non_linear_blend_gaussian_hip_compute(float &multiplier, int2 &halfDimsWH_i2, int2 &idXY_i2, d_float8 *gaussianValue_f8)
{
float rowLocComponent;
rowLocComponent = idXY_i2.y - halfDimsWH_i2.y;
rowLocComponent *= (rowLocComponent * multiplier);

float4 rowLocComponent_f4 = (float4)rowLocComponent;
float4 multiplier_f4 = (float4)multiplier;

d_float8 colLocComponent_f8;
colLocComponent_f8.f4[0] = make_float4(idXY_i2.x, idXY_i2.x + 1, idXY_i2.x + 2, idXY_i2.x + 3);
colLocComponent_f8.f4[1] = colLocComponent_f8.f4[0] + (float4)4;
colLocComponent_f8.f4[0] -= (float4)halfDimsWH_i2.x;
colLocComponent_f8.f4[1] -= (float4)halfDimsWH_i2.x;
colLocComponent_f8.f4[0] = (colLocComponent_f8.f4[0] * colLocComponent_f8.f4[0] * multiplier_f4) + rowLocComponent_f4;
colLocComponent_f8.f4[1] = (colLocComponent_f8.f4[1] * colLocComponent_f8.f4[1] * multiplier_f4) + rowLocComponent_f4;

gaussianValue_f8->f4[0] = make_float4(expf(colLocComponent_f8.f4[0].x), expf(colLocComponent_f8.f4[0].y), expf(colLocComponent_f8.f4[0].z), expf(colLocComponent_f8.f4[0].w));
gaussianValue_f8->f4[1] = make_float4(expf(colLocComponent_f8.f4[1].x), expf(colLocComponent_f8.f4[1].y), expf(colLocComponent_f8.f4[1].z), expf(colLocComponent_f8.f4[1].w));
}

__device__ void non_linear_blend_8_hip_compute(d_float8 *src1_f8, d_float8 *src2_f8, d_float8 *dst_f8, d_float8 *gaussianValue_f8)
{
dst_f8->f4[0] = (src1_f8->f4[0] - src2_f8->f4[0]) * gaussianValue_f8->f4[0] + src2_f8->f4[0];
dst_f8->f4[1] = (src1_f8->f4[1] - src2_f8->f4[1]) * gaussianValue_f8->f4[1] + src2_f8->f4[1];
}

__device__ void non_linear_blend_24_hip_compute(d_float24 *src1_f24, d_float24 *src2_f24, d_float24 *dst_f24, d_float8 *gaussianValue_f8)
{
non_linear_blend_8_hip_compute(&(src1_f24->f8[0]), &(src2_f24->f8[0]), &(dst_f24->f8[0]), gaussianValue_f8);
non_linear_blend_8_hip_compute(&(src1_f24->f8[1]), &(src2_f24->f8[1]), &(dst_f24->f8[1]), gaussianValue_f8);
non_linear_blend_8_hip_compute(&(src1_f24->f8[2]), &(src2_f24->f8[2]), &(dst_f24->f8[2]), gaussianValue_f8);
}

template <typename T>
__global__ void non_linear_blend_pkd_tensor(T *srcPtr1,
T *srcPtr2,
uint2 srcStridesNH,
T *dstPtr,
uint2 dstStridesNH,
float *stdDev,
RpptROIPtr roiTensorPtrSrc)
{
int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
{
return;
}

uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;

float multiplier = -0.5f / (stdDev[id_z] * stdDev[id_z]);
int2 halfDimsWH_i2 = make_int2(roiTensorPtrSrc[id_z].xywhROI.roiWidth >> 1, roiTensorPtrSrc[id_z].xywhROI.roiHeight >> 1);
int2 idXY_i2 = make_int2(id_x, id_y);

d_float24 src1_f24, src2_f24, dst_f24;
d_float8 gaussianValue_f8;
non_linear_blend_gaussian_hip_compute(multiplier, halfDimsWH_i2, idXY_i2, &gaussianValue_f8);

rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24);
rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24);
non_linear_blend_24_hip_compute(&src1_f24, &src2_f24, &dst_f24, &gaussianValue_f8);
rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
}

template <typename T>
__global__ void non_linear_blend_pln_tensor(T *srcPtr1,
T *srcPtr2,
uint3 srcStridesNCH,
T *dstPtr,
uint3 dstStridesNCH,
int channelsDst,
float *stdDev,
RpptROIPtr roiTensorPtrSrc)
{
int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
{
return;
}

uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;

float multiplier = -0.5f / (stdDev[id_z] * stdDev[id_z]);
int2 halfDimsWH_i2 = make_int2(roiTensorPtrSrc[id_z].xywhROI.roiWidth >> 1, roiTensorPtrSrc[id_z].xywhROI.roiHeight >> 1);
int2 idXY_i2 = make_int2(id_x, id_y);

d_float8 src1_f8, src2_f8, dst_f8, gaussianValue_f8;
non_linear_blend_gaussian_hip_compute(multiplier, halfDimsWH_i2, idXY_i2, &gaussianValue_f8);

rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
non_linear_blend_8_hip_compute(&src1_f8, &src2_f8, &dst_f8, &gaussianValue_f8);
rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);

if (channelsDst == 3)
{
srcIdx += srcStridesNCH.y;
dstIdx += dstStridesNCH.y;

rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
non_linear_blend_8_hip_compute(&src1_f8, &src2_f8, &dst_f8, &gaussianValue_f8);
rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);

srcIdx += srcStridesNCH.y;
dstIdx += dstStridesNCH.y;

rpp_hip_load8_and_unpack_to_float8(srcPtr1 + srcIdx, &src1_f8);
rpp_hip_load8_and_unpack_to_float8(srcPtr2 + srcIdx, &src2_f8);
non_linear_blend_8_hip_compute(&src1_f8, &src2_f8, &dst_f8, &gaussianValue_f8);
rpp_hip_pack_float8_and_store8(dstPtr + dstIdx, &dst_f8);
}
}

template <typename T>
__global__ void non_linear_blend_pkd3_pln3_tensor(T *srcPtr1,
T *srcPtr2,
uint2 srcStridesNH,
T *dstPtr,
uint3 dstStridesNCH,
float *stdDev,
RpptROIPtr roiTensorPtrSrc)
{
int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
{
return;
}

uint srcIdx = (id_z * srcStridesNH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNH.y) + ((id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x) * 3);
uint dstIdx = (id_z * dstStridesNCH.x) + (id_y * dstStridesNCH.z) + id_x;

float multiplier = -0.5f / (stdDev[id_z] * stdDev[id_z]);
int2 halfDimsWH_i2 = make_int2(roiTensorPtrSrc[id_z].xywhROI.roiWidth >> 1, roiTensorPtrSrc[id_z].xywhROI.roiHeight >> 1);
int2 idXY_i2 = make_int2(id_x, id_y);

d_float24 src1_f24, src2_f24, dst_f24;
d_float8 gaussianValue_f8;
non_linear_blend_gaussian_hip_compute(multiplier, halfDimsWH_i2, idXY_i2, &gaussianValue_f8);

rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, &src1_f24);
rpp_hip_load24_pkd3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, &src2_f24);
non_linear_blend_24_hip_compute(&src1_f24, &src2_f24, &dst_f24, &gaussianValue_f8);
rpp_hip_pack_float24_pln3_and_store24_pln3(dstPtr + dstIdx, dstStridesNCH.y, &dst_f24);
}

template <typename T>
__global__ void non_linear_blend_pln3_pkd3_tensor(T *srcPtr1,
T *srcPtr2,
uint3 srcStridesNCH,
T *dstPtr,
uint2 dstStridesNH,
float *stdDev,
RpptROIPtr roiTensorPtrSrc)
{
int id_x = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x) * 8;
int id_y = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
int id_z = hipBlockIdx_z * hipBlockDim_z + hipThreadIdx_z;

if ((id_y >= roiTensorPtrSrc[id_z].xywhROI.roiHeight) || (id_x >= roiTensorPtrSrc[id_z].xywhROI.roiWidth))
{
return;
}

uint srcIdx = (id_z * srcStridesNCH.x) + ((id_y + roiTensorPtrSrc[id_z].xywhROI.xy.y) * srcStridesNCH.z) + (id_x + roiTensorPtrSrc[id_z].xywhROI.xy.x);
uint dstIdx = (id_z * dstStridesNH.x) + (id_y * dstStridesNH.y) + id_x * 3;

float multiplier = -0.5f / (stdDev[id_z] * stdDev[id_z]);
int2 halfDimsWH_i2 = make_int2(roiTensorPtrSrc[id_z].xywhROI.roiWidth >> 1, roiTensorPtrSrc[id_z].xywhROI.roiHeight >> 1);
int2 idXY_i2 = make_int2(id_x, id_y);

d_float24 src1_f24, src2_f24, dst_f24;
d_float8 gaussianValue_f8;
non_linear_blend_gaussian_hip_compute(multiplier, halfDimsWH_i2, idXY_i2, &gaussianValue_f8);

rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr1 + srcIdx, srcStridesNCH.y, &src1_f24);
rpp_hip_load24_pln3_and_unpack_to_float24_pln3(srcPtr2 + srcIdx, srcStridesNCH.y, &src2_f24);
non_linear_blend_24_hip_compute(&src1_f24, &src2_f24, &dst_f24, &gaussianValue_f8);
rpp_hip_pack_float24_pln3_and_store24_pkd3(dstPtr + dstIdx, &dst_f24);
}

template <typename T>
RppStatus hip_exec_non_linear_blend_tensor(T *srcPtr1,
T *srcPtr2,
RpptDescPtr srcDescPtr,
T *dstPtr,
RpptDescPtr dstDescPtr,
RpptROIPtr roiTensorPtrSrc,
RpptRoiType roiType,
rpp::Handle& handle)
{
if (roiType == RpptRoiType::LTRB)
hip_exec_roi_converison_ltrb_to_xywh(roiTensorPtrSrc, handle);

int localThreads_x = LOCAL_THREADS_X;
int localThreads_y = LOCAL_THREADS_Y;
int localThreads_z = LOCAL_THREADS_Z;
int globalThreads_x = (dstDescPtr->strides.hStride + 7) >> 3;
int globalThreads_y = dstDescPtr->h;
int globalThreads_z = handle.GetBatchSize();

if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NHWC))
{
globalThreads_x = (dstDescPtr->strides.hStride / 3 + 7) >> 3;

hipLaunchKernelGGL(non_linear_blend_pkd_tensor,
dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
dim3(localThreads_x, localThreads_y, localThreads_z),
0,
handle.GetStream(),
srcPtr1,
srcPtr2,
make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
dstPtr,
make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
roiTensorPtrSrc);
}
else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NCHW))
{
hipLaunchKernelGGL(non_linear_blend_pln_tensor,
dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
dim3(localThreads_x, localThreads_y, localThreads_z),
0,
handle.GetStream(),
srcPtr1,
srcPtr2,
make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
dstPtr,
make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
dstDescPtr->c,
handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
roiTensorPtrSrc);
}
else if ((srcDescPtr->c == 3) && (dstDescPtr->c == 3))
{
if ((srcDescPtr->layout == RpptLayout::NHWC) && (dstDescPtr->layout == RpptLayout::NCHW))
{
hipLaunchKernelGGL(non_linear_blend_pkd3_pln3_tensor,
dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
dim3(localThreads_x, localThreads_y, localThreads_z),
0,
handle.GetStream(),
srcPtr1,
srcPtr2,
make_uint2(srcDescPtr->strides.nStride, srcDescPtr->strides.hStride),
dstPtr,
make_uint3(dstDescPtr->strides.nStride, dstDescPtr->strides.cStride, dstDescPtr->strides.hStride),
handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
roiTensorPtrSrc);
}
else if ((srcDescPtr->layout == RpptLayout::NCHW) && (dstDescPtr->layout == RpptLayout::NHWC))
{
globalThreads_x = (srcDescPtr->strides.hStride + 7) >> 3;
hipLaunchKernelGGL(non_linear_blend_pln3_pkd3_tensor,
dim3(ceil((float)globalThreads_x/localThreads_x), ceil((float)globalThreads_y/localThreads_y), ceil((float)globalThreads_z/localThreads_z)),
dim3(localThreads_x, localThreads_y, localThreads_z),
0,
handle.GetStream(),
srcPtr1,
srcPtr2,
make_uint3(srcDescPtr->strides.nStride, srcDescPtr->strides.cStride, srcDescPtr->strides.hStride),
dstPtr,
make_uint2(dstDescPtr->strides.nStride, dstDescPtr->strides.hStride),
handle.GetInitHandle()->mem.mgpu.floatArr[0].floatmem,
roiTensorPtrSrc);
}
}

return RPP_SUCCESS;
}
70 changes: 70 additions & 0 deletions src/modules/rppt_tensor_effects_augmentations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -919,4 +919,74 @@ RppStatus rppt_gaussian_noise_gpu(RppPtr_t srcPtr,
#endif // backend
}

/******************** non_linear_blend ********************/

RppStatus rppt_non_linear_blend_gpu(RppPtr_t srcPtr1,
RppPtr_t srcPtr2,
RpptDescPtr srcDescPtr,
RppPtr_t dstPtr,
RpptDescPtr dstDescPtr,
Rpp32f *stdDevTensor,
RpptROIPtr roiTensorPtrSrc,
RpptRoiType roiType,
rppHandle_t rppHandle)
{
#ifdef HIP_COMPILE
for(int i = 0; i < srcDescPtr->n; i++)
if (stdDevTensor[i] == 0)
return RPP_ERROR_ZERO_DIVISION;
Rpp32u paramIndex = 0;
copy_param_float(stdDevTensor, rpp::deref(rppHandle), paramIndex++);

if ((srcDescPtr->dataType == RpptDataType::U8) && (dstDescPtr->dataType == RpptDataType::U8))
{
hip_exec_non_linear_blend_tensor(static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes,
static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes,
srcDescPtr,
static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes,
dstDescPtr,
roiTensorPtrSrc,
roiType,
rpp::deref(rppHandle));
}
else if ((srcDescPtr->dataType == RpptDataType::F16) && (dstDescPtr->dataType == RpptDataType::F16))
{
hip_exec_non_linear_blend_tensor((half*) (static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
(half*) (static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
srcDescPtr,
(half*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
dstDescPtr,
roiTensorPtrSrc,
roiType,
rpp::deref(rppHandle));
}
else if ((srcDescPtr->dataType == RpptDataType::F32) && (dstDescPtr->dataType == RpptDataType::F32))
{
hip_exec_non_linear_blend_tensor((Rpp32f*) (static_cast<Rpp8u*>(srcPtr1) + srcDescPtr->offsetInBytes),
(Rpp32f*) (static_cast<Rpp8u*>(srcPtr2) + srcDescPtr->offsetInBytes),
srcDescPtr,
(Rpp32f*) (static_cast<Rpp8u*>(dstPtr) + dstDescPtr->offsetInBytes),
dstDescPtr,
roiTensorPtrSrc,
roiType,
rpp::deref(rppHandle));
}
else if ((srcDescPtr->dataType == RpptDataType::I8) && (dstDescPtr->dataType == RpptDataType::I8))
{
hip_exec_non_linear_blend_tensor(static_cast<Rpp8s*>(srcPtr1) + srcDescPtr->offsetInBytes,
static_cast<Rpp8s*>(srcPtr2) + srcDescPtr->offsetInBytes,
srcDescPtr,
static_cast<Rpp8s*>(dstPtr) + dstDescPtr->offsetInBytes,
dstDescPtr,
roiTensorPtrSrc,
roiType,
rpp::deref(rppHandle));
}

return RPP_SUCCESS;
#elif defined(OCL_COMPILE)
return RPP_ERROR_NOT_IMPLEMENTED;
#endif // backend
}

#endif // GPU_SUPPORT
Loading

0 comments on commit 05b67b5

Please sign in to comment.