Log1p #377 (Open)

wants to merge 25 commits into base: develop

Changes from 18 of 25 commits

Commits:
d834306  Initial log1p implementation in C++ (ManasaDattaT, Dec 18, 2024)
0cfe759  Added for nDim = 4 separately instead of recursive loop in log1p (ManasaDattaT, Dec 18, 2024)
80184d2  Test by converting existing input F32 to I16 (ManasaDattaT, Dec 18, 2024)
ddd7cb4  log1p_HIP_Implementation (Dec 23, 2024)
1f6a0ce  added abs in AVX2 (ManasaDattaT, Dec 24, 2024)
3f58b8c  HIP calls (Dec 26, 2024)
6edb178  log1p HOST (ManasaDattaT, Dec 27, 2024)
2967bb7  log1p HOST (ManasaDattaT, Dec 27, 2024)
5513f2d  calls in HIP backend (ManasaDattaT, Dec 27, 2024)
eef3ae2  # (ManasaDattaT, Dec 27, 2024)
be135aa  Add files via upload (ManasaDattaT, Dec 27, 2024)
8f31d18  reference output files for log1p (ManasaDattaT, Dec 30, 2024)
aafae05  Merge branch 'log1p_HOST' of https://github.com/ManasaDattaT/rpp into… (snehaa8, Dec 30, 2024)
0117c34  log1p HOST implementation (ManasaDattaT, Dec 30, 2024)
2eb2039  log1p HOST implementation (ManasaDattaT, Dec 30, 2024)
c65051e  Merge branch 'log1p_HOST' into log1p_HOST_HIP (ManasaDattaT, Dec 30, 2024)
ae0fd19  merge conflicts resolved (ManasaDattaT, Dec 30, 2024)
b82517f  removed print statements (snehaa8, Dec 31, 2024)
d6beaa4  Worked on the review comment (ManasaDattaT, Jan 2, 2025)
d78c287  Worked on the review comment (ManasaDattaT, Jan 2, 2025)
3038153  Update rpp_hip_common.hpp (ManasaDattaT, Jan 3, 2025)
835a8ad  Minor changes after review (ManasaDattaT, Jan 3, 2025)
8872db6  Resolved merge conflicts (ManasaDattaT, Feb 17, 2025)
578a967  Reverted the testsuite changes, which were added in support for I16 (ManasaDattaT, Feb 17, 2025)
2173693  removed the testsuite support (ManasaDattaT, Feb 17, 2025)
include/rppdefs.h (3 changes: 2 additions & 1 deletion)

@@ -389,7 +389,8 @@ typedef enum
U8,
F32,
F16,
I8
I8,
I16
} RpptDataType;

/*! \brief RPPT Tensor layout type enum
include/rppt_tensor_arithmetic_operations.h (37 changes: 37 additions & 0 deletions)

@@ -291,6 +291,43 @@ RppStatus rppt_log_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, R
*/
RppStatus rppt_log_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
#ifdef GPU_SUPPORT
/*! \brief Log1p operation on HIP backend
* \details Computes natural log of (1 + input) for a given ND Tensor.
* Supports the i16 -> f32 datatype conversion.
* Uses the absolute value of the input for the log computation.
* \param [in] srcPtr source tensor in HIP memory
* \param [in] srcGenericDescPtr source tensor descriptor
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstGenericDescPtr destination tensor descriptor
* \param [in] roiTensor ROI values for each tensor in the batch (nDim begin values followed by nDim length values)
* \param [in] rppHandle RPP HIP handle created with <tt>\ref rppCreateWithStreamAndBatchSize()</tt>
* \return A <tt> \ref RppStatus</tt> enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_log1p_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle);
#endif // GPU_SUPPORT


RppStatus rppt_log1p_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle);
#ifdef GPU_SUPPORT
/*! \brief Log1p operation on HIP backend
* \details Computes natural log of (1 + input) for a given ND Tensor.
* Supports the i16 -> f32 datatype conversion.
* Uses the absolute value of the input for the log computation.
* \param [in] srcPtr source tensor in HIP memory
* \param [in] srcGenericDescPtr source tensor descriptor
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstGenericDescPtr destination tensor descriptor
* \param [in] roiTensor ROI values for each tensor in the batch (nDim begin values followed by nDim length values)
* \param [in] rppHandle RPP HIP handle created with <tt>\ref rppCreateWithStreamAndBatchSize()</tt>
* \return A <tt> \ref RppStatus</tt> enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_log1p_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle);

Review comment: please check the gpu declaration, it is repeated twice

Author reply: Done

#endif // GPU_SUPPORT

/*! @}
*/
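For reference, a minimal host-side usage sketch of the new API (illustration only, not part of the PR). The descriptor fields (numDims, dims, strides, dataType) follow how the new host kernel reads them; the rppCreateWithBatchSize/rppDestroyHost handle calls are an assumption about the public RPP host API.

// Hypothetical usage sketch for rppt_log1p_host (illustration only)
#include <cmath>
#include <vector>
#include "rpp.h"

int main()
{
    const Rpp32u batchSize = 1, nDim = 1, len = 8;

    RpptGenericDesc srcDesc = {}, dstDesc = {};
    srcDesc.numDims = nDim + 1;        // batch dimension + 1 tensor dimension
    srcDesc.dims[0] = batchSize;
    srcDesc.dims[1] = len;
    srcDesc.strides[0] = len;          // stride between batch elements
    srcDesc.strides[1] = 1;
    srcDesc.dataType = RpptDataType::I16;
    dstDesc = srcDesc;
    dstDesc.dataType = RpptDataType::F32;

    std::vector<Rpp16s> src = {-3, -2, -1, 0, 1, 2, 3, 4};
    std::vector<Rpp32f> dst(len);
    Rpp32u roi[2] = {0, len};          // per tensor: nDim begin values, then nDim lengths

    rppHandle_t handle;
    rppCreateWithBatchSize(&handle, batchSize, 1);   // assumption: host handle taking batch size and thread count
    rppt_log1p_host(src.data(), &srcDesc, dst.data(), &dstDesc, roi, handle);
    // expected: dst[i] == std::log1p(std::abs(src[i])), e.g. dst[3] == 0.0f
    rppDestroyHost(handle);
    return 0;
}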
src/include/cpu/rpp_cpu_simd.hpp (22 changes: 22 additions & 0 deletions)

@@ -105,6 +105,7 @@ const __m128i xmm_px3 = _mm_set1_epi32(3);
const __m128i xmm_px4 = _mm_set1_epi32(4);
const __m128i xmm_px5 = _mm_set1_epi32(5);
const __m128i xmm_pxConvertI8 = _mm_set1_epi8((char)128);
const __m128i xmm_pxConvertI16 = _mm_set1_epi16((short)32768);
const __m128 xmm_pDstLocInit = _mm_setr_ps(0, 1, 2, 3);

const __m256 avx_p0 = _mm256_set1_ps(0.0f);
@@ -148,6 +149,8 @@ const __m128i xmm_pxMask04To07 = _mm_setr_epi8(4, 0x80, 0x80, 0x80, 5, 0x80, 0x8
const __m128i xmm_pxMask08To11 = _mm_setr_epi8(8, 0x80, 0x80, 0x80, 9, 0x80, 0x80, 0x80, 10, 0x80, 0x80, 0x80, 11, 0x80, 0x80, 0x80);
const __m128i xmm_pxMask12To15 = _mm_setr_epi8(12, 0x80, 0x80, 0x80, 13, 0x80, 0x80, 0x80, 14, 0x80, 0x80, 0x80, 15, 0x80, 0x80, 0x80);


Review comment: please remove these extra spaces

Author reply: Done


const __m128i xmm_pxMask00To02 = _mm_setr_epi8(0, 0x80, 0x80, 0x80, 1, 0x80, 0x80, 0x80, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
const __m128i xmm_pxMask03To05 = _mm_setr_epi8(3, 0x80, 0x80, 0x80, 4, 0x80, 0x80, 0x80, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
const __m128i xmm_pxMask06To08 = _mm_setr_epi8(6, 0x80, 0x80, 0x80, 7, 0x80, 0x80, 0x80, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
@@ -1954,6 +1957,23 @@ inline void rpp_load16_i8_to_f32_avx(Rpp8s *srcPtr, __m256 *p)
p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px, xmm_pxMask08To11), _mm_shuffle_epi8(px, xmm_pxMask12To15))); /* Contains pixels 09-16 */
}

inline void rpp_load16_i16_to_f32_avx(Rpp16s *srcPtr, __m256 *p)
{
__m256i px = _mm256_loadu_si256((__m256i *)srcPtr);

// Extract the two 128-bit halves of px and sign-extend each 16-bit int to a 32-bit int
__m256i px0 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(px, 0));
__m256i px1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(px, 1));

// Take absolute values
__m256i abs_px0 = _mm256_abs_epi32(px0);
__m256i abs_px1 = _mm256_abs_epi32(px1);

// Convert 32-bit ints to 32-bit floats
p[0] = _mm256_cvtepi32_ps(abs_px0);
p[1] = _mm256_cvtepi32_ps(abs_px1);
}
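For verification, the intrinsic sequence above is equivalent to this scalar loop (illustration only, not part of the PR):

// Scalar equivalent of rpp_load16_i16_to_f32_avx (illustration only):
// 16 signed 16-bit values become 16 floats holding |x|
#include <cstdlib>
inline void rpp_load16_i16_to_f32_scalar(const Rpp16s *srcPtr, float *dst)
{
    for (int i = 0; i < 16; i++)
        dst[i] = (float)std::abs((int)srcPtr[i]);   // matches sign-extend + _mm256_abs_epi32 + _mm256_cvtepi32_ps
}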

inline void rpp_load24_i8_to_f32_avx(Rpp8s *srcPtr, __m256 *p)
{
__m128i px1, px2;
@@ -2693,6 +2713,8 @@ static inline __m256 log_ps(__m256 x)
__m256 one = *(__m256 *)&avx_p1;
__m256 invalid_mask = _mm256_cmp_ps(x, avx_p0, _CMP_LE_OQ);

// x = _mm256_add_ps(x, one);

Review comment: please remove the commented-out lines, if these are not necessary


// cut off denormalized stuff
x = _mm256_max_ps(x, *(__m256 *)&_ps_min_norm_pos_avx);

src/include/hip/rpp_hip_common.hpp (42 changes: 42 additions & 0 deletions)

@@ -464,6 +464,18 @@ __device__ __forceinline__ float rpp_hip_unpack3(int src)
{
return (float)(schar)((src >> 24) & 0xFF);
}
// Un-packing from I16s

Review comment: remove blank line at L468

Author reply: Done

__device__ __forceinline__ float rpp_hip_unpack0_(int src)
{
return (float)((short)(src & 0xFFFF));
}

__device__ __forceinline__ float rpp_hip_unpack2_(int src)
{
return (float)((short)((src >> 16) & 0xFFFF));
}
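// Illustration (not part of the PR): each 32-bit int packs two I16 values;
// rpp_hip_unpack0_ extracts the low half, rpp_hip_unpack2_ the high half.
// For src = 0xFFFE0003 (high I16 = -2, low I16 = 3):
//   rpp_hip_unpack0_(src) == (float)(short)0x0003 ==  3.0f
//   rpp_hip_unpack2_(src) == (float)(short)0xFFFE == -2.0f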
__device__ __forceinline__ float4 rpp_hip_unpack_from_i8(int src)
{
@@ -560,6 +572,25 @@ __device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(schar *srcPtr
srcPtr_f8->f4[1] = rpp_hip_unpack_from_i8(src_i2.y); // write 04-07
}

// I16 loads without layout toggle (8 I16 pixels)
__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(short *srcPtr, d_float8 *srcPtr_f8)
{
int4 src_i4 = *(int4 *)srcPtr;
srcPtr_f8->f4[0] = make_float4( rpp_hip_unpack0_(src_i4.x),
rpp_hip_unpack2_(src_i4.x),
rpp_hip_unpack0_(src_i4.y),
rpp_hip_unpack2_(src_i4.y)
);

srcPtr_f8->f4[1] = make_float4( rpp_hip_unpack0_(src_i4.z),
rpp_hip_unpack2_(src_i4.z),
rpp_hip_unpack0_(src_i4.w),
rpp_hip_unpack2_(src_i4.w)
);
}



Review comment: just keep one blank line between functions, remove the extra lines

__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8_mirror(schar *srcPtr, d_float8 *srcPtr_f8)
{
int2 src_i2 = *(int2 *)srcPtr;
@@ -1902,6 +1933,17 @@ __device__ __forceinline__ void rpp_hip_math_log(d_float8 *src_f8, d_float8 *dst
dst_f8->f1[7] = __logf(src_f8->f1[7]);
}

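// Note (editorial, assumption): like the host path in log1p.hpp below, this helper applies a
// plain elementwise log; the +1 of log1p is presumably added to src_f8 by the caller before
// this call, mirroring how the host kernel adds 1.0f to the loaded values before compute_log_16_host.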
__device__ __forceinline__ void rpp_hip_math_log1p(d_float8 *src_f8, d_float8 *dst_f8)
{
dst_f8->f1[0] = __logf(src_f8->f1[0]);
dst_f8->f1[1] = __logf(src_f8->f1[1]);
dst_f8->f1[2] = __logf(src_f8->f1[2]);
dst_f8->f1[3] = __logf(src_f8->f1[3]);
dst_f8->f1[4] = __logf(src_f8->f1[4]);
dst_f8->f1[5] = __logf(src_f8->f1[5]);
dst_f8->f1[6] = __logf(src_f8->f1[6]);
dst_f8->f1[7] = __logf(src_f8->f1[7]);
}
// /******************** DEVICE RANDOMIZATION HELPER FUNCTIONS ********************/

template<typename T>
src/modules/cpu/host_tensor_arithmetic_operations.hpp (1 change: 1 addition & 0 deletions)

@@ -31,5 +31,6 @@ SOFTWARE.
#include "kernel/multiply_scalar.hpp"
#include "kernel/magnitude.hpp"
#include "kernel/log.hpp"
#include "kernel/log1p.hpp"

#endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP
src/modules/cpu/kernel/log1p.hpp (new file: 225 additions)

@@ -0,0 +1,225 @@
/*
MIT License

Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "rppdefs.h"
#include "rpp_cpu_common.hpp"

// 1-pixel log1p helper function
// Negative values are made positive by taking the absolute value of the input
inline void compute_log1p(Rpp16s *src, Rpp32f *dst) {

Review comment: please move { to next line

*dst = std::log1p(std::abs(*src));
}

// Computes ND log1p recursively
template<typename T1, typename T2>
void log1p_recursive(T1 *src, Rpp32u *srcStrides, T2 *dst, Rpp32u *dstStrides, Rpp32u *dstShape, Rpp32u nDim)
{
if (!nDim)
compute_log1p(src, dst);
else
{
for (int i = 0; i < *dstShape; i++)
{
log1p_recursive(src, srcStrides + 1, dst, dstStrides + 1, dstShape + 1, nDim - 1);
dst += *dstStrides;
src += *srcStrides;
}
}
}
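To make the stride walk concrete, a minimal sketch (illustration only, not part of the PR); with these arguments, element (i, j) is read at offset i*strides[0] + j*strides[1]:

// Illustration only: apply log1p_recursive to a contiguous 2x3 I16 tensor
Rpp16s src[6] = {0, 1, 2, 3, 4, 5};
Rpp32f dst[6];
Rpp32u shape[2]   = {2, 3};
Rpp32u strides[2] = {3, 1};    // row stride 3 elements, column stride 1
log1p_recursive(src, strides, dst, strides, shape, 2);
// result: dst[k] == std::log1p(std::abs(src[k])) for k = 0..5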

// log(1 + x), i.e. log1p(x), for I16 input and F32 output
RppStatus log1p_generic_host_tensor(Rpp16s *srcPtr,
RpptGenericDescPtr srcGenericDescPtr,
Rpp32f *dstPtr,
RpptGenericDescPtr dstGenericDescPtr,
Rpp32u *roiTensor,
rpp::Handle& handle)
{
Rpp32u numThreads = handle.GetNumThreads();
Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension.
Rpp32u batchSize = dstGenericDescPtr->dims[0];
const __m256 one_vec = _mm256_set1_ps(1.0f);

omp_set_dynamic(0);
#pragma omp parallel for num_threads(numThreads)
for(int batchCount = 0; batchCount < batchSize; batchCount++)
{
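// roiTensor layout per batch element: nDim begin values followed by nDim length values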
Rpp32u *roi = roiTensor + batchCount * nDim * 2;
Rpp32u *begin = roi;
Rpp32u *length = &roi[nDim];

Rpp16s *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0];
Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0];

for(int i = 0; i < nDim; i++)
srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1];
Rpp32u alignedLength;
Rpp32u vectorIncrement = 16;
if (nDim == 1)
{
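// length[0] & ~15 rounds down to a multiple of 16; the AVX2 loop below consumes 16 I16 values per iteration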
alignedLength = length[0] & ~15;
int vectorLoopCount = 0;
#if __AVX2__
for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
{
__m256 p[2];

rpp_simd_load(rpp_load16_i16_to_f32_avx, srcPtr1, p); // simd loads
p[0] = _mm256_add_ps(p[0], one_vec);
p[1] = _mm256_add_ps(p[1], one_vec);
compute_log_16_host(p); // log compute
rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores
srcPtr1 += vectorIncrement;
dstPtr1 += vectorIncrement;
}
#endif
for (; vectorLoopCount < length[0]; vectorLoopCount++)
{
compute_log1p(srcPtr1, dstPtr1);
srcPtr1++;
dstPtr1++;
}
}
else if(nDim == 2)
{
alignedLength = length[1] & ~15;
for(int i = 0; i < length[0]; i++)
{
Rpp16s *srcPtrTemp = srcPtr1;
Rpp32f *dstPtrTemp = dstPtr1;

int vectorLoopCount = 0;
#if __AVX2__
for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
{
__m256 p[2];

rpp_simd_load(rpp_load16_i16_to_f32_avx, srcPtrTemp, p); // simd loads (from the per-row pointer srcPtrTemp, matching the store below)
p[0] = _mm256_add_ps(p[0], one_vec);
p[1] = _mm256_add_ps(p[1], one_vec);
compute_log_16_host(p); // log compute
rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
srcPtrTemp += vectorIncrement;
dstPtrTemp += vectorIncrement;
}
#endif
for (; vectorLoopCount < length[1]; vectorLoopCount++)
{
compute_log1p(srcPtrTemp, dstPtrTemp);
srcPtrTemp++;
dstPtrTemp++;
}
srcPtr1 += srcGenericDescPtr->strides[1];
dstPtr1 += dstGenericDescPtr->strides[1];
}
}
else if(nDim == 3)
{
int combinedLength = length[0] * length[1];
alignedLength = combinedLength & ~15;
for(int i = 0; i < length[2]; i++)
{
Rpp16s *srcPtrTemp = srcPtr1;
Rpp32f *dstPtrTemp = dstPtr1;

int vectorLoopCount = 0;
#if __AVX2__
for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
{
__m256 p[2];

rpp_simd_load(rpp_load16_i16_to_f32_avx, srcPtrTemp, p); // simd loads
p[0] = _mm256_add_ps(p[0], one_vec);
p[1] = _mm256_add_ps(p[1], one_vec);
compute_log_16_host(p); // log compute
rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
srcPtrTemp += vectorIncrement;
dstPtrTemp += vectorIncrement;
}
#endif
for (; vectorLoopCount < combinedLength; vectorLoopCount++)
{
compute_log1p(srcPtrTemp, dstPtrTemp);
srcPtrTemp++;
dstPtrTemp++;
}

srcPtr1 += combinedLength;
dstPtr1 += combinedLength;
}
}
else if(nDim == 4)
{
int combinedLength = length[0] * length[1];
int combinedLength2 = length[0] * length[1] * length[2];
alignedLength = combinedLength & ~15;
for(int i = 0; i < length[3]; i++)
{
Rpp16s *srcPtrCol = srcPtr1;
Rpp32f *dstPtrCol = dstPtr1;
for(int j = 0; j < length[2]; j++)
{
Rpp16s *srcPtrTemp = srcPtrCol;
Rpp32f *dstPtrTemp = dstPtrCol;
int vectorLoopCount = 0;
#if __AVX2__
for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
{
__m256 p[2];

rpp_simd_load(rpp_load16_i16_to_f32_avx, srcPtrTemp, p); // simd loads
p[0] = _mm256_add_ps(p[0], one_vec);
p[1] = _mm256_add_ps(p[1], one_vec);
compute_log_16_host(p); // log compute
rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
srcPtrTemp += vectorIncrement;
dstPtrTemp += vectorIncrement;
}
#endif
for (; vectorLoopCount < combinedLength; vectorLoopCount++)
{
compute_log1p(srcPtrTemp, dstPtrTemp);
srcPtrTemp++;
dstPtrTemp++;
}

srcPtrCol += combinedLength;
dstPtrCol += combinedLength;
}
srcPtr1 += combinedLength2;
dstPtr1 += combinedLength2;
}

}
else
log1p_recursive(srcPtr1, srcGenericDescPtr->strides + 1, dstPtr1, dstGenericDescPtr->strides + 1, length, nDim); // strides[i + 1] corresponds to length[i], as in the begin-offset loop above
}

return RPP_SUCCESS;
}