Log1p #377 (Open)

wants to merge 25 commits into base: develop

Changes from 18 of 25 commits

Commits:
d834306  Initial log1p implementation in C++ (ManasaDattaT, Dec 18, 2024)
0cfe759  Added for nDim = 4 separately instead of recursive loop in log1p (ManasaDattaT, Dec 18, 2024)
80184d2  Test by converting existing input F32 to I16 (ManasaDattaT, Dec 18, 2024)
ddd7cb4  log1p_HIP_Implementation (Dec 23, 2024)
1f6a0ce  added abs in AVX2 (ManasaDattaT, Dec 24, 2024)
3f58b8c  HIP calls (Dec 26, 2024)
6edb178  log1p HOST (ManasaDattaT, Dec 27, 2024)
2967bb7  log1p HOST (ManasaDattaT, Dec 27, 2024)
5513f2d  calls in HIP backend (ManasaDattaT, Dec 27, 2024)
eef3ae2  # (ManasaDattaT, Dec 27, 2024)
be135aa  Add files via upload (ManasaDattaT, Dec 27, 2024)
8f31d18  reference output files for log1p (ManasaDattaT, Dec 30, 2024)
aafae05  Merge branch 'log1p_HOST' of https://github.com/ManasaDattaT/rpp into… (snehaa8, Dec 30, 2024)
0117c34  log1p HOST implementation (ManasaDattaT, Dec 30, 2024)
2eb2039  log1p HOST implementation (ManasaDattaT, Dec 30, 2024)
c65051e  Merge branch 'log1p_HOST' into log1p_HOST_HIP (ManasaDattaT, Dec 30, 2024)
ae0fd19  merge conflicts resolved (ManasaDattaT, Dec 30, 2024)
b82517f  removed print statements (snehaa8, Dec 31, 2024)
d6beaa4  Worked on the review comment (ManasaDattaT, Jan 2, 2025)
d78c287  Worked on the review comment (ManasaDattaT, Jan 2, 2025)
3038153  Update rpp_hip_common.hpp (ManasaDattaT, Jan 3, 2025)
835a8ad  Minor changes after review (ManasaDattaT, Jan 3, 2025)
8872db6  Resolved merge conflicts (ManasaDattaT, Feb 17, 2025)
578a967  Reverted the testsuite changes, which were added in support for I16 (ManasaDattaT, Feb 17, 2025)
2173693  removed the testsuite support (ManasaDattaT, Feb 17, 2025)
include/rppdefs.h (3 changes: 2 additions & 1 deletion)

@@ -389,7 +389,8 @@ typedef enum
U8,
F32,
F16,
I8
I8,
I16
} RpptDataType;

/*! \brief RPPT Tensor layout type enum
include/rppt_tensor_arithmetic_operations.h (37 changes: 37 additions & 0 deletions)

@@ -291,6 +291,43 @@ RppStatus rppt_log_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, R
*/
RppStatus rppt_log_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle);
#endif // GPU_SUPPORT
#ifdef GPU_SUPPORT
/*! \brief Log1p operation on HIP backend
* \details Computes natural log of (1 + input) for a given ND Tensor.
* Supports the i16 -> f32 datatype conversion.
* Uses the absolute value of the input for the log computation.
* \param [in] srcPtr source tensor in HIP memory
* \param [in] srcGenericDescPtr source tensor descriptor
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstGenericDescPtr destination tensor descriptor
* \param [in] roiTensor ROI values for each tensor in the batch (nDim begin values followed by nDim length values)
* \param [in] rppHandle RPP HIP handle created with <tt>\ref rppCreateWithStreamAndBatchSize()</tt>
* \return A <tt> \ref RppStatus</tt> enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_log1p_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle);
#endif // GPU_SUPPORT


RppStatus rppt_log1p_host(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle);
#ifdef GPU_SUPPORT
/*! \brief Log1p operation on HIP backend
* \details Computes natural log of (1 + input) for a given ND Tensor.
* Supports the i16 -> f32 datatype conversion.
* Uses the absolute value of the input for the log computation.
* \param [in] srcPtr source tensor in HIP memory
* \param [in] srcGenericDescPtr source tensor descriptor
* \param [out] dstPtr destination tensor in HIP memory
* \param [in] dstGenericDescPtr destination tensor descriptor
* \param [in] roiTensor ROI values for each tensor in the batch (nDim begin values followed by nDim length values)
* \param [in] rppHandle RPP HIP handle created with <tt>\ref rppCreateWithStreamAndBatchSize()</tt>
* \return A <tt> \ref RppStatus</tt> enumeration.
* \retval RPP_SUCCESS Successful completion.
* \retval RPP_ERROR* Unsuccessful completion.
*/
RppStatus rppt_log1p_gpu(RppPtr_t srcPtr, RpptGenericDescPtr srcGenericDescPtr, RppPtr_t dstPtr, RpptGenericDescPtr dstGenericDescPtr, Rpp32u *roiTensor, rppHandle_t rppHandle);

Review comment: please check the gpu declaration, it is repeated twice

Author reply: Done

#endif // GPU_SUPPORT

/*! @}
*/
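For reference, a minimal host-side usage sketch of the new API (illustration only, not part of the PR). The descriptor fields (numDims, dims, strides, dataType) follow how the new host kernel reads them; the rppCreateWithBatchSize/rppDestroyHost handle calls are an assumption about the public RPP host API.

// Hypothetical usage sketch for rppt_log1p_host (illustration only)
#include <cmath>
#include <vector>
#include "rpp.h"

int main()
{
    const Rpp32u batchSize = 1, nDim = 1, len = 8;

    RpptGenericDesc srcDesc = {}, dstDesc = {};
    srcDesc.numDims = nDim + 1;        // batch dimension + 1 tensor dimension
    srcDesc.dims[0] = batchSize;
    srcDesc.dims[1] = len;
    srcDesc.strides[0] = len;          // stride between batch elements
    srcDesc.strides[1] = 1;
    srcDesc.dataType = RpptDataType::I16;
    dstDesc = srcDesc;
    dstDesc.dataType = RpptDataType::F32;

    std::vector<Rpp16s> src = {-3, -2, -1, 0, 1, 2, 3, 4};
    std::vector<Rpp32f> dst(len);
    Rpp32u roi[2] = {0, len};          // per tensor: nDim begin values, then nDim lengths

    rppHandle_t handle;
    rppCreateWithBatchSize(&handle, batchSize, 1);   // assumption: host handle taking batch size and thread count
    rppt_log1p_host(src.data(), &srcDesc, dst.data(), &dstDesc, roi, handle);
    // expected: dst[i] == std::log1p(std::abs(src[i])), e.g. dst[3] == 0.0f
    rppDestroyHost(handle);
    return 0;
}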
src/include/cpu/rpp_cpu_simd.hpp (22 changes: 22 additions & 0 deletions)

@@ -105,6 +105,7 @@ const __m128i xmm_px3 = _mm_set1_epi32(3);
const __m128i xmm_px4 = _mm_set1_epi32(4);
const __m128i xmm_px5 = _mm_set1_epi32(5);
const __m128i xmm_pxConvertI8 = _mm_set1_epi8((char)128);
const __m128i xmm_pxConvertI16 = _mm_set1_epi16((short)32768);
const __m128 xmm_pDstLocInit = _mm_setr_ps(0, 1, 2, 3);

const __m256 avx_p0 = _mm256_set1_ps(0.0f);
@@ -148,6 +149,8 @@ const __m128i xmm_pxMask04To07 = _mm_setr_epi8(4, 0x80, 0x80, 0x80, 5, 0x80, 0x8
const __m128i xmm_pxMask08To11 = _mm_setr_epi8(8, 0x80, 0x80, 0x80, 9, 0x80, 0x80, 0x80, 10, 0x80, 0x80, 0x80, 11, 0x80, 0x80, 0x80);
const __m128i xmm_pxMask12To15 = _mm_setr_epi8(12, 0x80, 0x80, 0x80, 13, 0x80, 0x80, 0x80, 14, 0x80, 0x80, 0x80, 15, 0x80, 0x80, 0x80);


Review comment: please remove these extra spaces

Author reply: Done


const __m128i xmm_pxMask00To02 = _mm_setr_epi8(0, 0x80, 0x80, 0x80, 1, 0x80, 0x80, 0x80, 2, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
const __m128i xmm_pxMask03To05 = _mm_setr_epi8(3, 0x80, 0x80, 0x80, 4, 0x80, 0x80, 0x80, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
const __m128i xmm_pxMask06To08 = _mm_setr_epi8(6, 0x80, 0x80, 0x80, 7, 0x80, 0x80, 0x80, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
@@ -1954,6 +1957,23 @@ inline void rpp_load16_i8_to_f32_avx(Rpp8s *srcPtr, __m256 *p)
p[1] = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_shuffle_epi8(px, xmm_pxMask08To11), _mm_shuffle_epi8(px, xmm_pxMask12To15))); /* Contains pixels 09-16 */
}

inline void rpp_load16_i16_to_f32_avx(Rpp16s *srcPtr, __m256 *p)
{
__m256i px = _mm256_loadu_si256((__m256i *)srcPtr);

// Extract the two 128-bit halves of px and sign-extend each 16-bit int to a 32-bit int
__m256i px0 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(px, 0));
__m256i px1 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(px, 1));

// Take absolute values
__m256i abs_px0 = _mm256_abs_epi32(px0);
__m256i abs_px1 = _mm256_abs_epi32(px1);

// Convert 32-bit ints to 32-bit floats
p[0] = _mm256_cvtepi32_ps(abs_px0);
p[1] = _mm256_cvtepi32_ps(abs_px1);
}
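For verification, the intrinsic sequence above is equivalent to this scalar loop (illustration only, not part of the PR):

// Scalar equivalent of rpp_load16_i16_to_f32_avx (illustration only):
// 16 signed 16-bit values become 16 floats holding |x|
#include <cstdlib>
inline void rpp_load16_i16_to_f32_scalar(const Rpp16s *srcPtr, float *dst)
{
    for (int i = 0; i < 16; i++)
        dst[i] = (float)std::abs((int)srcPtr[i]);   // matches sign-extend + _mm256_abs_epi32 + _mm256_cvtepi32_ps
}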

inline void rpp_load24_i8_to_f32_avx(Rpp8s *srcPtr, __m256 *p)
{
__m128i px1, px2;
@@ -2693,6 +2713,8 @@ static inline __m256 log_ps(__m256 x)
__m256 one = *(__m256 *)&avx_p1;
__m256 invalid_mask = _mm256_cmp_ps(x, avx_p0, _CMP_LE_OQ);

// x = _mm256_add_ps(x, one);

Review comment: please remove the commented-out lines, if these are not necessary


// cut off denormalized stuff
x = _mm256_max_ps(x, *(__m256 *)&_ps_min_norm_pos_avx);

src/include/hip/rpp_hip_common.hpp (42 changes: 42 additions & 0 deletions)

@@ -464,6 +464,18 @@ __device__ __forceinline__ float rpp_hip_unpack3(int src)
{
return (float)(schar)((src >> 24) & 0xFF);
}
// Un-packing from I16s

Review comment: remove blank line at L468

Author reply: Done

__device__ __forceinline__ float rpp_hip_unpack0_(int src)
{
return (float)((short)(src & 0xFFFF));
}

__device__ __forceinline__ float rpp_hip_unpack2_(int src)
{
return (float)((short)((src >> 16) & 0xFFFF));
}
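// Illustration (not part of the PR): each 32-bit int packs two I16 values;
// rpp_hip_unpack0_ extracts the low half, rpp_hip_unpack2_ the high half.
// For src = 0xFFFE0003 (high I16 = -2, low I16 = 3):
//   rpp_hip_unpack0_(src) == (float)(short)0x0003 ==  3.0f
//   rpp_hip_unpack2_(src) == (float)(short)0xFFFE == -2.0f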
__device__ __forceinline__ float4 rpp_hip_unpack_from_i8(int src)
{
@@ -560,6 +572,25 @@ __device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(schar *srcPtr
srcPtr_f8->f4[1] = rpp_hip_unpack_from_i8(src_i2.y); // write 04-07
}

// I16 loads without layout toggle (8 I16 pixels)
__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8(short *srcPtr, d_float8 *srcPtr_f8)
{
int4 src_i4 = *(int4 *)srcPtr;
srcPtr_f8->f4[0] = make_float4( rpp_hip_unpack0_(src_i4.x),
rpp_hip_unpack2_(src_i4.x),
rpp_hip_unpack0_(src_i4.y),
rpp_hip_unpack2_(src_i4.y)
);

srcPtr_f8->f4[1] = make_float4( rpp_hip_unpack0_(src_i4.z),
rpp_hip_unpack2_(src_i4.z),
rpp_hip_unpack0_(src_i4.w),
rpp_hip_unpack2_(src_i4.w)
);
}



Review comment: just keep one blank line between functions, remove the extra lines

__device__ __forceinline__ void rpp_hip_load8_and_unpack_to_float8_mirror(schar *srcPtr, d_float8 *srcPtr_f8)
{
int2 src_i2 = *(int2 *)srcPtr;
@@ -1902,6 +1933,17 @@ __device__ __forceinline__ void rpp_hip_math_log(d_float8 *src_f8, d_float8 *dst
dst_f8->f1[7] = __logf(src_f8->f1[7]);
}

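// Note (editorial, assumption): like the host path in log1p.hpp below, this helper applies a
// plain elementwise log; the +1 of log1p is presumably added to src_f8 by the caller before
// this call, mirroring how the host kernel adds 1.0f to the loaded values before compute_log_16_host.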
__device__ __forceinline__ void rpp_hip_math_log1p(d_float8 *src_f8, d_float8 *dst_f8)
{
dst_f8->f1[0] = __logf(src_f8->f1[0]);
dst_f8->f1[1] = __logf(src_f8->f1[1]);
dst_f8->f1[2] = __logf(src_f8->f1[2]);
dst_f8->f1[3] = __logf(src_f8->f1[3]);
dst_f8->f1[4] = __logf(src_f8->f1[4]);
dst_f8->f1[5] = __logf(src_f8->f1[5]);
dst_f8->f1[6] = __logf(src_f8->f1[6]);
dst_f8->f1[7] = __logf(src_f8->f1[7]);
}
// /******************** DEVICE RANDOMIZATION HELPER FUNCTIONS ********************/

template<typename T>
src/modules/cpu/host_tensor_arithmetic_operations.hpp (1 change: 1 addition & 0 deletions)

@@ -31,5 +31,6 @@ SOFTWARE.
#include "kernel/multiply_scalar.hpp"
#include "kernel/magnitude.hpp"
#include "kernel/log.hpp"
#include "kernel/log1p.hpp"

#endif // HOST_TENSOR_ARITHMETIC_OPERATIONS_HPP
src/modules/cpu/kernel/log1p.hpp (new file: 225 additions)

@@ -0,0 +1,225 @@
/*
MIT License

Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "rppdefs.h"
#include "rpp_cpu_common.hpp"

// 1-pixel log1p helper function
// Negative values are made positive by taking the absolute value of the input
inline void compute_log1p(Rpp16s *src, Rpp32f *dst) {

Review comment: please move { to next line

*dst = std::log1p(std::abs(*src));
}

// Computes ND log1p recursively
template<typename T1, typename T2>
void log1p_recursive(T1 *src, Rpp32u *srcStrides, T2 *dst, Rpp32u *dstStrides, Rpp32u *dstShape, Rpp32u nDim)
{
if (!nDim)
compute_log1p(src, dst);
else
{
for (int i = 0; i < *dstShape; i++)
{
log1p_recursive(src, srcStrides + 1, dst, dstStrides + 1, dstShape + 1, nDim - 1);
dst += *dstStrides;
src += *srcStrides;
}
}
}
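To make the stride walk concrete, a minimal sketch (illustration only, not part of the PR); with these arguments, element (i, j) is read at offset i*strides[0] + j*strides[1]:

// Illustration only: apply log1p_recursive to a contiguous 2x3 I16 tensor
Rpp16s src[6] = {0, 1, 2, 3, 4, 5};
Rpp32f dst[6];
Rpp32u shape[2]   = {2, 3};
Rpp32u strides[2] = {3, 1};    // row stride 3 elements, column stride 1
log1p_recursive(src, strides, dst, strides, shape, 2);
// result: dst[k] == std::log1p(std::abs(src[k])) for k = 0..5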

// log(1 + x), i.e. log1p(x), for I16 input and F32 output
RppStatus log1p_generic_host_tensor(Rpp16s *srcPtr,
RpptGenericDescPtr srcGenericDescPtr,
Rpp32f *dstPtr,
RpptGenericDescPtr dstGenericDescPtr,
Rpp32u *roiTensor,
rpp::Handle& handle)
{
Rpp32u numThreads = handle.GetNumThreads();
Rpp32u nDim = srcGenericDescPtr->numDims - 1; // Omitting batchSize here to get tensor dimension.
Rpp32u batchSize = dstGenericDescPtr->dims[0];
const __m256 one_vec = _mm256_set1_ps(1.0f);

omp_set_dynamic(0);
#pragma omp parallel for num_threads(numThreads)
for(int batchCount = 0; batchCount < batchSize; batchCount++)
{
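// roiTensor layout per batch element: nDim begin values followed by nDim length values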
Rpp32u *roi = roiTensor + batchCount * nDim * 2;
Rpp32u *begin = roi;
Rpp32u *length = &roi[nDim];

Rpp16s *srcPtr1 = srcPtr + batchCount * srcGenericDescPtr->strides[0];
Rpp32f *dstPtr1 = dstPtr + batchCount * dstGenericDescPtr->strides[0];

for(int i = 0; i < nDim; i++)
srcPtr1 += begin[i] * srcGenericDescPtr->strides[i + 1];
Rpp32u alignedLength;
Rpp32u vectorIncrement = 16;
if (nDim == 1)
{
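// length[0] & ~15 rounds down to a multiple of 16; the AVX2 loop below consumes 16 I16 values per iteration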
alignedLength = length[0] & ~15;
int vectorLoopCount = 0;
#if __AVX2__
for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
{
__m256 p[2];

rpp_simd_load(rpp_load16_i16_to_f32_avx, srcPtr1, p); // simd loads
p[0] = _mm256_add_ps(p[0], one_vec);
p[1] = _mm256_add_ps(p[1], one_vec);
compute_log_16_host(p); // log compute
rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtr1, p); // simd stores
srcPtr1 += vectorIncrement;
dstPtr1 += vectorIncrement;
}
#endif
for (; vectorLoopCount < length[0]; vectorLoopCount++)
{
compute_log1p(srcPtr1, dstPtr1);
srcPtr1++;
dstPtr1++;
}
}
else if(nDim == 2)
{
alignedLength = length[1] & ~15;
for(int i = 0; i < length[0]; i++)
{
Rpp16s *srcPtrTemp = srcPtr1;
Rpp32f *dstPtrTemp = dstPtr1;

int vectorLoopCount = 0;
#if __AVX2__
for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
{
__m256 p[2];

rpp_simd_load(rpp_load16_i16_to_f32_avx, srcPtrTemp, p); // simd loads (from the per-row pointer srcPtrTemp, matching the store below)
p[0] = _mm256_add_ps(p[0], one_vec);
p[1] = _mm256_add_ps(p[1], one_vec);
compute_log_16_host(p); // log compute
rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
srcPtrTemp += vectorIncrement;
dstPtrTemp += vectorIncrement;
}
#endif
for (; vectorLoopCount < length[1]; vectorLoopCount++)
{
compute_log1p(srcPtrTemp, dstPtrTemp);
srcPtrTemp++;
dstPtrTemp++;
}
srcPtr1 += srcGenericDescPtr->strides[1];
dstPtr1 += dstGenericDescPtr->strides[1];
}
}
else if(nDim == 3)
{
int combinedLength = length[0] * length[1];
alignedLength = combinedLength & ~15;
for(int i = 0; i < length[2]; i++)
{
Rpp16s *srcPtrTemp = srcPtr1;
Rpp32f *dstPtrTemp = dstPtr1;

int vectorLoopCount = 0;
#if __AVX2__
for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
{
__m256 p[2];

rpp_simd_load(rpp_load16_i16_to_f32_avx, srcPtrTemp, p); // simd loads
p[0] = _mm256_add_ps(p[0], one_vec);
p[1] = _mm256_add_ps(p[1], one_vec);
compute_log_16_host(p); // log compute
rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
srcPtrTemp += vectorIncrement;
dstPtrTemp += vectorIncrement;
}
#endif
for (; vectorLoopCount < combinedLength; vectorLoopCount++)
{
compute_log1p(srcPtrTemp, dstPtrTemp);
srcPtrTemp++;
dstPtrTemp++;
}

srcPtr1 += combinedLength;
dstPtr1 += combinedLength;
}
}
else if(nDim == 4)
{
int combinedLength = length[0] * length[1];
int combinedLength2 = length[0] * length[1] * length[2];
alignedLength = combinedLength & ~15;
for(int i = 0; i < length[3]; i++)
{
Rpp16s *srcPtrCol = srcPtr1;
Rpp32f *dstPtrCol = dstPtr1;
for(int j = 0; j < length[2]; j++)
{
Rpp16s *srcPtrTemp = srcPtrCol;
Rpp32f *dstPtrTemp = dstPtrCol;
int vectorLoopCount = 0;
#if __AVX2__
for (; vectorLoopCount < alignedLength; vectorLoopCount += vectorIncrement)
{
__m256 p[2];

rpp_simd_load(rpp_load16_i16_to_f32_avx, srcPtrTemp, p); // simd loads
p[0] = _mm256_add_ps(p[0], one_vec);
p[1] = _mm256_add_ps(p[1], one_vec);
compute_log_16_host(p); // log compute
rpp_simd_store(rpp_store16_f32_to_f32_avx, dstPtrTemp, p); // simd stores
srcPtrTemp += vectorIncrement;
dstPtrTemp += vectorIncrement;
}
#endif
for (; vectorLoopCount < combinedLength; vectorLoopCount++)
{
compute_log1p(srcPtrTemp, dstPtrTemp);
srcPtrTemp++;
dstPtrTemp++;
}

srcPtrCol += combinedLength;
dstPtrCol += combinedLength;
}
srcPtr1 += combinedLength2;
dstPtr1 += combinedLength2;
}

}
else
log1p_recursive(srcPtr1, srcGenericDescPtr->strides + 1, dstPtr1, dstGenericDescPtr->strides + 1, length, nDim); // strides[i + 1] corresponds to length[i], as in the begin-offset loop above
}

return RPP_SUCCESS;
}