-
Notifications
You must be signed in to change notification settings - Fork 6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
WIP - Brightness AVX512 Trial #131
base: master
Are you sure you want to change the base?
WIP - Brightness AVX512 Trial #131
Conversation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jenetscaria-mcw
I have done the first round of review
Please check and address them
Once the PR Comments are addressed please reply to PR comment as done, so that we can know what has been done and what is pending
# -fPIC -- Generate position-independent code if possible. | ||
# -mavx2 -- Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation. | ||
# -mfma -- Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and FMA built-in functions and code generation. | ||
# -std=gnu++14 -- Conform to the ISO 2014 C++ standard with GNU extensions. | ||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mavx2 -mf16c -mfma -std=gnu++14") | ||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512bw -mavx512f -fPIC -mavx2 -mf16c -mfma -std=gnu++14") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add description for avx512bw flag as well
@@ -2529,6 +2539,14 @@ inline void compute_brightness_24_host(__m128 *p, __m128 *pBrightnessParams) | |||
p[5] = _mm_fmadd_ps(p[5], pBrightnessParams[0], pBrightnessParams[1]); // brightness adjustment | |||
} | |||
|
|||
inline void compute_brightness_64_host(__m512 *p, __m512 *pBrightnessParams) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please move the compute_brightness_64_host before compute_brightness_48_host
@@ -110,6 +111,8 @@ const __m128i xmm_px4 = _mm_set1_epi32(4); | |||
const __m128i xmm_px5 = _mm_set1_epi32(5); | |||
const __m128i xmm_pxConvertI8 = _mm_set1_epi8((char)128); | |||
const __m128 xmm_pDstLocInit = _mm_setr_ps(0, 1, 2, 3); | |||
const __m128i xmm_px0I8 = _mm_set1_epi8((char)0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we can use xmm_px0, no need to redefine another variable with _mm_set1_epi8
Please check once
@@ -110,6 +111,8 @@ const __m128i xmm_px4 = _mm_set1_epi32(4); | |||
const __m128i xmm_px5 = _mm_set1_epi32(5); | |||
const __m128i xmm_pxConvertI8 = _mm_set1_epi8((char)128); | |||
const __m128 xmm_pDstLocInit = _mm_setr_ps(0, 1, 2, 3); | |||
const __m128i xmm_px0I8 = _mm_set1_epi8((char)0); | |||
const __m512i xmm_pxConvertI8_avx512 = _mm512_set1_epi8((char)128); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please rename the variable name to
avx512_pxConvertI8
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove the additional blank line
compute_brightness_96_host( | ||
p, pBrightnessParams); // brightness adjustment | ||
rpp_simd_store(rpp_store96_f32pln3_to_u8pln3_avx512, dstPtrTempR, | ||
dstPtrTempG, dstPtrTempB, p); // simd stores |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please align the function params
It should be in single line
compute_brightness_96_host(p, pBrightnessParams); // brightness adjustment
Applicable to all such instances in the file
#else | ||
int max_length = 16; | ||
#endif | ||
Rpp32u alignedLength = bufferLength & ~(max_length-1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add space before and after - operator
@@ -187,7 +212,12 @@ RppStatus brightness_u8_u8_host_tensor(Rpp8u *srcPtr, | |||
// Brightness without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW) | |||
else | |||
{ | |||
Rpp32u alignedLength = bufferLength & ~15; | |||
#if __AVX512__ | |||
int max_length = 64; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove the usage of this max_length variable and use vectorIncrementPerChannel or another suitable variable used
@@ -216,8 +246,8 @@ RppStatus brightness_u8_u8_host_tensor(Rpp8u *srcPtr, | |||
compute_brightness_16_host(p, pBrightnessParams); // brightness adjustment | |||
rpp_simd_store(rpp_store16_f32_to_u8, dstPtrTemp, p); // simd stores | |||
#endif | |||
srcPtrTemp +=16; | |||
dstPtrTemp +=16; | |||
srcPtrTemp +=max_length; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add space after += operator
Applicable to all such instances in the file
src/include/cpu/rpp_cpu_simd.hpp
Outdated
px[1] = _mm512_loadu_si512((__m512i *)(srcPtr + 48)); | ||
__m512i pxCvt[6]; | ||
|
||
__mmask64 maskR = 0x9249249249240000; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add comment stating what this value represent, same for next 2 masks also
If it is constant, please move define this masks outside as const
__mmask64 maskR = 0x9249249249240000; | ||
__mmask64 maskG = 0x4924924924920000; | ||
__mmask64 maskB = 0x2492492492490000; | ||
pxCvt[0] = _mm512_bslli_epi128 (_mm512_maskz_mov_epi8 (maskR, px[0]),1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add a comment mention what does this instruction do. Please refer this line of codes and add something similar here
https://github.com/sampath1117/rpp/blob/js/opt_brightness_avx512/src/include/cpu/rpp_cpu_simd.hpp#L335
https://github.com/sampath1117/rpp/blob/js/opt_brightness_avx512/src/include/cpu/rpp_cpu_simd.hpp#L339
src/include/cpu/rpp_cpu_simd.hpp
Outdated
px[4] = _mm512_loadu_si512((__m512i *)(srcPtrG + 48)); | ||
px[5] = _mm512_loadu_si512((__m512i *)(srcPtrB + 48)); | ||
|
||
__m128i input[4]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please note that all __m128i should start with px in variable name
src/include/cpu/rpp_cpu_simd.hpp
Outdated
inline void rpp_load96_u8pln3_to_f32pln3_avx512(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m512 *p) | ||
{ | ||
__m512i px[6]; | ||
px[0] = _mm512_loadu_si512((__m512i *)srcPtrR); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are too many load and extract functions used here
Please check if you can directly use sse load instruction
_mm_loadu_si128 and finally fit into 512 vector
src/include/cpu/rpp_cpu_simd.hpp
Outdated
out[3] = _mm512_cvtepu8_epi32 (input[3]); | ||
|
||
__m512i output[3]; | ||
output[0] = _mm512_mask_blend_epi32 (k, out[0], out[1]); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add comments for each instruction with the register level storage, so that it is easy to understand
src/include/cpu/rpp_cpu_simd.hpp
Outdated
|
||
p[4] = _mm512_cvtepu32_ps(output[0]); | ||
|
||
input[0] = _mm512_extracti32x4_epi32(px[5], 0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please change the variable names as per the RPP style
variable names should start with px for interger and p for float
Please give better variable names which are meaningful for registers instead of input, in, output, out
pxCvti[1] = _mm512_cvtusepi32_epi8(pxCvt[1]); | ||
pxCvti[2] = _mm512_cvtusepi32_epi8(pxCvt[2]); | ||
|
||
_mm_storeu_si128((__m128i *)dstPtrR, pxCvti[0]); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jenetscaria-mcw
Please change this to 96 pixel store instead of 48 pixel store
There are 96 pixels processed but only 48 are stored here
p[1] = _mm512_cvtepu32_ps(output[1]); | ||
p[2] = _mm512_cvtepu32_ps(output[2]); | ||
p[3] = _mm512_cvtepu32_ps(output[3]); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please remove the additional space
inline void rpp_load48_f32pkd3_to_f32pln3_avx512(Rpp32f *srcPtr, __m512 *p) | ||
{ | ||
__m512i px[2]; | ||
px[0] = _mm512_loadu_si512((__m512i *)srcPtr); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you getting correct outputs for F32 cases?
Because the input is float, we should use _mm512_loadu_ps and entire functions need to change w.r.t that
__m512 px[5]; | ||
__m512i maski = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); | ||
__m512i avx512_pxPermPkd = _mm512_set_epi32(15, 11, 7, 3, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0); | ||
p[0] = _mm512_permutexvar_ps(maski, p[0]); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are using lot of premutes in this function
P;ease check if this can be avoided
* Add gpu tensor support for non linear blend * Add relevant unit tests * Add relevant perf tests * Add check on div * Minor change
Added AVX512 code for brightness