WIP - Brightness AVX512 Trial #131

jenetscaria-mcw · 2023-06-07T08:18:03Z

Added AVX512 code for brightness

sampath1117

@jenetscaria-mcw
I have done the first round of review
Please check and address them

Once the PR comments are addressed, please reply to each comment with "done" so that we know what has been addressed and what is still pending

sampath1117 · 2023-06-09T05:48:20Z

CMakeLists.txt

 # -fPIC -- Generate position-independent code if possible.
 # -mavx2 -- Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and AVX2 built-in functions and code generation.
 # -mfma -- Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and FMA built-in functions and code generation.
 # -std=gnu++14 -- Conform to the ISO 2014 C++ standard with GNU extensions.
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mavx2 -mf16c -mfma -std=gnu++14")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512bw -mavx512f -fPIC -mavx2 -mf16c -mfma -std=gnu++14")


Please add a description for the avx512bw flag as well

sampath1117 · 2023-06-09T05:50:06Z

src/include/cpu/rpp_cpu_common.hpp

@@ -2529,6 +2539,14 @@ inline void compute_brightness_24_host(__m128 *p, __m128 *pBrightnessParams)
    p[5] = _mm_fmadd_ps(p[5], pBrightnessParams[0], pBrightnessParams[1]);    // brightness adjustment
 }

+inline void compute_brightness_64_host(__m512 *p, __m512 *pBrightnessParams)


Please move the compute_brightness_64_host before compute_brightness_48_host

sampath1117 · 2023-06-09T05:53:25Z

src/include/cpu/rpp_cpu_simd.hpp

@@ -110,6 +111,8 @@ const __m128i xmm_px4 = _mm_set1_epi32(4);
 const __m128i xmm_px5 = _mm_set1_epi32(5);
 const __m128i xmm_pxConvertI8 = _mm_set1_epi8((char)128);
 const __m128 xmm_pDstLocInit = _mm_setr_ps(0, 1, 2, 3);
+const __m128i xmm_px0I8 = _mm_set1_epi8((char)0);


I think we can use xmm_px0, no need to redefine another variable with _mm_set1_epi8
Please check once

sampath1117 · 2023-06-09T05:54:22Z

src/include/cpu/rpp_cpu_simd.hpp

@@ -110,6 +111,8 @@ const __m128i xmm_px4 = _mm_set1_epi32(4);
 const __m128i xmm_px5 = _mm_set1_epi32(5);
 const __m128i xmm_pxConvertI8 = _mm_set1_epi8((char)128);
 const __m128 xmm_pDstLocInit = _mm_setr_ps(0, 1, 2, 3);
+const __m128i xmm_px0I8 = _mm_set1_epi8((char)0);
+const __m512i xmm_pxConvertI8_avx512 = _mm512_set1_epi8((char)128);


Please rename the variable name to
avx512_pxConvertI8

sampath1117 · 2023-06-09T05:55:14Z

src/include/cpu/rpp_cpu_simd.hpp

+    }
+}
+
+


Please remove the additional blank line

sampath1117 · 2023-06-09T06:00:01Z

src/modules/cpu/kernel/brightness.hpp

+                    compute_brightness_96_host(
+                        p, pBrightnessParams);  // brightness adjustment
+                    rpp_simd_store(rpp_store96_f32pln3_to_u8pln3_avx512, dstPtrTempR,
+                                    dstPtrTempG, dstPtrTempB, p);  // simd stores


Please align the function params
It should be in single line
compute_brightness_96_host(p, pBrightnessParams); // brightness adjustment

Applicable to all such instances in the file

sampath1117 · 2023-06-09T06:02:15Z

src/modules/cpu/kernel/brightness.hpp

+#else
+            int max_length = 16;
+#endif
+            Rpp32u alignedLength = bufferLength & ~(max_length-1);


Please add space before and after - operator

sampath1117 · 2023-06-09T06:04:37Z

src/modules/cpu/kernel/brightness.hpp

@@ -187,7 +212,12 @@ RppStatus brightness_u8_u8_host_tensor(Rpp8u *srcPtr,
        // Brightness without fused output-layout toggle (NHWC -> NHWC or NCHW -> NCHW)
        else
        {
-            Rpp32u alignedLength = bufferLength & ~15;
+#if __AVX512__
+            int max_length = 64;


Please remove this max_length variable and use vectorIncrementPerChannel or another suitable existing variable instead

sampath1117 · 2023-06-09T06:05:00Z

src/modules/cpu/kernel/brightness.hpp

@@ -216,8 +246,8 @@ RppStatus brightness_u8_u8_host_tensor(Rpp8u *srcPtr,
                        compute_brightness_16_host(p, pBrightnessParams);  // brightness adjustment
                        rpp_simd_store(rpp_store16_f32_to_u8, dstPtrTemp, p);    // simd stores
 #endif
-                        srcPtrTemp +=16;
-                        dstPtrTemp +=16;
+                        srcPtrTemp +=max_length;


Please add space after += operator
Applicable to all such instances in the file

src/modules/cpu/kernel/brightness.hpp

sampath1117 · 2023-06-12T16:08:16Z

src/include/cpu/rpp_cpu_simd.hpp

+    px[1] = _mm512_loadu_si512((__m512i *)(srcPtr + 48));
+    __m512i pxCvt[6];
+
+    __mmask64 maskR = 0x9249249249240000;


Please add a comment stating what this value represents; same for the next 2 masks as well
If they are constant, please define these masks outside the function as const

sampath1117 · 2023-06-12T16:11:24Z

src/include/cpu/rpp_cpu_simd.hpp

+    __mmask64 maskR = 0x9249249249240000;
+    __mmask64 maskG = 0x4924924924920000;
+    __mmask64 maskB = 0x2492492492490000;
+    pxCvt[0] = _mm512_bslli_epi128 (_mm512_maskz_mov_epi8 (maskR, px[0]),1);


Please add a comment mentioning what this instruction does. Please refer to these lines of code and add something similar here
https://github.com/sampath1117/rpp/blob/js/opt_brightness_avx512/src/include/cpu/rpp_cpu_simd.hpp#L335
https://github.com/sampath1117/rpp/blob/js/opt_brightness_avx512/src/include/cpu/rpp_cpu_simd.hpp#L339

sampath1117 · 2023-06-12T16:24:40Z

src/include/cpu/rpp_cpu_simd.hpp

+    px[4] = _mm512_loadu_si512((__m512i *)(srcPtrG + 48));
+    px[5] = _mm512_loadu_si512((__m512i *)(srcPtrB + 48));
+
+    __m128i input[4];


Please note that all __m128i should start with px in variable name

sampath1117 · 2023-06-12T16:52:14Z

src/include/cpu/rpp_cpu_simd.hpp

+inline void rpp_load96_u8pln3_to_f32pln3_avx512(Rpp8u *srcPtrR, Rpp8u *srcPtrG, Rpp8u *srcPtrB, __m512 *p)
+{
+    __m512i px[6];
+    px[0] = _mm512_loadu_si512((__m512i *)srcPtrR);


There are too many load and extract functions used here
Please check if you can directly use the SSE load instruction
_mm_loadu_si128 and finally fit the results into a 512-bit vector

sampath1117 · 2023-06-12T16:53:51Z

src/include/cpu/rpp_cpu_simd.hpp

+    out[3] = _mm512_cvtepu8_epi32 (input[3]);
+
+    __m512i output[3];
+    output[0] = _mm512_mask_blend_epi32 (k, out[0], out[1]);


Please add comments for each instruction with the register level storage, so that it is easy to understand

src/include/cpu/rpp_cpu_simd.hpp

sampath1117 · 2023-06-12T16:58:23Z

src/include/cpu/rpp_cpu_simd.hpp

+
+    p[4] = _mm512_cvtepu32_ps(output[0]);
+
+    input[0] = _mm512_extracti32x4_epi32(px[5], 0);


Please change the variable names as per the RPP style
Variable names should start with px for integer and p for float
Please give the registers more meaningful variable names instead of input, in, output, out

…on toggle variant

sampath1117 · 2023-06-14T08:03:00Z

src/include/cpu/rpp_cpu_simd.hpp

+    pxCvti[1] = _mm512_cvtusepi32_epi8(pxCvt[1]);
+    pxCvti[2] = _mm512_cvtusepi32_epi8(pxCvt[2]);
+
+    _mm_storeu_si128((__m128i *)dstPtrR, pxCvti[0]);


@jenetscaria-mcw
Please change this to 96 pixel store instead of 48 pixel store

There are 96 pixels processed but only 48 are stored here

sampath1117 · 2023-06-14T08:04:07Z

src/include/cpu/rpp_cpu_simd.hpp

+    p[1] = _mm512_cvtepu32_ps(output[1]);
+    p[2] = _mm512_cvtepu32_ps(output[2]);
+    p[3] = _mm512_cvtepu32_ps(output[3]);
+


Please remove the additional space

sampath1117 · 2023-06-14T08:15:08Z

src/include/cpu/rpp_cpu_simd.hpp

+inline void rpp_load48_f32pkd3_to_f32pln3_avx512(Rpp32f *srcPtr, __m512 *p)
+{
+    __m512i px[2];
+    px[0] = _mm512_loadu_si512((__m512i *)srcPtr);


Are you getting correct outputs for F32 cases?
Since the input is float, we should use _mm512_loadu_ps, and the entire function needs to change accordingly

sampath1117 · 2023-06-14T08:17:52Z

src/include/cpu/rpp_cpu_simd.hpp

+    __m512 px[5];
+    __m512i maski = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+    __m512i avx512_pxPermPkd = _mm512_set_epi32(15, 11, 7, 3, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
+    p[0] = _mm512_permutexvar_ps(maski, p[0]);


We are using a lot of permutes in this function
Please check if this can be avoided

* Add gpu tensor support for non linear blend * Add relevant unit tests * Add relevant perf tests * Add check on div * Minor change

jenetscaria-mcw and others added 10 commits May 3, 2023 00:11

Initial commit for AVX512 changes for brightness

d02c6d7

Added changes to resolve segmentation fault and naming for functions

e38e839

Modified logic for pkd3_to_pkd3 with cvt instructions

7040658

Added AVX512 code for u8_pln3_to_pkd3 variant

7108a5a

Added changes for i8 variant and overall formatting

60e5c1b

removed unnecessary declarations

8282689

Fixed formatting and removed unnecessary comments

b6c7529

Changed formatting

0312365

merge with master

85e8379

Fixed formatting for brightness.hpp

6719c0a

sampath1117 reviewed Jun 9, 2023

View reviewed changes

sampath1117 reviewed Jun 12, 2023

View reviewed changes

src/modules/cpu/kernel/brightness.hpp Outdated Show resolved Hide resolved

sampath1117 reviewed Jun 12, 2023

View reviewed changes

src/modules/cpu/kernel/brightness.hpp Show resolved Hide resolved

sampath1117 reviewed Jun 12, 2023

View reviewed changes

jenetscaria-mcw added 2 commits June 13, 2023 03:05

Added avx512 version for f32 and f16 variant

a2135d5

Added f32 toggle variant and removed usage of insert function in u8 n…

134a7a8

…on toggle variant

sampath1117 reviewed Jun 14, 2023

View reviewed changes

Added changes to load 192 values and resolved few review comments

89091fe

r-abishek added a commit that referenced this pull request Jul 24, 2023

Non linear blend Tensor impl - HIP (#131)

05b67b5

* Add gpu tensor support for non linear blend * Add relevant unit tests * Add relevant perf tests * Add check on div * Minor change

r-abishek changed the title ~~Brightness AVX512 support~~ WIP - Brightness AVX512 Trial Aug 16, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

WIP - Brightness AVX512 Trial #131

WIP - Brightness AVX512 Trial #131

jenetscaria-mcw commented Jun 7, 2023

sampath1117 left a comment

sampath1117 Jun 9, 2023

sampath1117 Jun 9, 2023

sampath1117 Jun 9, 2023

sampath1117 Jun 9, 2023

sampath1117 Jun 9, 2023

sampath1117 Jun 9, 2023

sampath1117 Jun 9, 2023

sampath1117 Jun 9, 2023

sampath1117 Jun 9, 2023

sampath1117 Jun 12, 2023

sampath1117 Jun 12, 2023

sampath1117 Jun 12, 2023

sampath1117 Jun 12, 2023

sampath1117 Jun 12, 2023

sampath1117 Jun 12, 2023

sampath1117 Jun 14, 2023 •

edited

Loading

sampath1117 Jun 14, 2023

sampath1117 Jun 14, 2023

sampath1117 Jun 14, 2023


		p[4] = _mm512_cvtepu32_ps(output[0]);

		input[0] = _mm512_extracti32x4_epi32(px[5], 0);

WIP - Brightness AVX512 Trial #131

Are you sure you want to change the base?

WIP - Brightness AVX512 Trial #131

Conversation

jenetscaria-mcw commented Jun 7, 2023

sampath1117 left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

sampath1117 Jun 14, 2023 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

sampath1117 Jun 14, 2023 •

edited

Loading