Skip to content

Commit 80bc480

Browse files
SDA USRsdausr
authored and
GitHub Enterprise
committedNov 19, 2024
Squashed 'vision' changes from 33a094d53..9c917d9ca (#1131)
3df503485 added resizebicubic first version Co-authored-by: sdausr <[email protected]>
1 parent 3be7e14 commit 80bc480

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+11101136
-61
lines changed
 
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
/*
2+
* Copyright 2021 Xilinx, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#ifndef __XF_RESIZE_BICUBIC_
18+
#define __XF_RESIZE_BICUBIC_
19+
20+
#include <adf.h>
21+
#include <aie_api/aie.hpp>
22+
#include <aie_api/utils.hpp>
23+
#include <common/xf_aie_hw_utils.hpp>
24+
#include <type_traits>
25+
26+
namespace xf {
27+
namespace cv {
28+
namespace aie {
29+
30+
class Resizebicubic {
31+
int mnFBitsIn;
32+
int mnFBitsAlpha;
33+
int mnFBitsBeta;
34+
int mnFBitsOut;
35+
uint32_t (&mwtsY)[LUT_DEPTH];
36+
37+
public:
38+
float coefficient = -0.75f;
39+
Resizebicubic(uint32_t (&wtsy)[LUT_DEPTH]) : mwtsY(wtsy) {}
40+
41+
/* __attribute__((noinline))
42+
void compute_wtsy(int row, int img_height_in, int img_height_out, const uint32_t scale_y, const uint32_t*
43+
weighty, int8_t &Wy1, int8_t &Wy2, int8_t &Wy3, int8_t &Wy4
44+
, int &pos1, int &pos2, int &pos3, int &pos4)
45+
{
46+
47+
int32 position = (row * scale_y) + (scale_y >> 1) - ToFixed<int64_t, 16>(0.5f);
48+
position = position > 0 ? position : 0;
49+
uint16_t wt_16 = position;
50+
uint8_t wt_8 = (uint8_t)(wt_16>>8);
51+
//printf("position=%d wt_16=%d wt_8=%d\n", position, wt_16, wt_8) ;
52+
53+
uint32_t wtsy = weighty[wt_8];
54+
Wy1 = (int8_t)(wtsy >> 24);
55+
Wy2 = (int8_t)(wtsy >> 16);
56+
Wy3 = (int8_t)(wtsy >> 8);
57+
Wy4 = (int8_t)(wtsy);
58+
int p=(position >> 16);
59+
60+
//int p=(position >> 16);
61+
pos1=(p - 1)<0 ? 0: (p - 1);
62+
pos2=(p) <0 ? 0: p;
63+
pos3=(p + 1)<0 ? 0: (p + 1);
64+
pos4=(p + 2)<0 ? 0: (p + 2);
65+
66+
int pos11=(p - 1)<0 ? 0: (p - 1);
67+
int pos22=(p) <0 ? 0: p;
68+
int pos33=(p + 1)<0 ? 0: (p + 1);
69+
int pos44=(p + 2)<0 ? 0: (p + 2);
70+
71+
//pos
72+
// int position= (p-1 > 0) * (p-1) ;
73+
pos1 = (pos11 < (img_height_in - 1)) * pos11 + (pos11 >= (img_height_in - 1)) * (img_height_in - 1);
74+
pos2 = (pos22 < (img_height_in - 1)) * pos22 + (pos22 >= (img_height_in - 1)) * (img_height_in - 1);
75+
pos3 = (pos33 < (img_height_in - 1)) * pos33 + (pos33 >= (img_height_in - 1)) * (img_height_in - 1);
76+
pos4 = (pos44 < (img_height_in - 1)) * pos44 + (pos44 >= (img_height_in - 1)) * (img_height_in - 1);
77+
78+
}
79+
*/
80+
__attribute__((noinline)) void compute_wtsy_f(int row,
81+
int img_height_in,
82+
int img_height_out,
83+
const uint32_t scale_y_fix,
84+
float scale_y,
85+
const uint32_t* weighty,
86+
int8_t& Wy1,
87+
int8_t& Wy2,
88+
int8_t& Wy3,
89+
int8_t& Wy4,
90+
int& pos1,
91+
int& pos2,
92+
int& pos3,
93+
int& pos4) {
94+
::aie::vector<float, 16> _row = ::aie::broadcast<float, 16>((float)row);
95+
::aie::vector<float, 16> add_ = ::aie::add(0.5f, _row);
96+
::aie::accum<accfloat, 16> acc(::aie::broadcast<float, 16>(-0.5f), 0);
97+
acc = ::aie::mac(acc, add_, scale_y);
98+
// pos
99+
int32 position = (row * scale_y_fix) + (scale_y_fix >> 1) - ToFixed<int64_t, 16>(0.5f);
100+
int p = (position >> 16);
101+
102+
::aie::vector<float, 16> dist = ::aie::sub(acc.to_vector<float>(0), ::aie::broadcast<float, 16>((float)p));
103+
::aie::accum<accfloat, 16> mul_Acc = ::aie::mul(dist, ::aie::broadcast<float, 16>(256.0f));
104+
::aie::vector<int32_t, 16> idx = ::aie::to_fixed<int32_t, 16>(mul_Acc.to_vector<bfloat16>(), 0);
105+
// ::aie::vector<int32_t, 16> idx=::aie::to_fixed<int32_t,16>(dist, 8);
106+
uint32_t wtsy = weighty[idx[0]];
107+
108+
Wy1 = (int8_t)(wtsy >> 24);
109+
Wy2 = (int8_t)(wtsy >> 16);
110+
Wy3 = (int8_t)(wtsy >> 8);
111+
Wy4 = (int8_t)(wtsy);
112+
113+
int pos11 = (p - 1) < 0 ? 0 : (p - 1);
114+
int pos22 = (p) < 0 ? 0 : p;
115+
int pos33 = (p + 1) < 0 ? 0 : (p + 1);
116+
int pos44 = (p + 2) < 0 ? 0 : (p + 2);
117+
118+
// pos
119+
// int position= (p-1 > 0) * (p-1) ;
120+
pos1 = (pos11 < (img_height_in - 1)) * pos11 + (pos11 >= (img_height_in - 1)) * (img_height_in - 1);
121+
pos2 = (pos22 < (img_height_in - 1)) * pos22 + (pos22 >= (img_height_in - 1)) * (img_height_in - 1);
122+
pos3 = (pos33 < (img_height_in - 1)) * pos33 + (pos33 >= (img_height_in - 1)) * (img_height_in - 1);
123+
pos4 = (pos44 < (img_height_in - 1)) * pos44 + (pos44 >= (img_height_in - 1)) * (img_height_in - 1);
124+
}
125+
126+
void xf_resize1DV(uint8_t* input,
127+
uint8_t* output,
128+
int channels,
129+
int start_in_row,
130+
int start_out_row,
131+
uint32_t scale_y,
132+
int img_height_in,
133+
int img_height_out,
134+
int tile_height_out,
135+
int tile_width_out,
136+
const uint32_t* weighty,
137+
float scale_y_f);
138+
139+
void runImpl(uint8_t* input,
140+
uint8_t* output,
141+
// uint8_t* output,
142+
int channels,
143+
uint32_t scale_x,
144+
uint32_t scale_y,
145+
int img_height_in,
146+
int img_height_out,
147+
int tile_height_out,
148+
int tile_width_out,
149+
int line_stride_in,
150+
int img_width_out,
151+
float scale_y_f);
152+
};
153+
154+
__attribute__((noinline)) void Resizebicubic::xf_resize1DV(uint8_t* input,
155+
uint8_t* output,
156+
int channels,
157+
int start_in_row,
158+
int start_out_row,
159+
uint32_t scale_y,
160+
int img_height_in,
161+
int img_height_out,
162+
int tile_height_out,
163+
int tile_width_out,
164+
const uint32_t* weighty,
165+
float scale_y_f) {
166+
const uint32_t* wty = weighty;
167+
int8_t Wy1, Wy2, Wy3, Wy4;
168+
int pos1, pos2, pos3, pos4; // y-1
169+
::aie::vector<uint8_t, 64> data_vec1, data_vec2;
170+
::aie::accum<acc32, 32> acc1;
171+
::aie::accum<acc32, 32> acc2;
172+
// printf("start_in_row=%d start_out_row=%d\n", start_in_row, start_out_row);
173+
// printf("tile_height_out=%d tile_width_out=%d\n", tile_height_out, tile_width_out);
174+
uint8_t* restrict img_out_ptr = (uint8_t*)output;
175+
set_rnd(rnd_conv_even);
176+
for (int i = 0; i < tile_height_out; i++) {
177+
// compute_wtsy(start_out_row + i, img_height_in, img_height_out, scale_y, weighty, Wy1, Wy2, Wy3, Wy4,
178+
// pos1, pos2,pos3,pos4);
179+
compute_wtsy_f(start_out_row + i, img_height_in, img_height_out, scale_y, scale_y_f, weighty, Wy1, Wy2, Wy3,
180+
Wy4, pos1, pos2, pos3, pos4);
181+
182+
::aie::vector<int8_t, 64> Wy1_y2 =
183+
::aie::concat(::aie::broadcast<int8_t, 32>(Wy1), ::aie::broadcast<int8_t, 32>(Wy2));
184+
::aie::vector<int8_t, 64> Wy3_y4 =
185+
::aie::concat(::aie::broadcast<int8_t, 32>(Wy3), ::aie::broadcast<int8_t, 32>(Wy4));
186+
int y_idx1 = (pos1 - start_in_row) * (tile_width_out * channels); // y-1
187+
int y_idx2 = (pos2 - start_in_row) * (tile_width_out * channels);
188+
int y_idx3 = (pos3 - start_in_row) * (tile_width_out * channels);
189+
int y_idx4 = (pos4 - start_in_row) * (tile_width_out * channels);
190+
uint8_t* restrict img_in_ptr1 = (uint8_t*)(input + y_idx1); // y-1
191+
uint8_t* restrict img_in_ptr2 = (uint8_t*)(input + y_idx2); // y
192+
uint8_t* restrict img_in_ptr3 = (uint8_t*)(input + y_idx3); // y+1
193+
uint8_t* restrict img_in_ptr4 = (uint8_t*)(input + y_idx4); // y+2
194+
195+
for (int j = 0; j < ((tile_width_out * 4) / 32); j++) chess_prepare_for_pipelining chess_loop_range(32, ) {
196+
data_vec1.insert(0, ::aie::load_v<32>(img_in_ptr1));
197+
data_vec1.insert(1, ::aie::load_v<32>(img_in_ptr2));
198+
data_vec2.insert(0, ::aie::load_v<32>(img_in_ptr3));
199+
data_vec2.insert(1, ::aie::load_v<32>(img_in_ptr4));
200+
img_in_ptr1 += 32;
201+
img_in_ptr2 += 32;
202+
img_in_ptr3 += 32;
203+
img_in_ptr4 += 32;
204+
205+
acc1 = mul_elem_32_2(data_vec1, Wy1_y2);
206+
acc2 = mac_elem_32_2(data_vec2, Wy3_y4, acc1);
207+
set_sat();
208+
::aie::store_v(img_out_ptr, acc2.template to_vector<uint8_t>(7));
209+
img_out_ptr += 32;
210+
}
211+
}
212+
set_rnd(rnd_floor);
213+
}
214+
215+
/*__attribute__((noinline)) void Resizebicubic::xf_resize1DV(uint8_t* input,
216+
uint8_t* output,
217+
int channels,
218+
int start_in_row,
219+
int start_out_row,
220+
uint32_t scale_y,
221+
int img_height_in,
222+
int img_height_out,
223+
int tile_height_out,
224+
int tile_width_out,
225+
const
226+
uint32_t* weighty) {
227+
const uint32_t* wty = weighty;
228+
int8_t Wy1,Wy2,Wy3,Wy4;
229+
int pos1, pos2, pos3, pos4; //y-1
230+
::aie::vector<uint8_t, 64> data_vec1, data_vec2;
231+
::aie::accum<acc32, 32> acc1;
232+
::aie::accum<acc32, 32> acc2;
233+
int k=0;
234+
// printf("start_in_row=%d start_out_row=%d\n", start_in_row, start_out_row);
235+
// printf("tile_height_out=%d tile_width_out=%d\n", tile_height_out, tile_width_out);
236+
uint8_t* restrict img_out_ptr = (uint8_t*)output;
237+
set_rnd(rnd_conv_even);
238+
239+
for (int j = 0; j < tile_width_out; j +=32){
240+
uint8_t* restrict img_out_ptr = (uint8_t*)(output + j);
241+
for(int i=0; i<tile_height_out;i++) chess_prepare_for_pipelining{
242+
243+
compute_wtsy(start_out_row + i, img_height_in, img_height_out, scale_y, weighty, Wy1, Wy2, Wy3, Wy4, pos1,
244+
pos2,pos3,pos4);
245+
// printf("pos1=%d pos2=%d pos3=%d pos4=%d\n", pos1, pos2, pos3, pos4);
246+
::aie::vector<int8_t, 64> Wy1_y2 = ::aie::concat(::aie::broadcast<int8_t, 32>(Wy1), ::aie::broadcast<int8_t,
247+
32>(Wy2));
248+
::aie::vector<int8_t, 64> Wy3_y4 = ::aie::concat(::aie::broadcast<int8_t, 32>(Wy3), ::aie::broadcast<int8_t,
249+
32>(Wy4));
250+
int y_idx1 = (pos1 - start_in_row) * (tile_width_out*channels); //y-1
251+
int y_idx2 = (pos2 - start_in_row) * (tile_width_out*channels);
252+
int y_idx3 = (pos3 - start_in_row) * (tile_width_out*channels);
253+
int y_idx4 = (pos4 - start_in_row) * (tile_width_out*channels);
254+
uint8_t* restrict img_in_ptr1 = (uint8_t*)(input + y_idx1); //y-1
255+
uint8_t* restrict img_in_ptr2 = (uint8_t*)(input + y_idx2); //y
256+
uint8_t* restrict img_in_ptr3 = (uint8_t*)(input + y_idx3); //y+1
257+
uint8_t* restrict img_in_ptr4 = (uint8_t*)(input + y_idx4); //y+2
258+
259+
data_vec1.insert(0, ::aie::load_v<32>(img_in_ptr1));
260+
data_vec1.insert(1, ::aie::load_v<32>(img_in_ptr2));
261+
data_vec2.insert(0, ::aie::load_v<32>(img_in_ptr3));
262+
data_vec2.insert(1, ::aie::load_v<32>(img_in_ptr4));
263+
264+
acc1 = mul_elem_32_2(data_vec1, Wy1_y2);
265+
acc2 = mac_elem_32_2(data_vec2, Wy3_y4, acc1);
266+
set_sat();
267+
// chess_report(y_idx1);
268+
// chess_report(y_idx2);
269+
// chess_report(y_idx3);
270+
// chess_report(y_idx4);
271+
// chess_report(data_vec1);
272+
// chess_report(Wy1_y2);
273+
// chess_report(Wy3_y4);
274+
// chess_report(data_vec2);
275+
// chess_report(acc2);
276+
// chess_report(acc2.template to_vector<uint8_t>(7));
277+
278+
::aie::store_v(img_out_ptr, acc2.template to_vector<uint8_t>(7));
279+
clr_sat();
280+
281+
img_out_ptr+=32;
282+
}
283+
}
284+
set_rnd(rnd_floor);
285+
}
286+
*/
287+
__attribute__((noinline)) void Resizebicubic::runImpl(uint8_t* input,
288+
uint8_t* output,
289+
int channels,
290+
uint32_t scale_x,
291+
uint32_t scale_y,
292+
int img_height_in,
293+
int img_height_out,
294+
int tile_height_out,
295+
int tile_width_out,
296+
int line_stride_in,
297+
int img_width_out,
298+
float scale_y_f) {
299+
int start_out_row = xfGetTileOutPosV(input);
300+
int start_in_row = xfGetTilePosV(input);
301+
302+
// printf("start_out_row=%d start_in_row=%d\n", start_out_row, start_in_row);
303+
304+
xfCopyMetaData(input, output);
305+
uint8_t* ptr_in = (uint8_t*)xfGetImgDataPtr(input);
306+
uint8_t* ptr_out = (uint8_t*)xfGetImgDataPtr(output);
307+
308+
xf_resize1DV((uint8_t*)ptr_in, (uint8_t*)ptr_out, channels, start_in_row, start_out_row, scale_y, img_height_in,
309+
img_height_out, tile_height_out, tile_width_out, mwtsY, scale_y_f);
310+
}
311+
312+
} // aie
313+
} // cv
314+
} // xf
315+
316+
#endif

0 commit comments

Comments
 (0)