|
| 1 | +/* |
| 2 | + * Copyright 2021 Xilinx, Inc. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | + |
| 17 | +#ifndef __XF_RESIZE_BICUBIC_ |
| 18 | +#define __XF_RESIZE_BICUBIC_ |
| 19 | + |
| 20 | +#include <adf.h> |
| 21 | +#include <aie_api/aie.hpp> |
| 22 | +#include <aie_api/utils.hpp> |
| 23 | +#include <common/xf_aie_hw_utils.hpp> |
| 24 | +#include <type_traits> |
| 25 | + |
| 26 | +namespace xf { |
| 27 | +namespace cv { |
| 28 | +namespace aie { |
| 29 | + |
| 30 | +class Resizebicubic { |
| 31 | + int mnFBitsIn; |
| 32 | + int mnFBitsAlpha; |
| 33 | + int mnFBitsBeta; |
| 34 | + int mnFBitsOut; |
| 35 | + uint32_t (&mwtsY)[LUT_DEPTH]; |
| 36 | + |
| 37 | + public: |
| 38 | + float coefficient = -0.75f; |
| 39 | + Resizebicubic(uint32_t (&wtsy)[LUT_DEPTH]) : mwtsY(wtsy) {} |
| 40 | + |
| 41 | + /* __attribute__((noinline)) |
| 42 | + void compute_wtsy(int row, int img_height_in, int img_height_out, const uint32_t scale_y, const uint32_t* |
| 43 | + weighty, int8_t &Wy1, int8_t &Wy2, int8_t &Wy3, int8_t &Wy4 |
| 44 | + , int &pos1, int &pos2, int &pos3, int &pos4) |
| 45 | + { |
| 46 | +
|
| 47 | + int32 position = (row * scale_y) + (scale_y >> 1) - ToFixed<int64_t, 16>(0.5f); |
| 48 | + position = position > 0 ? position : 0; |
| 49 | + uint16_t wt_16 = position; |
| 50 | + uint8_t wt_8 = (uint8_t)(wt_16>>8); |
| 51 | + //printf("position=%d wt_16=%d wt_8=%d\n", position, wt_16, wt_8) ; |
| 52 | +
|
| 53 | + uint32_t wtsy = weighty[wt_8]; |
| 54 | + Wy1 = (int8_t)(wtsy >> 24); |
| 55 | + Wy2 = (int8_t)(wtsy >> 16); |
| 56 | + Wy3 = (int8_t)(wtsy >> 8); |
| 57 | + Wy4 = (int8_t)(wtsy); |
| 58 | + int p=(position >> 16); |
| 59 | +
|
| 60 | + //int p=(position >> 16); |
| 61 | + pos1=(p - 1)<0 ? 0: (p - 1); |
| 62 | + pos2=(p) <0 ? 0: p; |
| 63 | + pos3=(p + 1)<0 ? 0: (p + 1); |
| 64 | + pos4=(p + 2)<0 ? 0: (p + 2); |
| 65 | +
|
| 66 | + int pos11=(p - 1)<0 ? 0: (p - 1); |
| 67 | + int pos22=(p) <0 ? 0: p; |
| 68 | + int pos33=(p + 1)<0 ? 0: (p + 1); |
| 69 | + int pos44=(p + 2)<0 ? 0: (p + 2); |
| 70 | +
|
| 71 | + //pos |
| 72 | + // int position= (p-1 > 0) * (p-1) ; |
| 73 | + pos1 = (pos11 < (img_height_in - 1)) * pos11 + (pos11 >= (img_height_in - 1)) * (img_height_in - 1); |
| 74 | + pos2 = (pos22 < (img_height_in - 1)) * pos22 + (pos22 >= (img_height_in - 1)) * (img_height_in - 1); |
| 75 | + pos3 = (pos33 < (img_height_in - 1)) * pos33 + (pos33 >= (img_height_in - 1)) * (img_height_in - 1); |
| 76 | + pos4 = (pos44 < (img_height_in - 1)) * pos44 + (pos44 >= (img_height_in - 1)) * (img_height_in - 1); |
| 77 | +
|
| 78 | + } |
| 79 | + */ |
| 80 | + __attribute__((noinline)) void compute_wtsy_f(int row, |
| 81 | + int img_height_in, |
| 82 | + int img_height_out, |
| 83 | + const uint32_t scale_y_fix, |
| 84 | + float scale_y, |
| 85 | + const uint32_t* weighty, |
| 86 | + int8_t& Wy1, |
| 87 | + int8_t& Wy2, |
| 88 | + int8_t& Wy3, |
| 89 | + int8_t& Wy4, |
| 90 | + int& pos1, |
| 91 | + int& pos2, |
| 92 | + int& pos3, |
| 93 | + int& pos4) { |
| 94 | + ::aie::vector<float, 16> _row = ::aie::broadcast<float, 16>((float)row); |
| 95 | + ::aie::vector<float, 16> add_ = ::aie::add(0.5f, _row); |
| 96 | + ::aie::accum<accfloat, 16> acc(::aie::broadcast<float, 16>(-0.5f), 0); |
| 97 | + acc = ::aie::mac(acc, add_, scale_y); |
| 98 | + // pos |
| 99 | + int32 position = (row * scale_y_fix) + (scale_y_fix >> 1) - ToFixed<int64_t, 16>(0.5f); |
| 100 | + int p = (position >> 16); |
| 101 | + |
| 102 | + ::aie::vector<float, 16> dist = ::aie::sub(acc.to_vector<float>(0), ::aie::broadcast<float, 16>((float)p)); |
| 103 | + ::aie::accum<accfloat, 16> mul_Acc = ::aie::mul(dist, ::aie::broadcast<float, 16>(256.0f)); |
| 104 | + ::aie::vector<int32_t, 16> idx = ::aie::to_fixed<int32_t, 16>(mul_Acc.to_vector<bfloat16>(), 0); |
| 105 | + // ::aie::vector<int32_t, 16> idx=::aie::to_fixed<int32_t,16>(dist, 8); |
| 106 | + uint32_t wtsy = weighty[idx[0]]; |
| 107 | + |
| 108 | + Wy1 = (int8_t)(wtsy >> 24); |
| 109 | + Wy2 = (int8_t)(wtsy >> 16); |
| 110 | + Wy3 = (int8_t)(wtsy >> 8); |
| 111 | + Wy4 = (int8_t)(wtsy); |
| 112 | + |
| 113 | + int pos11 = (p - 1) < 0 ? 0 : (p - 1); |
| 114 | + int pos22 = (p) < 0 ? 0 : p; |
| 115 | + int pos33 = (p + 1) < 0 ? 0 : (p + 1); |
| 116 | + int pos44 = (p + 2) < 0 ? 0 : (p + 2); |
| 117 | + |
| 118 | + // pos |
| 119 | + // int position= (p-1 > 0) * (p-1) ; |
| 120 | + pos1 = (pos11 < (img_height_in - 1)) * pos11 + (pos11 >= (img_height_in - 1)) * (img_height_in - 1); |
| 121 | + pos2 = (pos22 < (img_height_in - 1)) * pos22 + (pos22 >= (img_height_in - 1)) * (img_height_in - 1); |
| 122 | + pos3 = (pos33 < (img_height_in - 1)) * pos33 + (pos33 >= (img_height_in - 1)) * (img_height_in - 1); |
| 123 | + pos4 = (pos44 < (img_height_in - 1)) * pos44 + (pos44 >= (img_height_in - 1)) * (img_height_in - 1); |
| 124 | + } |
| 125 | + |
| 126 | + void xf_resize1DV(uint8_t* input, |
| 127 | + uint8_t* output, |
| 128 | + int channels, |
| 129 | + int start_in_row, |
| 130 | + int start_out_row, |
| 131 | + uint32_t scale_y, |
| 132 | + int img_height_in, |
| 133 | + int img_height_out, |
| 134 | + int tile_height_out, |
| 135 | + int tile_width_out, |
| 136 | + const uint32_t* weighty, |
| 137 | + float scale_y_f); |
| 138 | + |
| 139 | + void runImpl(uint8_t* input, |
| 140 | + uint8_t* output, |
| 141 | + // uint8_t* output, |
| 142 | + int channels, |
| 143 | + uint32_t scale_x, |
| 144 | + uint32_t scale_y, |
| 145 | + int img_height_in, |
| 146 | + int img_height_out, |
| 147 | + int tile_height_out, |
| 148 | + int tile_width_out, |
| 149 | + int line_stride_in, |
| 150 | + int img_width_out, |
| 151 | + float scale_y_f); |
| 152 | +}; |
| 153 | + |
| 154 | +__attribute__((noinline)) void Resizebicubic::xf_resize1DV(uint8_t* input, |
| 155 | + uint8_t* output, |
| 156 | + int channels, |
| 157 | + int start_in_row, |
| 158 | + int start_out_row, |
| 159 | + uint32_t scale_y, |
| 160 | + int img_height_in, |
| 161 | + int img_height_out, |
| 162 | + int tile_height_out, |
| 163 | + int tile_width_out, |
| 164 | + const uint32_t* weighty, |
| 165 | + float scale_y_f) { |
| 166 | + const uint32_t* wty = weighty; |
| 167 | + int8_t Wy1, Wy2, Wy3, Wy4; |
| 168 | + int pos1, pos2, pos3, pos4; // y-1 |
| 169 | + ::aie::vector<uint8_t, 64> data_vec1, data_vec2; |
| 170 | + ::aie::accum<acc32, 32> acc1; |
| 171 | + ::aie::accum<acc32, 32> acc2; |
| 172 | + // printf("start_in_row=%d start_out_row=%d\n", start_in_row, start_out_row); |
| 173 | + // printf("tile_height_out=%d tile_width_out=%d\n", tile_height_out, tile_width_out); |
| 174 | + uint8_t* restrict img_out_ptr = (uint8_t*)output; |
| 175 | + set_rnd(rnd_conv_even); |
| 176 | + for (int i = 0; i < tile_height_out; i++) { |
| 177 | + // compute_wtsy(start_out_row + i, img_height_in, img_height_out, scale_y, weighty, Wy1, Wy2, Wy3, Wy4, |
| 178 | + // pos1, pos2,pos3,pos4); |
| 179 | + compute_wtsy_f(start_out_row + i, img_height_in, img_height_out, scale_y, scale_y_f, weighty, Wy1, Wy2, Wy3, |
| 180 | + Wy4, pos1, pos2, pos3, pos4); |
| 181 | + |
| 182 | + ::aie::vector<int8_t, 64> Wy1_y2 = |
| 183 | + ::aie::concat(::aie::broadcast<int8_t, 32>(Wy1), ::aie::broadcast<int8_t, 32>(Wy2)); |
| 184 | + ::aie::vector<int8_t, 64> Wy3_y4 = |
| 185 | + ::aie::concat(::aie::broadcast<int8_t, 32>(Wy3), ::aie::broadcast<int8_t, 32>(Wy4)); |
| 186 | + int y_idx1 = (pos1 - start_in_row) * (tile_width_out * channels); // y-1 |
| 187 | + int y_idx2 = (pos2 - start_in_row) * (tile_width_out * channels); |
| 188 | + int y_idx3 = (pos3 - start_in_row) * (tile_width_out * channels); |
| 189 | + int y_idx4 = (pos4 - start_in_row) * (tile_width_out * channels); |
| 190 | + uint8_t* restrict img_in_ptr1 = (uint8_t*)(input + y_idx1); // y-1 |
| 191 | + uint8_t* restrict img_in_ptr2 = (uint8_t*)(input + y_idx2); // y |
| 192 | + uint8_t* restrict img_in_ptr3 = (uint8_t*)(input + y_idx3); // y+1 |
| 193 | + uint8_t* restrict img_in_ptr4 = (uint8_t*)(input + y_idx4); // y+2 |
| 194 | + |
| 195 | + for (int j = 0; j < ((tile_width_out * 4) / 32); j++) chess_prepare_for_pipelining chess_loop_range(32, ) { |
| 196 | + data_vec1.insert(0, ::aie::load_v<32>(img_in_ptr1)); |
| 197 | + data_vec1.insert(1, ::aie::load_v<32>(img_in_ptr2)); |
| 198 | + data_vec2.insert(0, ::aie::load_v<32>(img_in_ptr3)); |
| 199 | + data_vec2.insert(1, ::aie::load_v<32>(img_in_ptr4)); |
| 200 | + img_in_ptr1 += 32; |
| 201 | + img_in_ptr2 += 32; |
| 202 | + img_in_ptr3 += 32; |
| 203 | + img_in_ptr4 += 32; |
| 204 | + |
| 205 | + acc1 = mul_elem_32_2(data_vec1, Wy1_y2); |
| 206 | + acc2 = mac_elem_32_2(data_vec2, Wy3_y4, acc1); |
| 207 | + set_sat(); |
| 208 | + ::aie::store_v(img_out_ptr, acc2.template to_vector<uint8_t>(7)); |
| 209 | + img_out_ptr += 32; |
| 210 | + } |
| 211 | + } |
| 212 | + set_rnd(rnd_floor); |
| 213 | +} |
| 214 | + |
| 215 | +/*__attribute__((noinline)) void Resizebicubic::xf_resize1DV(uint8_t* input, |
| 216 | + uint8_t* output, |
| 217 | + int channels, |
| 218 | + int start_in_row, |
| 219 | + int start_out_row, |
| 220 | + uint32_t scale_y, |
| 221 | + int img_height_in, |
| 222 | + int img_height_out, |
| 223 | + int tile_height_out, |
| 224 | + int tile_width_out, |
| 225 | + const |
| 226 | +uint32_t* weighty) { |
| 227 | + const uint32_t* wty = weighty; |
| 228 | + int8_t Wy1,Wy2,Wy3,Wy4; |
| 229 | + int pos1, pos2, pos3, pos4; //y-1 |
| 230 | + ::aie::vector<uint8_t, 64> data_vec1, data_vec2; |
| 231 | + ::aie::accum<acc32, 32> acc1; |
| 232 | + ::aie::accum<acc32, 32> acc2; |
| 233 | + int k=0; |
| 234 | +// printf("start_in_row=%d start_out_row=%d\n", start_in_row, start_out_row); |
| 235 | +// printf("tile_height_out=%d tile_width_out=%d\n", tile_height_out, tile_width_out); |
| 236 | + uint8_t* restrict img_out_ptr = (uint8_t*)output; |
| 237 | + set_rnd(rnd_conv_even); |
| 238 | +
|
| 239 | + for (int j = 0; j < tile_width_out; j +=32){ |
| 240 | + uint8_t* restrict img_out_ptr = (uint8_t*)(output + j); |
| 241 | + for(int i=0; i<tile_height_out;i++) chess_prepare_for_pipelining{ |
| 242 | +
|
| 243 | + compute_wtsy(start_out_row + i, img_height_in, img_height_out, scale_y, weighty, Wy1, Wy2, Wy3, Wy4, pos1, |
| 244 | +pos2,pos3,pos4); |
| 245 | +// printf("pos1=%d pos2=%d pos3=%d pos4=%d\n", pos1, pos2, pos3, pos4); |
| 246 | + ::aie::vector<int8_t, 64> Wy1_y2 = ::aie::concat(::aie::broadcast<int8_t, 32>(Wy1), ::aie::broadcast<int8_t, |
| 247 | +32>(Wy2)); |
| 248 | + ::aie::vector<int8_t, 64> Wy3_y4 = ::aie::concat(::aie::broadcast<int8_t, 32>(Wy3), ::aie::broadcast<int8_t, |
| 249 | +32>(Wy4)); |
| 250 | + int y_idx1 = (pos1 - start_in_row) * (tile_width_out*channels); //y-1 |
| 251 | + int y_idx2 = (pos2 - start_in_row) * (tile_width_out*channels); |
| 252 | + int y_idx3 = (pos3 - start_in_row) * (tile_width_out*channels); |
| 253 | + int y_idx4 = (pos4 - start_in_row) * (tile_width_out*channels); |
| 254 | + uint8_t* restrict img_in_ptr1 = (uint8_t*)(input + y_idx1); //y-1 |
| 255 | + uint8_t* restrict img_in_ptr2 = (uint8_t*)(input + y_idx2); //y |
| 256 | + uint8_t* restrict img_in_ptr3 = (uint8_t*)(input + y_idx3); //y+1 |
| 257 | + uint8_t* restrict img_in_ptr4 = (uint8_t*)(input + y_idx4); //y+2 |
| 258 | +
|
| 259 | + data_vec1.insert(0, ::aie::load_v<32>(img_in_ptr1)); |
| 260 | + data_vec1.insert(1, ::aie::load_v<32>(img_in_ptr2)); |
| 261 | + data_vec2.insert(0, ::aie::load_v<32>(img_in_ptr3)); |
| 262 | + data_vec2.insert(1, ::aie::load_v<32>(img_in_ptr4)); |
| 263 | +
|
| 264 | + acc1 = mul_elem_32_2(data_vec1, Wy1_y2); |
| 265 | + acc2 = mac_elem_32_2(data_vec2, Wy3_y4, acc1); |
| 266 | + set_sat(); |
| 267 | + // chess_report(y_idx1); |
| 268 | + // chess_report(y_idx2); |
| 269 | + // chess_report(y_idx3); |
| 270 | + // chess_report(y_idx4); |
| 271 | + // chess_report(data_vec1); |
| 272 | + // chess_report(Wy1_y2); |
| 273 | + // chess_report(Wy3_y4); |
| 274 | + // chess_report(data_vec2); |
| 275 | + // chess_report(acc2); |
| 276 | + // chess_report(acc2.template to_vector<uint8_t>(7)); |
| 277 | +
|
| 278 | + ::aie::store_v(img_out_ptr, acc2.template to_vector<uint8_t>(7)); |
| 279 | + clr_sat(); |
| 280 | +
|
| 281 | + img_out_ptr+=32; |
| 282 | + } |
| 283 | + } |
| 284 | + set_rnd(rnd_floor); |
| 285 | +} |
| 286 | +*/ |
| 287 | +__attribute__((noinline)) void Resizebicubic::runImpl(uint8_t* input, |
| 288 | + uint8_t* output, |
| 289 | + int channels, |
| 290 | + uint32_t scale_x, |
| 291 | + uint32_t scale_y, |
| 292 | + int img_height_in, |
| 293 | + int img_height_out, |
| 294 | + int tile_height_out, |
| 295 | + int tile_width_out, |
| 296 | + int line_stride_in, |
| 297 | + int img_width_out, |
| 298 | + float scale_y_f) { |
| 299 | + int start_out_row = xfGetTileOutPosV(input); |
| 300 | + int start_in_row = xfGetTilePosV(input); |
| 301 | + |
| 302 | + // printf("start_out_row=%d start_in_row=%d\n", start_out_row, start_in_row); |
| 303 | + |
| 304 | + xfCopyMetaData(input, output); |
| 305 | + uint8_t* ptr_in = (uint8_t*)xfGetImgDataPtr(input); |
| 306 | + uint8_t* ptr_out = (uint8_t*)xfGetImgDataPtr(output); |
| 307 | + |
| 308 | + xf_resize1DV((uint8_t*)ptr_in, (uint8_t*)ptr_out, channels, start_in_row, start_out_row, scale_y, img_height_in, |
| 309 | + img_height_out, tile_height_out, tile_width_out, mwtsY, scale_y_f); |
| 310 | +} |
| 311 | + |
| 312 | +} // aie |
| 313 | +} // cv |
| 314 | +} // xf |
| 315 | + |
| 316 | +#endif |
0 commit comments