Fix linter warnings
vilukissa68 committed Dec 10, 2024
1 parent 949fe5b commit 30bbfda
Showing 5 changed files with 73 additions and 48 deletions.
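The warnings fixed here fall into a few recurring rustc/clippy categories, all visible in the diffs below: unused parameters and locals get a leading underscore, `mut` is dropped from bindings that are never mutated, a `&Vec<i16>` argument becomes a slice (`clippy::ptr_arg`), a hand-rolled ceiling division becomes `div_ceil`, redundant `clone()` calls on `Copy` enums are removed, and the wide FFI signatures opt out of `clippy::too_many_arguments`. A minimal standalone sketch of those patterns (illustrative names only, not code from this repository):

// clippy::too_many_arguments: FFI entry points mirror a fixed C signature,
// so the lint is allowed per function instead of refactoring the ABI.
#[allow(clippy::too_many_arguments)]
fn launch(a: i8, b: i8, c: i8, d: i8, e: i8, f: i8, g: i8, h: i8) -> i8 {
    a + b + c + d + e + f + g + h
}

// unused_variables: parameters the ABI requires but the body ignores are
// kept, renamed with a leading underscore.
fn configure(_pad_value: i32, _pp_clip: u32) {}

// clippy::ptr_arg: accept &[i16] instead of &Vec<i16>.
fn abs_max(bias: &[i16]) -> u32 {
    bias.iter().map(|&x| (x as i32).unsigned_abs()).max().unwrap_or(0)
}

// Manual ceiling division, (bytes + bank - 1) / bank, becomes div_ceil
// (stable for unsigned integers since Rust 1.73).
fn banks_needed(bytes: usize, bank: usize) -> usize {
    bytes.div_ceil(bank)
}

fn main() {
    let x = launch(1, 1, 1, 1, 1, 1, 1, 1); // `let`, not `let mut`: x is never reassigned
    configure(0, 7);
    assert_eq!(abs_max(&[3, -1000, 42]), 1000);
    assert_eq!(banks_needed(5000, 4096), 2);
    assert_eq!(x, 8);
}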
28 changes: 17 additions & 11 deletions examples/hpc/dla-driver-ffi/src/lib.rs
@@ -4,18 +4,18 @@
#![no_std]
#![no_main]

#[macro_use]
extern crate alloc;
use alloc::vec::Vec;
use core::ffi::{c_char, CStr};
use core::slice;
use dla_driver::layers::{conv2d, conv2d_bias, conv2d_bias_relu, conv2d_relu, grouped_conv2d};
use dla_driver::tensor3::{rescale, Order3, Tensor3};
use dla_driver::tensor3::{Order3, Tensor3};
use dla_driver::tensor4::{Order4, Tensor4};
use dla_driver::utils::optimal_pp_bias_heuristic;
use dla_driver::{Padding, Stride};

/// Converts C types to DLA Tensors for use with the high-level layer functions
#[allow(clippy::too_many_arguments)]
unsafe fn ffi_data_import(
input_data: *const i8,
input_channels: usize,
@@ -29,7 +29,7 @@ unsafe fn ffi_data_import(
kernel_width: usize,
kernel_order: *const c_char,
) -> (Tensor3<i8>, Tensor4<i8>) {
let mut input_data: Vec<i8> = unsafe {
let input_data: Vec<i8> = unsafe {
slice::from_raw_parts(input_data, input_channels * input_height * input_width).to_vec()
};

@@ -77,6 +77,7 @@ pub unsafe extern "C" fn dla_init() {

/// Executes Conv2D on DLA with given parameters and writes result to output buffer.
#[no_mangle]
#[allow(clippy::too_many_arguments)]
pub unsafe extern "C" fn dla_conv2d(
input_data: *const i8,
kernel_data: *const i8,
@@ -141,6 +142,7 @@ pub unsafe extern "C" fn dla_conv2d(

/// Executes Conv2D + ReLU on DLA with given parameters and writes result to output buffer.
#[no_mangle]
#[allow(clippy::too_many_arguments)]
pub unsafe extern "C" fn dla_conv2d_relu(
input_data: *const i8,
kernel_data: *const i8,
@@ -208,6 +210,7 @@ pub unsafe extern "C" fn dla_conv2d_relu(
///
/// * `bias` - Bias is actually i16 in hardware; i32 is used here for TVM compatibility
#[no_mangle]
#[allow(clippy::too_many_arguments)]
pub unsafe extern "C" fn dla_conv2d_bias(
input_data: *const i8,
kernel_data: *const i8,
@@ -282,6 +285,7 @@ pub unsafe extern "C" fn dla_conv2d_bias(
///
/// * `bias` - Buffer containing bias data. NOTE: Bias is actually i16 in hardware; i32 is used here for TVM compatibility
#[no_mangle]
#[allow(clippy::too_many_arguments)]
pub unsafe extern "C" fn dla_conv2d_bias_relu(
input_data: *const i8,
kernel_data: *const i8,
@@ -366,6 +370,7 @@ pub unsafe extern "C" fn dla_conv2d_bias_relu(
///
/// * `bias` - Buffer containing bias data. NOTE: Bias is actually i16 in hardware; i32 is used here for TVM compatibility
#[no_mangle]
#[allow(clippy::too_many_arguments)]
pub unsafe extern "C" fn dla_tvm_qnn_conv2d_bias(
input_data: *const i8,
kernel_data: *const i8,
@@ -385,11 +390,11 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_bias(
pad_right: u32,
pad_left: u32,
pad_bottom: u32,
pad_value: i32,
_pad_value: i32,
stride_x: u32,
stride_y: u32,
mac_clip: u32,
pp_clip: u32,
_pp_clip: u32,
) {
let (input_tensor, kernels_tensor) = unsafe {
ffi_data_import(
@@ -419,7 +424,7 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_bias(
//let optimized_pp = optimal_pp_bias_heuristic(&bias);
let optimized_pp = 7;

let mut result: Tensor3<i8> = conv2d_bias(
let result: Tensor3<i8> = conv2d_bias(
input_tensor,
kernels_tensor,
bias,
@@ -439,7 +444,7 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_bias(
None,
);

let input_order_string = unsafe { CStr::from_ptr(input_order).to_str().unwrap_unchecked() };
let _input_order_string = unsafe { CStr::from_ptr(input_order).to_str().unwrap_unchecked() };

// TVM requantization and clip
// NOTE:(20240927 [email protected]) on DLA clipping behaviour with TVM.
@@ -460,6 +465,7 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_bias(
///
/// * `bias` - Buffer containing bias data. NOTE: Bias is actually i16 in hardware; i32 is used here for TVM compatibility
#[no_mangle]
#[allow(clippy::too_many_arguments)]
pub unsafe extern "C" fn dla_tvm_qnn_conv2d_grouped_bias(
input_data: *const i8,
kernel_data: *const i8,
@@ -480,11 +486,11 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_grouped_bias(
pad_right: u32,
pad_left: u32,
pad_bottom: u32,
pad_value: i32,
_pad_value: i32,
stride_x: u32,
stride_y: u32,
mac_clip: u32,
pp_clip: u32,
_pp_clip: u32,
) {
let (input_tensor, kernels_tensor) = unsafe {
ffi_data_import(
@@ -513,7 +519,7 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_grouped_bias(

let optimized_pp = optimal_pp_bias_heuristic(&bias);

let mut result: Tensor3<i8> = grouped_conv2d(
let result: Tensor3<i8> = grouped_conv2d(
input_tensor,
kernels_tensor,
bias,
@@ -534,7 +540,7 @@ pub unsafe extern "C" fn dla_tvm_qnn_conv2d_grouped_bias(
groups,
);

let input_order_string = unsafe { CStr::from_ptr(input_order).to_str().unwrap_unchecked() };
let _input_order_string = unsafe { CStr::from_ptr(input_order).to_str().unwrap_unchecked() };

// TVM requantization and clip
// NOTE:(20240927 [email protected]) on DLA clipping behaviour with TVM.
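Most of the churn in lib.rs is underscore-prefixing parameters that the TVM-facing C ABI still has to accept (`_pad_value`, `_pp_clip`, the unused `_input_order_string`) and dropping `mut` from bindings that are no longer modified in place. The pointer import at the top of the file follows the usual raw-slice pattern; here is a minimal sketch of that one step, with a hypothetical name and the driver-specific tensor construction left out:

use core::slice;

/// Sketch of the import step inside ffi_data_import (hypothetical helper).
///
/// Safety: `data` must be non-null, properly aligned, and valid for `len`
/// consecutive reads; the contents are copied into an owned Vec.
unsafe fn import_buffer(data: *const i8, len: usize) -> Vec<i8> {
    unsafe { slice::from_raw_parts(data, len).to_vec() }
}

The real function passes `input_channels * input_height * input_width` as the length; that contract rests entirely on the C caller, since nothing on the Rust side can validate it.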
12 changes: 4 additions & 8 deletions examples/hpc/dla-driver/src/layers.rs
@@ -3,8 +3,6 @@ use crate::tensor4::{Order4, Tensor4};
use crate::{Dla, InputSize, KernelSize, LayerConfig, Padding, SimdBitMode, Stride};
use alloc::vec::Vec;

use headsail_bsp::sprintln;

use crate::utils::{calculate_conv2d_out_param_dim, get_banks_for_layer};

// Define a trait for output handling
@@ -190,19 +188,19 @@ pub fn grouped_conv2d<T: DlaOutput + Clone>(
mac_clip: Option<u32>,
pp_clip: Option<u32>,
simd_mode: Option<SimdBitMode>,
groups: usize
groups: usize,
) -> Tensor3<T> {
let total_in_channels = input.channels();
let total_out_channels = kernels.kernels();
let group_in_channels = total_in_channels / groups;
let group_out_channels = kernels.kernels() / groups;

// Placeholder for the output tensor
let mut output_tensors = Vec::new();

for g in 0..groups {
let input_group = input.slice_channels(g * group_in_channels..(g + 1)*group_in_channels);
let kernels_group = kernels.slice_channels(g * group_in_channels..(g + 1)*group_in_channels);
let input_group = input.slice_channels(g * group_in_channels..(g + 1) * group_in_channels);
let kernels_group =
kernels.slice_channels(g * group_in_channels..(g + 1) * group_in_channels);
let bias_group = bias[g * group_out_channels..(g + 1) * group_out_channels].to_vec();

let output_group = run_layers(
@@ -223,10 +221,8 @@

// Concatenate the output tensors along the channel dimension
Tensor3::concat_interleaved(output_tensors)

}


fn run_layers<T: DlaOutput + Clone>(
input: Tensor3<i8>,
kernels: Tensor4<i8>,
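The substance of grouped_conv2d is channel bookkeeping: the input channels are split into `groups` equal slices, each group is convolved independently, and the per-group outputs are recombined. A small sketch of just the range arithmetic, assuming (as the code above implicitly does) that the channel counts divide evenly by `groups`:

/// Hypothetical helper: per-group input and output channel ranges.
fn group_ranges(
    in_channels: usize,
    out_channels: usize,
    groups: usize,
) -> Vec<(core::ops::Range<usize>, core::ops::Range<usize>)> {
    let gin = in_channels / groups; // input channels seen by each group
    let gout = out_channels / groups; // output channels produced by each group
    (0..groups)
        .map(|g| (g * gin..(g + 1) * gin, g * gout..(g + 1) * gout))
        .collect()
}

fn main() {
    // 8 input channels, 4 output channels, 2 groups: group 0 reads input
    // channels 0..4 and produces outputs 0..2; group 1 reads 4..8, produces 2..4.
    assert_eq!(group_ranges(8, 4, 2), vec![(0..4, 0..2), (4..8, 2..4)]);
}

In the driver itself the input and kernel tensors are sliced with the `group_in_channels` range and the bias with the `group_out_channels` range before each group is dispatched to `run_layers`.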
30 changes: 20 additions & 10 deletions examples/hpc/dla-driver/src/tensor3.rs
@@ -1,6 +1,6 @@
use alloc::vec::*;
use core::ffi::c_char;
use ndarray::{Array, Array3, ArrayView3 , Axis, s, stack, concatenate};
use ndarray::{s, Array, Array3};

#[derive(Clone, Copy, Debug, PartialEq)]
pub enum Order3 {
@@ -185,9 +185,14 @@ impl<T: Clone> Tensor3<T> {

/// Concatenates a Tensor along the least significant axis (axis=2) by interleaving the tensors
pub fn concat_interleaved(tensors: Vec<Tensor3<T>>) -> Tensor3<T> {
let target_order = tensors[0].order();
let (height, width, channels) = (tensors[0].height(), tensors[0].width(), tensors[0].channels());
let mut intermediary_buffer: Vec<T> = Vec::with_capacity(height * width * channels * tensors.len());
let _target_order = tensors[0].order();
let (height, width, channels) = (
tensors[0].height(),
tensors[0].width(),
tensors[0].channels(),
);
let mut intermediary_buffer: Vec<T> =
Vec::with_capacity(height * width * channels * tensors.len());
for h in 0..height {
for w in 0..width {
for c in 0..channels {
@@ -197,7 +202,14 @@
}
}
}
Tensor3::from_data_buffer(channels * tensors.len(), height, width, intermediary_buffer, Order3::HWC).unwrap()
Tensor3::from_data_buffer(
channels * tensors.len(),
height,
width,
intermediary_buffer,
Order3::HWC,
)
.unwrap()
}

/// Slice tensors channel axis with the given range
@@ -217,7 +229,7 @@

Tensor3 {
data: sliced_data,
order: self.order.clone(),
order: self.order,
}
}

@@ -271,7 +283,6 @@ impl<T: Clone> Tensor3<T> {
data.permute(order);
data.to_buffer()
}

}

pub fn rescale(
@@ -298,10 +309,9 @@ pub fn rescale(
};

channel_slice.map_inplace(|x| {
let value = (input_scale / scale) * (*x as f32 * pre_scale - input_zero as f32)
+ output_zero as f32;
let value = (input_scale / scale) * (*x as f32 * pre_scale - input_zero as f32)
+ output_zero as f32;
*x = value.clamp(i8::MIN as f32, i8::MAX as f32) as i8

});
}
}
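concat_interleaved is what stitches the per-group outputs back together. Judging from the visible loop nest (height, then width, then channel, with the per-tensor push in the collapsed innermost lines), the tensors are interleaved channel by channel at every spatial position rather than stacked block-wise. A flat-buffer sketch of that ordering for HWC data, under that assumption and with hypothetical names:

/// Hypothetical flat-buffer version of concat_interleaved for HWC data:
/// at each (h, w, c) the element from every tensor is emitted in turn, so
/// the output channel order is t0c0, t1c0, t0c1, t1c1, ...
fn concat_interleaved_hwc(tensors: &[Vec<i8>], h: usize, w: usize, c: usize) -> Vec<i8> {
    let mut out = Vec::with_capacity(h * w * c * tensors.len());
    for y in 0..h {
        for x in 0..w {
            for ch in 0..c {
                for t in tensors {
                    out.push(t[(y * w + x) * c + ch]); // HWC flat index
                }
            }
        }
    }
    out
}

fn main() {
    // Two 1x1 "images" with 2 channels each.
    let a = vec![1i8, 2];
    let b = vec![10i8, 20];
    assert_eq!(concat_interleaved_hwc(&[a, b], 1, 1, 2), vec![1, 10, 2, 20]);
}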
44 changes: 29 additions & 15 deletions examples/hpc/dla-driver/src/tensor4.rs
@@ -1,7 +1,6 @@
use alloc::vec::*;
use core::ffi::c_char;
#[macro_use]
use ndarray::{Array, Array4, s, concatenate};
use ndarray::{s, Array, Array4};

#[derive(Clone, Copy, Debug, PartialEq)]
pub enum Order4 {
@@ -280,18 +279,33 @@ impl<T: Clone> Tensor4<T> {
pub fn slice_channels(&self, c_range: core::ops::Range<usize>) -> Tensor4<T> {
// Determine the index of the channel dimension based on the tensor order
let kernel_axis = match self.order {
Order4::KCHW | Order4::KCWH | Order4::KHWC |
Order4::KHCW | Order4::KWCH | Order4::KWHC => 0,

Order4::CKHW | Order4::CKWH | Order4::HKCW |
Order4::HKWC | Order4::WKCH | Order4::WKHC => 1,

Order4::CHKW | Order4::CWKH | Order4::HCKW |
Order4::HWKC | Order4::WCKH | Order4::WHKC => 2,

Order4::CHWK | Order4::CWHK | Order4::HWCK |
Order4::HCWK | Order4::WCHK | Order4::WHCK => 3,

Order4::KCHW
| Order4::KCWH
| Order4::KHWC
| Order4::KHCW
| Order4::KWCH
| Order4::KWHC => 0,

Order4::CKHW
| Order4::CKWH
| Order4::HKCW
| Order4::HKWC
| Order4::WKCH
| Order4::WKHC => 1,

Order4::CHKW
| Order4::CWKH
| Order4::HCKW
| Order4::HWKC
| Order4::WCKH
| Order4::WHKC => 2,

Order4::CHWK
| Order4::CWHK
| Order4::HWCK
| Order4::HCWK
| Order4::WCHK
| Order4::WHCK => 3,
};

// Create a slice pattern for `s![]` by slicing only on the channels axis
@@ -307,7 +321,7 @@ impl<T: Clone> Tensor4<T> {
// Return a new Tensor4 with the sliced data and the same order
Tensor4 {
data: sliced_data,
order: self.order.clone(),
order: self.order,
}
}

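The reflowed match in slice_channels is pure formatting; what it computes is unchanged. The Order4 variant names spell out the memory layout as a permutation of K, C, H, and W, and the match simply answers at which axis K sits. A hypothetical string-based equivalent, just to make the mapping explicit:

/// Hypothetical equivalent of the match above: the kernel axis is the
/// position of 'K' in the layout name.
fn kernel_axis(order_name: &str) -> usize {
    order_name.find('K').expect("every Order4 name contains K")
}

fn main() {
    assert_eq!(kernel_axis("KCHW"), 0); // K leads: axis 0
    assert_eq!(kernel_axis("CKWH"), 1);
    assert_eq!(kernel_axis("HWKC"), 2);
    assert_eq!(kernel_axis("WHCK"), 3); // K trails: axis 3
}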
7 changes: 3 additions & 4 deletions examples/hpc/dla-driver/src/utils.rs
@@ -62,7 +62,7 @@ pub fn generate_output_tensor<I: Clone, K: Clone, O: Clone>(
/// * `bytes` - Number of bytes the data contains
pub fn calculate_number_of_banks_needed(bytes: usize) -> usize {
// Take ceil
(bytes + (MEMORY_BANK_SIZE - 1)) / MEMORY_BANK_SIZE
bytes.div_ceil(MEMORY_BANK_SIZE)
}

/// Assigns data banks for layer data
@@ -118,13 +118,12 @@ fn calculate_same_padding(input: (u32, u32), kernel: (u32, u32), stride: Stride)
}
}


/// Calculate optimal amount of PP clip based on bias heuristic for minimal loss in granularity
pub fn optimal_pp_bias_heuristic(bias: &Vec<i16>) -> u32 {
pub fn optimal_pp_bias_heuristic(bias: &[i16]) -> u32 {
let abs_max = bias.iter().map(|&x| x.abs() as i32).max().unwrap_or(0) as u32;
let pp = (abs_max.max(127) / 127).ilog2() + 1;
if pp > 8 {
return 8
return 8;
}
pp
}
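Two of the utils.rs changes deserve a note. `bytes.div_ceil(MEMORY_BANK_SIZE)` is the standard-library replacement for the `(bytes + (MEMORY_BANK_SIZE - 1)) / MEMORY_BANK_SIZE` idiom: same ceiling, but without the intermediate addition that can overflow near `usize::MAX`. The heuristic itself grants one extra bit of post-processing clip for each doubling of the bias magnitude beyond the i8 range, capped at 8. A worked check that mirrors the arithmetic above (sketch only, minus the slice plumbing):

fn pp_for(abs_max: u32) -> u32 {
    let pp = (abs_max.max(127) / 127).ilog2() + 1;
    pp.min(8) // same effect as the `if pp > 8` early return
}

fn main() {
    assert_eq!(pp_for(100), 1); // within i8 range: 127 / 127 = 1, ilog2 = 0, +1 = 1
    assert_eq!(pp_for(1000), 3); // 1000 / 127 = 7, ilog2(7) = 2, +1 = 3
    assert_eq!(pp_for(u32::MAX), 8); // clamped to 8
}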
