Skip to content

oneAPI backend update: kernel and layer optimizations #1218

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
3 changes: 3 additions & 0 deletions hls4ml/backends/oneapi/oneapi_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_para
# TODO: add namespace
'WriteTar': write_tar,
}

if 'use_bsp' in _:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably extract this as an actual parameter (and add info about it in the docstring). The _ are generally meant to be ignored, I think. We should also use the same capitalization convention. Also, I am not sure I see how this is used. I saw the define in the C++, but not if this is used to set the define (though I may have missed it.)

config['IS_BSP'] = True

return config

Expand Down
28 changes: 19 additions & 9 deletions hls4ml/backends/oneapi/oneapi_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,18 @@ def definition_cpp(self, name_suffix='', as_reference=False):
else:
return f'{self.type.name} {self.name}{name_suffix}'

def declare_cpp(self, pipe_min_size=0, indent=''):
lines = indent + f'class {self.pipe_id};\n'
lines += indent + (
f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, '
+ f'{self.type.name}, {pipe_min_size}, PipeProps>;\n'
# Updated pipe min size to be 32 for simulation.
def declare_cpp(self, pipe_min_size=32, indent=''):
# Updated to use streaming beat for restartable streaming kernel.
# Streaming beat is a wrapper type of the actual type with sideband control signals.
# Syntax: using BeatT = sycl::ext::intel::experimental::StreamingBeat<DataT, eop, empty>;
streaming_beat_t = f"{self.pipe_name}BeatT"
lines = (
f"{indent}class {self.pipe_id};\n"
f"{indent}using {streaming_beat_t} = "
f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n"
f"{indent}using {self.pipe_name} = sycl::ext::intel::experimental::pipe<"
f"{self.pipe_id}, {streaming_beat_t}, {pipe_min_size}, HostPipePropertiesT>;\n"
)
return lines

Expand All @@ -193,10 +200,13 @@ def definition_cpp(self, name_suffix='', as_reference=True):
return f'{self.name}{name_suffix}'

def declare_cpp(self, indent=''):
lines = indent + f'class {self.pipe_id};\n'
lines += indent + (
f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, '
+ f'{self.type.name}, {self.pragma[-1]}>;\n'
streaming_beat_t = f"{self.pipe_name}BeatT";
lines = (
f"{indent}class {self.pipe_id};\n"
f"{indent}using {streaming_beat_t} = "
f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n"
f"{indent}using {self.pipe_name} = "
f"sycl::ext::intel::experimental::pipe<{self.pipe_id}, {streaming_beat_t}, {self.pragma[-1]}>;\n"
)
return lines

Expand Down
5 changes: 3 additions & 2 deletions hls4ml/templates/oneapi/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,13 @@ set(LIBRARY_NAME myproject-${LIB_STAMP})
# You can also specify a device family (E.g. "Arria10" or "Stratix10") or a
# specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP.
if(NOT DEFINED FPGA_DEVICE)
set(FPGA_DEVICE "Arria10")
set(FPGA_DEVICE "Agilex7")
endif()

# Use cmake -DUSER_FPGA_FLAGS=<flags> to set extra flags for FPGA backend
# compilation.
set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS})
# -Xsoptimize=latency Turns off the hyper-optimized handshake
set(USER_FPGA_FLAGS -Wno-unused-label;${USER_FPGA_FLAGS};-Xsoptimize=latency)

# Use cmake -DUSER_FLAGS=<flags> to set extra flags for general compilation.
set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS})
Expand Down
123 changes: 123 additions & 0 deletions hls4ml/templates/oneapi/firmware/myproject.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,129 @@
// currently this is fixed
using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>));

// Pipe properties for host pipes. Host pipes connect to the data source DMA and sink DMA.
// They are connected to the first and the last layer to stream data into and out from the kernel.
using HostPipePropertiesT = decltype(sycl::ext::oneapi::experimental::properties(
sycl::ext::intel::experimental::ready_latency<0>,
sycl::ext::intel::experimental::bits_per_symbol<8>,
sycl::ext::intel::experimental::uses_valid<true>,
sycl::ext::intel::experimental::first_symbol_in_high_order_bits<true>,
sycl::ext::intel::experimental::protocol_avalon_streaming_uses_ready
));


namespace nnet {

#if !defined(IS_BSP)
// Definition for buffer locations for Avalon MM host.
inline constexpr unsigned kInputBufferLocation = 0;
inline constexpr unsigned kOutputBufferLocation = 1;
#endif

// Implementation of a direct memory access kernel. Move data from source, convert,
// and send to the sink. Adaptive to SYCL HLS and hardware acceleration flow.
template <class src_T, class dest_pipe>
struct DMA_convert_data {
#if !defined(IS_BSP)
// When targeting a device family, we instantiate an Avalon Memory Mapped Host for
// data transaction between host and the DMA kernel during emulation and simulation.
sycl::ext::oneapi::experimental::annotated_arg<src_T *,
decltype(sycl::ext::oneapi::experimental::properties{
sycl::ext::intel::experimental::latency<0>,
sycl::ext::intel::experimental::dwidth<16>,
sycl::ext::intel::experimental::buffer_location<kInputBufferLocation>,
sycl::ext::intel::experimental::read_write_mode_read,
sycl::ext::intel::experimental::wait_request_requested})>
#else
// When targeting oneAPI BSP, we can use USM pointer to access host memory.
src_T *const
#endif
src;
size_t num_iteration;

[[intel::kernel_args_restrict]]
void operator()() const {

#if defined(IS_BSP)
// Access data using host pointer.
sycl::ext::intel::host_ptr<src_T> src_ptr(src);
#else
// Host allocation is not supported when targeting an FPGA family or part number.
src_T *src_ptr(src);
#endif
// First, extract the PipeDataT from the pipe
using PipeDataType = typename nnet::ExtractPipeType<dest_pipe>::value_type;
// Then, extract the DataT from StreamingBeat
using DstDataType = typename nnet::ExtractDataType<PipeDataType>::value_type;
constexpr auto dstTypeSize = std::tuple_size<DstDataType>{};

[[intel::fpga_register]]
typename nnet::ExtractPipeType<dest_pipe>::value_type packet;

// Keep sending data to the input layer and keep the kernels running.
for (size_t i = 0; i < num_iteration; i++) {
#pragma unroll
for (size_t j = 0; j < dstTypeSize; j++) {
packet.data[j] = src_ptr[i * dstTypeSize + j];
}
packet.sop = (i == 0);
// Assert end-of-packet signal after the last iteration.
// All down-stream kernels will stop seeing eop.
packet.eop = (i == (num_iteration - 1));
dest_pipe::write(packet);
}
}
};

// Symmetrical to the DMA_convert_data above, this DMA drains the output pipe and
// writes result to memory.
template <class src_pipe, class dst_T>
struct DMA_convert_data_back {
#if !defined(IS_BSP)
// Without BSP, instantiate an Avalon Memory Mapped Host to write to host.
sycl::ext::oneapi::experimental::annotated_arg<dst_T *,
decltype(sycl::ext::oneapi::experimental::properties{
sycl::ext::intel::experimental::latency<0>,
sycl::ext::intel::experimental::dwidth<16>,
sycl::ext::intel::experimental::buffer_location<kOutputBufferLocation>,
sycl::ext::intel::experimental::read_write_mode_write,
sycl::ext::intel::experimental::wait_request_requested})>
#else
// USM pointer, otherwise.
dst_T *const
#endif
dst;
size_t num_iteration;

[[intel::kernel_args_restrict]]
void operator()() const {
#if defined(IS_BSP)
sycl::ext::intel::host_ptr<dst_T> dst_ptr(dst);
#else
dst_T *dst_ptr(dst);
#endif
// First, extract the PipeDataT from the pipe
using PipeDataType = typename nnet::ExtractPipeType<src_pipe>::value_type;
// Then, extract the DataT from StreamingBeat
using SrcDataType = typename nnet::ExtractDataType<PipeDataType>::value_type;
constexpr auto srcTypeSize = std::tuple_size<SrcDataType>{};

[[intel::fpga_register]]
typename nnet::ExtractPipeType<src_pipe>::value_type packet;

// Drain the output pipe and write result to memory.
for (size_t i = 0; i < num_iteration; i++) {
packet = src_pipe::read();
#pragma unroll
for (size_t j = 0; j < srcTypeSize; j++) {
dst_ptr[i * srcTypeSize + j] = static_cast<dst_T>(packet.data[j].to_double());
}
}
}
};

} // namespace nnet

// Need to declare the input and output pipes

// hls-fpga-machine-learning insert inputs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,23 +29,33 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void linear_stream
// *************************************************
// ReLU Activation
// *************************************************
template <class data_pipe, class res_pipe, typename CONFIG_T> void relu_stream() {
template <class data_pipe, class res_pipe, typename CONFIG_T>
[[intel::use_stall_enable_clusters]] void relu_stream() {
using namespace nnet;
using ResT = typename ExtractDataType<typename ExtractPipeType<res_pipe>::value_type>::value_type;
[[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type out_data;

bool keep_going = true;
ReLUActLoop:
[[intel::initiation_interval(
1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; i++) {
auto in_data = data_pipe::read();
typename ExtractPipeType<res_pipe>::value_type out_data;
[[intel::initiation_interval(1)]]
while(keep_going) {
for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<ResT>{}; i++) {
[[intel::fpga_register]] auto in_data = data_pipe::read();
ReLUPackLoop:
#pragma unroll
for (int j = 0; j < std::tuple_size<ResT>{}; j++) {
if (in_data.data[j] > 0)
out_data.data[j] = in_data.data[j];
else
out_data.data[j] = 0;
}

ReLUPackLoop:
#pragma unroll
for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
if (in_data[j] > 0)
out_data[j] = in_data[j];
else
out_data[j] = 0;
}
out_data.sop = in_data.sop;
out_data.eop = in_data.eop;
res_pipe::write(out_data);

res_pipe::write(out_data);
keep_going = !in_data.eop;
}
}
}

Expand Down
33 changes: 26 additions & 7 deletions hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,34 @@

namespace nnet {

// Note: DataPack logic removed, at least in the initial version
// Restartable streaming kernel implementation.
// Computation is carried out in a while-1 loop as long as there is valid input.
// The loop breaks when the end-of-packet signal is asserted by upstream task.
template <class data_pipe, class res_pipe, typename CONFIG_T>
void dense_resource_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) {
[[intel::use_stall_enable_clusters]] void dense_resource_stream(const typename CONFIG_T::weight_t weights, const typename CONFIG_T::bias_t biases) {
using namespace nnet;
using DataT = typename ExtractDataType<typename ExtractPipeType<data_pipe>::value_type>::value_type;
using ResT = typename ExtractDataType<typename ExtractPipeType<res_pipe>::value_type>::value_type;

[[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type resbeat;

[[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type res;
[[intel::fpga_register]] auto data = data_pipe::read();
dense_resource<typename ExtractPipeType<data_pipe>::value_type, typename ExtractPipeType<res_pipe>::value_type,
CONFIG_T>(data, res, weights, biases);
res_pipe::write(res);
bool keep_going = true;
bool did_read_input;
[[intel::initiation_interval(1)]]
while (keep_going) {
did_read_input = false;
[[intel::fpga_register]] auto databeat = data_pipe::read(did_read_input);

if (did_read_input) {
dense_resource<DataT, ResT, CONFIG_T>(databeat.data, resbeat.data, weights, biases);

resbeat.sop = databeat.sop;
resbeat.eop = databeat.eop;

res_pipe::write(resbeat);
keep_going = !databeat.eop;
}
}
}

} // namespace nnet
Expand Down
11 changes: 11 additions & 0 deletions hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <tuple>
#include <utility>

#include <sycl/ext/intel/prototype/pipes_ext.hpp> // Streaming Beat and pipe properties.

namespace nnet {

// Define the pipe type that we use
Expand All @@ -34,6 +36,15 @@ struct ExtractPipeType<PipeClass<PipeName, PipeDataT, kPipeMinCapacity, PipeProp
typedef PipeDataT value_type;
};

// Helper template for extracting datatype from oneAPI StreamingBeat type.
template <typename T> struct ExtractDataType { typedef T value_type; };

// Specialization on oneAPI StreamingBeat type.
template <typename DataT, bool EnableSOP, bool EnableEmpty>
struct ExtractDataType<sycl::ext::intel::experimental::StreamingBeat<DataT, EnableSOP, EnableEmpty>> {
typedef DataT value_type;
};

/*
* HLS Shift Register Implementation
* To verify a shift register is used in hardware, go to report.html > Area Analysis of System
Expand Down
Loading
Loading