Skip to content

Commit

Permalink
Merge pull request #228 from Dobiasd/im2col-performance
Browse files Browse the repository at this point in the history
Improve Conv2D performance and memory usage by chunking im2col
Dobiasd authored May 30, 2020

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
2 parents 8d45863 + 8e2c2f1 commit cee3d94
Showing 5 changed files with 65 additions and 42 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@ list(APPEND CMAKE_MODULE_PATH "${FDEEP_TOP_DIR}/cmake")

include(cmake/hunter.cmake) # default off

project(frugally-deep VERSION 0.14.0)
project(frugally-deep VERSION 0.14.1)

message(STATUS "===( ${PROJECT_NAME} ${PROJECT_VERSION} )===")

2 changes: 1 addition & 1 deletion INSTALL.md
Original file line number Diff line number Diff line change
@@ -63,7 +63,7 @@ Just add a *conanfile.txt* with frugally-deep as a requirement and choose the generator

```
[requires]
frugally-deep/v0.14.0-p0@dobiasd/stable
frugally-deep/v0.14.1-p0@dobiasd/stable
[generators]
cmake
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -150,23 +150,23 @@ Below you can find the average durations of multiple consecutive forward passes

| Model | Keras + TF | frugally-deep |
| ----------------- | ----------:| -------------:|
| `DenseNet121` | 0.14 s | 0.26 s |
| `DenseNet121` | 0.14 s | 0.25 s |
| `DenseNet169` | 0.15 s | 0.29 s |
| `DenseNet201` | 0.18 s | 0.36 s |
| `InceptionV3` | 0.17 s | 0.27 s |
| `MobileNet` | 0.05 s | 0.13 s |
| `MobileNetV2` | 0.05 s | 0.15 s |
| `NASNetLarge` | 1.08 s | 4.17 s |
| `NASNetMobile` | 0.12 s | 0.35 s |
| `InceptionV3` | 0.17 s | 0.26 s |
| `MobileNet` | 0.05 s | 0.15 s |
| `MobileNetV2` | 0.05 s | 0.17 s |
| `NASNetLarge` | 1.08 s | 3.48 s |
| `NASNetMobile` | 0.12 s | 0.30 s |
| `ResNet101` | 0.23 s | 0.31 s |
| `ResNet101V2` | 0.22 s | 0.28 s |
| `ResNet152` | 0.33 s | 0.45 s |
| `ResNet152V2` | 0.32 s | 0.41 s |
| `ResNet50` | 0.13 s | 0.19 s |
| `ResNet50V2` | 0.12 s | 0.16 s |
| `VGG16` | 0.40 s | 0.45 s |
| `VGG19` | 0.50 s | 0.54 s |
| `Xception` | 0.28 s | 1.05 s |
| `VGG16` | 0.40 s | 0.49 s |
| `VGG19` | 0.50 s | 0.57 s |
| `Xception` | 0.28 s | 1.06 s |

Requirements and Installation
-----------------------------
81 changes: 50 additions & 31 deletions include/fdeep/convolution.hpp
Original file line number Diff line number Diff line change
@@ -79,33 +79,10 @@ inline tensor convolve_im2col(
const auto fy = filter_mat.filter_shape_.height_;
const auto fx = filter_mat.filter_shape_.width_;
const auto fz = filter_mat.filter_shape_.depth_;
ColMajorMatrixXf a(fy * fx * fz + 1, out_height * out_width);
EigenIndex a_x = 0;
for (std::size_t y = 0; y < out_height; ++y)
{
for (std::size_t x = 0; x < out_width; ++x)
{
EigenIndex a_y = 0;
for (std::size_t yf = 0; yf < fy; ++yf)
{
for (std::size_t xf = 0; xf < fx; ++xf)
{
for (std::size_t zf = 0; zf < fz; ++zf)
{
a(a_y++, a_x) = in_padded.get_ignore_rank(tensor_pos(
strides_y * y + yf,
strides_x * x + xf,
zf));
}
}
a(a_y, a_x) = static_cast<float_type>(1);
}
++a_x;
}
}
const EigenIndex a_cols = static_cast<EigenIndex>(out_height * out_width);

const std::size_t val_cnt =
static_cast<std::size_t>(filter_mat.mat_.rows() * a.cols());
static_cast<std::size_t>(filter_mat.mat_.rows() * a_cols);
assertion(val_cnt % (out_height * out_width) == 0,
"Can not calculate out_depth");

@@ -116,13 +93,55 @@ inline tensor convolve_im2col(
shared_float_vec res_vec = fplus::make_shared_ref<float_vec>();
res_vec->resize(static_cast<std::size_t>(out_depth * out_height * out_width));

MappedColMajorMatrixXf out_mat_map(
res_vec->data(),
static_cast<EigenIndex>(filter_mat.mat_.rows()),
static_cast<EigenIndex>(a.cols()));
const EigenIndex a_rows = static_cast<EigenIndex>(fy * fx * fz + 1);
const EigenIndex a_max_size_bytes = 16 * 1024 * 1024;
EigenIndex step_size = a_max_size_bytes / (a_rows * static_cast<EigenIndex>(sizeof(float_type)));
EigenIndex AlignmentStep = 64 / sizeof(float_type);
step_size = (step_size / AlignmentStep) * AlignmentStep;

// https://stackoverflow.com/questions/48644724/multiply-two-eigen-matrices-directly-into-memory-of-target-matrix
out_mat_map.noalias() = filter_mat.mat_ * a;
ColMajorMatrixXf a(a_rows, step_size);
EigenIndex a_x_virtual = 0;
EigenIndex last_gem_a_x = 0;
for (std::size_t y = 0; y < out_height; ++y)
{
for (std::size_t x = 0; x < out_width; ++x)
{
EigenIndex a_y = 0;
for (std::size_t yf = 0; yf < fy; ++yf)
{
const auto p = &(in_padded.get_ref_ignore_rank(tensor_pos(
strides_y * y + yf,
strides_x * x,
0)));
const auto a_x = a_x_virtual % step_size;
// https://stackoverflow.com/a/9980859/1866775
std::copy(p, p + fx * fz, &a(a_y, a_x));
a_y += static_cast<EigenIndex>(fx * fz);
a(a_y, a_x) = static_cast<float_type>(1);
}
++a_x_virtual;
if (a_x_virtual >= last_gem_a_x + step_size)
{
MappedColMajorMatrixXf out_mat_map(
res_vec->data() + filter_mat.mat_.rows() * last_gem_a_x,
static_cast<EigenIndex>(filter_mat.mat_.rows()),
static_cast<EigenIndex>(a_x_virtual - last_gem_a_x));
out_mat_map.noalias() = filter_mat.mat_ * a;
last_gem_a_x = a_x_virtual;
}
}
}
if (a_x_virtual != last_gem_a_x)
{
EigenIndex fields_left = a_x_virtual - last_gem_a_x;
MappedColMajorMatrixXf a_map(a.data(), a.rows(), fields_left);
MappedColMajorMatrixXf out_mat_map(
res_vec->data() + filter_mat.mat_.rows() * last_gem_a_x,
static_cast<EigenIndex>(filter_mat.mat_.rows()),
static_cast<EigenIndex>(fields_left));
// https://stackoverflow.com/questions/48644724/multiply-two-eigen-matrices-directly-into-memory-of-target-matrix
out_mat_map.noalias() = filter_mat.mat_ * a_map;
}

return tensor(
tensor_shape_with_changed_rank(
4 changes: 4 additions & 0 deletions include/fdeep/tensor.hpp
Original file line number Diff line number Diff line change
@@ -58,6 +58,10 @@ class tensor
{
return (*values_)[idx_ignore_rank(pos)];
}
// Returns a const reference to the stored value at `pos`, resolving the
// flat index with idx_ignore_rank (i.e. the tensor's nominal rank is not
// validated). Returning a reference (instead of a value) lets callers take
// the element's address — used by the im2col fast path to bulk-copy a
// contiguous run of `fx * fz` values with std::copy.
// NOTE(review): the reference is only valid while `values_` is alive and
// unmodified — presumably callers keep the tensor alive for the duration.
const float_type& get_ref_ignore_rank(const tensor_pos& pos) const
{
return (*values_)[idx_ignore_rank(pos)];
}
float_type get_y_x_padded(float_type pad_value,
int y, int x, std::size_t z) const
{

0 comments on commit cee3d94

Please sign in to comment.