diff --git a/CMakeLists.txt b/CMakeLists.txt index c27d248a..30702fa6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ list(APPEND CMAKE_MODULE_PATH "${FDEEP_TOP_DIR}/cmake") include(cmake/hunter.cmake) # default off -project(frugally-deep VERSION 0.14.0) +project(frugally-deep VERSION 0.14.1) message(STATUS "===( ${PROJECT_NAME} ${PROJECT_VERSION} )===") diff --git a/INSTALL.md b/INSTALL.md index f70d6bc7..459c17e3 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -63,7 +63,7 @@ Just add a *conanfile.txt* with frugally-deep as a requirement and chose the gen ``` [requires] -frugally-deep/v0.14.0-p0@dobiasd/stable +frugally-deep/v0.14.1-p0@dobiasd/stable [generators] cmake diff --git a/README.md b/README.md index af18e406..cdb61c22 100644 --- a/README.md +++ b/README.md @@ -150,23 +150,23 @@ Below you can find the average durations of multiple consecutive forward passes | Model | Keras + TF | frugally-deep | | ----------------- | ----------:| -------------:| -| `DenseNet121` | 0.14 s | 0.26 s | +| `DenseNet121` | 0.14 s | 0.25 s | | `DenseNet169` | 0.15 s | 0.29 s | | `DenseNet201` | 0.18 s | 0.36 s | -| `InceptionV3` | 0.17 s | 0.27 s | -| `MobileNet` | 0.05 s | 0.13 s | -| `MobileNetV2` | 0.05 s | 0.15 s | -| `NASNetLarge` | 1.08 s | 4.17 s | -| `NASNetMobile` | 0.12 s | 0.35 s | +| `InceptionV3` | 0.17 s | 0.26 s | +| `MobileNet` | 0.05 s | 0.15 s | +| `MobileNetV2` | 0.05 s | 0.17 s | +| `NASNetLarge` | 1.08 s | 3.48 s | +| `NASNetMobile` | 0.12 s | 0.30 s | | `ResNet101` | 0.23 s | 0.31 s | | `ResNet101V2` | 0.22 s | 0.28 s | | `ResNet152` | 0.33 s | 0.45 s | | `ResNet152V2` | 0.32 s | 0.41 s | | `ResNet50` | 0.13 s | 0.19 s | | `ResNet50V2` | 0.12 s | 0.16 s | -| `VGG16` | 0.40 s | 0.45 s | -| `VGG19` | 0.50 s | 0.54 s | -| `Xception` | 0.28 s | 1.05 s | +| `VGG16` | 0.40 s | 0.49 s | +| `VGG19` | 0.50 s | 0.57 s | +| `Xception` | 0.28 s | 1.06 s | Requirements and Installation ----------------------------- diff --git a/include/fdeep/convolution.hpp 
b/include/fdeep/convolution.hpp index 3a28c0fb..57138d20 100644 --- a/include/fdeep/convolution.hpp +++ b/include/fdeep/convolution.hpp @@ -79,33 +79,10 @@ inline tensor convolve_im2col( const auto fy = filter_mat.filter_shape_.height_; const auto fx = filter_mat.filter_shape_.width_; const auto fz = filter_mat.filter_shape_.depth_; - ColMajorMatrixXf a(fy * fx * fz + 1, out_height * out_width); - EigenIndex a_x = 0; - for (std::size_t y = 0; y < out_height; ++y) - { - for (std::size_t x = 0; x < out_width; ++x) - { - EigenIndex a_y = 0; - for (std::size_t yf = 0; yf < fy; ++yf) - { - for (std::size_t xf = 0; xf < fx; ++xf) - { - for (std::size_t zf = 0; zf < fz; ++zf) - { - a(a_y++, a_x) = in_padded.get_ignore_rank(tensor_pos( - strides_y * y + yf, - strides_x * x + xf, - zf)); - } - } - a(a_y, a_x) = static_cast<float_type>(1); - } - ++a_x; - } - } + const EigenIndex a_cols = static_cast<EigenIndex>(out_height * out_width); const std::size_t val_cnt = - static_cast<std::size_t>(filter_mat.mat_.rows() * a.cols()); + static_cast<std::size_t>(filter_mat.mat_.rows() * a_cols); assertion(val_cnt % (out_height * out_width) == 0, "Can not calculate out_depth"); @@ -116,13 +93,55 @@ inline tensor convolve_im2col( shared_float_vec res_vec = fplus::make_shared_ref<float_vec>(); res_vec->resize(static_cast<std::size_t>(out_depth * out_height * out_width)); - MappedColMajorMatrixXf out_mat_map( - res_vec->data(), - static_cast<EigenIndex>(filter_mat.mat_.rows()), - static_cast<EigenIndex>(a.cols())); + const EigenIndex a_rows = static_cast<EigenIndex>(fy * fx * fz + 1); + const EigenIndex a_max_size_bytes = 16 * 1024 * 1024; + EigenIndex step_size = a_max_size_bytes / (a_rows * static_cast<EigenIndex>(sizeof(float_type))); + EigenIndex AlignmentStep = 64 / sizeof(float_type); + step_size = (step_size / AlignmentStep) * AlignmentStep; - // https://stackoverflow.com/questions/48644724/multiply-two-eigen-matrices-directly-into-memory-of-target-matrix - out_mat_map.noalias() = filter_mat.mat_ * a; + ColMajorMatrixXf a(a_rows, step_size); + EigenIndex a_x_virtual = 0; + EigenIndex last_gem_a_x = 0; 
+ for (std::size_t y = 0; y < out_height; ++y) + { + for (std::size_t x = 0; x < out_width; ++x) + { + EigenIndex a_y = 0; + for (std::size_t yf = 0; yf < fy; ++yf) + { + const auto p = &(in_padded.get_ref_ignore_rank(tensor_pos( + strides_y * y + yf, + strides_x * x, + 0))); + const auto a_x = a_x_virtual % step_size; + // https://stackoverflow.com/a/9980859/1866775 + std::copy(p, p + fx * fz, &a(a_y, a_x)); + a_y += static_cast<EigenIndex>(fx * fz); + a(a_y, a_x) = static_cast<float_type>(1); + } + ++a_x_virtual; + if (a_x_virtual >= last_gem_a_x + step_size) + { + MappedColMajorMatrixXf out_mat_map( + res_vec->data() + filter_mat.mat_.rows() * last_gem_a_x, + static_cast<EigenIndex>(filter_mat.mat_.rows()), + static_cast<EigenIndex>(a_x_virtual - last_gem_a_x)); + out_mat_map.noalias() = filter_mat.mat_ * a; + last_gem_a_x = a_x_virtual; + } + } + } + if (a_x_virtual != last_gem_a_x) + { + EigenIndex fields_left = a_x_virtual - last_gem_a_x; + MappedColMajorMatrixXf a_map(a.data(), a.rows(), fields_left); + MappedColMajorMatrixXf out_mat_map( + res_vec->data() + filter_mat.mat_.rows() * last_gem_a_x, + static_cast<EigenIndex>(filter_mat.mat_.rows()), + static_cast<EigenIndex>(fields_left)); + // https://stackoverflow.com/questions/48644724/multiply-two-eigen-matrices-directly-into-memory-of-target-matrix + out_mat_map.noalias() = filter_mat.mat_ * a_map; + } return tensor( tensor_shape_with_changed_rank( diff --git a/include/fdeep/tensor.hpp b/include/fdeep/tensor.hpp index dbc85bfc..b3274b08 100644 --- a/include/fdeep/tensor.hpp +++ b/include/fdeep/tensor.hpp @@ -58,6 +58,10 @@ class tensor { return (*values_)[idx_ignore_rank(pos)]; } + const float_type& get_ref_ignore_rank(const tensor_pos& pos) const + { + return (*values_)[idx_ignore_rank(pos)]; + } float_type get_y_x_padded(float_type pad_value, int y, int x, std::size_t z) const {