diff --git a/CMakeLists.txt b/CMakeLists.txt index c27d248a..30702fa6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ list(APPEND CMAKE_MODULE_PATH "${FDEEP_TOP_DIR}/cmake") include(cmake/hunter.cmake) # default off -project(frugally-deep VERSION 0.14.0) +project(frugally-deep VERSION 0.14.1) message(STATUS "===( ${PROJECT_NAME} ${PROJECT_VERSION} )===") diff --git a/INSTALL.md b/INSTALL.md index f70d6bc7..459c17e3 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -63,7 +63,7 @@ Just add a *conanfile.txt* with frugally-deep as a requirement and chose the gen ``` [requires] -frugally-deep/v0.14.0-p0@dobiasd/stable +frugally-deep/v0.14.1-p0@dobiasd/stable [generators] cmake diff --git a/README.md b/README.md index af18e406..cdb61c22 100644 --- a/README.md +++ b/README.md @@ -150,23 +150,23 @@ Below you can find the average durations of multiple consecutive forward passes | Model | Keras + TF | frugally-deep | | ----------------- | ----------:| -------------:| -| `DenseNet121` | 0.14 s | 0.26 s | +| `DenseNet121` | 0.14 s | 0.25 s | | `DenseNet169` | 0.15 s | 0.29 s | | `DenseNet201` | 0.18 s | 0.36 s | -| `InceptionV3` | 0.17 s | 0.27 s | -| `MobileNet` | 0.05 s | 0.13 s | -| `MobileNetV2` | 0.05 s | 0.15 s | -| `NASNetLarge` | 1.08 s | 4.17 s | -| `NASNetMobile` | 0.12 s | 0.35 s | +| `InceptionV3` | 0.17 s | 0.26 s | +| `MobileNet` | 0.05 s | 0.15 s | +| `MobileNetV2` | 0.05 s | 0.17 s | +| `NASNetLarge` | 1.08 s | 3.48 s | +| `NASNetMobile` | 0.12 s | 0.30 s | | `ResNet101` | 0.23 s | 0.31 s | | `ResNet101V2` | 0.22 s | 0.28 s | | `ResNet152` | 0.33 s | 0.45 s | | `ResNet152V2` | 0.32 s | 0.41 s | | `ResNet50` | 0.13 s | 0.19 s | | `ResNet50V2` | 0.12 s | 0.16 s | -| `VGG16` | 0.40 s | 0.45 s | -| `VGG19` | 0.50 s | 0.54 s | -| `Xception` | 0.28 s | 1.05 s | +| `VGG16` | 0.40 s | 0.49 s | +| `VGG19` | 0.50 s | 0.57 s | +| `Xception` | 0.28 s | 1.06 s | Requirements and Installation ----------------------------- diff --git a/include/fdeep/convolution.hpp 
b/include/fdeep/convolution.hpp index 3a28c0fb..57138d20 100644 --- a/include/fdeep/convolution.hpp +++ b/include/fdeep/convolution.hpp @@ -79,33 +79,10 @@ inline tensor convolve_im2col( const auto fy = filter_mat.filter_shape_.height_; const auto fx = filter_mat.filter_shape_.width_; const auto fz = filter_mat.filter_shape_.depth_; - ColMajorMatrixXf a(fy * fx * fz + 1, out_height * out_width); - EigenIndex a_x = 0; - for (std::size_t y = 0; y < out_height; ++y) - { - for (std::size_t x = 0; x < out_width; ++x) - { - EigenIndex a_y = 0; - for (std::size_t yf = 0; yf < fy; ++yf) - { - for (std::size_t xf = 0; xf < fx; ++xf) - { - for (std::size_t zf = 0; zf < fz; ++zf) - { - a(a_y++, a_x) = in_padded.get_ignore_rank(tensor_pos( - strides_y * y + yf, - strides_x * x + xf, - zf)); - } - } - a(a_y, a_x) = static_cast<float_type>(1); - } - ++a_x; - } - } + const EigenIndex a_cols = static_cast<EigenIndex>(out_height * out_width); const std::size_t val_cnt = - static_cast<std::size_t>(filter_mat.mat_.rows() * a.cols()); + static_cast<std::size_t>(filter_mat.mat_.rows() * a_cols); assertion(val_cnt % (out_height * out_width) == 0, "Can not calculate out_depth"); @@ -116,13 +93,55 @@ inline tensor convolve_im2col( shared_float_vec res_vec = fplus::make_shared_ref<float_vec>(); res_vec->resize(static_cast<std::size_t>(out_depth * out_height * out_width)); - MappedColMajorMatrixXf out_mat_map( - res_vec->data(), - static_cast<EigenIndex>(filter_mat.mat_.rows()), - static_cast<EigenIndex>(a.cols())); + const EigenIndex a_rows = static_cast<EigenIndex>(fy * fx * fz + 1); + const EigenIndex a_max_size_bytes = 16 * 1024 * 1024; + EigenIndex step_size = a_max_size_bytes / (a_rows * static_cast<EigenIndex>(sizeof(float_type))); + EigenIndex AlignmentStep = 64 / sizeof(float_type); + step_size = (step_size / AlignmentStep) * AlignmentStep; - // https://stackoverflow.com/questions/48644724/multiply-two-eigen-matrices-directly-into-memory-of-target-matrix - out_mat_map.noalias() = filter_mat.mat_ * a; + ColMajorMatrixXf a(a_rows, step_size); + EigenIndex a_x_virtual = 0; + EigenIndex last_gem_a_x = 0; 
+ for (std::size_t y = 0; y < out_height; ++y) + { + for (std::size_t x = 0; x < out_width; ++x) + { + EigenIndex a_y = 0; + for (std::size_t yf = 0; yf < fy; ++yf) + { + const auto p = &(in_padded.get_ref_ignore_rank(tensor_pos( + strides_y * y + yf, + strides_x * x, + 0))); + const auto a_x = a_x_virtual % step_size; + // https://stackoverflow.com/a/9980859/1866775 + std::copy(p, p + fx * fz, &a(a_y, a_x)); + a_y += static_cast<EigenIndex>(fx * fz); + a(a_y, a_x) = static_cast<float_type>(1); + } + ++a_x_virtual; + if (a_x_virtual >= last_gem_a_x + step_size) + { + MappedColMajorMatrixXf out_mat_map( + res_vec->data() + filter_mat.mat_.rows() * last_gem_a_x, + static_cast<EigenIndex>(filter_mat.mat_.rows()), + static_cast<EigenIndex>(a_x_virtual - last_gem_a_x)); + out_mat_map.noalias() = filter_mat.mat_ * a; + last_gem_a_x = a_x_virtual; + } + } + } + if (a_x_virtual != last_gem_a_x) + { + EigenIndex fields_left = a_x_virtual - last_gem_a_x; + MappedColMajorMatrixXf a_map(a.data(), a.rows(), fields_left); + MappedColMajorMatrixXf out_mat_map( + res_vec->data() + filter_mat.mat_.rows() * last_gem_a_x, + static_cast<EigenIndex>(filter_mat.mat_.rows()), + static_cast<EigenIndex>(fields_left)); + // https://stackoverflow.com/questions/48644724/multiply-two-eigen-matrices-directly-into-memory-of-target-matrix + out_mat_map.noalias() = filter_mat.mat_ * a_map; + } return tensor( tensor_shape_with_changed_rank( diff --git a/include/fdeep/tensor.hpp b/include/fdeep/tensor.hpp index dbc85bfc..b3274b08 100644 --- a/include/fdeep/tensor.hpp +++ b/include/fdeep/tensor.hpp @@ -58,6 +58,10 @@ class tensor { return (*values_)[idx_ignore_rank(pos)]; } + const float_type& get_ref_ignore_rank(const tensor_pos& pos) const + { + return (*values_)[idx_ignore_rank(pos)]; + } float_type get_y_x_padded(float_type pad_value, int y, int x, std::size_t z) const {