
Conv2D: Add CPU version #14320


Closed · wants to merge 4 commits
12 changes: 12 additions & 0 deletions ggml/include/ggml.h
@@ -481,6 +481,7 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
@@ -1723,6 +1724,17 @@ extern "C" {
struct ggml_tensor * b,
int stride);

GGML_API struct ggml_tensor * ggml_conv_2d_direct(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
struct ggml_tensor * b, // input data [W, H, C, N]
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1

enum ggml_op_pool {
GGML_OP_POOL_MAX,
GGML_OP_POOL_AVG,
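For context, a minimal usage sketch of the new API: a 3x3 kernel over a 64x64 RGB image with stride 1, padding 1, and dilation 1, which preserves the spatial size. The sizes and the `build_conv` helper are illustrative only, not part of this PR.

```c
#include "ggml.h"

// Hypothetical helper: builds a conv node in an existing ggml context.
static struct ggml_tensor * build_conv(struct ggml_context * ctx) {
    // kernel [KW, KH, IC, OC] and input [W, H, C, N], both F32
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 3, 16);
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 64, 3, 1);

    // s0 = s1 = 1, p0 = p1 = 1, d0 = d1 = 1  ->  output [64, 64, 16, 1]
    return ggml_conv_2d_direct(ctx, kernel, input, 1, 1, 1, 1, 1, 1);
}
```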
17 changes: 16 additions & 1 deletion ggml/src/ggml-cpu/ggml-cpu.c
@@ -683,6 +683,10 @@ static void ggml_init_arm_arch_features(void) {

#endif // __ARM_ARCH

void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params,
struct ggml_tensor * dst);

struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
GGML_ASSERT(!ggml_get_no_alloc(ctx));

@@ -1189,7 +1193,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
}
}

static void ggml_compute_forward_mul_mat(
void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {

@@ -1858,6 +1862,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_im2col_back_f32(params, tensor);
} break;
case GGML_OP_CONV_2D:
{
ggml_compute_forward_conv_2d(params, tensor);
} break;
case GGML_OP_CONV_2D_DW:
{
ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2203,6 +2211,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_BACK:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_CONV_TRANSPOSE_2D:
@@ -2721,6 +2730,12 @@ struct ggml_cplan ggml_graph_plan(
GGML_ABORT("fatal error");
}
} break;
case GGML_OP_CONV_2D:
{
cur = GGML_IM2COL_WORK_SIZE;
                    // add enough space for the kernel transpose
cur += sizeof(ggml_fp16_t)*node->src[1]->ne[0]*node->src[1]->ne[1]*node->src[1]->ne[2]*node->src[1]->ne[3];
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
const int64_t ne00 = node->src[0]->ne[0]; // W
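To make the `ggml_graph_plan` sizing above concrete, a worked sketch under assumed shapes (the helper name is ours): the fixed 16 MiB im2col pool plus two bytes per element of `src[1]`.

```c
#include <stddef.h>
#include <stdint.h>

// Sketch of the CONV_2D branch above: fixed pool + fp16-sized copy of src[1].
// For a 64x64x3x1 input: 16 MiB + 2 * 12288 = 16 MiB + 24576 bytes.
static size_t conv2d_work_size(int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
    size_t cur = (size_t) 16 * 1024 * 1024;     // GGML_IM2COL_WORK_SIZE
    cur += (size_t) 2 * ne0 * ne1 * ne2 * ne3;  // sizeof(ggml_fp16_t) == 2
    return cur;
}
```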
175 changes: 175 additions & 0 deletions ggml/src/ggml-cpu/ops.cpp
@@ -3,6 +3,7 @@
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include "binary-ops.h"
#include "ggml.h"
#include "unary-ops.h"
#include "vec.h"

@@ -6058,6 +6059,180 @@ void ggml_compute_forward_im2col_back_f32(
}
}

static void ggml_call_mul_mat(
const ggml_compute_params * params,
int64_t m, int64_t n, int64_t k,
void * a, void * b, void * c) {

struct ggml_tensor src1 = {};
src1.ne[0] = k;
src1.ne[1] = m;
src1.ne[2] = 1;
src1.ne[3] = 1;
src1.nb[0] = sizeof(float);
src1.nb[1] = k * sizeof(float);
src1.nb[2] = src1.nb[1];
src1.nb[3] = src1.nb[2];
src1.data = a;

struct ggml_tensor src0 = {};
src0.ne[0] = k;
src0.ne[1] = n;
src0.ne[2] = 1;
src0.ne[3] = 1;
src0.nb[0] = sizeof(float);
src0.nb[1] = k * sizeof(float);
src0.nb[2] = src0.nb[1];
src0.nb[3] = src0.nb[2];
src0.data = b;

struct ggml_tensor dst = {};
dst.ne[0] = n;
dst.ne[1] = m;
dst.ne[2] = 1;
dst.ne[3] = 1;
dst.nb[0] = sizeof(float);
dst.nb[1] = n * sizeof(float);
dst.nb[2] = dst.nb[1];
dst.nb[3] = dst.nb[2];
dst.data = c;
dst.src[0] = &src0;
dst.src[1] = &src1;

ggml_compute_forward_mul_mat(params, &dst);
}
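The wrapper above stages a GEMM through ggml's mul_mat convention: both operands carry the shared inner dimension k in `ne[0]`, and dst comes out as [n, m]. A naive reference of the semantics the call is expected to have, as a sketch (row-major, all F32, the function name is ours):

```c
#include <stdint.h>

// c[i*n + j] = dot(row i of a, row j of b), with a: m x k, b: n x k.
static void mul_mat_reference(int64_t m, int64_t n, int64_t k,
                              const float * a, const float * b, float * c) {
    for (int64_t i = 0; i < m; ++i) {
        for (int64_t j = 0; j < n; ++j) {
            float sum = 0.0f;
            for (int64_t t = 0; t < k; ++t) {
                sum += a[i*k + t] * b[j*k + t];
            }
            c[i*n + j] = sum;
        }
    }
}
```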


// ggml_compute_forward_conv_2d

static void ggml_compute_forward_conv_2d_f32(
const ggml_compute_params * params,
const ggml_tensor * kernel, // [KW, KH, IC, OC] - fp32
const ggml_tensor * src, // [W, H, C, N]
ggml_tensor * dst) { // [OW, OH, OC, N]

GGML_ASSERT(ggml_is_contiguous(kernel));
GGML_ASSERT(kernel->type == GGML_TYPE_F32);

const int32_t stride_x = dst->op_params[0];
const int32_t stride_y = dst->op_params[1];
const int32_t pad_x = dst->op_params[2];
const int32_t pad_y = dst->op_params[3];
const int32_t dilation_x = dst->op_params[4];
const int32_t dilation_y = dst->op_params[5];

const int64_t c_in = src->ne[2];
const int64_t c_out = kernel->ne[3];
GGML_ASSERT(c_in == kernel->ne[2]);

const int64_t src_w = src->ne[0];
const int64_t src_h = src->ne[1];
const int64_t knl_w = kernel->ne[0];
const int64_t knl_h = kernel->ne[1];
const int64_t dst_w = dst->ne[0];
const int64_t dst_h = dst->ne[1];

float * src_data = (float*) src->data;
float * knl_data = (float*) kernel->data;
float * dst_data = (float*) dst->data;

const int64_t knl_n = knl_w * knl_h * c_in;
const int64_t patch_total = dst->ne[3] * dst_w * dst_h;

const int64_t space_per_patch = knl_n * sizeof(float) + c_out * sizeof(float);
const int64_t batch_size = params->wsize / space_per_patch;
const int64_t patches_per_batch = batch_size > 8 ? (batch_size / 8) * 8 : batch_size;
const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch;

GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1);

float * tmp = (float *) params->wdata;

for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) {

const int64_t patch_start_batch = batch_i * patches_per_batch;
const int64_t patch_end_batch = std::min(patch_start_batch + patches_per_batch,
patch_total);
const int64_t patch_n = patch_end_batch - patch_start_batch;

const int64_t patch_per_thread = (patch_n + params->nth - 1) / params->nth;
const int64_t patch_start = patch_start_batch + params->ith * patch_per_thread;
        const int64_t patch_end = std::min(patch_start + patch_per_thread, patch_end_batch);

        // im2col for the patches assigned to this thread
for (int64_t p = patch_start; p < patch_end; ++p) {
            const int64_t batch_idx = p / (dst_w * dst_h);
            const int64_t dst_y = (p / dst_w) % dst_h;
            const int64_t dst_x = p % dst_w;

            float * src_base = (float *)((char *)src_data + batch_idx * src->nb[3]);
            float * dst_row = tmp + (p % patches_per_batch) * knl_n;

            for (int64_t ic = 0; ic < c_in; ++ic) {
                for (int64_t ky = 0; ky < knl_h; ++ky) {
                    for (int64_t kx = 0; kx < knl_w; ++kx) {
                        const int64_t sy = dst_y * stride_y + ky * dilation_y - pad_y;
                        const int64_t sx = dst_x * stride_x + kx * dilation_x - pad_x;

int64_t dst_idx = ic * (knl_h * knl_w) + ky * knl_w + kx;

if (sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) {
dst_row[dst_idx] = 0.0f;
} else {
float * src_ptr = (float *)((char *)src_base + sx * src->nb[0] + sy * src->nb[1] + ic * src->nb[2]);
dst_row[dst_idx] = *src_ptr;
}
}
}
}
} // patches handled by this thread

ggml_barrier(params->threadpool);

float * gemm_output = tmp + patches_per_batch * knl_n;

// GEMM: patches[patch_n, knl_n] × kernel[knl_n, c_out] = output[patch_n, c_out]
ggml_call_mul_mat(params, patch_n, c_out, knl_n,
tmp, knl_data, gemm_output);

ggml_barrier(params->threadpool);


        // scatter the GEMM output rows [patch, OC] back into the WHCN dst layout
const int64_t permute_per_thread = (patch_n + params->nth - 1) / params->nth;
const int64_t permute_start = params->ith * permute_per_thread;
const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n);

for (int64_t i = permute_start; i < permute_end; ++i) {
const int64_t p = patch_start_batch + i;
            const int64_t batch_idx = p / (dst_w * dst_h);
            const int64_t dst_y = (p / dst_w) % dst_h;
            const int64_t dst_x = p % dst_w;

            for (int64_t oc = 0; oc < c_out; ++oc) {
                const float value = gemm_output[i * c_out + oc];
                // write to the WHCN layout: dst[w, h, c, n]
                float * dst_ptr = (float *)((char *)dst_data + dst_x * dst->nb[0] + dst_y * dst->nb[1] + oc * dst->nb[2] + batch_idx * dst->nb[3]);
*dst_ptr = value;
}
}
}
}
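The patch bookkeeping above linearizes (n, oh, ow) as p = n*OH*OW + oh*OW + ow and inverts it with a div/mod chain. A quick worked check under assumed sizes (dst_w = 5, dst_h = 4):

```c
#include <assert.h>

// p = 27 decomposes to batch 1, output row 1, output column 2,
// i.e. the 8th patch of the second image.
static void patch_index_check(void) {
    const int dst_w = 5, dst_h = 4, p = 27;
    assert(p / (dst_w * dst_h) == 1); // batch index
    assert((p / dst_w) % dst_h == 1); // output row (y)
    assert(p % dst_w == 2);           // output column (x)
}
```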

void ggml_compute_forward_conv_2d(
const ggml_compute_params * params,
ggml_tensor * dst) {

const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];

if (src0->type == GGML_TYPE_F16) {
GGML_ASSERT(false && "F16 not supported yet");
} else {
ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
}
}

// ggml_compute_forward_conv_transpose_2d

void ggml_compute_forward_conv_transpose_2d(
5 changes: 5 additions & 0 deletions ggml/src/ggml-cpu/ops.h
@@ -20,6 +20,9 @@

static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);

// Work buffer size for im2col operations in CONV2D
#define GGML_IM2COL_WORK_SIZE (16 * 1024 * 1024) // 16 MiB work buffer

#ifdef __cplusplus
extern "C" {
#endif
@@ -64,6 +67,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -105,6 +109,7 @@ void ggml_compute_forward_custom(const struct ggml_compute_params * params, stru
void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);

#ifdef __cplusplus
}
43 changes: 41 additions & 2 deletions ggml/src/ggml.c
@@ -986,7 +986,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};

static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1043,6 +1043,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"conv_transpose_1d(x)",
"im2col(x)",
"im2col_back(x)",
"conv_2d(x)",
"conv_2d_dw(x)",
"conv_transpose_2d(x)",
"pool_1d(x)",
@@ -1082,7 +1083,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};

static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -4131,6 +4132,44 @@ struct ggml_tensor * ggml_conv_2d_dw_direct(
return result;
}

// ggml_conv_2d_direct

struct ggml_tensor * ggml_conv_2d_direct(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
struct ggml_tensor * b, // input data [W, H, C, N]
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
        int d1) { // dilation dimension 1

GGML_ASSERT(a->ne[2] == b->ne[2]);
//GGML_ASSERT(a->type == b->type);

int64_t ne[4];
ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
ne[2] = a->ne[3];
ne[3] = b->ne[3];

struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);

ggml_set_op_params_i32(result, 0, s0);
ggml_set_op_params_i32(result, 1, s1);
ggml_set_op_params_i32(result, 2, p0);
ggml_set_op_params_i32(result, 3, p1);
ggml_set_op_params_i32(result, 4, d0);
ggml_set_op_params_i32(result, 5, d1);

result->op = GGML_OP_CONV_2D;
result->src[0] = a;
result->src[1] = b;

return result;
}
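The output extents come from the standard dilated-convolution size formula; a sketch of what `ggml_calc_conv_output_size` (defined elsewhere in ggml.c) should evaluate to, with an illustrative helper name:

```c
#include <stdint.h>

// out = (in + 2*p - d*(k - 1) - 1) / s + 1
// e.g. in = 64, k = 3, s = 1, p = 1, d = 1  ->  out = 64
static int64_t conv_out_size_sketch(int64_t ins, int64_t ks, int s, int p, int d) {
    return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
}
```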

// ggml_conv_transpose_2d_p0

static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {