Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions include/infiniop/ops/block_diag.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#ifndef __INFINIOP_BLOCK_DIAG_API_H__
#define __INFINIOP_BLOCK_DIAG_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopBlockDiagDescriptor_t;

__C __export infiniStatus_t infiniopCreateBlockDiagDescriptor(infiniopHandle_t handle,
infiniopBlockDiagDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t *x,
size_t num_inputs);

__C __export infiniStatus_t infiniopGetBlockDiagWorkspaceSize(infiniopBlockDiagDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopBlockDiag(infiniopBlockDiagDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void **x,
void *stream);

__C __export infiniStatus_t infiniopDestroyBlockDiagDescriptor(infiniopBlockDiagDescriptor_t desc);

#endif
28 changes: 28 additions & 0 deletions include/infiniop/ops/hinge_embedding_loss.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#ifndef __INFINIOP_HINGE_EMBEDDING_LOSS_API_H__
#define __INFINIOP_HINGE_EMBEDDING_LOSS_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopHingeEmbeddingLossDescriptor_t;

__C __export infiniStatus_t infiniopCreateHingeEmbeddingLossDescriptor(infiniopHandle_t handle,
infiniopHingeEmbeddingLossDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t input,
infiniopTensorDescriptor_t target,
double margin,
int reduction);

__C __export infiniStatus_t infiniopGetHingeEmbeddingLossWorkspaceSize(infiniopHingeEmbeddingLossDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopHingeEmbeddingLoss(infiniopHingeEmbeddingLossDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *input,
const void *target,
void *stream);

__C __export infiniStatus_t infiniopDestroyHingeEmbeddingLossDescriptor(infiniopHingeEmbeddingLossDescriptor_t desc);

#endif
26 changes: 26 additions & 0 deletions include/infiniop/ops/kron.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#ifndef __INFINIOP_KRON_API_H__
#define __INFINIOP_KRON_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopKronDescriptor_t;

__C __export infiniStatus_t infiniopCreateKronDescriptor(infiniopHandle_t handle,
infiniopKronDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x1,
infiniopTensorDescriptor_t x2);

__C __export infiniStatus_t infiniopGetKronWorkspaceSize(infiniopKronDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopKron(infiniopKronDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x1,
const void *x2,
void *stream);

__C __export infiniStatus_t infiniopDestroyKronDescriptor(infiniopKronDescriptor_t desc);

#endif
24 changes: 24 additions & 0 deletions include/infiniop/ops/selu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef __INFINIOP_SELU_API_H__
#define __INFINIOP_SELU_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopSeluDescriptor_t;

__C __export infiniStatus_t infiniopCreateSeluDescriptor(infiniopHandle_t handle,
infiniopSeluDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x);

__C __export infiniStatus_t infiniopGetSeluWorkspaceSize(infiniopSeluDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopSelu(infiniopSeluDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream);

__C __export infiniStatus_t infiniopDestroySeluDescriptor(infiniopSeluDescriptor_t desc);

#endif
24 changes: 24 additions & 0 deletions include/infiniop/ops/sinh.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef __INFINIOP_SINH_API_H__
#define __INFINIOP_SINH_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopSinhDescriptor_t;

__C __export infiniStatus_t infiniopCreateSinhDescriptor(infiniopHandle_t handle,
infiniopSinhDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x);

__C __export infiniStatus_t infiniopGetSinhWorkspaceSize(infiniopSinhDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopSinh(infiniopSinhDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream);

__C __export infiniStatus_t infiniopDestroySinhDescriptor(infiniopSinhDescriptor_t desc);

#endif
145 changes: 145 additions & 0 deletions src/infiniop/ops/block_diag/cpu/block_diag_cpu.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#include "block_diag_cpu.h"
#include "../../../utils.h"
#include <cstring>

namespace op::block_diag::cpu {

utils::Result<BlockDiagInfo> BlockDiagInfo::create(
infiniopTensorDescriptor_t *input_descs,
size_t num_inputs,
infiniopTensorDescriptor_t y_desc) {

if (num_inputs == 0) {
return INFINI_STATUS_BAD_PARAM;
}

BlockDiagInfo info;
info.num_inputs = num_inputs;
info.input_shapes.resize(num_inputs);
info.row_offsets.resize(num_inputs);
info.col_offsets.resize(num_inputs);

size_t total_rows = 0;
size_t total_cols = 0;

// Process each input matrix
for (size_t i = 0; i < num_inputs; ++i) {
auto shape = input_descs[i]->shape();
if (shape.size() != 2) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}
info.input_shapes[i] = shape;
info.row_offsets[i] = total_rows;
info.col_offsets[i] = total_cols;
total_rows += shape[0];
total_cols += shape[1];
}

// Check output shape
auto y_shape = y_desc->shape();
if (y_shape.size() != 2 || y_shape[0] != total_rows || y_shape[1] != total_cols) {
return INFINI_STATUS_BAD_TENSOR_SHAPE;
}

info.output_shape = y_shape;
info.output_size = y_desc->numel();

return utils::Result<BlockDiagInfo>(std::move(info));
}

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t *input_descs,
size_t num_inputs) {

if (num_inputs == 0) {
return INFINI_STATUS_BAD_PARAM;
}

auto dtype = input_descs[0]->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);

// Check all inputs have same dtype
for (size_t i = 1; i < num_inputs; ++i) {
if (input_descs[i]->dtype() != dtype) {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
}

auto info_result = BlockDiagInfo::create(input_descs, num_inputs, y_desc);
CHECK_RESULT(info_result);

*desc_ptr = new Descriptor(dtype, info_result.take(), handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}

template <typename T>
void block_diag_impl(
const BlockDiagInfo &info,
T *y,
const T **inputs) {

// Initialize output to zero
std::memset(y, 0, info.output_size * sizeof(T));

// Place each input matrix at its diagonal position
for (size_t i = 0; i < info.num_inputs; ++i) {
size_t rows = info.input_shapes[i][0];
size_t cols = info.input_shapes[i][1];
size_t row_offset = info.row_offsets[i];
size_t col_offset = info.col_offsets[i];
const T *input = reinterpret_cast<const T *>(inputs[i]);

// Copy input matrix to output at diagonal position
for (size_t r = 0; r < rows; ++r) {
for (size_t c = 0; c < cols; ++c) {
size_t out_row = row_offset + r;
size_t out_col = col_offset + c;
size_t out_idx = out_row * info.output_shape[1] + out_col;
size_t in_idx = r * cols + c;
y[out_idx] = input[in_idx];
}
}
}
}

infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *y,
const void **inputs,
void *stream) const {

switch (_dtype) {
case INFINI_DTYPE_F16: {
const fp16_t **typed_inputs = reinterpret_cast<const fp16_t **>(inputs);
block_diag_impl<fp16_t>(_info, reinterpret_cast<fp16_t *>(y), typed_inputs);
break;
}
case INFINI_DTYPE_BF16: {
const bf16_t **typed_inputs = reinterpret_cast<const bf16_t **>(inputs);
block_diag_impl<bf16_t>(_info, reinterpret_cast<bf16_t *>(y), typed_inputs);
break;
}
case INFINI_DTYPE_F32: {
const float **typed_inputs = reinterpret_cast<const float **>(inputs);
block_diag_impl<float>(_info, reinterpret_cast<float *>(y), typed_inputs);
break;
}
case INFINI_DTYPE_F64: {
const double **typed_inputs = reinterpret_cast<const double **>(inputs);
block_diag_impl<double>(_info, reinterpret_cast<double *>(y), typed_inputs);
break;
}
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

return INFINI_STATUS_SUCCESS;
}

} // namespace op::block_diag::cpu
56 changes: 56 additions & 0 deletions src/infiniop/ops/block_diag/cpu/block_diag_cpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#ifndef __BLOCK_DIAG_CPU_H__
#define __BLOCK_DIAG_CPU_H__

#include "../../../operator.h"
#include "../../../devices/cpu/common_cpu.h"
#include <vector>

namespace op::block_diag::cpu {

struct BlockDiagInfo {
size_t num_inputs;
std::vector<std::vector<size_t>> input_shapes; // Each input is 2D matrix
std::vector<size_t> output_shape; // 2D output
std::vector<size_t> row_offsets; // Row offset for each input matrix
std::vector<size_t> col_offsets; // Column offset for each input matrix
size_t output_size;

static utils::Result<BlockDiagInfo> create(
infiniopTensorDescriptor_t *input_descs,
size_t num_inputs,
infiniopTensorDescriptor_t y_desc);
};

class Descriptor final : public InfiniopDescriptor {
infiniDtype_t _dtype;
BlockDiagInfo _info;

Descriptor(infiniDtype_t dtype, BlockDiagInfo info,
infiniDevice_t device_type, int device_id)
: InfiniopDescriptor{device_type, device_id},
_dtype(dtype),
_info(std::move(info)) {}

public:
~Descriptor();

static infiniStatus_t create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t y_desc,
infiniopTensorDescriptor_t *input_descs,
size_t num_inputs);

size_t workspaceSize() const { return 0; }

infiniStatus_t calculate(
void *workspace,
size_t workspace_size,
void *y,
const void **inputs,
void *stream) const;
};

} // namespace op::block_diag::cpu

#endif // __BLOCK_DIAG_CPU_H__
47 changes: 47 additions & 0 deletions src/infiniop/ops/block_diag/cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#pragma once
#include <cuda_runtime.h>
#include <type_traits>

namespace op::cuda {

template <typename T>
__global__ void block_diag_kernel(
T *output,
const T **inputs,
size_t num_inputs,
size_t output_rows,
size_t output_cols,
size_t *row_offsets,
size_t *col_offsets,
size_t *input_rows,
size_t *input_cols) {

size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
size_t total = output_rows * output_cols;

if (idx >= total) return;

size_t out_row = idx / output_cols;
size_t out_col = idx % output_cols;

// Find which input matrix this output position belongs to
for (size_t i = 0; i < num_inputs; ++i) {
size_t row_start = row_offsets[i];
size_t row_end = row_start + input_rows[i];
size_t col_start = col_offsets[i];
size_t col_end = col_start + input_cols[i];

if (out_row >= row_start && out_row < row_end &&
out_col >= col_start && out_col < col_end) {
// This position belongs to input i
size_t in_row = out_row - row_start;
size_t in_col = out_col - col_start;
size_t in_idx = in_row * input_cols[i] + in_col;
output[idx] = inputs[i][in_idx];
return;
}
}
// Outside all blocks: should be zero (already initialized)
}

} // namespace op::cuda
Loading