Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions include/infiniop/ops/erf.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef __INFINIOP_ERF_API_H__
#define __INFINIOP_ERF_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopErfDescriptor_t;

__C __export infiniStatus_t infiniopCreateErfDescriptor(infiniopHandle_t handle,
infiniopErfDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x);

__C __export infiniStatus_t infiniopGetErfWorkspaceSize(infiniopErfDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopErf(infiniopErfDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream);

__C __export infiniStatus_t infiniopDestroyErfDescriptor(infiniopErfDescriptor_t desc);

#endif
24 changes: 24 additions & 0 deletions include/infiniop/ops/erfc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef __INFINIOP_ERFC_API_H__
#define __INFINIOP_ERFC_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopErfcDescriptor_t;

__C __export infiniStatus_t infiniopCreateErfcDescriptor(infiniopHandle_t handle,
infiniopErfcDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x);

__C __export infiniStatus_t infiniopGetErfcWorkspaceSize(infiniopErfcDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopErfc(infiniopErfcDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream);

__C __export infiniStatus_t infiniopDestroyErfcDescriptor(infiniopErfcDescriptor_t desc);

#endif
24 changes: 24 additions & 0 deletions include/infiniop/ops/erfinv.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#ifndef __INFINIOP_ERFINV_API_H__
#define __INFINIOP_ERFINV_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopErfinvDescriptor_t;

__C __export infiniStatus_t infiniopCreateErfinvDescriptor(infiniopHandle_t handle,
infiniopErfinvDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x);

__C __export infiniStatus_t infiniopGetErfinvWorkspaceSize(infiniopErfinvDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopErfinv(infiniopErfinvDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream);

__C __export infiniStatus_t infiniopDestroyErfinvDescriptor(infiniopErfinvDescriptor_t desc);

#endif
25 changes: 25 additions & 0 deletions include/infiniop/ops/matrix_power.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#ifndef __INFINIOP_MATRIX_POWER_API_H__
#define __INFINIOP_MATRIX_POWER_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopMatrixPowerDescriptor_t;

__C __export infiniStatus_t infiniopCreateMatrixPowerDescriptor(infiniopHandle_t handle,
infiniopMatrixPowerDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
int n);

__C __export infiniStatus_t infiniopGetMatrixPowerWorkspaceSize(infiniopMatrixPowerDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopMatrixPower(infiniopMatrixPowerDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream);

__C __export infiniStatus_t infiniopDestroyMatrixPowerDescriptor(infiniopMatrixPowerDescriptor_t desc);

#endif
25 changes: 25 additions & 0 deletions include/infiniop/ops/pixel_shuffle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#ifndef __INFINIOP_PIXEL_SHUFFLE_API_H__
#define __INFINIOP_PIXEL_SHUFFLE_API_H__

#include "../operator_descriptor.h"

typedef struct InfiniopDescriptor *infiniopPixelShuffleDescriptor_t;

__C __export infiniStatus_t infiniopCreatePixelShuffleDescriptor(infiniopHandle_t handle,
infiniopPixelShuffleDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t y,
infiniopTensorDescriptor_t x,
int upscale_factor);

__C __export infiniStatus_t infiniopGetPixelShuffleWorkspaceSize(infiniopPixelShuffleDescriptor_t desc, size_t *size);

__C __export infiniStatus_t infiniopPixelShuffle(infiniopPixelShuffleDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *y,
const void *x,
void *stream);

__C __export infiniStatus_t infiniopDestroyPixelShuffleDescriptor(infiniopPixelShuffleDescriptor_t desc);

#endif
52 changes: 52 additions & 0 deletions src/infiniop/ops/erf/cpu/erf_cpu.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#include "erf_cpu.h"

namespace op::erf::cpu {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
auto dtype = out_desc->dtype();

const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();

CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);

CHECK_SAME_SHAPE(output_shape, input_shape);

CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {

switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<ErfOp, bf16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<ErfOp, fp16_t>(_info, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<ErfOp, float>(_info, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<ErfOp, double>(_info, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

return INFINI_STATUS_SUCCESS;
}

} // namespace op::erf::cpu
20 changes: 20 additions & 0 deletions src/infiniop/ops/erf/cpu/erf_cpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#ifndef __ERF_CPU_H__
#define __ERF_CPU_H__

#include "../../../elementwise/cpu/elementwise_cpu.h"
#include <cmath>

ELEMENTWISE_DESCRIPTOR(erf, cpu)

namespace op::erf::cpu {
typedef struct ErfOp {
public:
static constexpr size_t num_inputs = 1;
template <typename T>
T operator()(const T &x) const {
return std::erf(x);
}
} ErfOp;
} // namespace op::erf::cpu

#endif // __ERF_CPU_H__
25 changes: 25 additions & 0 deletions src/infiniop/ops/erf/cuda/kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#pragma once
#include <cmath>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <type_traits>

namespace op::cuda {

template <typename T>
struct ErfOp {
__device__ __forceinline__ T operator()(T x) const {
if constexpr (std::is_same_v<T, float>) {
return erff(x);
} else if constexpr (std::is_same_v<T, double>) {
return erf(x);
} else {
// For F16/BF16: promote to float, compute, then cast back
float xf = static_cast<float>(x);
return static_cast<T>(erff(xf));
}
}
};

} // namespace op::cuda
8 changes: 8 additions & 0 deletions src/infiniop/ops/erf/erf.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef __ERF_H__
#define __ERF_H__

#include "../../elementwise/elementwise.h"

#define DESCRIPTOR(NAMESPACE) ELEMENTWISE_DESCRIPTOR(erf, NAMESPACE)

#endif // __ERF_H__
8 changes: 8 additions & 0 deletions src/infiniop/ops/erf/metax/erf_metax.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef __ERF_METAX_API_H__
#define __ERF_METAX_API_H__

#include "../../../elementwise/metax/elementwise_metax_api.h"

ELEMENTWISE_DESCRIPTOR(erf, metax)

#endif // __ERF_METAX_API_H__
58 changes: 58 additions & 0 deletions src/infiniop/ops/erf/metax/erf_metax.maca
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#include "erf_metax.h"

#include "../../../elementwise/metax/elementwise_metax.h"

#include "../cuda/kernel.cuh"

namespace op::erf::metax {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();

const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();

CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);

CHECK_SAME_SHAPE(output_shape, input_shape);

CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {

if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}

switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::ErfOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::ErfOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::ErfOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

return INFINI_STATUS_SUCCESS;
} // namespace op::erf::metax
8 changes: 8 additions & 0 deletions src/infiniop/ops/erf/moore/erf_moore.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef __ERF_MOORE_API_H__
#define __ERF_MOORE_API_H__

#include "../../../elementwise/moore/elementwise_moore_api.h"

ELEMENTWISE_DESCRIPTOR(erf, moore)

#endif // __ERF_MOORE_API_H__
60 changes: 60 additions & 0 deletions src/infiniop/ops/erf/moore/erf_moore.mu
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#include "erf_moore.h"

#include "../../../elementwise/moore/elementwise_moore.h"

#include "erf_moore_kernel.h"

namespace op::erf::moore {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto dtype = out_desc->dtype();

const auto &input_desc = input_desc_vec.at(0);
const auto &output_shape = out_desc->shape();
const auto &input_shape = input_desc->shape();

CHECK_DTYPE(dtype, INFINI_DTYPE_BF16, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);

CHECK_SAME_SHAPE(output_shape, input_shape);

CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {

if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}

switch (_dtype) {
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, moore::ErfOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F16:
return _device_info->calculate<256, moore::ErfOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, moore::ErfOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, moore::ErfOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}

return INFINI_STATUS_SUCCESS;
}

} // namespace op::erf::moore
Loading