PaddlePaddle
diff --git a/build.sh b/build.sh
Lines changed: 9 additions & 0 deletions
diff --git a/custom_ops/gpu_ops/get_padding_offset.cu b/custom_ops/gpu_ops/get_padding_offset.cu
Lines changed: 1 addition & 1 deletion
diff --git a/custom_ops/gpu_ops/helper.h b/custom_ops/gpu_ops/helper.h
Lines changed: 2 additions & 0 deletions
diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu
Lines changed: 1 addition & 1 deletion
diff --git a/custom_ops/gpu_ops/set_value_by_flags.cu b/custom_ops/gpu_ops/set_value_by_flags.cu
Lines changed: 1 addition & 1 deletion
diff --git a/custom_ops/gpu_ops/step.cu b/custom_ops/gpu_ops/step.cu
Lines changed: 1 addition & 1 deletion
diff --git a/custom_ops/gpu_ops/stop_generation_multi_ends.cu b/custom_ops/gpu_ops/stop_generation_multi_ends.cu
Lines changed: 1 addition & 1 deletion
diff --git a/custom_ops/gpu_ops/stop_generation_multi_stop_seqs.cu b/custom_ops/gpu_ops/stop_generation_multi_stop_seqs.cu
Lines changed: 1 addition & 1 deletion
diff --git a/custom_ops/gpu_ops/token_penalty_multi_scores.cu b/custom_ops/gpu_ops/token_penalty_multi_scores.cu
Lines changed: 1 addition & 1 deletion
diff --git a/custom_ops/gpu_ops/update_inputs.cu b/custom_ops/gpu_ops/update_inputs.cu
Lines changed: 1 addition & 1 deletion
@@ -83,6 +83,15 @@ function copy_ops(){
       echo -e "BASE and ROCM ops have been copy to fastdeploy"
       return
     fi
+    is_maca=`$python -c "import paddle; print(paddle.device.is_compiled_with_custom_device('metax_gpu'))"`
+    if [ "$is_maca" = "True" ]; then
+      DEVICE_TYPE="gpu"
+      mkdir -p ../fastdeploy/model_executor/ops/base
+      cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
+      cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
+      echo -e "MACA ops have been copy to fastdeploy"
+      return
+    fi
     mkdir -p ../fastdeploy/model_executor/ops/base
     is_cuda=`$python -c "import paddle; print(paddle.is_compiled_with_cuda())"`
     if [ "$is_cuda" = "True" ]; then
 
@@ -60,7 +60,7 @@ std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
                                              const paddle::Tensor &cum_offsets,
                                              const paddle::Tensor &token_num,
                                              const paddle::Tensor &seq_len) {
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
     auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(input_ids.place()));
     auto cu_stream = dev_ctx->stream();
 #else
 
@@ -509,6 +509,7 @@ static void PrintMatrix3(const T *mat_d, int num, std::string name) {
 }
 
 #ifndef PADDLE_WITH_HIP
+#ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
 __forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t *flag_addr,
                                                     int mode = 0) {
   uint32_t flag;
@@ -541,6 +542,7 @@ __forceinline__ __device__ void st_flag_release(uint32_t *flag_addr,
                  "l"(flag_addr));
   }
 }
+#endif
 
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
   int max_shared_mem_per_block_opt_in = 0;
 
@@ -91,7 +91,7 @@ std::vector<paddle::Tensor> rebuild_padding(
     typedef typename traits_::DataType DataType_;
     typedef typename traits_::data_t data_t;
 
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
     auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(tmp_out.place()));
     auto cu_stream = dev_ctx->stream();
 #else
 
@@ -52,7 +52,7 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
                            const paddle::Tensor &seq_lens_decoder,
                            const paddle::Tensor &step_idx,
                            const paddle::Tensor &stop_flags) {
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
     auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(stop_flags.place()));
     auto cu_stream = dev_ctx->stream();
 #else
 
@@ -323,7 +323,7 @@ void StepPaddle(const paddle::Tensor &stop_flags,
                 const paddle::Tensor &first_token_ids,
                 const int block_size,
                 const int encoder_decoder_block_num) {
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
     auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(seq_lens_this_time.place()));
     auto cu_stream = dev_ctx->stream();
 #else
 
@@ -74,7 +74,7 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
         }
     }
 
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
     auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(topk_ids.place()));
     auto cu_stream = dev_ctx->stream();
 #else
 
@@ -89,7 +89,7 @@ void GetStopFlagsMultiSeqs(const paddle::Tensor &topk_ids,
     PD_CHECK(topk_ids.dtype() == paddle::DataType::INT64);
     PD_CHECK(stop_flags.dtype() == paddle::DataType::BOOL);
 
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
     auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(topk_ids.place()));
     auto cu_stream = dev_ctx->stream();
 #else
 
@@ -156,7 +156,7 @@ void token_penalty_multi_scores_kernel(const paddle::Tensor &pre_ids,
     typedef PDTraits<D> traits_;
     typedef typename traits_::DataType DataType_;
     typedef typename traits_::data_t data_t;
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
     auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(logits.place()));
     auto cu_stream = dev_ctx->stream();
 #else
 
@@ -75,7 +75,7 @@ void UpdateInputes(const paddle::Tensor &stop_flags,
                    const paddle::Tensor &stop_nums,
                    const paddle::Tensor &next_tokens,
                    const paddle::Tensor &is_block_step) {
-#ifdef PADDLE_WITH_CUSTOM_DEVICE
+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
     auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(input_ids.place()));
     auto cu_stream = dev_ctx->stream();
 #else
Original file line number	Diff line number	Diff line change
`@@ -509,6 +509,7 @@ static void PrintMatrix3(const T *mat_d, int num, std::string name) {`
`509`	`509`	`}`
`510`	`510`
`511`	`511`	`#ifndef PADDLE_WITH_HIP`
	`512`	`+#ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU`
`512`	`513`	`__forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t *flag_addr,`
`513`	`514`	`int mode = 0) {`
`514`	`515`	`uint32_t flag;`
`@@ -541,6 +542,7 @@ __forceinline__ __device__ void st_flag_release(uint32_t *flag_addr,`
`541`	`542`	`"l"(flag_addr));`
`542`	`543`	`}`
`543`	`544`	`}`
	`545`	`+#endif`
`544`	`546`
`545`	`547`	`inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {`
`546`	`548`	`int max_shared_mem_per_block_opt_in = 0;`
Original file line number	Diff line number	Diff line change
`@@ -74,7 +74,7 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,`
`74`	`74`	`}`
`75`	`75`	`}`
`76`	`76`
`77`		`-#ifdef PADDLE_WITH_CUSTOM_DEVICE`
	`77`	`+#if defined(PADDLE_WITH_CUSTOM_DEVICE) && !defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)`
`78`	`78`	`auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(topk_ids.place()));`
`79`	`79`	`auto cu_stream = dev_ctx->stream();`
`80`	`80`	`#else`