Skip to content

Commit 66b78a7

Browse files
Add placeholder operator (#239)
Co-authored-by: Noli Gerawork <[email protected]>
1 parent f7c6867 commit 66b78a7

31 files changed

+887
-460
lines changed

ark/api/executor.cpp

+124-75
Large diffs are not rendered by default.

ark/api/tensor.cpp

-12
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,6 @@
99

1010
namespace ark {
1111

12-
Tensor::Tensor(void* data_ptr, int32_t device_id,
13-
const std::vector<int64_t>& shape, const DataType& dtype) {
14-
size_t external_data_size = std::accumulate(shape.begin(), shape.end(), 1,
15-
std::multiplies<int64_t>()) *
16-
dtype.bytes();
17-
auto buffer =
18-
std::make_shared<ModelBuffer>(data_ptr, external_data_size, device_id);
19-
auto tensor = std::make_shared<ModelTensor>(
20-
dtype.ref(), buffer, Dims(shape), Dims(shape), Dims(), Dims());
21-
ref_ = tensor;
22-
}
23-
2412
size_t Tensor::id() const {
2513
if (ref_) {
2614
return ref_->id();

ark/codegen.cpp

+29-31
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,17 @@
44
#include "codegen.hpp"
55

66
#include <list>
7+
#include <utility>
78

89
#include "ark/data_type.hpp"
910
#include "env.h"
11+
#include "external_buffer_registry.hpp"
1012
#include "file_io.h"
1113
#include "logging.hpp"
1214
#include "model/model_buffer.hpp"
1315
#include "model/model_data_type.hpp"
1416
#include "model/model_op.hpp"
1517
#include "model/model_tensor.hpp"
16-
#include "model_buffer_manager.hpp"
1718
#include "range.hpp"
1819
#include "utils/utils_math.hpp"
1920

@@ -55,8 +56,8 @@ class CodeGenerator::Impl {
5556
public:
5657
Impl(const PlanJson &plan,
5758
const std::map<size_t, size_t> &buffer_id_to_offset,
58-
const std::vector<std::string> &external_args,
59-
const std::map<size_t, std::string> &buffer_id_to_name,
59+
const std::map<size_t, std::pair<std::string, void *>>
60+
&buffer_id_to_kernel_arg,
6061
const std::string &name);
6162
~Impl() = default;
6263

@@ -82,8 +83,7 @@ class CodeGenerator::Impl {
8283
friend class CodeGenerator;
8384

8485
std::map<size_t, size_t> buffer_id_to_offset_;
85-
std::vector<std::string> external_args_;
86-
std::map<size_t, std::string> buffer_id_to_name_;
86+
std::map<size_t, std::pair<std::string, void *>> buffer_id_to_kernel_arg_;
8787
std::string name_;
8888
int rank_;
8989
int world_size_;
@@ -92,14 +92,13 @@ class CodeGenerator::Impl {
9292
std::string code_;
9393
};
9494

95-
CodeGenerator::Impl::Impl(
96-
const PlanJson &plan, const std::map<size_t, size_t> &buffer_id_to_offset,
97-
const std::vector<std::string> &external_args,
98-
const std::map<size_t, std::string> &buffer_id_to_name,
99-
const std::string &name)
95+
CodeGenerator::Impl::Impl(const PlanJson &plan,
96+
const std::map<size_t, size_t> &buffer_id_to_offset,
97+
const std::map<size_t, std::pair<std::string, void *>>
98+
&buffer_id_to_kernel_arg,
99+
const std::string &name)
100100
: buffer_id_to_offset_(buffer_id_to_offset),
101-
external_args_(external_args),
102-
buffer_id_to_name_(buffer_id_to_name),
101+
buffer_id_to_kernel_arg_(buffer_id_to_kernel_arg),
103102
name_(name) {
104103
rank_ = plan.at("Rank");
105104
world_size_ = plan.at("WorldSize");
@@ -192,9 +191,10 @@ CodeGenerator::Impl::Impl(
192191

193192
// Generate the global arguments
194193
std::stringstream global_args_ss, function_args_ss, arg_types_ss;
195-
for (const auto &arg : external_args_) {
196-
global_args_ss << "void *" << arg << ", ";
197-
function_args_ss << arg << ", ";
194+
for (const auto &[buf_id, kernel_arg] : buffer_id_to_kernel_arg_) {
195+
const auto &arg_name = kernel_arg.first;
196+
global_args_ss << "void *" << arg_name << ", ";
197+
function_args_ss << arg_name << ", ";
198198
arg_types_ss << "void *, ";
199199
}
200200
std::string global_args = global_args_ss.str();
@@ -219,7 +219,7 @@ CodeGenerator::Impl::Impl(
219219
{"@NUM_WARPS_PER_BLOCK@", std::to_string(num_warps_per_proc_)},
220220
{"@DEFINITIONS@", definitions_ss.str()},
221221
{"@BODY@", body_ss.str()},
222-
{"@NAME@", (name_.empty() ? "" : "_" + name_)},
222+
{"@NAME@", (!name_.empty() ? name_ : "")},
223223
{"@GLOBAL_ARGS@", global_args},
224224
{"@FUNCTION_ARGS@", function_args},
225225
{"@ARG_TYPES@", arg_types},
@@ -273,29 +273,28 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) {
273273
if (arg.type_name() == "TENSOR") {
274274
auto tns = arg.value<ModelTensorRef>();
275275
size_t buffer_id = tns->buffer()->id();
276-
if (buffer_id_to_name_.find(buffer_id) ==
277-
buffer_id_to_name_.end()) {
276+
auto it = buffer_id_to_kernel_arg_.find(buffer_id);
277+
if (it == buffer_id_to_kernel_arg_.end()) {
278278
size_t buffer_offset = buffer_id_to_offset_.at(buffer_id);
279279
size_t offset = buffer_offset + ModelOffset(tns).value();
280280
ss << "(" << tns->data_type()->type_str() << "*)&_buf["
281281
<< offset << "]";
282282
} else {
283-
ss << "(" << tns->data_type()->type_str() << "*)"
284-
<< buffer_id_to_name_.at(buffer_id);
283+
const auto &name = it->second.first;
284+
ss << "(" << tns->data_type()->type_str() << "*)" << name;
285285
}
286286
} else if (arg.type_name() == "OFFSET") {
287287
auto moff = arg.value<ModelOffset>();
288288
size_t buffer_id = moff.buffer_id();
289-
if (buffer_id_to_name_.find(buffer_id) ==
290-
buffer_id_to_name_.end()) {
289+
auto it = buffer_id_to_kernel_arg_.find(buffer_id);
290+
if (it == buffer_id_to_kernel_arg_.end()) {
291291
size_t buffer_offset = buffer_id_to_offset_.at(buffer_id);
292292
size_t offset = buffer_offset + moff.value();
293293
ss << offset;
294294
} else {
295-
const std::string &buffer_name =
296-
buffer_id_to_name_.at(buffer_id);
295+
const auto &name = it->second.first;
297296
size_t offset = moff.value();
298-
ss << "(uint64_t)((char*)" << buffer_name << " + " << offset
297+
ss << "(uint64_t)((char*)" << name << " + " << offset
299298
<< ")";
300299
}
301300
} else {
@@ -372,8 +371,7 @@ std::string CodeGenerator::Impl::resource_group(
372371
n_slots = total_warps / num_warps_per_task;
373372
}
374373
if (n_slots == 0) {
375-
ERR(PlanError, "not enough resources for task group: ",
376-
tg.dump());
374+
ERR(PlanError, "not enough resources for task group: ", tg.dump());
377375
}
378376

379377
size_t task_b = *task_range.begin();
@@ -498,11 +496,11 @@ std::string CodeGenerator::Impl::sync_process_range(const Range<size_t> &range,
498496

499497
CodeGenerator::CodeGenerator(
500498
const PlanJson &plan, const std::map<size_t, size_t> &buffer_id_to_offset,
501-
const std::vector<std::string> &external_args,
502-
const std::map<size_t, std::string> &buffer_id_to_name,
499+
const std::map<size_t, std::pair<std::string, void *>>
500+
&buffer_id_to_kernel_arg,
503501
const std::string &name)
504-
: impl_(std::make_shared<Impl>(plan, buffer_id_to_offset, external_args,
505-
buffer_id_to_name, name)) {}
502+
: impl_(std::make_shared<Impl>(plan, buffer_id_to_offset,
503+
buffer_id_to_kernel_arg, name)) {}
506504

507505
std::string CodeGenerator::code() const { return impl_->code_; }
508506

ark/codegen.hpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,18 @@
77
#include <map>
88
#include <memory>
99
#include <string>
10+
#include <utility>
1011

1112
#include "model/model_json.hpp"
12-
#include "model_buffer_manager.hpp"
1313

1414
namespace ark {
1515

1616
class CodeGenerator {
1717
public:
1818
CodeGenerator(const PlanJson &plan,
1919
const std::map<size_t, size_t> &buffer_id_to_offset,
20-
const std::vector<std::string> &external_args,
21-
const std::map<size_t, std::string> &buffer_id_to_name,
20+
const std::map<size_t, std::pair<std::string, void *>>
21+
&buffer_id_to_kernel_arg,
2222
const std::string &name = "ark_kernel");
2323

2424
~CodeGenerator() = default;

ark/cpu_timer.cpp

-16
Original file line numberDiff line numberDiff line change
@@ -16,20 +16,4 @@ double cpu_timer(void) {
1616
return (tspec.tv_nsec / 1.0e9) + tspec.tv_sec;
1717
}
1818

19-
// Sleep in second.
20-
int cpu_timer_sleep(double sec) {
21-
struct timespec tspec;
22-
tspec.tv_sec = (time_t)sec;
23-
tspec.tv_nsec = (long)((sec - tspec.tv_sec) * 1.0e9);
24-
return nanosleep(&tspec, 0);
25-
}
26-
27-
// Sleep in nanosecond.
28-
int cpu_ntimer_sleep(long nsec) {
29-
struct timespec tspec;
30-
tspec.tv_sec = 0;
31-
tspec.tv_nsec = nsec;
32-
return nanosleep(&tspec, 0);
33-
}
34-
3519
} // namespace ark

ark/cpu_timer.h

-4
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,6 @@ namespace ark {
88

99
// Measure current time in second.
1010
double cpu_timer(void);
11-
// Sleep in second.
12-
int cpu_timer_sleep(double sec);
13-
// Sleep in nanosecond.
14-
int cpu_ntimer_sleep(long nsec);
1511

1612
} // namespace ark
1713

ark/external_buffer_registry.cpp

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT license.
3+
4+
#include "external_buffer_registry.hpp"
5+
6+
#include "logging.hpp"
7+
8+
namespace ark {
9+
10+
ExternalBufferRegistry &ExternalBufferRegistry::get_instance() {
11+
static ExternalBufferRegistry instance;
12+
return instance;
13+
}
14+
15+
void ExternalBufferRegistry::set(const size_t id, void *data) {
16+
buffers_[id] = data;
17+
}
18+
19+
void *ExternalBufferRegistry::get(const size_t id) const {
20+
auto it = buffers_.find(id);
21+
if (it != buffers_.end()) {
22+
return it->second;
23+
}
24+
return nullptr;
25+
}
26+
27+
void ExternalBufferRegistry::clear() { buffers_.clear(); }
28+
29+
} // namespace ark

ark/external_buffer_registry.hpp

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// Licensed under the MIT license.
3+
4+
#ifndef ARK_EXTERNAL_BUFFER_REGISTRY_HPP_
5+
#define ARK_EXTERNAL_BUFFER_REGISTRY_HPP_
6+
7+
#include <unordered_map>
8+
9+
namespace ark {
10+
// Manages externally allocated buffers (buffers corresponding to Tensors that
11+
// are the output of a `placeholder` operation) outside of ARK's memory space.
12+
class ExternalBufferRegistry {
13+
public:
14+
static ExternalBufferRegistry &get_instance();
15+
16+
void set(const size_t id, void *data);
17+
18+
void *get(const size_t id) const;
19+
20+
void clear();
21+
22+
private:
23+
// Maps buffer IDs to pointers and sizes.
24+
std::unordered_map<size_t, void *> buffers_;
25+
ExternalBufferRegistry() {}
26+
ExternalBufferRegistry(const ExternalBufferRegistry &) = delete;
27+
ExternalBufferRegistry &operator=(const ExternalBufferRegistry &) = delete;
28+
};
29+
} // namespace ark
30+
31+
#endif // ARK_EXTERNAL_BUFFER_REGISTRY_HPP_

ark/gpu/gpu.hpp

+4
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ ARK_GPU_DEFINE_TYPE_ALIAS(gpuModule, CUmodule, hipModule_t);
5353
ARK_GPU_DEFINE_TYPE_ALIAS(gpuFunction, CUfunction, hipFunction_t);
5454
ARK_GPU_DEFINE_TYPE_ALIAS(gpuFunctionAttribute, CUfunction_attribute,
5555
hipFunction_attribute);
56+
ARK_GPU_DEFINE_TYPE_ALIAS(gpuPointerAttributes, cudaPointerAttributes,
57+
hipPointerAttributes);
5658

5759
// runtime API
5860
ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuSuccess, cudaSuccess, hipSuccess);
@@ -126,6 +128,8 @@ ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuPointerAttributeSyncMemops,
126128
ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetErrorString, cudaGetErrorString,
127129
hipGetErrorString);
128130
ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetLastError, cudaGetLastError, hipGetLastError);
131+
ARK_GPU_DEFINE_FUNC_ALIAS(gpuPointerGetAttributes, cudaPointerGetAttributes,
132+
hipPointerGetAttributes);
129133
ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceGetAttribute, cudaDeviceGetAttribute,
130134
hipDeviceGetAttribute);
131135
ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceSynchronize, cudaDeviceSynchronize,

ark/include/ark/executor.hpp

+14-7
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <ark/tensor.hpp>
1010
#include <memory>
1111
#include <string>
12+
#include <unordered_map>
1213
#include <vector>
1314

1415
namespace ark {
@@ -45,10 +46,14 @@ class Executor {
4546
const std::string &name = "executor");
4647

4748
/// Launch the executor. This must be called after `compile()`.
48-
void launch(Stream stream = nullptr, bool loop_mode = true);
49+
void launch(
50+
Stream stream = nullptr, bool loop_mode = true,
51+
const std::unordered_map<Tensor, void *> &placeholder_data = {});
4952

5053
/// Run the executor for `iter` iterations.
51-
void run(int iter);
54+
void run(
55+
int iter,
56+
const std::unordered_map<Tensor, void *> &placeholder_data = {});
5257

5358
/// Wait for the previous run to finish.
5459
void wait(int64_t max_spin_count = -1);
@@ -99,13 +104,15 @@ class Model;
99104

100105
class DefaultExecutor : public Executor {
101106
public:
102-
DefaultExecutor(
103-
const Model &model, int device_id = -1, Stream stream = nullptr,
104-
const std::vector<Planner::ConfigRule> &config_rules = {},
105-
const std::string &name = "DefaultExecutor", bool loop_mode = true);
107+
DefaultExecutor(const Model &model, int device_id = -1,
108+
Stream stream = nullptr,
109+
const std::vector<Planner::ConfigRule> &config_rules = {},
110+
const std::string &name = "DefaultExecutor",
111+
bool loop_mode = true);
106112

107113
/// Launch the default executor.
108-
void launch();
114+
void launch(
115+
const std::unordered_map<Tensor, void *> &placeholder_data = {});
109116
};
110117

111118
} // namespace ark

ark/include/ark/model.hpp

+31-1
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,37 @@ class Model : public ModelGraph {
7676
const Dims &padded_shape = {}, int rank = -1,
7777
const std::string &name = "");
7878

79+
///
80+
/// Returns a tensor object associated with an external buffer.
81+
///
82+
/// @param shape Shape of the tensor, where the data of interest is.
83+
/// @param dtype Type of the tensor data.
84+
/// @param strides Strides of each dimension of the tensor, which may be
85+
/// different from the shape. @p strides can be considered as the actual
86+
/// shape of the underlying data buffer.
87+
/// @param offsets Offsets of the tensor. The data of interest starts at
88+
/// @p offsets and ends at @p offsets + @p padded_shape.
89+
/// @param padded_shape Padded shape of the tensor. Padding is used to
90+
/// reserve extra space for the tensor when computation requires it.
91+
/// Data on the padded region is allowed to be accessed by computation,
92+
/// but it is not considered as the data of interest. The padded region is
93+
/// initialized to zero only once when the Executor is launched. The padded
94+
/// shape should be greater than or equal to the @p shape, and the
95+
/// @p strides should be greater than or equal to the padded shape. If the
96+
/// @p strides are not provided, they are set to the padded shape. If the
97+
/// padded shape is not provided, it is set to the @p shape.
98+
/// @param rank Rank of the tensor. -1 means the rank of this model.
99+
/// @param name Name of the tensor.
100+
/// @param data Address of data to pass through placeholder. If provided,
101+
/// this buffer is registered with the ExternalBufferRegistry and associated
102+
/// with the tensor.
103+
/// @return Pointer to a tensor object that references the external buffer.
104+
///
105+
Tensor placeholder(const Dims &shape, const DataType &data_type,
106+
const Dims &strides = {}, const Dims &offsets = {},
107+
const Dims &padded_shape = {}, int rank = -1,
108+
void *data = nullptr, const std::string &name = "");
109+
79110
Tensor refer(Tensor input, const Dims &shape = {}, const Dims &strides = {},
80111
const Dims &offsets = {}, const Dims &padded_shape = {},
81112
const std::string &name = "");
@@ -254,7 +285,6 @@ class Model : public ModelGraph {
254285

255286
Tensor local_all_reduce(Tensor input, int gpu_id, int gpu_num,
256287
const std::string &name = "");
257-
258288
};
259289

260290
} // namespace ark

0 commit comments

Comments (0)