Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU: add parallel restore of BO content to accelerate restore #2527

Open
wants to merge 7 commits into
base: criu-dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Documentation/criu-amdgpu-plugin.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Checkpoint / Restore inside a docker container
Pytorch
Tensorflow
Using CRIU Image Streamer
Parallel Restore

DESCRIPTION
-----------
Expand Down
2 changes: 1 addition & 1 deletion criu/cr-dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -1396,7 +1396,7 @@ static int dump_zombies(void)
item->sid = pps_buf.sid;
item->pgid = pps_buf.pgid;

BUG_ON(!list_empty(&item->children));
BUG_ON(has_children(item));

if (!item->sid) {
pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n",
Expand Down
9 changes: 6 additions & 3 deletions criu/cr-restore.c
Original file line number Diff line number Diff line change
Expand Up @@ -2125,6 +2125,9 @@ static int restore_root_task(struct pstree_item *init)
__restore_switch_stage(CR_STATE_FORKING);

skip_ns_bouncing:
ret = run_plugins(POST_FORKING);
if (ret < 0 && ret != -ENOTSUP)
goto out_kill;

ret = restore_wait_inprogress_tasks();
if (ret < 0)
Expand Down Expand Up @@ -2357,9 +2360,6 @@ int cr_restore_tasks(void)
if (check_img_inventory(/* restore = */ true) < 0)
goto err;

if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
return -1;

if (init_stats(RESTORE_STATS))
goto err;

Expand Down Expand Up @@ -2391,6 +2391,9 @@ int cr_restore_tasks(void)
if (fdstore_init())
goto err;

if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
return -1;

if (inherit_fd_move_to_fdstore())
goto err;

Expand Down
4 changes: 4 additions & 0 deletions criu/include/criu-plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ enum {

CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11,

CR_PLUGIN_HOOK__POST_FORKING = 12,

CR_PLUGIN_HOOK__MAX
};

Expand All @@ -78,6 +80,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void);

enum {
CR_PLUGIN_STAGE__DUMP,
Expand Down Expand Up @@ -152,5 +155,6 @@ typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat);
typedef int(cr_plugin_update_vma_map_t)(const char *path, const uint64_t addr, const uint64_t old_pgoff,
uint64_t *new_pgoff, int *plugin_fd);
typedef int(cr_plugin_resume_devices_late_t)(int pid);
typedef int(cr_plugin_post_forking_t)(void);

#endif /* __CRIU_PLUGIN_H__ */
1 change: 1 addition & 0 deletions criu/include/pstree.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ extern void pstree_insert_pid(struct pid *pid_node);
extern struct pid *pstree_pid_by_virt(pid_t pid);

extern struct pstree_item *root_item;
extern bool has_children(struct pstree_item *item);
extern struct pstree_item *pstree_item_next(struct pstree_item *item);
#define for_each_pstree_item(pi) for (pi = root_item; pi != NULL; pi = pstree_item_next(pi))

Expand Down
1 change: 1 addition & 0 deletions criu/plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
__assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late");
__assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices");
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
__assign_hook(POST_FORKING, "cr_plugin_post_forking");

#undef __assign_hook

Expand Down
9 changes: 7 additions & 2 deletions criu/pstree.c
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ void free_pstree(struct pstree_item *root_item)
struct pstree_item *item = root_item, *parent;

while (item) {
if (!list_empty(&item->children)) {
if (has_children(item)) {
item = list_first_entry(&item->children, struct pstree_item, sibling);
continue;
}
Expand Down Expand Up @@ -241,10 +241,15 @@ int init_pstree_helper(struct pstree_item *ret)
return 0;
}

/*
 * Tell whether @item has at least one entry linked on its
 * ->children list.
 */
bool has_children(struct pstree_item *item)
{
	if (list_empty(&item->children))
		return false;

	return true;
}

/* Deep first search on children */
struct pstree_item *pstree_item_next(struct pstree_item *item)
{
if (!list_empty(&item->children))
if (has_children(item))
return list_first_entry(&item->children, struct pstree_item, sibling);

while (item->parent) {
Expand Down
2 changes: 1 addition & 1 deletion criu/seize.c
Original file line number Diff line number Diff line change
Expand Up @@ -975,7 +975,7 @@ static int collect_task(struct pstree_item *item)
if (ret < 0)
goto err_close;

if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) {
if ((item->pid->state == TASK_DEAD) && has_children(item)) {
pr_err("Zombie with children?! O_o Run, run, run!\n");
goto err_close;
}
Expand Down
2 changes: 1 addition & 1 deletion plugins/amdgpu/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ endif
criu-amdgpu.pb-c.c: criu-amdgpu.proto
protoc-c --proto_path=. --c_out=. criu-amdgpu.proto

amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c
$(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC)

amdgpu_plugin_clean:
Expand Down
23 changes: 22 additions & 1 deletion plugins/amdgpu/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ Supporting ROCm with CRIU

_Felix Kuehling <[email protected]>_<br>
_Rajneesh Bardwaj <[email protected]>_<br>
_David Yat Sin <[email protected]>_
_David Yat Sin <[email protected]>_<br>
_Yanning Yang <[email protected]>_

# Introduction

Expand Down Expand Up @@ -224,6 +225,26 @@ to resume execution on the GPUs.
*This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC
patch series.*

## Restoring BO content in parallel

Restoring buffer object (BO) content is an important part of restoring GPU
state and usually takes a significant amount of time. One possible place for
this step is the `cr_plugin_restore_file` hook. However, restoring BO content in
that hook blocks the target process from performing other restore operations,
which hinders further optimization of the restore process.

Therefore, a new plugin hook that runs in the master restore process is
introduced, and it cooperates with the `cr_plugin_restore_file` hook to complete
the restore of BO content. Specifically, the target process only needs to send
the relevant BOs to the master restore process, while the new hook performs the
actual restore of the buffer objects. This way, the target process can carry out
other restore operations while the BO content is being restored, which
accelerates the overall restore procedure. This is an implementation of the
gCROP method proposed in the ACM SoCC'24 paper: [On-demand and Parallel
Checkpoint/Restore for GPU Applications](https://dl.acm.org/doi/10.1145/3698038.3698510).

*This optimization technique is enabled by the `__POST_FORKING` hook.*

## Other CRIU changes

In addition to the new plugins, we need to make some changes to CRIU itself to
Expand Down
Loading