Skip to content

Commit

Permalink
Add flags argument to slurm_kill_job_step()
Browse files (browse the repository at this point in the history)
job_step_kill_msg_t already has a flags field to populate.

Bug 11360
  • Loading branch information
naterini authored and dannyauble committed Mar 22, 2023
1 parent 293656b commit 165f410
Show file tree
Hide file tree
Showing 22 changed files with 46 additions and 41 deletions.
1 change: 1 addition & 0 deletions RELEASE_NOTES
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ API CHANGES
-- cli_filter/lua - return nil for unset time options rather than the string
"2982616-04:14:00" (which is the internal macro "NO_VAL" represented as
time string).
-- "flags" argument was added to slurm_kill_job_step().
5 changes: 2 additions & 3 deletions contribs/perlapi/libslurm/perl/Slurm.xs
Original file line number Diff line number Diff line change
Expand Up @@ -484,16 +484,15 @@ slurm_kill_job(slurm_t self, uint32_t job_id, uint16_t signal, uint16_t batch_fl
C_ARGS:
job_id, signal, batch_flag

int
slurm_kill_job_step(slurm_t self, uint32_t job_id, uint32_t step_id, uint16_t signal)
int slurm_kill_job_step(slurm_t self, uint32_t job_id, uint32_t step_id, uint16_t signal, uint16_t flags)
INIT:
if (self); /* this is needed to avoid a warning about
unused variables. But if we take slurm_t self
out of the mix Slurm-> doesn't work,
only Slurm::
*/
C_ARGS:
job_id, step_id, signal
job_id, step_id, signal, flags

int
slurm_signal_job(slurm_t self, uint32_t job_id, uint16_t signal)
Expand Down
6 changes: 3 additions & 3 deletions slurm/slurm.h
Original file line number Diff line number Diff line change
Expand Up @@ -3615,11 +3615,11 @@ extern int slurm_kill_job(uint32_t job_id, uint16_t signal, uint16_t flags);
* IN job_id - the job's id
* IN step_id - the job step's id
* IN signal - signal number
* IN flags - see KILL_* or 0 for no flags
* RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
*/
extern int slurm_kill_job_step(uint32_t job_id,
uint32_t step_id,
uint16_t signal);
extern int slurm_kill_job_step(uint32_t job_id, uint32_t step_id,
uint16_t signal, uint16_t flags);
/*
* slurm_kill_job2 - send REQUEST_KILL_JOB msg to an existing job or step.
* IN job_id - the job's id (in a string format)
Expand Down
6 changes: 3 additions & 3 deletions src/api/cancel.c
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,8 @@ slurm_kill_job (uint32_t job_id, uint16_t signal, uint16_t flags)
* IN signal - signal number
* IN flags - see KILL_* or 0 for no flags
* RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
*/
extern int
slurm_kill_job_step (uint32_t job_id, uint32_t step_id, uint16_t signal)
extern int slurm_kill_job_step(uint32_t job_id, uint32_t step_id,
uint16_t signal, uint16_t flags)
{
int rc;
slurm_msg_t msg;
Expand All @@ -125,7 +125,7 @@ slurm_kill_job_step (uint32_t job_id, uint32_t step_id, uint16_t signal)
req.step_id.step_id = step_id;
req.step_id.step_het_comp = NO_VAL;
req.signal = signal;
req.flags = 0;
req.flags = flags;
msg.msg_type = REQUEST_CANCEL_JOB_STEP;
msg.data = &req;

Expand Down
2 changes: 1 addition & 1 deletion src/api/slurm_pmi.c
Original file line number Diff line number Diff line change
Expand Up @@ -388,5 +388,5 @@ void slurm_pmi_finalize(void)
extern int slurm_pmi_kill_job_step(uint32_t job_id, uint32_t step_id,
uint16_t signal)
{
return slurm_kill_job_step(job_id, step_id, signal);
return slurm_kill_job_step(job_id, step_id, signal, 0);
}
6 changes: 3 additions & 3 deletions src/api/step_launch.c
Original file line number Diff line number Diff line change
Expand Up @@ -624,7 +624,7 @@ static void _step_abort(slurm_step_ctx_t *ctx)

if (!sls->abort_action_taken) {
slurm_kill_job_step(ctx->job_id, ctx->step_resp->job_step_id,
SIGKILL);
SIGKILL, 0);
sls->abort_action_taken = true;
}
}
Expand Down Expand Up @@ -694,7 +694,7 @@ void slurm_step_launch_wait_finish(slurm_step_ctx_t *ctx)
slurm_kill_job_step(ctx->job_id,
ctx->step_resp->
job_step_id,
SIGKILL);
SIGKILL, 0);
sls->abort_action_taken = true;
}
if (!time_set) {
Expand Down Expand Up @@ -725,7 +725,7 @@ void slurm_step_launch_wait_finish(slurm_step_ctx_t *ctx)
*/
slurm_kill_job_step(ctx->job_id,
ctx->step_resp->job_step_id,
SIGKILL);
SIGKILL, 0);
client_io_handler_abort(sls->io);
break;
} else if (errnum != 0) {
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/mpi/pmi2/pmi1.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ _handle_barrier_in(int fd, int lrank, client_req_t *req)
/* cancel the step to avoid tasks hang */
slurm_kill_job_step(job_info.step_id.job_id,
job_info.step_id.step_id,
SIGKILL);
SIGKILL, 0);
} else {
waiting_kvs_resp = 1;
}
Expand Down Expand Up @@ -228,7 +228,7 @@ _handle_abort(int fd, int lrank, client_req_t *req)
debug3("mpi/pmi2: in _handle_abort");
/* no response needed. just cancel the job */
slurm_kill_job_step(job_info.step_id.job_id, job_info.step_id.step_id,
SIGKILL);
SIGKILL, 0);
debug3("mpi/pmi2: out _handle_abort");
return SLURM_SUCCESS;
}
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/mpi/pmi2/pmi2.c
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ _handle_abort(int fd, int lrank, client_req_t *req)
/* no response needed. just cancel the job step if required */
if (is_world) {
slurm_kill_job_step(job_info.step_id.job_id,
job_info.step_id.step_id, SIGKILL);
job_info.step_id.step_id, SIGKILL, 0);
}
return rc;
}
Expand Down Expand Up @@ -308,7 +308,7 @@ _handle_kvs_fence(int fd, int lrank, client_req_t *req)
/* cancel the step to avoid tasks hang */
slurm_kill_job_step(job_info.step_id.job_id,
job_info.step_id.step_id,
SIGKILL);
SIGKILL, 0);
} else {
waiting_kvs_resp = 1;
}
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/mpi/pmi2/ring.c
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ static int pmix_stepd_send(const char* buf, uint32_t size, int rank)
/* cancel the step to avoid tasks hang */
slurm_kill_job_step(job_info.step_id.job_id,
job_info.step_id.step_id,
SIGKILL);
SIGKILL, 0);
}

/* didn't succeed, but we'll retry,
Expand Down
4 changes: 2 additions & 2 deletions src/plugins/mpi/pmi2/tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ static int _handle_kvs_fence(int fd, buf_t *buf)
/* cancel the step to avoid tasks hang */
slurm_kill_job_step(job_info.step_id.job_id,
job_info.step_id.step_id,
SIGKILL);
SIGKILL, 0);
} else {
if (in_stepd())
waiting_kvs_resp = 1;
Expand Down Expand Up @@ -213,7 +213,7 @@ static int _handle_kvs_fence_resp(int fd, buf_t *buf)
send_kvs_fence_resp_to_clients(rc, errmsg);
if (rc != SLURM_SUCCESS) {
slurm_kill_job_step(job_info.step_id.job_id,
job_info.step_id.step_id, SIGKILL);
job_info.step_id.step_id, SIGKILL, 0);
}
return rc;

Expand Down
2 changes: 1 addition & 1 deletion src/plugins/mpi/pmix/mpi_pmix.c
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ extern int mpi_p_slurmstepd_prefork(const stepd_step_rec_t *step, char ***env)
err_ext:
/* Abort the whole job if error! */
slurm_kill_job_step(step->step_id.job_id,
step->step_id.step_id, SIGKILL);
step->step_id.step_id, SIGKILL, 0);
return ret;
}

Expand Down
3 changes: 2 additions & 1 deletion src/plugins/mpi/pmix/pmixp_client.c
Original file line number Diff line number Diff line change
Expand Up @@ -783,7 +783,8 @@ extern int pmixp_lib_abort(int status, void *cbfunc, void *cbdata)
*/
pmixp_abort_propagate(status);

slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL);
slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL,
0);

if (abort_cbfunc)
abort_cbfunc(PMIX_SUCCESS, cbdata);
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/mpi/pmix/pmixp_client_v2.c
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,8 @@ static void _errhandler(size_t evhdlr_registration_id,
/* FIXME: use proper specificator for nranges */
PMIXP_ERROR("Error handler invoked: status = %d, source = [%s:%d]",
(int) status, source->nspace, source->rank);
slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL);
slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL,
0);
}

static pmix_server_module_t slurm_pmix_cb = {
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/mpi/pmix/pmixp_coll_ring.c
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,7 @@ int pmixp_coll_ring_check(pmixp_coll_t *coll, pmixp_coll_ring_msg_hdr_t *hdr)
hdr->seq, nodename, hdr->nodeid, coll->seq);
pmixp_debug_hang(0); /* enable hang to debug this! */
slurm_kill_job_step(pmixp_info_jobid(),
pmixp_info_stepid(), SIGKILL);
pmixp_info_stepid(), SIGKILL, 0);
xfree(nodename);
return SLURM_SUCCESS;
} else if (PMIXP_COLL_REQ_SKIP == rc) {
Expand Down
14 changes: 7 additions & 7 deletions src/plugins/mpi/pmix/pmixp_coll_tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ static void _reset_coll(pmixp_coll_t *coll)
/* collective is spoiled, reset state */
tree->state = PMIXP_COLL_TREE_SYNC;
slurm_kill_job_step(pmixp_info_jobid(),
pmixp_info_stepid(), SIGKILL);
pmixp_info_stepid(), SIGKILL, 0);
}
}

Expand Down Expand Up @@ -610,7 +610,7 @@ static int _progress_ufwd(pmixp_coll_t *coll)
/* collective is spoiled, reset state */
tree->state = PMIXP_COLL_TREE_SYNC;
slurm_kill_job_step(pmixp_info_jobid(),
pmixp_info_stepid(), SIGKILL);
pmixp_info_stepid(), SIGKILL, 0);
return false;
}

Expand Down Expand Up @@ -742,7 +742,7 @@ static int _progress_ufwd_sc(pmixp_coll_t *coll)
/* collective is spoiled, reset state */
tree->state = PMIXP_COLL_TREE_SYNC;
slurm_kill_job_step(pmixp_info_jobid(),
pmixp_info_stepid(), SIGKILL);
pmixp_info_stepid(), SIGKILL, 0);
return false;
}

Expand Down Expand Up @@ -836,7 +836,7 @@ static int _progress_dfwd(pmixp_coll_t *coll)
/* collective is spoiled, reset state */
tree->state = PMIXP_COLL_TREE_SYNC;
slurm_kill_job_step(pmixp_info_jobid(),
pmixp_info_stepid(), SIGKILL);
pmixp_info_stepid(), SIGKILL, 0);
return false;
}
#ifdef PMIXP_COLL_DEBUG
Expand Down Expand Up @@ -944,7 +944,7 @@ int pmixp_coll_tree_local(pmixp_coll_t *coll, char *data, size_t size,
/* collective is spoiled, reset state */
tree->state = PMIXP_COLL_TREE_SYNC;
slurm_kill_job_step(pmixp_info_jobid(),
pmixp_info_stepid(), SIGKILL);
pmixp_info_stepid(), SIGKILL, 0);
ret = SLURM_ERROR;
goto exit;
}
Expand Down Expand Up @@ -1143,7 +1143,7 @@ int pmixp_coll_tree_child(pmixp_coll_t *coll, uint32_t peerid, uint32_t seq,
_reset_coll(coll);
error2:
slurm_kill_job_step(pmixp_info_jobid(),
pmixp_info_stepid(), SIGKILL);
pmixp_info_stepid(), SIGKILL, 0);
/* unlock the structure */
slurm_mutex_unlock(&coll->lock);

Expand Down Expand Up @@ -1292,7 +1292,7 @@ int pmixp_coll_tree_parent(pmixp_coll_t *coll, uint32_t peerid, uint32_t seq,
_reset_coll(coll);
error2:
slurm_kill_job_step(pmixp_info_jobid(),
pmixp_info_stepid(), SIGKILL);
pmixp_info_stepid(), SIGKILL, 0);
slurm_mutex_unlock(&coll->lock);

return SLURM_ERROR;
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/mpi/pmix/pmixp_dconn.h
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ static inline int pmixp_dconn_connect(
xfree(nodename);
pmixp_debug_hang(0); /* enable hang to debug this! */
slurm_kill_job_step(pmixp_info_jobid(),
pmixp_info_stepid(), SIGKILL);
pmixp_info_stepid(), SIGKILL, 0);
}
return rc;
}
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/mpi/pmix/pmixp_debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
#define PMIXP_ABORT(format, args...) { \
PMIXP_ERROR(format, ##args); \
slurm_kill_job_step(pmixp_info_jobid(), \
pmixp_info_stepid(), SIGKILL); \
pmixp_info_stepid(), SIGKILL, 0); \
}

#define PMIXP_ERROR_NO(err, format, args...) { \
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/mpi/pmix/pmixp_server.c
Original file line number Diff line number Diff line change
Expand Up @@ -833,7 +833,7 @@ static void _process_server_request(pmixp_base_hdr_t *hdr, buf_t *buf)
hdr->seq, nodename, hdr->nodeid, coll->seq);
pmixp_debug_hang(0); /* enable hang to debug this! */
slurm_kill_job_step(pmixp_info_jobid(),
pmixp_info_stepid(), SIGKILL);
pmixp_info_stepid(), SIGKILL, 0);
xfree(nodename);
break;
} else if (PMIXP_COLL_REQ_SKIP == rc) {
Expand Down
2 changes: 1 addition & 1 deletion src/scancel/scancel.c
Original file line number Diff line number Diff line change
Expand Up @@ -854,7 +854,7 @@ _cancel_step_id (void *ci)
START_TIMER;
if ((!sig_set) || opt.ctld)
error_code = slurm_kill_job_step(job_id, step_id,
cancel_info->sig);
cancel_info->sig, 0);
else if (cancel_info->sig == SIGKILL)
error_code = slurm_terminate_job_step(job_id, step_id);
else
Expand Down
5 changes: 3 additions & 2 deletions src/srun/launch.c
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ static int _step_signal(int signal)
while ((my_srun_job = (srun_job_t *) list_next(iter))) {
info("Terminating %ps", &my_srun_job->step_id);
rc2 = slurm_kill_job_step(my_srun_job->step_id.job_id,
my_srun_job->step_id.step_id, signal);
my_srun_job->step_id.step_id, signal,
0);
if (rc2)
rc = rc2;
}
Expand Down Expand Up @@ -269,7 +270,7 @@ _handle_openmpi_port_error(const char *tasks, const char *hosts,
hosts, tasks, msg);

info("Terminating job step %ps", &step_id);
slurm_kill_job_step(step_id.job_id, step_id.step_id, SIGKILL);
slurm_kill_job_step(step_id.job_id, step_id.step_id, SIGKILL, 0);
}

static char *_mpir_get_host_name(char *node_name)
Expand Down
8 changes: 5 additions & 3 deletions src/srun/srun_job.c
Original file line number Diff line number Diff line change
Expand Up @@ -1610,7 +1610,8 @@ job_force_termination(srun_job_t *job)
}
if (kill_sent == 1) {
/* Try sending SIGKILL through slurmctld */
slurm_kill_job_step(job->step_id.job_id, job->step_id.step_id, SIGKILL);
slurm_kill_job_step(job->step_id.job_id,
job->step_id.step_id, SIGKILL, 0);
}
}
kill_sent++;
Expand Down Expand Up @@ -2294,13 +2295,14 @@ static int _shepherd_spawn(srun_job_t *job, List srun_job_list, bool got_alloc)
job_iter = list_iterator_create(srun_job_list);
while ((job = list_next(job_iter))) {
(void) slurm_kill_job_step(job->step_id.job_id, job->step_id.step_id,
SIGKILL);
SIGKILL, 0);
if (got_alloc)
slurm_complete_job(job->step_id.job_id, NO_VAL);
}
list_iterator_destroy(job_iter);
} else {
(void) slurm_kill_job_step(job->step_id.job_id, job->step_id.step_id, SIGKILL);
(void) slurm_kill_job_step(job->step_id.job_id,
job->step_id.step_id, SIGKILL, 0);
if (got_alloc)
slurm_complete_job(job->step_id.job_id, NO_VAL);
}
Expand Down
2 changes: 1 addition & 1 deletion src/sview/job_info.c
Original file line number Diff line number Diff line change
Expand Up @@ -625,7 +625,7 @@ static int _cancel_step_id(uint32_t job_id, uint32_t step_id,
for (i = 0; i < MAX_CANCEL_RETRY; i++) {
/* NOTE: RPC always sent to slurmctld rather than directly
* to slurmd daemons */
error_code = slurm_kill_job_step(job_id, step_id, signal);
error_code = slurm_kill_job_step(job_id, step_id, signal, 0);

if (error_code == 0
|| (errno != ESLURM_TRANSITION_STATE_NO_UPDATE
Expand Down

0 comments on commit 165f410

Please sign in to comment.