Skip to content

Commit

Permalink
Merge branch 'slurm-23.02'
Browse files Browse the repository at this point in the history
  • Loading branch information
wickberg committed Mar 24, 2023
2 parents b3ccf5c + 671fccf commit 2324007
Show file tree
Hide file tree
Showing 10 changed files with 59 additions and 2 deletions.
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ documents those changes that are of interest to users and administrators.
slurm_spank_job_epilog().
-- Fix srun < 23.02 always getting an "exact" core allocation.
-- Prevent scontrol < 23.02 from setting MaxCPUsPerSocket to 0.
-- Add ScronParameters=explicit_scancel and corresponding scancel --cron
option.

* Changes in Slurm 23.02.0
==========================
Expand Down
11 changes: 11 additions & 0 deletions doc/man/man1/scancel.1
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ This is the default behavior on architectures using front end nodes (e.g.
Cray ALPS computers) or when the \fB\-\-clusters\fR option is used.
.IP

.TP
\fB\-c\fR, \fB\-\-cron\fR
Confirm request to cancel a job submitted by scrontab. This option only has
effect when the "explicit_scancel" option is set in \fBScronParameters\fR.
.IP

.TP
\fB\-f\fR, \fB\-\-full\fR
By default, signals other than SIGKILL are not sent to the batch step (the shell
Expand Down Expand Up @@ -240,6 +246,11 @@ environment variables, along with their corresponding options, are listed below.
\fB\-\-ctld\fR
.IP

.TP
\fBSCANCEL_CRON\fR
\fB\-c, \-\-cron\fR
.IP

.TP
\fBSCANCEL_FULL\fR
\fB\-f, \-\-full\fR
Expand Down
6 changes: 6 additions & 0 deletions doc/man/man5/slurm.conf.5
Original file line number Diff line number Diff line change
Expand Up @@ -4434,6 +4434,12 @@ Multiple options may be comma separated.
.TP
\fBenable\fR
Enable the use of scrontab to submit and manage periodic repeating jobs.
.IP

.TP
\fBexplicit_scancel\fR
When cancelling an scrontab job, require the user to explicitly request
cancelling the job with the \fB\-\-cron\fR flag in scancel.
.RE
.IP

Expand Down
3 changes: 2 additions & 1 deletion slurm/slurm.h
Original file line number Diff line number Diff line change
Expand Up @@ -3596,7 +3596,8 @@ typedef struct job_step_kill_msg {
#define KILL_NO_SIBS SLURM_BIT(7) /* Don't kill other sibling jobs */
#define KILL_JOB_RESV SLURM_BIT(8) /* Job is willing to run on nodes in a
* magnetic reservation. */
#define KILL_NO_SIG_FAIL SLURM_BIT(9) /* Don't fail job due to signal (steps only) */
#define KILL_NO_CRON SLURM_BIT(9) /* killing cron jobs was NOT explicitly requested (no scancel --cron); rejected when ScronParameters=explicit_scancel */
#define KILL_NO_SIG_FAIL SLURM_BIT(10) /* Don't fail job due to signal (steps only) */

/* Use top bit of uint16_t in conjunction with KILL_* flags to indicate signal
* has been sent to job previously. Does not need to be passed to slurmd. */
Expand Down
1 change: 1 addition & 0 deletions slurm/slurm_errno.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ typedef enum {
ESLURM_PRIO_RESET_FAIL,
ESLURM_CANNOT_MODIFY_CRON_JOB,
ESLURM_INVALID_JOB_CONTAINER_CHANGE,
ESLURM_CANNOT_CANCEL_CRON_JOB,

ESLURM_INVALID_MCS_LABEL = 2099,
ESLURM_BURST_BUFFER_WAIT = 2100,
Expand Down
2 changes: 2 additions & 0 deletions src/common/slurm_errno.c
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,8 @@ slurm_errtab_t slurm_errtab[] = {
"Cannot modify scrontab jobs through scontrol" },
{ ERRTAB_ENTRY(ESLURM_INVALID_JOB_CONTAINER_CHANGE),
"JobContainerType change requires restart of all Slurm daemons and commands to take effect" },
{ ERRTAB_ENTRY(ESLURM_CANNOT_CANCEL_CRON_JOB),
"Cannot cancel scrontab jobs without --cron flag." },
{ ERRTAB_ENTRY(ESLURM_INVALID_MCS_LABEL),
"Invalid mcs_label specified" },
{ ERRTAB_ENTRY(ESLURM_BURST_BUFFER_WAIT),
Expand Down
13 changes: 12 additions & 1 deletion src/scancel/opt.c
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ static void _opt_default(void)
#else
opt.ctld = false;
#endif
opt.cron = false;
opt.full = false;
opt.hurry = false;
opt.interactive = false;
Expand Down Expand Up @@ -247,6 +248,9 @@ static void _opt_env(void)
if (getenv("SCANCEL_CTLD"))
opt.ctld = true;

if (getenv("SCANCEL_CRON"))
opt.cron = true;

if ( (val=getenv("SCANCEL_FULL")) ) {
if (xstrcasecmp(val, "true") == 0)
opt.full = true;
Expand Down Expand Up @@ -338,6 +342,7 @@ static void _opt_args(int argc, char **argv)
{"account", required_argument, 0, 'A'},
{"batch", no_argument, 0, 'b'},
{"ctld", no_argument, 0, OPT_LONG_CTLD},
{"cron", no_argument, 0, 'c'},
{"full", no_argument, 0, 'f'},
{"help", no_argument, 0, OPT_LONG_HELP},
{"hurry", no_argument, 0, 'H'},
Expand All @@ -363,7 +368,8 @@ static void _opt_args(int argc, char **argv)
{NULL, 0, 0, 0}
};

while ((opt_char = getopt_long(argc, argv, "A:bfHiM:n:p:Qq:R:s:t:u:vVw:",
while ((opt_char = getopt_long(argc, argv,
"A:bcfHiM:n:p:Qq:R:s:t:u:vVw:",
long_options, &option_index)) != -1) {
switch (opt_char) {
case (int)'?':
Expand All @@ -381,6 +387,9 @@ static void _opt_args(int argc, char **argv)
case OPT_LONG_CTLD:
opt.ctld = true;
break;
case (int)'c':
opt.cron = true;
break;
case (int)'f':
opt.full = true;
break;
Expand Down Expand Up @@ -627,6 +636,7 @@ static void _opt_list(void)
info("account : %s", opt.account);
info("batch : %s", tf_(opt.batch));
info("ctld : %s", tf_(opt.ctld));
info("cron : %s", tf_(opt.cron));
info("full : %s", tf_(opt.full));
info("hurry : %s", tf_(opt.hurry));
info("interactive : %s", tf_(opt.interactive));
Expand Down Expand Up @@ -697,6 +707,7 @@ static void _help(void)
printf(" -A, --account=account act only on jobs charging this account\n");
printf(" -b, --batch signal batch shell for specified job\n");
/* printf(" --ctld send request directly to slurmctld\n"); */
printf(" -c, --cron cancel an scrontab job\n");
printf(" -f, --full signal batch shell and all steps for specified job\n");
printf(" -H, --hurry avoid burst buffer stage out\n");
printf(" -i, --interactive require response from user for each job\n");
Expand Down
12 changes: 12 additions & 0 deletions src/scancel/scancel.c
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,18 @@ _cancel_job_id (void *ci)
flags |= KILL_JOB_BATCH;
job_type = "batch ";
}

/*
 * With the introduction of the ScronParameters=explicit_scancel option,
 * scancel requests for a cron job should be rejected unless the --cron
 * flag is specified.
 * To keep this option from influencing anything other than user
 * requests, the flag is inverted: scancel sets KILL_NO_CRON whenever
 * --cron was NOT given, and the request is rejected only when both
 * KILL_NO_CRON and explicit_scancel are set.
 */
if (!opt.cron)
flags |= KILL_NO_CRON;

if (opt.full) {
flags |= KILL_FULL_JOB;
job_type = "full ";
Expand Down
1 change: 1 addition & 0 deletions src/scancel/scancel.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ typedef struct scancel_options {
char *sibling; /* --sibling=<sib_name> */
bool ctld; /* --ctld */
List clusters; /* --cluster=cluster_name -Mcluster-name */
bool cron; /* --cron */
bool full; /* --full, -f */
bool hurry; /* --hurry, -H */
bool interactive; /* --interactive, -i */
Expand Down
10 changes: 10 additions & 0 deletions src/slurmctld/job_mgr.c
Original file line number Diff line number Diff line change
Expand Up @@ -5433,6 +5433,16 @@ extern int job_signal(job_record_t *job_ptr, uint16_t signal,
if (job_ptr->bit_flags & CRON_JOB) {
cron_entry_t *entry =
(cron_entry_t *) job_ptr->details->crontab_entry;
/*
 * The KILL_NO_CRON flag being set here indicates that the user has
 * NOT specifically requested killing scrontab jobs. To avoid
 * interfering with other possible ways of killing jobs, the
 * KILL_NO_CRON flag being unset must mean that killing cron jobs is
 * permitted.
 */
if (xstrcasestr(slurm_conf.scron_params, "explicit_scancel") &&
(flags & KILL_NO_CRON))
return ESLURM_CANNOT_CANCEL_CRON_JOB;
/*
 * Clear only the CRON_JOB bit so the job is no longer treated as a
 * cron job. This must be "&= ~", not "|= ~": OR-ing with the
 * complement would set every other bit in bit_flags and leave
 * CRON_JOB itself untouched.
 */
job_ptr->bit_flags &= ~CRON_JOB;
error("cancelling cron job, lines %u %u",
entry->line_start, entry->line_end);
Expand Down

0 comments on commit 2324007

Please sign in to comment.