Commit 5848c7f

Merge branch 'master' into fix_generate

2 parents 5759518 + dc8d99f commit 5848c7f

18 files changed: +559 -31 lines

evaluation/generation/generate.py

Lines changed: 1 addition & 1 deletion

@@ -47,7 +47,7 @@ def main():
         max_cpu_memory=args.max_cpu_memory if args.parallelize else None,
         offload_folder=args.offload_folder if args.parallelize else None,
     )
-
+
     print(f"Loaded model in {datetime.datetime.now() - start}")

     text = ''
Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --job-name=tr11b_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-1362%1
#SBATCH --partition=cpu_p1

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> one out of 4 checkpoints for 1B tokens
readarray CHECKPOINTS < <(find . -regex '\./tr11b-1B3-ml/.*/global_step[0-9]*000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname $CHECKPOINT_TO_TAR)
DIRNAME=${TEMPNAME:2}
BASENAME=$(basename $CHECKPOINT_TO_TAR)

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p $CHECKPOINT_TAR_TO_FOLDER
echo $CHECKPOINT_TO_TAR
echo $CHECKPOINT_TAR_TO_FOLDER

# cvfj for bz2 compression; won't change much
tar cvf $CHECKPOINT_TAR_TO $CHECKPOINT_TO_TAR

popd

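The per-model scripts that follow repeat this same pattern; only the job name, the --array range and the find regex change. As a minimal sketch of the path handling above, using an illustrative checkpoint path of the shape the find regex matches (the global_step value is made up; the checkpoints/main layout is the one the last script in this commit assumes):

# example path as emitted by `find .` -- note the leading "./"
CHECKPOINT_TO_TAR="./tr11b-1B3-ml/checkpoints/main/global_step10000"

TEMPNAME=$(dirname "$CHECKPOINT_TO_TAR")    # ./tr11b-1B3-ml/checkpoints/main
DIRNAME=${TEMPNAME:2}                       # drop the leading "./" -> tr11b-1B3-ml/checkpoints/main
BASENAME=$(basename "$CHECKPOINT_TO_TAR")   # global_step10000

# the tarball therefore lands at:
#   $six_ALL_CCFRSTORE/checkpoints/tr11b-1B3-ml/checkpoints/main/global_step10000.tar
echo "$six_ALL_CCFRSTORE/checkpoints/$DIRNAME/$BASENAME.tar"
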
Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --job-name=tr11c_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-239%1
#SBATCH --partition=cpu_p1

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> one out of 4 checkpoints for 1B tokens
readarray CHECKPOINTS < <(find . -regex '\./tr11c-2B5-ml/.*/global_step[0-9]*000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname $CHECKPOINT_TO_TAR)
DIRNAME=${TEMPNAME:2}
BASENAME=$(basename $CHECKPOINT_TO_TAR)

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p $CHECKPOINT_TAR_TO_FOLDER
echo $CHECKPOINT_TO_TAR
echo $CHECKPOINT_TAR_TO

# cvfj for bz2 compression; won't change much
tar cvf $CHECKPOINT_TAR_TO $CHECKPOINT_TO_TAR

popd

Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --job-name=tr11d_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-166%1
#SBATCH --partition=cpu_p1

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 256 -> one out of 8 checkpoints for 1B tokens
readarray CHECKPOINTS < <(find . -regex '\./tr11d-760M-ml/.*/global_step[0-9]*[02468]000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname $CHECKPOINT_TO_TAR)
DIRNAME=${TEMPNAME:2}
BASENAME=$(basename $CHECKPOINT_TO_TAR)

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p $CHECKPOINT_TAR_TO_FOLDER
echo $CHECKPOINT_TO_TAR
echo $CHECKPOINT_TAR_TO

# cvfj for bz2 compression; won't change much
tar cvf $CHECKPOINT_TAR_TO $CHECKPOINT_TO_TAR

popd

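A note on the sampling comments in these scripts: assuming the tr11 runs use 2048-token sequences and write a checkpoint every 250 steps (neither is stated in this diff), keeping every `global_step` multiple of 1000 at batch size 512, or of 2000 at batch size 256, spaces the kept checkpoints roughly 1B tokens apart, which is where "one out of 4" and "one out of 8" come from. A quick check of that arithmetic:

# assuming 2048-token sequences (assumption, not from this diff):
echo $(( 1000 * 512 * 2048 ))   # ~1.05B tokens between kept batch-512 checkpoints
echo $(( 2000 * 256 * 2048 ))   # ~1.05B tokens between kept batch-256 checkpoints
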
Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
#!/bin/bash
#SBATCH --job-name=move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-276%1
#SBATCH --partition=cpu_p1

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 256 -> one out of 8 checkpoints for 1B tokens
readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*[02468]000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname $CHECKPOINT_TO_TAR)
DIRNAME=${TEMPNAME:2}
BASENAME=$(basename $CHECKPOINT_TO_TAR)

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p $CHECKPOINT_TAR_TO_FOLDER
echo $CHECKPOINT_TO_TAR
echo $CHECKPOINT_TAR_TO

# cvfj for bz2 compression; won't change much
tar cvf $CHECKPOINT_TAR_TO $CHECKPOINT_TO_TAR

popd

Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
#!/bin/bash
#SBATCH --job-name=tr11f_move_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --array=0-155%1
#SBATCH --partition=cpu_p1

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-6549

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> one out of 4 checkpoints for 1B tokens
readarray CHECKPOINTS < <(find . -regex '\./tr11f-6B3-ml/.*/global_step[0-9]*000')

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR=${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname $CHECKPOINT_TO_TAR)
DIRNAME=${TEMPNAME:2}
BASENAME=$(basename $CHECKPOINT_TO_TAR)

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p $CHECKPOINT_TAR_TO_FOLDER
echo $CHECKPOINT_TO_TAR
echo $CHECKPOINT_TAR_TO

# cvfj for bz2 compression; won't change much
tar cvf $CHECKPOINT_TAR_TO $CHECKPOINT_TO_TAR

popd

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --job-name=move_first_checkpoints_to_tar # job name
#SBATCH --ntasks=1 # number of MP tasks
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4 # number of cores per tasks
#SBATCH --hint=nomultithread # we get physical cores not logical
#SBATCH --time=20:00:00 # maximum execution time (HH:MM:SS)
#SBATCH --output=logs/%x-%j.out # output file name
#SBATCH --account=six@cpu
#SBATCH --partition=cpu_p1

# DEBUG
# SLURM_ARRAY_TASK_ID=0 # 0-149


# you have to also pass --array=0-<desired_number>%1 as an sbatch flag to compress everything, eg sbatch --array=0-149%1 move_first_150_checkpoints_to_store.slurm tr11b-1B3-ml 150

pushd $six_ALL_CCFRWORK/checkpoints
# readarray CHECKPOINTS < <(find . -regex '\./tr11[a-z].*/global_step[0-9]*')
# DEBUG regex to test out only on tr11e-350
# readarray CHECKPOINTS < <(find . -regex '\./tr11e-350M-ml/.*/global_step[0-9]*')
# batch size 512 -> first 150 checkpoints for 39B tokens, batch size 256 -> 300
readarray CHECKPOINTS < <(ls -v ./"${1}"/checkpoints/main/ | head -"${2}")

echo "Total number of checkpoints to tar: ${#CHECKPOINTS[@]}"

CHECKPOINT_TO_TAR="./${1}/checkpoints/main/${CHECKPOINTS[$SLURM_ARRAY_TASK_ID]}"
echo "Checkpoint to tar: $CHECKPOINT_TO_TAR"

TEMPNAME=$(dirname $CHECKPOINT_TO_TAR)
DIRNAME=${TEMPNAME:2}
BASENAME=$(basename $CHECKPOINT_TO_TAR)

CHECKPOINT_TO_TAR=$DIRNAME/$BASENAME
CHECKPOINT_TAR_TO_FOLDER=$six_ALL_CCFRSTORE/checkpoints/$DIRNAME
CHECKPOINT_TAR_TO=$CHECKPOINT_TAR_TO_FOLDER/$BASENAME.tar

mkdir -p $CHECKPOINT_TAR_TO_FOLDER
echo $CHECKPOINT_TO_TAR
echo $CHECKPOINT_TAR_TO_FOLDER

# cvfj for bz2 compression; won't change much
tar cvf $CHECKPOINT_TAR_TO $CHECKPOINT_TO_TAR

popd
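Unlike the per-model scripts above, this variant takes the training directory and the number of checkpoints to archive as positional arguments ($1 and $2), and the array size has to be supplied at submission time instead of in the header. A usage sketch based on the comment in the script (the .slurm file name is the one mentioned there and may differ from the actual path in the repo):

# tar the first 150 checkpoints of tr11b-1B3-ml (batch size 512);
# per the comment above, a batch-size-256 run would use 300 and --array=0-299%1
sbatch --array=0-149%1 move_first_150_checkpoints_to_store.slurm tr11b-1B3-ml 150
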

jz/slurm/README.md

Lines changed: 1 addition & 1 deletion

@@ -295,7 +295,7 @@ more informative all-in-one myjobs that includes the projected start time for pe

 ```
 alias myjobs='squeue -u `whoami` -o "%.16i %.9P %.26j %.8T %.10M %.8l %.6D %.20S %R"'
-alias groupjobs='squeue -u $(getent group six | cut -d: -f4) -o "%.16i %.9P %.26j %.8T %.10M %.8l %.6D %.20S %R"'
+alias groupjobs='squeue -u $(getent group six | cut -d: -f4) -o "%.16i %u %.9P %.26j %.8T %.10M %.8l %.6D %.20S %R"'
 ```
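For context on the changed alias: `getent group six | cut -d: -f4` expands to the comma-separated member list of the six group (the fourth colon-delimited field of the group entry), which squeue -u accepts as a user list, and the newly added %u format field prints the owner of each job; that is what distinguishes the group-wide listing from the single-user myjobs. A rough sketch of the expansion, with a made-up gid and member list:

# getent group six   ->  six:x:1234:alice,bob     (illustrative output; real gid/members differ)
# cut -d: -f4        ->  alice,bob
# so groupjobs effectively runs:
squeue -u alice,bob -o "%.16i %u %.9P %.26j %.8T %.10M %.8l %.6D %.20S %R"
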

tools/fs-watchdog.py

Lines changed: 25 additions & 3 deletions

@@ -135,6 +135,25 @@ def analyse_partition_idrquota(partition_name, partition_flag, alert_bytes_thres
     alerts.append(response)
     alerts.append("")

+def analyse_shared_disk(partition_name, alert_bytes_threshold):
+    partition_name_2_disk = {
+        "SCRATCH": "gpfsssd",
+        "WORK": "gpfsdswork",
+        "STORE": "gpfsdsstore"
+    }
+    cmd = "df"
+    response = run_cmd(cmd.split())
+    disk_metas = response.split("\n")
+    column_names = disk_metas[0].split()
+    disk_meta = [disk_meta_.split() for disk_meta_ in disk_metas if disk_meta_.startswith(partition_name_2_disk[partition_name])][0]
+    disk_meta = {column_name: value for column_name, value in zip(column_names, disk_meta)}
+
+    # default `df` counts uses 1024-byte units, and `1024 == 2 ** 10`
+    available_disk_left = int(disk_meta["Available"]) * 2 ** 10
+    if available_disk_left < alert_bytes_threshold:
+        alerts.append(f"Shared {partition_name} has {available_disk_left/2**40:.2f}TB left")
+        alerts.append("")
+
 # WORK and STORE partitions stats can be accessed much faster through `idrquota`, and it already
 # includes the quota info
 analyse_partition_idrquota(partition_name="WORK", partition_flag="-w", alert_bytes_threshold=0.85, alert_inodes_threshold=0.85)

@@ -143,10 +162,13 @@ def analyse_partition_idrquota(partition_name, partition_flag, alert_bytes_thres
 # SCRATCH - check only bytes w/ a hard quota of 400TB - alert on lower threshold than other
 # partitions due to it filling up at a faster rate (dumping huge checkpoints)
 analyse_partition_bytes(partition_name="SCRATCH", partition_path="/gpfsssd/scratch/rech/six/", hard_limit_bytes=400*2**40, alert_bytes_threshold=0.75)
+# Actually SCRATCH is shared with everyone and we should monitor the output of `df -h | grep gpfsssd`
+# Check that there's still 40TB left
+analyse_shared_disk("SCRATCH", 100 * 2 ** 40)

-# WORKFS - check both bytes and inodes w/ hard quotas of 3TB / 3M
-analyse_partition_bytes(partition_name="WORKFS", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_bytes=3*2**40, alert_bytes_threshold=0.85)
-analyse_partition_inodes(partition_name="WORKFS", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_inodes=3*10**6, alert_inodes_threshold=0.85)
+# WORKSF - check both bytes and inodes w/ hard quotas of 2TB / 3M
+analyse_partition_bytes(partition_name="WORKSF", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_bytes=2*2**40, alert_bytes_threshold=0.85)
+analyse_partition_inodes(partition_name="WORKSF", partition_path="/gpfsssd/worksf/projects/rech/six/", hard_limit_inodes=3*10**6, alert_inodes_threshold=0.85)

 if len(alerts) > 0 :
     print(f"[ALERT] JZ filesystem is getting close to being full")
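The new `analyse_shared_disk` check parses plain `df` output, picks the row for the GPFS device backing the partition (`gpfsssd` for SCRATCH), and converts the `Available` column from 1024-byte blocks to bytes before comparing it against the threshold. Roughly the same check as a shell one-liner, assuming the device name from the mapping above:

# available space on the shared SCRATCH filesystem, in TiB (df reports 1024-byte blocks; 2^30 KiB = 1 TiB)
df | awk '$1 == "gpfsssd" {printf "%.2f TiB left\n", $4 / 2^30}'
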

tools/slurm-status.py

Lines changed: 10 additions & 6 deletions

@@ -158,12 +158,16 @@ def main():
     in_the_system = False
     for l in status_lines:
         #print(f"l=[{l}]")
-        jobid, partition, name, state, time, nodes, start_time, notes = l.split(None, 7)
-        #print("-".join([jobid, partition, name, state, time, nodes, start_time, notes]))
-        # XXX: add support for regex matching so partial name can be provided
-        if name == args.job_name:
-            in_the_system = True
-            process_job(jobid, partition, name, state, time, nodes, start_time, notes)
+
+        # XXX: apparently some jobs can be run w/o name and break the split() call, so match our
+        # name first and then split
+        if args.job_name in l:
+            jobid, partition, name, state, time, nodes, start_time, notes = l.split(None, 7)
+            #print("-".join([jobid, partition, name, state, time, nodes, start_time, notes]))
+            # XXX: add support for regex matching so partial name can be provided
+            if name == args.job_name:
+                in_the_system = True
+                process_job(jobid, partition, name, state, time, nodes, start_time, notes)

     if not in_the_system:
         preamble = get_preamble()
