File tree Expand file tree Collapse file tree 4 files changed +7
-9
lines changed
recml/inference/benchmarks Expand file tree Collapse file tree 4 files changed +7
-9
lines changed Original file line number Diff line number Diff line change @@ -6,14 +6,14 @@ export XLA_FLAGS=
6
6
7
7
export TPU_NAME=<TPU_NAME>
8
8
export LEARNING_RATE=0.0034
9
- export BATCH_SIZE=135168
9
+ export BATCH_SIZE=4224
10
10
export EMBEDDING_SIZE=128
11
11
export MODEL_DIR=/tmp/
12
12
export FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/train-*
13
13
export NUM_STEPS=28000
14
14
export CHECKPOINT_INTERVAL=1500
15
15
export EVAL_INTERVAL=1500
16
- export EVAL_FILE_PATTER =gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
16
+ export EVAL_FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
17
17
export EVAL_STEPS=660
18
18
export MODE=eval
19
19
export EMBEDDING_THRESHOLD=21000
@@ -23,7 +23,6 @@ export RESTORE_CHECKPOINT=true
23
23
24
24
25
25
python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
26
-
27
26
--learning_rate=${LEARNING_RATE} \
28
27
--batch_size=${BATCH_SIZE} \
29
28
--embedding_size=${EMBEDDING_SIZE} \
Original file line number Diff line number Diff line change @@ -6,22 +6,21 @@ export XLA_FLAGS=
6
6
7
7
export TPU_NAME=<TPU_NAME>
8
8
export LEARNING_RATE=0.0034
9
- export BATCH_SIZE=135168
9
+ export BATCH_SIZE=4224
10
10
export EMBEDDING_SIZE=128
11
11
export MODEL_DIR=/tmp/
12
12
export FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/train-*
13
13
export NUM_STEPS=28000
14
14
export CHECKPOINT_INTERVAL=1500
15
15
export EVAL_INTERVAL=1500
16
- export EVAL_FILE_PATTER =gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
16
+ export EVAL_FILE_PATTERN=gs://qinyiyan-vm/mlperf-dataset/criteo_merge_balanced_4224/eval-*
17
17
export EVAL_STEPS=660
18
18
export MODE=train
19
19
export EMBEDDING_THRESHOLD=21000
20
20
export LOGGING_INTERVAL=1500
21
21
export RESTORE_CHECKPOINT=true
22
22
23
23
python recml/inference/models/jax/DLRM_DCNv2/dlrm_main.py \
24
-
25
24
--learning_rate=${LEARNING_RATE} \
26
25
--batch_size=${BATCH_SIZE} \
27
26
--embedding_size=${EMBEDDING_SIZE} \
Original file line number Diff line number Diff line change @@ -54,10 +54,10 @@ gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${Z
54
54
gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="pip install -U tensorflow dm-tree flax google-metrax"
55
55
```
56
56
57
- #### Run workload
57
+ #### Make script executable & Run workload
58
58
59
59
Note: Please update the MODEL_NAME & TASK_NAME before running the below command
60
60
61
61
```
62
- gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="TPU_NAME=${TPU_NAME} ./inference/benchmarks/<MODEL_NAME>/<TASK_NAME>"
62
+ gcloud alpha compute tpus tpu-vm ssh ${TPU_NAME} --project ${PROJECT} --zone ${ZONE} --worker=all --command="cd RecML && chmod +x ./recml/inference/benchmarks/<MODEL_NAME>/<TASK_NAME> && TPU_NAME=${TPU_NAME} ./recml/inference/benchmarks/<MODEL_NAME>/<TASK_NAME>"
63
63
```
Original file line number Diff line number Diff line change @@ -63,7 +63,7 @@ platformdirs==4.3.7
63
63
pluggy == 1.5.0
64
64
pre-commit == 4.2.0
65
65
promise == 2.3
66
- protobuf == 5.29.4
66
+ protobuf == 4.21.12
67
67
psutil == 7.0.0
68
68
pyarrow == 19.0.1
69
69
pygments == 2.19.1
You can’t perform that action at this time.
0 commit comments