Skip to content

Commit 026ef4c

Browse files
committed
Remove hardcoded training image reference
- Already defined as a const
1 parent 723fb6c commit 026ef4c

File tree

4 files changed

+65
-50
lines changed

4 files changed

+65
-50
lines changed

pipeline.py

+5
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616
use_secret_as_volume,
1717
)
1818

19+
from utils.consts import RHELAI_IMAGE
20+
21+
1922
TEACHER_CONFIG_MAP = "teacher-server"
2023
TEACHER_SECRET = "teacher-server"
2124
JUDGE_CONFIG_MAP = "judge-server"
@@ -310,6 +313,7 @@ def pipeline(
310313
save_samples=train_save_samples,
311314
max_batch_len=train_max_batch_len,
312315
seed=train_seed,
316+
image=RHELAI_IMAGE,
313317
)
314318
training_phase_1.after(data_processing_task, model_to_pvc_task)
315319
training_phase_1.set_caching_options(False)
@@ -330,6 +334,7 @@ def pipeline(
330334
save_samples=train_save_samples,
331335
max_batch_len=train_max_batch_len,
332336
seed=train_seed,
337+
image=RHELAI_IMAGE,
333338
)
334339

335340
training_phase_2.set_caching_options(False)

pipeline.yaml

+58-48
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,8 @@ components:
351351
defaultValue: 3840.0
352352
isOptional: true
353353
parameterType: NUMBER_INTEGER
354+
image:
355+
parameterType: STRING
354356
input_pvc_name:
355357
parameterType: STRING
356358
learning_rate:
@@ -401,6 +403,8 @@ components:
401403
defaultValue: 3840.0
402404
isOptional: true
403405
parameterType: NUMBER_INTEGER
406+
image:
407+
parameterType: STRING
404408
input_pvc_name:
405409
parameterType: STRING
406410
learning_rate:
@@ -731,32 +735,32 @@ deploymentSpec:
731735
'
732736
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
733737
\ *\n\ndef pytorchjob_manifest_op(\n model_pvc_name: str,\n input_pvc_name:\
734-
\ str,\n output_pvc_name: str,\n name_suffix: str,\n # path_to_model:\
735-
\ str,\n phase_num: int,\n nproc_per_node: int = 3,\n nnodes: int\
736-
\ = 2,\n num_epochs: int = 2,\n effective_batch_size: int = 3840,\n\
737-
\ learning_rate: float = 1e-4,\n num_warmup_steps: int = 800,\n \
738-
\ save_samples: int = 0,\n max_batch_len: int = 20000,\n seed: int\
739-
\ = 42,\n):\n import inspect\n import os\n import time\n\n import\
740-
\ kubernetes\n import urllib3\n import yaml\n\n def list_phase1_final_model():\n\
741-
\ model_dir = \"/output/phase_1/model/hf_format\"\n models\
742-
\ = os.listdir(model_dir)\n newest_idx = max(\n (os.path.getmtime(f\"\
743-
{model_dir}/{model}\"), i)\n for i, model in enumerate(models)\n\
744-
\ )[-1]\n newest_model = models[newest_idx]\n return\
745-
\ f\"{model_dir}/{newest_model}\"\n\n name = f\"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\
746-
\n\n if phase_num == 1:\n path_to_model = \"/input_model\"\n \
747-
\ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\
738+
\ str,\n output_pvc_name: str,\n name_suffix: str,\n image: str,\n\
739+
\ # path_to_model: str,\n phase_num: int,\n nproc_per_node: int\
740+
\ = 3,\n nnodes: int = 2,\n num_epochs: int = 2,\n effective_batch_size:\
741+
\ int = 3840,\n learning_rate: float = 1e-4,\n num_warmup_steps: int\
742+
\ = 800,\n save_samples: int = 0,\n max_batch_len: int = 20000,\n\
743+
\ seed: int = 42,\n):\n import inspect\n import os\n import\
744+
\ time\n\n import kubernetes\n import urllib3\n import yaml\n\n\
745+
\ def list_phase1_final_model():\n model_dir = \"/output/phase_1/model/hf_format\"\
746+
\n models = os.listdir(model_dir)\n newest_idx = max(\n \
747+
\ (os.path.getmtime(f\"{model_dir}/{model}\"), i)\n \
748+
\ for i, model in enumerate(models)\n )[-1]\n newest_model\
749+
\ = models[newest_idx]\n return f\"{model_dir}/{newest_model}\"\n\
750+
\n name = f\"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\n\
751+
\n if phase_num == 1:\n path_to_model = \"/input_model\"\n \
752+
\ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\
748753
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
749754
\ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\
750-
Unsupported value of {phase_num=}\")\n\n image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\"\
751-
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
752-
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
753-
\ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\
754-
\"\n pytorchReplicaSpecs:\n Master:\n replicas:\
755-
\ 1\n restartPolicy: OnFailure\n template:\n \
756-
\ metadata:\n annotations:\n \
757-
\ sidecar.istio.io/inject: 'false'\n spec:\n \
758-
\ containers:\n - args:\n \
759-
\ - |\n echo \"Running phase {phase_num}\"\
755+
Unsupported value of {phase_num=}\")\n\n manifest = inspect.cleandoc(\n\
756+
\ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n\
757+
\ metadata:\n name: {name}\n spec:\n nprocPerNode:\
758+
\ \\\"{nproc_per_node}\\\"\n pytorchReplicaSpecs:\n \
759+
\ Master:\n replicas: 1\n restartPolicy: OnFailure\n\
760+
\ template:\n metadata:\n annotations:\n\
761+
\ sidecar.istio.io/inject: 'false'\n spec:\n\
762+
\ containers:\n - args:\n \
763+
\ - |\n echo \"Running phase {phase_num}\"\
760764
\n echo \"Using {path_to_model} model for training\"\
761765
\n echo \"Using {path_to_data} data for training\"\
762766
\n mkdir -p /output/phase_{phase_num}/model;\n\
@@ -935,32 +939,32 @@ deploymentSpec:
935939
'
936940
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
937941
\ *\n\ndef pytorchjob_manifest_op(\n model_pvc_name: str,\n input_pvc_name:\
938-
\ str,\n output_pvc_name: str,\n name_suffix: str,\n # path_to_model:\
939-
\ str,\n phase_num: int,\n nproc_per_node: int = 3,\n nnodes: int\
940-
\ = 2,\n num_epochs: int = 2,\n effective_batch_size: int = 3840,\n\
941-
\ learning_rate: float = 1e-4,\n num_warmup_steps: int = 800,\n \
942-
\ save_samples: int = 0,\n max_batch_len: int = 20000,\n seed: int\
943-
\ = 42,\n):\n import inspect\n import os\n import time\n\n import\
944-
\ kubernetes\n import urllib3\n import yaml\n\n def list_phase1_final_model():\n\
945-
\ model_dir = \"/output/phase_1/model/hf_format\"\n models\
946-
\ = os.listdir(model_dir)\n newest_idx = max(\n (os.path.getmtime(f\"\
947-
{model_dir}/{model}\"), i)\n for i, model in enumerate(models)\n\
948-
\ )[-1]\n newest_model = models[newest_idx]\n return\
949-
\ f\"{model_dir}/{newest_model}\"\n\n name = f\"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\
950-
\n\n if phase_num == 1:\n path_to_model = \"/input_model\"\n \
951-
\ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\
942+
\ str,\n output_pvc_name: str,\n name_suffix: str,\n image: str,\n\
943+
\ # path_to_model: str,\n phase_num: int,\n nproc_per_node: int\
944+
\ = 3,\n nnodes: int = 2,\n num_epochs: int = 2,\n effective_batch_size:\
945+
\ int = 3840,\n learning_rate: float = 1e-4,\n num_warmup_steps: int\
946+
\ = 800,\n save_samples: int = 0,\n max_batch_len: int = 20000,\n\
947+
\ seed: int = 42,\n):\n import inspect\n import os\n import\
948+
\ time\n\n import kubernetes\n import urllib3\n import yaml\n\n\
949+
\ def list_phase1_final_model():\n model_dir = \"/output/phase_1/model/hf_format\"\
950+
\n models = os.listdir(model_dir)\n newest_idx = max(\n \
951+
\ (os.path.getmtime(f\"{model_dir}/{model}\"), i)\n \
952+
\ for i, model in enumerate(models)\n )[-1]\n newest_model\
953+
\ = models[newest_idx]\n return f\"{model_dir}/{newest_model}\"\n\
954+
\n name = f\"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\n\
955+
\n if phase_num == 1:\n path_to_model = \"/input_model\"\n \
956+
\ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\
952957
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
953958
\ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\
954-
Unsupported value of {phase_num=}\")\n\n image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\"\
955-
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
956-
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
957-
\ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\
958-
\"\n pytorchReplicaSpecs:\n Master:\n replicas:\
959-
\ 1\n restartPolicy: OnFailure\n template:\n \
960-
\ metadata:\n annotations:\n \
961-
\ sidecar.istio.io/inject: 'false'\n spec:\n \
962-
\ containers:\n - args:\n \
963-
\ - |\n echo \"Running phase {phase_num}\"\
959+
Unsupported value of {phase_num=}\")\n\n manifest = inspect.cleandoc(\n\
960+
\ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n\
961+
\ metadata:\n name: {name}\n spec:\n nprocPerNode:\
962+
\ \\\"{nproc_per_node}\\\"\n pytorchReplicaSpecs:\n \
963+
\ Master:\n replicas: 1\n restartPolicy: OnFailure\n\
964+
\ template:\n metadata:\n annotations:\n\
965+
\ sidecar.istio.io/inject: 'false'\n spec:\n\
966+
\ containers:\n - args:\n \
967+
\ - |\n echo \"Running phase {phase_num}\"\
964968
\n echo \"Using {path_to_model} model for training\"\
965969
\n echo \"Using {path_to_data} data for training\"\
966970
\n mkdir -p /output/phase_{phase_num}/model;\n\
@@ -1868,6 +1872,9 @@ root:
18681872
parameters:
18691873
effective_batch_size:
18701874
componentInputParameter: train_effective_batch_size_phase_1
1875+
image:
1876+
runtimeValue:
1877+
constant: registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1
18711878
input_pvc_name:
18721879
taskOutputParameter:
18731880
outputParameterKey: name
@@ -1918,6 +1925,9 @@ root:
19181925
parameters:
19191926
effective_batch_size:
19201927
componentInputParameter: train_effective_batch_size_phase_2
1928+
image:
1929+
runtimeValue:
1930+
constant: registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1
19211931
input_pvc_name:
19221932
taskOutputParameter:
19231933
outputParameterKey: name

training/components.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ def pytorchjob_manifest_op(
126126
input_pvc_name: str,
127127
output_pvc_name: str,
128128
name_suffix: str,
129+
image: str,
129130
# path_to_model: str,
130131
phase_num: int,
131132
nproc_per_node: int = 3,
@@ -167,8 +168,6 @@ def list_phase1_final_model():
167168
else:
168169
raise RuntimeError(f"Unsupported value of {phase_num=}")
169170

170-
image = "registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1"
171-
172171
manifest = inspect.cleandoc(
173172
f"""
174173
apiVersion: kubeflow.org/v1

training/faked/components.py

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def pytorchjob_manifest_op(
1414
input_pvc_name: str,
1515
output_pvc_name: str,
1616
name_suffix: str,
17+
image: str,
1718
) -> NamedTuple("outputs", manifest=str, name=str):
1819
Outputs = NamedTuple("outputs", manifest=str, name=str)
1920
return Outputs("", "")

0 commit comments

Comments
 (0)