@@ -351,6 +351,8 @@ components:
351
351
defaultValue : 3840.0
352
352
isOptional : true
353
353
parameterType : NUMBER_INTEGER
354
+ image :
355
+ parameterType : STRING
354
356
input_pvc_name :
355
357
parameterType : STRING
356
358
learning_rate :
@@ -401,6 +403,8 @@ components:
401
403
defaultValue : 3840.0
402
404
isOptional : true
403
405
parameterType : NUMBER_INTEGER
406
+ image :
407
+ parameterType : STRING
404
408
input_pvc_name :
405
409
parameterType : STRING
406
410
learning_rate :
@@ -731,32 +735,32 @@ deploymentSpec:
731
735
'
732
736
- " \n import kfp\n from kfp import dsl\n from kfp.dsl import *\n from typing import\
733
737
\ *\n\n def pytorchjob_manifest_op(\n model_pvc_name: str,\n input_pvc_name:\
734
- \ str,\n output_pvc_name: str,\n name_suffix: str,\n # path_to_model:\
735
- \ str,\n phase_num: int,\n nproc_per_node: int = 3,\n nnodes: int\
736
- \ = 2,\n num_epochs: int = 2,\n effective_batch_size: int = 3840,\n \
737
- \ learning_rate: float = 1e-4,\n num_warmup_steps: int = 800,\n \
738
- \ save_samples: int = 0,\n max_batch_len: int = 20000,\n seed: int\
739
- \ = 42,\n ):\n import inspect\n import os\n import time\n\n import\
740
- \ kubernetes\n import urllib3\n import yaml\n\n def list_phase1_final_model():\n \
741
- \ model_dir = \" /output/phase_1/model/hf_format\"\n models\
742
- \ = os.listdir(model_dir)\n newest_idx = max(\n (os.path.getmtime(f\" \
743
- {model_dir}/{model}\" ), i)\n for i, model in enumerate(models)\n \
744
- \ )[-1]\n newest_model = models[newest_idx]\n return\
745
- \ f\" {model_dir}/{newest_model}\"\n\n name = f\" train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\" \
746
- \n\n if phase_num == 1:\n path_to_model = \" /input_model\"\n \
747
- \ path_to_data = \" /input_data/knowledge/data.jsonl\"\n elif phase_num\
738
+ \ str,\n output_pvc_name: str,\n name_suffix: str,\n image: str,\n \
739
+ \ # path_to_model: str,\n phase_num: int,\n nproc_per_node: int\
740
+ \ = 3,\n nnodes: int = 2,\n num_epochs: int = 2,\n effective_batch_size:\
741
+ \ int = 3840,\n learning_rate: float = 1e-4,\n num_warmup_steps: int\
742
+ \ = 800,\n save_samples: int = 0,\n max_batch_len: int = 20000,\n \
743
+ \ seed: int = 42,\n ):\n import inspect\n import os\n import\
744
+ \ time\n\n import kubernetes\n import urllib3\n import yaml\n\n \
745
+ \ def list_phase1_final_model():\n model_dir = \" /output/phase_1/model/hf_format\" \
746
+ \n models = os.listdir(model_dir)\n newest_idx = max(\n \
747
+ \ (os.path.getmtime(f\" {model_dir}/{model}\" ), i)\n \
748
+ \ for i, model in enumerate(models)\n )[-1]\n newest_model\
749
+ \ = models[newest_idx]\n return f\" {model_dir}/{newest_model}\"\n \
750
+ \n name = f\" train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\n \
751
+ \n if phase_num == 1:\n path_to_model = \" /input_model\"\n \
752
+ \ path_to_data = \" /input_data/knowledge/data.jsonl\"\n elif phase_num\
748
753
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
749
754
\ = \" /input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\" \
750
- Unsupported value of {phase_num=}\" )\n\n image = \" registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\" \
751
- \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
752
- \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
753
- \ name: {name}\n spec:\n nprocPerNode: \\\" {nproc_per_node}\\ \
754
- \"\n pytorchReplicaSpecs:\n Master:\n replicas:\
755
- \ 1\n restartPolicy: OnFailure\n template:\n \
756
- \ metadata:\n annotations:\n \
757
- \ sidecar.istio.io/inject: 'false'\n spec:\n \
758
- \ containers:\n - args:\n \
759
- \ - |\n echo \" Running phase {phase_num}\" \
755
+ Unsupported value of {phase_num=}\" )\n\n manifest = inspect.cleandoc(\n \
756
+ \ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n \
757
+ \ metadata:\n name: {name}\n spec:\n nprocPerNode:\
758
+ \ \\\" {nproc_per_node}\\\"\n pytorchReplicaSpecs:\n \
759
+ \ Master:\n replicas: 1\n restartPolicy: OnFailure\n \
760
+ \ template:\n metadata:\n annotations:\n \
761
+ \ sidecar.istio.io/inject: 'false'\n spec:\n \
762
+ \ containers:\n - args:\n \
763
+ \ - |\n echo \" Running phase {phase_num}\" \
760
764
\n echo \" Using {path_to_model} model for training\" \
761
765
\n echo \" Using {path_to_data} data for training\" \
762
766
\n mkdir -p /output/phase_{phase_num}/model;\n \
@@ -935,32 +939,32 @@ deploymentSpec:
935
939
'
936
940
- " \n import kfp\n from kfp import dsl\n from kfp.dsl import *\n from typing import\
937
941
\ *\n\n def pytorchjob_manifest_op(\n model_pvc_name: str,\n input_pvc_name:\
938
- \ str,\n output_pvc_name: str,\n name_suffix: str,\n # path_to_model:\
939
- \ str,\n phase_num: int,\n nproc_per_node: int = 3,\n nnodes: int\
940
- \ = 2,\n num_epochs: int = 2,\n effective_batch_size: int = 3840,\n \
941
- \ learning_rate: float = 1e-4,\n num_warmup_steps: int = 800,\n \
942
- \ save_samples: int = 0,\n max_batch_len: int = 20000,\n seed: int\
943
- \ = 42,\n ):\n import inspect\n import os\n import time\n\n import\
944
- \ kubernetes\n import urllib3\n import yaml\n\n def list_phase1_final_model():\n \
945
- \ model_dir = \" /output/phase_1/model/hf_format\"\n models\
946
- \ = os.listdir(model_dir)\n newest_idx = max(\n (os.path.getmtime(f\" \
947
- {model_dir}/{model}\" ), i)\n for i, model in enumerate(models)\n \
948
- \ )[-1]\n newest_model = models[newest_idx]\n return\
949
- \ f\" {model_dir}/{newest_model}\"\n\n name = f\" train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\" \
950
- \n\n if phase_num == 1:\n path_to_model = \" /input_model\"\n \
951
- \ path_to_data = \" /input_data/knowledge/data.jsonl\"\n elif phase_num\
942
+ \ str,\n output_pvc_name: str,\n name_suffix: str,\n image: str,\n \
943
+ \ # path_to_model: str,\n phase_num: int,\n nproc_per_node: int\
944
+ \ = 3,\n nnodes: int = 2,\n num_epochs: int = 2,\n effective_batch_size:\
945
+ \ int = 3840,\n learning_rate: float = 1e-4,\n num_warmup_steps: int\
946
+ \ = 800,\n save_samples: int = 0,\n max_batch_len: int = 20000,\n \
947
+ \ seed: int = 42,\n ):\n import inspect\n import os\n import\
948
+ \ time\n\n import kubernetes\n import urllib3\n import yaml\n\n \
949
+ \ def list_phase1_final_model():\n model_dir = \" /output/phase_1/model/hf_format\" \
950
+ \n models = os.listdir(model_dir)\n newest_idx = max(\n \
951
+ \ (os.path.getmtime(f\" {model_dir}/{model}\" ), i)\n \
952
+ \ for i, model in enumerate(models)\n )[-1]\n newest_model\
953
+ \ = models[newest_idx]\n return f\" {model_dir}/{newest_model}\"\n \
954
+ \n name = f\" train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\n \
955
+ \n if phase_num == 1:\n path_to_model = \" /input_model\"\n \
956
+ \ path_to_data = \" /input_data/knowledge/data.jsonl\"\n elif phase_num\
952
957
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
953
958
\ = \" /input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\" \
954
- Unsupported value of {phase_num=}\" )\n\n image = \" registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\" \
955
- \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
956
- \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
957
- \ name: {name}\n spec:\n nprocPerNode: \\\" {nproc_per_node}\\ \
958
- \"\n pytorchReplicaSpecs:\n Master:\n replicas:\
959
- \ 1\n restartPolicy: OnFailure\n template:\n \
960
- \ metadata:\n annotations:\n \
961
- \ sidecar.istio.io/inject: 'false'\n spec:\n \
962
- \ containers:\n - args:\n \
963
- \ - |\n echo \" Running phase {phase_num}\" \
959
+ Unsupported value of {phase_num=}\" )\n\n manifest = inspect.cleandoc(\n \
960
+ \ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n \
961
+ \ metadata:\n name: {name}\n spec:\n nprocPerNode:\
962
+ \ \\\" {nproc_per_node}\\\"\n pytorchReplicaSpecs:\n \
963
+ \ Master:\n replicas: 1\n restartPolicy: OnFailure\n \
964
+ \ template:\n metadata:\n annotations:\n \
965
+ \ sidecar.istio.io/inject: 'false'\n spec:\n \
966
+ \ containers:\n - args:\n \
967
+ \ - |\n echo \" Running phase {phase_num}\" \
964
968
\n echo \" Using {path_to_model} model for training\" \
965
969
\n echo \" Using {path_to_data} data for training\" \
966
970
\n mkdir -p /output/phase_{phase_num}/model;\n \
@@ -1868,6 +1872,9 @@ root:
1868
1872
parameters :
1869
1873
effective_batch_size :
1870
1874
componentInputParameter : train_effective_batch_size_phase_1
1875
+ image :
1876
+ runtimeValue :
1877
+ constant : registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1
1871
1878
input_pvc_name :
1872
1879
taskOutputParameter :
1873
1880
outputParameterKey : name
@@ -1918,6 +1925,9 @@ root:
1918
1925
parameters :
1919
1926
effective_batch_size :
1920
1927
componentInputParameter : train_effective_batch_size_phase_2
1928
+ image :
1929
+ runtimeValue :
1930
+ constant : registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1
1921
1931
input_pvc_name :
1922
1932
taskOutputParameter :
1923
1933
outputParameterKey : name
0 commit comments