Skip to content

Commit ca090ec

Browse files
authored
feat(job): support AITrainingJob (#1160)
* feat(job): support AITrainingJob * add ut * add register * update installer
1 parent d42c936 commit ca090ec

File tree

14 files changed

+901
-5
lines changed

14 files changed

+901
-5
lines changed

config/server/default/job/job_template.yaml

+75
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,81 @@ spec:
121121
imagePullPolicy: Always
122122
# PaddleJob-kubeflow.org/v1-collective
123123
---
124+
apiVersion: kongming.cce.baiudbce.com/v1
125+
kind: AITrainingJob
126+
metadata:
127+
name: job-horovod-test
128+
namespace: default
129+
spec:
130+
# 任务结束时,pod的清理策略,All表示所有pod,none表示不清理
131+
cleanPodPolicy: All
132+
# 完成策略,All表示所有pod完成即任务完成,Any表示任何pod完成即任务完成
133+
completePolicy: Any
134+
# Failure policy: All means the job fails only when all pods have failed; Any means the job fails as soon as any pod fails
135+
failPolicy: Any
136+
# 支持horovod与paddle框架
137+
frameworkType: paddle
138+
# 弹性选项,true表示开启弹性,false不开启,开启时需开启trainer容器的容错选项
139+
faultTolerant: true
140+
plugin:
141+
ssh:
142+
- ""
143+
discovery:
144+
- ""
145+
priority: normal
146+
replicaSpecs:
147+
trainer:
148+
completePolicy: None
149+
failPolicy: None
150+
# 容错配置,控制器将会以下面的配置作为容错判断条件进行容错
151+
faultTolerantPolicy:
152+
# 程序退出码
153+
- exitCodes: 129,10001,127,137,143
154+
restartPolicy: ExitCode
155+
restartScope: Pod
156+
# 集群异常事件
157+
- exceptionalEvent: "nodeNotReady,PodForceDeleted"
158+
restartPolicy: OnNodeFail
159+
restartScope: Pod
160+
# 开启弹性的最大副本数
161+
maxReplicas: 5
162+
# 开启弹性的最小副本数
163+
minReplicas: 1
164+
replicaType: worker
165+
replicas: 3
166+
restartLimit: 100
167+
restartPolicy: OnNodeFailWithExitCode
168+
restartTimeLimit: 60
169+
restartTimeout: 864000
170+
template:
171+
metadata:
172+
creationTimestamp: null
173+
spec:
174+
containers:
175+
- command:
176+
- /bin/bash
177+
- -c
178+
- /usr/sbin/sshd && sleep 40000
179+
image: registry.baidubce.com/cce-plugin-dev/horovod:v0.1.0
180+
imagePullPolicy: Always
181+
name: aitj-0
182+
securityContext:
183+
capabilities:
184+
add:
185+
- SYS_ADMIN
186+
volumeMounts:
187+
- mountPath: /dev/shm
188+
name: cache-volume
189+
dnsPolicy: ClusterFirstWithHostNet
190+
terminationGracePeriodSeconds: 300
191+
volumes:
192+
- emptyDir:
193+
medium: Memory
194+
sizeLimit: 100Gi
195+
name: cache-volume
196+
schedulerName: volcano
197+
# AITrainingJob-kongming.cce.baiudbce.com/v1-collective
198+
---
124199
apiVersion: "kubeflow.org/v1"
125200
kind: "PyTorchJob"
126201
metadata:

installer/deploys/paddleflow-server/paddleflow-server-deploy.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,7 @@ rules:
495495
- argoproj.io
496496
- kubeflow.org
497497
- ray.io
498+
- kongming.cce.baiudbce.com
498499
resources:
499500
- '*'
500501
verbs:

installer/paddleflow-deployment-before-v1-18.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,7 @@ rules:
776776
- argoproj.io
777777
- kubeflow.org
778778
- ray.io
779+
- kongming.cce.baiudbce.com
779780
resources:
780781
- '*'
781782
verbs:

installer/paddleflow-deployment-v1-13.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,7 @@ rules:
776776
- argoproj.io
777777
- kubeflow.org
778778
- ray.io
779+
- kongming.cce.baiudbce.com
779780
resources:
780781
- '*'
781782
verbs:

installer/paddleflow-deployment.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,7 @@ rules:
776776
- argoproj.io
777777
- kubeflow.org
778778
- ray.io
779+
- kongming.cce.baiudbce.com
779780
resources:
780781
- '*'
781782
verbs:

0 commit comments

Comments
 (0)