Skip to content

Commit 56688d0

Browse files
authored
Training profiler support (#77)
- Requeue for profiler jobs in training
- Refactor to reduce code duplication and use common pkg
- Includes some test refactoring to include both profiler and debugger
- Clean up code duplication in test_training and test_training_debugger

**TODO** fix docs in CRD fields getting changed

### Testing

```
(ack) ubuntu@ip-172-31-0-119:~/go/src/github.com/aws-controllers-k8s/sagemaker-controller/test/e2e$ pytest tests/test_trainingjob.py
========================================== test session starts ==========================================
platform linux -- Python 3.8.8, pytest-6.2.3, py-1.10.0, pluggy-0.13.1
rootdir: /home/ubuntu/go/src/github.com/aws-controllers-k8s/sagemaker-controller/test/e2e
plugins: xdist-2.2.0, forked-1.3.0
collected 2 items

tests/test_trainingjob.py .. [100%]

===================================== 2 passed in 441.19s (0:07:21) =====================================
```

```
(ack) ubuntu@ip-172-31-0-119:~/go/src/github.com/aws-controllers-k8s/sagemaker-controller/test/e2e$ pytest tests/test_trainingjob_debugger.py
========================================== test session starts ==========================================
platform linux -- Python 3.8.8, pytest-6.2.3, py-1.10.0, pluggy-0.13.1
rootdir: /home/ubuntu/go/src/github.com/aws-controllers-k8s/sagemaker-controller/test/e2e
plugins: xdist-2.2.0, forked-1.3.0
collected 1 item

tests/test_trainingjob_debugger.py . [100%]

===================================== 1 passed in 629.97s (0:10:29) =====================================
```
1 parent 41a85fd commit 56688d0

13 files changed

+261
-237
lines changed
+4-4
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
ack_generate_info:
2-
build_date: "2021-08-12T00:32:37Z"
2+
build_date: "2021-08-12T23:35:21Z"
33
build_hash: c77aa9c75d944952dee198029ba9822691cd82b0
44
go_version: go1.16.4 linux/amd64
55
version: v0.6.0
6-
api_directory_checksum: c8f263a8d1a2e21f53c85b1ca5e2be3687045dd0
6+
api_directory_checksum: f7275158658c496010e6e3d2d24b3aba37e2e1f4
77
api_version: v1alpha1
88
aws_sdk_go_version: v1.38.11
99
generator_config_info:
10-
file_checksum: 795f6cf17e64a254ed063ee36e489b2949a031ef
10+
file_checksum: 4acc645991dfd96fd543c3c34df274a8e4f4787e
1111
original_file_name: generator.yaml
1212
last_modification:
1313
reason: API generation
14-
timestamp: 2021-08-12 00:32:40.304705695 +0000 UTC
14+
timestamp: 2021-08-12 23:35:25.778426245 +0000 UTC

apis/v1alpha1/generator.yaml

+14-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
operations:
2-
DescribeTrainingJob:
3-
set_output_custom_method_name: customDescribeTrainingJobSetOutput
42
StopTrainingJob:
53
operation_type: Delete
64
resource_name: TrainingJob
@@ -169,7 +167,9 @@ resources:
169167
delta_pre_compare:
170168
code: customSetDefaults(a, b)
171169
sdk_create_post_set_output:
172-
code: rm.customSetOutput(desired, aws.String(svcsdk.TrainingJobStatusInProgress), ko)
170+
code: rm.customCreateTrainingJobSetOutput(&resource{ko})
171+
sdk_read_one_post_set_output:
172+
code: rm.customDescribeTrainingJobSetOutput(&resource{ko})
173173
sdk_delete_pre_build_request:
174174
template_path: training_job/sdk_delete_pre_build_request.go.tpl
175175
fields:
@@ -191,7 +191,17 @@ resources:
191191
is_read_only: true
192192
from:
193193
operation: DescribeTrainingJob
194-
path: DebugRuleEvaluationStatuses
194+
path: DebugRuleEvaluationStatuses
195+
ProfilerRuleEvaluationStatuses:
196+
is_read_only: true
197+
from:
198+
operation: DescribeTrainingJob
199+
path: ProfilerRuleEvaluationStatuses
200+
ModelArtifacts:
201+
is_read_only: true
202+
from:
203+
operation: DescribeTrainingJob
204+
path: ModelArtifacts
195205
FailureReason:
196206
is_read_only: true
197207
print:

apis/v1alpha1/training_job.go

+5
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

apis/v1alpha1/zz_generated.deepcopy.go

+16
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/sagemaker.services.k8s.aws_trainingjobs.yaml

+26
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,32 @@ spec:
516516
failureReason:
517517
description: If the training job failed, the reason it failed.
518518
type: string
519+
modelArtifacts:
520+
description: Information about the Amazon S3 location that is configured
521+
for storing model artifacts.
522+
properties:
523+
s3ModelArtifacts:
524+
type: string
525+
type: object
526+
profilerRuleEvaluationStatuses:
527+
description: Evaluation status of Debugger rules for profiling on
528+
a training job.
529+
items:
530+
description: Information about the status of the rule evaluation.
531+
properties:
532+
lastModifiedTime:
533+
format: date-time
534+
type: string
535+
ruleConfigurationName:
536+
type: string
537+
ruleEvaluationJobARN:
538+
type: string
539+
ruleEvaluationStatus:
540+
type: string
541+
statusDetails:
542+
type: string
543+
type: object
544+
type: array
519545
secondaryStatus:
520546
description: "Provides detailed information about the state of the
521547
training job. For detailed information on the secondary status of

generator.yaml

+14-4
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
operations:
2-
DescribeTrainingJob:
3-
set_output_custom_method_name: customDescribeTrainingJobSetOutput
42
StopTrainingJob:
53
operation_type: Delete
64
resource_name: TrainingJob
@@ -169,7 +167,9 @@ resources:
169167
delta_pre_compare:
170168
code: customSetDefaults(a, b)
171169
sdk_create_post_set_output:
172-
code: rm.customSetOutput(desired, aws.String(svcsdk.TrainingJobStatusInProgress), ko)
170+
code: rm.customCreateTrainingJobSetOutput(&resource{ko})
171+
sdk_read_one_post_set_output:
172+
code: rm.customDescribeTrainingJobSetOutput(&resource{ko})
173173
sdk_delete_pre_build_request:
174174
template_path: training_job/sdk_delete_pre_build_request.go.tpl
175175
fields:
@@ -191,7 +191,17 @@ resources:
191191
is_read_only: true
192192
from:
193193
operation: DescribeTrainingJob
194-
path: DebugRuleEvaluationStatuses
194+
path: DebugRuleEvaluationStatuses
195+
ProfilerRuleEvaluationStatuses:
196+
is_read_only: true
197+
from:
198+
operation: DescribeTrainingJob
199+
path: ProfilerRuleEvaluationStatuses
200+
ModelArtifacts:
201+
is_read_only: true
202+
from:
203+
operation: DescribeTrainingJob
204+
path: ModelArtifacts
195205
FailureReason:
196206
is_read_only: true
197207
print:

pkg/resource/training_job/custom_set_output.go

+30-55
Original file line numberDiff line numberDiff line change
@@ -17,75 +17,50 @@
1717
package training_job
1818

1919
import (
20-
"context"
21-
22-
ackv1alpha1 "github.com/aws-controllers-k8s/runtime/apis/core/v1alpha1"
23-
svcapitypes "github.com/aws-controllers-k8s/sagemaker-controller/apis/v1alpha1"
20+
svccommon "github.com/aws-controllers-k8s/sagemaker-controller/pkg/common"
2421
"github.com/aws/aws-sdk-go/aws"
2522
svcsdk "github.com/aws/aws-sdk-go/service/sagemaker"
26-
corev1 "k8s.io/api/core/v1"
2723
)
2824

29-
// customDescribeTrainingJobSetOutput sets the resource ResourceSynced condition to False if
30-
// TrainingJob is being modified by AWS. It has an additional check on the debugger status.
31-
func (rm *resourceManager) customDescribeTrainingJobSetOutput(
32-
ctx context.Context,
33-
r *resource,
34-
resp *svcsdk.DescribeTrainingJobOutput,
35-
ko *svcapitypes.TrainingJob,
36-
) (*svcapitypes.TrainingJob, error) {
37-
trainingJobStatus := resp.TrainingJobStatus
38-
debuggerRuleInProgress := false
39-
if resp.DebugRuleEvaluationStatuses != nil {
40-
for _, rule := range resp.DebugRuleEvaluationStatuses {
41-
if rule.RuleEvaluationStatus != nil && *rule.RuleEvaluationStatus == svcsdk.RuleEvaluationStatusInProgress {
42-
debuggerRuleInProgress = true
43-
rm.customSetOutput(r, aws.String(svcsdk.TrainingJobStatusInProgress), ko)
44-
break
45-
}
46-
}
25+
var (
26+
trainingJobModifyingStatuses = []string{
27+
svcsdk.TrainingJobStatusInProgress,
28+
svcsdk.TrainingJobStatusStopping,
4729
}
48-
49-
if !debuggerRuleInProgress {
50-
rm.customSetOutput(r, trainingJobStatus, ko)
30+
ruleModifyingStatuses = []string{
31+
svcsdk.RuleEvaluationStatusInProgress,
32+
svcsdk.RuleEvaluationStatusStopping,
5133
}
34+
resourceName = resourceGK.Kind
35+
)
5236

53-
return ko, nil
54-
}
55-
56-
// customSetOutput sets ConditionTypeResourceSynced condition to True or False
57-
// based on the trainingJobStatus on AWS so the reconciler can determine if a
58-
// requeue is needed
59-
func (rm *resourceManager) customSetOutput(
60-
r *resource,
61-
trainingJobStatus *string,
62-
ko *svcapitypes.TrainingJob,
63-
) {
64-
if trainingJobStatus == nil {
37+
// customDescribeTrainingJobSetOutput sets the resource ResourceSynced condition to False if
38+
// TrainingJob is being modified by AWS. It checks the debug and profiler rule statuses in addition to TrainingJobStatus.
39+
func (rm *resourceManager) customDescribeTrainingJobSetOutput(r *resource) {
40+
trainingJobStatus := r.ko.Status.TrainingJobStatus
41+
// early exit if training job is InProgress
42+
if trainingJobStatus != nil && *trainingJobStatus == svcsdk.TrainingJobStatusInProgress {
43+
svccommon.SetSyncedCondition(r, trainingJobStatus, &resourceName, &trainingJobModifyingStatuses)
6544
return
6645
}
6746

68-
syncConditionStatus := corev1.ConditionUnknown
69-
if *trainingJobStatus == svcsdk.TrainingJobStatusCompleted || *trainingJobStatus == svcsdk.TrainingJobStatusStopped || *trainingJobStatus == svcsdk.TrainingJobStatusFailed {
70-
syncConditionStatus = corev1.ConditionTrue
71-
} else {
72-
syncConditionStatus = corev1.ConditionFalse
73-
}
74-
75-
var resourceSyncedCondition *ackv1alpha1.Condition = nil
76-
for _, condition := range ko.Status.Conditions {
77-
if condition.Type == ackv1alpha1.ConditionTypeResourceSynced {
78-
resourceSyncedCondition = condition
79-
break
47+
for _, rule := range r.ko.Status.DebugRuleEvaluationStatuses {
48+
if rule.RuleEvaluationStatus != nil && svccommon.IsModifyingStatus(rule.RuleEvaluationStatus, &ruleModifyingStatuses) {
49+
svccommon.SetSyncedCondition(r, rule.RuleEvaluationStatus, aws.String("DebugRule"), &ruleModifyingStatuses)
50+
return
8051
}
8152
}
8253

83-
if resourceSyncedCondition == nil {
84-
resourceSyncedCondition = &ackv1alpha1.Condition{
85-
Type: ackv1alpha1.ConditionTypeResourceSynced,
54+
for _, rule := range r.ko.Status.ProfilerRuleEvaluationStatuses {
55+
if rule.RuleEvaluationStatus != nil && svccommon.IsModifyingStatus(rule.RuleEvaluationStatus, &ruleModifyingStatuses) {
56+
svccommon.SetSyncedCondition(r, rule.RuleEvaluationStatus, aws.String("ProfilerRule"), &ruleModifyingStatuses)
57+
return
8658
}
87-
ko.Status.Conditions = append(ko.Status.Conditions, resourceSyncedCondition)
8859
}
89-
resourceSyncedCondition.Status = syncConditionStatus
9060

61+
svccommon.SetSyncedCondition(r, trainingJobStatus, &resourceName, &trainingJobModifyingStatuses)
62+
}
63+
64+
func (rm *resourceManager) customCreateTrainingJobSetOutput(r *resource) {
65+
svccommon.SetSyncedCondition(r, aws.String(svcsdk.TrainingJobStatusInProgress), &resourceName, &trainingJobModifyingStatuses)
9166
}

pkg/resource/training_job/sdk.go

+36-6
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)