Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[reefd] add windows instance reaper #261

Merged
merged 3 commits into from
Mar 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,27 @@ module github.com/ray-project/rayci
go 1.24.1

require (
github.com/aws/aws-sdk-go-v2 v1.36.3
github.com/aws/aws-sdk-go-v2/config v1.29.9
github.com/aws/aws-sdk-go-v2/service/ec2 v1.210.1
github.com/google/go-containerregistry v0.20.3
gopkg.in/yaml.v3 v3.0.1
)

require (
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect
github.com/Microsoft/go-winio v0.6.2 // indirect
github.com/aws/aws-sdk-go-v2/credentials v1.17.62 // indirect
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 // indirect
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 // indirect
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 // indirect
github.com/aws/aws-sdk-go-v2/service/sso v1.25.1 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.29.1 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.33.17 // indirect
github.com/aws/smithy-go v1.22.2 // indirect
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
github.com/containerd/stargz-snapshotter/estargz v0.16.3 // indirect
github.com/distribution/reference v0.6.0 // indirect
Expand Down
28 changes: 28 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,34 @@ github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
github.com/aws/aws-sdk-go-v2 v1.36.3 h1:mJoei2CxPutQVxaATCzDUjcZEjVRdpsiiXi2o38yqWM=
github.com/aws/aws-sdk-go-v2 v1.36.3/go.mod h1:LLXuLpgzEbD766Z5ECcRmi8AzSwfZItDtmABVkRLGzg=
github.com/aws/aws-sdk-go-v2/config v1.29.9 h1:Kg+fAYNaJeGXp1vmjtidss8O2uXIsXwaRqsQJKXVr+0=
github.com/aws/aws-sdk-go-v2/config v1.29.9/go.mod h1:oU3jj2O53kgOU4TXq/yipt6ryiooYjlkqqVaZk7gY/U=
github.com/aws/aws-sdk-go-v2/credentials v1.17.62 h1:fvtQY3zFzYJ9CfixuAQ96IxDrBajbBWGqjNTCa79ocU=
github.com/aws/aws-sdk-go-v2/credentials v1.17.62/go.mod h1:ElETBxIQqcxej++Cs8GyPBbgMys5DgQPTwo7cUPDKt8=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 h1:x793wxmUWVDhshP8WW2mlnXuFrO4cOd3HLBroh1paFw=
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30/go.mod h1:Jpne2tDnYiFascUEs2AWHJL9Yp7A5ZVy3TNyxaAjD6M=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34 h1:ZK5jHhnrioRkUNOc+hOgQKlUL5JeC3S6JgLxtQ+Rm0Q=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34/go.mod h1:p4VfIceZokChbA9FzMbRGz5OV+lekcVtHlPKEO0gSZY=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 h1:SZwFm17ZUNNg5Np0ioo/gq8Mn6u9w19Mri8DnJ15Jf0=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34/go.mod h1:dFZsC0BLo346mvKQLWmoJxT+Sjp+qcVR1tRVHQGOH9Q=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo=
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo=
github.com/aws/aws-sdk-go-v2/service/ec2 v1.210.1 h1:+4A9SDduLZFlDeXWRmfQ6r8kyEJZQfK6lcg+KwdvWrI=
github.com/aws/aws-sdk-go-v2/service/ec2 v1.210.1/go.mod h1:ouvGEfHbLaIlWwpDpOVWPWR+YwO0HDv3vm5tYLq8ImY=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 h1:eAh2A4b5IzM/lum78bZ590jy36+d/aFLgKF/4Vd1xPE=
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3/go.mod h1:0yKJC/kb8sAnmlYa6Zs3QVYqaC8ug2AbnNChv5Ox3uA=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 h1:dM9/92u2F1JbDaGooxTq18wmmFzbJRfXfVfy96/1CXM=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15/go.mod h1:SwFBy2vjtA0vZbjjaFtfN045boopadnoVPhu4Fv66vY=
github.com/aws/aws-sdk-go-v2/service/sso v1.25.1 h1:8JdC7Gr9NROg1Rusk25IcZeTO59zLxsKgE0gkh5O6h0=
github.com/aws/aws-sdk-go-v2/service/sso v1.25.1/go.mod h1:qs4a9T5EMLl/Cajiw2TcbNt2UNo/Hqlyp+GiuG4CFDI=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.29.1 h1:KwuLovgQPcdjNMfFt9OhUd9a2OwcOKhxfvF4glTzLuA=
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.29.1/go.mod h1:MlYRNmYu/fGPoxBQVvBYr9nyr948aY/WLUvwBMBJubs=
github.com/aws/aws-sdk-go-v2/service/sts v1.33.17 h1:PZV5W8yk4OtH1JAuhV2PXwwO9v5G5Aoj+eMCn4T+1Kc=
github.com/aws/aws-sdk-go-v2/service/sts v1.33.17/go.mod h1:cQnB8CUnxbMU82JvlqjKR2HBOm3fe9pWorWBza6MBJ4=
github.com/aws/smithy-go v1.22.2 h1:6D9hW43xKFrRx/tXXfAlIZc4JI+yQe6snnWcQyxSyLQ=
github.com/aws/smithy-go v1.22.2/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg=
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I=
Expand Down
45 changes: 45 additions & 0 deletions reefd/aws_clients.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package reefd

import (
"context"

"github.com/aws/aws-sdk-go-v2/aws"
awsconfig "github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/ec2"
)

type ec2Client interface {
DescribeInstances(
ctx context.Context,
params *ec2.DescribeInstancesInput,
optFns ...func(*ec2.Options),
) (*ec2.DescribeInstancesOutput, error)

TerminateInstances(
ctx context.Context,
params *ec2.TerminateInstancesInput,
optFns ...func(*ec2.Options),
) (*ec2.TerminateInstancesOutput, error)
}

type awsClients struct {
ec2 func() ec2Client
}

func newAWSClientsFromConfig(cfg *aws.Config) *awsClients {
return &awsClients{
ec2: func() ec2Client { return ec2.NewFromConfig(*cfg) },
}
}

const awsRegion = "us-west-2"

func newAWSClients(ctx context.Context) (*awsClients, error) {
cfg, err := awsconfig.LoadDefaultConfig(
ctx, awsconfig.WithRegion(awsRegion),
)
if err != nil {
return nil, err
}
return newAWSClientsFromConfig(&cfg), nil
}
98 changes: 98 additions & 0 deletions reefd/reaper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package reefd

import (
"context"
"fmt"
"log"
"sort"
"time"

"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/service/ec2"
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
)

type reaper struct {
ec2 ec2Client
nowFunc func() time.Time
}

func newReaper(ec2 ec2Client) *reaper {
return &reaper{ec2: ec2}
}

func (r *reaper) now() time.Time {
if r.nowFunc != nil {
return r.nowFunc()
}
return time.Now()
}

func (r *reaper) setNowFunc(f func() time.Time) {
r.nowFunc = f
}

func (r *reaper) listDeadWindowsInstances(ctx context.Context) ([]string, error) {
filters := []types.Filter{{
Name: aws.String("tag:BuildkiteQueue"),
Values: []string{"*windows*"},
}, {
Name: aws.String("instance-state-code"),
Values: []string{
"0", // pending
"16", // running
},
}}
const maxResults = 500
input := &ec2.DescribeInstancesInput{
Filters: filters,
MaxResults: aws.Int32(maxResults),
}
result, err := r.ec2.DescribeInstances(ctx, input)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might not happen but is there a limit on what DescribeInstances returns before it starts the next page. Should we do pagination here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, that is a good point. let me have a look.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added a MaxResults of 500. we will rerun the reaper repeatedly until it terminates nothing.

if err != nil {
return nil, fmt.Errorf("describe instances: %w", err)
}

const instanceAgeLimit = -4 * time.Hour

cut := r.now().Add(instanceAgeLimit)

var instances []string
for _, r := range result.Reservations {
for _, i := range r.Instances {
if i.LaunchTime.Before(cut) {
instances = append(instances, *i.InstanceId)
}
}
}

sort.Strings(instances)

return instances, nil
}

func (r *reaper) terminateInstances(ctx context.Context, ids []string) error {
if len(ids) == 0 {
return nil
}
input := &ec2.TerminateInstancesInput{InstanceIds: ids}
_, err := r.ec2.TerminateInstances(ctx, input)
return err
}

func (r *reaper) listAndReapDeadWindowsInstances(ctx context.Context) (int, error) {
ids, err := r.listDeadWindowsInstances(ctx)
if err != nil {
return 0, err
}
if len(ids) == 0 {
return 0, nil
}

log.Printf("terminating %d instances: %v", len(ids), ids)
if err := r.terminateInstances(ctx, ids); err != nil {
return 0, err
}

return len(ids), nil
}
Loading