-
Couldn't load subscription status.
- Fork 108
Gracefully exit when terminated mid deploy / destroy #3758
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| bundle: | ||
| name: signal-test | ||
|
|
||
| resources: | ||
| jobs: | ||
| job1: | ||
| name: job1 | ||
|
|
||
| job2: | ||
| name: job2 (deploy after ${resources.jobs.job1.id}) |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
|
|
||
| === Wait until the deployment has started.Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/signal-test/default/files... | ||
| Deploying resources... | ||
| Deployment in progress, sending interrupt signal... | ||
|
|
||
| >>> kill -INT [PID] | ||
|
|
||
| >>> wait [PID] | ||
| Operation interrupted. Gracefully shutting down... | ||
|
|
||
| Exit code: 130 | ||
|
|
||
| >>> cat out.requests.txt | ||
| { | ||
| "method": "POST", | ||
| "path": "/api/2.0/workspace/delete", | ||
| "body": { | ||
| "path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/deploy.lock" | ||
| } | ||
| } | ||
|
|
||
| === A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment. | ||
| >>> cat out.requests.txt | ||
| { | ||
| "method": "POST", | ||
| "path": "/api/2.2/jobs/create", | ||
| "body": { | ||
| "deployment": { | ||
| "kind": "BUNDLE", | ||
| "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/signal-test/default/state/metadata.json" | ||
| }, | ||
| "edit_mode": "UI_LOCKED", | ||
| "format": "MULTI_TASK", | ||
| "max_concurrent_runs": 1, | ||
| "name": "job1", | ||
| "queue": { | ||
| "enabled": true | ||
| } | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| #!/bin/bash | ||
|
|
||
| # Start deployment in the background, merging stderr into stdout; the start of the deployment is detected below via the requests file | ||
| $CLI bundle deploy 2>&1 & | ||
| DEPLOY_PID=$! | ||
|
|
||
| # Wait for deployment to start by monitoring the requests file | ||
| # Once we see the job creation request starting, we know deployment is in progress | ||
| title "Wait until the deployment has started." | ||
| for i in {1..30}; do | ||
| if [ -f out.requests.txt ] && jq -e 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))' out.requests.txt >/dev/null 2>&1; then | ||
| echo "Deployment in progress, sending interrupt signal..." | ||
| break | ||
| fi | ||
| sleep 0.1 | ||
| done | ||
|
|
||
| # Send interrupt signal | ||
| trace kill -INT $DEPLOY_PID | ||
|
|
||
| # Wait for process to complete | ||
| errcode trace wait $DEPLOY_PID | ||
|
|
||
| # A deletion request for deploy.lock should have been recorded in the requests file | ||
| trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("deploy.lock")))' | ||
|
|
||
| title "A creation request for job1 should be recorded in the requests file. No request for job2 should exist since the process was terminated mid deployment." | ||
| trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/create")))' | ||
|
|
||
| rm out.requests.txt | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,17 @@ | ||
| Local = true | ||
| Cloud = false | ||
| RecordRequests = true | ||
|
|
||
| # Delay responses to job creation requests so that the deployment can be interrupted while the first job is being created | ||
| [[Server]] | ||
| Pattern = "POST /api/2.2/jobs/create" | ||
| Response.StatusCode = 200 | ||
| Response.Body = '{"job_id": 1111}' | ||
|
|
||
| # Large time to ensure deployment gets stuck when trying to create the first job. | ||
| Delay = "300s" | ||
|
|
||
| # Replace PID numbers in kill/wait commands | ||
| [[Repls]] | ||
| Old = "(kill -INT |wait )\\d+" | ||
| New = "$1[PID]" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| bundle: | ||
| name: signal-test | ||
|
|
||
| resources: | ||
| jobs: | ||
| job1: | ||
| name: job1 | ||
|
|
||
| job2: | ||
| name: job2 (depends on ${resources.jobs.job1.id}) |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/signal-test/default/files... | ||
| Deploying resources... | ||
| Updating deployment state... | ||
| Deployment complete! | ||
|
|
||
| === Wait until the destroy has started.The following resources will be deleted: | ||
| delete job job1 | ||
| delete job job2 | ||
|
|
||
| All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/signal-test/default | ||
|
|
||
| Destroy in progress, sending interrupt signal... | ||
|
|
||
| >>> kill -INT [PID] | ||
|
|
||
| >>> wait [PID] | ||
| Operation interrupted. Gracefully shutting down... | ||
|
|
||
| Exit code: 130 | ||
|
|
||
| >>> cat out.requests.txt | ||
|
|
||
| === A deletion request for job2 should be recorded in the requests file. No request for job1 should exist since the process was terminated mid destroy. | ||
| >>> cat out.requests.txt | ||
| { | ||
| "method": "POST", | ||
| "path": "/api/2.2/jobs/delete", | ||
| "body": { | ||
| "job_id": [NUMID] | ||
| } | ||
| } | ||
|
|
||
| === [CLI] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,32 @@ | ||
| #!/bin/bash | ||
|
|
||
| # First deploy the bundle so we have something to destroy | ||
| $CLI bundle deploy --auto-approve | ||
|
|
||
| # Start destroy in the background, merging stderr into stdout; the start of the destroy is detected below via the requests file | ||
| $CLI bundle destroy --auto-approve 2>&1 & | ||
| DESTROY_PID=$! | ||
|
|
||
| # Wait for destroy to start by monitoring for job deletion request | ||
| title "Wait until the destroy has started." | ||
| for i in {1..30}; do | ||
| if [ -f out.requests.txt ] && jq -e 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/delete")))' out.requests.txt >/dev/null 2>&1; then | ||
| echo "Destroy in progress, sending interrupt signal..." | ||
| break | ||
| fi | ||
| sleep 0.1 | ||
| done | ||
|
|
||
| # Send interrupt signal | ||
| trace kill -INT $DESTROY_PID | ||
|
|
||
| # Wait for process to complete | ||
| errcode trace wait $DESTROY_PID | ||
|
|
||
| # A deletion request for destroy.lock should have been recorded in the requests file | ||
| trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("workspace/delete")) and (.body.path | contains("destroy.lock")))' | ||
|
|
||
| title "A deletion request for job2 should be recorded in the requests file. No request for job1 should exist since the process was terminated mid destroy." | ||
| trace cat out.requests.txt | jq 'select(.method == "POST" and (.path | contains("/api/2.2/jobs/delete")))' | ||
|
|
||
| rm out.requests.txt |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| Local = true | ||
| Cloud = false | ||
| RecordRequests = true | ||
|
|
||
| # Test only terraform engine (signal handling is the same for direct) | ||
| EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["terraform"] | ||
|
|
||
| # Delay responses to job deletion requests so that the destroy can be interrupted while the first job is being deleted | ||
| [[Server]] | ||
| Pattern = "POST /api/2.2/jobs/delete" | ||
| Response.StatusCode = 200 | ||
| Response.Body = '{}' | ||
|
|
||
| # Large time to ensure destroy gets stuck when deleting the first job. | ||
| Delay = "300s" | ||
|
|
||
| # Replace PID numbers in kill/wait commands | ||
| [[Repls]] | ||
| Old = "(kill -INT |wait )\\d+" | ||
| New = "$1[PID]" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,9 @@ package phases | |
| import ( | ||
| "context" | ||
| "errors" | ||
| "os" | ||
| "os/signal" | ||
| "syscall" | ||
|
|
||
| "github.com/databricks/cli/bundle" | ||
| "github.com/databricks/cli/bundle/artifacts" | ||
|
|
@@ -132,6 +135,43 @@ func uploadLibraries(ctx context.Context, b *bundle.Bundle, libs map[string][]li | |
| ) | ||
| } | ||
|
|
||
| // registerGracefulCleanup sets up signal handlers to release the lock | ||
| // before the process terminates. Returns a cleanup function for the normal exit path. | ||
| // | ||
| // Catches SIGINT (Ctrl+C), SIGTERM, SIGHUP, and SIGQUIT. | ||
| // Note: SIGKILL and SIGSTOP cannot be caught - the kernel terminates the process directly. | ||
| func registerGracefulCleanup(ctx context.Context, b *bundle.Bundle, goal lock.Goal) func() { | ||
| sigChan := make(chan os.Signal, 1) | ||
| signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP, syscall.SIGQUIT) | ||
|
|
||
| // Start goroutine to handle signals | ||
| go func() { | ||
| sig := <-sigChan | ||
| // Stop listening for more signals | ||
| signal.Stop(sigChan) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The main routine is still running at this point. We need to make sure it is terminated and wait for that termination before releasing the lock and terminating. We could return a new context from this function and cancel it when the signal is received. That doesn't ensure we wait for completion/termination and could be handled similarly. |
||
|
|
||
| cmdio.LogString(ctx, "Operation interrupted. Gracefully shutting down...") | ||
|
|
||
| // Release the lock. | ||
| bundle.ApplyContext(ctx, b, lock.Release(goal)) | ||
|
|
||
| // Exit immediately with standard signal exit code (128 + signal number). | ||
| // The deferred cleanup function returned below won't run because we exit here. | ||
| exitCode := 128 | ||
| if s, ok := sig.(syscall.Signal); ok { | ||
| exitCode += int(s) | ||
| } | ||
| os.Exit(exitCode) | ||
| }() | ||
|
|
||
| // Return cleanup function for normal exit path | ||
| return func() { | ||
| signal.Stop(sigChan) | ||
| // Don't close the channel - it causes the goroutine to receive nil | ||
| bundle.ApplyContext(ctx, b, lock.Release(goal)) | ||
| } | ||
| } | ||
|
|
||
| // The deploy phase deploys artifacts and resources. | ||
| func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHandler) { | ||
| log.Info(ctx, "Phase: deploy") | ||
|
|
@@ -148,10 +188,8 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand | |
| return | ||
| } | ||
|
|
||
| // lock is acquired here | ||
| defer func() { | ||
| bundle.ApplyContext(ctx, b, lock.Release(lock.GoalDeploy)) | ||
| }() | ||
| // lock is acquired here - set up signal handlers and defer cleanup | ||
| defer registerGracefulCleanup(ctx, b, lock.GoalDeploy)() | ||
|
|
||
| libs := deployPrepare(ctx, b) | ||
| if logdiag.HasError(ctx) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be nice to record the output of 'bundle plan' after an interrupted deploy/destroy.