Fix new debounce run race

ericallam · ericallam · commit d78877b5815b · 2025-12-18T14:58:30.000Z
diff --git a/internal-packages/run-engine/src/engine/systems/debounceSystem.ts b/internal-packages/run-engine/src/engine/systems/debounceSystem.ts
@@ -243,6 +243,74 @@ return 0
     return { claimed: false, existingRunId: existingValue };
   }
 
+  /**
+   * Claims the debounce key through claimDebounceKey, under a fresh claimId, before reporting "new".
+   * Returning "new" without a claimId would let registerDebouncedRun do a plain SET that can
+   * overwrite another server's registration; callers use this once no valid existing run remains.
+   *
+   * @returns "new" plus the claimId on success; otherwise the handleExistingRun/waitForExistingRun
+   * result. NOTE(review): both of those can call back into this method - confirm this terminates.
+   */
+  private async claimKeyForNewRun({
+    environmentId,
+    taskIdentifier,
+    debounce,
+    tx,
+  }: {
+    environmentId: string;
+    taskIdentifier: string;
+    debounce: DebounceOptions;
+    tx?: PrismaClientOrTransaction;
+  }): Promise<DebounceResult> {
+    const redisKey = this.getDebounceRedisKey(environmentId, taskIdentifier, debounce.key);
+    const claimId = nanoid(16);
+
+    const claimResult = await this.claimDebounceKey({
+      environmentId,
+      taskIdentifier,
+      debounceKey: debounce.key,
+      claimId,
+      ttlMs: CLAIM_TTL_MS,
+    });
+
+    if (claimResult.claimed) {
+      this.$.logger.debug("claimKeyForNewRun: claimed key, returning new", {
+        debounceKey: debounce.key,
+        taskIdentifier,
+        environmentId,
+        claimId,
+      });
+      return { status: "new", claimId };
+    }
+
+    if (claimResult.existingRunId) {
+      // Claim lost and a run ID was reported for the key - delegate to handleExistingRun for it
+      this.$.logger.debug("claimKeyForNewRun: found existing run, handling it", {
+        debounceKey: debounce.key,
+        existingRunId: claimResult.existingRunId,
+      });
+      return await this.handleExistingRun({
+        existingRunId: claimResult.existingRunId,
+        redisKey,
+        environmentId,
+        taskIdentifier,
+        debounce,
+        tx,
+      });
+    }
+
+    // Claim lost with no run ID reported: treated as another server's pending claim - wait for it
+    this.$.logger.debug("claimKeyForNewRun: key is pending, waiting for existing run", {
+      debounceKey: debounce.key,
+    });
+    return await this.waitForExistingRun({
+      environmentId,
+      taskIdentifier,
+      debounce,
+      tx,
+    });
+  }
+
   /**
    * Waits for another server to complete its claim and register a run ID.
    * Used when we detect a "pending" state, meaning another server has claimed
@@ -267,13 +335,18 @@ return 0
       const value = await this.redis.get(redisKey);
 
       if (!value) {
-        // Key expired or was deleted - return "new" to create fresh
-        this.$.logger.debug("waitForExistingRun: key expired/deleted, returning new", {
+        // Key expired or was deleted - atomically claim before returning "new"
+        this.$.logger.debug("waitForExistingRun: key expired/deleted, claiming key", {
           redisKey,
           debounceKey: debounce.key,
           attempt: i + 1,
         });
-        return { status: "new" };
+        return await this.claimKeyForNewRun({
+          environmentId,
+          taskIdentifier,
+          debounce,
+          tx,
+        });
       }
 
       if (!value.startsWith("pending:")) {
@@ -287,6 +360,8 @@ return 0
         return await this.handleExistingRun({
           existingRunId: value,
           redisKey,
+          environmentId,
+          taskIdentifier,
           debounce,
           tx,
         });
@@ -314,12 +389,17 @@ return 0
     const deleteResult = await this.conditionallyDeletePendingKey(redisKey);
 
     if (deleteResult.deleted) {
-      // Key was pending (or didn't exist) - safe to create new run
-      this.$.logger.debug("waitForExistingRun: stale pending key deleted, returning new", {
+      // Key was pending (or didn't exist) - atomically claim before returning "new"
+      this.$.logger.debug("waitForExistingRun: stale pending key deleted, claiming key", {
         redisKey,
         debounceKey: debounce.key,
       });
-      return { status: "new" };
+      return await this.claimKeyForNewRun({
+        environmentId,
+        taskIdentifier,
+        debounce,
+        tx,
+      });
     }
 
     // Key now has a valid run ID - the original server completed!
@@ -335,6 +415,8 @@ return 0
     return await this.handleExistingRun({
       existingRunId: deleteResult.existingRunId,
       redisKey,
+      environmentId,
+      taskIdentifier,
       debounce,
       tx,
     });
@@ -347,11 +429,15 @@ return 0
   private async handleExistingRun({
     existingRunId,
     redisKey,
+    environmentId,
+    taskIdentifier,
     debounce,
     tx,
   }: {
     existingRunId: string;
     redisKey: string;
+    environmentId: string;
+    taskIdentifier: string;
     debounce: DebounceOptions;
     tx?: PrismaClientOrTransaction;
   }): Promise<DebounceResult> {
@@ -369,9 +455,14 @@ return 0
           debounceKey: debounce.key,
           error,
         });
-        // Clean up stale Redis key
+        // Clean up stale Redis key and atomically claim before returning "new"
         await this.redis.del(redisKey);
-        return { status: "new" };
+        return await this.claimKeyForNewRun({
+          environmentId,
+          taskIdentifier,
+          debounce,
+          tx,
+        });
       }
 
       // Check if run is still in DELAYED status (or legacy RUN_CREATED for older runs)
@@ -381,9 +472,14 @@ return 0
           executionStatus: snapshot.executionStatus,
           debounceKey: debounce.key,
         });
-        // Clean up Redis key since run is no longer debounceable
+        // Clean up Redis key and atomically claim before returning "new"
         await this.redis.del(redisKey);
-        return { status: "new" };
+        return await this.claimKeyForNewRun({
+          environmentId,
+          taskIdentifier,
+          debounce,
+          tx,
+        });
       }
 
       // Get the run to check debounce metadata and createdAt
@@ -399,8 +495,14 @@ return 0
           existingRunId,
           debounceKey: debounce.key,
         });
+        // Clean up stale Redis key and atomically claim before returning "new"
         await this.redis.del(redisKey);
-        return { status: "new" };
+        return await this.claimKeyForNewRun({
+          environmentId,
+          taskIdentifier,
+          debounce,
+          tx,
+        });
       }
 
       // Calculate new delay - parseNaturalLanguageDuration returns a Date (now + duration)
@@ -409,7 +511,13 @@ return 0
         this.$.logger.error("handleExistingRun: invalid delay duration", {
           delay: debounce.delay,
         });
-        return { status: "new" };
+        // Invalid delay but we still need to atomically claim before returning "new"
+        return await this.claimKeyForNewRun({
+          environmentId,
+          taskIdentifier,
+          debounce,
+          tx,
+        });
       }
 
       // Check if max debounce duration would be exceeded
@@ -566,6 +674,8 @@ return 0
         return await this.handleExistingRun({
           existingRunId: claimResult.existingRunId,
           redisKey,
+          environmentId,
+          taskIdentifier,
           debounce,
           tx,
         });
diff --git a/internal-packages/run-engine/src/engine/tests/debounce.test.ts b/internal-packages/run-engine/src/engine/tests/debounce.test.ts
@@ -2041,5 +2041,134 @@ describe("RunEngine debounce", () => {
       }
     }
   );
+
+  containerTest(
+    "waitForExistingRun: returns claimId when key expires during wait",
+    async ({ prisma, redisOptions }) => {
+      // This test verifies the fix for the race condition where waitForExistingRun
+      // returns { status: "new" } without a claimId. Without the fix:
+      // 1. Server A's pending claim expires
+      // 2. Server B's waitForExistingRun detects key is gone, returns { status: "new" } (no claimId)
+      // 3. Server C atomically claims the key and registers runId-C
+      // 4. Server B calls registerDebouncedRun without claimId, does plain SET, overwrites runId-C
+      // NOTE(review): key is deleted before handleDebounce runs - confirm waitForExistingRun is hit.
+      // With the fix, step 2 atomically claims the key before returning, preventing step 4's overwrite.
+
+      const { createRedisClient } = await import("@internal/redis");
+
+      const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION");
+
+      const engine = new RunEngine({
+        prisma,
+        worker: {
+          redis: redisOptions,
+          workers: 1,
+          tasksPerWorker: 10,
+          pollIntervalMs: 100,
+        },
+        queue: {
+          redis: redisOptions,
+        },
+        runLock: {
+          redis: redisOptions,
+        },
+        machines: {
+          defaultMachine: "small-1x",
+          machines: {
+            "small-1x": {
+              name: "small-1x" as const,
+              cpu: 0.5,
+              memory: 0.5,
+              centsPerMs: 0.0001,
+            },
+          },
+          baseCostInCents: 0.0001,
+        },
+        debounce: {
+          maxDebounceDurationMs: 60_000,
+        },
+        tracer: trace.getTracer("test", "0.0.0"),
+      });
+
+      // Separate Redis client (key prefix ends in "debounce:") standing in for other servers
+      const simulatedServerRedis = createRedisClient({
+        ...redisOptions,
+        keyPrefix: `${redisOptions.keyPrefix ?? ""}debounce:`,
+      });
+
+      try {
+        const taskIdentifier = "test-task";
+        const debounceKey = "wait-race-test-key";
+        const environmentId = authenticatedEnvironment.id;
+
+        await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier);
+
+        // Construct the Redis key (same format as DebounceSystem.getDebounceRedisKey)
+        const redisKey = `${environmentId}:${taskIdentifier}:${debounceKey}`;
+
+        // Step 1: Server A claims the key with a pending claim
+        const claimIdA = "claim-server-A";
+        await simulatedServerRedis.set(redisKey, `pending:${claimIdA}`, "PX", 60_000);
+
+        // Step 2: Delete the key to simulate Server A's claim expiring
+        await simulatedServerRedis.del(redisKey);
+
+        // Step 3: Server B calls handleDebounce - since key is gone, it should atomically claim
+        const debounceResult = await engine.debounceSystem.handleDebounce({
+          environmentId,
+          taskIdentifier,
+          debounce: {
+            key: debounceKey,
+            delay: "5s",
+          },
+        });
+
+        // Step 4: Verify result is { status: "new" } WITH a claimId
+        expect(debounceResult.status).toBe("new");
+        if (debounceResult.status === "new") {
+          expect(debounceResult.claimId).toBeDefined();
+          expect(typeof debounceResult.claimId).toBe("string");
+          expect(debounceResult.claimId!.length).toBeGreaterThan(0);
+
+          // Step 5: Verify the key now contains Server B's pending claim
+          const valueAfterB = await simulatedServerRedis.get(redisKey);
+          expect(valueAfterB).toBe(`pending:${debounceResult.claimId}`);
+
+          // Step 6: Server C tries to claim the same key - should fail
+          const claimIdC = "claim-server-C";
+          const claimResultC = await simulatedServerRedis.set(
+            redisKey,
+            `pending:${claimIdC}`,
+            "PX",
+            60_000,
+            "NX"
+          );
+          expect(claimResultC).toBeNull(); // NX fails because key exists
+
+          // Step 7: Server B registers its run using its claimId
+          const runIdB = "run_server_B";
+          const delayUntil = new Date(Date.now() + 60_000);
+          const registered = await engine.debounceSystem.registerDebouncedRun({
+            runId: runIdB,
+            environmentId,
+            taskIdentifier,
+            debounceKey,
+            delayUntil,
+            claimId: debounceResult.claimId,
+          });
+
+          // Step 8: Verify Server B's registration succeeded
+          expect(registered).toBe(true);
+
+          // Step 9: Verify Redis contains Server B's run ID
+          const finalValue = await simulatedServerRedis.get(redisKey);
+          expect(finalValue).toBe(runIdB);
+        }
+      } finally {
+        await simulatedServerRedis.quit();
+        await engine.quit();
+      }
+    }
+  );
 });