@@ -243,6 +243,74 @@ return 0
243243 return { claimed : false , existingRunId : existingValue } ;
244244 }
245245
246+ /**
247+ * Atomically claims the debounce key before returning "new".
248+ * This prevents the race condition where returning "new" without a claimId
249+ * allows registerDebouncedRun to do a plain SET that can overwrite another server's registration.
250+ *
251+ * This method is called when we've determined there's no valid existing run but need
252+ * to safely claim the key before creating a new one.
253+ */
254+ private async claimKeyForNewRun ( {
255+ environmentId,
256+ taskIdentifier,
257+ debounce,
258+ tx,
259+ } : {
260+ environmentId : string ;
261+ taskIdentifier : string ;
262+ debounce : DebounceOptions ;
263+ tx ?: PrismaClientOrTransaction ;
264+ } ) : Promise < DebounceResult > {
265+ const redisKey = this . getDebounceRedisKey ( environmentId , taskIdentifier , debounce . key ) ;
266+ const claimId = nanoid ( 16 ) ;
267+
268+ const claimResult = await this . claimDebounceKey ( {
269+ environmentId,
270+ taskIdentifier,
271+ debounceKey : debounce . key ,
272+ claimId,
273+ ttlMs : CLAIM_TTL_MS ,
274+ } ) ;
275+
276+ if ( claimResult . claimed ) {
277+ this . $ . logger . debug ( "claimKeyForNewRun: claimed key, returning new" , {
278+ debounceKey : debounce . key ,
279+ taskIdentifier,
280+ environmentId,
281+ claimId,
282+ } ) ;
283+ return { status : "new" , claimId } ;
284+ }
285+
286+ if ( claimResult . existingRunId ) {
287+ // Another server registered a run while we were processing - handle it
288+ this . $ . logger . debug ( "claimKeyForNewRun: found existing run, handling it" , {
289+ debounceKey : debounce . key ,
290+ existingRunId : claimResult . existingRunId ,
291+ } ) ;
292+ return await this . handleExistingRun ( {
293+ existingRunId : claimResult . existingRunId ,
294+ redisKey,
295+ environmentId,
296+ taskIdentifier,
297+ debounce,
298+ tx,
299+ } ) ;
300+ }
301+
302+ // Another server is creating (pending state) - wait for it
303+ this . $ . logger . debug ( "claimKeyForNewRun: key is pending, waiting for existing run" , {
304+ debounceKey : debounce . key ,
305+ } ) ;
306+ return await this . waitForExistingRun ( {
307+ environmentId,
308+ taskIdentifier,
309+ debounce,
310+ tx,
311+ } ) ;
312+ }
313+
246314 /**
247315 * Waits for another server to complete its claim and register a run ID.
248316 * Used when we detect a "pending" state, meaning another server has claimed
@@ -267,13 +335,18 @@ return 0
267335 const value = await this . redis . get ( redisKey ) ;
268336
269337 if ( ! value ) {
270- // Key expired or was deleted - return "new" to create fresh
271- this . $ . logger . debug ( "waitForExistingRun: key expired/deleted, returning new " , {
338+ // Key expired or was deleted - atomically claim before returning "new"
339+ this . $ . logger . debug ( "waitForExistingRun: key expired/deleted, claiming key " , {
272340 redisKey,
273341 debounceKey : debounce . key ,
274342 attempt : i + 1 ,
275343 } ) ;
276- return { status : "new" } ;
344+ return await this . claimKeyForNewRun ( {
345+ environmentId,
346+ taskIdentifier,
347+ debounce,
348+ tx,
349+ } ) ;
277350 }
278351
279352 if ( ! value . startsWith ( "pending:" ) ) {
@@ -287,6 +360,8 @@ return 0
287360 return await this . handleExistingRun ( {
288361 existingRunId : value ,
289362 redisKey,
363+ environmentId,
364+ taskIdentifier,
290365 debounce,
291366 tx,
292367 } ) ;
@@ -314,12 +389,17 @@ return 0
314389 const deleteResult = await this . conditionallyDeletePendingKey ( redisKey ) ;
315390
316391 if ( deleteResult . deleted ) {
317- // Key was pending (or didn't exist) - safe to create new run
318- this . $ . logger . debug ( "waitForExistingRun: stale pending key deleted, returning new " , {
392+ // Key was pending (or didn't exist) - atomically claim before returning "new"
393+ this . $ . logger . debug ( "waitForExistingRun: stale pending key deleted, claiming key " , {
319394 redisKey,
320395 debounceKey : debounce . key ,
321396 } ) ;
322- return { status : "new" } ;
397+ return await this . claimKeyForNewRun ( {
398+ environmentId,
399+ taskIdentifier,
400+ debounce,
401+ tx,
402+ } ) ;
323403 }
324404
325405 // Key now has a valid run ID - the original server completed!
@@ -335,6 +415,8 @@ return 0
335415 return await this . handleExistingRun ( {
336416 existingRunId : deleteResult . existingRunId ,
337417 redisKey,
418+ environmentId,
419+ taskIdentifier,
338420 debounce,
339421 tx,
340422 } ) ;
@@ -347,11 +429,15 @@ return 0
347429 private async handleExistingRun ( {
348430 existingRunId,
349431 redisKey,
432+ environmentId,
433+ taskIdentifier,
350434 debounce,
351435 tx,
352436 } : {
353437 existingRunId : string ;
354438 redisKey : string ;
439+ environmentId : string ;
440+ taskIdentifier : string ;
355441 debounce : DebounceOptions ;
356442 tx ?: PrismaClientOrTransaction ;
357443 } ) : Promise < DebounceResult > {
@@ -369,9 +455,14 @@ return 0
369455 debounceKey : debounce . key ,
370456 error,
371457 } ) ;
372- // Clean up stale Redis key
458+ // Clean up stale Redis key and atomically claim before returning "new"
373459 await this . redis . del ( redisKey ) ;
374- return { status : "new" } ;
460+ return await this . claimKeyForNewRun ( {
461+ environmentId,
462+ taskIdentifier,
463+ debounce,
464+ tx,
465+ } ) ;
375466 }
376467
377468 // Check if run is still in DELAYED status (or legacy RUN_CREATED for older runs)
@@ -381,9 +472,14 @@ return 0
381472 executionStatus : snapshot . executionStatus ,
382473 debounceKey : debounce . key ,
383474 } ) ;
384- // Clean up Redis key since run is no longer debounceable
475+ // Clean up Redis key and atomically claim before returning "new"
385476 await this . redis . del ( redisKey ) ;
386- return { status : "new" } ;
477+ return await this . claimKeyForNewRun ( {
478+ environmentId,
479+ taskIdentifier,
480+ debounce,
481+ tx,
482+ } ) ;
387483 }
388484
389485 // Get the run to check debounce metadata and createdAt
@@ -399,8 +495,14 @@ return 0
399495 existingRunId,
400496 debounceKey : debounce . key ,
401497 } ) ;
498+ // Clean up stale Redis key and atomically claim before returning "new"
402499 await this . redis . del ( redisKey ) ;
403- return { status : "new" } ;
500+ return await this . claimKeyForNewRun ( {
501+ environmentId,
502+ taskIdentifier,
503+ debounce,
504+ tx,
505+ } ) ;
404506 }
405507
406508 // Calculate new delay - parseNaturalLanguageDuration returns a Date (now + duration)
@@ -409,7 +511,13 @@ return 0
409511 this . $ . logger . error ( "handleExistingRun: invalid delay duration" , {
410512 delay : debounce . delay ,
411513 } ) ;
412- return { status : "new" } ;
514+ // Invalid delay but we still need to atomically claim before returning "new"
515+ return await this . claimKeyForNewRun ( {
516+ environmentId,
517+ taskIdentifier,
518+ debounce,
519+ tx,
520+ } ) ;
413521 }
414522
415523 // Check if max debounce duration would be exceeded
@@ -566,6 +674,8 @@ return 0
566674 return await this . handleExistingRun ( {
567675 existingRunId : claimResult . existingRunId ,
568676 redisKey,
677+ environmentId,
678+ taskIdentifier,
569679 debounce,
570680 tx,
571681 } ) ;
0 commit comments