Skip to content

Commit ed1595c

Browse files
committed
address queuing slowdown
1 parent 77869bb commit ed1595c

File tree

4 files changed

+38
-18
lines changed

4 files changed

+38
-18
lines changed

src/TesApi.Web/BatchScheduler.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ public partial class BatchScheduler : IBatchScheduler
6464

6565
internal const string NodeTaskRunnerFilename = "tes-runner";
6666

67-
internal static TimeSpan QueuedTesTaskTaskGroupGatherWindow = TimeSpan.FromSeconds(10);
67+
internal static TimeSpan QueuedTesTaskTaskGroupGatherWindow = TimeSpan.FromSeconds(2);
6868
internal static TimeSpan QueuedTesTaskPoolGroupGatherWindow = TaskScheduler.BatchRunInterval;
6969

7070
private const string AzureSupportUrl = "https://portal.azure.com/#blade/Microsoft_Azure_Support/HelpAndSupportBlade/newsupportrequest";
@@ -233,7 +233,7 @@ async Task<bool> SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState,
233233

234234
if (!pool?.OrphanedTesTasks.TryRemove(tesTask.Id, out _) ?? false)
235235
{
236-
pool.AssociatedTesTasks.Where(pair => tesTask.Id.Equals(pair.Value, StringComparison.InvariantCultureIgnoreCase)).ForEach(pair => _ = pool.AssociatedTesTasks.TryRemove(pair));
236+
pool.AssociatedTesTasks.Where(pair => tesTask.Id.Equals(pair.Value, StringComparison.Ordinal)).ForEach(pair => _ = pool.AssociatedTesTasks.TryRemove(pair));
237237
}
238238
}
239239

src/TesApi.Web/TaskScheduler.cs

+33-13
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,11 @@ internal class TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft
5656
, ITaskScheduler
5757
{
5858
private static readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(15);
59-
private static readonly TimeSpan queuedRunInterval = TimeSpan.FromMilliseconds(100);
59+
private static readonly TimeSpan queuedRunInterval = TimeSpan.FromSeconds(15);
6060
private static readonly TimeSpan queuedRepositoryInterval = TimeSpan.FromMinutes(1);
6161
internal static readonly TimeSpan BatchRunInterval = TimeSpan.FromSeconds(30); // The very fastest processes inside of Azure Batch accessing anything within pools or jobs appears to use a 30 second polling interval
62-
private static readonly TimeSpan shortBackgroundRunInterval = TimeSpan.FromSeconds(1);
63-
private static readonly TimeSpan longBackgroundRunInterval = TimeSpan.FromSeconds(2.5);
62+
private static readonly TimeSpan shortBackgroundRunInterval = TimeSpan.FromMilliseconds(75);
63+
private static readonly TimeSpan longBackgroundRunInterval = TimeSpan.FromSeconds(15);
6464
private static readonly TimeSpan orphanedTaskInterval = TimeSpan.FromMinutes(10);
6565
private readonly RunnerEventsProcessor nodeEventProcessor = nodeEventProcessor;
6666

@@ -209,9 +209,31 @@ private async ValueTask ProcessQueuedTesTasksAsync(CancellationToken cancellatio
209209
.OrderByDescending(t => t.CreationTime)
210210
.ToAsyncEnumerable());
211211

212-
while (!cancellationToken.IsCancellationRequested && (queuedTesTasks.TryDequeue(out var tesTask) || (DateTimeOffset.UtcNow >= nextQueuedRepository && await RefillFromRepository() && queuedTesTasks.TryDequeue(out tesTask))))
212+
cancellationToken.ThrowIfCancellationRequested();
213+
var now = DateTimeOffset.UtcNow;
214+
215+
HashSet<TesTask> tasks = new(new TesTasByIdComparer());
216+
217+
while (queuedTesTasks.TryDequeue(out var tesTask))
213218
{
214-
await ProcessOrchestratedTesTaskAsync("Queued", new(BatchScheduler.ProcessQueuedTesTaskAsync(tesTask, cancellationToken), tesTask), Requeue, cancellationToken);
219+
_ = tasks.Add(tesTask);
220+
}
221+
222+
// Catch any tasks reset back to Queued
223+
if (nextQueuedRepository <= now)
224+
{
225+
nextQueuedRepository = now + queuedRepositoryInterval;
226+
await (await query(cancellationToken)).ForEachAsync(task => _ = tasks.Add(task), cancellationToken);
227+
}
228+
229+
tasks.ForEach(QueueTesTask);
230+
231+
void QueueTesTask(TesTask tesTask)
232+
{
233+
_ = BatchScheduler.ProcessQueuedTesTaskAsync(tesTask, cancellationToken)
234+
.ContinueWith(task => ProcessOrchestratedTesTaskAsync("Queued", new(task, tesTask), Requeue, cancellationToken).AsTask())
235+
.Unwrap()
236+
.ContinueWith(task => Logger.LogError(task.Exception, "Failure to queue TesTask {TesTask}", tesTask.Id), TaskContinuationOptions.OnlyOnFaulted);
215237
}
216238

217239
async ValueTask Requeue(RepositoryCollisionException<TesTask> exception)
@@ -223,14 +245,6 @@ async ValueTask Requeue(RepositoryCollisionException<TesTask> exception)
223245
queuedTesTasks.Enqueue(tesTask);
224246
}
225247
}
226-
227-
// Catch any tasks reset back to Queued
228-
async ValueTask<bool> RefillFromRepository()
229-
{
230-
nextQueuedRepository = DateTimeOffset.UtcNow + queuedRepositoryInterval;
231-
await (await query(cancellationToken)).ForEachAsync(queuedTesTasks.Enqueue, cancellationToken);
232-
return !queuedTesTasks.IsEmpty;
233-
}
234248
}
235249

236250
/// <summary>
@@ -538,6 +552,12 @@ await OrchestrateTesTasksOnBatchAsync(
538552
cancellationToken);
539553
}
540554

555+
private sealed class TesTasByIdComparer : IEqualityComparer<TesTask>
556+
{
557+
// Two tasks are equal when they are the same reference (including both null) or when both are
// non-null and their Ids match ordinally. Null tasks and null Ids are handled without throwing,
// which keeps Equals consistent with GetHashCode (a null Id hashes to 0).
bool IEqualityComparer<TesTask>.Equals(TesTask x, TesTask y) => ReferenceEquals(x, y) || (x is not null && y is not null && string.Equals(x.Id, y.Id, StringComparison.Ordinal));
558+
// Hash by the task's Id; a task whose Id is null hashes to 0.
int IEqualityComparer<TesTask>.GetHashCode(TesTask obj) => obj.Id is { } id ? id.GetHashCode() : 0;
559+
}
560+
541561
/// <inheritdoc/>
542562
void ITaskScheduler.QueueTesTask(TesTask tesTask)
543563
{

src/build-push-acr/AcrBuild.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,8 @@ public async ValueTask LoadAsync(IArchive archive, ArmEnvironment environment, C
9292
{
9393
maxTag = await repository.GetAllManifestPropertiesAsync(cancellationToken: cancellationToken)
9494
.SelectMany(props => props.Tags.ToAsyncEnumerable())
95-
.Where(tag => tag.StartsWith(this.tag.Version.ToString(3)) && Version.TryParse(tag, out _))
96-
.Select(tag => new Version(tag))
95+
.Where(tag => tag.StartsWith(this.tag.Version.ToString(3)) && Version.TryParse(tag.Contains('-') ? tag[..tag.IndexOf('-')] : tag, out _))
96+
.Select(tag => new Version(tag.Contains('-') ? tag[..tag.IndexOf('-')] : tag))
9797
.MaxAsync(cancellationToken);
9898
}
9999
catch (Azure.RequestFailedException ex) when (ex.Status == 404)

src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ spec:
128128
- containerPort: {{ .Values.service.tesPort }}
129129
resources:
130130
requests:
131-
cpu: "1.5"
131+
cpu: "2.5"
132132
memory: "3072Mi"
133133
limits:
134134
cpu: "8.0"

0 commit comments

Comments
 (0)