Backoff to avoid excessive retries to Run Service in a duration (#3354)

This commit is contained in:
eric sciple
2024-06-24 16:33:22 -05:00
committed by GitHub
parent ecb732eaf4
commit 054fc2e046
6 changed files with 318 additions and 6 deletions

View File

@@ -0,0 +1,44 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using GitHub.Runner.Common;
using GitHub.Services.Common;
namespace GitHub.Runner.Listener
{
[ServiceLocator(Default = typeof(ErrorThrottler))]
public interface IErrorThrottler : IRunnerService
{
void Reset();
Task IncrementAndWaitAsync(CancellationToken token);
}
public sealed class ErrorThrottler : RunnerService, IErrorThrottler
{
internal static readonly TimeSpan MinBackoff = TimeSpan.FromSeconds(1);
internal static readonly TimeSpan MaxBackoff = TimeSpan.FromMinutes(1);
internal static readonly TimeSpan BackoffCoefficient = TimeSpan.FromSeconds(1);
private int _count = 0;
public void Reset()
{
_count = 0;
}
public async Task IncrementAndWaitAsync(CancellationToken token)
{
if (++_count <= 1)
{
return;
}
TimeSpan backoff = BackoffTimerHelper.GetExponentialBackoff(
attempt: _count - 2, // 0-based attempt
minBackoff: MinBackoff,
maxBackoff: MaxBackoff,
deltaBackoff: BackoffCoefficient);
Trace.Warning($"Back off {backoff.TotalSeconds} seconds before next attempt. Current consecutive error count: {_count}");
await HostContext.Delay(backoff, token);
}
}
}

View File

@@ -32,10 +32,25 @@ namespace GitHub.Runner.Listener
private bool _inConfigStage;
private ManualResetEvent _completedCommand = new(false);
// <summary>
// Helps avoid excessive calls to Run Service when encountering non-retriable errors from /acquirejob.
// Normally we rely on the HTTP clients to back off between retry attempts. However, acquiring a job
// involves calls to both Run Serivce and Broker. And Run Service and Broker communicate with each other
// in an async fashion.
//
// When Run Service encounters a non-retriable error, it sends an async message to Broker. The runner will,
// however, immediately call Broker to get the next message. If the async event from Run Service to Broker
// has not yet been processed, the next message from Broker may be the same job message.
//
// The error throttler helps us back off when encountering successive, non-retriable errors from /acquirejob.
// </summary>
private IErrorThrottler _acquireJobThrottler;
public override void Initialize(IHostContext hostContext)
{
base.Initialize(hostContext);
_term = HostContext.GetService<ITerminal>();
_acquireJobThrottler = HostContext.CreateService<IErrorThrottler>();
}
public async Task<int> ExecuteCommand(CommandSettings command)
@@ -565,13 +580,16 @@ namespace GitHub.Runner.Listener
await runServer.ConnectAsync(new Uri(messageRef.RunServiceUrl), creds);
try
{
jobRequestMessage =
await runServer.GetJobMessageAsync(messageRef.RunnerRequestId,
messageQueueLoopTokenSource.Token);
jobRequestMessage = await runServer.GetJobMessageAsync(messageRef.RunnerRequestId, messageQueueLoopTokenSource.Token);
_acquireJobThrottler.Reset();
}
catch (TaskOrchestrationJobAlreadyAcquiredException)
catch (Exception ex) when (
ex is TaskOrchestrationJobNotFoundException || // HTTP status 404
ex is TaskOrchestrationJobAlreadyAcquiredException || // HTTP status 409
ex is TaskOrchestrationJobUnprocessableException) // HTTP status 422
{
Trace.Info("Job is already acquired, skip this message.");
Trace.Info($"Skipping message Job. {ex.Message}");
await _acquireJobThrottler.IncrementAndWaitAsync(messageQueueLoopTokenSource.Token);
continue;
}
catch (Exception ex)