Compare commits

..

1 Commits

Author SHA1 Message Date
Tingluo Huang
ec9bdfc1b9 Improve reliability for waiting job to start before running steps. 2024-09-04 16:18:04 -04:00
9 changed files with 75 additions and 73 deletions

View File

@@ -92,7 +92,7 @@ namespace GitHub.Runner.Common
public bool ShouldRetryException(Exception ex)
{
if (ex is AccessDeniedException ade)
if (ex is AccessDeniedException ade && ade.ErrorCode == 1)
{
return false;
}

View File

@@ -248,7 +248,6 @@ namespace GitHub.Runner.Common
var currentProcess = Process.GetCurrentProcess();
_userAgents.Add(new ProductInfoHeaderValue("Pid", currentProcess.Id.ToString()));
_userAgents.Add(new ProductInfoHeaderValue("CreationTime", Uri.EscapeDataString(DateTime.UtcNow.ToString("O"))));
_userAgents.Add(new ProductInfoHeaderValue($"({hostType})"));
}
public string GetDirectory(WellKnownDirectory directory)

View File

@@ -77,6 +77,7 @@ namespace GitHub.Runner.Common
private int _resultsServiceExceptionsCount = 0;
private Stopwatch _resultsUploadTimer = new();
private Stopwatch _actionsUploadTimer = new();
private Stopwatch _jobRecordUpdatedTimer = new();
public TaskCompletionSource<int> JobRecordUpdated => _jobRecordUpdated;
@@ -96,6 +97,8 @@ namespace GitHub.Runner.Common
private bool _resultsClientInitiated = false;
private bool _enableTelemetry = false;
private bool _enableJobRecordUpdatedTelemetry = false;
private bool _enableAutoRetry = false;
private delegate Task ResultsFileUploadHandler(ResultsUploadFileInfo file);
public override void Initialize(IHostContext hostContext)
@@ -180,6 +183,23 @@ namespace GitHub.Runner.Common
_allDequeueTasks = new Task[] { _webConsoleLineDequeueTask, _fileUploadDequeueTask, _timelineUpdateDequeueTask, _resultsUploadDequeueTask };
_queueInProcess = true;
if (jobRequest.Variables.TryGetValue("DistributedTask.EnableJobRecordUpdatedTelemetry", out VariableValue enableTelemetry))
{
_enableJobRecordUpdatedTelemetry = StringUtil.ConvertToBoolean(enableTelemetry?.Value);
if (_enableJobRecordUpdatedTelemetry)
{
Trace.Info("Enable telemetry for first job record update.");
_jobRecordUpdatedTimer.Start();
}
}
if (jobRequest.Variables.TryGetValue("DistributedTask.EnableRecordUpdateAutoRetry", out VariableValue enableAutoRetry))
{
Trace.Info("Enable auto retry for timeline record update.");
_enableAutoRetry = StringUtil.ConvertToBoolean(enableAutoRetry?.Value);
}
}
// WebConsoleLine queue and FileUpload queue are always best effort
@@ -232,6 +252,11 @@ namespace GitHub.Runner.Common
Trace.Info(uploadTimeComparison);
_jobTelemetries.Add(new JobTelemetry() { Type = JobTelemetryType.General, Message = uploadTimeComparison });
}
if (_enableJobRecordUpdatedTelemetry)
{
_jobTelemetries.Add(new JobTelemetry() { Type = JobTelemetryType.General, Message = $"First job record updated time after: {_jobRecordUpdatedTimer.ElapsedMilliseconds} ms" });
}
}
public void QueueWebConsoleLine(Guid stepRecordId, string line, long? lineNumber)
@@ -729,6 +754,11 @@ namespace GitHub.Runner.Common
// We have changed the state of the job
Trace.Info("Job timeline record has been updated for the first time.");
_jobRecordUpdated.TrySetResult(0);
if (_enableJobRecordUpdatedTelemetry)
{
_jobRecordUpdatedTimer.Stop();
}
}
}
catch (Exception ex)
@@ -740,6 +770,15 @@ namespace GitHub.Runner.Common
{
mainTimelineRecordsUpdateErrors.Add(ex);
}
if (!runOnce && _enableAutoRetry)
{
foreach (var retryRecordId in update.PendingRecords.DistinctBy(x => x.Id).Select(r => r.Id))
{
Trace.Verbose("Enqueue timeline record {0} update for retry.", retryRecordId);
_timelineUpdateQueue[update.TimelineId].Enqueue(new TimelineRecord() { Id = retryRecordId });
}
}
}
}
}

View File

@@ -93,6 +93,7 @@ namespace GitHub.Runner.Worker.Handlers
ExecutionContext.StepTelemetry.HasPreStep = Data.HasPre;
ExecutionContext.StepTelemetry.HasPostStep = Data.HasPost;
}
ExecutionContext.StepTelemetry.Type = Data.NodeVersion;
ArgUtil.NotNullOrEmpty(target, nameof(target));
target = Path.Combine(ActionDirectory, target);
@@ -123,7 +124,6 @@ namespace GitHub.Runner.Worker.Handlers
Data.NodeVersion = "node20";
}
var nodeRuntimeVersion = await StepHost.DetermineNodeRuntimeVersion(ExecutionContext, Data.NodeVersion);
ExecutionContext.StepTelemetry.Type = nodeRuntimeVersion;
string file = Path.Combine(HostContext.GetDirectory(WellKnownDirectory.Externals), nodeRuntimeVersion, "bin", $"node{IOUtil.ExeExtension}");
// Format the arguments passed to node.

View File

@@ -210,8 +210,14 @@ namespace GitHub.Runner.Worker
// Server won't issue ID_TOKEN for non-inprogress job.
// If the job is trying to use OIDC feature, we want the job to be marked as in-progress before running any customer's steps as much as we can.
// Timeline record update background process runs every 500ms, so delay 1000ms is enough for most of the cases
Trace.Info($"Waiting for job to be marked as started.");
await Task.WhenAny(_jobServerQueue.JobRecordUpdated.Task, Task.Delay(1000));
var maxWaitTimeInSeconds = jobContext.Global.Variables.GetInt("DistributedTask.FirstJobRecordUpdateWaitTimeInSeconds");
if (maxWaitTimeInSeconds == null)
{
maxWaitTimeInSeconds = 1;
}
Trace.Info($"Waiting {maxWaitTimeInSeconds.Value} seconds for job to be marked as started.");
await Task.WhenAny(_jobServerQueue.JobRecordUpdated.Task, Task.Delay(maxWaitTimeInSeconds.Value * 1000));
}
// Run all job steps

View File

@@ -1,20 +0,0 @@
using System.Runtime.Serialization;
namespace GitHub.Actions.RunService.WebApi
{
[DataContract]
public class BrokerError
{
[DataMember(Name = "source", EmitDefaultValue = false)]
public string Source { get; set; }
[DataMember(Name = "errorKind", EmitDefaultValue = false)]
public string ErrorKind { get; set; }
[DataMember(Name = "statusCode", EmitDefaultValue = false)]
public int StatusCode { get; set; }
[DataMember(Name = "errorMessage", EmitDefaultValue = false)]
public string Message { get; set; }
}
}

View File

@@ -1,10 +0,0 @@
using System.Runtime.Serialization;
namespace GitHub.Actions.RunService.WebApi
{
[DataContract]
public class BrokerErrorKind
{
public const string RunnerVersionTooOld = "RunnerVersionTooOld";
}
}

View File

@@ -107,6 +107,15 @@ namespace GitHub.Actions.RunService.WebApi
}
}
// Temporary back compat
switch (result.StatusCode)
{
case HttpStatusCode.NotFound:
throw new TaskOrchestrationJobNotFoundException($"Job message not found: {messageId}");
case HttpStatusCode.Conflict:
throw new TaskOrchestrationJobAlreadyAcquiredException($"Job message already acquired: {messageId}");
}
if (!string.IsNullOrEmpty(result.ErrorBody))
{
throw new Exception($"Failed to get job message: {result.Error}. {Truncate(result.ErrorBody)}");
@@ -162,6 +171,13 @@ namespace GitHub.Actions.RunService.WebApi
}
}
// Temporary back compat
switch (result.StatusCode)
{
case HttpStatusCode.NotFound:
throw new TaskOrchestrationJobNotFoundException($"Job not found: {jobId}");
}
if (!string.IsNullOrEmpty(result.ErrorBody))
{
throw new Exception($"Failed to complete job: {result.Error}. {Truncate(result.ErrorBody)}");
@@ -209,6 +225,13 @@ namespace GitHub.Actions.RunService.WebApi
}
}
// Temporary back compat
switch (result.StatusCode)
{
case HttpStatusCode.NotFound:
throw new TaskOrchestrationJobNotFoundException($"Job not found: {jobId}");
}
if (!string.IsNullOrEmpty(result.ErrorBody))
{
throw new Exception($"Failed to renew job: {result.Error}. {Truncate(result.ErrorBody)}");

View File

@@ -103,7 +103,6 @@ namespace GitHub.Actions.RunService.WebApi
new HttpMethod("GET"),
requestUri: requestUri,
queryParameters: queryParams,
readErrorBody: true,
cancellationToken: cancellationToken);
if (result.IsSuccess)
@@ -111,21 +110,8 @@ namespace GitHub.Actions.RunService.WebApi
return result.Value;
}
if (TryParseErrorBody(result.ErrorBody, out BrokerError brokerError))
{
switch (brokerError.ErrorKind)
{
case BrokerErrorKind.RunnerVersionTooOld:
throw new AccessDeniedException(brokerError.Message)
{
ErrorCode = 1
};
default:
break;
}
}
// temporary back compat
// the only time we throw a `Forbidden` exception from Listener /messages is when the runner is
// disable_update and is too old to poll
if (result.StatusCode == HttpStatusCode.Forbidden)
{
throw new AccessDeniedException($"{result.Error} Runner version v{runnerVersion} is deprecated and cannot receive messages.")
@@ -134,7 +120,7 @@ namespace GitHub.Actions.RunService.WebApi
};
}
throw new Exception($"Failed to get job message. Request to {requestUri} failed with status: {result.StatusCode}. Error message {result.Error}");
throw new Exception($"Failed to get job message: {result.Error}");
}
public async Task<TaskAgentSession> CreateSessionAsync(
@@ -186,26 +172,5 @@ namespace GitHub.Actions.RunService.WebApi
throw new Exception($"Failed to delete broker session: {result.Error}");
}
private static bool TryParseErrorBody(string errorBody, out BrokerError error)
{
if (!string.IsNullOrEmpty(errorBody))
{
try
{
error = JsonUtility.FromString<BrokerError>(errorBody);
if (error?.Source == "actions-broker-listener")
{
return true;
}
}
catch (Exception)
{
}
}
error = null;
return false;
}
}
}