send annotations to run-service (#2574)

* send annotations to run-service

* skip message deletion

* actually don't skip deletion

* enum as numbers

* fix enum

* linting

* remove unncessary file

* feedback
This commit is contained in:
Yashwanth Anantharaju
2023-05-01 08:33:03 -04:00
committed by GitHub
parent 8d74a9ead6
commit 896152d78e
13 changed files with 332 additions and 59 deletions

View File

@@ -15,6 +15,7 @@ using GitHub.Runner.Sdk;
using GitHub.Services.Common;
using GitHub.Services.WebApi;
using GitHub.Services.WebApi.Jwt;
using Sdk.RSWebApi.Contracts;
using Pipelines = GitHub.DistributedTask.Pipelines;
namespace GitHub.Runner.Listener
@@ -372,6 +373,8 @@ namespace GitHub.Runner.Listener
TaskCompletionSource<int> firstJobRequestRenewed = new();
var notification = HostContext.GetService<IJobNotification>();
var systemConnection = message.Resources.Endpoints.SingleOrDefault(x => string.Equals(x.Name, WellKnownServiceEndpointNames.SystemVssConnection, StringComparison.OrdinalIgnoreCase));
// lock renew cancellation token.
using (var lockRenewalTokenSource = new CancellationTokenSource())
using (var workerProcessCancelTokenSource = new CancellationTokenSource())
@@ -379,8 +382,6 @@ namespace GitHub.Runner.Listener
long requestId = message.RequestId;
Guid lockToken = Guid.Empty; // lockToken has never been used, keep this here of compat
var systemConnection = message.Resources.Endpoints.SingleOrDefault(x => string.Equals(x.Name, WellKnownServiceEndpointNames.SystemVssConnection, StringComparison.OrdinalIgnoreCase));
// start renew job request
Trace.Info($"Start renew job request {requestId} for job {message.JobId}.");
Task renewJobRequest = RenewJobRequestAsync(message, systemConnection, _poolId, requestId, lockToken, orchestrationId, firstJobRequestRenewed, lockRenewalTokenSource.Token);
@@ -405,7 +406,7 @@ namespace GitHub.Runner.Listener
await renewJobRequest;
// complete job request with result Cancelled
await CompleteJobRequestAsync(_poolId, message, lockToken, TaskResult.Canceled);
await CompleteJobRequestAsync(_poolId, message, systemConnection, lockToken, TaskResult.Canceled);
return;
}
@@ -544,7 +545,6 @@ namespace GitHub.Runner.Listener
detailInfo = string.Join(Environment.NewLine, workerOutput);
Trace.Info($"Return code {returnCode} indicate worker encounter an unhandled exception or app crash, attach worker stdout/stderr to JobRequest result.");
var jobServer = await InitializeJobServerAsync(systemConnection);
await LogWorkerProcessUnhandledException(jobServer, message, detailInfo);
@@ -552,7 +552,7 @@ namespace GitHub.Runner.Listener
if (detailInfo.Contains(typeof(System.IO.IOException).ToString(), StringComparison.OrdinalIgnoreCase))
{
Trace.Info($"Finish job with result 'Failed' due to IOException.");
await ForceFailJob(jobServer, message);
await ForceFailJob(jobServer, message, detailInfo);
}
}
@@ -567,7 +567,7 @@ namespace GitHub.Runner.Listener
await renewJobRequest;
// complete job request
await CompleteJobRequestAsync(_poolId, message, lockToken, result, detailInfo);
await CompleteJobRequestAsync(_poolId, message, systemConnection, lockToken, result, detailInfo);
// print out unhandled exception happened in worker after we complete job request.
// when we run out of disk space, report back to server has higher priority.
@@ -664,7 +664,7 @@ namespace GitHub.Runner.Listener
await renewJobRequest;
// complete job request
await CompleteJobRequestAsync(_poolId, message, lockToken, resultOnAbandonOrCancel);
await CompleteJobRequestAsync(_poolId, message, systemConnection, lockToken, resultOnAbandonOrCancel);
}
finally
{
@@ -1065,7 +1065,7 @@ namespace GitHub.Runner.Listener
}
}
private async Task CompleteJobRequestAsync(int poolId, Pipelines.AgentJobRequestMessage message, Guid lockToken, TaskResult result, string detailInfo = null)
private async Task CompleteJobRequestAsync(int poolId, Pipelines.AgentJobRequestMessage message, ServiceEndpoint systemConnection, Guid lockToken, TaskResult result, string detailInfo = null)
{
Trace.Entering();
@@ -1077,7 +1077,23 @@ namespace GitHub.Runner.Listener
if (this._isRunServiceJob)
{
Trace.Verbose($"Skip FinishAgentRequest call from Listener because MessageType is {message.MessageType}");
var runServer = await GetRunServerAsync(systemConnection);
var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = detailInfo };
var unhandledAnnotation = unhandledExceptionIssue.ToAnnotation();
var jobAnnotations = new List<Annotation>();
if (unhandledAnnotation.HasValue)
{
jobAnnotations.Add(unhandledAnnotation.Value);
}
try
{
await runServer.CompleteJobAsync(message.Plan.PlanId, message.JobId, result, outputs: null, stepResults: null, jobAnnotations: jobAnnotations, CancellationToken.None);
}
catch (Exception ex)
{
Trace.Error("Fail to raise job completion back to service.");
Trace.Error(ex);
}
return;
}
@@ -1117,7 +1133,7 @@ namespace GitHub.Runner.Listener
}
// log an error issue to job level timeline record
private async Task LogWorkerProcessUnhandledException(IRunnerService server, Pipelines.AgentJobRequestMessage message, string errorMessage)
private async Task LogWorkerProcessUnhandledException(IRunnerService server, Pipelines.AgentJobRequestMessage message, string detailInfo)
{
if (server is IJobServer jobServer)
{
@@ -1129,34 +1145,11 @@ namespace GitHub.Runner.Listener
TimelineRecord jobRecord = timeline.Records.FirstOrDefault(x => x.Id == message.JobId && x.RecordType == "Job");
ArgUtil.NotNull(jobRecord, nameof(jobRecord));
try
{
if (!string.IsNullOrEmpty(errorMessage) &&
message.Variables.TryGetValue("DistributedTask.EnableRunnerIPCDebug", out var enableRunnerIPCDebug) &&
StringUtil.ConvertToBoolean(enableRunnerIPCDebug.Value))
{
// the trace should be best effort and not affect any job result
var match = _invalidJsonRegex.Match(errorMessage);
if (match.Success &&
match.Groups.Count == 2)
{
var jsonPosition = int.Parse(match.Groups[1].Value);
var serializedJobMessage = JsonUtility.ToString(message);
var originalJson = serializedJobMessage.Substring(jsonPosition - 10, 20);
errorMessage = $"Runner sent Json at position '{jsonPosition}': {originalJson} ({Convert.ToBase64String(Encoding.UTF8.GetBytes(originalJson))})\n{errorMessage}";
}
}
}
catch (Exception ex)
{
Trace.Error(ex);
errorMessage = $"Fail to check json IPC error: {ex.Message}\n{errorMessage}";
}
var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = errorMessage };
var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = detailInfo };
unhandledExceptionIssue.Data[Constants.Runner.InternalTelemetryIssueDataKey] = Constants.Runner.WorkerCrash;
jobRecord.ErrorCount++;
jobRecord.Issues.Add(unhandledExceptionIssue);
await jobServer.UpdateTimelineRecordsAsync(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, message.Timeline.Id, new TimelineRecord[] { jobRecord }, CancellationToken.None);
}
catch (Exception ex)
@@ -1167,13 +1160,13 @@ namespace GitHub.Runner.Listener
}
else
{
Trace.Info("Job server does not support handling unhandled exception yet, error message: {0}", errorMessage);
Trace.Info("Job server does not support handling unhandled exception yet, error message: {0}", detailInfo);
return;
}
}
// raise job completed event to fail the job.
private async Task ForceFailJob(IRunnerService server, Pipelines.AgentJobRequestMessage message)
private async Task ForceFailJob(IRunnerService server, Pipelines.AgentJobRequestMessage message, string detailInfo)
{
if (server is IJobServer jobServer)
{
@@ -1192,7 +1185,15 @@ namespace GitHub.Runner.Listener
{
try
{
await runServer.CompleteJobAsync(message.Plan.PlanId, message.JobId, TaskResult.Failed, outputs: null, stepResults: null, CancellationToken.None);
var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = detailInfo };
var unhandledAnnotation = unhandledExceptionIssue.ToAnnotation();
var jobAnnotations = new List<Annotation>();
if (unhandledAnnotation.HasValue)
{
jobAnnotations.Add(unhandledAnnotation.Value);
}
await runServer.CompleteJobAsync(message.Plan.PlanId, message.JobId, TaskResult.Failed, outputs: null, stepResults: null, jobAnnotations: jobAnnotations, CancellationToken.None);
}
catch (Exception ex)
{