handle broker run service exception handling (#3163)

* handle run service exception handling

* force fail always

* format

* format
This commit is contained in:
Yashwanth Anantharaju
2024-02-21 12:04:13 -05:00
committed by GitHub
parent 3db5c90cc4
commit b19b9462d8

View File

@@ -546,13 +546,27 @@ namespace GitHub.Runner.Listener
Trace.Info($"Return code {returnCode} indicate worker encounter an unhandled exception or app crash, attach worker stdout/stderr to JobRequest result."); Trace.Info($"Return code {returnCode} indicate worker encounter an unhandled exception or app crash, attach worker stdout/stderr to JobRequest result.");
var jobServer = await InitializeJobServerAsync(systemConnection); var jobServer = await InitializeJobServerAsync(systemConnection);
await LogWorkerProcessUnhandledException(jobServer, message, detailInfo); var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = detailInfo };
unhandledExceptionIssue.Data[Constants.Runner.InternalTelemetryIssueDataKey] = Constants.Runner.WorkerCrash;
switch (jobServer)
{
case IJobServer js:
{
await LogWorkerProcessUnhandledException(js, message, unhandledExceptionIssue);
// Go ahead to finish the job with result 'Failed' if the STDERR from worker is System.IO.IOException, since it typically means we are running out of disk space. // Go ahead to finish the job with result 'Failed' if the STDERR from worker is System.IO.IOException, since it typically means we are running out of disk space.
if (detailInfo.Contains(typeof(System.IO.IOException).ToString(), StringComparison.OrdinalIgnoreCase)) if (detailInfo.Contains(typeof(System.IO.IOException).ToString(), StringComparison.OrdinalIgnoreCase))
{ {
Trace.Info($"Finish job with result 'Failed' due to IOException."); Trace.Info($"Finish job with result 'Failed' due to IOException.");
await ForceFailJob(jobServer, message, detailInfo); await ForceFailJob(js, message);
}
break;
}
case IRunServer rs:
await ForceFailJob(rs, message, unhandledExceptionIssue);
break;
default:
throw new NotSupportedException($"JobServer type '{jobServer.GetType().Name}' is not supported.");
} }
} }
@@ -1131,9 +1145,7 @@ namespace GitHub.Runner.Listener
} }
// log an error issue to job level timeline record // log an error issue to job level timeline record
private async Task LogWorkerProcessUnhandledException(IRunnerService server, Pipelines.AgentJobRequestMessage message, string detailInfo) private async Task LogWorkerProcessUnhandledException(IJobServer jobServer, Pipelines.AgentJobRequestMessage message, Issue issue)
{
if (server is IJobServer jobServer)
{ {
try try
{ {
@@ -1143,10 +1155,9 @@ namespace GitHub.Runner.Listener
TimelineRecord jobRecord = timeline.Records.FirstOrDefault(x => x.Id == message.JobId && x.RecordType == "Job"); TimelineRecord jobRecord = timeline.Records.FirstOrDefault(x => x.Id == message.JobId && x.RecordType == "Job");
ArgUtil.NotNull(jobRecord, nameof(jobRecord)); ArgUtil.NotNull(jobRecord, nameof(jobRecord));
var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = detailInfo };
unhandledExceptionIssue.Data[Constants.Runner.InternalTelemetryIssueDataKey] = Constants.Runner.WorkerCrash;
jobRecord.ErrorCount++; jobRecord.ErrorCount++;
jobRecord.Issues.Add(unhandledExceptionIssue); jobRecord.Issues.Add(issue);
if (message.Variables.TryGetValue("DistributedTask.MarkJobAsFailedOnWorkerCrash", out var markJobAsFailedOnWorkerCrash) && if (message.Variables.TryGetValue("DistributedTask.MarkJobAsFailedOnWorkerCrash", out var markJobAsFailedOnWorkerCrash) &&
StringUtil.ConvertToBoolean(markJobAsFailedOnWorkerCrash?.Value)) StringUtil.ConvertToBoolean(markJobAsFailedOnWorkerCrash?.Value))
@@ -1165,17 +1176,9 @@ namespace GitHub.Runner.Listener
Trace.Error(ex); Trace.Error(ex);
} }
} }
else
{
Trace.Info("Job server does not support handling unhandled exception yet, error message: {0}", detailInfo);
return;
}
}
// raise job completed event to fail the job. // raise job completed event to fail the job.
private async Task ForceFailJob(IRunnerService server, Pipelines.AgentJobRequestMessage message, string detailInfo) private async Task ForceFailJob(IJobServer jobServer, Pipelines.AgentJobRequestMessage message)
{
if (server is IJobServer jobServer)
{ {
try try
{ {
@@ -1188,16 +1191,16 @@ namespace GitHub.Runner.Listener
Trace.Error(ex); Trace.Error(ex);
} }
} }
else if (server is IRunServer runServer)
private async Task ForceFailJob(IRunServer runServer, Pipelines.AgentJobRequestMessage message, Issue issue)
{ {
try try
{ {
var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = detailInfo }; var annotation = issue.ToAnnotation();
var unhandledAnnotation = unhandledExceptionIssue.ToAnnotation();
var jobAnnotations = new List<Annotation>(); var jobAnnotations = new List<Annotation>();
if (unhandledAnnotation.HasValue) if (annotation.HasValue)
{ {
jobAnnotations.Add(unhandledAnnotation.Value); jobAnnotations.Add(annotation.Value);
} }
await runServer.CompleteJobAsync(message.Plan.PlanId, message.JobId, TaskResult.Failed, outputs: null, stepResults: null, jobAnnotations: jobAnnotations, environmentUrl: null, CancellationToken.None); await runServer.CompleteJobAsync(message.Plan.PlanId, message.JobId, TaskResult.Failed, outputs: null, stepResults: null, jobAnnotations: jobAnnotations, environmentUrl: null, CancellationToken.None);
@@ -1208,11 +1211,6 @@ namespace GitHub.Runner.Listener
Trace.Error(ex); Trace.Error(ex);
} }
} }
else
{
throw new NotSupportedException($"Server type {server.GetType().FullName} is not supported.");
}
}
private async Task<IRunnerService> InitializeJobServerAsync(ServiceEndpoint systemConnection) private async Task<IRunnerService> InitializeJobServerAsync(ServiceEndpoint systemConnection)
{ {