Finish job when worker crashed with IOException. (#1239)

This commit is contained in:
Tingluo Huang
2021-08-03 13:21:39 -07:00
committed by GitHub
parent 5af7b87074
commit aab4aca8f7

View File

@@ -507,7 +507,20 @@ namespace GitHub.Runner.Listener
{ {
detailInfo = string.Join(Environment.NewLine, workerOutput); detailInfo = string.Join(Environment.NewLine, workerOutput);
Trace.Info($"Return code {returnCode} indicate worker encounter an unhandled exception or app crash, attach worker stdout/stderr to JobRequest result."); Trace.Info($"Return code {returnCode} indicate worker encounter an unhandled exception or app crash, attach worker stdout/stderr to JobRequest result.");
await LogWorkerProcessUnhandledException(message, detailInfo);
var jobServer = HostContext.GetService<IJobServer>();
VssCredentials jobServerCredential = VssUtil.GetVssCredential(systemConnection);
VssConnection jobConnection = VssUtil.CreateConnection(systemConnection.Url, jobServerCredential);
await jobServer.ConnectAsync(jobConnection);
await LogWorkerProcessUnhandledException(jobServer, message, detailInfo);
// Go ahead to finish the job with result 'Failed' if the STDERR from worker is System.IO.IOException, since it typically means we are running out of disk space.
if (detailInfo.Contains(typeof(System.IO.IOException).ToString(), StringComparison.OrdinalIgnoreCase))
{
Trace.Info($"Finish job with result 'Failed' due to IOException.");
await ForceFailJob(jobServer, message);
}
} }
TaskResult result = TaskResultUtil.TranslateFromReturnCode(returnCode); TaskResult result = TaskResultUtil.TranslateFromReturnCode(returnCode);
@@ -915,53 +928,16 @@ namespace GitHub.Runner.Listener
} }
// log an error issue to job level timeline record // log an error issue to job level timeline record
private async Task LogWorkerProcessUnhandledException(Pipelines.AgentJobRequestMessage message, string errorMessage) private async Task LogWorkerProcessUnhandledException(IJobServer jobServer, Pipelines.AgentJobRequestMessage message, string errorMessage)
{ {
try try
{ {
var systemConnection = message.Resources.Endpoints.SingleOrDefault(x => string.Equals(x.Name, WellKnownServiceEndpointNames.SystemVssConnection));
ArgUtil.NotNull(systemConnection, nameof(systemConnection));
var jobServer = HostContext.GetService<IJobServer>();
VssCredentials jobServerCredential = VssUtil.GetVssCredential(systemConnection);
VssConnection jobConnection = VssUtil.CreateConnection(systemConnection.Url, jobServerCredential);
/* Below is the legacy 'OnPremises' code that is currently unused by the runner
ToDo: re-implement code as appropriate once GHES support is added.
// Make sure SystemConnection Url match Config Url base for OnPremises server
if (!message.Variables.ContainsKey(Constants.Variables.System.ServerType) ||
string.Equals(message.Variables[Constants.Variables.System.ServerType]?.Value, "OnPremises", StringComparison.OrdinalIgnoreCase))
{
try
{
Uri result = null;
Uri configUri = new Uri(_runnerSetting.ServerUrl);
if (Uri.TryCreate(new Uri(configUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped)), jobServerUrl.PathAndQuery, out result))
{
//replace the schema and host portion of messageUri with the host from the
//server URI (which was set at config time)
jobServerUrl = result;
}
}
catch (InvalidOperationException ex)
{
//cannot parse the Uri - not a fatal error
Trace.Error(ex);
}
catch (UriFormatException ex)
{
//cannot parse the Uri - not a fatal error
Trace.Error(ex);
}
} */
await jobServer.ConnectAsync(jobConnection);
var timeline = await jobServer.GetTimelineAsync(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, message.Timeline.Id, CancellationToken.None); var timeline = await jobServer.GetTimelineAsync(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, message.Timeline.Id, CancellationToken.None);
ArgUtil.NotNull(timeline, nameof(timeline)); ArgUtil.NotNull(timeline, nameof(timeline));
TimelineRecord jobRecord = timeline.Records.FirstOrDefault(x => x.Id == message.JobId && x.RecordType == "Job"); TimelineRecord jobRecord = timeline.Records.FirstOrDefault(x => x.Id == message.JobId && x.RecordType == "Job");
ArgUtil.NotNull(jobRecord, nameof(jobRecord)); ArgUtil.NotNull(jobRecord, nameof(jobRecord));
var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = errorMessage }; var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = errorMessage };
unhandledExceptionIssue.Data[Constants.Runner.InternalTelemetryIssueDataKey] = Constants.Runner.WorkerCrash; unhandledExceptionIssue.Data[Constants.Runner.InternalTelemetryIssueDataKey] = Constants.Runner.WorkerCrash;
jobRecord.ErrorCount++; jobRecord.ErrorCount++;
@@ -975,6 +951,21 @@ namespace GitHub.Runner.Listener
} }
} }
// raise job completed event to fail the job.
private async Task ForceFailJob(IJobServer jobServer, Pipelines.AgentJobRequestMessage message)
{
try
{
var jobCompletedEvent = new JobCompletedEvent(message.RequestId, message.JobId, TaskResult.Failed);
await jobServer.RaisePlanEventAsync<JobCompletedEvent>(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, jobCompletedEvent, CancellationToken.None);
}
catch (Exception ex)
{
Trace.Error("Fail to raise JobCompletedEvent back to service.");
Trace.Error(ex);
}
}
private class WorkerDispatcher : IDisposable private class WorkerDispatcher : IDisposable
{ {
public long RequestId { get; } public long RequestId { get; }