Add trace to help debug IPC message corruption in runner. (#1587)

* Add trace to help debug IPC message corruption in runner.

* .
This commit is contained in:
Tingluo Huang
2022-01-05 13:42:20 -05:00
committed by GitHub
parent fa32fcf2a1
commit f2e210e5f3
3 changed files with 53 additions and 8 deletions

View File

@@ -2,17 +2,19 @@ using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using GitHub.DistributedTask.WebApi;
using GitHub.Runner.Common.Util;
using GitHub.Services.WebApi;
using Pipelines = GitHub.DistributedTask.Pipelines;
using System.Linq;
using GitHub.Services.Common;
using GitHub.Runner.Common;
using GitHub.Runner.Common.Util;
using GitHub.Runner.Sdk;
using GitHub.Services.Common;
using GitHub.Services.WebApi;
using GitHub.Services.WebApi.Jwt;
using Pipelines = GitHub.DistributedTask.Pipelines;
namespace GitHub.Runner.Listener
{
@@ -34,6 +36,7 @@ namespace GitHub.Runner.Listener
// and the server will not send another job while this one is still running.
public sealed class JobDispatcher : RunnerService, IJobDispatcher
{
private static Regex _invalidJsonRegex = new Regex(@"invalid\ Json\ at\ position\ '(\d+)':", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Lazy<Dictionary<long, TaskResult>> _localRunJobResult = new Lazy<Dictionary<long, TaskResult>>();
private int _poolId;
@@ -964,6 +967,30 @@ namespace GitHub.Runner.Listener
TimelineRecord jobRecord = timeline.Records.FirstOrDefault(x => x.Id == message.JobId && x.RecordType == "Job");
ArgUtil.NotNull(jobRecord, nameof(jobRecord));
try
{
if (!string.IsNullOrEmpty(errorMessage) &&
message.Variables.TryGetValue("DistributedTask.EnableRunnerIPCDebug", out var enableRunnerIPCDebug) &&
StringUtil.ConvertToBoolean(enableRunnerIPCDebug.Value))
{
// the trace should be best effort and not affect any job result
var match = _invalidJsonRegex.Match(errorMessage);
if (match.Success &&
match.Groups.Count == 2)
{
var jsonPosition = int.Parse(match.Groups[1].Value);
var serializedJobMessage = JsonUtility.ToString(message);
var originalJson = serializedJobMessage.Substring(jsonPosition - 10, 20);
errorMessage = $"Runner sent Json at position '{jsonPosition}': {originalJson} ({Convert.ToBase64String(Encoding.UTF8.GetBytes(originalJson))})\n{errorMessage}";
}
}
}
catch (Exception ex)
{
Trace.Error(ex);
errorMessage = $"Fail to check json IPC error: {ex.Message}\n{errorMessage}";
}
var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = errorMessage };
unhandledExceptionIssue.Data[Constants.Runner.InternalTelemetryIssueDataKey] = Constants.Runner.WorkerCrash;
jobRecord.ErrorCount++;