Add trace to help debug IPC message corruption in runner. (#1587)

* Add trace to help debug IPC message corruption in runner.

* .
This commit is contained in:
Tingluo Huang
2022-01-05 13:42:20 -05:00
committed by GitHub
parent fa32fcf2a1
commit f2e210e5f3
3 changed files with 53 additions and 8 deletions

View File

@@ -2,17 +2,19 @@ using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using GitHub.DistributedTask.WebApi;
using GitHub.Runner.Common.Util;
using GitHub.Services.WebApi;
using Pipelines = GitHub.DistributedTask.Pipelines;
using System.Linq;
using GitHub.Services.Common;
using GitHub.Runner.Common;
using GitHub.Runner.Common.Util;
using GitHub.Runner.Sdk;
using GitHub.Services.Common;
using GitHub.Services.WebApi;
using GitHub.Services.WebApi.Jwt;
using Pipelines = GitHub.DistributedTask.Pipelines;
namespace GitHub.Runner.Listener
{
@@ -34,6 +36,7 @@ namespace GitHub.Runner.Listener
// and the server will not send another job while this one is still running.
public sealed class JobDispatcher : RunnerService, IJobDispatcher
{
private static Regex _invalidJsonRegex = new Regex(@"invalid\ Json\ at\ position\ '(\d+)':", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private readonly Lazy<Dictionary<long, TaskResult>> _localRunJobResult = new Lazy<Dictionary<long, TaskResult>>();
private int _poolId;
@@ -964,6 +967,30 @@ namespace GitHub.Runner.Listener
TimelineRecord jobRecord = timeline.Records.FirstOrDefault(x => x.Id == message.JobId && x.RecordType == "Job");
ArgUtil.NotNull(jobRecord, nameof(jobRecord));
try
{
if (!string.IsNullOrEmpty(errorMessage) &&
message.Variables.TryGetValue("DistributedTask.EnableRunnerIPCDebug", out var enableRunnerIPCDebug) &&
StringUtil.ConvertToBoolean(enableRunnerIPCDebug.Value))
{
// the trace should be best effort and not affect any job result
var match = _invalidJsonRegex.Match(errorMessage);
if (match.Success &&
match.Groups.Count == 2)
{
var jsonPosition = int.Parse(match.Groups[1].Value);
var serializedJobMessage = JsonUtility.ToString(message);
var originalJson = serializedJobMessage.Substring(jsonPosition - 10, 20);
errorMessage = $"Runner sent Json at position '{jsonPosition}': {originalJson} ({Convert.ToBase64String(Encoding.UTF8.GetBytes(originalJson))})\n{errorMessage}";
}
}
}
catch (Exception ex)
{
Trace.Error(ex);
errorMessage = $"Fail to check json IPC error: {ex.Message}\n{errorMessage}";
}
var unhandledExceptionIssue = new Issue() { Type = IssueType.Error, Message = errorMessage };
unhandledExceptionIssue.Data[Constants.Runner.InternalTelemetryIssueDataKey] = Constants.Runner.WorkerCrash;
jobRecord.ErrorCount++;

View File

@@ -425,6 +425,7 @@ namespace GitHub.Runner.Listener
}
else
{
Trace.Info($"Received job message of length {message.Body.Length} from service, with hash '{IOUtil.GetSha256Hash(message.Body)}'");
var jobMessage = StringUtil.ConvertFromJson<Pipelines.AgentJobRequestMessage>(message.Body);
jobDispatcher.Run(jobMessage, runOnce);
if (runOnce)
@@ -546,7 +547,7 @@ Config Options:
_term.WriteLine($@" --windowslogonaccount string Account to run the service as. Requires runasservice");
_term.WriteLine($@" --windowslogonpassword string Password for the service account. Requires runasservice");
#endif
_term.WriteLine($@"
_term.WriteLine($@"
Examples:
Check GitHub server network connectivity:
.{separator}run.{ext} --check --url <url> --pat <pat>

View File

@@ -9,6 +9,7 @@ using System.Threading.Tasks;
using GitHub.Services.WebApi;
using GitHub.Runner.Common;
using GitHub.Runner.Sdk;
using System.Text;
namespace GitHub.Runner.Worker
{
@@ -43,6 +44,7 @@ namespace GitHub.Runner.Worker
ArgUtil.NotNullOrEmpty(pipeOut, nameof(pipeOut));
VssUtil.InitializeVssClientSettings(HostContext.UserAgents, HostContext.WebProxy);
var jobRunner = HostContext.CreateService<IJobRunner>();
var terminal = HostContext.GetService<ITerminal>();
using (var channel = HostContext.CreateService<IProcessChannel>())
using (var jobRequestCancellationToken = CancellationTokenSource.CreateLinkedTokenSource(HostContext.RunnerShutdownToken))
@@ -64,7 +66,22 @@ namespace GitHub.Runner.Worker
Trace.Info("Message received.");
ArgUtil.Equal(MessageType.NewJobRequest, channelMessage.MessageType, nameof(channelMessage.MessageType));
ArgUtil.NotNullOrEmpty(channelMessage.Body, nameof(channelMessage.Body));
var jobMessage = StringUtil.ConvertFromJson<Pipelines.AgentJobRequestMessage>(channelMessage.Body);
Pipelines.AgentJobRequestMessage jobMessage = null;
try
{
jobMessage = StringUtil.ConvertFromJson<Pipelines.AgentJobRequestMessage>(channelMessage.Body);
}
catch (JsonReaderException ex)
{
if (channelMessage.Body.Length > ex.LinePosition + 10)
{
var errorChunk = channelMessage.Body.Substring(ex.LinePosition - 10, 20);
terminal.WriteError($"Worker received invalid Json at position '{ex.LinePosition}': {errorChunk} ({Convert.ToBase64String(Encoding.UTF8.GetBytes(errorChunk))})");
}
throw;
}
ArgUtil.NotNull(jobMessage, nameof(jobMessage));
HostContext.WritePerfCounter($"WorkerJobMessageReceived_{jobMessage.RequestId.ToString()}");