Compare commits

...

1 Commits

Author SHA1 Message Date
Ryan van Zeben
4b416898db Add in stall manager 2023-07-24 10:18:51 +00:00
10 changed files with 259 additions and 10 deletions

View File

@@ -158,6 +158,7 @@ namespace GitHub.Runner.Common
public static readonly string LogTemplateErrorsAsDebugMessages = "DistributedTask.LogTemplateErrorsAsDebugMessages"; public static readonly string LogTemplateErrorsAsDebugMessages = "DistributedTask.LogTemplateErrorsAsDebugMessages";
public static readonly string UseContainerPathForTemplate = "DistributedTask.UseContainerPathForTemplate"; public static readonly string UseContainerPathForTemplate = "DistributedTask.UseContainerPathForTemplate";
public static readonly string AllowRunnerContainerHooks = "DistributedTask.AllowRunnerContainerHooks"; public static readonly string AllowRunnerContainerHooks = "DistributedTask.AllowRunnerContainerHooks";
public static readonly string AllowRunnerStallDetect = "DistributedTask.AllowRunnerStallDetect";
} }
public static readonly string InternalTelemetryIssueDataKey = "_internal_telemetry"; public static readonly string InternalTelemetryIssueDataKey = "_internal_telemetry";

View File

@@ -11,5 +11,10 @@ namespace GitHub.Runner.Worker
var isContainerHooksPathSet = !string.IsNullOrEmpty(Environment.GetEnvironmentVariable(Constants.Hooks.ContainerHooksPath)); var isContainerHooksPathSet = !string.IsNullOrEmpty(Environment.GetEnvironmentVariable(Constants.Hooks.ContainerHooksPath));
return isContainerHookFeatureFlagSet && isContainerHooksPathSet; return isContainerHookFeatureFlagSet && isContainerHooksPathSet;
} }
// Reports whether stall detection is switched on for this job.
// Reads the AllowRunnerStallDetect feature flag from the supplied variables;
// a null variables instance or an unset flag both yield false.
public static bool IsStallDetectEnabled(Variables variables)
{
    bool? stallDetectFlag = variables?.GetBoolean(Constants.Runner.Features.AllowRunnerStallDetect);
    if (stallDetectFlag.HasValue)
    {
        return stallDetectFlag.Value;
    }

    return false;
}
} }
} }

View File

@@ -1,4 +1,4 @@
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.IO; using System.IO;
using System.Linq; using System.Linq;
@@ -240,9 +240,11 @@ namespace GitHub.Runner.Worker.Handlers
} }
else else
{ {
using (var stdoutManager = new OutputManager(ExecutionContext, ActionCommandManager, container)) StallManager stallManager = FeatureManager.IsStallDetectEnabled(ExecutionContext.Global.Variables) ? new StallManager(ExecutionContext) : null;
using (var stderrManager = new OutputManager(ExecutionContext, ActionCommandManager, container)) using (OutputManager stdoutManager = new OutputManager(ExecutionContext, ActionCommandManager, container, stallManager),
stderrManager = new OutputManager(ExecutionContext, ActionCommandManager, container, stallManager))
{ {
stallManager?.Initialize();
var runExitCode = await dockerManager.DockerRun(ExecutionContext, container, stdoutManager.OnDataReceived, stderrManager.OnDataReceived); var runExitCode = await dockerManager.DockerRun(ExecutionContext, container, stdoutManager.OnDataReceived, stderrManager.OnDataReceived);
ExecutionContext.Debug($"Docker Action run completed with exit code {runExitCode}"); ExecutionContext.Debug($"Docker Action run completed with exit code {runExitCode}");
if (runExitCode != 0) if (runExitCode != 0)

View File

@@ -159,12 +159,15 @@ namespace GitHub.Runner.Worker.Handlers
ExecutionContext.Global.Variables.Set("Node12ActionsWarnings", StringUtil.ConvertToJson(warningActions)); ExecutionContext.Global.Variables.Set("Node12ActionsWarnings", StringUtil.ConvertToJson(warningActions));
} }
using (var stdoutManager = new OutputManager(ExecutionContext, ActionCommandManager)) StallManager stallManager = FeatureManager.IsStallDetectEnabled(ExecutionContext.Global.Variables) ? new StallManager(ExecutionContext) : null;
using (var stderrManager = new OutputManager(ExecutionContext, ActionCommandManager))
using (OutputManager stdoutManager = new OutputManager(ExecutionContext, ActionCommandManager, null, stallManager),
stderrManager = new OutputManager(ExecutionContext, ActionCommandManager, null, stallManager))
{ {
StepHost.OutputDataReceived += stdoutManager.OnDataReceived; StepHost.OutputDataReceived += stdoutManager.OnDataReceived;
StepHost.ErrorDataReceived += stderrManager.OnDataReceived; StepHost.ErrorDataReceived += stderrManager.OnDataReceived;
stallManager?.Initialize();
// Execute the process. Exit code 0 should always be returned. // Execute the process. Exit code 0 should always be returned.
// A non-zero exit code indicates infrastructural failure. // A non-zero exit code indicates infrastructural failure.
// Task failure should be communicated over STDOUT using ## commands. // Task failure should be communicated over STDOUT using ## commands.

View File

@@ -26,12 +26,14 @@ namespace GitHub.Runner.Worker.Handlers
private IssueMatcher[] _matchers = Array.Empty<IssueMatcher>(); private IssueMatcher[] _matchers = Array.Empty<IssueMatcher>();
// Mapping that indicates whether a directory belongs to the workflow repository // Mapping that indicates whether a directory belongs to the workflow repository
private readonly Dictionary<string, string> _directoryMap = new(); private readonly Dictionary<string, string> _directoryMap = new();
private StallManager _stallManager;
public OutputManager(IExecutionContext executionContext, IActionCommandManager commandManager, ContainerInfo container = null) public OutputManager(IExecutionContext executionContext, IActionCommandManager commandManager, ContainerInfo container = null, StallManager stallManager = null)
{ {
_executionContext = executionContext; _executionContext = executionContext;
_commandManager = commandManager; _commandManager = commandManager;
_container = container ?? executionContext.Global.Container; _container = container ?? executionContext.Global.Container;
_stallManager = stallManager;
// Recursion failsafe (test override) // Recursion failsafe (test override)
var failsafeString = Environment.GetEnvironmentVariable("RUNNER_TEST_GET_REPOSITORY_PATH_FAILSAFE"); var failsafeString = Environment.GetEnvironmentVariable("RUNNER_TEST_GET_REPOSITORY_PATH_FAILSAFE");
@@ -76,6 +78,10 @@ namespace GitHub.Runner.Worker.Handlers
public void OnDataReceived(object sender, ProcessDataReceivedEventArgs e) public void OnDataReceived(object sender, ProcessDataReceivedEventArgs e)
{ {
if (_stallManager != null)
{
_stallManager.OnDataReceived(sender, e);
}
var line = e.Data; var line = e.Data;
// ## commands // ## commands

View File

@@ -43,11 +43,14 @@ namespace GitHub.Runner.Worker.Handlers
// Make sure only particular task get run as runner plugin. // Make sure only particular task get run as runner plugin.
var runnerPlugin = HostContext.GetService<IRunnerPluginManager>(); var runnerPlugin = HostContext.GetService<IRunnerPluginManager>();
using (var outputManager = new OutputManager(ExecutionContext, ActionCommandManager)) StallManager stallManager = FeatureManager.IsStallDetectEnabled(ExecutionContext.Global.Variables) ? new StallManager(ExecutionContext) : null;
using (OutputManager outputManager = new OutputManager(ExecutionContext, ActionCommandManager, null, stallManager))
{ {
ActionCommandManager.EnablePluginInternalCommand(); ActionCommandManager.EnablePluginInternalCommand();
try try
{ {
stallManager?.Initialize();
await runnerPlugin.RunPluginActionAsync(ExecutionContext, plugin, Inputs, Environment, RuntimeVariables, outputManager.OnDataReceived); await runnerPlugin.RunPluginActionAsync(ExecutionContext, plugin, Inputs, Environment, RuntimeVariables, outputManager.OnDataReceived);
} }
finally finally

View File

@@ -321,13 +321,15 @@ namespace GitHub.Runner.Worker.Handlers
ExecutionContext.Debug($"{fileName} {arguments}"); ExecutionContext.Debug($"{fileName} {arguments}");
Inputs.TryGetValue("standardInInput", out var standardInInput); Inputs.TryGetValue("standardInInput", out var standardInInput);
using (var stdoutManager = new OutputManager(ExecutionContext, ActionCommandManager)) StallManager stallManager = FeatureManager.IsStallDetectEnabled(ExecutionContext.Global.Variables) ? new StallManager(ExecutionContext) : null;
using (var stderrManager = new OutputManager(ExecutionContext, ActionCommandManager)) using (OutputManager stdoutManager = new OutputManager(ExecutionContext, ActionCommandManager, null, stallManager),
stderrManager = new OutputManager(ExecutionContext, ActionCommandManager, null, stallManager))
{ {
StepHost.OutputDataReceived += stdoutManager.OnDataReceived; StepHost.OutputDataReceived += stdoutManager.OnDataReceived;
StepHost.ErrorDataReceived += stderrManager.OnDataReceived; StepHost.ErrorDataReceived += stderrManager.OnDataReceived;
// Execute // Execute
stallManager?.Initialize();
int exitCode = await StepHost.ExecuteAsync(ExecutionContext, int exitCode = await StepHost.ExecuteAsync(ExecutionContext,
workingDirectory: StepHost.ResolvePathForStepHost(ExecutionContext, workingDirectory), workingDirectory: StepHost.ResolvePathForStepHost(ExecutionContext, workingDirectory),
fileName: fileName, fileName: fileName,

View File

@@ -0,0 +1,70 @@
using System;
using System.Timers;
using GitHub.Runner.Common;
using GitHub.Runner.Sdk;
namespace GitHub.Runner.Worker.Handlers
{
// Minimal timer abstraction so StallManager can be driven by a fake timer in tests.
// The members mirror the System.Timers.Timer members of the same names, which is
// what lets TimerAdapter satisfy the interface without any code of its own.
// Dispose() comes from IDisposable so that implementations can be used in `using`
// statements and handed to code that only knows about IDisposable.
[ServiceLocator(Default = typeof(TimerAdapter))]
public interface ITimer : IDisposable
{
    // Begins raising Elapsed.
    void Start();

    // Stops raising Elapsed.
    void Stop();

    // Time between Elapsed events, in milliseconds.
    double Interval { get; set; }

    // Raised each time the interval elapses.
    event ElapsedEventHandler Elapsed;

    // Whether Elapsed is raised repeatedly (true) or only once (false).
    bool AutoReset { get; set; }
}
// Production ITimer: a System.Timers.Timer exposed through the ITimer interface.
// Every ITimer member is already implemented by the base Timer class, so the
// adapter intentionally has no members of its own.
public class TimerAdapter : System.Timers.Timer, ITimer
{
}
// Watches a running step for output and raises a warning on the execution context
// every time a full interval passes without OnDataReceived being called.
//
// Threading: the timer's Elapsed callback runs on a different thread than the
// output handlers, and a single StallManager is shared by the stdout and stderr
// OutputManager instances. All mutable state is therefore guarded by _gate.
public sealed class StallManager : IDisposable
{
    // Interval used by the single-argument constructor.
    public static TimeSpan DefaultStallInterval = TimeSpan.FromMinutes(30);

    private readonly IExecutionContext _executionContext;

    // Stall interval in milliseconds.
    private readonly double _interval;

    private readonly ITimer _timer;

    // Guards _intervalsElapsedWhileStalled, _disposed and the timer stop/start pair.
    private readonly object _gate = new object();

    // Number of consecutive intervals that elapsed since output was last seen.
    private int _intervalsElapsedWhileStalled = 0;

    private bool _disposed = false;

    // executionContext: receives the stall warnings.
    // interval: stall interval in milliseconds.
    // timer: timer to drive the detection; this instance takes ownership and disposes it.
    // Throws ArgumentNullException when executionContext or timer is null.
    public StallManager(IExecutionContext executionContext, double interval, ITimer timer)
    {
        _executionContext = executionContext ?? throw new ArgumentNullException(nameof(executionContext));
        _timer = timer ?? throw new ArgumentNullException(nameof(timer));
        _interval = interval;
        _timer.Interval = _interval;
        _timer.Elapsed += TriggerWarning;
    }

    // Uses a real TimerAdapter with the given interval (milliseconds).
    public StallManager(IExecutionContext executionContext, double interval) : this(executionContext, interval, new TimerAdapter()) { }

    // Uses a real TimerAdapter with DefaultStallInterval.
    public StallManager(IExecutionContext executionContext) : this(executionContext, StallManager.DefaultStallInterval.TotalMilliseconds) { }

    // Starts the stall timer. Implemented as a synthetic "output received" event so
    // the first interval is measured from the moment the step is launched.
    public void Initialize()
    {
        this.OnDataReceived(null, null);
    }

    // Detaches from the timer and releases it. Safe to call more than once.
    public void Dispose()
    {
        lock (_gate)
        {
            if (_disposed)
            {
                return;
            }

            _disposed = true;

            try
            {
                // Unsubscribe first so a callback that is already queued cannot be
                // delivered to this instance through the event any more.
                _timer.Elapsed -= TriggerWarning;
                _timer.Dispose();
            }
            catch (Exception)
            {
                // Deliberate best effort: Dispose must never throw, and a timer that
                // fails to shut down cleanly must not fail the step being torn down.
            }
        }
    }

    // Records that the step produced output: clears the stalled-interval count and
    // restarts the timer so the next interval is measured from now.
    // sender and e are not inspected; both may be null.
    public void OnDataReceived(object sender, ProcessDataReceivedEventArgs e)
    {
        lock (_gate)
        {
            // Output can still trickle in while the step is being torn down;
            // restarting an already disposed timer would throw.
            if (_disposed)
            {
                return;
            }

            _intervalsElapsedWhileStalled = 0;
            _timer.Stop();
            _timer.Start();
        }
    }

    // Timer callback: one more interval went by without output, so warn with the
    // accumulated silent time expressed in minutes.
    private void TriggerWarning(object source, ElapsedEventArgs e)
    {
        int stalledIntervals;
        lock (_gate)
        {
            // An Elapsed callback may already be in flight when the timer is stopped
            // or disposed; do not report a stall for a step that has finished.
            if (_disposed)
            {
                return;
            }

            stalledIntervals = ++_intervalsElapsedWhileStalled;
        }

        // Logged outside the lock so the execution context is never called while _gate is held.
        _executionContext.Warning($"No output has been detected in the last {TimeSpan.FromMilliseconds(stalledIntervals * _interval).TotalMinutes} minutes and the process has not yet exited. This step may have stalled and might require some investigation.");
    }
}
}

View File

@@ -1014,7 +1014,8 @@ namespace GitHub.Runner.Common.Tests.Worker
return false; return false;
}); });
_outputManager = new OutputManager(_executionContext.Object, _commandManager.Object, stepContainer); StallManager stallManager = new StallManager(_executionContext.Object);
_outputManager = new OutputManager(_executionContext.Object, _commandManager.Object, stepContainer, stallManager);
return hostContext; return hostContext;
} }

View File

@@ -0,0 +1,156 @@
using System;
using System.Timers;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using GitHub.Runner.Worker;
using GitHub.Runner.Worker.Container;
using GitHub.Runner.Worker.Handlers;
using Moq;
using Xunit;
using DTWebApi = GitHub.DistributedTask.WebApi;
using GitHub.Runner.Common.Util;
using GitHub.DistributedTask.WebApi;
using System.Diagnostics;
namespace GitHub.Runner.Common.Tests.Worker
{
// Hand-rolled ITimer for tests. It never fires on its own: tests call TimeElapsed()
// to simulate one interval passing, and inspect the public flags to see how the
// code under test drove the timer.
public class MockTimer : ITimer
{
    // True after Start() and until the next Stop().
    public bool _started = false;

    // True after Stop() and until the next Start().
    public bool _stopped = false;

    // True when Start() was called on a stopped timer, i.e. the timer was restarted.
    public bool _reset = false;

    public double Interval { get; set; }
    public event ElapsedEventHandler Elapsed;
    public bool AutoReset { get; set; }

    public MockTimer()
    {
        Interval = 1;
    }

    public void Dispose() { }

    public void Start()
    {
        _started = true;
        if (_stopped)
        {
            _stopped = false;
            _reset = true;
        }
    }

    public void Stop()
    {
        _reset = false;
        _started = false;
        _stopped = true;
    }

    // Simulates one interval elapsing by raising Elapsed synchronously on the caller's thread.
    // Handlers receive null event args: the previous `new EventArgs() as ElapsedEventArgs`
    // always evaluated to null because EventArgs is not an ElapsedEventArgs, so null is
    // now passed explicitly. The null-conditional invoke makes this a no-op, rather than
    // a NullReferenceException, when nothing has subscribed.
    public void TimeElapsed()
    {
        Elapsed?.Invoke(this, null);
    }
}
// L0 tests for StallManager. A MockTimer stands in for the real timer so each test
// decides exactly when an interval "elapses", and every issue added to the mocked
// execution context is captured in _issues for inspection.
public sealed class StallManagerL0
{
    private Mock<IExecutionContext> _executionContext;
    private List<Tuple<DTWebApi.Issue, string>> _issues;
    private Variables _variables;

    // Builds the host context and a mocked execution context whose AddIssue calls
    // are recorded in _issues together with the message that would be logged.
    private TestHostContext Setup(
        [CallerMemberName] string name = "",
        ContainerInfo jobContainer = null,
        ContainerInfo stepContainer = null)
    {
        var hostContext = new TestHostContext(this, name);
        _issues = new List<Tuple<DTWebApi.Issue, string>>();

        // Variables to test for secret scrubbing & FF options
        var featureFlags = new Dictionary<string, VariableValue>
        {
            { "DistributedTask.AllowRunnerStallDetect", new VariableValue("true", true) },
        };
        _variables = new Variables(hostContext, featureFlags);

        var globalContext = new GlobalContext
        {
            Container = jobContainer,
            Variables = _variables,
            WriteDebug = true,
        };

        _executionContext = new Mock<IExecutionContext>();
        _executionContext.Setup(x => x.Global).Returns(globalContext);
        _executionContext
            .Setup(x => x.AddIssue(It.IsAny<DTWebApi.Issue>(), It.IsAny<ExecutionContextLogOptions>()))
            .Callback((DTWebApi.Issue issue, ExecutionContextLogOptions logOptions) =>
            {
                // Prefer the log override when the issue is written to the log and an override is present.
                bool useOverride = logOptions.WriteToLog && !string.IsNullOrEmpty(logOptions.LogMessageOverride);
                string resolvedMessage = useOverride ? logOptions.LogMessageOverride : issue.Message;
                _issues.Add(Tuple.Create(issue, resolvedMessage));
            });

        return hostContext;
    }

    // A single elapsed interval produces exactly one warning naming that interval.
    [Fact]
    [Trait("Level", "L0")]
    [Trait("Category", "Worker")]
    public void OutputWarningMessageOnTimeElapsed()
    {
        var fakeTimer = new MockTimer();
        double tenMinutesMs = TimeSpan.FromMinutes(10).TotalMilliseconds;

        using (Setup())
        using (var stallManager = new StallManager(_executionContext.Object, tenMinutesMs, fakeTimer))
        {
            fakeTimer.TimeElapsed();

            Assert.Equal(1, _issues.Count);
            DTWebApi.Issue warning = _issues[0].Item1;
            Assert.Equal("No output has been detected in the last 10 minutes and the process has not yet exited. This step may have stalled and might require some investigation.", warning.Message);
            Assert.Equal(DTWebApi.IssueType.Warning, warning.Type);
        }
    }

    // Consecutive silent intervals accumulate in the message; new output restarts
    // the timer and the reported duration starts over from one interval.
    [Fact]
    [Trait("Level", "L0")]
    [Trait("Category", "Worker")]
    public void ValidateTimerResetOnNewMessage()
    {
        var fakeTimer = new MockTimer();
        double tenMinutesMs = TimeSpan.FromMinutes(10).TotalMilliseconds;

        using (Setup())
        using (var stallManager = new StallManager(_executionContext.Object, tenMinutesMs, fakeTimer))
        {
            // Trigger 2 elapsed
            fakeTimer.TimeElapsed();
            fakeTimer.TimeElapsed();

            // Should have triggered 2 warnings
            Assert.Equal(2, _issues.Count);
            DTWebApi.Issue firstWarning = _issues[0].Item1;
            DTWebApi.Issue secondWarning = _issues[1].Item1;
            Assert.Equal("No output has been detected in the last 10 minutes and the process has not yet exited. This step may have stalled and might require some investigation.", firstWarning.Message);
            Assert.Equal("No output has been detected in the last 20 minutes and the process has not yet exited. This step may have stalled and might require some investigation.", secondWarning.Message);
            Assert.Equal(DTWebApi.IssueType.Warning, firstWarning.Type);
            Assert.Equal(DTWebApi.IssueType.Warning, secondWarning.Type);

            // Should reset timer
            stallManager.OnDataReceived(null, null);
            Assert.True(fakeTimer._reset);
            Assert.Equal(2, _issues.Count);

            // Trigger another elapsed interval
            fakeTimer.TimeElapsed();

            // Timer should have reset and one new warning should have been added
            Assert.Equal(3, _issues.Count);
            DTWebApi.Issue warningAfterReset = _issues[2].Item1;
            Assert.Equal("No output has been detected in the last 10 minutes and the process has not yet exited. This step may have stalled and might require some investigation.", warningAfterReset.Message);
            Assert.Equal(DTWebApi.IssueType.Warning, warningAfterReset.Type);
        }
    }
}
}