From 4b416898db3cd4c69ef6e3f7fa66089c70f5bde0 Mon Sep 17 00:00:00 2001 From: Ryan van Zeben Date: Mon, 24 Jul 2023 10:18:51 +0000 Subject: [PATCH] Add in stall manager --- src/Runner.Common/Constants.cs | 1 + src/Runner.Worker/FeatureManager.cs | 5 + .../Handlers/ContainerActionHandler.cs | 8 +- .../Handlers/NodeScriptActionHandler.cs | 7 +- src/Runner.Worker/Handlers/OutputManager.cs | 8 +- .../Handlers/RunnerPluginHandler.cs | 5 +- src/Runner.Worker/Handlers/ScriptHandler.cs | 6 +- src/Runner.Worker/Handlers/StallManager.cs | 70 ++++++++ src/Test/L0/Worker/OutputManagerL0.cs | 3 +- src/Test/L0/Worker/StallManagerL0.cs | 156 ++++++++++++++++++ 10 files changed, 259 insertions(+), 10 deletions(-) create mode 100644 src/Runner.Worker/Handlers/StallManager.cs create mode 100644 src/Test/L0/Worker/StallManagerL0.cs diff --git a/src/Runner.Common/Constants.cs b/src/Runner.Common/Constants.cs index 31b3daeef..17eac67b6 100644 --- a/src/Runner.Common/Constants.cs +++ b/src/Runner.Common/Constants.cs @@ -158,6 +158,7 @@ namespace GitHub.Runner.Common public static readonly string LogTemplateErrorsAsDebugMessages = "DistributedTask.LogTemplateErrorsAsDebugMessages"; public static readonly string UseContainerPathForTemplate = "DistributedTask.UseContainerPathForTemplate"; public static readonly string AllowRunnerContainerHooks = "DistributedTask.AllowRunnerContainerHooks"; + public static readonly string AllowRunnerStallDetect = "DistributedTask.AllowRunnerStallDetect"; } public static readonly string InternalTelemetryIssueDataKey = "_internal_telemetry"; diff --git a/src/Runner.Worker/FeatureManager.cs b/src/Runner.Worker/FeatureManager.cs index 98f49e8fd..c585f3f5a 100644 --- a/src/Runner.Worker/FeatureManager.cs +++ b/src/Runner.Worker/FeatureManager.cs @@ -11,5 +11,10 @@ namespace GitHub.Runner.Worker var isContainerHooksPathSet = !string.IsNullOrEmpty(Environment.GetEnvironmentVariable(Constants.Hooks.ContainerHooksPath)); return isContainerHookFeatureFlagSet && isContainerHooksPathSet; } + public static bool IsStallDetectEnabled(Variables variables) + { + var isStallDetectFeatureFlagSet = variables?.GetBoolean(Constants.Runner.Features.AllowRunnerStallDetect) ?? false; + return isStallDetectFeatureFlagSet; + } } } diff --git a/src/Runner.Worker/Handlers/ContainerActionHandler.cs b/src/Runner.Worker/Handlers/ContainerActionHandler.cs index 9d9d20f17..0297039b8 100644 --- a/src/Runner.Worker/Handlers/ContainerActionHandler.cs +++ b/src/Runner.Worker/Handlers/ContainerActionHandler.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Generic; using System.IO; using System.Linq; @@ -240,9 +240,11 @@ namespace GitHub.Runner.Worker.Handlers } else { - using (var stdoutManager = new OutputManager(ExecutionContext, ActionCommandManager, container)) - using (var stderrManager = new OutputManager(ExecutionContext, ActionCommandManager, container)) + StallManager stallManager = FeatureManager.IsStallDetectEnabled(ExecutionContext.Global.Variables) ? new StallManager(ExecutionContext) : null; + using (OutputManager stdoutManager = new OutputManager(ExecutionContext, ActionCommandManager, container, stallManager), + stderrManager = new OutputManager(ExecutionContext, ActionCommandManager, container, stallManager)) { + stallManager?.Initialize(); var runExitCode = await dockerManager.DockerRun(ExecutionContext, container, stdoutManager.OnDataReceived, stderrManager.OnDataReceived); ExecutionContext.Debug($"Docker Action run completed with exit code {runExitCode}"); if (runExitCode != 0) diff --git a/src/Runner.Worker/Handlers/NodeScriptActionHandler.cs b/src/Runner.Worker/Handlers/NodeScriptActionHandler.cs index b5de017d6..a7253fe36 100644 --- a/src/Runner.Worker/Handlers/NodeScriptActionHandler.cs +++ b/src/Runner.Worker/Handlers/NodeScriptActionHandler.cs @@ -159,12 +159,15 @@ namespace GitHub.Runner.Worker.Handlers ExecutionContext.Global.Variables.Set("Node12ActionsWarnings", StringUtil.ConvertToJson(warningActions)); } - using (var stdoutManager = new OutputManager(ExecutionContext, ActionCommandManager)) - using (var stderrManager = new OutputManager(ExecutionContext, ActionCommandManager)) + StallManager stallManager = FeatureManager.IsStallDetectEnabled(ExecutionContext.Global.Variables) ? new StallManager(ExecutionContext) : null; + + using (OutputManager stdoutManager = new OutputManager(ExecutionContext, ActionCommandManager, null, stallManager), + stderrManager = new OutputManager(ExecutionContext, ActionCommandManager, null, stallManager)) { StepHost.OutputDataReceived += stdoutManager.OnDataReceived; StepHost.ErrorDataReceived += stderrManager.OnDataReceived; + stallManager?.Initialize(); // Execute the process. Exit code 0 should always be returned. // A non-zero exit code indicates infrastructural failure. // Task failure should be communicated over STDOUT using ## commands. diff --git a/src/Runner.Worker/Handlers/OutputManager.cs b/src/Runner.Worker/Handlers/OutputManager.cs index f424f33f7..b778aafb4 100644 --- a/src/Runner.Worker/Handlers/OutputManager.cs +++ b/src/Runner.Worker/Handlers/OutputManager.cs @@ -26,12 +26,14 @@ namespace GitHub.Runner.Worker.Handlers private IssueMatcher[] _matchers = Array.Empty(); // Mapping that indicates whether a directory belongs to the workflow repository private readonly Dictionary _directoryMap = new(); + private StallManager _stallManager; - public OutputManager(IExecutionContext executionContext, IActionCommandManager commandManager, ContainerInfo container = null) + public OutputManager(IExecutionContext executionContext, IActionCommandManager commandManager, ContainerInfo container = null, StallManager stallManager = null) { _executionContext = executionContext; _commandManager = commandManager; _container = container ?? executionContext.Global.Container; + _stallManager = stallManager; // Recursion failsafe (test override) var failsafeString = Environment.GetEnvironmentVariable("RUNNER_TEST_GET_REPOSITORY_PATH_FAILSAFE"); @@ -76,6 +78,10 @@ namespace GitHub.Runner.Worker.Handlers public void OnDataReceived(object sender, ProcessDataReceivedEventArgs e) { + if (_stallManager != null) + { + _stallManager.OnDataReceived(sender, e); + } var line = e.Data; // ## commands diff --git a/src/Runner.Worker/Handlers/RunnerPluginHandler.cs b/src/Runner.Worker/Handlers/RunnerPluginHandler.cs index 61949df1e..5020e618e 100644 --- a/src/Runner.Worker/Handlers/RunnerPluginHandler.cs +++ b/src/Runner.Worker/Handlers/RunnerPluginHandler.cs @@ -43,11 +43,14 @@ namespace GitHub.Runner.Worker.Handlers // Make sure only particular task get run as runner plugin. var runnerPlugin = HostContext.GetService(); - using (var outputManager = new OutputManager(ExecutionContext, ActionCommandManager)) + StallManager stallManager = FeatureManager.IsStallDetectEnabled(ExecutionContext.Global.Variables) ? new StallManager(ExecutionContext) : null; + + using (OutputManager outputManager = new OutputManager(ExecutionContext, ActionCommandManager, null, stallManager)) { ActionCommandManager.EnablePluginInternalCommand(); try { + stallManager?.Initialize(); await runnerPlugin.RunPluginActionAsync(ExecutionContext, plugin, Inputs, Environment, RuntimeVariables, outputManager.OnDataReceived); } finally diff --git a/src/Runner.Worker/Handlers/ScriptHandler.cs b/src/Runner.Worker/Handlers/ScriptHandler.cs index 57f61e9c4..09412738f 100644 --- a/src/Runner.Worker/Handlers/ScriptHandler.cs +++ b/src/Runner.Worker/Handlers/ScriptHandler.cs @@ -321,13 +321,15 @@ namespace GitHub.Runner.Worker.Handlers ExecutionContext.Debug($"{fileName} {arguments}"); Inputs.TryGetValue("standardInInput", out var standardInInput); - using (var stdoutManager = new OutputManager(ExecutionContext, ActionCommandManager)) - using (var stderrManager = new OutputManager(ExecutionContext, ActionCommandManager)) + StallManager stallManager = FeatureManager.IsStallDetectEnabled(ExecutionContext.Global.Variables) ? new StallManager(ExecutionContext) : null; + using (OutputManager stdoutManager = new OutputManager(ExecutionContext, ActionCommandManager, null, stallManager), + stderrManager = new OutputManager(ExecutionContext, ActionCommandManager, null, stallManager)) { StepHost.OutputDataReceived += stdoutManager.OnDataReceived; StepHost.ErrorDataReceived += stderrManager.OnDataReceived; // Execute + stallManager?.Initialize(); int exitCode = await StepHost.ExecuteAsync(ExecutionContext, workingDirectory: StepHost.ResolvePathForStepHost(ExecutionContext, workingDirectory), fileName: fileName, diff --git a/src/Runner.Worker/Handlers/StallManager.cs b/src/Runner.Worker/Handlers/StallManager.cs new file mode 100644 index 000000000..b5ecc781a --- /dev/null +++ b/src/Runner.Worker/Handlers/StallManager.cs @@ -0,0 +1,70 @@ +using System; +using System.Timers; +using GitHub.Runner.Common; +using GitHub.Runner.Sdk; + +namespace GitHub.Runner.Worker.Handlers +{ + [ServiceLocator(Default = typeof(TimerAdapter))] + public interface ITimer + { + void Start(); + void Stop(); + double Interval { get; set; } + event ElapsedEventHandler Elapsed; + bool AutoReset { get; set; } + void Dispose(); + } + + public class TimerAdapter : Timer, ITimer { } + + public sealed class StallManager : IDisposable + { + public static TimeSpan DefaultStallInterval = TimeSpan.FromMinutes(30); + + private readonly IExecutionContext _executionContext; + private readonly double _interval; + + private ITimer _timer { get; set; } + private int _intervalsElapsedWhileStalled = 0; + + public StallManager(IExecutionContext executionContext, double interval, ITimer timer) + { + _executionContext = executionContext; + _interval = interval; + _timer = timer; + + _timer.Interval = _interval; + _timer.Elapsed += TriggerWarning; + } + public StallManager(IExecutionContext executionContext, double interval) : this(executionContext, interval, new TimerAdapter()) { } + public StallManager(IExecutionContext executionContext) : this(executionContext, StallManager.DefaultStallInterval.TotalMilliseconds) { } + + public void Initialize() + { + this.OnDataReceived(null, null); + } + + public void Dispose() + { + try + { + _timer.Dispose(); + } + catch { } + } + + public void OnDataReceived(object sender, ProcessDataReceivedEventArgs e) + { + _intervalsElapsedWhileStalled = 0; + _timer.Stop(); + _timer.Start(); + } + + private void TriggerWarning(object source, ElapsedEventArgs e) + { + _intervalsElapsedWhileStalled++; + _executionContext.Warning($"No output has been detected in the last {TimeSpan.FromMilliseconds(_intervalsElapsedWhileStalled * _interval).TotalMinutes} minutes and the process has not yet exited. This step may have stalled and might require some investigation."); + } + } +} diff --git a/src/Test/L0/Worker/OutputManagerL0.cs b/src/Test/L0/Worker/OutputManagerL0.cs index 9d7f5d3f2..81217b15f 100644 --- a/src/Test/L0/Worker/OutputManagerL0.cs +++ b/src/Test/L0/Worker/OutputManagerL0.cs @@ -1014,7 +1014,8 @@ namespace GitHub.Runner.Common.Tests.Worker return false; }); - _outputManager = new OutputManager(_executionContext.Object, _commandManager.Object, stepContainer); + StallManager stallManager = new StallManager(_executionContext.Object); + _outputManager = new OutputManager(_executionContext.Object, _commandManager.Object, stepContainer, stallManager); return hostContext; } diff --git a/src/Test/L0/Worker/StallManagerL0.cs b/src/Test/L0/Worker/StallManagerL0.cs new file mode 100644 index 000000000..00c965aad --- /dev/null +++ b/src/Test/L0/Worker/StallManagerL0.cs @@ -0,0 +1,156 @@ +using System; +using System.Timers; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using GitHub.Runner.Worker; +using GitHub.Runner.Worker.Container; +using GitHub.Runner.Worker.Handlers; +using Moq; +using Xunit; +using DTWebApi = GitHub.DistributedTask.WebApi; +using GitHub.Runner.Common.Util; +using GitHub.DistributedTask.WebApi; +using System.Diagnostics; + +namespace GitHub.Runner.Common.Tests.Worker +{ + + public class MockTimer : ITimer + { + public bool _started = false; + public bool _stopped = false; + public bool _reset = false; + public double Interval { get; set; } + public event ElapsedEventHandler Elapsed; + public bool AutoReset { get; set; } + + public MockTimer() + { + Interval = 1; + } + + public void Dispose() { } + + public void Start() + { + _started = true; + if (_stopped) + { + _stopped = false; + _reset = true; + } + } + public void Stop() + { + _reset = false; + _started = false; + _stopped = true; + } + + public void TimeElapsed() + { + this.Elapsed.Invoke(this, new EventArgs() as ElapsedEventArgs); + } + } + + public sealed class StallManagerL0 + { + private Mock _executionContext; + private List> _issues; + private Variables _variables; + + private TestHostContext Setup( + [CallerMemberName] string name = "", + ContainerInfo jobContainer = null, + ContainerInfo stepContainer = null) + { + var hostContext = new TestHostContext(this, name); + _executionContext = new Mock(); + _issues = new List>(); + + // Variables to test for secret scrubbing & FF options + _variables = new Variables(hostContext, new Dictionary + { + { "DistributedTask.AllowRunnerStallDetect", new VariableValue("true", true) }, + }); + + _executionContext.Setup(x => x.Global) + .Returns(new GlobalContext + { + Container = jobContainer, + Variables = _variables, + WriteDebug = true, + }); + + _executionContext.Setup(x => x.AddIssue(It.IsAny(), It.IsAny())) + .Callback((DTWebApi.Issue issue, ExecutionContextLogOptions logOptions) => + { + var resolvedMessage = issue.Message; + if (logOptions.WriteToLog && !string.IsNullOrEmpty(logOptions.LogMessageOverride)) + { + resolvedMessage = logOptions.LogMessageOverride; + } + _issues.Add(new(issue, resolvedMessage)); + }); + + return hostContext; + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Worker")] + public void OutputWarningMessageOnTimeElapsed() + { + MockTimer timer = new MockTimer(); + using (Setup()) + using (StallManager manager = new StallManager(_executionContext.Object, TimeSpan.FromMinutes(10).TotalMilliseconds, timer)) + { + + timer.TimeElapsed(); + + Assert.Equal(1, _issues.Count); + Assert.Equal("No output has been detected in the last 10 minutes and the process has not yet exited. This step may have stalled and might require some investigation.", _issues[0].Item1.Message); + Assert.Equal(DTWebApi.IssueType.Warning, _issues[0].Item1.Type); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Worker")] + public void ValidateTimerResetOnNewMessage() + { + + MockTimer timer = new MockTimer(); + using (Setup()) + using (StallManager manager = new StallManager(_executionContext.Object, TimeSpan.FromMinutes(10).TotalMilliseconds, timer)) + { + + // Trigger 2 elapsed + timer.TimeElapsed(); + timer.TimeElapsed(); + + // Should have triggered 2 warnings + Assert.Equal(2, _issues.Count); + Assert.Equal("No output has been detected in the last 10 minutes and the process has not yet exited. This step may have stalled and might require some investigation.", _issues[0].Item1.Message); + Assert.Equal("No output has been detected in the last 20 minutes and the process has not yet exited. This step may have stalled and might require some investigation.", _issues[1].Item1.Message); + Assert.Equal(DTWebApi.IssueType.Warning, _issues[0].Item1.Type); + Assert.Equal(DTWebApi.IssueType.Warning, _issues[1].Item1.Type); + + // Should reset timer + manager.OnDataReceived(null, null); + + Assert.True(timer._reset); + Assert.Equal(2, _issues.Count); + + // Trigger another elapsed interval + timer.TimeElapsed(); + + // Timer should have reset and one new warning should have been added + Assert.Equal(3, _issues.Count); + Assert.Equal("No output has been detected in the last 10 minutes and the process has not yet exited. This step may have stalled and might require some investigation.", _issues[2].Item1.Message); + Assert.Equal(DTWebApi.IssueType.Warning, _issues[2].Item1.Type); + + } + } + } +}