diff --git a/.gitignore b/.gitignore index 411fe4011..8864c969a 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,4 @@ TestResults TestLogs .DS_Store .mono -**/*.DotSettings.user \ No newline at end of file +**/*.DotSettings.user/tmp/ diff --git a/killmode-analysis.md b/killmode-analysis.md new file mode 100644 index 000000000..2843e6668 --- /dev/null +++ b/killmode-analysis.md @@ -0,0 +1,120 @@ +# GitHub Actions Runner KillMode Analysis + +## Problem Statement +The question "is this a good idea?" regarding "killmode changing?" asks us to evaluate whether the current systemd `KillMode=process` setting should be changed to a different option. + +## Current Implementation + +### Systemd Service Configuration +- **KillMode**: `process` (only main process gets signal) +- **KillSignal**: `SIGTERM` +- **TimeoutStopSec**: `5min` + +### Signal Handling Flow +1. systemd sends SIGTERM to `runsvc.sh` (main process) +2. `runsvc.sh` has trap: `trap 'kill -INT $PID' TERM INT` +3. Converts SIGTERM → SIGINT and sends to Node.js runner process +4. Node.js process handles graceful shutdown + +## Analysis of Current Approach + +### Strengths +1. **Graceful Shutdown Control**: Manual signal conversion allows proper Node.js shutdown handling +2. **Predictable Behavior**: Only main process receives systemd signals +3. **Custom Logic**: Allows for runner-specific shutdown procedures +4. **Signal Compatibility**: SIGINT is more commonly handled by Node.js applications + +### Potential Issues +1. **Single Point of Failure**: If `runsvc.sh` fails to forward signals, child processes orphaned +2. **Complex Chain**: More components in signal propagation path +3. **Process Tree Cleanup**: May not handle deep process hierarchies as robustly + +## Orphan Process Context + +The codebase reveals significant effort to handle orphan processes: + +### Evidence from Code Analysis +1. **JobExtension.cs**: Dedicated orphan process cleanup mechanism + - Tracks processes before/after job execution + - Uses `RUNNER_TRACKING_ID` environment variable + - Terminates orphan processes at job completion + +2. **JobDispatcher.cs**: Worker process orphan prevention + - Explicit waits to prevent orphan worker processes + - Handles "zombie worker" scenarios + +3. **ProcessInvoker.cs**: Process tree termination + - Implements both Windows and Unix process tree killing + - Signal escalation: SIGINT → SIGTERM → SIGKILL + +## Alternative KillMode Options + +### KillMode=control-group +**Behavior**: All processes in service's cgroup get SIGTERM, then SIGKILL after timeout + +**Pros**: +- Robust cleanup of entire process tree +- Built-in systemd guarantees +- Simpler signal flow +- No dependency on runsvc.sh signal forwarding + +**Cons**: +- Less control over shutdown sequence +- All processes get SIGTERM simultaneously +- May interrupt graceful shutdown of worker processes + +### KillMode=mixed +**Behavior**: Main process gets SIGTERM, remaining processes get SIGKILL after timeout + +**Pros**: +- Combines benefits of both approaches +- Main process can handle graceful shutdown +- Systemd ensures process tree cleanup +- Fallback protection against orphan processes + +**Cons**: +- More complex behavior +- Still depends on main process signal handling + +## Security and Reliability Considerations + +### Current Risks +1. If `runsvc.sh` crashes before forwarding signals, Node.js process continues running +2. Deep process trees from job execution may not be properly cleaned up +3. Container processes might not receive proper termination signals + +### Reliability Improvements with control-group/mixed +1. systemd guarantees process cleanup regardless of main process behavior +2. Reduces risk of orphan processes surviving service shutdown +3. More predictable behavior for administrators + +## Recommendation + +### Recommended Change: KillMode=mixed + +**Rationale**: +1. **Maintains Graceful Shutdown**: Main process (runsvc.sh) still receives SIGTERM first +2. **Adds Safety Net**: systemd ensures cleanup if main process fails to handle signals +3. **Reduces Orphan Risk**: Addresses the orphan process concerns evident in the codebase +4. **Better Process Tree Handling**: More robust for complex job process hierarchies +5. **Container Compatibility**: Better handling of containerized workloads + +### Implementation Impact +- **Low Risk**: Change only affects service shutdown behavior +- **Backward Compatible**: No changes to startup or normal operation +- **Testable**: Can be validated with process monitoring during service stops + +### Alternative Considerations +- **KillMode=control-group** could be considered if graceful shutdown proves problematic +- Current **KillMode=process** could remain if the signal forwarding is deemed reliable enough + +## Testing Recommendations + +1. Test service shutdown with various job types running +2. Verify process cleanup with nested process trees +3. Test container job termination scenarios +4. Monitor for any regressions in graceful shutdown behavior + +## Conclusion + +Changing to `KillMode=mixed` would provide a good balance between maintaining the current graceful shutdown behavior while adding systemd's robust process cleanup guarantees. This addresses the orphan process concerns evident throughout the codebase while maintaining compatibility. \ No newline at end of file diff --git a/src/Misc/layoutbin/actions.runner.service.template b/src/Misc/layoutbin/actions.runner.service.template index 4dcec7e36..389bc70f2 100644 --- a/src/Misc/layoutbin/actions.runner.service.template +++ b/src/Misc/layoutbin/actions.runner.service.template @@ -6,7 +6,7 @@ After=network.target ExecStart={{RunnerRoot}}/runsvc.sh User={{User}} WorkingDirectory={{RunnerRoot}} -KillMode=process +KillMode=mixed KillSignal=SIGTERM TimeoutStopSec=5min diff --git a/src/Test/L0/Misc/SystemdServiceTemplateL0.cs b/src/Test/L0/Misc/SystemdServiceTemplateL0.cs new file mode 100644 index 000000000..78055256a --- /dev/null +++ b/src/Test/L0/Misc/SystemdServiceTemplateL0.cs @@ -0,0 +1,48 @@ +using System; +using System.IO; +using Xunit; + +namespace GitHub.Runner.Common.Tests.Misc +{ + public sealed class SystemdServiceTemplateL0 + { + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void ServiceTemplate_ContainsExpectedKillMode() + { + // Arrange + var templatePath = Path.Combine(TestUtil.GetSrcPath(), "Misc", "layoutbin", "actions.runner.service.template"); + + // Act + var templateContent = File.ReadAllText(templatePath); + + // Assert + Assert.Contains("KillMode=mixed", templateContent); + Assert.Contains("KillSignal=SIGTERM", templateContent); + Assert.Contains("TimeoutStopSec=5min", templateContent); + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void ServiceTemplate_HasValidStructure() + { + // Arrange + var templatePath = Path.Combine(TestUtil.GetSrcPath(), "Misc", "layoutbin", "actions.runner.service.template"); + + // Act + var templateContent = File.ReadAllText(templatePath); + var lines = templateContent.Split('\n', StringSplitOptions.RemoveEmptyEntries); + + // Assert + Assert.Contains("[Unit]", lines); + Assert.Contains("[Service]", lines); + Assert.Contains("[Install]", lines); + Assert.Contains("Description={{Description}}", lines); + Assert.Contains("ExecStart={{RunnerRoot}}/runsvc.sh", lines); + Assert.Contains("User={{User}}", lines); + Assert.Contains("WorkingDirectory={{RunnerRoot}}", lines); + } + } +} \ No newline at end of file