Skip to content

Commit

Permalink
Backoff to avoid excessive retries to Run Service in a duration
Browse files Browse the repository at this point in the history
  • Loading branch information
ericsciple committed Jun 24, 2024
1 parent ecb732e commit ec1e338
Show file tree
Hide file tree
Showing 6 changed files with 318 additions and 6 deletions.
5 changes: 4 additions & 1 deletion src/Runner.Common/RunServer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,10 @@ public Task<AgentJobRequestMessage> GetJobMessageAsync(string id, CancellationTo
CheckConnection();
return RetryRequest<AgentJobRequestMessage>(
async () => await _runServiceHttpClient.GetJobMessageAsync(requestUri, id, VarUtil.OS, cancellationToken), cancellationToken,
shouldRetry: ex => ex is not TaskOrchestrationJobAlreadyAcquiredException);
shouldRetry: ex =>
ex is not TaskOrchestrationJobNotFoundException && // HTTP status 404
ex is not TaskOrchestrationJobAlreadyAcquiredException && // HTTP status 409
ex is not TaskOrchestrationJobUnprocessableException); // HTTP status 422
}

public Task CompleteJobAsync(
Expand Down
44 changes: 44 additions & 0 deletions src/Runner.Listener/ErrorThrottler.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using GitHub.Runner.Common;
using GitHub.Services.Common;

namespace GitHub.Runner.Listener
{
/// <summary>
/// Tracks consecutive errors and inserts a delay between repeated attempts.
/// The default implementation registered through the service locator is <see cref="ErrorThrottler"/>.
/// </summary>
[ServiceLocator(Default = typeof(ErrorThrottler))]
public interface IErrorThrottler : IRunnerService
{
/// <summary>
/// Clears the consecutive error count. In <see cref="ErrorThrottler"/> this sets the count back to zero,
/// so the next call to <see cref="IncrementAndWaitAsync"/> is treated as a first error.
/// </summary>
void Reset();

/// <summary>
/// Records one more consecutive error and waits before returning when a delay is required.
/// In <see cref="ErrorThrottler"/> the first error after a reset returns without waiting;
/// every later error awaits a backoff delay.
/// </summary>
/// <param name="token">Cancellation token handed to the delay.</param>
Task IncrementAndWaitAsync(CancellationToken token);
}

/// <summary>
/// Counts consecutive errors and delays the caller between repeated attempts.
/// The first error after a reset is free; each later error waits for a backoff
/// obtained from <c>BackoffTimerHelper.GetExponentialBackoff</c>.
/// </summary>
public sealed class ErrorThrottler : RunnerService, IErrorThrottler
{
    // Inputs handed to BackoffTimerHelper.GetExponentialBackoff.
    internal static readonly TimeSpan MinBackoff = TimeSpan.FromSeconds(1);
    internal static readonly TimeSpan MaxBackoff = TimeSpan.FromMinutes(1);
    internal static readonly TimeSpan BackoffCoefficient = TimeSpan.FromSeconds(1);

    // Number of errors recorded since the last Reset().
    private int _count;

    /// <summary>
    /// Forgets all recorded errors so the next error is treated as the first one.
    /// </summary>
    public void Reset() => _count = 0;

    /// <summary>
    /// Records one more error and, from the second consecutive error onward,
    /// awaits a backoff delay through the host context.
    /// </summary>
    /// <param name="token">Cancellation token forwarded to <c>HostContext.Delay</c>.</param>
    public async Task IncrementAndWaitAsync(CancellationToken token)
    {
        _count++;

        // Only repeated errors are throttled; a single error never waits.
        if (_count > 1)
        {
            // The second error overall is attempt 0 of the backoff sequence.
            int zeroBasedAttempt = _count - 2;
            TimeSpan delay = BackoffTimerHelper.GetExponentialBackoff(
                attempt: zeroBasedAttempt,
                minBackoff: MinBackoff,
                maxBackoff: MaxBackoff,
                deltaBackoff: BackoffCoefficient);

            Trace.Warning($"Back off {delay.TotalSeconds} seconds before next attempt. Current consecutive error count: {_count}");
            await HostContext.Delay(delay, token);
        }
    }
}
}
28 changes: 23 additions & 5 deletions src/Runner.Listener/Runner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,25 @@ public sealed class Runner : RunnerService, IRunner
private bool _inConfigStage;
private ManualResetEvent _completedCommand = new(false);

/// <summary>
/// Helps avoid excessive calls to Run Service when encountering non-retriable errors from /acquirejob.
/// Normally we rely on the HTTP clients to back off between retry attempts. However, acquiring a job
/// involves calls to both Run Service and Broker, and Run Service and Broker communicate with each other
/// in an async fashion.
///
/// When Run Service encounters a non-retriable error, it sends an async message to Broker. The runner will,
/// however, immediately call Broker to get the next message. If the async event from Run Service to Broker
/// has not yet been processed, the next message from Broker may be the same job message.
///
/// The error throttler helps us back off when encountering successive, non-retriable errors from /acquirejob.
/// </summary>
private IErrorThrottler _acquireJobThrottler;

// Initializes the runner against the supplied host context and resolves the services it keeps as fields:
// the terminal and the throttler used for acquire-job errors.
public override void Initialize(IHostContext hostContext)
{
// Base initialization runs first; the lines below read the HostContext property.
base.Initialize(hostContext);
_term = HostContext.GetService<ITerminal>();
// Obtained through CreateService rather than GetService.
// NOTE(review): presumably this gives the runner its own throttler instance instead of a shared one — confirm against IHostContext.
_acquireJobThrottler = HostContext.CreateService<IErrorThrottler>();
}

public async Task<int> ExecuteCommand(CommandSettings command)
Expand Down Expand Up @@ -565,13 +580,16 @@ private async Task<int> RunAsync(RunnerSettings settings, bool runOnce = false)
await runServer.ConnectAsync(new Uri(messageRef.RunServiceUrl), creds);
try
{
jobRequestMessage =
await runServer.GetJobMessageAsync(messageRef.RunnerRequestId,
messageQueueLoopTokenSource.Token);
jobRequestMessage = await runServer.GetJobMessageAsync(messageRef.RunnerRequestId, messageQueueLoopTokenSource.Token);
_acquireJobThrottler.Reset();
}
catch (TaskOrchestrationJobAlreadyAcquiredException)
catch (Exception ex) when (
ex is TaskOrchestrationJobNotFoundException || // HTTP status 404
ex is TaskOrchestrationJobAlreadyAcquiredException || // HTTP status 409
ex is TaskOrchestrationJobUnprocessableException) // HTTP status 422
{
Trace.Info("Job is already acquired, skip this message.");
Trace.Info($"Skipping message Job. {ex.Message}");
await _acquireJobThrottler.IncrementAndWaitAsync(messageQueueLoopTokenSource.Token);
continue;
}
catch (Exception ex)
Expand Down
213 changes: 213 additions & 0 deletions src/Test/L0/Listener/ErrorThrottlerL0.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using GitHub.DistributedTask.WebApi;
using GitHub.Runner.Listener;
using GitHub.Runner.Listener.Configuration;
using GitHub.Runner.Common.Tests;
using System.Runtime.CompilerServices;
using GitHub.Services.WebApi;
using Moq;
using Xunit;

namespace GitHub.Runner.Common.Tests.Listener
{
public sealed class ErrorThrottlerL0
{
/// <summary>
/// Verifies that the first error never delays and that every later consecutive error
/// requests a delay inside the expected backoff window for its position in the sequence.
/// </summary>
/// <remarks>
/// Returns <see cref="Task"/> rather than <c>void</c> so the test runner can await the test
/// and observe any failure directly.
/// </remarks>
[Theory]
[InlineData(1)]
[InlineData(2)]
[InlineData(3)]
[InlineData(4)]
[InlineData(5)]
[InlineData(6)]
[InlineData(7)]
[InlineData(8)]
public async Task TestIncrementAndWait(int totalAttempts)
{
    using (TestHostContext hc = CreateTestContext())
    {
        // Arrange
        var errorThrottler = new ErrorThrottler();
        errorThrottler.Initialize(hc);
        var eventArgs = new List<DelayEventArgs>();
        hc.Delaying += (sender, args) =>
        {
            eventArgs.Add(args);
        };

        // Act
        for (int attempt = 1; attempt <= totalAttempts; attempt++)
        {
            await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
        }

        // Assert
        // The first attempt does not delay, so there is one fewer delay than attempts.
        Assert.Equal(totalAttempts - 1, eventArgs.Count);
        for (int i = 0; i < eventArgs.Count; i++)
        {
            // Expected window in milliseconds for the i-th recorded delay.
            (int expectedMin, int expectedMax) = i switch
            {
                0 => (1000, 1000),   // Min backoff
                1 => (1800, 2200),   // Min + [0.8, 1.2] * Coefficient
                2 => (3400, 4600),   // Min + [0.8, 1.2] * Coefficient * 3
                3 => (6600, 9400),   // Min + [0.8, 1.2] * Coefficient * 7
                4 => (13000, 19000), // Min + [0.8, 1.2] * Coefficient * 15
                5 => (25800, 38200), // Min + [0.8, 1.2] * Coefficient * 31
                6 => (51400, 60000), // Min + 0.8 * Coefficient * 63, capped at Max backoff
                7 => (60000, 60000), // Max backoff
                _ => throw new NotSupportedException("Unexpected eventArgs count"),
            };

            var actualMilliseconds = eventArgs[i].Delay.TotalMilliseconds;
            Assert.True(expectedMin <= actualMilliseconds, $"Unexpected min delay for eventArgs[{i}]. Expected min {expectedMin}, actual {actualMilliseconds}");
            Assert.True(expectedMax >= actualMilliseconds, $"Unexpected max delay for eventArgs[{i}]. Expected max {expectedMax}, actual {actualMilliseconds}");
        }
    }
}

/// <summary>
/// Verifies that <c>Reset</c> restarts the backoff sequence: after a reset the next error
/// does not delay and the following delays start again from the minimum backoff.
/// </summary>
/// <remarks>
/// Returns <see cref="Task"/> rather than <c>void</c> so the test runner can await the test
/// and observe any failure directly.
/// </remarks>
[Fact]
public async Task TestReset()
{
    using (TestHostContext hc = CreateTestContext())
    {
        // Arrange
        var errorThrottler = new ErrorThrottler();
        errorThrottler.Initialize(hc);
        var eventArgs = new List<DelayEventArgs>();
        hc.Delaying += (sender, args) =>
        {
            eventArgs.Add(args);
        };

        // Act
        // Three errors (two delays), a reset, then three more errors (two more delays).
        await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
        await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
        await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
        errorThrottler.Reset();
        await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
        await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
        await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);

        // Assert
        Assert.Equal(4, eventArgs.Count);
        for (int i = 0; i < eventArgs.Count; i++)
        {
            // Expected window in milliseconds; delays 2 and 3 repeat the windows of 0 and 1
            // because the reset restarted the sequence.
            (int expectedMin, int expectedMax) = i switch
            {
                0 or 2 => (1000, 1000), // Min backoff
                1 or 3 => (1800, 2200), // Min + [0.8, 1.2] * Coefficient
                _ => throw new NotSupportedException("Unexpected eventArgs count"),
            };

            var actualMilliseconds = eventArgs[i].Delay.TotalMilliseconds;
            Assert.True(expectedMin <= actualMilliseconds, $"Unexpected min delay for eventArgs[{i}]. Expected min {expectedMin}, actual {actualMilliseconds}");
            Assert.True(expectedMax >= actualMilliseconds, $"Unexpected max delay for eventArgs[{i}]. Expected max {expectedMax}, actual {actualMilliseconds}");
        }
    }
}

/// <summary>
/// Verifies that the cancellation token passed to <c>IncrementAndWaitAsync</c> is the one
/// forwarded to the host context delay for that same call.
/// </summary>
/// <remarks>
/// Returns <see cref="Task"/> rather than <c>void</c> so the test runner can await the test
/// and observe any failure directly.
/// </remarks>
[Fact]
public async Task TestReceivesCancellationToken()
{
    using (TestHostContext hc = CreateTestContext())
    {
        // Arrange
        var errorThrottler = new ErrorThrottler();
        errorThrottler.Initialize(hc);
        var eventArgs = new List<DelayEventArgs>();
        hc.Delaying += (sender, args) =>
        {
            eventArgs.Add(args);
        };

        // Disposed when the enclosing block ends, after all assertions have read the tokens.
        using var cancellationTokenSource1 = new CancellationTokenSource();
        using var cancellationTokenSource2 = new CancellationTokenSource();
        using var cancellationTokenSource3 = new CancellationTokenSource();

        // Act
        await errorThrottler.IncrementAndWaitAsync(cancellationTokenSource1.Token);
        await errorThrottler.IncrementAndWaitAsync(cancellationTokenSource2.Token);
        await errorThrottler.IncrementAndWaitAsync(cancellationTokenSource3.Token);

        // Assert
        // The first call does not delay, so only the second and third tokens are observed.
        Assert.Equal(2, eventArgs.Count);
        Assert.Equal(cancellationTokenSource2.Token, eventArgs[0].Token);
        Assert.Equal(cancellationTokenSource3.Token, eventArgs[1].Token);
    }
}

/// <summary>
/// Verifies that the delays requested by the throttler are raised by the host context
/// it was initialized with.
/// </summary>
/// <remarks>
/// Returns <see cref="Task"/> rather than <c>void</c> so the test runner can await the test
/// and observe any failure directly.
/// </remarks>
[Fact]
public async Task TestReceivesSender()
{
    using (TestHostContext hc = CreateTestContext())
    {
        // Arrange
        var errorThrottler = new ErrorThrottler();
        errorThrottler.Initialize(hc);
        var senders = new List<object>();
        hc.Delaying += (sender, args) =>
        {
            senders.Add(sender);
        };

        // Act
        await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
        await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
        await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);

        // Assert
        // The first call does not delay, so two Delaying events are expected.
        Assert.Equal(2, senders.Count);
        Assert.Equal(hc, senders[0]);
        Assert.Equal(hc, senders[1]);
    }
}

/// <summary>
/// Builds a <c>TestHostContext</c> for this test class, named after the calling test method.
/// </summary>
/// <param name="testName">Filled in by the compiler with the caller's member name when omitted.</param>
private TestHostContext CreateTestContext([CallerMemberName] string testName = "")
    => new TestHostContext(this, testName);
}
}
9 changes: 9 additions & 0 deletions src/Test/L0/Listener/RunnerL0.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ public sealed class RunnerL0
private Mock<ITerminal> _term;
private Mock<IConfigurationStore> _configStore;
private Mock<ISelfUpdater> _updater;
private Mock<IErrorThrottler> _acquireJobThrottler;

public RunnerL0()
{
Expand All @@ -35,6 +36,7 @@ public RunnerL0()
_term = new Mock<ITerminal>();
_configStore = new Mock<IConfigurationStore>();
_updater = new Mock<ISelfUpdater>();
_acquireJobThrottler = new Mock<IErrorThrottler>();
}

private Pipelines.AgentJobRequestMessage CreateJobRequestMessage(string jobName)
Expand Down Expand Up @@ -67,6 +69,7 @@ public async void TestRunAsync()
hc.SetSingleton<IPromptManager>(_promptManager.Object);
hc.SetSingleton<IRunnerServer>(_runnerServer.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);
runner.Initialize(hc);
var settings = new RunnerSettings
{
Expand Down Expand Up @@ -174,6 +177,7 @@ public async void TestExecuteCommandForRunAsService(string[] args, bool configur
hc.SetSingleton<IPromptManager>(_promptManager.Object);
hc.SetSingleton<IMessageListener>(_messageListener.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);

var command = new CommandSettings(hc, args);

Expand Down Expand Up @@ -205,6 +209,7 @@ public async void TestMachineProvisionerCLI()
hc.SetSingleton<IPromptManager>(_promptManager.Object);
hc.SetSingleton<IMessageListener>(_messageListener.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);

var command = new CommandSettings(hc, new[] { "run" });

Expand Down Expand Up @@ -242,6 +247,7 @@ public async void TestRunOnce()
hc.SetSingleton<IPromptManager>(_promptManager.Object);
hc.SetSingleton<IRunnerServer>(_runnerServer.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);
runner.Initialize(hc);
var settings = new RunnerSettings
{
Expand Down Expand Up @@ -338,6 +344,7 @@ public async void TestRunOnceOnlyTakeOneJobMessage()
hc.SetSingleton<IPromptManager>(_promptManager.Object);
hc.SetSingleton<IRunnerServer>(_runnerServer.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);
runner.Initialize(hc);
var settings = new RunnerSettings
{
Expand Down Expand Up @@ -439,6 +446,7 @@ public async void TestRunOnceHandleUpdateMessage()
hc.SetSingleton<IRunnerServer>(_runnerServer.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.SetSingleton<ISelfUpdater>(_updater.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);

runner.Initialize(hc);
var settings = new RunnerSettings
Expand Down Expand Up @@ -522,6 +530,7 @@ public async void TestRemoveLocalRunnerConfig()
hc.SetSingleton<IConfigurationManager>(_configurationManager.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.SetSingleton<IPromptManager>(_promptManager.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);

var command = new CommandSettings(hc, new[] { "remove", "--local" });

Expand Down
Loading

0 comments on commit ec1e338

Please sign in to comment.