Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backoff to avoid excessive retries to Run Service in a duration #3354

Merged
merged 1 commit into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/Runner.Common/RunServer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,10 @@ public Task<AgentJobRequestMessage> GetJobMessageAsync(string id, CancellationTo
CheckConnection();
return RetryRequest<AgentJobRequestMessage>(
async () => await _runServiceHttpClient.GetJobMessageAsync(requestUri, id, VarUtil.OS, cancellationToken), cancellationToken,
shouldRetry: ex => ex is not TaskOrchestrationJobAlreadyAcquiredException);
shouldRetry: ex =>
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem with not retrying these errors, means we immediately call to Broker to get the next message, which might be the same job message.

When one of these errors is encountered from Run Service, an async message is sent to Broker. We don't want to keep hammering the service in a tight loop if there are delays processing the message queue.

ex is not TaskOrchestrationJobNotFoundException && // HTTP status 404
ex is not TaskOrchestrationJobAlreadyAcquiredException && // HTTP status 409
ex is not TaskOrchestrationJobUnprocessableException); // HTTP status 422
}

public Task CompleteJobAsync(
Expand Down
44 changes: 44 additions & 0 deletions src/Runner.Listener/ErrorThrottler.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using GitHub.Runner.Common;
using GitHub.Services.Common;

namespace GitHub.Runner.Listener
{
[ServiceLocator(Default = typeof(ErrorThrottler))]
public interface IErrorThrottler : IRunnerService
{
void Reset();
Task IncrementAndWaitAsync(CancellationToken token);
}

public sealed class ErrorThrottler : RunnerService, IErrorThrottler
{
internal static readonly TimeSpan MinBackoff = TimeSpan.FromSeconds(1);
internal static readonly TimeSpan MaxBackoff = TimeSpan.FromMinutes(1);
internal static readonly TimeSpan BackoffCoefficient = TimeSpan.FromSeconds(1);
private int _count = 0;

public void Reset()
{
_count = 0;
}

public async Task IncrementAndWaitAsync(CancellationToken token)
{
if (++_count <= 1)
{
return;
}

TimeSpan backoff = BackoffTimerHelper.GetExponentialBackoff(
attempt: _count - 2, // 0-based attempt
minBackoff: MinBackoff,
maxBackoff: MaxBackoff,
deltaBackoff: BackoffCoefficient);
Trace.Warning($"Back off {backoff.TotalSeconds} seconds before next attempt. Current consecutive error count: {_count}");
await HostContext.Delay(backoff, token);
}
}
}
28 changes: 23 additions & 5 deletions src/Runner.Listener/Runner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,25 @@ public sealed class Runner : RunnerService, IRunner
private bool _inConfigStage;
private ManualResetEvent _completedCommand = new(false);

// <summary>
// Helps avoid excessive calls to Run Service when encountering non-retriable errors from /acquirejob.
// Normally we rely on the HTTP clients to back off between retry attempts. However, acquiring a job
// involves calls to both Run Serivce and Broker. And Run Service and Broker communicate with each other
// in an async fashion.
//
// When Run Service encounters a non-retriable error, it sends an async message to Broker. The runner will,
// however, immediately call Broker to get the next message. If the async event from Run Service to Broker
// has not yet been processed, the next message from Broker may be the same job message.
//
// The error throttler helps us back off when encountering successive, non-retriable errors from /acquirejob.
// </summary>
private IErrorThrottler _acquireJobThrottler;

public override void Initialize(IHostContext hostContext)
{
base.Initialize(hostContext);
_term = HostContext.GetService<ITerminal>();
_acquireJobThrottler = HostContext.CreateService<IErrorThrottler>();
}

public async Task<int> ExecuteCommand(CommandSettings command)
Expand Down Expand Up @@ -565,13 +580,16 @@ private async Task<int> RunAsync(RunnerSettings settings, bool runOnce = false)
await runServer.ConnectAsync(new Uri(messageRef.RunServiceUrl), creds);
try
{
jobRequestMessage =
await runServer.GetJobMessageAsync(messageRef.RunnerRequestId,
messageQueueLoopTokenSource.Token);
jobRequestMessage = await runServer.GetJobMessageAsync(messageRef.RunnerRequestId, messageQueueLoopTokenSource.Token);
_acquireJobThrottler.Reset();
}
catch (TaskOrchestrationJobAlreadyAcquiredException)
catch (Exception ex) when (
ex is TaskOrchestrationJobNotFoundException || // HTTP status 404
ex is TaskOrchestrationJobAlreadyAcquiredException || // HTTP status 409
ex is TaskOrchestrationJobUnprocessableException) // HTTP status 422
{
Trace.Info("Job is already acquired, skip this message.");
Trace.Info($"Skipping message Job. {ex.Message}");
await _acquireJobThrottler.IncrementAndWaitAsync(messageQueueLoopTokenSource.Token);
continue;
}
catch (Exception ex)
Expand Down
213 changes: 213 additions & 0 deletions src/Test/L0/Listener/ErrorThrottlerL0.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using GitHub.DistributedTask.WebApi;
using GitHub.Runner.Listener;
using GitHub.Runner.Listener.Configuration;
using GitHub.Runner.Common.Tests;
using System.Runtime.CompilerServices;
using GitHub.Services.WebApi;
using Moq;
using Xunit;

namespace GitHub.Runner.Common.Tests.Listener
{
public sealed class ErrorThrottlerL0
{
[Theory]
[InlineData(1)]
[InlineData(2)]
[InlineData(3)]
[InlineData(4)]
[InlineData(5)]
[InlineData(6)]
[InlineData(7)]
[InlineData(8)]
public async void TestIncrementAndWait(int totalAttempts)
{
using (TestHostContext hc = CreateTestContext())
{
// Arrange
var errorThrottler = new ErrorThrottler();
errorThrottler.Initialize(hc);
var eventArgs = new List<DelayEventArgs>();
hc.Delaying += (sender, args) =>
{
eventArgs.Add(args);
};

// Act
for (int attempt = 1; attempt <= totalAttempts; attempt++)
{
await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
}

// Assert
Assert.Equal(totalAttempts - 1, eventArgs.Count);
for (int i = 0; i < eventArgs.Count; i++)
{
// Expected milliseconds
int expectedMin;
int expectedMax;

switch (i)
{
case 0:
expectedMin = 1000; // Min backoff
expectedMax = 1000;
break;
case 1:
expectedMin = 1800; // Min + 0.8 * Coefficient
expectedMax = 2200; // Min + 1.2 * Coefficient
break;
case 2:
expectedMin = 3400; // Min + 0.8 * Coefficient * 3
expectedMax = 4600; // Min + 1.2 * Coefficient * 3
break;
case 3:
expectedMin = 6600; // Min + 0.8 * Coefficient * 7
expectedMax = 9400; // Min + 1.2 * Coefficient * 7
break;
case 4:
expectedMin = 13000; // Min + 0.8 * Coefficient * 15
expectedMax = 19000; // Min + 1.2 * Coefficient * 15
break;
case 5:
expectedMin = 25800; // Min + 0.8 * Coefficient * 31
expectedMax = 38200; // Min + 1.2 * Coefficient * 31
break;
case 6:
expectedMin = 51400; // Min + 0.8 * Coefficient * 63
expectedMax = 60000; // Max backoff
break;
case 7:
expectedMin = 60000;
expectedMax = 60000;
break;
default:
throw new NotSupportedException("Unexpected eventArgs count");
}

var actualMilliseconds = eventArgs[i].Delay.TotalMilliseconds;
Assert.True(expectedMin <= actualMilliseconds, $"Unexpected min delay for eventArgs[{i}]. Expected min {expectedMin}, actual {actualMilliseconds}");
Assert.True(expectedMax >= actualMilliseconds, $"Unexpected max delay for eventArgs[{i}]. Expected max {expectedMax}, actual {actualMilliseconds}");
}
}
}

[Fact]
public async void TestReset()
{
using (TestHostContext hc = CreateTestContext())
{
// Arrange
var errorThrottler = new ErrorThrottler();
errorThrottler.Initialize(hc);
var eventArgs = new List<DelayEventArgs>();
hc.Delaying += (sender, args) =>
{
eventArgs.Add(args);
};

// Act
await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
errorThrottler.Reset();
await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);

// Assert
Assert.Equal(4, eventArgs.Count);
for (int i = 0; i < eventArgs.Count; i++)
{
// Expected milliseconds
int expectedMin;
int expectedMax;

switch (i)
{
case 0:
case 2:
expectedMin = 1000; // Min backoff
expectedMax = 1000;
break;
case 1:
case 3:
expectedMin = 1800; // Min + 0.8 * Coefficient
expectedMax = 2200; // Min + 1.2 * Coefficient
break;
default:
throw new NotSupportedException("Unexpected eventArgs count");
}

var actualMilliseconds = eventArgs[i].Delay.TotalMilliseconds;
Assert.True(expectedMin <= actualMilliseconds, $"Unexpected min delay for eventArgs[{i}]. Expected min {expectedMin}, actual {actualMilliseconds}");
Assert.True(expectedMax >= actualMilliseconds, $"Unexpected max delay for eventArgs[{i}]. Expected max {expectedMax}, actual {actualMilliseconds}");
}
}
}

[Fact]
public async void TestReceivesCancellationToken()
{
using (TestHostContext hc = CreateTestContext())
{
// Arrange
var errorThrottler = new ErrorThrottler();
errorThrottler.Initialize(hc);
var eventArgs = new List<DelayEventArgs>();
hc.Delaying += (sender, args) =>
{
eventArgs.Add(args);
};
var cancellationTokenSource1 = new CancellationTokenSource();
var cancellationTokenSource2 = new CancellationTokenSource();
var cancellationTokenSource3 = new CancellationTokenSource();

// Act
await errorThrottler.IncrementAndWaitAsync(cancellationTokenSource1.Token);
await errorThrottler.IncrementAndWaitAsync(cancellationTokenSource2.Token);
await errorThrottler.IncrementAndWaitAsync(cancellationTokenSource3.Token);

// Assert
Assert.Equal(2, eventArgs.Count);
Assert.Equal(cancellationTokenSource2.Token, eventArgs[0].Token);
Assert.Equal(cancellationTokenSource3.Token, eventArgs[1].Token);
}
}

[Fact]
public async void TestReceivesSender()
{
using (TestHostContext hc = CreateTestContext())
{
// Arrange
var errorThrottler = new ErrorThrottler();
errorThrottler.Initialize(hc);
var senders = new List<object>();
hc.Delaying += (sender, args) =>
{
senders.Add(sender);
};

// Act
await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);
await errorThrottler.IncrementAndWaitAsync(CancellationToken.None);

// Assert
Assert.Equal(2, senders.Count);
Assert.Equal(hc, senders[0]);
Assert.Equal(hc, senders[1]);
}
}

private TestHostContext CreateTestContext([CallerMemberName] String testName = "")
{
return new TestHostContext(this, testName);
}
}
}
9 changes: 9 additions & 0 deletions src/Test/L0/Listener/RunnerL0.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ public sealed class RunnerL0
private Mock<ITerminal> _term;
private Mock<IConfigurationStore> _configStore;
private Mock<ISelfUpdater> _updater;
private Mock<IErrorThrottler> _acquireJobThrottler;

public RunnerL0()
{
Expand All @@ -35,6 +36,7 @@ public RunnerL0()
_term = new Mock<ITerminal>();
_configStore = new Mock<IConfigurationStore>();
_updater = new Mock<ISelfUpdater>();
_acquireJobThrottler = new Mock<IErrorThrottler>();
}

private Pipelines.AgentJobRequestMessage CreateJobRequestMessage(string jobName)
Expand Down Expand Up @@ -67,6 +69,7 @@ public async void TestRunAsync()
hc.SetSingleton<IPromptManager>(_promptManager.Object);
hc.SetSingleton<IRunnerServer>(_runnerServer.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);
runner.Initialize(hc);
var settings = new RunnerSettings
{
Expand Down Expand Up @@ -174,6 +177,7 @@ public async void TestExecuteCommandForRunAsService(string[] args, bool configur
hc.SetSingleton<IPromptManager>(_promptManager.Object);
hc.SetSingleton<IMessageListener>(_messageListener.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);

var command = new CommandSettings(hc, args);

Expand Down Expand Up @@ -205,6 +209,7 @@ public async void TestMachineProvisionerCLI()
hc.SetSingleton<IPromptManager>(_promptManager.Object);
hc.SetSingleton<IMessageListener>(_messageListener.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);

var command = new CommandSettings(hc, new[] { "run" });

Expand Down Expand Up @@ -242,6 +247,7 @@ public async void TestRunOnce()
hc.SetSingleton<IPromptManager>(_promptManager.Object);
hc.SetSingleton<IRunnerServer>(_runnerServer.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);
runner.Initialize(hc);
var settings = new RunnerSettings
{
Expand Down Expand Up @@ -338,6 +344,7 @@ public async void TestRunOnceOnlyTakeOneJobMessage()
hc.SetSingleton<IPromptManager>(_promptManager.Object);
hc.SetSingleton<IRunnerServer>(_runnerServer.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);
runner.Initialize(hc);
var settings = new RunnerSettings
{
Expand Down Expand Up @@ -439,6 +446,7 @@ public async void TestRunOnceHandleUpdateMessage()
hc.SetSingleton<IRunnerServer>(_runnerServer.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.SetSingleton<ISelfUpdater>(_updater.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);

runner.Initialize(hc);
var settings = new RunnerSettings
Expand Down Expand Up @@ -522,6 +530,7 @@ public async void TestRemoveLocalRunnerConfig()
hc.SetSingleton<IConfigurationManager>(_configurationManager.Object);
hc.SetSingleton<IConfigurationStore>(_configStore.Object);
hc.SetSingleton<IPromptManager>(_promptManager.Object);
hc.EnqueueInstance<IErrorThrottler>(_acquireJobThrottler.Object);

var command = new CommandSettings(hc, new[] { "remove", "--local" });

Expand Down
Loading
Loading