feat(jobs): implement exponential backoff for unreachable servers (#9184)

This commit is contained in:
Andras Bacsai 2026-03-31 16:47:22 +02:00 committed by GitHub
commit 83caabac17
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 245 additions and 14 deletions

View file

@ -15,6 +15,7 @@
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\Middleware\WithoutOverlapping;
use Illuminate\Queue\SerializesModels;
use Illuminate\Queue\TimeoutExceededException;
use Illuminate\Support\Facades\Log;
class ServerCheckJob implements ShouldBeEncrypted, ShouldQueue
@ -36,11 +37,12 @@ public function __construct(public Server $server) {}
public function failed(?\Throwable $exception): void
{
if ($exception instanceof \Illuminate\Queue\TimeoutExceededException) {
if ($exception instanceof TimeoutExceededException) {
Log::warning('ServerCheckJob timed out', [
'server_id' => $this->server->id,
'server_name' => $this->server->name,
]);
$this->server->increment('unreachable_count');
// Delete the queue job so it doesn't appear in Horizon's failed list.
$this->job?->delete();

View file

@ -2,8 +2,10 @@
namespace App\Jobs;
use App\Helpers\SshMultiplexingHelper;
use App\Models\Server;
use App\Services\ConfigurationRepository;
use App\Services\HetznerService;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldBeEncrypted;
use Illuminate\Contracts\Queue\ShouldQueue;
@ -11,7 +13,9 @@
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\Middleware\WithoutOverlapping;
use Illuminate\Queue\SerializesModels;
use Illuminate\Queue\TimeoutExceededException;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Process;
class ServerConnectionCheckJob implements ShouldBeEncrypted, ShouldQueue
{
@ -19,7 +23,7 @@ class ServerConnectionCheckJob implements ShouldBeEncrypted, ShouldQueue
public $tries = 1;
public $timeout = 30;
public $timeout = 15;
public function __construct(
public Server $server,
@ -28,7 +32,7 @@ public function __construct(
public function middleware(): array
{
return [(new WithoutOverlapping('server-connection-check-'.$this->server->uuid))->expireAfter(45)->dontRelease()];
return [(new WithoutOverlapping('server-connection-check-'.$this->server->uuid))->expireAfter(25)->dontRelease()];
}
private function disableSshMux(): void
@ -72,6 +76,7 @@ public function handle()
'is_reachable' => false,
'is_usable' => false,
]);
$this->server->increment('unreachable_count');
Log::warning('ServerConnectionCheck: Server not reachable', [
'server_id' => $this->server->id,
@ -90,6 +95,10 @@ public function handle()
'is_usable' => $isUsable,
]);
if ($this->server->unreachable_count > 0) {
$this->server->update(['unreachable_count' => 0]);
}
} catch (\Throwable $e) {
Log::error('ServerConnectionCheckJob failed', [
@ -100,6 +109,7 @@ public function handle()
'is_reachable' => false,
'is_usable' => false,
]);
$this->server->increment('unreachable_count');
return;
}
@ -107,11 +117,12 @@ public function handle()
public function failed(?\Throwable $exception): void
{
if ($exception instanceof \Illuminate\Queue\TimeoutExceededException) {
if ($exception instanceof TimeoutExceededException) {
$this->server->settings->update([
'is_reachable' => false,
'is_usable' => false,
]);
$this->server->increment('unreachable_count');
// Delete the queue job so it doesn't appear in Horizon's failed list.
$this->job?->delete();
@ -123,7 +134,7 @@ private function checkHetznerStatus(): void
$status = null;
try {
$hetznerService = new \App\Services\HetznerService($this->server->cloudProviderToken->token);
$hetznerService = new HetznerService($this->server->cloudProviderToken->token);
$serverData = $hetznerService->getServer($this->server->hetzner_server_id);
$status = $serverData['status'] ?? null;
@ -144,15 +155,18 @@ private function checkHetznerStatus(): void
private function checkConnection(): bool
{
try {
// Use instant_remote_process with a simple command
// This will automatically handle mux, sudo, IPv6, Cloudflare tunnel, etc.
$output = instant_remote_process_with_timeout(
['ls -la /'],
$this->server,
false // don't throw error
);
// Single SSH attempt without SshRetryHandler — retries waste time for connectivity checks.
// Backoff is managed at the dispatch level via unreachable_count.
$commands = ['ls -la /'];
if ($this->server->isNonRoot()) {
$commands = parseCommandsByLineForSudo(collect($commands), $this->server);
}
$commandString = implode("\n", $commands);
return $output !== null;
$sshCommand = SshMultiplexingHelper::generateSshCommand($this->server, $commandString, true);
$process = Process::timeout(10)->run($sshCommand);
return $process->exitCode() === 0;
} catch (\Throwable $e) {
Log::debug('ServerConnectionCheck: Connection check failed', [
'server_id' => $this->server->id,

View file

@ -86,6 +86,9 @@ private function dispatchConnectionChecks(Collection $servers): void
if ($server->isSentinelEnabled() && $server->isSentinelLive()) {
return;
}
if ($this->shouldSkipDueToBackoff($server)) {
return;
}
ServerConnectionCheckJob::dispatch($server);
} catch (\Exception $e) {
Log::channel('scheduled-errors')->error('Failed to dispatch ServerConnectionCheck', [
@ -129,7 +132,9 @@ private function processServerTasks(Server $server): void
if ($sentinelOutOfSync) {
// Dispatch ServerCheckJob if Sentinel is out of sync
if (shouldRunCronNow($this->checkFrequency, $serverTimezone, "server-check:{$server->id}", $this->executionTime)) {
ServerCheckJob::dispatch($server);
if (! $this->shouldSkipDueToBackoff($server)) {
ServerCheckJob::dispatch($server);
}
}
}
@ -165,4 +170,39 @@ private function processServerTasks(Server $server): void
// Note: CheckAndStartSentinelJob is only dispatched daily (line above) for version updates.
// Crash recovery is handled by sentinelOutOfSync → ServerCheckJob → CheckAndStartSentinelJob.
}
/**
 * Map a server's consecutive-unreachable count to a backoff interval, expressed in scheduler cycles.
 *
 * Higher counts -> less frequent checks. The returned value N means "check once every N cycles":
 *   - 0-2   => 1  (every cycle)
 *   - 3-5   => 3  (~15 min on the 5-minute cloud cycle)
 *   - 6-11  => 6  (~30 min on the 5-minute cloud cycle)
 *   - 12+   => 12 (~60 min on the 5-minute cloud cycle)
 *
 * @param  int  $unreachableCount  Number of consecutive failed reachability checks.
 * @return int Cycle interval; 1 means no backoff is applied.
 */
private function getBackoffCycleInterval(int $unreachableCount): int
{
    // Inclusive upper bound of each tier => interval for that tier, in ascending order.
    $tiers = [
        2 => 1,
        5 => 3,
        11 => 6,
    ];

    foreach ($tiers as $ceiling => $interval) {
        if ($unreachableCount <= $ceiling) {
            return $interval;
        }
    }

    // Anything above the last tier is capped at the slowest cadence.
    return 12;
}
/**
 * Decide whether the given server's check is skipped in the current cycle because of unreachable backoff.
 *
 * The cycle index is derived from the minute of the execution time (5-minute slots on cloud,
 * 1-minute slots otherwise). A CRC32 of the server ID is added as an offset so servers sharing
 * the same interval are spread over different cycles instead of all firing together.
 *
 * @param  Server  $server  Server whose unreachable_count drives the backoff.
 * @return bool True when the check should be skipped this cycle, false when it should run.
 */
private function shouldSkipDueToBackoff(Server $server): bool
{
    $interval = $this->getBackoffCycleInterval($server->unreachable_count ?? 0);

    if ($interval > 1) {
        $minutesPerCycle = isCloud() ? 5 : 1;
        $currentCycle = intdiv($this->executionTime->minute, $minutesPerCycle);
        $serverOffset = abs(crc32((string) $server->id));

        // Only the cycle where (cycle + offset) lands on a multiple of the interval runs the check.
        return ($currentCycle + $serverOffset) % $interval !== 0;
    }

    // An interval of 1 means the server is checked every cycle.
    return false;
}
}

View file

@ -0,0 +1,175 @@
<?php
use App\Jobs\ServerCheckJob;
use App\Jobs\ServerConnectionCheckJob;
use App\Jobs\ServerManagerJob;
use App\Models\Server;
use Illuminate\Queue\TimeoutExceededException;
use Illuminate\Support\Carbon;
use Tests\TestCase;
uses(TestCase::class);
// Freeze the clock at a known instant so minute-based cycle calculations are deterministic.
beforeEach(fn () => Carbon::setTestNow('2025-01-15 12:00:00'));

// Verify/tear down Mockery expectations first, then release the frozen clock.
afterEach(function () {
    Mockery::close();
    Carbon::setTestNow();
});
describe('getBackoffCycleInterval', function () {
    it('returns correct intervals for unreachable counts', function () {
        $manager = new ServerManagerJob;
        $intervalFor = new ReflectionMethod($manager, 'getBackoffCycleInterval');

        // unreachable count => expected cycle interval, covering both edges of every tier.
        $expectedIntervals = [
            0 => 1,
            1 => 1,
            2 => 1,
            3 => 3,
            5 => 3,
            6 => 6,
            11 => 6,
            12 => 12,
            100 => 12,
        ];

        foreach ($expectedIntervals as $unreachableCount => $expected) {
            expect($intervalFor->invoke($manager, $unreachableCount))->toBe($expected);
        }
    });
});
describe('shouldSkipDueToBackoff', function () {
    it('never skips servers with unreachable_count <= 2', function () {
        $manager = new ServerManagerJob;
        $clock = new ReflectionProperty($manager, 'executionTime');
        $shouldSkip = new ReflectionMethod($manager, 'shouldSkipDueToBackoff');

        $server = Mockery::mock(Server::class)->makePartial();
        $server->id = 42;

        foreach ([0, 1, 2] as $count) {
            $server->unreachable_count = $count;

            // Walk through every minute of the hour so each possible cycle index is exercised.
            foreach (range(0, 59) as $minute) {
                Carbon::setTestNow("2025-01-15 12:{$minute}:00");
                $clock->setValue($manager, Carbon::now());

                $skipped = $shouldSkip->invoke($manager, $server);

                expect($skipped)->toBeFalse(
                    "Should not skip with unreachable_count={$count} at minute={$minute}"
                );
            }
        }
    });

    it('skips most cycles for servers with high unreachable count', function () {
        $manager = new ServerManagerJob;
        $clock = new ReflectionProperty($manager, 'executionTime');
        $shouldSkip = new ReflectionMethod($manager, 'shouldSkipDueToBackoff');

        $server = Mockery::mock(Server::class)->makePartial();
        $server->id = 42;
        $server->unreachable_count = 15; // falls in the 12+ tier, interval = 12

        $outcomes = ['skipped' => 0, 'allowed' => 0];

        foreach (range(0, 59) as $minute) {
            Carbon::setTestNow("2025-01-15 12:{$minute}:00");
            $clock->setValue($manager, Carbon::now());

            $bucket = $shouldSkip->invoke($manager, $server) ? 'skipped' : 'allowed';
            $outcomes[$bucket]++;
        }

        // The server must still be checked occasionally, but skipped far more often than not.
        expect($outcomes['allowed'])->toBeGreaterThan(0)
            ->and($outcomes['skipped'])->toBeGreaterThan($outcomes['allowed']);
    });

    it('distributes checks across servers using server ID hash', function () {
        $manager = new ServerManagerJob;
        $clock = new ReflectionProperty($manager, 'executionTime');
        $shouldSkip = new ReflectionMethod($manager, 'shouldSkipDueToBackoff');

        // Same backoff tier (interval = 3), different IDs: only the hash offset differs.
        $first = Mockery::mock(Server::class)->makePartial();
        $first->id = 1;
        $first->unreachable_count = 5;

        $second = Mockery::mock(Server::class)->makePartial();
        $second->id = 2;
        $second->unreachable_count = 5;

        $firstAllowed = [];
        $secondAllowed = [];

        foreach (range(0, 59) as $minute) {
            Carbon::setTestNow("2025-01-15 12:{$minute}:00");
            $clock->setValue($manager, Carbon::now());

            $firstSkipped = $shouldSkip->invoke($manager, $first);
            if (! $firstSkipped) {
                $firstAllowed[] = $minute;
            }

            $secondSkipped = $shouldSkip->invoke($manager, $second);
            if (! $secondSkipped) {
                $secondAllowed[] = $minute;
            }
        }

        // Each server gets checked at some point, and their schedules are not identical.
        expect($firstAllowed)->not->toBeEmpty()
            ->and($secondAllowed)->not->toBeEmpty()
            ->and($firstAllowed)->not->toBe($secondAllowed);
    });
});
// Verifies how ServerConnectionCheckJob::failed() treats the unreachable_count backoff counter.
describe('ServerConnectionCheckJob unreachable_count', function () {
// A queue timeout must flag the server settings as unreachable/unusable and bump the counter once.
it('increments unreachable_count on timeout', function () {
// Stand-in for the settings relation; expects exactly one update with both flags set to false.
$settings = Mockery::mock();
$settings->shouldReceive('update')
->with(['is_reachable' => false, 'is_usable' => false])
->once();
$server = Mockery::mock(Server::class)->makePartial()->shouldAllowMockingProtectedMethods();
// Stub the attribute lookup so reading the server's settings yields the mock above.
$server->shouldReceive('getAttribute')->with('settings')->andReturn($settings);
// The assertion of this test: increment('unreachable_count') is called exactly once.
$server->shouldReceive('increment')->with('unreachable_count')->once();
$server->id = 1;
$server->name = 'test-server';
$job = new ServerConnectionCheckJob($server);
// Invoke the failure hook directly with a timeout exception; no queue worker is involved.
$job->failed(new TimeoutExceededException);
});
// Any failure that is not a timeout must leave the counter untouched.
it('does not increment unreachable_count for non-timeout failures', function () {
$server = Mockery::mock(Server::class)->makePartial()->shouldAllowMockingProtectedMethods();
// Fails the test if increment() is called at all.
$server->shouldNotReceive('increment');
$server->id = 1;
$server->name = 'test-server';
$job = new ServerConnectionCheckJob($server);
$job->failed(new RuntimeException('Some other error'));
});
});
// Verifies how ServerCheckJob::failed() treats the unreachable_count backoff counter.
describe('ServerCheckJob unreachable_count', function () {
// A queue timeout must bump the counter exactly once.
it('increments unreachable_count on timeout', function () {
$server = Mockery::mock(Server::class)->makePartial()->shouldAllowMockingProtectedMethods();
// The assertion of this test: increment('unreachable_count') is called exactly once.
$server->shouldReceive('increment')->with('unreachable_count')->once();
// id and name are read by the job when it logs the timeout warning.
$server->id = 1;
$server->name = 'test-server';
$job = new ServerCheckJob($server);
// Invoke the failure hook directly with a timeout exception; no queue worker is involved.
$job->failed(new TimeoutExceededException);
});
// Any failure that is not a timeout must leave the counter untouched.
it('does not increment unreachable_count for non-timeout failures', function () {
$server = Mockery::mock(Server::class)->makePartial()->shouldAllowMockingProtectedMethods();
// Fails the test if increment() is called at all.
$server->shouldNotReceive('increment');
$server->id = 1;
$server->name = 'test-server';
$job = new ServerCheckJob($server);
$job->failed(new RuntimeException('Some other error'));
});
});