feat(jobs): implement exponential backoff for unreachable servers
Reduce load on unreachable servers by implementing exponential backoff during connectivity failures. Check frequency decreases as the consecutive failure count grows: 0-2 failures: every cycle; 3-5 failures: ~15 min intervals; 6-11 failures: ~30 min intervals; 12+ failures: ~60 min intervals. A hash of the server ID distributes checks across cycles to prevent a thundering herd. ServerCheckJob and ServerConnectionCheckJob increment unreachable_count on failures. ServerManagerJob applies the backoff logic before dispatching checks. Includes comprehensive test coverage.
This commit is contained in:
parent
d77e4c864f
commit
dd2c9c291a
4 changed files with 245 additions and 14 deletions
|
|
@ -15,6 +15,7 @@
|
|||
use Illuminate\Queue\InteractsWithQueue;
|
||||
use Illuminate\Queue\Middleware\WithoutOverlapping;
|
||||
use Illuminate\Queue\SerializesModels;
|
||||
use Illuminate\Queue\TimeoutExceededException;
|
||||
use Illuminate\Support\Facades\Log;
|
||||
|
||||
class ServerCheckJob implements ShouldBeEncrypted, ShouldQueue
|
||||
|
|
@ -36,11 +37,12 @@ public function __construct(public Server $server) {}
|
|||
|
||||
public function failed(?\Throwable $exception): void
|
||||
{
|
||||
if ($exception instanceof \Illuminate\Queue\TimeoutExceededException) {
|
||||
if ($exception instanceof TimeoutExceededException) {
|
||||
Log::warning('ServerCheckJob timed out', [
|
||||
'server_id' => $this->server->id,
|
||||
'server_name' => $this->server->name,
|
||||
]);
|
||||
$this->server->increment('unreachable_count');
|
||||
|
||||
// Delete the queue job so it doesn't appear in Horizon's failed list.
|
||||
$this->job?->delete();
|
||||
|
|
|
|||
|
|
@ -2,8 +2,10 @@
|
|||
|
||||
namespace App\Jobs;
|
||||
|
||||
use App\Helpers\SshMultiplexingHelper;
|
||||
use App\Models\Server;
|
||||
use App\Services\ConfigurationRepository;
|
||||
use App\Services\HetznerService;
|
||||
use Illuminate\Bus\Queueable;
|
||||
use Illuminate\Contracts\Queue\ShouldBeEncrypted;
|
||||
use Illuminate\Contracts\Queue\ShouldQueue;
|
||||
|
|
@ -11,7 +13,9 @@
|
|||
use Illuminate\Queue\InteractsWithQueue;
|
||||
use Illuminate\Queue\Middleware\WithoutOverlapping;
|
||||
use Illuminate\Queue\SerializesModels;
|
||||
use Illuminate\Queue\TimeoutExceededException;
|
||||
use Illuminate\Support\Facades\Log;
|
||||
use Illuminate\Support\Facades\Process;
|
||||
|
||||
class ServerConnectionCheckJob implements ShouldBeEncrypted, ShouldQueue
|
||||
{
|
||||
|
|
@ -19,7 +23,7 @@ class ServerConnectionCheckJob implements ShouldBeEncrypted, ShouldQueue
|
|||
|
||||
public $tries = 1;
|
||||
|
||||
public $timeout = 30;
|
||||
public $timeout = 15;
|
||||
|
||||
public function __construct(
|
||||
public Server $server,
|
||||
|
|
@ -28,7 +32,7 @@ public function __construct(
|
|||
|
||||
public function middleware(): array
|
||||
{
|
||||
return [(new WithoutOverlapping('server-connection-check-'.$this->server->uuid))->expireAfter(45)->dontRelease()];
|
||||
return [(new WithoutOverlapping('server-connection-check-'.$this->server->uuid))->expireAfter(25)->dontRelease()];
|
||||
}
|
||||
|
||||
private function disableSshMux(): void
|
||||
|
|
@ -72,6 +76,7 @@ public function handle()
|
|||
'is_reachable' => false,
|
||||
'is_usable' => false,
|
||||
]);
|
||||
$this->server->increment('unreachable_count');
|
||||
|
||||
Log::warning('ServerConnectionCheck: Server not reachable', [
|
||||
'server_id' => $this->server->id,
|
||||
|
|
@ -90,6 +95,10 @@ public function handle()
|
|||
'is_usable' => $isUsable,
|
||||
]);
|
||||
|
||||
if ($this->server->unreachable_count > 0) {
|
||||
$this->server->update(['unreachable_count' => 0]);
|
||||
}
|
||||
|
||||
} catch (\Throwable $e) {
|
||||
|
||||
Log::error('ServerConnectionCheckJob failed', [
|
||||
|
|
@ -100,6 +109,7 @@ public function handle()
|
|||
'is_reachable' => false,
|
||||
'is_usable' => false,
|
||||
]);
|
||||
$this->server->increment('unreachable_count');
|
||||
|
||||
return;
|
||||
}
|
||||
|
|
@ -107,11 +117,12 @@ public function handle()
|
|||
|
||||
public function failed(?\Throwable $exception): void
|
||||
{
|
||||
if ($exception instanceof \Illuminate\Queue\TimeoutExceededException) {
|
||||
if ($exception instanceof TimeoutExceededException) {
|
||||
$this->server->settings->update([
|
||||
'is_reachable' => false,
|
||||
'is_usable' => false,
|
||||
]);
|
||||
$this->server->increment('unreachable_count');
|
||||
|
||||
// Delete the queue job so it doesn't appear in Horizon's failed list.
|
||||
$this->job?->delete();
|
||||
|
|
@ -123,7 +134,7 @@ private function checkHetznerStatus(): void
|
|||
$status = null;
|
||||
|
||||
try {
|
||||
$hetznerService = new \App\Services\HetznerService($this->server->cloudProviderToken->token);
|
||||
$hetznerService = new HetznerService($this->server->cloudProviderToken->token);
|
||||
$serverData = $hetznerService->getServer($this->server->hetzner_server_id);
|
||||
$status = $serverData['status'] ?? null;
|
||||
|
||||
|
|
@ -144,15 +155,18 @@ private function checkHetznerStatus(): void
|
|||
private function checkConnection(): bool
|
||||
{
|
||||
try {
|
||||
// Use instant_remote_process with a simple command
|
||||
// This will automatically handle mux, sudo, IPv6, Cloudflare tunnel, etc.
|
||||
$output = instant_remote_process_with_timeout(
|
||||
['ls -la /'],
|
||||
$this->server,
|
||||
false // don't throw error
|
||||
);
|
||||
// Single SSH attempt without SshRetryHandler — retries waste time for connectivity checks.
|
||||
// Backoff is managed at the dispatch level via unreachable_count.
|
||||
$commands = ['ls -la /'];
|
||||
if ($this->server->isNonRoot()) {
|
||||
$commands = parseCommandsByLineForSudo(collect($commands), $this->server);
|
||||
}
|
||||
$commandString = implode("\n", $commands);
|
||||
|
||||
return $output !== null;
|
||||
$sshCommand = SshMultiplexingHelper::generateSshCommand($this->server, $commandString, true);
|
||||
$process = Process::timeout(10)->run($sshCommand);
|
||||
|
||||
return $process->exitCode() === 0;
|
||||
} catch (\Throwable $e) {
|
||||
Log::debug('ServerConnectionCheck: Connection check failed', [
|
||||
'server_id' => $this->server->id,
|
||||
|
|
|
|||
|
|
@ -86,6 +86,9 @@ private function dispatchConnectionChecks(Collection $servers): void
|
|||
if ($server->isSentinelEnabled() && $server->isSentinelLive()) {
|
||||
return;
|
||||
}
|
||||
if ($this->shouldSkipDueToBackoff($server)) {
|
||||
return;
|
||||
}
|
||||
ServerConnectionCheckJob::dispatch($server);
|
||||
} catch (\Exception $e) {
|
||||
Log::channel('scheduled-errors')->error('Failed to dispatch ServerConnectionCheck', [
|
||||
|
|
@ -129,7 +132,9 @@ private function processServerTasks(Server $server): void
|
|||
if ($sentinelOutOfSync) {
|
||||
// Dispatch ServerCheckJob if Sentinel is out of sync
|
||||
if (shouldRunCronNow($this->checkFrequency, $serverTimezone, "server-check:{$server->id}", $this->executionTime)) {
|
||||
ServerCheckJob::dispatch($server);
|
||||
if (! $this->shouldSkipDueToBackoff($server)) {
|
||||
ServerCheckJob::dispatch($server);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -165,4 +170,39 @@ private function processServerTasks(Server $server): void
|
|||
// Note: CheckAndStartSentinelJob is only dispatched daily (line above) for version updates.
|
||||
// Crash recovery is handled by sentinelOutOfSync → ServerCheckJob → CheckAndStartSentinelJob.
|
||||
}
|
||||
|
||||
/**
 * Translate a server's consecutive-unreachable count into a check interval, measured in scheduler cycles.
 *
 * A result of 1 means "check on every cycle"; a result of N means "check on one cycle out of N".
 * Expressed against the 5-minute cloud cycle the tiers are:
 * 0-2 failures: every cycle, 3-5: ~15 min, 6-11: ~30 min, 12+: ~60 min.
 *
 * @param  int  $unreachableCount  Number of consecutive failed reachability checks.
 * @return int Number of cycles between checks (1, 3, 6 or 12).
 */
private function getBackoffCycleInterval(int $unreachableCount): int
{
    // Tiers are tested from the lowest threshold upwards, so the first match wins.
    if ($unreachableCount <= 2) {
        return 1;
    }

    if ($unreachableCount <= 5) {
        return 3;
    }

    return $unreachableCount <= 11 ? 6 : 12;
}
|
||||
|
||||
/**
 * Decide whether this cycle's check for the given server should be skipped because of unreachable backoff.
 *
 * The current cycle index is derived from the minute-of-hour of the execution time, so the
 * pattern repeats every hour. A cycle lasts 5 minutes when isCloud() is true and 1 minute otherwise.
 * A CRC32 hash of the server ID is added as an offset so that servers sharing the same
 * interval do not all land on the same cycle (avoids a thundering herd).
 *
 * @param  Server  $server  Server whose unreachable_count drives the backoff (a null count is treated as 0).
 * @return bool True when the check should be skipped this cycle, false when it should run.
 */
private function shouldSkipDueToBackoff(Server $server): bool
{
    $cyclesBetweenChecks = $this->getBackoffCycleInterval($server->unreachable_count ?? 0);

    // An interval of a single cycle means no backoff applies: always run the check.
    if ($cyclesBetweenChecks <= 1) {
        return false;
    }

    $minutesPerCycle = isCloud() ? 5 : 1;
    $currentCycle = intdiv($this->executionTime->minute, $minutesPerCycle);
    $serverOffset = abs(crc32((string) $server->id));

    // The check runs only on the one cycle in every $cyclesBetweenChecks that lines up with the offset.
    $isScheduledCycle = ($currentCycle + $serverOffset) % $cyclesBetweenChecks === 0;

    return ! $isScheduledCycle;
}
|
||||
}
|
||||
|
|
|
|||
175
tests/Unit/ServerBackoffTest.php
Normal file
175
tests/Unit/ServerBackoffTest.php
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
<?php
|
||||
|
||||
use App\Jobs\ServerCheckJob;
|
||||
use App\Jobs\ServerConnectionCheckJob;
|
||||
use App\Jobs\ServerManagerJob;
|
||||
use App\Models\Server;
|
||||
use Illuminate\Queue\TimeoutExceededException;
|
||||
use Illuminate\Support\Carbon;
|
||||
use Tests\TestCase;
|
||||
|
||||
uses(TestCase::class);
|
||||
|
||||
// Freeze time at a fixed instant before each test.
beforeEach(fn () => Carbon::setTestNow('2025-01-15 12:00:00'));

// Close Mockery (which verifies pending expectations), then release the frozen clock.
afterEach(function () {
    Mockery::close();
    Carbon::setTestNow();
});
|
||||
|
||||
describe('getBackoffCycleInterval', function () {
    it('returns correct intervals for unreachable counts', function () {
        $job = new ServerManagerJob;
        $intervalFor = new ReflectionMethod($job, 'getBackoffCycleInterval');

        // unreachable count => expected cycle interval; includes both edges of every tier.
        $expectedIntervals = [
            0 => 1,
            1 => 1,
            2 => 1,
            3 => 3,
            5 => 3,
            6 => 6,
            11 => 6,
            12 => 12,
            100 => 12,
        ];

        foreach ($expectedIntervals as $unreachableCount => $expectedInterval) {
            expect($intervalFor->invoke($job, $unreachableCount))->toBe($expectedInterval);
        }
    });
});
|
||||
|
||||
describe('shouldSkipDueToBackoff', function () {
    it('never skips servers with unreachable_count <= 2', function () {
        $job = new ServerManagerJob;
        $clock = new ReflectionProperty($job, 'executionTime');
        $shouldSkip = new ReflectionMethod($job, 'shouldSkipDueToBackoff');

        $server = Mockery::mock(Server::class)->makePartial();
        $server->id = 42;

        foreach (range(0, 2) as $count) {
            $server->unreachable_count = $count;

            // Walk every minute of the hour so that every possible cycle index is exercised.
            foreach (range(0, 59) as $minute) {
                Carbon::setTestNow("2025-01-15 12:{$minute}:00");
                $clock->setValue($job, Carbon::now());

                expect($shouldSkip->invoke($job, $server))->toBeFalse(
                    "Should not skip with unreachable_count={$count} at minute={$minute}"
                );
            }
        }
    });

    it('skips most cycles for servers with high unreachable count', function () {
        $job = new ServerManagerJob;
        $clock = new ReflectionProperty($job, 'executionTime');
        $shouldSkip = new ReflectionMethod($job, 'shouldSkipDueToBackoff');

        $server = Mockery::mock(Server::class)->makePartial();
        $server->id = 42;
        $server->unreachable_count = 15; // falls in the top tier: interval = 12

        // Record the skip decision for each minute of the hour, then tally.
        $decisions = [];
        foreach (range(0, 59) as $minute) {
            Carbon::setTestNow("2025-01-15 12:{$minute}:00");
            $clock->setValue($job, Carbon::now());

            $decisions[] = $shouldSkip->invoke($job, $server);
        }

        $skipCount = count(array_filter($decisions));
        $allowCount = count($decisions) - $skipCount;

        // With interval=12 the majority of cycles are skipped, yet the server is still checked sometimes.
        expect($allowCount)->toBeGreaterThan(0)
            ->and($skipCount)->toBeGreaterThan($allowCount);
    });

    it('distributes checks across servers using server ID hash', function () {
        $job = new ServerManagerJob;
        $clock = new ReflectionProperty($job, 'executionTime');
        $shouldSkip = new ReflectionMethod($job, 'shouldSkipDueToBackoff');

        // Same failure count (hence same interval = 3), different IDs.
        $firstServer = Mockery::mock(Server::class)->makePartial();
        $firstServer->id = 1;
        $firstServer->unreachable_count = 5;

        $secondServer = Mockery::mock(Server::class)->makePartial();
        $secondServer->id = 2;
        $secondServer->unreachable_count = 5;

        $firstAllowedMinutes = [];
        $secondAllowedMinutes = [];

        foreach (range(0, 59) as $minute) {
            Carbon::setTestNow("2025-01-15 12:{$minute}:00");
            $clock->setValue($job, Carbon::now());

            if ($shouldSkip->invoke($job, $firstServer) === false) {
                $firstAllowedMinutes[] = $minute;
            }
            if ($shouldSkip->invoke($job, $secondServer) === false) {
                $secondAllowedMinutes[] = $minute;
            }
        }

        // Each server gets checked at some point, but their schedules must not coincide exactly.
        expect($firstAllowedMinutes)->not->toBeEmpty()
            ->and($secondAllowedMinutes)->not->toBeEmpty()
            ->and($firstAllowedMinutes)->not->toBe($secondAllowedMinutes);
    });
});
|
||||
|
||||
describe('ServerConnectionCheckJob unreachable_count', function () {
    it('increments unreachable_count on timeout', function () {
        // The timeout handler must flag the server as unreachable and unusable exactly once.
        $settingsMock = Mockery::mock();
        $settingsMock->shouldReceive('update')
            ->once()
            ->with(['is_reachable' => false, 'is_usable' => false]);

        $serverMock = Mockery::mock(Server::class)->makePartial()->shouldAllowMockingProtectedMethods();
        $serverMock->shouldReceive('getAttribute')->with('settings')->andReturn($settingsMock);
        $serverMock->shouldReceive('increment')->once()->with('unreachable_count');
        $serverMock->id = 1;
        $serverMock->name = 'test-server';

        (new ServerConnectionCheckJob($serverMock))->failed(new TimeoutExceededException);
    });

    it('does not increment unreachable_count for non-timeout failures', function () {
        $serverMock = Mockery::mock(Server::class)->makePartial()->shouldAllowMockingProtectedMethods();
        $serverMock->shouldNotReceive('increment');
        $serverMock->id = 1;
        $serverMock->name = 'test-server';

        // Any exception other than a queue timeout must leave the counter untouched.
        (new ServerConnectionCheckJob($serverMock))->failed(new RuntimeException('Some other error'));
    });
});
|
||||
|
||||
describe('ServerCheckJob unreachable_count', function () {
    it('increments unreachable_count on timeout', function () {
        $serverMock = Mockery::mock(Server::class)->makePartial()->shouldAllowMockingProtectedMethods();
        $serverMock->shouldReceive('increment')->once()->with('unreachable_count');
        $serverMock->id = 1;
        $serverMock->name = 'test-server';

        // A queue timeout counts as one more consecutive unreachable result.
        (new ServerCheckJob($serverMock))->failed(new TimeoutExceededException);
    });

    it('does not increment unreachable_count for non-timeout failures', function () {
        $serverMock = Mockery::mock(Server::class)->makePartial()->shouldAllowMockingProtectedMethods();
        $serverMock->shouldNotReceive('increment');
        $serverMock->id = 1;
        $serverMock->name = 'test-server';

        // Any exception other than a queue timeout must leave the counter untouched.
        (new ServerCheckJob($serverMock))->failed(new RuntimeException('Some other error'));
    });
});
|
||||
Loading…
Reference in a new issue