diff --git a/app/Jobs/ServerCheckJob.php b/app/Jobs/ServerCheckJob.php index a18d45b9a..10faa7e9b 100644 --- a/app/Jobs/ServerCheckJob.php +++ b/app/Jobs/ServerCheckJob.php @@ -15,6 +15,7 @@ use Illuminate\Queue\InteractsWithQueue; use Illuminate\Queue\Middleware\WithoutOverlapping; use Illuminate\Queue\SerializesModels; +use Illuminate\Queue\TimeoutExceededException; use Illuminate\Support\Facades\Log; class ServerCheckJob implements ShouldBeEncrypted, ShouldQueue @@ -36,11 +37,12 @@ public function __construct(public Server $server) {} public function failed(?\Throwable $exception): void { - if ($exception instanceof \Illuminate\Queue\TimeoutExceededException) { + if ($exception instanceof TimeoutExceededException) { Log::warning('ServerCheckJob timed out', [ 'server_id' => $this->server->id, 'server_name' => $this->server->name, ]); + $this->server->increment('unreachable_count'); // Delete the queue job so it doesn't appear in Horizon's failed list. $this->job?->delete(); diff --git a/app/Jobs/ServerConnectionCheckJob.php b/app/Jobs/ServerConnectionCheckJob.php index 2c73ae43e..7ce316dcd 100644 --- a/app/Jobs/ServerConnectionCheckJob.php +++ b/app/Jobs/ServerConnectionCheckJob.php @@ -2,8 +2,10 @@ namespace App\Jobs; +use App\Helpers\SshMultiplexingHelper; use App\Models\Server; use App\Services\ConfigurationRepository; +use App\Services\HetznerService; use Illuminate\Bus\Queueable; use Illuminate\Contracts\Queue\ShouldBeEncrypted; use Illuminate\Contracts\Queue\ShouldQueue; @@ -11,7 +13,9 @@ use Illuminate\Queue\InteractsWithQueue; use Illuminate\Queue\Middleware\WithoutOverlapping; use Illuminate\Queue\SerializesModels; +use Illuminate\Queue\TimeoutExceededException; use Illuminate\Support\Facades\Log; +use Illuminate\Support\Facades\Process; class ServerConnectionCheckJob implements ShouldBeEncrypted, ShouldQueue { @@ -19,7 +23,7 @@ class ServerConnectionCheckJob implements ShouldBeEncrypted, ShouldQueue public $tries = 1; - public $timeout = 30; + public $timeout = 15; public function __construct( public Server $server, @@ -28,7 +32,7 @@ public function __construct( public function middleware(): array { - return [(new WithoutOverlapping('server-connection-check-'.$this->server->uuid))->expireAfter(45)->dontRelease()]; + return [(new WithoutOverlapping('server-connection-check-'.$this->server->uuid))->expireAfter(25)->dontRelease()]; } private function disableSshMux(): void @@ -72,6 +76,7 @@ public function handle() 'is_reachable' => false, 'is_usable' => false, ]); + $this->server->increment('unreachable_count'); Log::warning('ServerConnectionCheck: Server not reachable', [ 'server_id' => $this->server->id, @@ -90,6 +95,10 @@ public function handle() 'is_usable' => $isUsable, ]); + if ($this->server->unreachable_count > 0) { + $this->server->update(['unreachable_count' => 0]); + } + } catch (\Throwable $e) { Log::error('ServerConnectionCheckJob failed', [ @@ -100,6 +109,7 @@ public function handle() 'is_reachable' => false, 'is_usable' => false, ]); + $this->server->increment('unreachable_count'); return; } @@ -107,11 +117,12 @@ public function handle() public function failed(?\Throwable $exception): void { - if ($exception instanceof \Illuminate\Queue\TimeoutExceededException) { + if ($exception instanceof TimeoutExceededException) { $this->server->settings->update([ 'is_reachable' => false, 'is_usable' => false, ]); + $this->server->increment('unreachable_count'); // Delete the queue job so it doesn't appear in Horizon's failed list. $this->job?->delete(); @@ -123,7 +134,7 @@ private function checkHetznerStatus(): void $status = null; try { - $hetznerService = new \App\Services\HetznerService($this->server->cloudProviderToken->token); + $hetznerService = new HetznerService($this->server->cloudProviderToken->token); $serverData = $hetznerService->getServer($this->server->hetzner_server_id); $status = $serverData['status'] ?? null; @@ -144,15 +155,18 @@ private function checkHetznerStatus(): void private function checkConnection(): bool { try { - // Use instant_remote_process with a simple command - // This will automatically handle mux, sudo, IPv6, Cloudflare tunnel, etc. - $output = instant_remote_process_with_timeout( - ['ls -la /'], - $this->server, - false // don't throw error - ); + // Single SSH attempt without SshRetryHandler — retries waste time for connectivity checks. + // Backoff is managed at the dispatch level via unreachable_count. + $commands = ['ls -la /']; + if ($this->server->isNonRoot()) { + $commands = parseCommandsByLineForSudo(collect($commands), $this->server); + } + $commandString = implode("\n", $commands); - return $output !== null; + $sshCommand = SshMultiplexingHelper::generateSshCommand($this->server, $commandString, true); + $process = Process::timeout(10)->run($sshCommand); + + return $process->exitCode() === 0; } catch (\Throwable $e) { Log::debug('ServerConnectionCheck: Connection check failed', [ 'server_id' => $this->server->id, diff --git a/app/Jobs/ServerManagerJob.php b/app/Jobs/ServerManagerJob.php index 3f748f0ca..9532282cc 100644 --- a/app/Jobs/ServerManagerJob.php +++ b/app/Jobs/ServerManagerJob.php @@ -86,6 +86,9 @@ private function dispatchConnectionChecks(Collection $servers): void if ($server->isSentinelEnabled() && $server->isSentinelLive()) { return; } + if ($this->shouldSkipDueToBackoff($server)) { + return; + } ServerConnectionCheckJob::dispatch($server); } catch (\Exception $e) { Log::channel('scheduled-errors')->error('Failed to dispatch ServerConnectionCheck', [ @@ -129,7 +132,9 @@ private function processServerTasks(Server $server): void if ($sentinelOutOfSync) { // Dispatch ServerCheckJob if Sentinel is out of sync if (shouldRunCronNow($this->checkFrequency, $serverTimezone, "server-check:{$server->id}", $this->executionTime)) { - ServerCheckJob::dispatch($server); + if (! $this->shouldSkipDueToBackoff($server)) { + ServerCheckJob::dispatch($server); + } } } @@ -165,4 +170,39 @@ private function processServerTasks(Server $server): void // Note: CheckAndStartSentinelJob is only dispatched daily (line above) for version updates. // Crash recovery is handled by sentinelOutOfSync → ServerCheckJob → CheckAndStartSentinelJob. } + + /** + * Determine the backoff cycle interval based on how many consecutive times a server has been unreachable. + * Higher counts → less frequent checks (based on 5-min cloud cycle): + * 0-2: every cycle, 3-5: ~15 min, 6-11: ~30 min, 12+: ~60 min + */ + private function getBackoffCycleInterval(int $unreachableCount): int + { + return match (true) { + $unreachableCount <= 2 => 1, + $unreachableCount <= 5 => 3, + $unreachableCount <= 11 => 6, + default => 12, + }; + } + + /** + * Check if a server should be skipped this cycle due to unreachable backoff. + * Uses server ID hash to distribute checks across cycles (avoid thundering herd). + */ + private function shouldSkipDueToBackoff(Server $server): bool + { + $unreachableCount = $server->unreachable_count ?? 0; + $interval = $this->getBackoffCycleInterval($unreachableCount); + + if ($interval <= 1) { + return false; + } + + $cyclePeriodMinutes = isCloud() ? 5 : 1; + $cycleIndex = intdiv($this->executionTime->minute, $cyclePeriodMinutes); + $serverHash = abs(crc32((string) $server->id)); + + return ($cycleIndex + $serverHash) % $interval !== 0; + } } diff --git a/tests/Unit/ServerBackoffTest.php b/tests/Unit/ServerBackoffTest.php new file mode 100644 index 000000000..bdcefb74f --- /dev/null +++ b/tests/Unit/ServerBackoffTest.php @@ -0,0 +1,175 @@ +invoke($job, 0))->toBe(1) + ->and($method->invoke($job, 1))->toBe(1) + ->and($method->invoke($job, 2))->toBe(1) + ->and($method->invoke($job, 3))->toBe(3) + ->and($method->invoke($job, 5))->toBe(3) + ->and($method->invoke($job, 6))->toBe(6) + ->and($method->invoke($job, 11))->toBe(6) + ->and($method->invoke($job, 12))->toBe(12) + ->and($method->invoke($job, 100))->toBe(12); + }); +}); + +describe('shouldSkipDueToBackoff', function () { + it('never skips servers with unreachable_count <= 2', function () { + $job = new ServerManagerJob; + $executionTimeProp = new ReflectionProperty($job, 'executionTime'); + $method = new ReflectionMethod($job, 'shouldSkipDueToBackoff'); + + $server = Mockery::mock(Server::class)->makePartial(); + $server->id = 42; + + foreach ([0, 1, 2] as $count) { + $server->unreachable_count = $count; + + // Test across all minutes in an hour + for ($minute = 0; $minute < 60; $minute++) { + Carbon::setTestNow("2025-01-15 12:{$minute}:00"); + $executionTimeProp->setValue($job, Carbon::now()); + + expect($method->invoke($job, $server))->toBeFalse( + "Should not skip with unreachable_count={$count} at minute={$minute}" + ); + } + } + }); + + it('skips most cycles for servers with high unreachable count', function () { + $job = new ServerManagerJob; + $executionTimeProp = new ReflectionProperty($job, 'executionTime'); + $method = new ReflectionMethod($job, 'shouldSkipDueToBackoff'); + + $server = Mockery::mock(Server::class)->makePartial(); + $server->id = 42; + $server->unreachable_count = 15; // interval = 12 + + $skipCount = 0; + $allowCount = 0; + + for ($minute = 0; $minute < 60; $minute++) { + Carbon::setTestNow("2025-01-15 12:{$minute}:00"); + $executionTimeProp->setValue($job, Carbon::now()); + + if ($method->invoke($job, $server)) { + $skipCount++; + } else { + $allowCount++; + } + } + + // With interval=12, most cycles should be skipped but at least one should be allowed + expect($allowCount)->toBeGreaterThan(0) + ->and($skipCount)->toBeGreaterThan($allowCount); + }); + + it('distributes checks across servers using server ID hash', function () { + $job = new ServerManagerJob; + $executionTimeProp = new ReflectionProperty($job, 'executionTime'); + $method = new ReflectionMethod($job, 'shouldSkipDueToBackoff'); + + // Two servers with same unreachable_count but different IDs + $server1 = Mockery::mock(Server::class)->makePartial(); + $server1->id = 1; + $server1->unreachable_count = 5; // interval = 3 + + $server2 = Mockery::mock(Server::class)->makePartial(); + $server2->id = 2; + $server2->unreachable_count = 5; // interval = 3 + + $server1AllowedMinutes = []; + $server2AllowedMinutes = []; + + for ($minute = 0; $minute < 60; $minute++) { + Carbon::setTestNow("2025-01-15 12:{$minute}:00"); + $executionTimeProp->setValue($job, Carbon::now()); + + if (! $method->invoke($job, $server1)) { + $server1AllowedMinutes[] = $minute; + } + if (! $method->invoke($job, $server2)) { + $server2AllowedMinutes[] = $minute; + } + } + + // Both servers should have some allowed minutes, but not all the same + expect($server1AllowedMinutes)->not->toBeEmpty() + ->and($server2AllowedMinutes)->not->toBeEmpty() + ->and($server1AllowedMinutes)->not->toBe($server2AllowedMinutes); + }); +}); + +describe('ServerConnectionCheckJob unreachable_count', function () { + it('increments unreachable_count on timeout', function () { + $settings = Mockery::mock(); + $settings->shouldReceive('update') + ->with(['is_reachable' => false, 'is_usable' => false]) + ->once(); + + $server = Mockery::mock(Server::class)->makePartial()->shouldAllowMockingProtectedMethods(); + $server->shouldReceive('getAttribute')->with('settings')->andReturn($settings); + $server->shouldReceive('increment')->with('unreachable_count')->once(); + $server->id = 1; + $server->name = 'test-server'; + + $job = new ServerConnectionCheckJob($server); + $job->failed(new TimeoutExceededException); + }); + + it('does not increment unreachable_count for non-timeout failures', function () { + $server = Mockery::mock(Server::class)->makePartial()->shouldAllowMockingProtectedMethods(); + $server->shouldNotReceive('increment'); + $server->id = 1; + $server->name = 'test-server'; + + $job = new ServerConnectionCheckJob($server); + $job->failed(new RuntimeException('Some other error')); + }); +}); + +describe('ServerCheckJob unreachable_count', function () { + it('increments unreachable_count on timeout', function () { + $server = Mockery::mock(Server::class)->makePartial()->shouldAllowMockingProtectedMethods(); + $server->shouldReceive('increment')->with('unreachable_count')->once(); + $server->id = 1; + $server->name = 'test-server'; + + $job = new ServerCheckJob($server); + $job->failed(new TimeoutExceededException); + }); + + it('does not increment unreachable_count for non-timeout failures', function () { + $server = Mockery::mock(Server::class)->makePartial()->shouldAllowMockingProtectedMethods(); + $server->shouldNotReceive('increment'); + $server->id = 1; + $server->name = 'test-server'; + + $job = new ServerCheckJob($server); + $job->failed(new RuntimeException('Some other error')); + }); +});