diff --git a/app/Console/Kernel.php b/app/Console/Kernel.php index 494e6d7b6..e97105836 100644 --- a/app/Console/Kernel.php +++ b/app/Console/Kernel.php @@ -8,7 +8,6 @@ use App\Jobs\CheckTraefikVersionJob; use App\Jobs\CleanupInstanceStuffsJob; use App\Jobs\CleanupOrphanedPreviewContainersJob; -use App\Jobs\CleanupStaleMultiplexedConnections; use App\Jobs\PullChangelog; use App\Jobs\PullTemplatesFromCDN; use App\Jobs\RegenerateSslCertJob; @@ -41,9 +40,6 @@ protected function schedule(Schedule $schedule): void $this->instanceTimezone = config('app.timezone'); } - $this->scheduleInstance->job(new CleanupStaleMultiplexedConnections, crons_queue()) - ->hourly() - ->onOneServer(); $this->scheduleInstance->command('cleanup:redis --clear-locks')->daily(); $this->scheduleInstance->command('sanctum:prune-expired --hours=1')->hourly()->onOneServer(); $this->scheduleInstance->job(new ApiTokenExpirationWarningJob)->hourly()->onOneServer(); diff --git a/app/Jobs/CleanupStaleMultiplexedConnections.php b/app/Jobs/CleanupStaleMultiplexedConnections.php deleted file mode 100644 index 0a10fa420..000000000 --- a/app/Jobs/CleanupStaleMultiplexedConnections.php +++ /dev/null @@ -1,295 +0,0 @@ -cleanupStaleConnections(); - $this->cleanupNonExistentServerConnections(); - $this->cleanupDuplicateSshProcesses(); - $this->cleanupOrphanedSshProcesses(); - $this->cleanupOrphanedCloudflaredProcesses(); - } - - /** - * Once two background ssh masters share the same ControlPath, OpenSSH's - * control socket state is no longer trustworthy: `ssh -O check` may report - * one PID while the socket lifecycle is tied to another. Reset the whole - * duplicate group rather than trying to choose an owner. - */ - private function cleanupDuplicateSshProcesses(): void - { - $muxDir = storage_path('app/ssh/mux'); - $groups = []; - - foreach ($this->listProcesses() as $process) { - $controlPath = $this->extractControlPath($process['args']); - if (! is_string($controlPath) || ! str_starts_with($controlPath, $muxDir.'/')) { - continue; - } - - $groups[$controlPath][] = $process; - } - - foreach ($groups as $controlPath => $processes) { - if (count($processes) < 2) { - continue; - } - - $this->resetDuplicateGroup($controlPath, $processes); - } - } - - /** - * Kill backgrounded ssh master processes that lost the ControlPath socket - * race. Such processes are not masters, so ControlPersist never reaps them - * and they leak memory until the container restarts. A legitimate master - * always owns its socket file; an orphan has none. - * - * Processes younger than the minimum age are skipped: a freshly forked - * master creates its socket a few milliseconds after starting, so a young - * process with no socket may simply be mid-establish rather than orphaned. - */ - private function cleanupOrphanedSshProcesses(): void - { - $muxDir = storage_path('app/ssh/mux'); - $minAge = (int) config('constants.ssh.mux_orphan_min_age'); - - foreach ($this->listProcesses() as $process) { - // Only ever touch ssh processes pointing at Coolify's mux directory. - $controlPath = $this->extractControlPath($process['args']); - if (! is_string($controlPath) || ! str_starts_with($controlPath, $muxDir.'/')) { - continue; - } - - if ($process['etimes'] >= $minAge && ! file_exists($controlPath)) { - $this->reapOrphan('ssh', $process); - } - } - } - - /** - * Kill orphaned `cloudflared access ssh` proxy processes. Each is spawned - * as the SSH ProxyCommand transport for a Cloudflare Tunnel server and must - * die with its parent ssh. When that ssh is killed or orphaned (e.g. a lost - * mux master), the cloudflared process can leak and accumulate. A legitimate - * proxy always has a live ssh parent; one without is safe to reap. - * - * Processes younger than the minimum age are skipped so a proxy whose parent - * ssh is still starting up, or a transient `ssh -O check` proxy mid-exit, is - * never mistaken for an orphan. - */ - private function cleanupOrphanedCloudflaredProcesses(): void - { - $minAge = (int) config('constants.ssh.mux_orphan_min_age'); - $processes = $this->listProcesses(); - - $sshPids = []; - foreach ($processes as $process) { - // The ssh binary itself, not `cloudflared access ssh` (space before ssh). - if (preg_match('#(^|/)ssh\s#', $process['args'])) { - $sshPids[$process['pid']] = true; - } - } - - foreach ($processes as $process) { - // `cloudflared access ssh`, never the `cloudflared tunnel` daemon. - if (! str_contains($process['args'], 'cloudflared access ssh')) { - continue; - } - - // Orphaned when no live ssh process is its parent. - if ($process['etimes'] >= $minAge && ! isset($sshPids[$process['ppid']])) { - $this->reapOrphan('cloudflared', $process); - } - } - } - - /** - * Reap a detected orphan process. When orphan reaping is disabled (the - * default), the orphan is only logged — a dry-run mode that lets operators - * verify what would be killed before enabling it for real. - * - * @param array{pid: string, ppid: string, etimes: int, args: string} $process - */ - private function reapOrphan(string $kind, array $process): void - { - if (! config('constants.ssh.mux_orphan_reap_enabled')) { - Log::info("Orphaned {$kind} process detected (dry-run, not killed)", [ - 'pid' => $process['pid'], - 'etimes' => $process['etimes'], - 'command' => $process['args'], - ]); - - return; - } - - Process::run('kill '.escapeshellarg($process['pid'])); - Log::info("Killed orphaned {$kind} process", [ - 'pid' => $process['pid'], - 'etimes' => $process['etimes'], - 'command' => $process['args'], - ]); - } - - /** - * Snapshot of running processes. - * - * @return list - */ - private function listProcesses(): array - { - $ps = Process::run('ps -ww -eo pid=,ppid=,etimes=,args='); - if ($ps->exitCode() !== 0) { - return []; - } - - $processes = []; - foreach (explode("\n", trim($ps->output())) as $line) { - if (! preg_match('/^\s*(\d+)\s+(\d+)\s+(\d+)\s+(.*)$/', $line, $matches)) { - continue; - } - $processes[] = [ - 'pid' => $matches[1], - 'ppid' => $matches[2], - 'etimes' => (int) $matches[3], - 'args' => $matches[4], - ]; - } - - return $processes; - } - - /** - * @param list $processes - */ - private function resetDuplicateGroup(string $controlPath, array $processes): void - { - if (! config('constants.ssh.mux_orphan_reap_enabled')) { - Log::info('Duplicate ssh mux processes detected (dry-run, not killed)', [ - 'control_path' => $controlPath, - 'pids' => array_column($processes, 'pid'), - ]); - - return; - } - - foreach ($processes as $process) { - Process::run('kill '.escapeshellarg($process['pid'])); - } - - if (file_exists($controlPath)) { - @unlink($controlPath); - } - - Log::info('Reset duplicate ssh mux processes', [ - 'control_path' => $controlPath, - 'pids' => array_column($processes, 'pid'), - ]); - } - - private function extractControlPath(string $args): ?string - { - if (! preg_match('/(?:^|\s)-o\s+ControlPath=(?:"([^"]+)"|\'([^\']+)\'|(\S+))/', $args, $matches)) { - if (preg_match('/^ssh:\s+(\S+)\s+\[mux\]$/', $args, $matches)) { - return $matches[1]; - } - - return null; - } - - return $matches[1] ?: ($matches[2] ?: $matches[3]); - } - - private function cleanupStaleConnections() - { - $muxFiles = Storage::disk('ssh-mux')->files(); - - foreach ($muxFiles as $muxFile) { - $serverUuid = $this->extractServerUuidFromMuxFile($muxFile); - $server = Server::where('uuid', $serverUuid)->first(); - - if (! $server) { - $this->removeMultiplexFile($muxFile, 'server_not_found'); - - continue; - } - - $muxSocket = "/var/www/html/storage/app/ssh/mux/{$muxFile}"; - $checkCommand = "ssh -O check -o ControlPath={$muxSocket} {$server->user}@{$server->ip} 2>/dev/null"; - $checkProcess = Process::run($checkCommand); - - if ($checkProcess->exitCode() !== 0) { - $this->removeMultiplexFile($muxFile, 'connection_check_failed'); - } else { - $muxContent = Storage::disk('ssh-mux')->get($muxFile); - $establishedAt = Carbon::parse(substr($muxContent, 37)); - $expirationTime = $establishedAt->addSeconds(config('constants.ssh.mux_persist_time')); - - if (Carbon::now()->isAfter($expirationTime)) { - $this->removeMultiplexFile($muxFile, 'expired'); - } - } - } - } - - private function cleanupNonExistentServerConnections() - { - $muxFiles = Storage::disk('ssh-mux')->files(); - $existingServerUuids = Server::pluck('uuid')->toArray(); - - foreach ($muxFiles as $muxFile) { - $serverUuid = $this->extractServerUuidFromMuxFile($muxFile); - if (! in_array($serverUuid, $existingServerUuids)) { - $this->removeMultiplexFile($muxFile, 'server_does_not_exist'); - } - } - } - - private function extractServerUuidFromMuxFile($muxFile) - { - return substr($muxFile, 4); - } - - /** - * Close and delete a stale mux socket file. When orphan reaping is disabled - * (the default), the file is only logged — a dry-run mode that lets operators - * verify what would be removed before enabling it for real. - */ - private function removeMultiplexFile(string $muxFile, string $reason): void - { - if (! config('constants.ssh.mux_orphan_reap_enabled')) { - Log::info('Stale mux file detected (dry-run, not removed)', [ - 'file' => $muxFile, - 'reason' => $reason, - ]); - - return; - } - - $muxSocket = "/var/www/html/storage/app/ssh/mux/{$muxFile}"; - $closeCommand = "ssh -O exit -o ControlPath={$muxSocket} localhost 2>/dev/null"; - Process::run($closeCommand); - Storage::disk('ssh-mux')->delete($muxFile); - - Log::info('Removed stale mux file', [ - 'file' => $muxFile, - 'reason' => $reason, - ]); - } -} diff --git a/config/constants.php b/config/constants.php index 0a0227ea6..dfff17542 100644 --- a/config/constants.php +++ b/config/constants.php @@ -67,13 +67,6 @@ 'ssh' => [ 'mux_enabled' => env('MUX_ENABLED', env('SSH_MUX_ENABLED', true)), 'mux_persist_time' => env('SSH_MUX_PERSIST_TIME', 3600), - 'mux_health_check_enabled' => env('SSH_MUX_HEALTH_CHECK_ENABLED', true), - 'mux_health_check_timeout' => env('SSH_MUX_HEALTH_CHECK_TIMEOUT', 5), - 'mux_max_age' => env('SSH_MUX_MAX_AGE', 1800), // 30 minutes - 'mux_lock_ttl' => env('SSH_MUX_LOCK_TTL', 30), // lock auto-release, seconds - 'mux_lock_timeout' => env('SSH_MUX_LOCK_TIMEOUT', 10), // max wait for lock, seconds - 'mux_orphan_min_age' => env('SSH_MUX_ORPHAN_MIN_AGE', 600), // min process age before reaping orphans, seconds - 'mux_orphan_reap_enabled' => env('SSH_MUX_ORPHAN_REAP_ENABLED', false), // false = dry-run, only log orphans 'connection_timeout' => 10, 'server_interval' => 20, 'command_timeout' => 3600, diff --git a/tests/Feature/ScheduleOnOneServerTest.php b/tests/Feature/ScheduleOnOneServerTest.php index b11017107..10758738c 100644 --- a/tests/Feature/ScheduleOnOneServerTest.php +++ b/tests/Feature/ScheduleOnOneServerTest.php @@ -1,10 +1,8 @@ onOneServer)->toBeTrue(); }); -it('schedules CleanupStaleMultiplexedConnections on the crons queue', function () { - config(['constants.coolify.self_hosted' => false]); - Queue::fake(); - - $schedule = app(Schedule::class); - - $event = collect($schedule->events())->first( - fn ($e) => str_contains((string) $e->description, CleanupStaleMultiplexedConnections::class) - ); - - expect($event)->not->toBeNull(); - - $event->run(app()); - - Queue::assertPushedOn('crons', CleanupStaleMultiplexedConnections::class); -}); - it('schedules every production job with onOneServer', function () { $schedule = app(Schedule::class); diff --git a/tests/Feature/SshMultiplexingLockTest.php b/tests/Feature/SshMultiplexingLockTest.php index fee4ec632..e97be1f50 100644 --- a/tests/Feature/SshMultiplexingLockTest.php +++ b/tests/Feature/SshMultiplexingLockTest.php @@ -1,12 +1,10 @@ true]); - $muxDir = storage_path('app/ssh/mux'); - File::ensureDirectoryExists($muxDir); - - $liveSocket = $muxDir.'/mux_live_'.uniqid(); - $orphanSocket = $muxDir.'/mux_orphan_'.uniqid(); - $youngSocket = $muxDir.'/mux_young_'.uniqid(); - File::put($liveSocket, 'x'); - - Process::fake([ - 'ps*' => Process::result(output: "111 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$liveSocket} root@1.2.3.4\n". - "222 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$orphanSocket} root@1.2.3.4\n". - "333 1 30 ssh -fN -o ControlMaster=auto -o ControlPath={$youngSocket} root@1.2.3.4\n"), - 'kill*' => Process::result(exitCode: 0), - ]); - - $job = new CleanupStaleMultiplexedConnections; - $method = new ReflectionMethod($job, 'cleanupOrphanedSshProcesses'); - $method->setAccessible(true); - $method->invoke($job); - - Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '222')); - Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '111')); - Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '333')); - - File::delete($liveSocket); -}); - -it('kills old orphaned native openssh mux masters whose control socket no longer exists', function () { - config(['constants.ssh.mux_orphan_reap_enabled' => true]); - $muxDir = storage_path('app/ssh/mux'); - File::ensureDirectoryExists($muxDir); - - $liveSocket = $muxDir.'/mux_native_live_'.uniqid(); - $orphanSocket = $muxDir.'/mux_native_orphan_'.uniqid(); - File::put($liveSocket, 'x'); - - Process::fake([ - 'ps*' => Process::result(output: "111 1 5000 ssh: {$liveSocket} [mux]\n". - "222 1 5000 ssh: {$orphanSocket} [mux]\n"), - 'kill*' => Process::result(exitCode: 0), - ]); - - $job = new CleanupStaleMultiplexedConnections; - $method = new ReflectionMethod($job, 'cleanupOrphanedSshProcesses'); - $method->setAccessible(true); - $method->invoke($job); - - Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '222')); - Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '111')); - - File::delete($liveSocket); -}); - -it('kills only old orphaned cloudflared proxies whose parent ssh is gone', function () { - config(['constants.ssh.mux_orphan_reap_enabled' => true]); - - Process::fake([ - 'ps*' => Process::result(output: "100 1 5000 ssh -fN -o ControlMaster=auto root@1.2.3.4\n". - "200 100 5000 cloudflared access ssh --hostname host.example.com\n". - "300 2176 5000 cloudflared access ssh --hostname host.example.com\n". - "400 2176 30 cloudflared access ssh --hostname host.example.com\n". - "2176 1 9000 /usr/bin/some-supervisor\n"), - 'kill*' => Process::result(exitCode: 0), - ]); - - $job = new CleanupStaleMultiplexedConnections; - $method = new ReflectionMethod($job, 'cleanupOrphanedCloudflaredProcesses'); - $method->setAccessible(true); - $method->invoke($job); - - Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '300')); - Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '200')); - Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '400')); -}); - -it('dry-run mode logs orphans but kills nothing when reaping is disabled', function () { - config(['constants.ssh.mux_orphan_reap_enabled' => false]); - $muxDir = storage_path('app/ssh/mux'); - File::ensureDirectoryExists($muxDir); - - $orphanSocket = $muxDir.'/mux_orphan_'.uniqid(); - - Process::fake([ - 'ps*' => Process::result(output: "222 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$orphanSocket} root@1.2.3.4\n"), - 'kill*' => Process::result(exitCode: 0), - ]); - - $job = new CleanupStaleMultiplexedConnections; - $method = new ReflectionMethod($job, 'cleanupOrphanedSshProcesses'); - $method->setAccessible(true); - $method->invoke($job); - - Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill')); -}); - -it('resets duplicate ssh mux process groups atomically when reaping is enabled', function () { - config(['constants.ssh.mux_orphan_reap_enabled' => true]); - $muxDir = storage_path('app/ssh/mux'); - File::ensureDirectoryExists($muxDir); - $controlPath = $muxDir.'/mux_duplicate_'.uniqid(); - File::put($controlPath, 'socket'); - - Process::fake([ - 'ps*' => Process::result(output: "111 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$controlPath} root@1.2.3.4\n". - "222 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$controlPath} root@1.2.3.4\n"), - 'kill*' => Process::result(exitCode: 0), - ]); - - $job = new CleanupStaleMultiplexedConnections; - $method = new ReflectionMethod($job, 'cleanupDuplicateSshProcesses'); - $method->setAccessible(true); - $method->invoke($job); - - Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '111')); - Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '222')); - expect(file_exists($controlPath))->toBeFalse(); -}); - -it('resets duplicate native openssh mux process groups atomically when reaping is enabled', function () { - config(['constants.ssh.mux_orphan_reap_enabled' => true]); - $muxDir = storage_path('app/ssh/mux'); - File::ensureDirectoryExists($muxDir); - $controlPath = $muxDir.'/mux_native_duplicate_'.uniqid(); - File::put($controlPath, 'socket'); - - Process::fake([ - 'ps*' => Process::result(output: "111 1 5000 ssh: {$controlPath} [mux]\n". - "222 1 5000 ssh: {$controlPath} [mux]\n"), - 'kill*' => Process::result(exitCode: 0), - ]); - - $job = new CleanupStaleMultiplexedConnections; - $method = new ReflectionMethod($job, 'cleanupDuplicateSshProcesses'); - $method->setAccessible(true); - $method->invoke($job); - - Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '111')); - Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '222')); - expect(file_exists($controlPath))->toBeFalse(); -}); - -it('removes mux files for non-existent servers when reaping is enabled', function () { - config(['constants.ssh.mux_orphan_reap_enabled' => true]); - Storage::fake('ssh-mux'); - $file = 'mux_ghost'.uniqid(); - Storage::disk('ssh-mux')->put($file, 'x'); - Process::fake(); - - $job = new CleanupStaleMultiplexedConnections; - $method = new ReflectionMethod($job, 'cleanupNonExistentServerConnections'); - $method->setAccessible(true); - $method->invoke($job); - - expect(Storage::disk('ssh-mux')->exists($file))->toBeFalse(); -}); - -it('keeps mux files for non-existent servers in dry-run mode', function () { - config(['constants.ssh.mux_orphan_reap_enabled' => false]); - Storage::fake('ssh-mux'); - $file = 'mux_ghost'.uniqid(); - Storage::disk('ssh-mux')->put($file, 'x'); - Process::fake(); - - $job = new CleanupStaleMultiplexedConnections; - $method = new ReflectionMethod($job, 'cleanupNonExistentServerConnections'); - $method->setAccessible(true); - $method->invoke($job); - - expect(Storage::disk('ssh-mux')->exists($file))->toBeTrue(); - Process::assertNothingRan(); -});