<?php
namespace App\Jobs;
use App\Models\Server;
use Carbon\Carbon;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Process;
use Illuminate\Support\Facades\Storage;
class CleanupStaleMultiplexedConnections implements ShouldQueue
{
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
/**
 * Run every cleanup pass, in this order: stale mux files, mux files whose
 * server no longer exists, duplicate ssh processes sharing a ControlPath,
 * orphaned ssh processes and finally orphaned cloudflared proxies.
 */
public function handle(): void
{
    $this->cleanupStaleConnections();
    $this->cleanupNonExistentServerConnections();
    $this->cleanupDuplicateSshProcesses();
    $this->cleanupOrphanedSshProcesses();
    $this->cleanupOrphanedCloudflaredProcesses();
}
/**
 * Reset every ControlPath inside Coolify's mux directory that more than one
 * ssh process refers to.
 *
 * Once two background ssh masters share the same ControlPath, OpenSSH's
 * control socket state is no longer trustworthy: `ssh -O check` may report
 * one PID while the socket lifecycle is tied to another. The whole group is
 * therefore reset instead of trying to pick an owner.
 *
 * NOTE(review): grouping is done purely by the ControlPath found in the
 * process arguments, so every process carrying that path counts towards the
 * group — confirm that non-master ssh clients cannot show up here.
 */
private function cleanupDuplicateSshProcesses(): void
{
    $muxPrefix = storage_path('app/ssh/mux').'/';
    $byControlPath = [];

    // Bucket the processes by the ControlPath they reference.
    foreach ($this->listProcesses() as $candidate) {
        $path = $this->extractControlPath($candidate['args']);

        if (is_string($path) && str_starts_with($path, $muxPrefix)) {
            $byControlPath[$path][] = $candidate;
        }
    }

    // Only buckets with at least two members are duplicates.
    foreach ($byControlPath as $path => $members) {
        if (count($members) >= 2) {
            $this->resetDuplicateGroup($path, $members);
        }
    }
}
/**
 * Reap backgrounded ssh master processes that lost the ControlPath socket
 * race. Such a process is not a master, so ControlPersist never reaps it and
 * it leaks memory until the container restarts. A legitimate master always
 * owns its socket file; an orphan has none.
 *
 * A process younger than `constants.ssh.mux_orphan_min_age` seconds is left
 * alone: a freshly forked master creates its socket a few milliseconds after
 * starting, so a young process without a socket may simply still be
 * establishing the connection.
 */
private function cleanupOrphanedSshProcesses(): void
{
    $muxPrefix = storage_path('app/ssh/mux').'/';
    $ageThreshold = (int) config('constants.ssh.mux_orphan_min_age');

    foreach ($this->listProcesses() as $candidate) {
        $socket = $this->extractControlPath($candidate['args']);

        // Never touch anything that does not point at Coolify's mux directory.
        $usesCoolifyMux = is_string($socket) && str_starts_with($socket, $muxPrefix);
        if (! $usesCoolifyMux) {
            continue;
        }

        // Too young to judge: the socket may not have been created yet.
        if ($candidate['etimes'] < $ageThreshold) {
            continue;
        }

        // The socket file exists, so this is not an orphan.
        if (file_exists($socket)) {
            continue;
        }

        $this->reapOrphan('ssh', $candidate);
    }
}
/**
 * Kill orphaned `cloudflared access ssh` proxy processes. Each is spawned
 * as the SSH ProxyCommand transport for a Cloudflare Tunnel server and must
 * die with its parent ssh. When that ssh is killed or orphaned (e.g. a lost
 * mux master), the cloudflared process can leak and accumulate. A legitimate
 * proxy always has a live ssh parent; one without is safe to reap.
 *
 * Processes younger than the minimum age are skipped so a proxy whose parent
 * ssh is still starting up, or a transient `ssh -O check` proxy mid-exit, is
 * never mistaken for an orphan.
 *
 * A parent counts as ssh when its command line is the ssh binary (`ssh ...`,
 * `/usr/bin/ssh ...`) or when it is listed under the `ssh: <ControlPath> [mux]`
 * title — the same form this class already parses when extracting a
 * ControlPath. Without the second form, a proxy owned by such a master would
 * be classified as an orphan.
 */
private function cleanupOrphanedCloudflaredProcesses(): void
{
    $minAge = (int) config('constants.ssh.mux_orphan_min_age');
    $processes = $this->listProcesses();

    $sshPids = [];
    foreach ($processes as $process) {
        // "ssh" must start the command or follow a "/", so the "ssh" word in
        // `cloudflared access ssh` (preceded by a space) never matches. The
        // optional ":" accepts the `ssh: <path> [mux]` title.
        if (preg_match('#(?:^|/)ssh:?\s#', $process['args']) === 1) {
            $sshPids[$process['pid']] = true;
        }
    }

    foreach ($processes as $process) {
        // `cloudflared access ssh`, never the `cloudflared tunnel` daemon.
        if (! str_contains($process['args'], 'cloudflared access ssh')) {
            continue;
        }

        // Orphaned when it is old enough and no live ssh process is its parent.
        if ($process['etimes'] >= $minAge && ! isset($sshPids[$process['ppid']])) {
            $this->reapOrphan('cloudflared', $process);
        }
    }
}
/**
 * Reap a detected orphan process. When orphan reaping is disabled
 * (`constants.ssh.mux_orphan_reap_enabled` is falsy), the orphan is only
 * logged — a dry-run mode that lets operators verify what would be killed
 * before enabling it for real.
 *
 * The outcome of `kill` is checked so the log never claims a process was
 * killed when the signal could not be delivered (for example because the
 * process exited between the `ps` snapshot and the kill).
 *
 * @param  string  $kind  Label used in the log messages (e.g. `ssh`, `cloudflared`).
 * @param  array{pid: string, ppid: string, etimes: int, args: string}  $process
 */
private function reapOrphan(string $kind, array $process): void
{
    $context = [
        'pid' => $process['pid'],
        'etimes' => $process['etimes'],
        'command' => $process['args'],
    ];

    if (! config('constants.ssh.mux_orphan_reap_enabled')) {
        Log::info("Orphaned {$kind} process detected (dry-run, not killed)", $context);

        return;
    }

    $kill = Process::run('kill '.escapeshellarg($process['pid']));

    if ($kill->exitCode() !== 0) {
        Log::warning("Failed to kill orphaned {$kind} process", $context + [
            'error' => trim($kill->errorOutput()),
        ]);

        return;
    }

    Log::info("Killed orphaned {$kind} process", $context);
}
/**
 * Snapshot of running processes, taken with
 * `ps -ww -eo pid=,ppid=,etimes=,args=`.
 *
 * Returns an empty list when `ps` exits with a non-zero status. Output lines
 * that do not consist of three numeric columns followed by the command are
 * skipped.
 *
 * @return list<array{pid: string, ppid: string, etimes: int, args: string}>
 */
private function listProcesses(): array
{
    $ps = Process::run('ps -ww -eo pid=,ppid=,etimes=,args=');
    if ($ps->exitCode() !== 0) {
        return [];
    }

    $snapshot = [];
    foreach (explode("\n", trim($ps->output())) as $row) {
        if (preg_match('/^\s*(\d+)\s+(\d+)\s+(\d+)\s+(.*)$/', $row, $fields) !== 1) {
            continue;
        }

        [, $pid, $ppid, $etimes, $args] = $fields;

        $snapshot[] = [
            'pid' => $pid,
            'ppid' => $ppid,
            // Elapsed seconds are compared numerically by the callers.
            'etimes' => (int) $etimes,
            'args' => $args,
        ];
    }

    return $snapshot;
}
/**
 * Reset one group of ssh processes that share a ControlPath: send `kill` to
 * every member and remove the control socket file if it is still present.
 *
 * When `constants.ssh.mux_orphan_reap_enabled` is falsy, nothing is killed or
 * removed; the group is only logged (dry-run).
 *
 * @param  string  $controlPath  The ControlPath shared by the group.
 * @param  list<array{pid: string, ppid: string, etimes: int, args: string}>  $processes
 */
private function resetDuplicateGroup(string $controlPath, array $processes): void
{
    $pids = array_column($processes, 'pid');
    $context = [
        'control_path' => $controlPath,
        'pids' => $pids,
    ];

    if (! config('constants.ssh.mux_orphan_reap_enabled')) {
        Log::info('Duplicate ssh mux processes detected (dry-run, not killed)', $context);

        return;
    }

    foreach ($pids as $pid) {
        Process::run('kill '.escapeshellarg($pid));
    }

    // Best effort: the socket may already be gone once its processes exit.
    if (file_exists($controlPath)) {
        @unlink($controlPath);
    }

    Log::info('Reset duplicate ssh mux processes', $context);
}
/**
 * Extract the ControlPath an ssh process refers to from its command line.
 *
 * Two forms are recognised:
 *  - an `-o ControlPath=<value>` option, where the value may be wrapped in
 *    double quotes, wrapped in single quotes, or be a bare run of
 *    non-whitespace characters;
 *  - a command line that is exactly `ssh: <path> [mux]`.
 *
 * @param  string  $args  The process command line as reported by `ps`.
 * @return string|null The ControlPath, or null when neither form matches.
 */
private function extractControlPath(string $args): ?string
{
    $optionPattern = '/(?:^|\s)-o\s+ControlPath=(?:"([^"]+)"|\'([^\']+)\'|(\S+))/';

    // PREG_UNMATCHED_AS_NULL makes all three groups present (null when not
    // taken), so `??` picks the alternative that matched without relying on
    // truthiness — a value such as "0" is a valid match, not "empty".
    if (preg_match($optionPattern, $args, $matches, PREG_UNMATCHED_AS_NULL) === 1) {
        return $matches[1] ?? $matches[2] ?? $matches[3];
    }

    if (preg_match('/^ssh:\s+(\S+)\s+\[mux\]$/', $args, $matches) === 1) {
        return $matches[1];
    }

    return null;
}
/**
 * Walk every file on the `ssh-mux` disk and remove the ones that are stale.
 *
 * A file is handed to removeMultiplexFile() when:
 *  - no server with the uuid derived from the file name exists
 *    (`server_not_found`);
 *  - `ssh -O check` against its control socket exits non-zero
 *    (`connection_check_failed`);
 *  - the timestamp read from the file content plus
 *    `constants.ssh.mux_persist_time` seconds lies in the past (`expired`).
 */
private function cleanupStaleConnections(): void
{
    $muxDir = storage_path('app/ssh/mux');

    foreach (Storage::disk('ssh-mux')->files() as $muxFile) {
        $serverUuid = $this->extractServerUuidFromMuxFile($muxFile);
        $server = Server::where('uuid', $serverUuid)->first();

        if (! $server) {
            $this->removeMultiplexFile($muxFile, 'server_not_found');

            continue;
        }

        // File name, user and address are escaped so none of them can break
        // out of the shell command.
        $checkCommand = sprintf(
            'ssh -O check -o %s %s 2>/dev/null',
            escapeshellarg("ControlPath={$muxDir}/{$muxFile}"),
            escapeshellarg("{$server->user}@{$server->ip}")
        );

        if (Process::run($checkCommand)->exitCode() !== 0) {
            $this->removeMultiplexFile($muxFile, 'connection_check_failed');

            continue;
        }

        // get() may return null when the file cannot be read; treat that as
        // empty content rather than passing null to substr().
        // NOTE(review): assumes the content carries a parseable timestamp
        // starting at byte offset 37 — confirm against whatever writes it.
        $muxContent = (string) Storage::disk('ssh-mux')->get($muxFile);
        $establishedAt = Carbon::parse(substr($muxContent, 37));
        $expirationTime = $establishedAt->addSeconds(config('constants.ssh.mux_persist_time'));

        if (Carbon::now()->isAfter($expirationTime)) {
            $this->removeMultiplexFile($muxFile, 'expired');
        }
    }
}
/**
 * Remove every file on the `ssh-mux` disk whose derived server uuid does not
 * belong to any existing server (`server_does_not_exist`).
 */
private function cleanupNonExistentServerConnections(): void
{
    $muxFiles = Storage::disk('ssh-mux')->files();

    // Normalise to strings so the strict comparison below is an exact string
    // match; a loose comparison would treat distinct numeric-looking uuids
    // (e.g. "1e3" and "1000") as equal.
    $existingServerUuids = array_map(
        static fn ($uuid): string => (string) $uuid,
        Server::pluck('uuid')->toArray()
    );

    foreach ($muxFiles as $muxFile) {
        $serverUuid = $this->extractServerUuidFromMuxFile($muxFile);

        if (! in_array($serverUuid, $existingServerUuids, true)) {
            $this->removeMultiplexFile($muxFile, 'server_does_not_exist');
        }
    }
}
/**
 * Derive the server uuid from a mux file name by dropping its first four
 * characters.
 *
 * NOTE(review): presumably those four characters are a fixed file-name
 * prefix — confirm against the code that creates the mux sockets.
 *
 * @param  string  $muxFile  File name as listed on the `ssh-mux` disk.
 * @return string Everything after the first four characters (empty when the
 *                name is four characters or shorter).
 */
private function extractServerUuidFromMuxFile(string $muxFile): string
{
    return substr($muxFile, 4);
}
/**
 * Close and delete a stale mux socket file. When orphan reaping is disabled
 * (`constants.ssh.mux_orphan_reap_enabled` is falsy), the file is only
 * logged — a dry-run mode that lets operators verify what would be removed
 * before enabling it for real.
 *
 * @param  string  $muxFile  File name on the `ssh-mux` disk.
 * @param  string  $reason  Short machine-readable reason recorded in the log.
 */
private function removeMultiplexFile(string $muxFile, string $reason): void
{
    $context = [
        'file' => $muxFile,
        'reason' => $reason,
    ];

    if (! config('constants.ssh.mux_orphan_reap_enabled')) {
        Log::info('Stale mux file detected (dry-run, not removed)', $context);

        return;
    }

    // Ask any master behind the socket to exit before the file is deleted.
    // The file name is escaped so it cannot break out of the shell command.
    $muxSocket = storage_path('app/ssh/mux').'/'.$muxFile;
    $closeCommand = 'ssh -O exit -o '.escapeshellarg("ControlPath={$muxSocket}").' localhost 2>/dev/null';
    Process::run($closeCommand);

    Storage::disk('ssh-mux')->delete($muxFile);

    Log::info('Removed stale mux file', $context);
}
}