chore(ssh): remove stale mux cleanup job

Drop the scheduled stale multiplexed connection cleanup job, its SSH mux
health/orphan config, and the tests that covered that cleanup path.
This commit is contained in:
Andras Bacsai 2026-05-26 14:40:38 +02:00
parent b5be9fe9e8
commit 43884823c6
5 changed files with 0 additions and 501 deletions

View file

@ -8,7 +8,6 @@
use App\Jobs\CheckTraefikVersionJob;
use App\Jobs\CleanupInstanceStuffsJob;
use App\Jobs\CleanupOrphanedPreviewContainersJob;
use App\Jobs\CleanupStaleMultiplexedConnections;
use App\Jobs\PullChangelog;
use App\Jobs\PullTemplatesFromCDN;
use App\Jobs\RegenerateSslCertJob;
@ -41,9 +40,6 @@ protected function schedule(Schedule $schedule): void
$this->instanceTimezone = config('app.timezone');
}
$this->scheduleInstance->job(new CleanupStaleMultiplexedConnections, crons_queue())
->hourly()
->onOneServer();
$this->scheduleInstance->command('cleanup:redis --clear-locks')->daily();
$this->scheduleInstance->command('sanctum:prune-expired --hours=1')->hourly()->onOneServer();
$this->scheduleInstance->job(new ApiTokenExpirationWarningJob)->hourly()->onOneServer();

View file

@ -1,295 +0,0 @@
<?php
namespace App\Jobs;
use App\Models\Server;
use Carbon\Carbon;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Process;
use Illuminate\Support\Facades\Storage;
class CleanupStaleMultiplexedConnections implements ShouldQueue
{
use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;
public function handle()
{
$this->cleanupStaleConnections();
$this->cleanupNonExistentServerConnections();
$this->cleanupDuplicateSshProcesses();
$this->cleanupOrphanedSshProcesses();
$this->cleanupOrphanedCloudflaredProcesses();
}
/**
* Once two background ssh masters share the same ControlPath, OpenSSH's
* control socket state is no longer trustworthy: `ssh -O check` may report
* one PID while the socket lifecycle is tied to another. Reset the whole
* duplicate group rather than trying to choose an owner.
*/
private function cleanupDuplicateSshProcesses(): void
{
$muxDir = storage_path('app/ssh/mux');
$groups = [];
foreach ($this->listProcesses() as $process) {
$controlPath = $this->extractControlPath($process['args']);
if (! is_string($controlPath) || ! str_starts_with($controlPath, $muxDir.'/')) {
continue;
}
$groups[$controlPath][] = $process;
}
foreach ($groups as $controlPath => $processes) {
if (count($processes) < 2) {
continue;
}
$this->resetDuplicateGroup($controlPath, $processes);
}
}
/**
* Kill backgrounded ssh master processes that lost the ControlPath socket
* race. Such processes are not masters, so ControlPersist never reaps them
* and they leak memory until the container restarts. A legitimate master
* always owns its socket file; an orphan has none.
*
* Processes younger than the minimum age are skipped: a freshly forked
* master creates its socket a few milliseconds after starting, so a young
* process with no socket may simply be mid-establish rather than orphaned.
*/
private function cleanupOrphanedSshProcesses(): void
{
$muxDir = storage_path('app/ssh/mux');
$minAge = (int) config('constants.ssh.mux_orphan_min_age');
foreach ($this->listProcesses() as $process) {
// Only ever touch ssh processes pointing at Coolify's mux directory.
$controlPath = $this->extractControlPath($process['args']);
if (! is_string($controlPath) || ! str_starts_with($controlPath, $muxDir.'/')) {
continue;
}
if ($process['etimes'] >= $minAge && ! file_exists($controlPath)) {
$this->reapOrphan('ssh', $process);
}
}
}
/**
* Kill orphaned `cloudflared access ssh` proxy processes. Each is spawned
* as the SSH ProxyCommand transport for a Cloudflare Tunnel server and must
* die with its parent ssh. When that ssh is killed or orphaned (e.g. a lost
* mux master), the cloudflared process can leak and accumulate. A legitimate
* proxy always has a live ssh parent; one without is safe to reap.
*
* Processes younger than the minimum age are skipped so a proxy whose parent
* ssh is still starting up, or a transient `ssh -O check` proxy mid-exit, is
* never mistaken for an orphan.
*/
private function cleanupOrphanedCloudflaredProcesses(): void
{
$minAge = (int) config('constants.ssh.mux_orphan_min_age');
$processes = $this->listProcesses();
$sshPids = [];
foreach ($processes as $process) {
// The ssh binary itself, not `cloudflared access ssh` (space before ssh).
if (preg_match('#(^|/)ssh\s#', $process['args'])) {
$sshPids[$process['pid']] = true;
}
}
foreach ($processes as $process) {
// `cloudflared access ssh`, never the `cloudflared tunnel` daemon.
if (! str_contains($process['args'], 'cloudflared access ssh')) {
continue;
}
// Orphaned when no live ssh process is its parent.
if ($process['etimes'] >= $minAge && ! isset($sshPids[$process['ppid']])) {
$this->reapOrphan('cloudflared', $process);
}
}
}
/**
* Reap a detected orphan process. When orphan reaping is disabled (the
* default), the orphan is only logged a dry-run mode that lets operators
* verify what would be killed before enabling it for real.
*
* @param array{pid: string, ppid: string, etimes: int, args: string} $process
*/
private function reapOrphan(string $kind, array $process): void
{
if (! config('constants.ssh.mux_orphan_reap_enabled')) {
Log::info("Orphaned {$kind} process detected (dry-run, not killed)", [
'pid' => $process['pid'],
'etimes' => $process['etimes'],
'command' => $process['args'],
]);
return;
}
Process::run('kill '.escapeshellarg($process['pid']));
Log::info("Killed orphaned {$kind} process", [
'pid' => $process['pid'],
'etimes' => $process['etimes'],
'command' => $process['args'],
]);
}
/**
* Snapshot of running processes.
*
* @return list<array{pid: string, ppid: string, etimes: int, args: string}>
*/
private function listProcesses(): array
{
$ps = Process::run('ps -ww -eo pid=,ppid=,etimes=,args=');
if ($ps->exitCode() !== 0) {
return [];
}
$processes = [];
foreach (explode("\n", trim($ps->output())) as $line) {
if (! preg_match('/^\s*(\d+)\s+(\d+)\s+(\d+)\s+(.*)$/', $line, $matches)) {
continue;
}
$processes[] = [
'pid' => $matches[1],
'ppid' => $matches[2],
'etimes' => (int) $matches[3],
'args' => $matches[4],
];
}
return $processes;
}
/**
* @param list<array{pid: string, ppid: string, etimes: int, args: string}> $processes
*/
private function resetDuplicateGroup(string $controlPath, array $processes): void
{
if (! config('constants.ssh.mux_orphan_reap_enabled')) {
Log::info('Duplicate ssh mux processes detected (dry-run, not killed)', [
'control_path' => $controlPath,
'pids' => array_column($processes, 'pid'),
]);
return;
}
foreach ($processes as $process) {
Process::run('kill '.escapeshellarg($process['pid']));
}
if (file_exists($controlPath)) {
@unlink($controlPath);
}
Log::info('Reset duplicate ssh mux processes', [
'control_path' => $controlPath,
'pids' => array_column($processes, 'pid'),
]);
}
private function extractControlPath(string $args): ?string
{
if (! preg_match('/(?:^|\s)-o\s+ControlPath=(?:"([^"]+)"|\'([^\']+)\'|(\S+))/', $args, $matches)) {
if (preg_match('/^ssh:\s+(\S+)\s+\[mux\]$/', $args, $matches)) {
return $matches[1];
}
return null;
}
return $matches[1] ?: ($matches[2] ?: $matches[3]);
}
private function cleanupStaleConnections()
{
$muxFiles = Storage::disk('ssh-mux')->files();
foreach ($muxFiles as $muxFile) {
$serverUuid = $this->extractServerUuidFromMuxFile($muxFile);
$server = Server::where('uuid', $serverUuid)->first();
if (! $server) {
$this->removeMultiplexFile($muxFile, 'server_not_found');
continue;
}
$muxSocket = "/var/www/html/storage/app/ssh/mux/{$muxFile}";
$checkCommand = "ssh -O check -o ControlPath={$muxSocket} {$server->user}@{$server->ip} 2>/dev/null";
$checkProcess = Process::run($checkCommand);
if ($checkProcess->exitCode() !== 0) {
$this->removeMultiplexFile($muxFile, 'connection_check_failed');
} else {
$muxContent = Storage::disk('ssh-mux')->get($muxFile);
$establishedAt = Carbon::parse(substr($muxContent, 37));
$expirationTime = $establishedAt->addSeconds(config('constants.ssh.mux_persist_time'));
if (Carbon::now()->isAfter($expirationTime)) {
$this->removeMultiplexFile($muxFile, 'expired');
}
}
}
}
private function cleanupNonExistentServerConnections()
{
$muxFiles = Storage::disk('ssh-mux')->files();
$existingServerUuids = Server::pluck('uuid')->toArray();
foreach ($muxFiles as $muxFile) {
$serverUuid = $this->extractServerUuidFromMuxFile($muxFile);
if (! in_array($serverUuid, $existingServerUuids)) {
$this->removeMultiplexFile($muxFile, 'server_does_not_exist');
}
}
}
private function extractServerUuidFromMuxFile($muxFile)
{
return substr($muxFile, 4);
}
/**
* Close and delete a stale mux socket file. When orphan reaping is disabled
* (the default), the file is only logged a dry-run mode that lets operators
* verify what would be removed before enabling it for real.
*/
private function removeMultiplexFile(string $muxFile, string $reason): void
{
if (! config('constants.ssh.mux_orphan_reap_enabled')) {
Log::info('Stale mux file detected (dry-run, not removed)', [
'file' => $muxFile,
'reason' => $reason,
]);
return;
}
$muxSocket = "/var/www/html/storage/app/ssh/mux/{$muxFile}";
$closeCommand = "ssh -O exit -o ControlPath={$muxSocket} localhost 2>/dev/null";
Process::run($closeCommand);
Storage::disk('ssh-mux')->delete($muxFile);
Log::info('Removed stale mux file', [
'file' => $muxFile,
'reason' => $reason,
]);
}
}

View file

@ -67,13 +67,6 @@
'ssh' => [
'mux_enabled' => env('MUX_ENABLED', env('SSH_MUX_ENABLED', true)),
'mux_persist_time' => env('SSH_MUX_PERSIST_TIME', 3600),
'mux_health_check_enabled' => env('SSH_MUX_HEALTH_CHECK_ENABLED', true),
'mux_health_check_timeout' => env('SSH_MUX_HEALTH_CHECK_TIMEOUT', 5),
'mux_max_age' => env('SSH_MUX_MAX_AGE', 1800), // 30 minutes
'mux_lock_ttl' => env('SSH_MUX_LOCK_TTL', 30), // lock auto-release, seconds
'mux_lock_timeout' => env('SSH_MUX_LOCK_TIMEOUT', 10), // max wait for lock, seconds
'mux_orphan_min_age' => env('SSH_MUX_ORPHAN_MIN_AGE', 600), // min process age before reaping orphans, seconds
'mux_orphan_reap_enabled' => env('SSH_MUX_ORPHAN_REAP_ENABLED', false), // false = dry-run, only log orphans
'connection_timeout' => 10,
'server_interval' => 20,
'command_timeout' => 3600,

View file

@ -1,10 +1,8 @@
<?php
use App\Jobs\CleanupStaleMultiplexedConnections;
use App\Models\InstanceSettings;
use Illuminate\Console\Scheduling\Schedule;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\Queue;
uses(RefreshDatabase::class);
@ -23,23 +21,6 @@
expect($event->onOneServer)->toBeTrue();
});
it('schedules CleanupStaleMultiplexedConnections on the crons queue', function () {
config(['constants.coolify.self_hosted' => false]);
Queue::fake();
$schedule = app(Schedule::class);
$event = collect($schedule->events())->first(
fn ($e) => str_contains((string) $e->description, CleanupStaleMultiplexedConnections::class)
);
expect($event)->not->toBeNull();
$event->run(app());
Queue::assertPushedOn('crons', CleanupStaleMultiplexedConnections::class);
});
it('schedules every production job with onOneServer', function () {
$schedule = app(Schedule::class);

View file

@ -1,12 +1,10 @@
<?php
use App\Helpers\SshMultiplexingHelper;
use App\Jobs\CleanupStaleMultiplexedConnections;
use App\Models\PrivateKey;
use App\Models\Server;
use App\Models\User;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Support\Facades\File;
use Illuminate\Support\Facades\Process;
use Illuminate\Support\Facades\Storage;
@ -116,177 +114,3 @@ function makeMuxServer(): Server
Process::assertNothingRan();
});
it('kills only old orphaned ssh masters whose control socket no longer exists', function () {
config(['constants.ssh.mux_orphan_reap_enabled' => true]);
$muxDir = storage_path('app/ssh/mux');
File::ensureDirectoryExists($muxDir);
$liveSocket = $muxDir.'/mux_live_'.uniqid();
$orphanSocket = $muxDir.'/mux_orphan_'.uniqid();
$youngSocket = $muxDir.'/mux_young_'.uniqid();
File::put($liveSocket, 'x');
Process::fake([
'ps*' => Process::result(output: "111 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$liveSocket} root@1.2.3.4\n".
"222 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$orphanSocket} root@1.2.3.4\n".
"333 1 30 ssh -fN -o ControlMaster=auto -o ControlPath={$youngSocket} root@1.2.3.4\n"),
'kill*' => Process::result(exitCode: 0),
]);
$job = new CleanupStaleMultiplexedConnections;
$method = new ReflectionMethod($job, 'cleanupOrphanedSshProcesses');
$method->setAccessible(true);
$method->invoke($job);
Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '222'));
Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '111'));
Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '333'));
File::delete($liveSocket);
});
it('kills old orphaned native openssh mux masters whose control socket no longer exists', function () {
config(['constants.ssh.mux_orphan_reap_enabled' => true]);
$muxDir = storage_path('app/ssh/mux');
File::ensureDirectoryExists($muxDir);
$liveSocket = $muxDir.'/mux_native_live_'.uniqid();
$orphanSocket = $muxDir.'/mux_native_orphan_'.uniqid();
File::put($liveSocket, 'x');
Process::fake([
'ps*' => Process::result(output: "111 1 5000 ssh: {$liveSocket} [mux]\n".
"222 1 5000 ssh: {$orphanSocket} [mux]\n"),
'kill*' => Process::result(exitCode: 0),
]);
$job = new CleanupStaleMultiplexedConnections;
$method = new ReflectionMethod($job, 'cleanupOrphanedSshProcesses');
$method->setAccessible(true);
$method->invoke($job);
Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '222'));
Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '111'));
File::delete($liveSocket);
});
it('kills only old orphaned cloudflared proxies whose parent ssh is gone', function () {
config(['constants.ssh.mux_orphan_reap_enabled' => true]);
Process::fake([
'ps*' => Process::result(output: "100 1 5000 ssh -fN -o ControlMaster=auto root@1.2.3.4\n".
"200 100 5000 cloudflared access ssh --hostname host.example.com\n".
"300 2176 5000 cloudflared access ssh --hostname host.example.com\n".
"400 2176 30 cloudflared access ssh --hostname host.example.com\n".
"2176 1 9000 /usr/bin/some-supervisor\n"),
'kill*' => Process::result(exitCode: 0),
]);
$job = new CleanupStaleMultiplexedConnections;
$method = new ReflectionMethod($job, 'cleanupOrphanedCloudflaredProcesses');
$method->setAccessible(true);
$method->invoke($job);
Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '300'));
Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '200'));
Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '400'));
});
it('dry-run mode logs orphans but kills nothing when reaping is disabled', function () {
config(['constants.ssh.mux_orphan_reap_enabled' => false]);
$muxDir = storage_path('app/ssh/mux');
File::ensureDirectoryExists($muxDir);
$orphanSocket = $muxDir.'/mux_orphan_'.uniqid();
Process::fake([
'ps*' => Process::result(output: "222 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$orphanSocket} root@1.2.3.4\n"),
'kill*' => Process::result(exitCode: 0),
]);
$job = new CleanupStaleMultiplexedConnections;
$method = new ReflectionMethod($job, 'cleanupOrphanedSshProcesses');
$method->setAccessible(true);
$method->invoke($job);
Process::assertNotRan(fn ($process) => str_contains($process->command, 'kill'));
});
it('resets duplicate ssh mux process groups atomically when reaping is enabled', function () {
config(['constants.ssh.mux_orphan_reap_enabled' => true]);
$muxDir = storage_path('app/ssh/mux');
File::ensureDirectoryExists($muxDir);
$controlPath = $muxDir.'/mux_duplicate_'.uniqid();
File::put($controlPath, 'socket');
Process::fake([
'ps*' => Process::result(output: "111 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$controlPath} root@1.2.3.4\n".
"222 1 5000 ssh -fN -o ControlMaster=auto -o ControlPath={$controlPath} root@1.2.3.4\n"),
'kill*' => Process::result(exitCode: 0),
]);
$job = new CleanupStaleMultiplexedConnections;
$method = new ReflectionMethod($job, 'cleanupDuplicateSshProcesses');
$method->setAccessible(true);
$method->invoke($job);
Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '111'));
Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '222'));
expect(file_exists($controlPath))->toBeFalse();
});
it('resets duplicate native openssh mux process groups atomically when reaping is enabled', function () {
config(['constants.ssh.mux_orphan_reap_enabled' => true]);
$muxDir = storage_path('app/ssh/mux');
File::ensureDirectoryExists($muxDir);
$controlPath = $muxDir.'/mux_native_duplicate_'.uniqid();
File::put($controlPath, 'socket');
Process::fake([
'ps*' => Process::result(output: "111 1 5000 ssh: {$controlPath} [mux]\n".
"222 1 5000 ssh: {$controlPath} [mux]\n"),
'kill*' => Process::result(exitCode: 0),
]);
$job = new CleanupStaleMultiplexedConnections;
$method = new ReflectionMethod($job, 'cleanupDuplicateSshProcesses');
$method->setAccessible(true);
$method->invoke($job);
Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '111'));
Process::assertRan(fn ($process) => str_contains($process->command, 'kill') && str_contains($process->command, '222'));
expect(file_exists($controlPath))->toBeFalse();
});
it('removes mux files for non-existent servers when reaping is enabled', function () {
config(['constants.ssh.mux_orphan_reap_enabled' => true]);
Storage::fake('ssh-mux');
$file = 'mux_ghost'.uniqid();
Storage::disk('ssh-mux')->put($file, 'x');
Process::fake();
$job = new CleanupStaleMultiplexedConnections;
$method = new ReflectionMethod($job, 'cleanupNonExistentServerConnections');
$method->setAccessible(true);
$method->invoke($job);
expect(Storage::disk('ssh-mux')->exists($file))->toBeFalse();
});
it('keeps mux files for non-existent servers in dry-run mode', function () {
config(['constants.ssh.mux_orphan_reap_enabled' => false]);
Storage::fake('ssh-mux');
$file = 'mux_ghost'.uniqid();
Storage::disk('ssh-mux')->put($file, 'x');
Process::fake();
$job = new CleanupStaleMultiplexedConnections;
$method = new ReflectionMethod($job, 'cleanupNonExistentServerConnections');
$method->setAccessible(true);
$method->invoke($job);
expect(Storage::disk('ssh-mux')->exists($file))->toBeTrue();
Process::assertNothingRan();
});