Wrap first-use SSH and SCP multiplexed commands with a lock to avoid racing while the control socket is created. Also detect native OpenSSH mux master process names during stale connection cleanup and cover both orphaned and duplicate mux processes with tests.
295 lines
10 KiB
PHP
295 lines
10 KiB
PHP
<?php
|
|
|
|
namespace App\Jobs;
|
|
|
|
use App\Models\Server;
|
|
use Carbon\Carbon;
|
|
use Illuminate\Bus\Queueable;
|
|
use Illuminate\Contracts\Queue\ShouldQueue;
|
|
use Illuminate\Foundation\Bus\Dispatchable;
|
|
use Illuminate\Queue\InteractsWithQueue;
|
|
use Illuminate\Queue\SerializesModels;
|
|
use Illuminate\Support\Facades\Log;
|
|
use Illuminate\Support\Facades\Process;
|
|
use Illuminate\Support\Facades\Storage;
|
|
|
|
/**
 * Queued job that tidies up SSH multiplexing state: stale mux socket files on
 * the `ssh-mux` disk, duplicate or orphaned ssh master processes that point at
 * the mux directory, and orphaned `cloudflared access ssh` proxy processes.
 *
 * Every destructive action is gated by `constants.ssh.mux_orphan_reap_enabled`;
 * when that flag is falsy the job only logs what it would have done.
 */
class CleanupStaleMultiplexedConnections implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /**
     * Run every cleanup pass. File-based cleanup runs first, then the
     * process-based passes, each of which takes its own process snapshot.
     */
    public function handle()
    {
        $this->cleanupStaleConnections();
        $this->cleanupNonExistentServerConnections();
        $this->cleanupDuplicateSshProcesses();
        $this->cleanupOrphanedSshProcesses();
        $this->cleanupOrphanedCloudflaredProcesses();
    }

    /**
     * Once two background ssh masters share the same ControlPath, OpenSSH's
     * control socket state is no longer trustworthy: `ssh -O check` may report
     * one PID while the socket lifecycle is tied to another. Reset the whole
     * duplicate group rather than trying to choose an owner.
     */
    private function cleanupDuplicateSshProcesses(): void
    {
        $muxDir = $this->muxDirectory();
        $groups = [];

        // Bucket every ssh process that targets the mux directory by its ControlPath.
        foreach ($this->listProcesses() as $process) {
            $controlPath = $this->muxControlPath($process['args'], $muxDir);
            if ($controlPath === null) {
                continue;
            }

            $groups[$controlPath][] = $process;
        }

        foreach ($groups as $controlPath => $processes) {
            if (count($processes) >= 2) {
                $this->resetDuplicateGroup($controlPath, $processes);
            }
        }
    }

    /**
     * Kill backgrounded ssh master processes that lost the ControlPath socket
     * race. Such processes are not masters, so ControlPersist never reaps them
     * and they leak memory until the container restarts. A legitimate master
     * always owns its socket file; an orphan has none.
     *
     * Processes younger than the minimum age are skipped: a freshly forked
     * master creates its socket a few milliseconds after starting, so a young
     * process with no socket may simply be mid-establish rather than orphaned.
     */
    private function cleanupOrphanedSshProcesses(): void
    {
        $muxDir = $this->muxDirectory();
        $minAge = (int) config('constants.ssh.mux_orphan_min_age');

        foreach ($this->listProcesses() as $process) {
            // Only ever touch ssh processes pointing at the mux directory.
            $controlPath = $this->muxControlPath($process['args'], $muxDir);
            if ($controlPath === null) {
                continue;
            }

            if ($process['etimes'] >= $minAge && ! file_exists($controlPath)) {
                $this->reapOrphan('ssh', $process);
            }
        }
    }

    /**
     * Kill orphaned `cloudflared access ssh` proxy processes. Each is spawned
     * as the SSH ProxyCommand transport for a Cloudflare Tunnel server and must
     * die with its parent ssh. When that ssh is killed or orphaned (e.g. a lost
     * mux master), the cloudflared process can leak and accumulate. A legitimate
     * proxy always has a live ssh parent; one without is safe to reap.
     *
     * Processes younger than the minimum age are skipped so a proxy whose parent
     * ssh is still starting up, or a transient `ssh -O check` proxy mid-exit, is
     * never mistaken for an orphan.
     */
    private function cleanupOrphanedCloudflaredProcesses(): void
    {
        $minAge = (int) config('constants.ssh.mux_orphan_min_age');
        $processes = $this->listProcesses();

        // Index the PIDs of every live ssh process so parent lookups are O(1).
        $sshPids = [];
        foreach ($processes as $process) {
            // The ssh binary itself, not `cloudflared access ssh` (space before ssh).
            if (preg_match('#(^|/)ssh\s#', $process['args'])) {
                $sshPids[$process['pid']] = true;
            }
        }

        foreach ($processes as $process) {
            // `cloudflared access ssh`, never the `cloudflared tunnel` daemon.
            if (! str_contains($process['args'], 'cloudflared access ssh')) {
                continue;
            }

            // Orphaned when no live ssh process is its parent.
            if ($process['etimes'] >= $minAge && ! isset($sshPids[$process['ppid']])) {
                $this->reapOrphan('cloudflared', $process);
            }
        }
    }

    /**
     * Reap a detected orphan process. When orphan reaping is disabled, the
     * orphan is only logged — a dry-run mode that lets operators verify what
     * would be killed before enabling it for real.
     *
     * A failed `kill` is logged as a warning instead of being reported as a
     * successful kill.
     *
     * @param  string  $kind  Label used in the log message (e.g. `ssh`, `cloudflared`).
     * @param  array{pid: string, ppid: string, etimes: int, args: string}  $process
     */
    private function reapOrphan(string $kind, array $process): void
    {
        $context = [
            'pid' => $process['pid'],
            'etimes' => $process['etimes'],
            'command' => $process['args'],
        ];

        if (! config('constants.ssh.mux_orphan_reap_enabled')) {
            Log::info("Orphaned {$kind} process detected (dry-run, not killed)", $context);

            return;
        }

        if (! $this->killProcess($process['pid'])) {
            Log::warning("Failed to kill orphaned {$kind} process", $context);

            return;
        }

        Log::info("Killed orphaned {$kind} process", $context);
    }

    /**
     * Send the default `kill` signal to a PID.
     *
     * @return bool True when the `kill` command exited with status 0.
     */
    private function killProcess(string $pid): bool
    {
        return Process::run('kill '.escapeshellarg($pid))->exitCode() === 0;
    }

    /**
     * Snapshot of running processes, parsed from `ps`. Returns an empty list
     * when `ps` exits non-zero; lines that do not match the expected
     * `pid ppid etimes args` layout are skipped.
     *
     * @return list<array{pid: string, ppid: string, etimes: int, args: string}>
     */
    private function listProcesses(): array
    {
        $ps = Process::run('ps -ww -eo pid=,ppid=,etimes=,args=');
        if ($ps->exitCode() !== 0) {
            return [];
        }

        $processes = [];
        foreach (explode("\n", trim($ps->output())) as $line) {
            if (! preg_match('/^\s*(\d+)\s+(\d+)\s+(\d+)\s+(.*)$/', $line, $matches)) {
                continue;
            }

            $processes[] = [
                'pid' => $matches[1],
                'ppid' => $matches[2],
                'etimes' => (int) $matches[3],
                'args' => $matches[4],
            ];
        }

        return $processes;
    }

    /**
     * Kill every process in a duplicate ControlPath group and remove the shared
     * socket file. In dry-run mode (reaping disabled) the group is only logged.
     *
     * @param  list<array{pid: string, ppid: string, etimes: int, args: string}>  $processes
     */
    private function resetDuplicateGroup(string $controlPath, array $processes): void
    {
        $context = [
            'control_path' => $controlPath,
            'pids' => array_column($processes, 'pid'),
        ];

        if (! config('constants.ssh.mux_orphan_reap_enabled')) {
            Log::info('Duplicate ssh mux processes detected (dry-run, not killed)', $context);

            return;
        }

        $failedPids = [];
        foreach ($processes as $process) {
            if (! $this->killProcess($process['pid'])) {
                $failedPids[] = $process['pid'];
            }
        }

        if ($failedPids !== []) {
            Log::warning('Failed to kill some duplicate ssh mux processes', $context + ['failed_pids' => $failedPids]);
        }

        if (file_exists($controlPath)) {
            // Best-effort removal: the socket can vanish between the existence
            // check and the unlink, so a failure here is deliberately ignored.
            @unlink($controlPath);
        }

        Log::info('Reset duplicate ssh mux processes', $context);
    }

    /**
     * Absolute path of the directory that holds the mux control sockets.
     */
    private function muxDirectory(): string
    {
        return storage_path('app/ssh/mux');
    }

    /**
     * Return the ControlPath of a process only when it lives directly under
     * the given mux directory; null for every other process.
     */
    private function muxControlPath(string $args, string $muxDir): ?string
    {
        $controlPath = $this->extractControlPath($args);
        if ($controlPath === null || ! str_starts_with($controlPath, $muxDir.'/')) {
            return null;
        }

        return $controlPath;
    }

    /**
     * Extract the control socket path from a process command line.
     *
     * Two shapes are recognised: an explicit `-o ControlPath=...` option
     * (double-quoted, single-quoted or bare), and the `ssh: <path> [mux]`
     * process title that a native OpenSSH mux master shows.
     *
     * @return string|null The path, or null when neither shape matches.
     */
    private function extractControlPath(string $args): ?string
    {
        $optionPattern = '/(?:^|\s)-o\s+ControlPath=(?:"([^"]+)"|\'([^\']+)\'|(\S+))/';

        // PREG_UNMATCHED_AS_NULL makes the non-participating alternatives null,
        // so `??` picks the one that matched without relying on truthiness.
        if (preg_match($optionPattern, $args, $matches, PREG_UNMATCHED_AS_NULL)) {
            return $matches[1] ?? $matches[2] ?? $matches[3] ?? null;
        }

        if (preg_match('/^ssh:\s+(\S+)\s+\[mux\]$/', $args, $matches)) {
            return $matches[1];
        }

        return null;
    }

    /**
     * Walk every file on the `ssh-mux` disk and remove those that are stale:
     * no matching server, a failing `ssh -O check`, or past the persist time.
     */
    private function cleanupStaleConnections(): void
    {
        $muxDir = $this->muxDirectory();

        foreach (Storage::disk('ssh-mux')->files() as $muxFile) {
            $serverUuid = $this->extractServerUuidFromMuxFile($muxFile);
            $server = Server::where('uuid', $serverUuid)->first();

            if (! $server) {
                $this->removeMultiplexFile($muxFile, 'server_not_found');

                continue;
            }

            // Socket path and destination are escaped so unusual characters in
            // the file name, user or address cannot alter the shell command.
            $checkCommand = sprintf(
                'ssh -O check -o ControlPath=%s %s 2>/dev/null',
                escapeshellarg("{$muxDir}/{$muxFile}"),
                escapeshellarg("{$server->user}@{$server->ip}"),
            );

            if (Process::run($checkCommand)->exitCode() !== 0) {
                $this->removeMultiplexFile($muxFile, 'connection_check_failed');

                continue;
            }

            if ($this->isMuxFileExpired($muxFile)) {
                $this->removeMultiplexFile($muxFile, 'expired');
            }
        }
    }

    /**
     * Decide whether a mux file has outlived `constants.ssh.mux_persist_time`.
     *
     * The establishment time is parsed from the file content starting at byte
     * offset 37 (NOTE(review): presumably a fixed-width prefix precedes the
     * timestamp — confirm against the writer). Content that cannot be parsed
     * is logged and treated as not expired so one bad file cannot abort the
     * remaining cleanup passes.
     */
    private function isMuxFileExpired(string $muxFile): bool
    {
        $muxContent = (string) Storage::disk('ssh-mux')->get($muxFile);

        try {
            $establishedAt = Carbon::parse(substr($muxContent, 37));
        } catch (\InvalidArgumentException $exception) {
            Log::warning('Unable to parse mux establishment time', [
                'file' => $muxFile,
                'error' => $exception->getMessage(),
            ]);

            return false;
        }

        $expirationTime = $establishedAt->addSeconds(config('constants.ssh.mux_persist_time'));

        return Carbon::now()->isAfter($expirationTime);
    }

    /**
     * Remove mux files whose server UUID no longer exists in the database.
     */
    private function cleanupNonExistentServerConnections(): void
    {
        $muxFiles = Storage::disk('ssh-mux')->files();
        $existingServerUuids = Server::pluck('uuid')->toArray();

        foreach ($muxFiles as $muxFile) {
            $serverUuid = $this->extractServerUuidFromMuxFile($muxFile);

            // Strict comparison: loose `==` treats numeric-looking strings
            // such as "0e1" and "0e2" as equal.
            if (! in_array($serverUuid, $existingServerUuids, true)) {
                $this->removeMultiplexFile($muxFile, 'server_does_not_exist');
            }
        }
    }

    /**
     * Derive the server UUID from a mux file name by dropping its first four
     * characters (NOTE(review): presumably a `mux_` prefix — confirm).
     */
    private function extractServerUuidFromMuxFile(string $muxFile): string
    {
        return substr($muxFile, 4);
    }

    /**
     * Close and delete a stale mux socket file. When orphan reaping is disabled,
     * the file is only logged — a dry-run mode that lets operators verify what
     * would be removed before enabling it for real.
     *
     * @param  string  $muxFile  File name relative to the `ssh-mux` disk.
     * @param  string  $reason  Short machine-readable reason recorded in the log.
     */
    private function removeMultiplexFile(string $muxFile, string $reason): void
    {
        $context = [
            'file' => $muxFile,
            'reason' => $reason,
        ];

        if (! config('constants.ssh.mux_orphan_reap_enabled')) {
            Log::info('Stale mux file detected (dry-run, not removed)', $context);

            return;
        }

        // Ask any master still bound to the socket to exit before deleting it;
        // the result is intentionally ignored because the socket may be dead.
        $closeCommand = sprintf(
            'ssh -O exit -o ControlPath=%s localhost 2>/dev/null',
            escapeshellarg($this->muxDirectory().'/'.$muxFile),
        );
        Process::run($closeCommand);
        Storage::disk('ssh-mux')->delete($muxFile);

        Log::info('Removed stale mux file', $context);
    }
}
|