fix(scheduler): add self-healing for stale Redis locks and detection in UI (#8618)

This commit is contained in:
Andras Bacsai 2026-02-25 12:08:45 +01:00 committed by GitHub
commit f3b63b4d8d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 161 additions and 2 deletions

View file

@ -40,7 +40,7 @@ protected function schedule(Schedule $schedule): void
}
// $this->scheduleInstance->job(new CleanupStaleMultiplexedConnections)->hourly();
$this->scheduleInstance->command('cleanup:redis')->weekly();
$this->scheduleInstance->command('cleanup:redis --clear-locks')->daily();
if (isDev()) {
// Instance Jobs

View file

@ -15,7 +15,9 @@
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Carbon;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Redis;
class ScheduledJobManager implements ShouldQueue
{
@ -54,6 +56,11 @@ private function determineQueue(): string
*/
public function middleware(): array
{
// Self-healing: clear any stale lock before WithoutOverlapping tries to acquire it.
// Stale locks (TTL = -1) can occur during upgrades, Redis restarts, or edge cases.
// @see https://github.com/coollabsio/coolify/issues/8327
self::clearStaleLockIfPresent();
return [
(new WithoutOverlapping('scheduled-job-manager'))
->expireAfter(90) // Lock expires after 90s to handle high-load environments with many tasks
@ -61,6 +68,34 @@ public function middleware(): array
];
}
/**
* Clear a stale WithoutOverlapping lock if it has no TTL (TTL = -1).
*
* This provides continuous self-healing since it runs every time the job is dispatched.
* Stale locks permanently block all scheduled job executions with no user-visible error.
*/
private static function clearStaleLockIfPresent(): void
{
try {
$cachePrefix = config('cache.prefix', '');
$lockKey = $cachePrefix.'laravel-queue-overlap:'.self::class.':scheduled-job-manager';
$ttl = Redis::connection('default')->ttl($lockKey);
if ($ttl === -1) {
Redis::connection('default')->del($lockKey);
Log::channel('scheduled')->warning('Cleared stale ScheduledJobManager lock', [
'lock_key' => $lockKey,
]);
}
} catch (\Throwable $e) {
// Never let lock cleanup failure prevent the job from running
Log::channel('scheduled-errors')->error('Failed to check/clear stale lock', [
'error' => $e->getMessage(),
]);
}
}
public function handle(): void
{
// Freeze the execution time at the start of the job
@ -108,6 +143,13 @@ public function handle(): void
'dispatched' => $this->dispatchedCount,
'skipped' => $this->skippedCount,
]);
// Write heartbeat so the UI can detect when the scheduler has stopped
try {
Cache::put('scheduled-job-manager:heartbeat', now()->toIso8601String(), 300);
} catch (\Throwable) {
// Non-critical; don't let heartbeat failure affect the job
}
}
private function processScheduledBackups(): void

View file

@ -3,8 +3,13 @@
namespace App\Livewire\Server;
use App\Jobs\DockerCleanupJob;
use App\Models\DockerCleanupExecution;
use App\Models\Server;
use Cron\CronExpression;
use Illuminate\Foundation\Auth\Access\AuthorizesRequests;
use Illuminate\Support\Carbon;
use Illuminate\Support\Facades\Cache;
use Livewire\Attributes\Computed;
use Livewire\Attributes\Validate;
use Livewire\Component;
@ -34,6 +39,53 @@ class DockerCleanup extends Component
#[Validate('boolean')]
public bool $disableApplicationImageRetention = false;
#[Computed]
public function isCleanupStale(): bool
{
try {
$lastExecution = DockerCleanupExecution::where('server_id', $this->server->id)
->orderBy('created_at', 'desc')
->first();
if (! $lastExecution) {
return false;
}
$frequency = $this->server->settings->docker_cleanup_frequency ?? '0 0 * * *';
if (isset(VALID_CRON_STRINGS[$frequency])) {
$frequency = VALID_CRON_STRINGS[$frequency];
}
$cron = new CronExpression($frequency);
$now = Carbon::now();
$nextRun = Carbon::parse($cron->getNextRunDate($now));
$afterThat = Carbon::parse($cron->getNextRunDate($nextRun));
$intervalMinutes = $nextRun->diffInMinutes($afterThat);
$threshold = max($intervalMinutes * 2, 10);
return Carbon::parse($lastExecution->created_at)->diffInMinutes($now) > $threshold;
} catch (\Throwable) {
return false;
}
}
#[Computed]
public function lastExecutionTime(): ?string
{
return DockerCleanupExecution::where('server_id', $this->server->id)
->orderBy('created_at', 'desc')
->first()
?->created_at
?->diffForHumans();
}
#[Computed]
public function isSchedulerHealthy(): bool
{
return Cache::get('scheduled-job-manager:heartbeat') !== null;
}
public function mount(string $server_uuid)
{
try {

View file

@ -27,6 +27,22 @@
<div class="mt-1 mb-6">Configure Docker cleanup settings for your server.</div>
</div>
@if ($this->isCleanupStale)
<div class="mb-4">
<x-callout type="warning" title="Docker Cleanup May Be Stalled">
<p>The last Docker cleanup ran {{ $this->lastExecutionTime ?? 'unknown time' }} ago,
which is longer than expected for the configured frequency.</p>
@if (!$this->isSchedulerHealthy)
<p class="mt-1">The scheduled job manager appears to be inactive. This may indicate
a stale Redis lock is blocking all scheduled jobs.</p>
@endif
<p class="mt-2">To resolve, run on your Coolify instance:
<code class="bg-black/10 dark:bg-white/10 px-1 rounded">php artisan cleanup:redis --clear-locks</code>
</p>
</x-callout>
</div>
@endif
<div class="flex flex-col gap-2">
<div class="flex gap-4">
<h3>Cleanup Configuration</h3>

View file

@ -0,0 +1,49 @@
<?php
use App\Jobs\ScheduledJobManager;
use Illuminate\Support\Facades\Redis;
it('clears stale lock when TTL is -1', function () {
$cachePrefix = config('cache.prefix');
$lockKey = $cachePrefix.'laravel-queue-overlap:'.ScheduledJobManager::class.':scheduled-job-manager';
$redis = Redis::connection('default');
$redis->set($lockKey, 'stale-owner');
expect($redis->ttl($lockKey))->toBe(-1);
$job = new ScheduledJobManager;
$job->middleware();
expect($redis->exists($lockKey))->toBe(0);
});
it('preserves valid lock with positive TTL', function () {
$cachePrefix = config('cache.prefix');
$lockKey = $cachePrefix.'laravel-queue-overlap:'.ScheduledJobManager::class.':scheduled-job-manager';
$redis = Redis::connection('default');
$redis->set($lockKey, 'active-owner');
$redis->expire($lockKey, 60);
expect($redis->ttl($lockKey))->toBeGreaterThan(0);
$job = new ScheduledJobManager;
$job->middleware();
expect($redis->exists($lockKey))->toBe(1);
$redis->del($lockKey);
});
it('does not fail when no lock exists', function () {
$cachePrefix = config('cache.prefix');
$lockKey = $cachePrefix.'laravel-queue-overlap:'.ScheduledJobManager::class.':scheduled-job-manager';
Redis::connection('default')->del($lockKey);
$job = new ScheduledJobManager;
$middleware = $job->middleware();
expect($middleware)->toBeArray()->toHaveCount(1);
});

View file

@ -24,7 +24,7 @@
$expiresAfterProperty->setAccessible(true);
$expiresAfter = $expiresAfterProperty->getValue($overlappingMiddleware);
expect($expiresAfter)->toBe(60)
expect($expiresAfter)->toBe(90)
->and($expiresAfter)->toBeGreaterThan(0, 'expireAfter must be set to prevent stale locks');
// Check releaseAfter is NOT set (we use dontRelease)