fix(scheduler): add self-healing for stale Redis locks and detection in UI (#8618)
This commit is contained in:
commit
f3b63b4d8d
6 changed files with 161 additions and 2 deletions
|
|
@ -40,7 +40,7 @@ protected function schedule(Schedule $schedule): void
|
|||
}
|
||||
|
||||
// $this->scheduleInstance->job(new CleanupStaleMultiplexedConnections)->hourly();
|
||||
$this->scheduleInstance->command('cleanup:redis')->weekly();
|
||||
$this->scheduleInstance->command('cleanup:redis --clear-locks')->daily();
|
||||
|
||||
if (isDev()) {
|
||||
// Instance Jobs
|
||||
|
|
|
|||
|
|
@ -15,7 +15,9 @@
|
|||
use Illuminate\Queue\SerializesModels;
|
||||
use Illuminate\Support\Carbon;
|
||||
use Illuminate\Support\Collection;
|
||||
use Illuminate\Support\Facades\Cache;
|
||||
use Illuminate\Support\Facades\Log;
|
||||
use Illuminate\Support\Facades\Redis;
|
||||
|
||||
class ScheduledJobManager implements ShouldQueue
|
||||
{
|
||||
|
|
@ -54,6 +56,11 @@ private function determineQueue(): string
|
|||
*/
|
||||
public function middleware(): array
|
||||
{
|
||||
// Self-healing: clear any stale lock before WithoutOverlapping tries to acquire it.
|
||||
// Stale locks (TTL = -1) can occur during upgrades, Redis restarts, or edge cases.
|
||||
// @see https://github.com/coollabsio/coolify/issues/8327
|
||||
self::clearStaleLockIfPresent();
|
||||
|
||||
return [
|
||||
(new WithoutOverlapping('scheduled-job-manager'))
|
||||
->expireAfter(90) // Lock expires after 90s to handle high-load environments with many tasks
|
||||
|
|
@ -61,6 +68,34 @@ public function middleware(): array
|
|||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear a stale WithoutOverlapping lock if it has no TTL (TTL = -1).
|
||||
*
|
||||
* This provides continuous self-healing since it runs every time the job is dispatched.
|
||||
* Stale locks permanently block all scheduled job executions with no user-visible error.
|
||||
*/
|
||||
private static function clearStaleLockIfPresent(): void
|
||||
{
|
||||
try {
|
||||
$cachePrefix = config('cache.prefix', '');
|
||||
$lockKey = $cachePrefix.'laravel-queue-overlap:'.self::class.':scheduled-job-manager';
|
||||
|
||||
$ttl = Redis::connection('default')->ttl($lockKey);
|
||||
|
||||
if ($ttl === -1) {
|
||||
Redis::connection('default')->del($lockKey);
|
||||
Log::channel('scheduled')->warning('Cleared stale ScheduledJobManager lock', [
|
||||
'lock_key' => $lockKey,
|
||||
]);
|
||||
}
|
||||
} catch (\Throwable $e) {
|
||||
// Never let lock cleanup failure prevent the job from running
|
||||
Log::channel('scheduled-errors')->error('Failed to check/clear stale lock', [
|
||||
'error' => $e->getMessage(),
|
||||
]);
|
||||
}
|
||||
}
|
||||
|
||||
public function handle(): void
|
||||
{
|
||||
// Freeze the execution time at the start of the job
|
||||
|
|
@ -108,6 +143,13 @@ public function handle(): void
|
|||
'dispatched' => $this->dispatchedCount,
|
||||
'skipped' => $this->skippedCount,
|
||||
]);
|
||||
|
||||
// Write heartbeat so the UI can detect when the scheduler has stopped
|
||||
try {
|
||||
Cache::put('scheduled-job-manager:heartbeat', now()->toIso8601String(), 300);
|
||||
} catch (\Throwable) {
|
||||
// Non-critical; don't let heartbeat failure affect the job
|
||||
}
|
||||
}
|
||||
|
||||
private function processScheduledBackups(): void
|
||||
|
|
|
|||
|
|
@ -3,8 +3,13 @@
|
|||
namespace App\Livewire\Server;
|
||||
|
||||
use App\Jobs\DockerCleanupJob;
|
||||
use App\Models\DockerCleanupExecution;
|
||||
use App\Models\Server;
|
||||
use Cron\CronExpression;
|
||||
use Illuminate\Foundation\Auth\Access\AuthorizesRequests;
|
||||
use Illuminate\Support\Carbon;
|
||||
use Illuminate\Support\Facades\Cache;
|
||||
use Livewire\Attributes\Computed;
|
||||
use Livewire\Attributes\Validate;
|
||||
use Livewire\Component;
|
||||
|
||||
|
|
@ -34,6 +39,53 @@ class DockerCleanup extends Component
|
|||
#[Validate('boolean')]
|
||||
public bool $disableApplicationImageRetention = false;
|
||||
|
||||
#[Computed]
|
||||
public function isCleanupStale(): bool
|
||||
{
|
||||
try {
|
||||
$lastExecution = DockerCleanupExecution::where('server_id', $this->server->id)
|
||||
->orderBy('created_at', 'desc')
|
||||
->first();
|
||||
|
||||
if (! $lastExecution) {
|
||||
return false;
|
||||
}
|
||||
|
||||
$frequency = $this->server->settings->docker_cleanup_frequency ?? '0 0 * * *';
|
||||
if (isset(VALID_CRON_STRINGS[$frequency])) {
|
||||
$frequency = VALID_CRON_STRINGS[$frequency];
|
||||
}
|
||||
|
||||
$cron = new CronExpression($frequency);
|
||||
$now = Carbon::now();
|
||||
$nextRun = Carbon::parse($cron->getNextRunDate($now));
|
||||
$afterThat = Carbon::parse($cron->getNextRunDate($nextRun));
|
||||
$intervalMinutes = $nextRun->diffInMinutes($afterThat);
|
||||
|
||||
$threshold = max($intervalMinutes * 2, 10);
|
||||
|
||||
return Carbon::parse($lastExecution->created_at)->diffInMinutes($now) > $threshold;
|
||||
} catch (\Throwable) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#[Computed]
|
||||
public function lastExecutionTime(): ?string
|
||||
{
|
||||
return DockerCleanupExecution::where('server_id', $this->server->id)
|
||||
->orderBy('created_at', 'desc')
|
||||
->first()
|
||||
?->created_at
|
||||
?->diffForHumans();
|
||||
}
|
||||
|
||||
#[Computed]
|
||||
public function isSchedulerHealthy(): bool
|
||||
{
|
||||
return Cache::get('scheduled-job-manager:heartbeat') !== null;
|
||||
}
|
||||
|
||||
public function mount(string $server_uuid)
|
||||
{
|
||||
try {
|
||||
|
|
|
|||
|
|
@ -27,6 +27,22 @@
|
|||
<div class="mt-1 mb-6">Configure Docker cleanup settings for your server.</div>
|
||||
</div>
|
||||
|
||||
@if ($this->isCleanupStale)
|
||||
<div class="mb-4">
|
||||
<x-callout type="warning" title="Docker Cleanup May Be Stalled">
|
||||
<p>The last Docker cleanup ran {{ $this->lastExecutionTime ?? 'unknown time' }} ago,
|
||||
which is longer than expected for the configured frequency.</p>
|
||||
@if (!$this->isSchedulerHealthy)
|
||||
<p class="mt-1">The scheduled job manager appears to be inactive. This may indicate
|
||||
a stale Redis lock is blocking all scheduled jobs.</p>
|
||||
@endif
|
||||
<p class="mt-2">To resolve, run on your Coolify instance:
|
||||
<code class="bg-black/10 dark:bg-white/10 px-1 rounded">php artisan cleanup:redis --clear-locks</code>
|
||||
</p>
|
||||
</x-callout>
|
||||
</div>
|
||||
@endif
|
||||
|
||||
<div class="flex flex-col gap-2">
|
||||
<div class="flex gap-4">
|
||||
<h3>Cleanup Configuration</h3>
|
||||
|
|
|
|||
49
tests/Feature/ScheduledJobManagerStaleLockTest.php
Normal file
49
tests/Feature/ScheduledJobManagerStaleLockTest.php
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
<?php
|
||||
|
||||
use App\Jobs\ScheduledJobManager;
|
||||
use Illuminate\Support\Facades\Redis;
|
||||
|
||||
it('clears stale lock when TTL is -1', function () {
|
||||
$cachePrefix = config('cache.prefix');
|
||||
$lockKey = $cachePrefix.'laravel-queue-overlap:'.ScheduledJobManager::class.':scheduled-job-manager';
|
||||
|
||||
$redis = Redis::connection('default');
|
||||
$redis->set($lockKey, 'stale-owner');
|
||||
|
||||
expect($redis->ttl($lockKey))->toBe(-1);
|
||||
|
||||
$job = new ScheduledJobManager;
|
||||
$job->middleware();
|
||||
|
||||
expect($redis->exists($lockKey))->toBe(0);
|
||||
});
|
||||
|
||||
it('preserves valid lock with positive TTL', function () {
|
||||
$cachePrefix = config('cache.prefix');
|
||||
$lockKey = $cachePrefix.'laravel-queue-overlap:'.ScheduledJobManager::class.':scheduled-job-manager';
|
||||
|
||||
$redis = Redis::connection('default');
|
||||
$redis->set($lockKey, 'active-owner');
|
||||
$redis->expire($lockKey, 60);
|
||||
|
||||
expect($redis->ttl($lockKey))->toBeGreaterThan(0);
|
||||
|
||||
$job = new ScheduledJobManager;
|
||||
$job->middleware();
|
||||
|
||||
expect($redis->exists($lockKey))->toBe(1);
|
||||
|
||||
$redis->del($lockKey);
|
||||
});
|
||||
|
||||
it('does not fail when no lock exists', function () {
|
||||
$cachePrefix = config('cache.prefix');
|
||||
$lockKey = $cachePrefix.'laravel-queue-overlap:'.ScheduledJobManager::class.':scheduled-job-manager';
|
||||
|
||||
Redis::connection('default')->del($lockKey);
|
||||
|
||||
$job = new ScheduledJobManager;
|
||||
$middleware = $job->middleware();
|
||||
|
||||
expect($middleware)->toBeArray()->toHaveCount(1);
|
||||
});
|
||||
|
|
@ -24,7 +24,7 @@
|
|||
$expiresAfterProperty->setAccessible(true);
|
||||
$expiresAfter = $expiresAfterProperty->getValue($overlappingMiddleware);
|
||||
|
||||
expect($expiresAfter)->toBe(60)
|
||||
expect($expiresAfter)->toBe(90)
|
||||
->and($expiresAfter)->toBeGreaterThan(0, 'expireAfter must be set to prevent stale locks');
|
||||
|
||||
// Check releaseAfter is NOT set (we use dontRelease)
|
||||
|
|
|
|||
Loading…
Reference in a new issue