fix(deployment): prevent base deployments from being killed when PRs close (#7113)

- Fix container filtering to properly distinguish base deployments (pullRequestId=0) from PR deployments
- Cancel queued or in-progress deployments for a PR as soon as a webhook reports the PR closed, to prevent race conditions
- Prevent CleanupHelperContainersJob from killing active deployment containers
- Replace the vague "Oops" failure message with the actual error, and include the exit code and redacted command in command-failure errors
- Wrap each cleanup step in the deployment job's finally block in try/catch so cleanup errors cannot interfere with job failure handling and status updates
This commit is contained in:
Andras Bacsai 2025-11-09 14:41:35 +01:00
parent 775216e7a5
commit 67605d50fc
8 changed files with 269 additions and 40 deletions

View file

@ -246,6 +246,50 @@ public function manual(Request $request)
if ($action === 'closed') {
$found = ApplicationPreview::where('application_id', $application->id)->where('pull_request_id', $pull_request_id)->first();
if ($found) {
// Cancel any active deployments for this PR immediately
$activeDeployment = \App\Models\ApplicationDeploymentQueue::where('application_id', $application->id)
->where('pull_request_id', $pull_request_id)
->whereIn('status', [
\App\Enums\ApplicationDeploymentStatus::QUEUED->value,
\App\Enums\ApplicationDeploymentStatus::IN_PROGRESS->value,
])
->first();
if ($activeDeployment) {
try {
// Mark deployment as cancelled
$activeDeployment->update([
'status' => \App\Enums\ApplicationDeploymentStatus::CANCELLED_BY_USER->value,
]);
// Add cancellation log entry
$activeDeployment->addLogEntry('Deployment cancelled: Pull request closed.', 'stderr');
// Check if helper container exists and kill it
$deployment_uuid = $activeDeployment->deployment_uuid;
$server = $application->destination->server;
$checkCommand = "docker ps -a --filter name={$deployment_uuid} --format '{{.Names}}'";
$containerExists = instant_remote_process([$checkCommand], $server);
if ($containerExists && str($containerExists)->trim()->isNotEmpty()) {
instant_remote_process(["docker rm -f {$deployment_uuid}"], $server);
$activeDeployment->addLogEntry('Deployment container stopped.');
}
// Kill running process if process ID exists
if ($activeDeployment->current_process_id) {
try {
$processKillCommand = "kill -9 {$activeDeployment->current_process_id}";
instant_remote_process([$processKillCommand], $server);
} catch (\Throwable $e) {
// Process might already be gone
}
}
} catch (\Throwable $e) {
// Silently handle errors during deployment cancellation
}
}
DeleteResourceJob::dispatch($found);
$return_payloads->push([
'application' => $application->name,
@ -481,6 +525,51 @@ public function normal(Request $request)
if ($action === 'closed' || $action === 'close') {
$found = ApplicationPreview::where('application_id', $application->id)->where('pull_request_id', $pull_request_id)->first();
if ($found) {
// Cancel any active deployments for this PR immediately
$activeDeployment = \App\Models\ApplicationDeploymentQueue::where('application_id', $application->id)
->where('pull_request_id', $pull_request_id)
->whereIn('status', [
\App\Enums\ApplicationDeploymentStatus::QUEUED->value,
\App\Enums\ApplicationDeploymentStatus::IN_PROGRESS->value,
])
->first();
if ($activeDeployment) {
try {
// Mark deployment as cancelled
$activeDeployment->update([
'status' => \App\Enums\ApplicationDeploymentStatus::CANCELLED_BY_USER->value,
]);
// Add cancellation log entry
$activeDeployment->addLogEntry('Deployment cancelled: Pull request closed.', 'stderr');
// Check if helper container exists and kill it
$deployment_uuid = $activeDeployment->deployment_uuid;
$server = $application->destination->server;
$checkCommand = "docker ps -a --filter name={$deployment_uuid} --format '{{.Names}}'";
$containerExists = instant_remote_process([$checkCommand], $server);
if ($containerExists && str($containerExists)->trim()->isNotEmpty()) {
instant_remote_process(["docker rm -f {$deployment_uuid}"], $server);
$activeDeployment->addLogEntry('Deployment container stopped.');
}
// Kill running process if process ID exists
if ($activeDeployment->current_process_id) {
try {
$processKillCommand = "kill -9 {$activeDeployment->current_process_id}";
instant_remote_process([$processKillCommand], $server);
} catch (\Throwable $e) {
// Process might already be gone
}
}
} catch (\Throwable $e) {
// Silently handle errors during deployment cancellation
}
}
// Clean up any deployed containers
$containers = getCurrentApplicationContainerStatus($application->destination->server, $application->id, $pull_request_id);
if ($containers->isNotEmpty()) {
$containers->each(function ($container) use ($application) {

View file

@ -341,20 +341,42 @@ public function handle(): void
$this->fail($e);
throw $e;
} finally {
$this->application_deployment_queue->update([
'finished_at' => Carbon::now()->toImmutable(),
]);
if ($this->use_build_server) {
$this->server = $this->build_server;
} else {
$this->write_deployment_configurations();
// Wrap cleanup operations in try-catch to prevent exceptions from interfering
// with Laravel's job failure handling and status updates
try {
$this->application_deployment_queue->update([
'finished_at' => Carbon::now()->toImmutable(),
]);
} catch (Exception $e) {
// Log but don't fail - finished_at is not critical
\Log::warning('Failed to update finished_at for deployment '.$this->deployment_uuid.': '.$e->getMessage());
}
$this->application_deployment_queue->addLogEntry("Gracefully shutting down build container: {$this->deployment_uuid}");
$this->graceful_shutdown_container($this->deployment_uuid);
try {
if ($this->use_build_server) {
$this->server = $this->build_server;
} else {
$this->write_deployment_configurations();
}
} catch (Exception $e) {
// Log but don't fail - configuration writing errors shouldn't prevent status updates
$this->application_deployment_queue->addLogEntry('Warning: Failed to write deployment configurations: '.$e->getMessage(), 'stderr');
}
ServiceStatusChanged::dispatch(data_get($this->application, 'environment.project.team.id'));
try {
$this->application_deployment_queue->addLogEntry("Gracefully shutting down build container: {$this->deployment_uuid}");
$this->graceful_shutdown_container($this->deployment_uuid);
} catch (Exception $e) {
// Log but don't fail - container cleanup errors are expected when container is already gone
\Log::warning('Failed to shutdown container '.$this->deployment_uuid.': '.$e->getMessage());
}
try {
ServiceStatusChanged::dispatch(data_get($this->application, 'environment.project.team.id'));
} catch (Exception $e) {
// Log but don't fail - event dispatch errors shouldn't prevent status updates
\Log::warning('Failed to dispatch ServiceStatusChanged for deployment '.$this->deployment_uuid.': '.$e->getMessage());
}
}
}
@ -3798,10 +3820,8 @@ private function failDeployment(): void
public function failed(Throwable $exception): void
{
$this->failDeployment();
$this->application_deployment_queue->addLogEntry('Oops something is not okay, are you okay? 😢', 'stderr');
if (str($exception->getMessage())->isNotEmpty()) {
$this->application_deployment_queue->addLogEntry($exception->getMessage(), 'stderr');
}
$errorMessage = $exception->getMessage() ?: 'Unknown error occurred';
$this->application_deployment_queue->addLogEntry("Deployment failed: {$errorMessage}", 'stderr');
if ($this->application->build_pack !== 'dockercompose') {
$code = $exception->getCode();

View file

@ -2,6 +2,8 @@
namespace App\Jobs;
use App\Enums\ApplicationDeploymentStatus;
use App\Models\ApplicationDeploymentQueue;
use App\Models\Server;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldBeEncrypted;
@ -20,10 +22,51 @@ public function __construct(public Server $server) {}
public function handle(): void
{
try {
// Get all active deployments on this server
$activeDeployments = ApplicationDeploymentQueue::where('server_id', $this->server->id)
->whereIn('status', [
ApplicationDeploymentStatus::IN_PROGRESS->value,
ApplicationDeploymentStatus::QUEUED->value,
])
->pluck('deployment_uuid')
->toArray();
\Log::info('CleanupHelperContainersJob - Active deployments', [
'server' => $this->server->name,
'active_deployment_uuids' => $activeDeployments,
]);
$containers = instant_remote_process_with_timeout(['docker container ps --format \'{{json .}}\' | jq -s \'map(select(.Image | contains("'.config('constants.coolify.registry_url').'/coollabsio/coolify-helper")))\''], $this->server, false);
$containerIds = collect(json_decode($containers))->pluck('ID');
if ($containerIds->count() > 0) {
foreach ($containerIds as $containerId) {
$helperContainers = collect(json_decode($containers));
if ($helperContainers->count() > 0) {
foreach ($helperContainers as $container) {
$containerId = data_get($container, 'ID');
$containerName = data_get($container, 'Names');
// Check if this container belongs to an active deployment
$isActiveDeployment = false;
foreach ($activeDeployments as $deploymentUuid) {
if (str_contains($containerName, $deploymentUuid)) {
$isActiveDeployment = true;
break;
}
}
if ($isActiveDeployment) {
\Log::info('CleanupHelperContainersJob - Skipping active deployment container', [
'container' => $containerName,
'id' => $containerId,
]);
continue;
}
\Log::info('CleanupHelperContainersJob - Removing orphaned helper container', [
'container' => $containerName,
'id' => $containerId,
]);
instant_remote_process_with_timeout(['docker container rm -f '.$containerId], $this->server, false);
}
}

View file

@ -124,6 +124,51 @@ private function deleteApplicationPreview()
$this->resource->delete();
}
// Cancel any active deployments for this PR (same logic as API cancel_deployment)
$activeDeployment = \App\Models\ApplicationDeploymentQueue::where('application_id', $application->id)
->where('pull_request_id', $pull_request_id)
->whereIn('status', [
\App\Enums\ApplicationDeploymentStatus::QUEUED->value,
\App\Enums\ApplicationDeploymentStatus::IN_PROGRESS->value,
])
->first();
if ($activeDeployment) {
try {
// Mark deployment as cancelled
$activeDeployment->update([
'status' => \App\Enums\ApplicationDeploymentStatus::CANCELLED_BY_USER->value,
]);
// Add cancellation log entry
$activeDeployment->addLogEntry('Deployment cancelled: Pull request closed.', 'stderr');
// Check if helper container exists and kill it
$deployment_uuid = $activeDeployment->deployment_uuid;
$checkCommand = "docker ps -a --filter name={$deployment_uuid} --format '{{.Names}}'";
$containerExists = instant_remote_process([$checkCommand], $server);
if ($containerExists && str($containerExists)->trim()->isNotEmpty()) {
instant_remote_process(["docker rm -f {$deployment_uuid}"], $server);
$activeDeployment->addLogEntry('Deployment container stopped.');
} else {
$activeDeployment->addLogEntry('Helper container not yet started. Deployment will be cancelled when job checks status.');
}
// Kill running process if process ID exists
if ($activeDeployment->current_process_id) {
try {
$processKillCommand = "kill -9 {$activeDeployment->current_process_id}";
instant_remote_process([$processKillCommand], $server);
} catch (\Throwable $e) {
// Process might already be gone
}
}
} catch (\Throwable $e) {
// Silently handle errors during deployment cancellation
}
}
try {
if ($server->isSwarm()) {
instant_remote_process(["docker stack rm {$application->uuid}-{$pull_request_id}"], $server);
@ -133,7 +178,7 @@ private function deleteApplicationPreview()
}
} catch (\Throwable $e) {
// Log the error but don't fail the job
ray('Error stopping preview containers: '.$e->getMessage());
\Log::warning('Error stopping preview containers for application '.$application->uuid.', PR #'.$pull_request_id.': '.$e->getMessage());
}
// Finally, force delete to trigger resource cleanup
@ -156,7 +201,6 @@ private function stopPreviewContainers(array $containers, $server, int $timeout
"docker stop --time=$timeout $containerList",
"docker rm -f $containerList",
];
instant_remote_process(
command: $commands,
server: $server,

View file

@ -219,9 +219,22 @@ private function executeCommandWithProcess($command, $hidden, $customType, $appe
$process_result = $process->wait();
if ($process_result->exitCode() !== 0) {
if (! $ignore_errors) {
// Check if deployment was cancelled while command was running
if (isset($this->application_deployment_queue)) {
$this->application_deployment_queue->refresh();
if ($this->application_deployment_queue->status === \App\Enums\ApplicationDeploymentStatus::CANCELLED_BY_USER->value) {
throw new \RuntimeException('Deployment cancelled by user', 69420);
}
}
// Don't immediately set to FAILED - let the retry logic handle it
// This prevents premature status changes during retryable SSH errors
throw new \RuntimeException($process_result->errorOutput());
$error = $process_result->errorOutput();
if (empty($error)) {
$error = $process_result->output() ?: 'Command failed with no error output';
}
$redactedCommand = $this->redact_sensitive_info($command);
throw new \RuntimeException("Command execution failed (exit code {$process_result->exitCode()}): {$redactedCommand}\nError: {$error}");
}
}
}

View file

@ -17,13 +17,31 @@ function getCurrentApplicationContainerStatus(Server $server, int $id, ?int $pul
if (! $server->isSwarm()) {
$containers = instant_remote_process(["docker ps -a --filter='label=coolify.applicationId={$id}' --format '{{json .}}' "], $server);
$containers = format_docker_command_output_to_json($containers);
$containers = $containers->map(function ($container) use ($pullRequestId, $includePullrequests) {
$labels = data_get($container, 'Labels');
if (! str($labels)->contains('coolify.pullRequestId=')) {
data_set($container, 'Labels', $labels.",coolify.pullRequestId={$pullRequestId}");
$containerName = data_get($container, 'Names');
$hasPrLabel = str($labels)->contains('coolify.pullRequestId=');
$prLabelValue = null;
if ($hasPrLabel) {
preg_match('/coolify\.pullRequestId=(\d+)/', $labels, $matches);
$prLabelValue = $matches[1] ?? null;
}
// Treat pullRequestId=0 or missing label as base deployment (convention: 0 = no PR)
$isBaseDeploy = ! $hasPrLabel || (int) $prLabelValue === 0;
// If we're looking for a specific PR and this is a base deployment, exclude it
if ($pullRequestId !== null && $pullRequestId !== 0 && $isBaseDeploy) {
return null;
}
// If this is a base deployment, include it when not filtering for PRs
if ($isBaseDeploy) {
return $container;
}
if ($includePullrequests) {
return $container;
}
@ -34,7 +52,9 @@ function getCurrentApplicationContainerStatus(Server $server, int $id, ?int $pul
return null;
});
return $containers->filter();
$filtered = $containers->filter();
return $filtered;
}
return $containers;

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long