fix(docker): queue/scheduler/horizon resilience + container HEALTHCHECK

- start-container: generous startretries + startsecs and graceful SIGTERM stopwaitsecs for queue/scheduler/horizon, so a transient boot failure (e.g. DB not ready) no longer marks a worker FATAL forever and silently drops jobs, and an in-flight job finishes before SIGKILL on deploy/restart.
- container-health: new HEALTHCHECK that reports UNHEALTHY when php-fpm/nginx or any enabled queue/scheduler/horizon worker is not RUNNING, so a dead worker surfaces in docker ps / orchestration instead of letting jobs pile up.
2026-06-12 15:57:59 +02:00 · 2026-06-12 15:57:59 +02:00 · 9cb3ae5bce
parent c4fb894699
commit 9cb3ae5bce
3 changed files with 82 additions and 6 deletions
--- a/12
+++ b/12
@ -154,10 +154,11 @@ RUN mkdir -p /etc/supervisor/conf.d /etc/supervisor/laravel.d /etc/supervisor/cu
             /var/log/supervisor /var/log/nginx /var/log/php

 # ---------------------------------------------------------------------------
-# Startup script
+# Startup script + healthcheck
 # ---------------------------------------------------------------------------
 COPY scripts/start-container /usr/local/bin/start-container
-RUN chmod +x /usr/local/bin/start-container
+COPY scripts/container-health /usr/local/bin/container-health
+RUN chmod +x /usr/local/bin/start-container /usr/local/bin/container-health

 # Composer cache
 RUN mkdir -p /.composer && chmod 0777 /.composer
@ -179,5 +180,10 @@ ENV MYSQL_CLIENT_VERIFY=OFF

 EXPOSE 80

-ENTRYPOINT ["start-container"]
+# Report UNHEALTHY when php-fpm, nginx, or an enabled queue/scheduler/horizon
+# worker is not RUNNING, so it surfaces in `docker ps`/orchestration. start-period
+# gives supervisord + a slow DB time to come up. Keep ENTRYPOINT: it sets those programs up.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=90s --retries=3 \
+    CMD container-health
+ENTRYPOINT ["start-container"]

--- a/scripts/container-health
+++ b/scripts/container-health
@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+# ===========================================================================
+# container-health — Docker HEALTHCHECK for the docker-laravel image.
+#
+# Asserts that every supervisor program that is SUPPOSED to be running (the
+# core web stack, plus whichever ENABLE_* services are on) is actually in the
+# RUNNING state. If any process of a queue worker or scheduler has crashed and
+# supervisor gave up (FATAL), or never started, the container is reported
+# UNHEALTHY — so a dead worker is visible in `docker ps` / orchestration
+# instead of silently letting jobs pile up.
+#
+# Exit 0 = healthy, 1 = unhealthy. Used by HEALTHCHECK in the Dockerfile.
+# ===========================================================================
+set -uo pipefail
+
+# Programs that must always be up.
+expected=(php-fpm nginx)
+
+# Optional services, gated by the same env vars start-container reads.
+[ "${ENABLE_QUEUE:-false}" = "true" ]     && expected+=(queue)
+[ "${ENABLE_SCHEDULER:-false}" = "true" ] && expected+=(scheduler)
+[ "${ENABLE_HORIZON:-false}" = "true" ]   && expected+=(horizon)
+
+status="$(supervisorctl status 2>/dev/null)"
+if [ -z "$status" ]; then
+    echo "UNHEALTHY: supervisorctl returned no status (supervisord down?)"
+    exit 1
+fi
+
+rc=0
+for p in "${expected[@]}"; do
+    # Rows for this program: plain "queue   RUNNING   pid 123, ..." or grouped "queue:queue_00".
+    rows="$(grep -E "^${p}([:_][^ ]*)?[[:space:]]" <<<"$status")" || rows="${p}  (missing)"
+    # EVERY row must be RUNNING, so a RUNNING sibling cannot mask a FATAL one. Here-strings
+    # avoid `echo | grep -q`, where pipefail turns an early grep exit (SIGPIPE) into a failure.
+    bad="$(grep -vE '^[^[:space:]]+[[:space:]]+RUNNING' <<<"$rows")" || continue
+    echo "UNHEALTHY: '${p}' is not RUNNING -> ${bad}"
+    rc=1
+done
+exit "$rc"
--- a/scripts/start-container
+++ b/scripts/start-container
@ -118,9 +118,19 @@ if [ "${ENABLE_QUEUE:-false}" = "true" ]; then
 command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan queue:work --tries=3 --sleep=5 --timeout=600 --max-jobs=500 --max-time=3600
 autostart=true
 autorestart=true
+; A few rapid boot failures (e.g. DB not ready yet) must NOT make supervisor
+; give up and mark the worker FATAL forever — that's how a queue silently dies
+; and jobs pile up. Keep retrying generously; once it stays up past startsecs
+; the counter resets, so the hourly --max-time exit never trips it.
+startretries=30
+startsecs=5
+; Graceful shutdown on deploy/restart: queue:work handles SIGTERM by finishing
+; the current job then exiting. Wait longer than --timeout before SIGKILL so an
+; in-flight job is never killed mid-write (which would fail + retry it).
+stopsignal=TERM
+stopwaitsecs=630
 user=www-data
 priority=20
-startsecs=5
 stdout_logfile=/proc/1/fd/1
 stdout_logfile_maxbytes=0
 stderr_logfile=/proc/1/fd/2
@ -135,9 +145,12 @@ if [ "${ENABLE_SCHEDULER:-false}" = "true" ]; then
 command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan schedule:work
 autostart=true
 autorestart=true
+startretries=30
+startsecs=5
+stopsignal=TERM
+stopwaitsecs=60
 user=www-data
 priority=20
-startsecs=5
 stdout_logfile=/proc/1/fd/1
 stdout_logfile_maxbytes=0
 stderr_logfile=/proc/1/fd/2
@ -152,9 +165,13 @@ if [ "${ENABLE_HORIZON:-false}" = "true" ]; then
 command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan horizon
 autostart=true
 autorestart=true
+startretries=30
+startsecs=5
+; Horizon traps SIGTERM and shuts its workers down gracefully.
+stopsignal=TERM
+stopwaitsecs=630
 user=www-data
 priority=20
-startsecs=5
 stdout_logfile=/proc/1/fd/1
 stdout_logfile_maxbytes=0
 stderr_logfile=/proc/1/fd/2
@ -162,6 +179,19 @@ stderr_logfile_maxbytes=0
 CONF
 fi

+# Shout about the silent-failure mode this whole section exists to prevent: a
+# database/redis queue with NO worker (or NO scheduler) means jobs/scheduled
+# tasks accumulate forever with nothing processing them — exactly how a paid
+# purchase can leave a user un-granted. Only `sync` (inline) and `null` are exempt.
+QC="${QUEUE_CONNECTION:-sync}"
+if [ "${ENABLE_QUEUE:-false}" = "true" ] || [ "${ENABLE_HORIZON:-false}" = "true" ]; then
+    : # a worker (queue:work or Horizon) is enabled — nothing to warn about
+elif [ "$QC" != "sync" ] && [ "$QC" != "null" ]; then
+    echo "  !! WARNING: ENABLE_QUEUE=false but QUEUE_CONNECTION='${QC}' — queued jobs will PILE UP unprocessed (no worker). Set ENABLE_QUEUE=true, or QUEUE_CONNECTION=sync."
+fi
+[ "${ENABLE_SCHEDULER:-false}" = "true" ] ||
+    echo "  !! WARNING: ENABLE_SCHEDULER=false — scheduled tasks (cleanups, health checks, reminders) will NOT run."
+
 # Report custom programs
 CUSTOM_COUNT=$(find /etc/supervisor/custom.d/ -name '*.conf' 2>/dev/null | wc -l)
 if [ "$CUSTOM_COUNT" -gt 0 ]; then