fix(docker): queue/scheduler/horizon resilience + container HEALTHCHECK

- start-container: generous startretries + startsecs and graceful SIGTERM stopwaitsecs for queue/scheduler/horizon, so a transient boot failure (e.g. DB not ready) no longer marks a worker FATAL forever and silently drops jobs, and an in-flight job finishes before SIGKILL on deploy/restart.
- container-health: new HEALTHCHECK that reports UNHEALTHY when php-fpm/nginx or any enabled queue/scheduler/horizon worker is not RUNNING, so a dead worker surfaces in docker ps / orchestration instead of letting jobs pile up.
2026-06-12 15:57:59 +02:00 · 2026-06-12 15:57:59 +02:00 · 9cb3ae5bce
parent c4fb894699
commit 9cb3ae5bce
3 changed files with 82 additions and 6 deletions
--- a/12
+++ b/12
@ -154,10 +154,11 @@ RUN mkdir -p /etc/supervisor/conf.d /etc/supervisor/laravel.d /etc/supervisor/cu
             /var/log/supervisor /var/log/nginx /var/log/php
 # ---------------------------------------------------------------------------
-# Startup script
+# Startup script + healthcheck
 # ---------------------------------------------------------------------------
 COPY scripts/start-container /usr/local/bin/start-container
-RUN chmod +x /usr/local/bin/start-container
+COPY scripts/container-health /usr/local/bin/container-health
 RUN chmod +x /usr/local/bin/start-container /usr/local/bin/container-health
 # Composer cache
 RUN mkdir -p /.composer && chmod 0777 /.composer
@ -179,5 +180,10 @@ ENV MYSQL_CLIENT_VERIFY=OFF
 EXPOSE 80
-ENTRYPOINT ["start-container"]
+# Report UNHEALTHY when an expected supervised process (php-fpm, nginx, or any
 # enabled queue/scheduler/horizon worker) is not RUNNING — so a dead worker
 # surfaces in `docker ps`/orchestration instead of silently dropping jobs.
 # start-period gives supervisord + a slow DB time to come up before counting.
 HEALTHCHECK --interval=30s --timeout=10s --start-period=90s --retries=3 \
    CMD container-health
--- a/scripts/container-health
+++ b/scripts/container-health
@ -0,0 +1,40 @@
 #!/usr/bin/env bash
 # ===========================================================================
 # container-health — Docker HEALTHCHECK for the docker-laravel image.
 #
 # Asserts that every supervisor program that is SUPPOSED to be running (the
 # core web stack, plus whichever ENABLE_* services are on) is actually in the
 # RUNNING state. If a queue worker or scheduler has crashed and supervisor gave
 # up (FATAL), or never started, the container is reported UNHEALTHY — so a dead
 # worker is visible in `docker ps` / orchestration instead of silently letting
 # jobs pile up.
 #
 # A program may appear as several status lines (e.g. "queue:queue_00",
 # "queue:queue_01"). EVERY such line must be RUNNING; a single healthy
 # instance does not hide a dead sibling.
 #
 # Environment: ENABLE_QUEUE, ENABLE_SCHEDULER, ENABLE_HORIZON ("true" enables
 #              the corresponding check; anything else / unset disables it).
 # Exit 0 = healthy, 1 = unhealthy. Used by HEALTHCHECK in the Dockerfile.
 # ===========================================================================
 # No `-e`: every failing check must be reported, not just the first one.
 set -uo pipefail
 #######################################
 # Print the programs that must be RUNNING, one per line.
 # Globals:   ENABLE_QUEUE, ENABLE_SCHEDULER, ENABLE_HORIZON (read)
 # Outputs:   program names on stdout
 #######################################
 expected_programs() {
     # Programs that must always be up.
     printf '%s\n' php-fpm nginx
     # Optional services, gated by the same env vars start-container reads.
     if [[ "${ENABLE_QUEUE:-false}" == "true" ]]; then printf '%s\n' queue; fi
     if [[ "${ENABLE_SCHEDULER:-false}" == "true" ]]; then printf '%s\n' scheduler; fi
     if [[ "${ENABLE_HORIZON:-false}" == "true" ]]; then printf '%s\n' horizon; fi
 }
 #######################################
 # Verify that every status line belonging to one program is RUNNING.
 # Arguments: $1 - program name (e.g. "queue")
 #            $2 - full `supervisorctl status` output
 # Outputs:   one "UNHEALTHY: ..." line per offending (or missing) instance
 # Returns:   0 if at least one line matches and all matches are RUNNING,
 #            1 otherwise
 #######################################
 check_program() {
     local name=$1
     local status=$2
     # supervisorctl prints e.g. "queue   RUNNING   pid 123, uptime 0:10:00";
     # the optional suffix covers "name:instance" / "name_suffix" entries.
     local member_re="^${name}([:_][^ ]*)?[[:space:]]"
     local running_re="${member_re}+RUNNING"
     local line
     local seen=0
     local rc=0
     while IFS= read -r line; do
         [[ $line =~ $member_re ]] || continue
         seen=1
         if [[ ! $line =~ $running_re ]]; then
             echo "UNHEALTHY: '${name}' is not RUNNING -> ${line}"
             rc=1
         fi
     done <<<"$status"
     if (( seen == 0 )); then
         echo "UNHEALTHY: '${name}' is not RUNNING -> ${name}  (missing)"
         rc=1
     fi
     return "$rc"
 }
 #######################################
 # Entry point: query supervisor once and check every expected program.
 # Returns:   0 = healthy, 1 = unhealthy
 #######################################
 main() {
     local status
     local program
     local rc=0
     # Only the textual output is inspected; the exit code of supervisorctl is
     # intentionally not used to decide health.
     status="$(supervisorctl status 2>/dev/null)"
     if [[ -z "$status" ]]; then
         echo "UNHEALTHY: supervisorctl returned no status (supervisord down?)"
         return 1
     fi
     while IFS= read -r program; do
         check_program "$program" "$status" || rc=1
     done < <(expected_programs)
     return "$rc"
 }
 # Last command on purpose: its return value becomes the script's exit status.
 main "$@"
--- a/scripts/start-container
+++ b/scripts/start-container
@ -118,9 +118,19 @@ if [ "${ENABLE_QUEUE:-false}" = "true" ]; then
 command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan queue:work --tries=3 --sleep=5 --timeout=600 --max-jobs=500 --max-time=3600
 autostart=true
 autorestart=true
 ; A few rapid boot failures (e.g. DB not ready yet) must NOT make supervisor
 ; give up and mark the worker FATAL forever — that's how a queue silently dies
 ; and jobs pile up. Keep retrying generously; once it stays up past startsecs
 ; the counter resets, so the hourly --max-time exit never trips it.
 startretries=30
 startsecs=5
 ; Graceful shutdown on deploy/restart: queue:work handles SIGTERM by finishing
 ; the current job then exiting. Wait longer than --timeout before SIGKILL so an
 ; in-flight job is never killed mid-write (which would fail + retry it).
 stopsignal=TERM
 stopwaitsecs=630
 user=www-data
 priority=20
 startsecs=5
 stdout_logfile=/proc/1/fd/1
 stdout_logfile_maxbytes=0
 stderr_logfile=/proc/1/fd/2
@ -135,9 +145,12 @@ if [ "${ENABLE_SCHEDULER:-false}" = "true" ]; then
 command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan schedule:work
 autostart=true
 autorestart=true
 startretries=30
 startsecs=5
 stopsignal=TERM
 stopwaitsecs=60
 user=www-data
 priority=20
 startsecs=5
 stdout_logfile=/proc/1/fd/1
 stdout_logfile_maxbytes=0
 stderr_logfile=/proc/1/fd/2
@ -152,9 +165,13 @@ if [ "${ENABLE_HORIZON:-false}" = "true" ]; then
 command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan horizon
 autostart=true
 autorestart=true
 startretries=30
 startsecs=5
 ; Horizon traps SIGTERM and shuts its workers down gracefully.
 stopsignal=TERM
 stopwaitsecs=630
 user=www-data
 priority=20
 startsecs=5
 stdout_logfile=/proc/1/fd/1
 stdout_logfile_maxbytes=0
 stderr_logfile=/proc/1/fd/2
@ -162,6 +179,19 @@ stderr_logfile_maxbytes=0
 CONF
 fi
 # Guard against the silent-failure mode this whole section exists to prevent:
 # with a queued connection and neither a queue worker nor Horizon enabled,
 # nothing consumes the queue; with the scheduler disabled, scheduled tasks
 # never fire — exactly how a paid purchase can leave a user un-granted.
 # Both situations only print a warning here.
 # `sync` runs jobs inline, so it needs no worker; `null` is exempted from the
 # queue warning as well.
 QC="${QUEUE_CONNECTION:-sync}"
 case "$QC" in
     sync|null)
         ;;
     *)
         if [ "${ENABLE_QUEUE:-false}" != "true" ]; then
             if [ "${ENABLE_HORIZON:-false}" != "true" ]; then
                 echo "  !! WARNING: ENABLE_QUEUE=false but QUEUE_CONNECTION='${QC}' — queued jobs will PILE UP unprocessed (no worker). Set ENABLE_QUEUE=true, or QUEUE_CONNECTION=sync."
             fi
         fi
         ;;
 esac
 case "${ENABLE_SCHEDULER:-false}" in
     true)
         ;;
     *)
         echo "  !! WARNING: ENABLE_SCHEDULER=false — scheduled tasks (cleanups, health checks, reminders) will NOT run."
         ;;
 esac
 # Report custom programs
 CUSTOM_COUNT=$(find /etc/supervisor/custom.d/ -name '*.conf' 2>/dev/null | wc -l)
 if [ "$CUSTOM_COUNT" -gt 0 ]; then