fix(docker): queue/scheduler/horizon resilience + container HEALTHCHECK

- start-container: generous startretries + startsecs and graceful SIGTERM
  stopwaitsecs for queue/scheduler/horizon, so a transient boot failure (e.g.
  DB not ready) no longer marks a worker FATAL forever and silently drops jobs,
  and an in-flight job finishes before SIGKILL on deploy/restart.
- container-health: new HEALTHCHECK that reports UNHEALTHY when php-fpm/nginx or
  any enabled queue/scheduler/horizon worker is not RUNNING, so a dead worker
  surfaces in docker ps / orchestration instead of letting jobs pile up.
This commit is contained in:
Fabian @ Blax Software 2026-06-12 15:57:59 +02:00
parent c4fb894699
commit 9cb3ae5bce
3 changed files with 82 additions and 6 deletions

View File

@ -154,10 +154,11 @@ RUN mkdir -p /etc/supervisor/conf.d /etc/supervisor/laravel.d /etc/supervisor/cu
/var/log/supervisor /var/log/nginx /var/log/php /var/log/supervisor /var/log/nginx /var/log/php
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Startup script # Startup script + healthcheck
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
COPY scripts/start-container /usr/local/bin/start-container COPY scripts/start-container /usr/local/bin/start-container
RUN chmod +x /usr/local/bin/start-container COPY scripts/container-health /usr/local/bin/container-health
RUN chmod +x /usr/local/bin/start-container /usr/local/bin/container-health
# Composer cache # Composer cache
RUN mkdir -p /.composer && chmod 0777 /.composer RUN mkdir -p /.composer && chmod 0777 /.composer
@ -179,5 +180,10 @@ ENV MYSQL_CLIENT_VERIFY=OFF
EXPOSE 80 EXPOSE 80
ENTRYPOINT ["start-container"] # Report UNHEALTHY when an expected supervised process (php-fpm, nginx, or any
# enabled queue/scheduler/horizon worker) is not RUNNING — so a dead worker
# surfaces in `docker ps`/orchestration instead of silently dropping jobs.
# start-period gives supervisord + a slow DB time to come up before counting.
HEALTHCHECK --interval=30s --timeout=10s --start-period=90s --retries=3 \
CMD container-health

40
scripts/container-health Executable file
View File

@ -0,0 +1,40 @@
#!/usr/bin/env bash
# ===========================================================================
# container-health — Docker HEALTHCHECK for the docker-laravel image.
#
# Asserts that every supervisor program that is SUPPOSED to be running (the
# core web stack, plus whichever ENABLE_* services are on) is actually in the
# RUNNING state. If a queue worker or scheduler has crashed and supervisor gave
# up (FATAL), or never started, the container is reported UNHEALTHY — so a dead
# worker is visible in `docker ps` / orchestration instead of silently letting
# jobs pile up.
#
# Environment (read, all optional, default "false"):
#   ENABLE_QUEUE, ENABLE_SCHEDULER, ENABLE_HORIZON — when exactly "true", the
#   matching program (queue / scheduler / horizon) is added to the must-run set.
#
# Output: one "UNHEALTHY: ..." line on stdout per failing program.
# Exit 0 = healthy, 1 = unhealthy. Used by HEALTHCHECK in the Dockerfile.
# ===========================================================================
# No `-e` on purpose: grep returning 1 ("no match") is an expected outcome here.
set -uo pipefail

# Programs that must always be up.
expected=(php-fpm nginx)

# Optional services, gated by the same env vars start-container reads.
if [[ "${ENABLE_QUEUE:-false}" == "true" ]]; then
  expected+=(queue)
fi
if [[ "${ENABLE_SCHEDULER:-false}" == "true" ]]; then
  expected+=(scheduler)
fi
if [[ "${ENABLE_HORIZON:-false}" == "true" ]]; then
  expected+=(horizon)
fi

# Only the textual listing matters; the exit status of supervisorctl itself is
# deliberately ignored and its stderr discarded.
status="$(supervisorctl status 2>/dev/null)"
if [[ -z "$status" ]]; then
  echo "UNHEALTHY: supervisorctl returned no status (supervisord down?)"
  exit 1
fi

rc=0
for p in "${expected[@]}"; do
  # supervisorctl prints e.g. "queue RUNNING pid 123, uptime 0:10:00".
  # The optional ":..." / "_..." suffix also accepts group-member style names.
  name_re="^${p}([:_][^ ]*)?[[:space:]]"

  # Here-strings instead of `echo | grep -q`: with pipefail, a grep that exits
  # early can fail the whole pipeline via SIGPIPE and cause a false UNHEALTHY.
  lines="$(grep -E "${name_re}" <<<"$status" || true)"
  if [[ -z "$lines" ]]; then
    echo "UNHEALTHY: '${p}' is not RUNNING -> ${p} (missing)"
    rc=1
    continue
  fi

  # EVERY matching entry must be RUNNING. Accepting "at least one RUNNING"
  # would hide a FATAL/STOPPED member behind a healthy sibling.
  not_running="$(grep -vE "${name_re}+RUNNING" <<<"$lines" || true)"
  if [[ -n "$not_running" ]]; then
    echo "UNHEALTHY: '${p}' is not RUNNING -> ${not_running}"
    rc=1
  fi
done

exit "$rc"

View File

@ -118,9 +118,19 @@ if [ "${ENABLE_QUEUE:-false}" = "true" ]; then
command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan queue:work --tries=3 --sleep=5 --timeout=600 --max-jobs=500 --max-time=3600 command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan queue:work --tries=3 --sleep=5 --timeout=600 --max-jobs=500 --max-time=3600
autostart=true autostart=true
autorestart=true autorestart=true
; A few rapid boot failures (e.g. DB not ready yet) must NOT make supervisor
; give up and mark the worker FATAL forever — that's how a queue silently dies
; and jobs pile up. Keep retrying generously; once it stays up past startsecs
; the counter resets, so the hourly --max-time exit never trips it.
startretries=30
startsecs=5
; Graceful shutdown on deploy/restart: queue:work handles SIGTERM by finishing
; the current job then exiting. Wait longer than --timeout before SIGKILL so an
; in-flight job is never killed mid-write (which would fail + retry it).
stopsignal=TERM
stopwaitsecs=630
user=www-data user=www-data
priority=20 priority=20
startsecs=5
stdout_logfile=/proc/1/fd/1 stdout_logfile=/proc/1/fd/1
stdout_logfile_maxbytes=0 stdout_logfile_maxbytes=0
stderr_logfile=/proc/1/fd/2 stderr_logfile=/proc/1/fd/2
@ -135,9 +145,12 @@ if [ "${ENABLE_SCHEDULER:-false}" = "true" ]; then
command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan schedule:work command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan schedule:work
autostart=true autostart=true
autorestart=true autorestart=true
startretries=30
startsecs=5
stopsignal=TERM
stopwaitsecs=60
user=www-data user=www-data
priority=20 priority=20
startsecs=5
stdout_logfile=/proc/1/fd/1 stdout_logfile=/proc/1/fd/1
stdout_logfile_maxbytes=0 stdout_logfile_maxbytes=0
stderr_logfile=/proc/1/fd/2 stderr_logfile=/proc/1/fd/2
@ -152,9 +165,13 @@ if [ "${ENABLE_HORIZON:-false}" = "true" ]; then
command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan horizon command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan horizon
autostart=true autostart=true
autorestart=true autorestart=true
startretries=30
startsecs=5
; Horizon traps SIGTERM and shuts its workers down gracefully.
stopsignal=TERM
stopwaitsecs=630
user=www-data user=www-data
priority=20 priority=20
startsecs=5
stdout_logfile=/proc/1/fd/1 stdout_logfile=/proc/1/fd/1
stdout_logfile_maxbytes=0 stdout_logfile_maxbytes=0
stderr_logfile=/proc/1/fd/2 stderr_logfile=/proc/1/fd/2
@ -162,6 +179,19 @@ stderr_logfile_maxbytes=0
CONF CONF
fi fi
# Startup warnings for configurations that fail silently:
#  - a queue connection other than `sync`/`null` with neither a queue worker
#    nor Horizon enabled: jobs are queued but nothing ever processes them
#    (exactly how a paid purchase can leave a user un-granted). `sync` runs
#    jobs inline, so it is the one connection that is safe without a worker.
#  - the scheduler disabled: scheduled tasks never run.
# Both are warnings only; startup continues.
QC="${QUEUE_CONNECTION:-sync}"
if [ "${ENABLE_QUEUE:-false}" != "true" ] && [ "${ENABLE_HORIZON:-false}" != "true" ]; then
  # No worker of any kind is enabled — only sync/null are harmless here.
  case "$QC" in
    sync | null) ;;
    *)
      echo " !! WARNING: ENABLE_QUEUE=false but QUEUE_CONNECTION='${QC}' — queued jobs will PILE UP unprocessed (no worker). Set ENABLE_QUEUE=true, or QUEUE_CONNECTION=sync."
      ;;
  esac
fi
[ "${ENABLE_SCHEDULER:-false}" = "true" ] ||
  echo " !! WARNING: ENABLE_SCHEDULER=false — scheduled tasks (cleanups, health checks, reminders) will NOT run."
# Report custom programs # Report custom programs
CUSTOM_COUNT=$(find /etc/supervisor/custom.d/ -name '*.conf' 2>/dev/null | wc -l) CUSTOM_COUNT=$(find /etc/supervisor/custom.d/ -name '*.conf' 2>/dev/null | wc -l)
if [ "$CUSTOM_COUNT" -gt 0 ]; then if [ "$CUSTOM_COUNT" -gt 0 ]; then