fix(docker): queue/scheduler/horizon resilience + container HEALTHCHECK
- start-container: generous startretries + startsecs and a graceful SIGTERM stopwaitsecs for queue/scheduler/horizon, so a transient boot failure (e.g. DB not ready) no longer marks a worker FATAL forever and silently drops jobs, and an in-flight job finishes before SIGKILL on deploy/restart.
- container-health: new HEALTHCHECK script that reports UNHEALTHY when php-fpm/nginx or any enabled queue/scheduler/horizon worker is not RUNNING, so a dead worker surfaces in docker ps / orchestration instead of letting jobs pile up.
This commit is contained in:
parent
c4fb894699
commit
9cb3ae5bce
12
Dockerfile
12
Dockerfile
|
|
@ -154,10 +154,11 @@ RUN mkdir -p /etc/supervisor/conf.d /etc/supervisor/laravel.d /etc/supervisor/cu
|
|||
/var/log/supervisor /var/log/nginx /var/log/php
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Startup script
|
||||
# Startup script + healthcheck
|
||||
# ---------------------------------------------------------------------------
|
||||
COPY scripts/start-container /usr/local/bin/start-container
|
||||
RUN chmod +x /usr/local/bin/start-container
|
||||
COPY scripts/container-health /usr/local/bin/container-health
|
||||
RUN chmod +x /usr/local/bin/start-container /usr/local/bin/container-health
|
||||
|
||||
# Composer cache
|
||||
RUN mkdir -p /.composer && chmod 0777 /.composer
|
||||
|
|
@ -179,5 +180,10 @@ ENV MYSQL_CLIENT_VERIFY=OFF
|
|||
|
||||
EXPOSE 80
|
||||
|
||||
ENTRYPOINT ["start-container"]
|
||||
# Report UNHEALTHY when an expected supervised process (php-fpm, nginx, or any
|
||||
# enabled queue/scheduler/horizon worker) is not RUNNING — so a dead worker
|
||||
# surfaces in `docker ps`/orchestration instead of silently dropping jobs.
|
||||
# start-period gives supervisord + a slow DB time to come up before counting.
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=90s --retries=3 \
|
||||
CMD container-health
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,40 @@
|
|||
#!/usr/bin/env bash
# ===========================================================================
# container-health — Docker HEALTHCHECK for the docker-laravel image.
#
# Asserts that every supervisor program that is SUPPOSED to be running (the
# core web stack, plus whichever ENABLE_* services are on) is actually in the
# RUNNING state. If a queue worker or scheduler has crashed and supervisor gave
# up (FATAL), or never started, the container is reported UNHEALTHY — so a dead
# worker is visible in `docker ps` / orchestration instead of silently letting
# jobs pile up.
#
# Environment (read, all optional, default "false"):
#   ENABLE_QUEUE, ENABLE_SCHEDULER, ENABLE_HORIZON
#
# Exit 0 = healthy, 1 = unhealthy. Used by HEALTHCHECK in the Dockerfile.
# ===========================================================================
set -uo pipefail

#######################################
# Compare a `supervisorctl status` listing against the expected programs.
#
# A status line belongs to program P when its first column is `P` itself or
# `P` followed by a `:`/`_` suffix (e.g. "queue:queue_00"). EVERY line that
# belongs to P must be in the RUNNING state; one RUNNING instance no longer
# hides a FATAL/STOPPED sibling. A program with no line at all is "missing".
#
# The listing is parsed in-process (no `echo | grep -q` pipeline), so an
# early-exiting reader can never turn into a pipefail false alarm.
#
# Arguments: $1   - full text printed by `supervisorctl status`
#            $2.. - program names that must be RUNNING
# Outputs:   one "UNHEALTHY: ..." line on stdout per offending process
# Returns:   0 if every expected program is RUNNING, 1 otherwise
#######################################
check_programs() {
  local status=$1
  shift
  local prog re line seen rc=0

  for prog in "$@"; do
    # supervisorctl prints e.g. "queue   RUNNING   pid 123, uptime 0:10:00";
    # capture group 2 is the state column.
    re="^${prog}([:_][^[:space:]]*)?[[:space:]]+([^[:space:]]+)"
    seen=0
    while IFS= read -r line; do
      [[ $line =~ $re ]] || continue
      seen=1
      if [[ ${BASH_REMATCH[2]} != "RUNNING" ]]; then
        printf "UNHEALTHY: '%s' is not RUNNING -> %s\n" "$prog" "$line"
        rc=1
      fi
    done <<<"$status"

    if ((seen == 0)); then
      printf "UNHEALTHY: '%s' is not RUNNING -> %s (missing)\n" "$prog" "$prog"
      rc=1
    fi
  done

  return "$rc"
}

#######################################
# Build the expected program list from the environment, query supervisor and
# report the result.
# Globals:   ENABLE_QUEUE, ENABLE_SCHEDULER, ENABLE_HORIZON (read)
# Outputs:   "UNHEALTHY: ..." diagnostics on stdout
# Returns:   0 = healthy, 1 = unhealthy
#######################################
main() {
  # Programs that must always be up.
  local -a expected=(php-fpm nginx)

  # Optional services, gated by the same env vars start-container reads.
  if [[ "${ENABLE_QUEUE:-false}" == "true" ]]; then expected+=(queue); fi
  if [[ "${ENABLE_SCHEDULER:-false}" == "true" ]]; then expected+=(scheduler); fi
  if [[ "${ENABLE_HORIZON:-false}" == "true" ]]; then expected+=(horizon); fi

  # The exit status of supervisorctl is deliberately not checked: only the
  # text it prints is evaluated. stderr is discarded on purpose so that a
  # missing/unreachable supervisor yields the single message below.
  local status
  status="$(supervisorctl status 2>/dev/null)"
  if [[ -z "$status" ]]; then
    echo "UNHEALTHY: supervisorctl returned no status (supervisord down?)"
    return 1
  fi

  check_programs "$status" "${expected[@]}"
}

# Last command of the script: its return value is the script's exit status.
main "$@"
|
||||
|
|
@ -118,9 +118,19 @@ if [ "${ENABLE_QUEUE:-false}" = "true" ]; then
|
|||
command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan queue:work --tries=3 --sleep=5 --timeout=600 --max-jobs=500 --max-time=3600
|
||||
autostart=true
|
||||
autorestart=true
|
||||
; A few rapid boot failures (e.g. DB not ready yet) must NOT make supervisor
|
||||
; give up and mark the worker FATAL forever — that's how a queue silently dies
|
||||
; and jobs pile up. Keep retrying generously; once it stays up past startsecs
|
||||
; the counter resets, so the hourly --max-time exit never trips it.
|
||||
startretries=30
|
||||
startsecs=5
|
||||
; Graceful shutdown on deploy/restart: queue:work handles SIGTERM by finishing
|
||||
; the current job then exiting. Wait longer than --timeout before SIGKILL so an
|
||||
; in-flight job is never killed mid-write (which would fail + retry it).
|
||||
stopsignal=TERM
|
||||
stopwaitsecs=630
|
||||
user=www-data
|
||||
priority=20
|
||||
startsecs=5
|
||||
stdout_logfile=/proc/1/fd/1
|
||||
stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/proc/1/fd/2
|
||||
|
|
@ -135,9 +145,12 @@ if [ "${ENABLE_SCHEDULER:-false}" = "true" ]; then
|
|||
command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan schedule:work
|
||||
autostart=true
|
||||
autorestart=true
|
||||
startretries=30
|
||||
startsecs=5
|
||||
stopsignal=TERM
|
||||
stopwaitsecs=60
|
||||
user=www-data
|
||||
priority=20
|
||||
startsecs=5
|
||||
stdout_logfile=/proc/1/fd/1
|
||||
stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/proc/1/fd/2
|
||||
|
|
@ -152,9 +165,13 @@ if [ "${ENABLE_HORIZON:-false}" = "true" ]; then
|
|||
command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan horizon
|
||||
autostart=true
|
||||
autorestart=true
|
||||
startretries=30
|
||||
startsecs=5
|
||||
; Horizon traps SIGTERM and shuts its workers down gracefully.
|
||||
stopsignal=TERM
|
||||
stopwaitsecs=630
|
||||
user=www-data
|
||||
priority=20
|
||||
startsecs=5
|
||||
stdout_logfile=/proc/1/fd/1
|
||||
stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/proc/1/fd/2
|
||||
|
|
@ -162,6 +179,19 @@ stderr_logfile_maxbytes=0
|
|||
CONF
|
||||
fi
|
||||
|
||||
# Startup warnings for the silent-failure mode this section exists to prevent.
#
# Queue: when neither the queue worker nor Horizon is enabled, any
# QUEUE_CONNECTION other than `sync` (jobs run inline, so no worker is needed)
# or `null` (also exempted here) means queued jobs accumulate with nothing
# processing them — exactly how a paid purchase can leave a user un-granted.
# QUEUE_CONNECTION defaults to `sync` when unset.
QC="${QUEUE_CONNECTION:-sync}"
if [ "${ENABLE_QUEUE:-false}" != "true" ] && [ "${ENABLE_HORIZON:-false}" != "true" ]; then
case "$QC" in
sync | null)
# Safe without a worker: nothing to warn about.
;;
*)
echo " !! WARNING: ENABLE_QUEUE=false but QUEUE_CONNECTION='${QC}' — queued jobs will PILE UP unprocessed (no worker). Set ENABLE_QUEUE=true, or QUEUE_CONNECTION=sync."
;;
esac
fi

# Scheduler: with it disabled, nothing triggers the application's scheduled
# tasks, so say so loudly at boot.
[ "${ENABLE_SCHEDULER:-false}" = "true" ] || echo " !! WARNING: ENABLE_SCHEDULER=false — scheduled tasks (cleanups, health checks, reminders) will NOT run."
|
||||
|
||||
# Report custom programs
|
||||
CUSTOM_COUNT=$(find /etc/supervisor/custom.d/ -name '*.conf' 2>/dev/null | wc -l)
|
||||
if [ "$CUSTOM_COUNT" -gt 0 ]; then
|
||||
|
|
|
|||
Loading…
Reference in New Issue