diff --git a/Dockerfile b/Dockerfile
index 6ea3c1a..3fad3e1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -154,10 +154,11 @@ RUN mkdir -p /etc/supervisor/conf.d /etc/supervisor/laravel.d /etc/supervisor/cu
     /var/log/supervisor /var/log/nginx /var/log/php
 
 # ---------------------------------------------------------------------------
-# Startup script
+# Startup script + healthcheck
 # ---------------------------------------------------------------------------
 COPY scripts/start-container /usr/local/bin/start-container
-RUN chmod +x /usr/local/bin/start-container
+COPY scripts/container-health /usr/local/bin/container-health
+RUN chmod +x /usr/local/bin/start-container /usr/local/bin/container-health
 
 # Composer cache
 RUN mkdir -p /.composer && chmod 0777 /.composer
@@ -179,5 +180,13 @@ ENV MYSQL_CLIENT_VERIFY=OFF
 
 EXPOSE 80
 
+# Report UNHEALTHY when an expected supervised process (php-fpm, nginx, or any
+# enabled queue/scheduler/horizon worker) is not RUNNING — so a dead worker
+# surfaces in `docker ps`/orchestration instead of silently dropping jobs.
+# start-period gives supervisord + a slow DB time to come up before counting.
+# HEALTHCHECK only probes the container; the ENTRYPOINT below must stay.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=90s --retries=3 \
+    CMD container-health
+
 ENTRYPOINT ["start-container"]
 
diff --git a/scripts/container-health b/scripts/container-health
new file mode 100755
index 0000000..5f0cb59
--- /dev/null
+++ b/scripts/container-health
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# ===========================================================================
+# container-health — Docker HEALTHCHECK for the docker-laravel image.
+#
+# Asserts that every supervisor program that is SUPPOSED to be running (the
+# core web stack, plus whichever ENABLE_* services are on) is actually in the
+# RUNNING state. If a queue worker or scheduler has crashed and supervisor gave
+# up (FATAL), or never started, the container is reported UNHEALTHY — so a dead
+# worker is visible in `docker ps` / orchestration instead of silently letting
+# jobs pile up.
+#
+# A program may own several processes ("queue:queue_00", "queue:queue_01");
+# ALL of them must be RUNNING — one healthy sibling must not mask a dead one.
+#
+# Exit 0 = healthy, 1 = unhealthy. Used by HEALTHCHECK in the Dockerfile.
+# ===========================================================================
+set -uo pipefail
+
+# Programs that must always be up.
+expected=(php-fpm nginx)
+
+# Optional services, gated by the same env vars start-container reads.
+[ "${ENABLE_QUEUE:-false}" = "true" ] && expected+=(queue)
+[ "${ENABLE_SCHEDULER:-false}" = "true" ] && expected+=(scheduler)
+[ "${ENABLE_HORIZON:-false}" = "true" ] && expected+=(horizon)
+
+# Judge by the output, not the exit status: supervisorctl may exit non-zero
+# merely because some process is not RUNNING, which is checked per program.
+status="$(supervisorctl status 2>/dev/null)"
+if [ -z "$status" ]; then
+  echo "UNHEALTHY: supervisorctl returned no status (supervisord down?)"
+  exit 1
+fi
+
+rc=0
+for p in "${expected[@]}"; do
+  # supervisorctl prints e.g. "queue RUNNING pid 123, uptime 0:10:00"; grouped
+  # processes show up as "queue:queue_00 RUNNING ...".
+  name_re="^${p}([:_][^[:space:]]*)?[[:space:]]+"
+  # Here-strings rather than `echo | grep`: under pipefail an early-exiting
+  # grep can SIGPIPE the echo and turn a match into a spurious failure.
+  lines="$(grep -E "$name_re" <<<"$status")"
+  if [ -z "$lines" ]; then
+    echo "UNHEALTHY: '${p}' is not RUNNING -> ${p} (missing)"
+    rc=1
+    continue
+  fi
+  not_running="$(grep -vE "${name_re}RUNNING" <<<"$lines")"
+  if [ -n "$not_running" ]; then
+    echo "UNHEALTHY: '${p}' is not RUNNING -> ${not_running}"
+    rc=1
+  fi
+done
+
+exit "$rc"
diff --git a/scripts/start-container b/scripts/start-container
index 39fa45d..42f6b58 100755
--- a/scripts/start-container
+++ b/scripts/start-container
@@ -118,9 +118,19 @@ if [ "${ENABLE_QUEUE:-false}" = "true" ]; then
 command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan queue:work --tries=3 --sleep=5 --timeout=600 --max-jobs=500 --max-time=3600
 autostart=true
 autorestart=true
+; A few rapid boot failures (e.g. DB not ready yet) must NOT make supervisor
+; give up and mark the worker FATAL forever — that's how a queue silently dies
+; and jobs pile up. Keep retrying generously; once it stays up past startsecs
+; the counter resets, so the hourly --max-time exit never trips it.
+startretries=30
+startsecs=5
+; Graceful shutdown on deploy/restart: queue:work handles SIGTERM by finishing
+; the current job then exiting. Wait longer than --timeout before SIGKILL so an
+; in-flight job is never killed mid-write (which would fail + retry it).
+stopsignal=TERM
+stopwaitsecs=630
 user=www-data
 priority=20
-startsecs=5
 stdout_logfile=/proc/1/fd/1
 stdout_logfile_maxbytes=0
 stderr_logfile=/proc/1/fd/2
@@ -135,9 +145,12 @@ if [ "${ENABLE_SCHEDULER:-false}" = "true" ]; then
 command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan schedule:work
 autostart=true
 autorestart=true
+startretries=30
+startsecs=5
+stopsignal=TERM
+stopwaitsecs=60
 user=www-data
 priority=20
-startsecs=5
 stdout_logfile=/proc/1/fd/1
 stdout_logfile_maxbytes=0
 stderr_logfile=/proc/1/fd/2
@@ -152,9 +165,13 @@ if [ "${ENABLE_HORIZON:-false}" = "true" ]; then
 command=/usr/local/bin/php -d variables_order=EGPCS /var/www/html/artisan horizon
 autostart=true
 autorestart=true
+startretries=30
+startsecs=5
+; Horizon traps SIGTERM and shuts its workers down gracefully.
+stopsignal=TERM
+stopwaitsecs=630
 user=www-data
 priority=20
-startsecs=5
 stdout_logfile=/proc/1/fd/1
 stdout_logfile_maxbytes=0
 stderr_logfile=/proc/1/fd/2
@@ -162,6 +179,19 @@ stderr_logfile_maxbytes=0
 CONF
 fi
 
+# Loud warning for the silent-failure mode this whole section exists to prevent:
+# a database/redis queue with NO worker (or NO scheduler) means jobs/scheduled
+# tasks accumulate forever with nothing processing them — exactly how a paid
+# purchase can leave a user un-granted. `sync` is the one connection that runs
+# jobs inline, so it's the only safe one without a worker.
+QC="${QUEUE_CONNECTION:-sync}"
+if [ "${ENABLE_QUEUE:-false}" != "true" ] && [ "${ENABLE_HORIZON:-false}" != "true" ] && [ "$QC" != "sync" ] && [ "$QC" != "null" ]; then
+    echo " !! WARNING: ENABLE_QUEUE=false but QUEUE_CONNECTION='${QC}' — queued jobs will PILE UP unprocessed (no worker). Set ENABLE_QUEUE=true, or QUEUE_CONNECTION=sync."
+fi
+if [ "${ENABLE_SCHEDULER:-false}" != "true" ]; then
+    echo " !! WARNING: ENABLE_SCHEDULER=false — scheduled tasks (cleanups, health checks, reminders) will NOT run."
+fi
+
 # Report custom programs
 CUSTOM_COUNT=$(find /etc/supervisor/custom.d/ -name '*.conf' 2>/dev/null | wc -l)
 if [ "$CUSTOM_COUNT" -gt 0 ]; then