diff --git a/.github/workflows/pg-diagnose.yml b/.github/workflows/pg-diagnose.yml
new file mode 100644
index 00000000..73c6a399
--- /dev/null
+++ b/.github/workflows/pg-diagnose.yml
@@ -0,0 +1,123 @@
+name: Diagnose PostgreSQL state on liderra.ru
+
+# Read-only diagnostic for the "PG is not accepting connections" incident.
+# Run manually: gh workflow run pg-diagnose.yml --ref <branch>
+# Changes nothing on prod: it only reads systemctl/journalctl/df/free/uptime,
+# the process list, WAL size and dmesg, tails the last 100 lines of
+# postgresql-16-main.log, and probes https://liderra.ru/ once.
+
+on:
+  workflow_dispatch:
+
+# No step uses GITHUB_TOKEN, so the job gets no token permissions.
+permissions: {}
+
+jobs:
+  diagnose:
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+
+    # Explicit bash runs each step with `-eo pipefail`, so a failed `ssh` is
+    # not hidden behind the exit status of `tee` in the pipeline below.
+    defaults:
+      run:
+        shell: bash
+
+    env:
+      LIDERRA_HOST: 111.88.246.137
+      LIDERRA_USER: ubuntu
+
+    steps:
+      - name: Setup SSH key
+        env:
+          # Pass the secret through the environment rather than expanding it
+          # into the script text, where the shell would parse it as code.
+          LIDERRA_SSH_KEY: ${{ secrets.LIDERRA_SSH_KEY }}
+        run: |
+          # Owner-only permissions from the moment the files are created.
+          umask 077
+          mkdir -p ~/.ssh
+          printf '%s\n' "$LIDERRA_SSH_KEY" > ~/.ssh/liderra_deploy
+          chmod 600 ~/.ssh/liderra_deploy
+          # NOTE(review): host key is accepted as scanned, not pinned.
+          ssh-keyscan -H "$LIDERRA_HOST" >> ~/.ssh/known_hosts 2>/dev/null
+
+      - name: Run PG diagnostic on prod
+        run: |
+          # BatchMode fails instead of prompting; ConnectTimeout bounds the
+          # wait when the host does not answer.
+          ssh -i ~/.ssh/liderra_deploy \
+            -o BatchMode=yes -o ConnectTimeout=15 \
+            "$LIDERRA_USER@$LIDERRA_HOST" \
+            "bash -s" <<'REMOTE' | tee /tmp/pg-diagnose.log
+          # Keep going when an individual probe fails.
+          set +e
+          echo "=== 1. hostname + UTC time ==="
+          echo "host=$(hostname); utc=$(date -u)"
+          echo
+          echo "=== 2. uptime ==="
+          uptime
+          echo
+          echo "=== 3. last reboot ==="
+          who -b
+          last reboot --time-format=iso | head -5
+          echo
+          echo "=== 4. df -h / and /var ==="
+          df -h / /var /var/lib/postgresql 2>&1 | head -10
+          echo
+          echo "=== 5. free -h ==="
+          free -h
+          echo
+          echo "=== 6. systemctl status postgresql ==="
+          sudo systemctl status postgresql --no-pager 2>&1 | head -30
+          echo
+          echo "=== 7. systemctl status postgresql@16-main (cluster) ==="
+          sudo systemctl status postgresql@16-main --no-pager 2>&1 | head -30
+          echo
+          echo "=== 8. nginx + php-fpm status (one-line each) ==="
+          sudo systemctl is-active nginx php8.3-fpm liderra-queue 2>&1
+          echo
+          echo "=== 9. ps aux | postgres (top 15) ==="
+          ps auxf | grep -E "(postgres|recovery)" | grep -v grep | head -15
+          echo
+          echo "=== 10. journalctl postgresql last 80 lines ==="
+          sudo journalctl -u postgresql -n 80 --no-pager 2>&1 | tail -80
+          echo
+          echo "=== 11. journalctl postgresql@16-main last 80 lines ==="
+          sudo journalctl -u postgresql@16-main -n 80 --no-pager 2>&1 | tail -80
+          echo
+          echo "=== 12. tail -100 /var/log/postgresql/postgresql-16-main.log ==="
+          sudo tail -100 /var/log/postgresql/postgresql-16-main.log 2>&1
+          echo
+          echo "=== 13. WAL size and count ==="
+          sudo du -sh /var/lib/postgresql/16/main/pg_wal 2>&1
+          sudo ls /var/lib/postgresql/16/main/pg_wal 2>&1 | wc -l
+          echo
+          echo "=== 14. dmesg tail (kernel events, OOM, IO errors) ==="
+          sudo dmesg -T 2>&1 | tail -40
+          echo
+          echo "=== 15. liderra.ru HTTPS probe ==="
+          curl -sI -o /dev/null -w "HTTP %{http_code}\nTotal: %{time_total}s\n" https://liderra.ru/ --max-time 10
+          echo
+          echo "=== DONE ==="
+          REMOTE
+
+      - name: Print summary
+        if: always()
+        run: |
+          {
+            echo "## PG diagnostic on liderra.ru"
+            echo
+            echo '```'
+            # -s: an empty file means ssh produced no output.
+            if [ -s /tmp/pg-diagnose.log ]; then
+              cat /tmp/pg-diagnose.log
+            else
+              echo "(no log captured)"
+            fi
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Cleanup SSH key
+        if: always()
+        run: rm -f ~/.ssh/liderra_deploy