name: Diagnose PostgreSQL state on liderra.ru

# Read-only diagnostic for the "PG is not accepting connections" incident.
# Triggered manually: gh workflow run pg-diagnose.yml --ref <branch>
# Changes nothing on prod — it only reads systemctl/journalctl/df/free/uptime
# plus the last 100 lines of postgresql-16-main.log.

on:
  workflow_dispatch:

jobs:
  diagnose:
    runs-on: ubuntu-latest
    # Hard cap so a hung SSH session cannot keep the runner busy.
    timeout-minutes: 5
    env:
      LIDERRA_HOST: '111.88.246.137'
      LIDERRA_USER: ubuntu
    steps:
      - name: Setup SSH key
        # The secret is handed over through an environment variable rather
        # than being interpolated into the script text, so its content is
        # never re-parsed by the shell.
        env:
          LIDERRA_SSH_KEY: ${{ secrets.LIDERRA_SSH_KEY }}
        run: |
          mkdir -p ~/.ssh
          # Restrictive umask: the key file is created private from the start.
          umask 077
          printf '%s\n' "$LIDERRA_SSH_KEY" > ~/.ssh/liderra_deploy
          chmod 600 ~/.ssh/liderra_deploy
          ssh-keyscan -H ${{ env.LIDERRA_HOST }} >> ~/.ssh/known_hosts 2>/dev/null

      - name: Run PG diagnostic on prod
        # The remote script is fed to `bash -s` over SSH via a quoted heredoc
        # (no local expansion); its output is mirrored into
        # /tmp/pg-diagnose.log for the summary step.
        run: |
          # Without pipefail the pipeline status is tee's, so a failed SSH
          # connection would leave this step green with an empty log.
          set -o pipefail
          ssh -i ~/.ssh/liderra_deploy ${{ env.LIDERRA_USER }}@${{ env.LIDERRA_HOST }} \
            "bash -s" <<'REMOTE' | tee /tmp/pg-diagnose.log
          set +e
          echo "=== 1. hostname + UTC time ==="
          echo "host=$(hostname); utc=$(date -u)"
          echo
          echo "=== 2. uptime ==="
          uptime
          echo
          echo "=== 3. last reboot ==="
          who -b
          last reboot --time-format=iso | head -5
          echo
          echo "=== 4. df -h / and /var ==="
          df -h / /var /var/lib/postgresql 2>&1 | head -10
          echo
          echo "=== 5. free -h ==="
          free -h
          echo
          echo "=== 6. systemctl status postgresql ==="
          sudo systemctl status postgresql --no-pager 2>&1 | head -30
          echo
          echo "=== 7. systemctl status postgresql@16-main (cluster) ==="
          sudo systemctl status postgresql@16-main --no-pager 2>&1 | head -30
          echo
          echo "=== 8. nginx + php-fpm status (one-line each) ==="
          sudo systemctl is-active nginx php8.3-fpm liderra-queue 2>&1
          echo
          echo "=== 9. ps aux | postgres (top 15) ==="
          ps auxf | grep -E "(postgres|recovery)" | grep -v grep | head -15
          echo
          echo "=== 10. journalctl postgresql last 80 lines ==="
          sudo journalctl -u postgresql -n 80 --no-pager 2>&1 | tail -80
          echo
          echo "=== 11. journalctl postgresql@16-main last 80 lines ==="
          sudo journalctl -u postgresql@16-main -n 80 --no-pager 2>&1 | tail -80
          echo
          echo "=== 12. tail -100 /var/log/postgresql/postgresql-16-main.log ==="
          sudo tail -100 /var/log/postgresql/postgresql-16-main.log 2>&1
          echo
          echo "=== 13. WAL size and count ==="
          sudo du -sh /var/lib/postgresql/16/main/pg_wal 2>&1
          sudo ls /var/lib/postgresql/16/main/pg_wal 2>&1 | wc -l
          echo
          echo "=== 14. dmesg tail (kernel events, OOM, IO errors) ==="
          sudo dmesg -T 2>&1 | tail -40
          echo
          echo "=== 15. liderra.ru HTTPS probe ==="
          curl -sI -o /dev/null -w "HTTP %{http_code}\nTotal: %{time_total}s\n" https://liderra.ru/ --max-time 10
          echo
          echo "=== DONE ==="
          REMOTE

      - name: Print summary
        # Runs even when the diagnostic step failed, so whatever output was
        # captured still lands in the job summary.
        if: always()
        run: |
          {
            echo "## PG diagnostic on liderra.ru"
            echo
            echo '```'
            cat /tmp/pg-diagnose.log 2>/dev/null || echo "(no log captured)"
            echo '```'
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Cleanup SSH key
        # Always remove the private key from the runner, regardless of
        # earlier step outcomes.
        if: always()
        run: rm -f ~/.ssh/liderra_deploy