# portal/.github/workflows/pg-diagnose.yml

name: Diagnose PostgreSQL state on liderra.ru

# Read-only diagnostic for the "PG is not accepting connections" incident.
# Run it manually: gh workflow run pg-diagnose.yml --ref <branch>
# Changes nothing on prod: it only reads system state (systemctl, journalctl,
# df, free, uptime, ps, dmesg, WAL directory size), tails the last 100 lines
# of postgresql-16-main.log and probes https://liderra.ru/.

# Manual trigger only; no inputs are declared.
on:
  workflow_dispatch: {}

jobs:
  diagnose:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    # None of the steps in this job check out the repository or call the
    # GitHub API, so drop every GITHUB_TOKEN scope (least privilege).
    permissions: {}

    # Connection target shared by all steps; job-level env is exported to
    # every `run` shell as ordinary environment variables.
    env:
      LIDERRA_HOST: 111.88.246.137
      LIDERRA_USER: ubuntu
    steps:
      - name: Setup SSH key
        # Hand the secret to the shell through the environment instead of
        # splicing it into the script text, so its content can never be
        # interpreted as shell syntax.
        env:
          LIDERRA_SSH_KEY: ${{ secrets.LIDERRA_SSH_KEY }}
        run: |
          # Fail here with a clear message rather than later with an opaque
          # ssh authentication error.
          : "${LIDERRA_SSH_KEY:?secret LIDERRA_SSH_KEY is empty or not set}"
          # Create the directory and key file owner-only from the start, so
          # the key is never on disk with default (group/world) permissions.
          umask 077
          mkdir -p ~/.ssh
          printf '%s\n' "$LIDERRA_SSH_KEY" > ~/.ssh/liderra_deploy
          chmod 600 ~/.ssh/liderra_deploy
          # Trust-on-first-use: record whatever host key the server presents
          # right now so the later ssh call does not prompt.
          ssh-keyscan -H "$LIDERRA_HOST" >> ~/.ssh/known_hosts 2>/dev/null

      - name: Run PG diagnostic on prod
        run: |
          # The default `run` shell is `bash -e` without pipefail, so a failed
          # ssh (unreachable host, rejected key) would be masked by tee's exit
          # status 0 and the step would pass with an empty log.
          set -o pipefail
          # BatchMode makes ssh fail instead of prompting; ConnectTimeout keeps
          # an unreachable host from consuming the whole job timeout.
          # 2>&1 routes ssh's own error messages into the captured log too.
          # The remote script runs with `set +e`, so one failing probe does
          # not stop the remaining sections from being collected.
          ssh -i ~/.ssh/liderra_deploy \
            -o BatchMode=yes -o ConnectTimeout=15 \
            "${LIDERRA_USER}@${LIDERRA_HOST}" \
            "bash -s" <<'REMOTE' 2>&1 | tee /tmp/pg-diagnose.log
          set +e
          echo "=== 1. hostname + UTC time ==="
          echo "host=$(hostname); utc=$(date -u)"
          echo
          echo "=== 2. uptime ==="
          uptime
          echo
          echo "=== 3. last reboot ==="
          who -b
          last reboot --time-format=iso | head -5
          echo
          echo "=== 4. df -h / and /var ==="
          df -h / /var /var/lib/postgresql 2>&1 | head -10
          echo
          echo "=== 5. free -h ==="
          free -h
          echo
          echo "=== 6. systemctl status postgresql ==="
          sudo systemctl status postgresql --no-pager 2>&1 | head -30
          echo
          echo "=== 7. systemctl status postgresql@16-main (cluster) ==="
          sudo systemctl status postgresql@16-main --no-pager 2>&1 | head -30
          echo
          echo "=== 8. nginx + php-fpm status (one-line each) ==="
          sudo systemctl is-active nginx php8.3-fpm liderra-queue 2>&1
          echo
          echo "=== 9. ps aux | postgres (top 15) ==="
          ps auxf | grep -E "(postgres|recovery)" | grep -v grep | head -15
          echo
          echo "=== 10. journalctl postgresql last 80 lines ==="
          sudo journalctl -u postgresql -n 80 --no-pager 2>&1 | tail -80
          echo
          echo "=== 11. journalctl postgresql@16-main last 80 lines ==="
          sudo journalctl -u postgresql@16-main -n 80 --no-pager 2>&1 | tail -80
          echo
          echo "=== 12. tail -100 /var/log/postgresql/postgresql-16-main.log ==="
          sudo tail -100 /var/log/postgresql/postgresql-16-main.log 2>&1
          echo
          echo "=== 13. WAL size and count ==="
          sudo du -sh /var/lib/postgresql/16/main/pg_wal 2>&1
          sudo ls /var/lib/postgresql/16/main/pg_wal 2>&1 | wc -l
          echo
          echo "=== 14. dmesg tail (kernel events, OOM, IO errors) ==="
          sudo dmesg -T 2>&1 | tail -40
          echo
          echo "=== 15. liderra.ru HTTPS probe ==="
          curl -sI -o /dev/null -w "HTTP %{http_code}\nTotal: %{time_total}s\n" https://liderra.ru/ --max-time 10
          echo
          echo "=== DONE ==="
          REMOTE

      - name: Print summary
        # always(): publish whatever was captured even if the diagnostic
        # step failed.
        if: always()
        run: |
          # Append the captured log to the job summary inside a fenced code
          # block; fall back to a placeholder when the log cannot be read.
          log_file=/tmp/pg-diagnose.log
          fence='```'
          {
            printf '%s\n\n' "## PG diagnostic on liderra.ru"
            printf '%s\n' "$fence"
            cat "$log_file" 2>/dev/null || echo "(no log captured)"
            printf '%s\n' "$fence"
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Cleanup SSH key
        # always(): remove the private key from the runner even when an
        # earlier step failed.
        if: always()
        run: rm -f -- "$HOME/.ssh/liderra_deploy"