f187425835
ремонт: PG не отвечает 20+ мин, нужен диагностический workflow
Read-only SSH-based diagnostic for PG-not-accepting-connections incident:
systemctl/journalctl/df/free/uptime + tail /var/log/postgresql/postgresql-16-main.log
+ WAL size + dmesg + HTTPS probe of liderra.ru.
Triggered manually via gh workflow run pg-diagnose.yml.
No production mutations.
(Cherry-picked from feat/router-gate-hard-wall 8cbb84e1 — gh workflow run
requires file on default branch.)
97 lines
3.5 KiB
YAML
97 lines
3.5 KiB
YAML
name: Diagnose PostgreSQL state on liderra.ru

# Read-only diagnostic for the "PostgreSQL is not accepting connections"
# incident.
# Triggered manually: gh workflow run pg-diagnose.yml --ref <branch>
# Changes nothing on production: it only reads
# systemctl/journalctl/df/free/uptime output, the last 100 lines of
# postgresql-16-main.log, the pg_wal directory size, the dmesg tail, and
# sends one HTTPS HEAD probe to liderra.ru.

on:
  workflow_dispatch:

# No step uses GITHUB_TOKEN, so remove all of its default scopes.
permissions: {}

jobs:
  diagnose:
    runs-on: ubuntu-latest
    # Hard cap so a hung SSH session cannot hold the runner.
    timeout-minutes: 5

    # Job-level env: exported to every step's shell as $LIDERRA_HOST / $LIDERRA_USER.
    env:
      LIDERRA_HOST: "111.88.246.137"
      LIDERRA_USER: ubuntu

    steps:
      - name: Setup SSH key
        # Pass the secret through the environment instead of expanding it
        # into the script text, so its content is never parsed as shell code.
        env:
          LIDERRA_SSH_KEY: ${{ secrets.LIDERRA_SSH_KEY }}
        run: |
          # Create ~/.ssh and the key file owner-only from the start
          # (no window where the private key is readable by others).
          umask 077
          mkdir -p ~/.ssh
          printf '%s\n' "$LIDERRA_SSH_KEY" > ~/.ssh/liderra_deploy
          chmod 600 ~/.ssh/liderra_deploy
          # Record the host key so the ssh call below does not prompt.
          ssh-keyscan -H "$LIDERRA_HOST" >> ~/.ssh/known_hosts 2>/dev/null

      - name: Run PG diagnostic on prod
        run: |
          # Without pipefail the step's status is tee's (always 0), which
          # would report success even when ssh could not connect.
          set -o pipefail
          # BatchMode: fail instead of waiting on a password/passphrase prompt.
          # ConnectTimeout: give up quickly if the host is unreachable.
          # The quoted heredoc is sent verbatim to the remote bash; the remote
          # script runs with `set +e` so one failing probe does not stop the rest.
          ssh -i ~/.ssh/liderra_deploy \
            -o BatchMode=yes -o ConnectTimeout=15 \
            "${LIDERRA_USER}@${LIDERRA_HOST}" \
            "bash -s" <<'REMOTE' | tee /tmp/pg-diagnose.log
          set +e
          echo "=== 1. hostname + UTC time ==="
          echo "host=$(hostname); utc=$(date -u)"
          echo
          echo "=== 2. uptime ==="
          uptime
          echo
          echo "=== 3. last reboot ==="
          who -b
          last reboot --time-format=iso | head -5
          echo
          echo "=== 4. df -h / and /var ==="
          df -h / /var /var/lib/postgresql 2>&1 | head -10
          echo
          echo "=== 5. free -h ==="
          free -h
          echo
          echo "=== 6. systemctl status postgresql ==="
          sudo systemctl status postgresql --no-pager 2>&1 | head -30
          echo
          echo "=== 7. systemctl status postgresql@16-main (cluster) ==="
          sudo systemctl status postgresql@16-main --no-pager 2>&1 | head -30
          echo
          echo "=== 8. nginx + php-fpm status (one-line each) ==="
          sudo systemctl is-active nginx php8.3-fpm liderra-queue 2>&1
          echo
          echo "=== 9. ps aux | postgres (top 15) ==="
          ps auxf | grep -E "(postgres|recovery)" | grep -v grep | head -15
          echo
          echo "=== 10. journalctl postgresql last 80 lines ==="
          sudo journalctl -u postgresql -n 80 --no-pager 2>&1 | tail -80
          echo
          echo "=== 11. journalctl postgresql@16-main last 80 lines ==="
          sudo journalctl -u postgresql@16-main -n 80 --no-pager 2>&1 | tail -80
          echo
          echo "=== 12. tail -100 /var/log/postgresql/postgresql-16-main.log ==="
          sudo tail -100 /var/log/postgresql/postgresql-16-main.log 2>&1
          echo
          echo "=== 13. WAL size and count ==="
          sudo du -sh /var/lib/postgresql/16/main/pg_wal 2>&1
          sudo ls /var/lib/postgresql/16/main/pg_wal 2>&1 | wc -l
          echo
          echo "=== 14. dmesg tail (kernel events, OOM, IO errors) ==="
          sudo dmesg -T 2>&1 | tail -40
          echo
          echo "=== 15. liderra.ru HTTPS probe ==="
          curl -sI -o /dev/null -w "HTTP %{http_code}\nTotal: %{time_total}s\n" https://liderra.ru/ --max-time 10
          echo
          echo "=== DONE ==="
          REMOTE

      - name: Print summary
        # Publish whatever was captured even if the diagnostic step failed.
        if: always()
        run: |
          {
            echo "## PG diagnostic on liderra.ru"
            echo
            echo '```'
            cat /tmp/pg-diagnose.log 2>/dev/null || echo "(no log captured)"
            echo '```'
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Cleanup SSH key
        # Always remove the private key from the runner, regardless of outcome.
        if: always()
        run: rm -f ~/.ssh/liderra_deploy