Files
portal/.github/workflows/disk-recover.yml
T
Дмитрий 7f7036f3ab ops(incident): disk-recover v2 — laravel.log 8.7G + sudo bash redirect для PG log
ремонт: v1 освободил только 440M (apt clean + nginx gz); главный виновник — laravel.log 8.7G + syslog 525M + playwright cache 440M; sudo truncate на PG log дал Permission denied — workaround через sudo bash -c ': > file'

Targeted fixes for v1 issues:
- laravel.log 8.7G + laravel.log.1 572M → truncate via sudo bash redirect
- syslog 525M → truncate
- PG log 497M → workaround via sudo bash redirect (sudo truncate gave Permission denied)
- /var/www/.cache/ms-playwright ~440M → removed (dev cache, not needed in prod)
2026-05-29 12:48:04 +03:00

214 lines
8.0 KiB
YAML

name: Disk-full recovery on liderra.ru
# Incident response: PG в PANIC loop из-за / диск 100%.
# 1) Диагностика: что где лежит (top-20 крупных, du по /var/log)
# 2) Безопасная чистка:
# - truncate /var/log/postgresql/postgresql-16-main.log (PG в PANIC, не пишет, inode preserved)
# - journalctl --vacuum-size=200M
# - старые ротированные *.gz логи nginx >7 дней
# - apt-get clean
# - Laravel storage/logs *.log >7 дней
# 3) Final df check + PG probe.
#
# Триггер: gh workflow run disk-recover.yml -f confirm_apply=true
on:
workflow_dispatch:
inputs:
confirm_apply:
description: 'Подтверждаю удаление логов на проде'
required: true
default: 'false'
type: boolean
jobs:
recover:
runs-on: ubuntu-latest
timeout-minutes: 10
env:
LIDERRA_HOST: 111.88.246.137
LIDERRA_USER: ubuntu
CONFIRM: ${{ github.event.inputs.confirm_apply }}
steps:
- name: Guard
run: |
if [[ "$CONFIRM" != "true" ]]; then
echo "::error::confirm_apply=true required (this workflow mutates disk on prod)"
exit 1
fi
- name: Setup SSH key
run: |
mkdir -p ~/.ssh
echo "${{ secrets.LIDERRA_SSH_KEY }}" > ~/.ssh/liderra_deploy
chmod 600 ~/.ssh/liderra_deploy
ssh-keyscan -H ${{ env.LIDERRA_HOST }} >> ~/.ssh/known_hosts 2>/dev/null
- name: Diagnose + cleanup
run: |
ssh -i ~/.ssh/liderra_deploy ${{ env.LIDERRA_USER }}@${{ env.LIDERRA_HOST }} \
"bash -s" <<'REMOTE' | tee /tmp/recover.log
set +e
echo "=== A. BEFORE: df -h / ==="
df -h / /var /var/lib/postgresql 2>&1 | head -10
echo
echo "=== B. Top-20 largest files in /var (>50M) ==="
sudo find /var -xdev -type f -size +50M -printf "%s %p\n" 2>/dev/null | sort -rn | head -20 | awk '{printf "%8.1f MB %s\n", $1/1024/1024, $2}'
echo
echo "=== C. du /var/log/ top-15 directories ==="
sudo du -sh /var/log/*/ 2>/dev/null | sort -rh | head -15
echo
echo "=== D. du /var/log/postgresql/* (individual files) ==="
sudo du -sh /var/log/postgresql/* 2>/dev/null | sort -rh | head -10
echo
echo "=== E. journalctl disk usage ==="
sudo journalctl --disk-usage 2>&1
echo
echo "=== F. /var/lib/postgresql/16/main top-15 subdirs ==="
sudo du -sh /var/lib/postgresql/16/main/*/ 2>/dev/null | sort -rh | head -15
echo
echo "=== G. /var/www top-10 if exists ==="
sudo du -sh /var/www/*/ 2>/dev/null | sort -rh | head -10
sudo du -sh /var/www/lidpotok/storage/logs/ 2>/dev/null
echo
echo "=== H. apt cache + tmp ==="
sudo du -sh /var/cache/apt/archives/ /tmp/ /var/tmp/ 2>/dev/null
echo
echo "=========================================="
echo "=== STARTING CLEANUP (confirm_apply=true) ==="
echo "=========================================="
echo
echo "=== 1a. PRIORITY: Truncate laravel.log (8.7 GB!) and rotated copies ==="
for f in /var/www/liderra/app/storage/logs/laravel.log /var/www/liderra/app/storage/logs/laravel.log.1; do
if [[ -f "$f" ]]; then
BEFORE=$(sudo du -m "$f" | cut -f1)
echo "BEFORE: $f = $BEFORE MB"
sudo bash -c ": > '$f'" 2>&1 || sudo truncate -s 0 "$f"
AFTER=$(sudo du -m "$f" | cut -f1)
echo "AFTER: $f = $AFTER MB"
fi
done
# Старые laravel-* (если daily-rotated)
sudo find /var/www/liderra/app/storage/logs -name "laravel-*.log" -mtime +3 -print -delete 2>&1 | head -10
echo
echo "=== 1b. Truncate PG audit log via sudo bash redirect (workaround) ==="
if [[ -f /var/log/postgresql/postgresql-16-main.log ]]; then
BEFORE=$(sudo du -m /var/log/postgresql/postgresql-16-main.log | cut -f1)
echo "BEFORE: $BEFORE MB"
sudo bash -c ': > /var/log/postgresql/postgresql-16-main.log' 2>&1
AFTER=$(sudo du -m /var/log/postgresql/postgresql-16-main.log | cut -f1)
echo "AFTER: $AFTER MB"
fi
sudo find /var/log/postgresql -type f \( -name "*.gz" -o -name "*.log.[0-9]*" \) -delete 2>&1
echo
echo "=== 1c. Truncate syslog (525M) ==="
sudo bash -c ': > /var/log/syslog' 2>&1
echo "syslog now: $(sudo du -m /var/log/syslog 2>/dev/null | cut -f1) MB"
echo
echo "=== 1d. Remove playwright dev cache (~440M, не нужен в проде) ==="
if [[ -d /var/www/.cache/ms-playwright ]]; then
sudo du -sh /var/www/.cache/ms-playwright 2>&1
sudo rm -rf /var/www/.cache/ms-playwright
echo "removed"
fi
echo
echo "=== 2. journalctl vacuum --size=200M ==="
sudo journalctl --vacuum-size=200M 2>&1 | tail -10
echo
echo "=== 3. nginx old rotated logs (gz files >3 days) ==="
sudo find /var/log/nginx -name "*.gz" -mtime +3 -print -delete 2>&1 | head -20
echo
# current access.log если >500M — truncate (nginx переоткрывает по reopen signal)
for f in /var/log/nginx/access.log /var/log/nginx/error.log; do
if [[ -f "$f" ]]; then
SIZE_MB=$(sudo du -m "$f" | cut -f1)
if [[ $SIZE_MB -gt 500 ]]; then
echo "Truncating $f ($SIZE_MB MB)"
sudo truncate -s 0 "$f"
fi
fi
done
echo
echo "=== 4. apt-get clean ==="
sudo apt-get clean 2>&1 | tail -5
echo
echo "=== 5. Laravel storage/logs *.log older 7 days ==="
if [[ -d /var/www/lidpotok ]]; then
sudo find /var/www/lidpotok -path '*/storage/logs/*.log' -mtime +7 -print -delete 2>&1 | head -20
fi
for d in /var/www/*/; do
if [[ -d "$d/storage/logs" ]]; then
for f in "$d"/storage/logs/laravel.log "$d"/storage/logs/worker.log; do
if [[ -f "$f" ]]; then
SIZE_MB=$(sudo du -m "$f" | cut -f1)
if [[ $SIZE_MB -gt 200 ]]; then
echo "Truncating $f ($SIZE_MB MB)"
sudo truncate -s 0 "$f"
fi
fi
done
fi
done
echo
echo "=== 6. Old rotated *.1 *.2 *.gz logs >50M anywhere in /var/log ==="
sudo find /var/log -type f \( -name "*.1" -o -name "*.2" -o -name "*.3" -o -name "*.gz" \) -size +50M -print -delete 2>&1 | head -20
echo
echo "=========================================="
echo "=== AFTER CLEANUP ==="
echo "=========================================="
echo "=== Z1. df -h / ==="
df -h / /var /var/lib/postgresql 2>&1 | head -10
echo
echo "=== Z2. PG status quick check ==="
sudo systemctl status postgresql@16-main --no-pager 2>&1 | head -10
echo
echo "=== Z3. PG probe ==="
sleep 5
sudo -u postgres psql -d liderra -c "SELECT 1 AS probe, NOW() AS ts" 2>&1
echo
echo "=== Z4. HTTPS probe ==="
curl -sI -o /dev/null -w "HTTP %{http_code}\nTotal: %{time_total}s\n" https://liderra.ru/ --max-time 10
echo
echo "=== DONE ==="
REMOTE
- name: Print summary
if: always()
run: |
{
echo "## Disk recovery on liderra.ru"
echo
echo '```'
cat /tmp/recover.log 2>/dev/null || echo "(no log captured)"
echo '```'
} >> "$GITHUB_STEP_SUMMARY"
- name: Cleanup SSH key
if: always()
run: rm -f ~/.ssh/liderra_deploy