7f7036f3ab
ремонт: v1 освободил только 440M (apt clean + nginx gz); главный виновник — laravel.log 8.7G + syslog 525M + playwright cache 440M; sudo truncate на PG log дал Permission denied — workaround через sudo bash -c ': > file' Targeted fixes for v1 issues: - laravel.log 8.7G + laravel.log.1 572M → truncate via sudo bash redirect - syslog 525M → truncate - PG log 497M → workaround via sudo bash redirect (sudo truncate gave Permission denied) - /var/www/.cache/ms-playwright ~440M → removed (dev cache, not needed in prod)
214 lines
8.0 KiB
YAML
214 lines
8.0 KiB
YAML
name: Disk-full recovery on liderra.ru

# Incident response: PostgreSQL is stuck in a PANIC loop because / is 100% full.
#
# The job connects to the server over SSH and runs one remote script that:
#   1) Diagnoses disk usage: df, the largest files under /var, du of /var/log,
#      the PostgreSQL data directory, /var/www, the apt cache and tmp dirs.
#   2) Frees space:
#      - truncates laravel.log / laravel.log.1 under
#        /var/www/liderra/app/storage/logs and deletes laravel-*.log files
#        older than 3 days
#      - truncates /var/log/postgresql/postgresql-16-main.log in place (the
#        file is emptied, not deleted) and deletes rotated PostgreSQL logs
#      - truncates /var/log/syslog
#      - removes /var/www/.cache/ms-playwright
#      - journalctl --vacuum-size=200M
#      - deletes rotated nginx *.gz logs older than 3 days; truncates
#        access.log / error.log when larger than 500 MB
#      - apt-get clean
#      - deletes storage/logs *.log older than 7 days under /var/www/lidpotok;
#        truncates laravel.log / worker.log larger than 200 MB in any
#        /var/www/*/storage/logs
#      - deletes rotated logs (*.1 *.2 *.3 *.gz) larger than 50 MB in /var/log
#   3) Reports the result: final df, PostgreSQL status, SQL probe, HTTPS probe.
#
# Trigger: gh workflow run disk-recover.yml -f confirm_apply=true

on:
  workflow_dispatch:
    inputs:
      # Explicit opt-in; the description reads
      # "I confirm deleting logs on production".
      confirm_apply:
        type: boolean
        description: 'Подтверждаю удаление логов на проде'
        required: true
        default: 'false'
|
|
|
|
jobs:
  recover:
    # The whole recovery is capped at ten minutes.
    timeout-minutes: 10
    runs-on: ubuntu-latest

    env:
      # Server address and login used by the SSH steps.
      LIDERRA_HOST: 111.88.246.137
      LIDERRA_USER: ubuntu
      # Value of the confirm_apply input; the Guard step compares it to "true".
      CONFIRM: ${{ github.event.inputs.confirm_apply }}

    steps:
|
|
- name: Guard
  run: |
    # Let the run continue only when the operator explicitly confirmed it;
    # anything other than the literal string "true" fails the step.
    if [[ "$CONFIRM" == "true" ]]; then
      exit 0
    fi
    echo "::error::confirm_apply=true required (this workflow mutates disk on prod)"
    exit 1
|
|
|
|
- name: Setup SSH key
  env:
    # Hand the key to the shell through the environment instead of pasting it
    # into the script text, where the shell would re-parse its contents.
    LIDERRA_SSH_KEY: ${{ secrets.LIDERRA_SSH_KEY }}
  run: |
    mkdir -p ~/.ssh
    # Create the key file with owner-only permissions from the first byte;
    # the subshell keeps the umask change local to this one command.
    (
      umask 077
      printf '%s\n' "$LIDERRA_SSH_KEY" > ~/.ssh/liderra_deploy
    )
    # Also covers the case where the file already existed with wider permissions.
    chmod 600 ~/.ssh/liderra_deploy
    # Append the server's host keys (hashed, -H) to known_hosts.
    # stderr of ssh-keyscan is discarded.
    ssh-keyscan -H "$LIDERRA_HOST" >> ~/.ssh/known_hosts 2>/dev/null
|
|
|
|
- name: Diagnose + cleanup
  run: |
    # Without pipefail the status of `ssh ... | tee` is tee's, so a failed SSH
    # connection or login would still leave this step green.
    set -o pipefail
    # The quoted heredoc delimiter sends the script to the remote bash verbatim
    # (nothing is expanded on the runner); output is mirrored into
    # /tmp/recover.log.
    ssh -i ~/.ssh/liderra_deploy "${LIDERRA_USER}@${LIDERRA_HOST}" \
      "bash -s" <<'REMOTE' | tee /tmp/recover.log
    # Remote script is best effort: keep going when a single command fails.
    set +e
|
|
|
|
# ---- Diagnostics: report where the disk space went before cleaning ----

echo "=== A. BEFORE: df -h / ==="
df -h / /var /var/lib/postgresql 2>&1 | head -10
echo

echo "=== B. Top-20 largest files in /var (>50M) ==="
# Strip only the leading size field and print the rest of the line, so a path
# that contains spaces is shown whole instead of being cut at the first space.
sudo find /var -xdev -type f -size +50M -printf "%s %p\n" 2>/dev/null \
  | sort -rn \
  | head -20 \
  | awk '{size = $1; sub(/^[^ ]+ /, ""); printf "%8.1f MB %s\n", size/1024/1024, $0}'
echo

echo "=== C. du /var/log/ top-15 directories ==="
sudo du -sh /var/log/*/ 2>/dev/null | sort -rh | head -15
echo

echo "=== D. du /var/log/postgresql/* (individual files) ==="
sudo du -sh /var/log/postgresql/* 2>/dev/null | sort -rh | head -10
echo

echo "=== E. journalctl disk usage ==="
sudo journalctl --disk-usage 2>&1
echo

echo "=== F. /var/lib/postgresql/16/main top-15 subdirs ==="
sudo du -sh /var/lib/postgresql/16/main/*/ 2>/dev/null | sort -rh | head -15
echo

echo "=== G. /var/www top-10 if exists ==="
sudo du -sh /var/www/*/ 2>/dev/null | sort -rh | head -10
sudo du -sh /var/www/lidpotok/storage/logs/ 2>/dev/null
# Also report the log directory that cleanup section 1a truncates.
sudo du -sh /var/www/liderra/app/storage/logs/ 2>/dev/null
echo

echo "=== H. apt cache + tmp ==="
sudo du -sh /var/cache/apt/archives/ /tmp/ /var/tmp/ 2>/dev/null
echo
|
|
|
|
# ---- Priority cleanup: the largest known offenders ----

echo "=========================================="
echo "=== STARTING CLEANUP (confirm_apply=true) ==="
echo "=========================================="
echo

echo "=== 1a. PRIORITY: Truncate laravel.log (8.7 GB!) and rotated copies ==="
laravel_logs=/var/www/liderra/app/storage/logs
for f in "$laravel_logs/laravel.log" "$laravel_logs/laravel.log.1"; do
  if [[ -f "$f" ]]; then
    BEFORE=$(sudo du -m "$f" | cut -f1)
    echo "BEFORE: $f = $BEFORE MB"
    # The path is passed as a positional argument rather than spliced into
    # the command string; truncate(1) is the fallback if the redirect fails.
    sudo bash -c ': > "$1"' _ "$f" 2>&1 || sudo truncate -s 0 "$f"
    AFTER=$(sudo du -m "$f" | cut -f1)
    echo "AFTER: $f = $AFTER MB"
  fi
done
# Old laravel-*.log files (present when daily rotation is used).
# awk reads find's output to the end and prints the first ten lines; `head`
# would close the pipe early and find could die of SIGPIPE with files left
# undeleted.
sudo find "$laravel_logs" -name "laravel-*.log" -mtime +3 -print -delete 2>&1 | awk 'NR <= 10'
echo

echo "=== 1b. Truncate PG audit log via sudo bash redirect (workaround) ==="
pg_log=/var/log/postgresql/postgresql-16-main.log
if [[ -f "$pg_log" ]]; then
  BEFORE=$(sudo du -m "$pg_log" | cut -f1)
  echo "BEFORE: $BEFORE MB"
  # Emptied through a root shell redirect: this section is the workaround
  # used in place of `sudo truncate` for this file.
  sudo bash -c ': > "$1"' _ "$pg_log" 2>&1
  AFTER=$(sudo du -m "$pg_log" | cut -f1)
  echo "AFTER: $AFTER MB"
fi
# Rotated PostgreSQL logs (compressed or numbered) are removed outright.
sudo find /var/log/postgresql -type f \( -name "*.gz" -o -name "*.log.[0-9]*" \) -delete 2>&1
echo

echo "=== 1c. Truncate syslog (525M) ==="
# Only truncate an existing file: a bare redirect would otherwise create a
# new root-owned /var/log/syslog on hosts that do not have one.
if [[ -f /var/log/syslog ]]; then
  sudo bash -c ': > /var/log/syslog' 2>&1
fi
echo "syslog now: $(sudo du -m /var/log/syslog 2>/dev/null | cut -f1) MB"
echo

echo "=== 1d. Remove playwright dev cache (~440M, не нужен в проде) ==="
if [[ -d /var/www/.cache/ms-playwright ]]; then
  sudo du -sh /var/www/.cache/ms-playwright 2>&1
  sudo rm -rf /var/www/.cache/ms-playwright
  echo "removed"
fi
echo
|
|
|
|
# ---- General cleanup: journal, nginx, apt cache, Laravel logs, rotated logs ----
#
# Every `find ... -print -delete` below is piped into awk rather than `head`.
# awk reads the listing to the end and prints only the first lines, whereas
# `head` closes the pipe early and find could die of SIGPIPE with files still
# undeleted.

echo "=== 2. journalctl vacuum --size=200M ==="
sudo journalctl --vacuum-size=200M 2>&1 | tail -10
echo

echo "=== 3. nginx old rotated logs (gz files >3 days) ==="
sudo find /var/log/nginx -name "*.gz" -mtime +3 -print -delete 2>&1 | awk 'NR <= 20'
echo
# Live access/error logs are truncated in place when larger than 500 MB.
for f in /var/log/nginx/access.log /var/log/nginx/error.log; do
  if [[ -f "$f" ]]; then
    SIZE_MB=$(sudo du -m "$f" | cut -f1)
    if [[ "$SIZE_MB" -gt 500 ]]; then
      echo "Truncating $f ($SIZE_MB MB)"
      # Fall back to a root shell redirect, the workaround this script uses
      # for the PostgreSQL log, if truncate(1) is refused.
      sudo truncate -s 0 "$f" || sudo bash -c ': > "$1"' _ "$f"
    fi
  fi
done

echo
echo "=== 4. apt-get clean ==="
sudo apt-get clean 2>&1 | tail -5
echo

echo "=== 5. Laravel storage/logs *.log older 7 days ==="
if [[ -d /var/www/lidpotok ]]; then
  sudo find /var/www/lidpotok -path '*/storage/logs/*.log' -mtime +7 -print -delete 2>&1 | awk 'NR <= 20'
fi
# For every site directly under /var/www: truncate laravel.log / worker.log
# when larger than 200 MB.
for d in /var/www/*/; do
  site=${d%/}   # drop the glob's trailing slash so printed paths have no "//"
  if [[ -d "$site/storage/logs" ]]; then
    for f in "$site/storage/logs/laravel.log" "$site/storage/logs/worker.log"; do
      if [[ -f "$f" ]]; then
        SIZE_MB=$(sudo du -m "$f" | cut -f1)
        if [[ "$SIZE_MB" -gt 200 ]]; then
          echo "Truncating $f ($SIZE_MB MB)"
          sudo truncate -s 0 "$f" || sudo bash -c ': > "$1"' _ "$f"
        fi
      fi
    done
  fi
done
echo

echo "=== 6. Old rotated *.1 *.2 *.gz logs >50M anywhere in /var/log ==="
sudo find /var/log -type f \( -name "*.1" -o -name "*.2" -o -name "*.3" -o -name "*.gz" \) -size +50M -print -delete 2>&1 | awk 'NR <= 20'
echo
|
|
|
|
# ---- Post-cleanup report: disk usage, PostgreSQL and HTTPS checks ----

printf '%s\n' "=========================================="
printf '%s\n' "=== AFTER CLEANUP ==="
printf '%s\n' "=========================================="
printf '%s\n' "=== Z1. df -h / ==="
df -h / /var /var/lib/postgresql 2>&1 | head -10
printf '\n'

printf '%s\n' "=== Z2. PG status quick check ==="
sudo systemctl status postgresql@16-main --no-pager 2>&1 | head -10
printf '\n'

printf '%s\n' "=== Z3. PG probe ==="
# Short pause before querying the database.
sleep 5
sudo -u postgres psql -d liderra -c "SELECT 1 AS probe, NOW() AS ts" 2>&1
printf '\n'

printf '%s\n' "=== Z4. HTTPS probe ==="
# Headers only, body discarded; prints the status code and total time.
curl -sI -o /dev/null --max-time 10 -w "HTTP %{http_code}\nTotal: %{time_total}s\n" https://liderra.ru/
printf '\n'

printf '%s\n' "=== DONE ==="
REMOTE
|
|
|
|
- name: Print summary
  if: always()
  run: |
    # Publish the captured remote output as a fenced block in the job summary;
    # runs even when earlier steps failed.
    fence='```'
    {
      printf '%s\n' "## Disk recovery on liderra.ru" ""
      printf '%s\n' "$fence"
      cat /tmp/recover.log 2>/dev/null || echo "(no log captured)"
      printf '%s\n' "$fence"
    } >> "$GITHUB_STEP_SUMMARY"
|
|
|
|
- name: Cleanup SSH key
  if: always()
  run: |
    # Remove the private key from the runner whether or not earlier steps succeeded.
    rm -f ~/.ssh/liderra_deploy
|