8fde6a3b50
ремонт: prevent recurrence of 29.05 disk-full incident GitHub Actions cron */30 min: ssh + df -h /. Threshold 85% → warning, 95% → critical (job fails, GitHub notifications fire). Output: GITHUB_STEP_SUMMARY with size/used/avail + likely causes from incident. Future: extend sql-runner whitelist для INSERT into incidents_log (post-Б-1 Sentry/Telegram bot integration).
110 lines
4.3 KiB
YAML
110 lines
4.3 KiB
YAML
name: Disk usage alert (prod liderra.ru)

# Incident prevention: on 29.05.2026 the disk filled up to 100% within a day,
# which caused 4h of prod downtime.
# This workflow checks `df -h /` on the prod host every 30 minutes.
# Thresholds:
#   85% (default, overridable on manual runs) → severity=warning: a warning
#        annotation and a job summary are produced; the job still succeeds.
#   95% → severity=critical: same summary, and the job fails so that GitHub
#        notifications fire.
# The workflow does not write to incidents_log itself; alerts are surfaced
# through annotations, the step summary and the job status only.
#
# Ref: docs/incidents/2026-05-29-disk-full-pg-recovery.md §5

on:
  schedule:
    # Every 30 minutes, every day of the week: at :00 and :30 of each hour (UTC).
    - cron: '*/30 * * * *'
  # Manual run, optionally with a different warning threshold.
  workflow_dispatch:
    inputs:
      threshold:
        description: 'Override threshold % (default 85)'
        required: false
        default: '85'
        type: string

# None of the steps in this workflow call the GitHub API or check out the
# repository, so the default token permissions are dropped entirely.
permissions: {}

jobs:
  # Single job: SSH to the prod host, read root-filesystem usage with df,
  # classify it as ok / warning / critical, and publish a summary when the
  # usage is at or above the threshold. A critical result fails the job.
  check:
    runs-on: ubuntu-latest
    timeout-minutes: 3

    env:
      LIDERRA_HOST: '111.88.246.137'
      LIDERRA_USER: ubuntu
      # Manual runs may override the warning threshold; scheduled runs have no
      # inputs, so the expression falls back to 85.
      THRESHOLD: ${{ github.event.inputs.threshold || '85' }}

    steps:
      - name: Setup SSH key
        env:
          # Passed through the environment instead of being spliced into the
          # script text, so the key material is never parsed by the shell.
          LIDERRA_SSH_KEY: ${{ secrets.LIDERRA_SSH_KEY }}
        run: |
          # Create ~/.ssh and the key file with owner-only permissions from
          # the start, rather than tightening them after the write.
          umask 077
          mkdir -p ~/.ssh
          printf '%s\n' "$LIDERRA_SSH_KEY" > ~/.ssh/liderra_deploy
          chmod 600 ~/.ssh/liderra_deploy
          # NOTE(review): scanning the host key on every run trusts whatever
          # key the host presents at that moment; consider pinning the known
          # host key instead.
          ssh-keyscan -H "$LIDERRA_HOST" >> ~/.ssh/known_hosts 2>/dev/null

      - name: Check disk usage on prod
        id: check
        run: |
          # Make a failing ssh fail the whole pipeline (and thus the step).
          set -o pipefail

          # df runs remotely; parsing happens on the runner, which avoids
          # nested quoting. -P requests the POSIX one-line-per-filesystem
          # layout, so the data row is always line 2.
          # BatchMode/ConnectTimeout make an unreachable or misconfigured host
          # fail quickly instead of waiting for the job timeout.
          OUTPUT=$(ssh -i ~/.ssh/liderra_deploy \
            -o BatchMode=yes -o ConnectTimeout=20 \
            "${LIDERRA_USER}@${LIDERRA_HOST}" 'df -hP /' \
            | awk 'NR==2 {gsub("%","",$5); print $2, $3, $4, $5}')

          read -r SIZE USED AVAIL PCT <<< "$OUTPUT"
          echo "size=$SIZE used=$USED avail=$AVAIL pct=$PCT"

          # Require plain integers before any numeric comparison: an empty or
          # non-numeric value would otherwise be mis-evaluated by [[ -ge ]]
          # and could be reported as "ok".
          if [[ ! "$PCT" =~ ^[0-9]+$ ]]; then
            echo "::error::Could not parse df output"
            exit 1
          fi
          if [[ ! "$THRESHOLD" =~ ^[0-9]+$ ]]; then
            echo "::error::Threshold must be an integer percentage, got '$THRESHOLD'"
            exit 1
          fi
          # Force base 10 so a value with a leading zero is not read as octal.
          PCT=$((10#$PCT))
          THRESHOLD=$((10#$THRESHOLD))

          # Publish the parsed values for the following step.
          {
            echo "pct=$PCT"
            echo "size=$SIZE"
            echo "used=$USED"
            echo "avail=$AVAIL"
          } >> "$GITHUB_OUTPUT"

          # 95% is the fixed critical level; THRESHOLD is the warning level.
          if [[ "$PCT" -ge 95 ]]; then
            echo "severity=critical" >> "$GITHUB_OUTPUT"
            echo "::error::Disk usage CRITICAL: $PCT% (size=$SIZE used=$USED avail=$AVAIL)"
          elif [[ "$PCT" -ge "$THRESHOLD" ]]; then
            echo "severity=warning" >> "$GITHUB_OUTPUT"
            echo "::warning::Disk usage HIGH: $PCT% (threshold $THRESHOLD%, size=$SIZE used=$USED avail=$AVAIL)"
          else
            echo "severity=ok" >> "$GITHUB_OUTPUT"
            echo "::notice::Disk usage OK: $PCT% (size=$SIZE used=$USED avail=$AVAIL)"
          fi

      - name: Record incident if >= threshold
        if: steps.check.outputs.severity != 'ok'
        env:
          # Step outputs originate from the remote host's df output; hand them
          # over as environment variables rather than interpolating them into
          # the script source.
          PCT: ${{ steps.check.outputs.pct }}
          SIZE: ${{ steps.check.outputs.size }}
          USED: ${{ steps.check.outputs.used }}
          AVAIL: ${{ steps.check.outputs.avail }}
          SEVERITY: ${{ steps.check.outputs.severity }}
        run: |
          # Note: incidents_log table requires INSERT path through Laravel app.
          # GitHub Step Summary serves as primary alert; Telegram bot watches
          # GitHub Actions notifications. Future: extend sql-runner whitelist
          # for INSERT into incidents_log.
          {
            echo "## 🚨 Disk usage alert — severity=$SEVERITY ($PCT%)"
            echo
            echo "- Host: $LIDERRA_HOST"
            echo "- Filesystem: /"
            echo "- Size: $SIZE"
            echo "- Used: $USED"
            echo "- Available: $AVAIL"
            echo "- Threshold: $THRESHOLD%"
            echo "- Time UTC: $(date -u)"
            echo
            echo "**Action required:** Investigate via pg-diagnose.yml workflow."
            echo
            echo "Likely causes (from incident 2026-05-29):"
            echo "- /var/www/liderra/app/storage/logs/laravel.log — Laravel exception accumulation"
            echo "- /var/log/postgresql/postgresql-16-main.log — pg_audit verbose logging"
            echo "- /var/log/syslog — kernel + service logs"
            echo "- /var/www/.cache/ — dev caches leaked to prod"
          } >> "$GITHUB_STEP_SUMMARY"

          # Fail the job so the run shows up red; per the original authors this
          # is what surfaces the alert through GitHub notifications
          # (email/desktop/telegram bot). Warnings leave the job green.
          if [[ "$SEVERITY" == "critical" ]]; then
            exit 1
          fi

      - name: Cleanup SSH key
        # Runs even when an earlier step failed, so the key never outlives the job.
        if: always()
        run: rm -f ~/.ssh/liderra_deploy