name: Disk usage alert (prod liderra.ru)

# Incident prevention: on 29.05.2026 the disk filled to 100% within a day,
# which caused 4h of prod downtime.
# This workflow checks `df` for / on the prod host every 30 minutes.
#   >= threshold (default 85%) -> severity=warning, alert in the step summary.
#   >= 95%                     -> severity=critical, alert in the step summary
#                                 and the job fails.
#
# Ref: docs/incidents/2026-05-29-disk-full-pg-recovery.md §5

on:
  schedule:
    # Every 30 minutes, every day: at :00 and :30 of each hour (UTC).
    - cron: '*/30 * * * *'
  workflow_dispatch:
    inputs:
      threshold:
        description: 'Override threshold % (default 85)'
        required: false
        default: '85'
        type: string

# No step in this workflow uses GITHUB_TOKEN, so grant it nothing.
permissions: {}

jobs:
  check:
    runs-on: ubuntu-latest
    timeout-minutes: 3
    env:
      LIDERRA_HOST: '111.88.246.137'
      LIDERRA_USER: ubuntu
      THRESHOLD: ${{ github.event.inputs.threshold || '85' }}
    steps:
      - name: Setup SSH key
        env:
          # Passed through the environment instead of being interpolated into
          # the script text, so the key never becomes part of the shell source.
          SSH_KEY: ${{ secrets.LIDERRA_SSH_KEY }}
        run: |
          # Create the key file private from the start (no window before chmod).
          umask 077
          mkdir -p ~/.ssh
          printf '%s\n' "$SSH_KEY" > ~/.ssh/liderra_deploy
          chmod 600 ~/.ssh/liderra_deploy
          # NOTE(review): the host key is fetched at run time (trust on first
          # use on every run); consider pinning it in a secret instead.
          ssh-keyscan -H "$LIDERRA_HOST" >> ~/.ssh/known_hosts 2>/dev/null

      - name: Check disk usage on prod
        id: check
        run: |
          set -o pipefail

          # The threshold may come from a free-form workflow_dispatch input:
          # accept digits only, then force base 10 so "085" is not read as octal.
          if [[ ! "$THRESHOLD" =~ ^[0-9]+$ ]]; then
            echo "::error::Invalid threshold '$THRESHOLD' (expected an integer percentage)"
            exit 1
          fi
          THRESHOLD=$((10#$THRESHOLD))

          # BatchMode: fail instead of prompting. ConnectTimeout: do not burn
          # the whole job timeout on an unreachable host.
          # df -P keeps each filesystem on one line, so NR==2 is the data row.
          OUTPUT=$(ssh -i ~/.ssh/liderra_deploy \
            -o BatchMode=yes -o ConnectTimeout=15 \
            "${LIDERRA_USER}@${LIDERRA_HOST}" \
            "df -hP / | awk 'NR==2 {gsub(\"%\",\"\",\$5); print \$2\" \"\$3\" \"\$4\" \"\$5}'")
          read -r SIZE USED AVAIL PCT <<< "$OUTPUT"

          # Validate before comparing: a non-numeric value would make the
          # numeric tests below error out and fall through to "ok".
          if [[ ! "$PCT" =~ ^[0-9]+$ ]]; then
            echo "::error::Could not parse df output"
            exit 1
          fi

          echo "size=$SIZE used=$USED avail=$AVAIL pct=$PCT"
          {
            echo "pct=$PCT"
            echo "size=$SIZE"
            echo "used=$USED"
            echo "avail=$AVAIL"
          } >> "$GITHUB_OUTPUT"

          if [[ "$PCT" -ge 95 ]]; then
            echo "severity=critical" >> "$GITHUB_OUTPUT"
            echo "::error::Disk usage CRITICAL: $PCT% (size=$SIZE used=$USED avail=$AVAIL)"
          elif [[ "$PCT" -ge "$THRESHOLD" ]]; then
            echo "severity=warning" >> "$GITHUB_OUTPUT"
            echo "::warning::Disk usage HIGH: $PCT% (threshold $THRESHOLD%, size=$SIZE used=$USED avail=$AVAIL)"
          else
            echo "severity=ok" >> "$GITHUB_OUTPUT"
            echo "::notice::Disk usage OK: $PCT% (size=$SIZE used=$USED avail=$AVAIL)"
          fi

      - name: Record incident if >= threshold
        if: steps.check.outputs.severity != 'ok'
        env:
          # Step outputs originate from the remote host; hand them over as
          # environment variables rather than pasting them into the script.
          PCT: ${{ steps.check.outputs.pct }}
          SIZE: ${{ steps.check.outputs.size }}
          USED: ${{ steps.check.outputs.used }}
          AVAIL: ${{ steps.check.outputs.avail }}
          SEVERITY: ${{ steps.check.outputs.severity }}
        run: |
          # Note: the incidents_log table requires an INSERT path through the
          # Laravel app. The GitHub Step Summary serves as the primary alert;
          # the Telegram bot watches GitHub Actions notifications.
          # Future: extend the sql-runner whitelist for INSERT into incidents_log.
          {
            echo "## 🚨 Disk usage alert — severity=$SEVERITY ($PCT%)"
            echo
            echo "- Host: $LIDERRA_HOST"
            echo "- Filesystem: /"
            echo "- Size: $SIZE"
            echo "- Used: $USED"
            echo "- Available: $AVAIL"
            echo "- Threshold: $THRESHOLD%"
            echo "- Time UTC: $(date -u)"
            echo
            echo "**Action required:** Investigate via pg-diagnose.yml workflow."
            echo
            echo "Likely causes (from incident 2026-05-29):"
            echo "- /var/www/liderra/app/storage/logs/laravel.log — Laravel exception accumulation"
            echo "- /var/log/postgresql/postgresql-16-main.log — pg_audit verbose logging"
            echo "- /var/log/syslog — kernel + service logs"
            echo "- /var/www/.cache/ — dev caches leaked to prod"
          } >> "$GITHUB_STEP_SUMMARY"

          # Fail the job so GitHub Actions marks the run red; that is surfaced
          # through GitHub notifications (email/desktop/Telegram bot).
          # Only critical fails the job; a warning leaves the run green.
          if [[ "$SEVERITY" == "critical" ]]; then
            exit 1
          fi

      - name: Cleanup SSH key
        # Runs even when an earlier step failed, so the key is always removed.
        if: always()
        run: rm -f ~/.ssh/liderra_deploy