name: Disk usage alert (prod liderra.ru)

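# Incident prevention: on 29.05.2026 the disk filled up to 100% within a day → 4h prod downtime.
# This workflow checks usage of the root filesystem (/) on the prod host every 30 minutes.
# Threshold: 85% → severity=warning; the alert is written to the GitHub Step Summary
#   (a row in incidents_log for ops monitoring is planned, but this workflow does not insert it yet).
# 95% → marked as severity=critical and the job fails, for a priority alert.
#
# Ref: docs/incidents/2026-05-29-disk-full-pg-recovery.md §5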

on:
  schedule:
    # Every 30 minutes, every day of the week: at :00 and :30 of each hour (UTC).
    - cron: "*/30 * * * *"
  # Manual runs may override the warning threshold; scheduled runs use the default.
  workflow_dispatch:
    inputs:
      threshold:
        description: "Override threshold % (default 85)"
        type: string
        required: false
        default: "85"

jobs:
  # Single job: SSH to the prod host, read root filesystem usage, and raise an
  # alert (step summary + job failure on critical) when it is too high.
  check:
    runs-on: ubuntu-latest
    timeout-minutes: 3
    # The job never calls the GitHub API or checks out the repository, so the
    # GITHUB_TOKEN is granted no scopes at all (least privilege).
    permissions: {}

    env:
      # Quoted so the address is always treated as a plain string.
      LIDERRA_HOST: "111.88.246.137"
      LIDERRA_USER: ubuntu
      # Warning threshold in percent: the manual-dispatch input when present,
      # otherwise 85 (scheduled runs have no inputs).
      THRESHOLD: ${{ github.event.inputs.threshold || '85' }}

    steps:
      # Writes the deploy key to ~/.ssh/liderra_deploy and registers the prod
      # host's public keys in known_hosts so later ssh calls are non-interactive.
      - name: Setup SSH key
        env:
          # Passed through the environment instead of being interpolated into
          # the script text, so the key material is never parsed as shell syntax.
          LIDERRA_SSH_KEY: ${{ secrets.LIDERRA_SSH_KEY }}
        run: |
          set -euo pipefail

          if [[ -z "$LIDERRA_SSH_KEY" ]]; then
            echo "::error::Secret LIDERRA_SSH_KEY is empty or not available to this run"
            exit 1
          fi

          # Restrictive umask: the key file is created private from the start,
          # not world-readable until a later chmod.
          umask 077
          mkdir -p ~/.ssh
          chmod 700 ~/.ssh
          printf '%s\n' "$LIDERRA_SSH_KEY" > ~/.ssh/liderra_deploy
          chmod 600 ~/.ssh/liderra_deploy

          # NOTE(review): scanning the host key at run time is trust-on-first-use;
          # pinning a known host key would be stronger — consider for follow-up.
          # stderr is discarded, so an empty result is the only failure signal:
          # report it here rather than as a confusing host-key error later.
          HOST_KEYS=$(ssh-keyscan -H "$LIDERRA_HOST" 2>/dev/null || true)
          if [[ -z "$HOST_KEYS" ]]; then
            echo "::error::ssh-keyscan returned no host keys for $LIDERRA_HOST"
            exit 1
          fi
          printf '%s\n' "$HOST_KEYS" >> ~/.ssh/known_hosts

      # Reads usage of / on the prod host and classifies it.
      # Outputs: pct, size, used, avail, severity (ok | warning | critical).
      - name: Check disk usage on prod
        id: check
        run: |
          set -euo pipefail

          # THRESHOLD may come from a manual dispatch input. Require a plain
          # integer before it reaches any arithmetic comparison, and force
          # base 10 so a value such as "090" is not read as octal.
          if [[ ! "$THRESHOLD" =~ ^[0-9]{1,3}$ ]]; then
            echo "::error::Invalid threshold '$THRESHOLD' (expected an integer percentage)"
            exit 1
          fi
          THRESHOLD=$((10#$THRESHOLD))

          # df -P keeps each filesystem on one line even with long device
          # names, so row 2 is always the data row. Parsing happens locally:
          # no nested quoting, and pipefail makes an ssh failure fail the step.
          # BatchMode/ConnectTimeout make ssh fail fast instead of prompting
          # or hanging.
          OUTPUT=$(
            ssh -i ~/.ssh/liderra_deploy -o BatchMode=yes -o ConnectTimeout=15 \
              "${LIDERRA_USER}@${LIDERRA_HOST}" 'df -hP /' \
              | awk 'NR==2 {gsub("%", "", $5); print $2, $3, $4, $5}'
          )
          read -r SIZE USED AVAIL PCT <<< "$OUTPUT"
          echo "size=$SIZE used=$USED avail=$AVAIL pct=$PCT"

          # Reject empty or non-numeric percentages before comparing them or
          # publishing them as step outputs.
          if [[ ! "$PCT" =~ ^[0-9]+$ ]]; then
            echo "::error::Could not parse df output"
            exit 1
          fi
          PCT=$((10#$PCT))

          {
            echo "pct=$PCT"
            echo "size=$SIZE"
            echo "used=$USED"
            echo "avail=$AVAIL"
          } >> "$GITHUB_OUTPUT"

          # 95% is always critical, regardless of the configured threshold.
          if [[ "$PCT" -ge 95 ]]; then
            echo "severity=critical" >> "$GITHUB_OUTPUT"
            echo "::error::Disk usage CRITICAL: $PCT% (size=$SIZE used=$USED avail=$AVAIL)"
          elif [[ "$PCT" -ge "$THRESHOLD" ]]; then
            echo "severity=warning" >> "$GITHUB_OUTPUT"
            echo "::warning::Disk usage HIGH: $PCT% (threshold $THRESHOLD%, size=$SIZE used=$USED avail=$AVAIL)"
          else
            echo "severity=ok" >> "$GITHUB_OUTPUT"
            echo "::notice::Disk usage OK: $PCT% (size=$SIZE used=$USED avail=$AVAIL)"
          fi

      # Publishes the alert to the run's step summary and fails the job when
      # the severity is critical. Skipped when the check reported severity=ok.
      - name: Record incident if >= threshold
        if: steps.check.outputs.severity != 'ok'
        env:
          # Step outputs originate from the remote host's df output; they are
          # handed over as environment variables rather than interpolated into
          # the script text, so they can never be parsed as shell code.
          PCT: ${{ steps.check.outputs.pct }}
          SIZE: ${{ steps.check.outputs.size }}
          USED: ${{ steps.check.outputs.used }}
          AVAIL: ${{ steps.check.outputs.avail }}
          SEVERITY: ${{ steps.check.outputs.severity }}
        run: |
          set -euo pipefail

          # Note: the incidents_log table requires the INSERT path through the
          # Laravel app. The GitHub Step Summary serves as the primary alert;
          # a Telegram bot watches GitHub Actions notifications. Future: extend
          # the sql-runner whitelist to allow INSERT into incidents_log.
          {
            echo "## 🚨 Disk usage alert — severity=$SEVERITY ($PCT%)"
            echo
            echo "- Host: $LIDERRA_HOST"
            echo "- Filesystem: /"
            echo "- Size: $SIZE"
            echo "- Used: $USED"
            echo "- Available: $AVAIL"
            echo "- Threshold: $THRESHOLD%"
            echo "- Time UTC: $(date -u)"
            echo
            echo "**Action required:** Investigate via pg-diagnose.yml workflow."
            echo
            echo "Likely causes (from incident 2026-05-29):"
            echo "- /var/www/liderra/app/storage/logs/laravel.log — Laravel exception accumulation"
            echo "- /var/log/postgresql/postgresql-16-main.log — pg_audit verbose logging"
            echo "- /var/log/syslog — kernel + service logs"
            echo "- /var/www/.cache/ — dev caches leaked to prod"
          } >> "$GITHUB_STEP_SUMMARY"

          # Fail the job so GitHub Actions marks the run red; this is surfaced
          # through GitHub notifications (email/desktop/Telegram bot).
          # NOTE(review): only critical fails the job, so a warning-level run
          # still finishes green — confirm that warnings are noticed by someone.
          if [[ "$SEVERITY" == "critical" ]]; then
            exit 1
          fi

      # Always runs, even after a failed step, so the private key file is
      # removed from the runner before the job ends.
      - name: Cleanup SSH key
        if: always()
        run: |
          rm -f "${HOME}/.ssh/liderra_deploy"