diff --git a/.github/workflows/pg-diagnose.yml b/.github/workflows/pg-diagnose.yml
new file mode 100644
index 00000000..73c6a399
--- /dev/null
+++ b/.github/workflows/pg-diagnose.yml
@@ -0,0 +1,122 @@
+name: Diagnose PostgreSQL state on liderra.ru
+
+# Read-only diagnostic for the "PG is not accepting connections" incident.
+# Run manually: gh workflow run pg-diagnose.yml --ref <branch>
+# Changes nothing on prod: it only reads systemctl/journalctl/df/free/uptime
+# output plus the last 100 lines of postgresql-16-main.log.
+#
+# NOTE(review): the captured output is appended to the job summary; confirm
+# that everyone who can view this repository's runs may see prod logs.
+
+on:
+  workflow_dispatch:
+
+# No step uses GITHUB_TOKEN, so grant it no permissions at all.
+permissions: {}
+
+jobs:
+  diagnose:
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+
+    # An explicit `shell: bash` makes GitHub run each step with `-eo pipefail`,
+    # so a failed `ssh` is not masked by the `tee` at the end of its pipeline.
+    defaults:
+      run:
+        shell: bash
+
+    env:
+      LIDERRA_HOST: '111.88.246.137'
+      LIDERRA_USER: ubuntu
+
+    steps:
+      - name: Setup SSH key
+        env:
+          # Passed via the environment rather than interpolated into the
+          # script text, so the shell never parses the key material.
+          LIDERRA_SSH_KEY: ${{ secrets.LIDERRA_SSH_KEY }}
+        run: |
+          mkdir -p ~/.ssh
+          # umask 077 keeps the key file owner-only from the moment it exists.
+          (umask 077; printf '%s\n' "$LIDERRA_SSH_KEY" > ~/.ssh/liderra_deploy)
+          chmod 600 ~/.ssh/liderra_deploy
+          # NOTE(review): ssh-keyscan accepts whatever host key the network
+          # returns; consider pinning the known host key instead.
+          ssh-keyscan -H "$LIDERRA_HOST" >> ~/.ssh/known_hosts 2>/dev/null
+
+      - name: Run PG diagnostic on prod
+        run: |
+          # BatchMode fails instead of prompting; ConnectTimeout bounds the
+          # wait when the host is unreachable.
+          ssh -i ~/.ssh/liderra_deploy \
+            -o BatchMode=yes -o ConnectTimeout=15 \
+            "$LIDERRA_USER@$LIDERRA_HOST" \
+            "bash -s" <<'REMOTE' | tee /tmp/pg-diagnose.log
+          # Best effort: keep going so one failing probe does not hide the rest.
+          set +e
+          echo "=== 1. hostname + UTC time ==="
+          echo "host=$(hostname); utc=$(date -u)"
+          echo
+          echo "=== 2. uptime ==="
+          uptime
+          echo
+          echo "=== 3. last reboot ==="
+          who -b
+          last reboot --time-format=iso | head -5
+          echo
+          echo "=== 4. df -h / and /var ==="
+          df -h / /var /var/lib/postgresql 2>&1 | head -10
+          echo
+          echo "=== 5. free -h ==="
+          free -h
+          echo
+          echo "=== 6. systemctl status postgresql ==="
+          sudo systemctl status postgresql --no-pager 2>&1 | head -30
+          echo
+          echo "=== 7. systemctl status postgresql@16-main (cluster) ==="
+          sudo systemctl status postgresql@16-main --no-pager 2>&1 | head -30
+          echo
+          echo "=== 8. nginx + php-fpm status (one-line each) ==="
+          sudo systemctl is-active nginx php8.3-fpm liderra-queue 2>&1
+          echo
+          echo "=== 9. ps aux | postgres (top 15) ==="
+          ps auxf | grep -E "(postgres|recovery)" | grep -v grep | head -15
+          echo
+          echo "=== 10. journalctl postgresql last 80 lines ==="
+          sudo journalctl -u postgresql -n 80 --no-pager 2>&1 | tail -80
+          echo
+          echo "=== 11. journalctl postgresql@16-main last 80 lines ==="
+          sudo journalctl -u postgresql@16-main -n 80 --no-pager 2>&1 | tail -80
+          echo
+          echo "=== 12. tail -100 /var/log/postgresql/postgresql-16-main.log ==="
+          sudo tail -100 /var/log/postgresql/postgresql-16-main.log 2>&1
+          echo
+          echo "=== 13. WAL size and count ==="
+          sudo du -sh /var/lib/postgresql/16/main/pg_wal 2>&1
+          sudo ls /var/lib/postgresql/16/main/pg_wal 2>&1 | wc -l
+          echo
+          echo "=== 14. dmesg tail (kernel events, OOM, IO errors) ==="
+          sudo dmesg -T 2>&1 | tail -40
+          echo
+          echo "=== 15. liderra.ru HTTPS probe ==="
+          curl -sI -o /dev/null -w "HTTP %{http_code}\nTotal: %{time_total}s\n" https://liderra.ru/ --max-time 10
+          echo
+          echo "=== DONE ==="
+          REMOTE
+
+      # always(): publish whatever was captured even if the ssh step failed.
+      - name: Print summary
+        if: always()
+        run: |
+          {
+            echo "## PG diagnostic on liderra.ru"
+            echo
+            echo '```'
+            cat /tmp/pg-diagnose.log 2>/dev/null || echo "(no log captured)"
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      # always(): never leave the private key on the runner.
+      - name: Cleanup SSH key
+        if: always()
+        run: rm -f ~/.ssh/liderra_deploy
diff --git a/docs/observer/STATUS.md b/docs/observer/STATUS.md index 910bed10..1db6da83 100644 --- a/docs/observer/STATUS.md +++ b/docs/observer/STATUS.md @@ -1,6 +1,6 @@ # Brain Status (auto-generated) -Last updated: 2026-05-29T09:15:00.940Z +Last updated: 2026-05-29T09:25:20.043Z | Контролёр | Состояние | Детали | |---|---|---| @@ -8,13 +8,13 @@ Last updated: 2026-05-29T09:15:00.940Z | C2 Cross-ref consistency | ✅ | [cross-ref-checker] OK — 0 drift in 4 files | | C3 Observer-of-observer | ✅ | [observer-of-observer] OK — last read 0 week(s) ago | | C4 Сигнальный статус | ✅ | This file (self-reference) | -| C5 Observer-coverage | ⚠️ | 617 episode(s) this month · Stop-hook + post-commit OK · 20 missed activation(s) — see /brain-retro | +| C5 Observer-coverage | ⚠️ | 618 episode(s) this month · Stop-hook + post-commit OK · 20 missed 
activation(s) — see /brain-retro | | C6 Chain map sync | ✅ | [chain-map-checker] OK — 16 chains in sync | ## Метрики (информационные, не алерты) -- Observer evidence: 617 episodes this month, 0 observer_error markers, 126 PII matches before filter -- Legacy v1 episodes (not in factor analysis): 478 +- Observer evidence: 618 episodes this month, 0 observer_error markers, 126 PII matches before filter +- Legacy v1 episodes (not in factor analysis): 479 - Last /brain-retro: 2 day(s) ago - Использование узлов: см. `/brain-retro` (раз в спринт). missed_activations: 20. **Неиспользованные узлы — не алерт, если профильной задачи не было** (Pravila §16.4 v1.36; capability-readiness; см. memory `feedback_brain_unused_tools_not_problem` — outside-repo memory store). @@ -26,14 +26,14 @@ Baseline дисциплины роутера (этап 2 router discipline overh |---|---|---|---| | analysis | 26 | 30.8% | 15.4% | | bugfix | 18 | 22.2% | 27.8% | -| planning | 16 | 18.8% | 18.8% | +| planning | 17 | 17.6% | 17.6% | | feature | 15 | 13.3% | 0.0% | | cleanup | 6 | 0.0% | 0.0% | | refactor | 1 | 0.0% | 0.0% | -Router step distribution: 1: 264, 2: 226, 3: 60, 5: 60 +Router step distribution: 1: 264, 2: 227, 3: 60, 5: 60 -Boundaries applied (ADR / границы): 73 of 610 эпизодов (12.0%). +Boundaries applied (ADR / границы): 73 of 611 эпизодов (11.9%). ## Активные многоэтапные проекты @@ -51,10 +51,10 @@ Boundaries applied (ADR / границы): 73 of 610 эпизодов (12.0%). | Компонент | Токены (in/out) | USD | |---|---|---| -| Classifier (Sonnet 4.6) | 2339/31433 | $0.48 | +| Classifier (Sonnet 4.6) | 2358/31963 | $0.49 | | Self-assessment (Sonnet 4.6) | 0/0 | $0.00 | | Reviewer (Opus 4.7 + fallback) | 0/0 | $0.00 | -| **Итого** | | **$0.48** | +| **Итого** | | **$0.49** | ## Аномалии классификатора @@ -67,7 +67,7 @@ Episodes since last run: 542 / threshold: 10 ## Reviewer: субагент vs fallback -0 эпизодов проверено из 617. +0 эпизодов проверено из 618. 
## Reviewer findings @@ -109,7 +109,7 @@ Episodes since last run: 542 / threshold: 10 | Фраза | За всё время | За сегодня | |---|---|---| -| `recovery` | 1372 | 475 ⚠️ | +| `recovery` | 1380 | 483 ⚠️ | | `без скилов` | 265 | 87 ⚠️ | | `ремонт инфраструктуры` | 229 | 44 ⚠️ | | `срочно` | 187 | 94 ⚠️ | @@ -124,6 +124,7 @@ Episodes since last run: 542 / threshold: 10 | PID | Имя | CPU-время | Возраст | |---|---|---|---| | 3464 | MsMpEng | 2.14ч | NaNч | +| 16128 | Code | 1.03ч | NaNч | ⚠️ Проверь, не «осиротевшие» ли это процессы от завершённых Claude-сессий.