Compare commits

...

4 Commits

Author SHA1 Message Date
Дмитрий 8cbb84e1bc ops(incident): pg-diagnose workflow for PostgreSQL recovery diagnosis
ремонт: PG не отвечает 20+ мин, нужен диагностический workflow

Read-only SSH-based diagnostic for PG-not-accepting-connections incident:
systemctl/journalctl/df/free/uptime + tail /var/log/postgresql/postgresql-16-main.log
+ WAL size + dmesg + HTTPS probe of liderra.ru.

Triggered manually via gh workflow run pg-diagnose.yml --ref <branch>.
No production mutations.
2026-05-29 12:34:48 +03:00
Дмитрий 9a62aada30 ops(incident): pg-diagnose workflow for PostgreSQL recovery diagnosis
Read-only SSH-based diagnostic for PostgreSQL accept-connections incident:
systemctl/journalctl/df/free/uptime + tail /var/log/postgresql/postgresql-16-main.log
+ WAL size + dmesg + HTTPS probe of liderra.ru.

Triggered manually via gh workflow run pg-diagnose.yml --ref <branch>.
No production mutations.
2026-05-29 12:25:12 +03:00
Дмитрий 8b6961ec88 docs(CLAUDE.md): v2.40 router-gate Уровень 4 implementation phase started
4 commits 29.05.2026:
- d4f7e681 condensed spec plan-ready v3.5 (1489→1117 строк)
- 71b07e52 external adversarial audit 51 findings + 8 MUST inline
- 8b60a182 51-task implementation plan 3814 строк / 6 phases
- f7b4b98e (feat) Phase 1 Task 1 nodeMatches + 4 vitest GREEN

Key findings:
- 4 раунда внутреннего audit'а НЕ закрыли §5.1 Bash content
  class — external 3-Sonnet-adversary audit нашёл 6 BYPASS-COMPLETE
- Windows path normalization 3 BYPASS-COMPLETE (UNC/8.3/$VAR)
- AskUser option label триггерит Поведение 1 design-level flaw
- Sprint 6 misrouting reincidence на Sonnet
- Implementation budget 22.5-32h = multi-session epic

§0 cross-refs не меняются (implementation в tools/, не реестр).
§6 +абзац top / §9 +entry.
cspell-words.txt: +reincidence, +misroute.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-29 12:14:53 +03:00
Дмитрий f7b4b98e0d feat(router-gate): nodeMatches() pure function for recommendation/node match
Migrated from tools/enforce-classifier-match.mjs:42-66 as part of
router-gate Phase 1 Task 1.

Plan: docs/superpowers/plans/2026-05-29-router-gate-hard-wall.md
Spec: docs/superpowers/specs/2026-05-29-router-gate-hard-wall-design-condensed.md

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-29 10:38:10 +03:00
6 changed files with 159 additions and 18 deletions
+96
View File
@@ -0,0 +1,96 @@
name: Diagnose PostgreSQL state on liderra.ru
# Read-only diagnostic for the "PG is not accepting connections" incident.
# Run manually: gh workflow run pg-diagnose.yml --ref <branch>
# Intended to change nothing on prod: it only reads systemctl/journalctl/df/free/uptime,
# tails the last 100 lines of postgresql-16-main.log, and reports WAL size,
# the dmesg tail and an HTTPS probe of liderra.ru.
#
# Step overview:
#   1. Setup SSH key          - writes secrets.LIDERRA_SSH_KEY to ~/.ssh/liderra_deploy (mode 600)
#                               and appends the host key obtained by ssh-keyscan to known_hosts.
#   2. Run PG diagnostic      - feeds the heredoc script to a remote `bash -s` over SSH and mirrors
#                               its output into /tmp/pg-diagnose.log via tee. The script starts with
#                               `set +e`, so later sections still run when an earlier command fails.
#   3. Print summary          - runs with `if: always()`; appends the captured log (or a placeholder)
#                               to $GITHUB_STEP_SUMMARY inside a fenced block.
#   4. Cleanup SSH key        - runs with `if: always()`; deletes the key file from the runner.
#
# NOTE(review): the ssh command is piped into tee, so the step's exit status may reflect tee
# rather than ssh unless pipefail is active for this shell - confirm against the runner's
# default shell options.
on:
workflow_dispatch:
jobs:
diagnose:
runs-on: ubuntu-latest
timeout-minutes: 5
env:
LIDERRA_HOST: 111.88.246.137
LIDERRA_USER: ubuntu
steps:
- name: Setup SSH key
run: |
mkdir -p ~/.ssh
echo "${{ secrets.LIDERRA_SSH_KEY }}" > ~/.ssh/liderra_deploy
chmod 600 ~/.ssh/liderra_deploy
ssh-keyscan -H ${{ env.LIDERRA_HOST }} >> ~/.ssh/known_hosts 2>/dev/null
- name: Run PG diagnostic on prod
run: |
ssh -i ~/.ssh/liderra_deploy ${{ env.LIDERRA_USER }}@${{ env.LIDERRA_HOST }} \
"bash -s" <<'REMOTE' | tee /tmp/pg-diagnose.log
set +e
echo "=== 1. hostname + UTC time ==="
echo "host=$(hostname); utc=$(date -u)"
echo
echo "=== 2. uptime ==="
uptime
echo
echo "=== 3. last reboot ==="
who -b
last reboot --time-format=iso | head -5
echo
echo "=== 4. df -h / and /var ==="
df -h / /var /var/lib/postgresql 2>&1 | head -10
echo
echo "=== 5. free -h ==="
free -h
echo
echo "=== 6. systemctl status postgresql ==="
sudo systemctl status postgresql --no-pager 2>&1 | head -30
echo
echo "=== 7. systemctl status postgresql@16-main (cluster) ==="
sudo systemctl status postgresql@16-main --no-pager 2>&1 | head -30
echo
echo "=== 8. nginx + php-fpm status (one-line each) ==="
sudo systemctl is-active nginx php8.3-fpm liderra-queue 2>&1
echo
echo "=== 9. ps aux | postgres (top 15) ==="
ps auxf | grep -E "(postgres|recovery)" | grep -v grep | head -15
echo
echo "=== 10. journalctl postgresql last 80 lines ==="
sudo journalctl -u postgresql -n 80 --no-pager 2>&1 | tail -80
echo
echo "=== 11. journalctl postgresql@16-main last 80 lines ==="
sudo journalctl -u postgresql@16-main -n 80 --no-pager 2>&1 | tail -80
echo
echo "=== 12. tail -100 /var/log/postgresql/postgresql-16-main.log ==="
sudo tail -100 /var/log/postgresql/postgresql-16-main.log 2>&1
echo
echo "=== 13. WAL size and count ==="
sudo du -sh /var/lib/postgresql/16/main/pg_wal 2>&1
sudo ls /var/lib/postgresql/16/main/pg_wal 2>&1 | wc -l
echo
echo "=== 14. dmesg tail (kernel events, OOM, IO errors) ==="
sudo dmesg -T 2>&1 | tail -40
echo
echo "=== 15. liderra.ru HTTPS probe ==="
curl -sI -o /dev/null -w "HTTP %{http_code}\nTotal: %{time_total}s\n" https://liderra.ru/ --max-time 10
echo
echo "=== DONE ==="
REMOTE
- name: Print summary
if: always()
run: |
{
echo "## PG diagnostic on liderra.ru"
echo
echo '```'
cat /tmp/pg-diagnose.log 2>/dev/null || echo "(no log captured)"
echo '```'
} >> "$GITHUB_STEP_SUMMARY"
- name: Cleanup SSH key
if: always()
run: rm -f ~/.ssh/liderra_deploy
+4 -2
View File
File diff suppressed because one or more lines are too long
+2
View File
@@ -1890,3 +1890,5 @@ deplo
Ctemp
UNC
EACCES
reincidence
misroute
+17 -16
View File
@@ -1,6 +1,6 @@
# Brain Status (auto-generated)
Last updated: 2026-05-29T06:21:28.317Z
Last updated: 2026-05-29T09:25:20.043Z
| Контролёр | Состояние | Детали |
|---|---|---|
@@ -8,13 +8,13 @@ Last updated: 2026-05-29T06:21:28.317Z
| C2 Cross-ref consistency | ✅ | [cross-ref-checker] OK — 0 drift in 4 files |
| C3 Observer-of-observer | ✅ | [observer-of-observer] OK — last read 0 week(s) ago |
| C4 Сигнальный статус | ✅ | This file (self-reference) |
| C5 Observer-coverage | ⚠️ | 696 episode(s) this month · Stop-hook + post-commit OK · 20 missed activation(s) — see /brain-retro |
| C5 Observer-coverage | ⚠️ | 618 episode(s) this month · Stop-hook + post-commit OK · 20 missed activation(s) — see /brain-retro |
| C6 Chain map sync | ✅ | [chain-map-checker] OK — 16 chains in sync |
## Метрики (информационные, не алерты)
- Observer evidence: 696 episodes this month, 0 observer_error markers, 143 PII matches before filter
- Legacy v1 episodes (not in factor analysis): 557
- Observer evidence: 618 episodes this month, 0 observer_error markers, 126 PII matches before filter
- Legacy v1 episodes (not in factor analysis): 479
- Last /brain-retro: 2 day(s) ago
- Использование узлов: см. `/brain-retro` (раз в спринт). missed_activations: 20. **Неиспользованные узлы — не алерт, если профильной задачи не было** (Pravila §16.4 v1.36; capability-readiness; см. memory `feedback_brain_unused_tools_not_problem` — outside-repo memory store).
@@ -24,16 +24,16 @@ Baseline дисциплины роутера (этап 2 router discipline overh
| Тип задачи | Эпизодов | % с триггер-матчем | % через скил |
|---|---|---|---|
| analysis | 33 | 27.3% | 18.2% |
| planning | 19 | 15.8% | 15.8% |
| analysis | 26 | 30.8% | 15.4% |
| bugfix | 18 | 22.2% | 27.8% |
| feature | 17 | 11.8% | 0.0% |
| planning | 17 | 17.6% | 17.6% |
| feature | 15 | 13.3% | 0.0% |
| cleanup | 6 | 0.0% | 0.0% |
| refactor | 1 | 0.0% | 0.0% |
Router step distribution: 1: 311, 2: 245, 3: 64, 5: 64
Router step distribution: 1: 264, 2: 227, 3: 60, 5: 60
Boundaries applied (ADR / границы): 76 of 684 эпизодов (11.1%).
Boundaries applied (ADR / границы): 73 of 611 эпизодов (11.9%).
## Активные многоэтапные проекты
@@ -51,10 +51,10 @@ Boundaries applied (ADR / границы): 76 of 684 эпизодов (11.1%).
| Компонент | Токены (in/out) | USD |
|---|---|---|
| Classifier (Sonnet 4.6) | 4802/63364 | $0.96 |
| Classifier (Sonnet 4.6) | 2358/31963 | $0.49 |
| Self-assessment (Sonnet 4.6) | 0/0 | $0.00 |
| Reviewer (Opus 4.7 + fallback) | 0/0 | $0.00 |
| **Итого** | | **$0.96** |
| **Итого** | | **$0.49** |
## Аномалии классификатора
@@ -67,7 +67,7 @@ Episodes since last run: 542 / threshold: 10
## Reviewer: субагент vs fallback
0 эпизодов проверено из 696.
0 эпизодов проверено из 618.
## Reviewer findings
@@ -109,11 +109,11 @@ Episodes since last run: 542 / threshold: 10
| Фраза | За всё время | За сегодня |
|---|---|---|
| `recovery` | 1364 | 467 ⚠️ |
| `recovery` | 1380 | 483 ⚠️ |
| `без скилов` | 265 | 87 ⚠️ |
| `ремонт инфраструктуры` | 229 | 44 ⚠️ |
| `срочно` | 148 | 55 ⚠️ |
| `memory dump` | 17 | 0 |
| `срочно` | 187 | 94 ⚠️ |
| `memory dump` | 22 | 5 ⚠️ |
| `direct ok` | 6 | 0 |
| `быстрый коммит` | 3 | 0 |
@@ -123,7 +123,8 @@ Episodes since last run: 542 / threshold: 10
| PID | Имя | CPU-время | Возраст |
|---|---|---|---|
| 3464 | MsMpEng | 2.04ч | NaNч |
| 3464 | MsMpEng | 2.14ч | NaNч |
| 16128 | Code | 1.03ч | NaNч |
⚠️ Проверь, не «осиротевшие» ли это процессы от завершённых Claude-сессий.
+12
View File
@@ -0,0 +1,12 @@
/**
 * Decide whether a router recommendation points at a given registry node.
 *
 * The recommendation (for example "#19", "superpowers:writing-plans" or
 * "writing-plans") is compared with strict equality against the node's
 * `id`, `slug` and `name`, in that order; the first hit wins.
 *
 * @param {*} recommendation - Value produced by the router; any falsy value never matches.
 * @param {?{id?: *, slug?: *, name?: *}} node - Registry node; a falsy node never matches.
 * @returns {boolean} `true` when one of the three identifiers is strictly equal
 *   to `recommendation`, otherwise `false`.
 */
export function nodeMatches(recommendation, node) {
  const hasBothInputs = Boolean(recommendation) && Boolean(node);
  if (!hasBothInputs) {
    return false;
  }

  // Checked lazily and in this order so property reads mirror a chained `||` comparison.
  const identifierFields = ['id', 'slug', 'name'];
  return identifierFields.some((field) => node[field] === recommendation);
}
+28
View File
@@ -0,0 +1,28 @@
import { describe, it, expect } from 'vitest';
import { nodeMatches } from './router-gate-decide.mjs';
// Table-driven suite for nodeMatches: each row is
// [test title, recommendation, node, expected result].
describe('nodeMatches', () => {
  // Builds a fresh registry node per row so no case shares an object with another.
  const makeNode = () => ({ name: 'writing-plans', id: '#19', slug: 'superpowers:writing-plans' });

  const cases = [
    ['matches #NN to node.id', '#19', makeNode(), true],
    ['matches superpowers:X to canonical slug', 'superpowers:writing-plans', makeNode(), true],
    ['matches by name', 'writing-plans', makeNode(), true],
    ['rejects mismatch', '#20', makeNode(), false],
    ['handles null recommendation', null, makeNode(), false],
    ['handles null node', '#19', null, false],
  ];

  for (const [title, recommendation, node, expected] of cases) {
    it(title, () => {
      expect(nodeMatches(recommendation, node)).toBe(expected);
    });
  }
});