diff --git a/docs/observer/STATUS.md b/docs/observer/STATUS.md index ab1f6adc..3fbdffb4 100644 --- a/docs/observer/STATUS.md +++ b/docs/observer/STATUS.md @@ -1,6 +1,6 @@ # Brain Status (auto-generated) -Last updated: 2026-05-27T15:32:59.632Z +Last updated: 2026-05-28T01:50:18.743Z | Контролёр | Состояние | Детали | |---|---|---| @@ -8,15 +8,15 @@ Last updated: 2026-05-27T15:32:59.632Z | C2 Cross-ref consistency | ✅ | [cross-ref-checker] OK — 0 drift in 4 files | | C3 Observer-of-observer | ✅ | [observer-of-observer] OK — last read 0 week(s) ago | | C4 Сигнальный статус | ✅ | This file (self-reference) | -| C5 Observer-coverage | ⚠️ | 697 episode(s) this month · Stop-hook + post-commit OK · 21 missed activation(s) — see /brain-retro | +| C5 Observer-coverage | ⚠️ | 706 episode(s) this month · Stop-hook + post-commit OK · 20 missed activation(s) — see /brain-retro | | C6 Chain map sync | ✅ | [chain-map-checker] OK — 16 chains in sync | ## Метрики (информационные, не алерты) -- Observer evidence: 697 episodes this month, 0 observer_error markers, 155 PII matches before filter -- Legacy v1 episodes (not in factor analysis): 558 +- Observer evidence: 706 episodes this month, 0 observer_error markers, 158 PII matches before filter +- Legacy v1 episodes (not in factor analysis): 567 - Last /brain-retro: 0 day(s) ago -- Использование узлов: см. `/brain-retro` (раз в спринт). missed_activations: 21. **Неиспользованные узлы — не алерт, если профильной задачи не было** (Pravila §16.4 v1.36; capability-readiness; см. memory `feedback_brain_unused_tools_not_problem` — outside-repo memory store). +- Использование узлов: см. `/brain-retro` (раз в спринт). missed_activations: 20. **Неиспользованные узлы — не алерт, если профильной задачи не было** (Pravila §16.4 v1.36; capability-readiness; см. memory `feedback_brain_unused_tools_not_problem` — outside-repo memory store). 
## Метрики дисциплины @@ -24,17 +24,16 @@ Baseline дисциплины роутера (этап 2 router discipline overh | Тип задачи | Эпизодов | % с триггер-матчем | % через скил | |---|---|---|---| -| monitoring | 32 | 0.0% | 0.0% | -| analysis | 27 | 29.6% | 14.8% | +| analysis | 28 | 32.1% | 14.3% | | bugfix | 20 | 20.0% | 25.0% | | planning | 17 | 17.6% | 17.6% | | feature | 16 | 12.5% | 0.0% | | cleanup | 7 | 0.0% | 0.0% | | refactor | 1 | 0.0% | 0.0% | -Router step distribution: 1: 299, 2: 261, 3: 66, 5: 63 +Router step distribution: 1: 301, 2: 263, 3: 71, 5: 63 -Boundaries applied (ADR / границы): 80 of 689 эпизодов (11.6%). +Boundaries applied (ADR / границы): 85 of 698 эпизодов (12.2%). ## Активные многоэтапные проекты @@ -46,23 +45,16 @@ Boundaries applied (ADR / границы): 80 of 689 эпизодов (11.6%). ## Длинные сессии -⚠️ Сегодня (2026-05-27 UTC) есть сессии с ≥50 ходов — корреляция с падением дисциплины роутинга (retro #5 candidate B). - -| session_id | макс. ход | % regulated | последний эпизод | -|---|---|---|---| -| `0ade4c82` | 54 | 9% | 2026-05-27T12:49:21.664Z | -| `b11f6b8d` | 51 | 4% | 2026-05-27T08:32:49.803Z | - -Long sessions correlate with discipline drift. Если % regulated просел в текущей сессии — рассмотри перезапуск. +Ни одной сессии с >50 ходов сегодня (UTC). ✅ ## Стоимость месяца | Компонент | Токены (in/out) | USD | |---|---|---| -| Classifier (Sonnet 4.6) | 6752/65027 | $1.00 | +| Classifier (Sonnet 4.6) | 7050/69610 | $1.07 | | Self-assessment (Sonnet 4.6) | 0/0 | $0.00 | | Reviewer (Opus 4.7 + fallback) | 0/0 | $0.00 | -| **Итого** | | **$1.00** | +| **Итого** | | **$1.07** | ## Аномалии классификатора @@ -75,7 +67,7 @@ Episodes since last run: 609 / threshold: 10 ## Reviewer: субагент vs fallback -0 эпизодов проверено из 697. +0 эпизодов проверено из 706. 
## Reviewer findings @@ -117,13 +109,13 @@ Episodes since last run: 609 / threshold: 10 | Фраза | За всё время | За сегодня | |---|---|---| -| `recovery` | 273 | 179 ⚠️ | -| `ремонт инфраструктуры` | 159 | 88 ⚠️ | -| `срочно` | 82 | 39 ⚠️ | -| `без скилов` | 58 | 32 ⚠️ | -| `memory dump` | 8 | 6 ⚠️ | -| `direct ok` | 6 | 2 | -| `быстрый коммит` | 3 | 2 | +| `recovery` | 273 | 0 | +| `ремонт инфраструктуры` | 181 | 22 ⚠️ | +| `срочно` | 82 | 0 | +| `без скилов` | 58 | 0 | +| `memory dump` | 8 | 0 | +| `direct ok` | 6 | 0 | +| `быстрый коммит` | 3 | 0 | ## Алерт-индикаторы diff --git a/tools/enforce-chain-recommendation.mjs b/tools/enforce-chain-recommendation.mjs new file mode 100644 index 00000000..c6a1e86e --- /dev/null +++ b/tools/enforce-chain-recommendation.mjs @@ -0,0 +1,123 @@ +#!/usr/bin/env node +/** + * Rule — Chain-recommendation enforce. + * + * PreToolUse hook. When the router classifier recommends a multi-step chain + * (>= 2 nodes) and the controller is about to run a mutating tool without + * having invoked ANY node in the chain, block with instructions. + * + * Three escape hatches: + * 1. Call any skill/task matching at least one node in the chain. + * 2. Write chain-override at the start of a line in assistant text. + * 3. User prompt contains a global override phrase (vocab-driven). + * + * Single-node recommendations are handled by enforce-classifier-match.mjs. 
import {
  readStdin,
  parseEventJson,
  readTranscript,
  lastUserPromptText,
  lastAssistantText,
  turnToolUses,
  findOverride,
  logOverride,
  exitDecision,
  readRouterState,
} from './enforce-hook-helpers.mjs';

import { loadRegistry } from './registry-load.mjs';

/** Key under which this rule is looked up by the override helpers. */
const RULE_KEY = 'chain-recommendation';

/** Chains shorter than this are ignored here (single-node case lives in enforce-classifier-match.mjs). */
const CHAIN_MIN_LENGTH = 2;

/** Tool names whose use counts as a mutating action for this rule. */
const MUTATING_TOOLS = new Set(['Edit', 'Write', 'MultiEdit', 'NotebookEdit', 'Bash', 'Task', 'Agent']);

/**
 * Inline escape hatch: a line starting with `chain-override:` followed by a
 * non-empty reason ON THE SAME LINE.
 *
 * Only horizontal whitespace is allowed between the colon and the reason.
 * A plain `\s*` would also consume the line break, so an empty
 * `chain-override:` line followed by any ordinary text on the next line
 * would be accepted as if it carried a reason.
 */
const CHAIN_OVERRIDE_RE = /^chain-override:[^\S\r\n\u2028\u2029]*\S+/m;

/**
 * Pure decision function for the chain-recommendation rule.
 *
 * @param {object} args
 * @param {Array<{name?: string}>} args.toolUses - tool uses of the current turn.
 * @param {string[]} args.recommendedChain - normalized chain ids (e.g. `#19`).
 * @param {Set<string>} args.calledSkillIds - ids / names of skills already invoked.
 * @param {string} args.assistantText - assistant text searched for the inline override.
 * @param {object|null} args.override - global override object; any truthy value suppresses the rule.
 * @returns {{block: false} | {block: true, message: string}}
 */
export function decide({ toolUses, recommendedChain, calledSkillIds, assistantText, override }) {
  // The rule applies only to multi-step chains.
  if (!Array.isArray(recommendedChain) || recommendedChain.length < CHAIN_MIN_LENGTH) {
    return { block: false };
  }

  // Nothing to guard when the turn contains no mutating tool.
  const hasMutating = Array.isArray(toolUses) && toolUses.some((u) => MUTATING_TOOLS.has(u && u.name));
  if (!hasMutating) return { block: false };

  // Escape hatch 3: global override.
  if (override) return { block: false };

  // Escape hatch 1: ANY node of the chain has been invoked.
  if (calledSkillIds instanceof Set) {
    for (const id of recommendedChain) {
      if (calledSkillIds.has(id)) return { block: false };
    }
  }

  // Escape hatch 2: inline `chain-override: <reason>` line in the assistant text.
  if (typeof assistantText === 'string' && CHAIN_OVERRIDE_RE.test(assistantText)) {
    return { block: false };
  }

  const chainStr = recommendedChain.join(' → ');
  const message = [
    `[enforce-chain-recommendation] Router рекомендовал цепочку ${chainStr}, но ни один узел не вызван и нет инлайн-обоснования отказа.`,
    `Сделай ОДНО из трёх:`,
    ` 1. Вызови первый узел цепочки через Skill / Task tool.`,
    ` 2. Добавь в свой ответ строку «chain-override: <одна строка причины>» (не путать с глобальным override от пользователя — это инлайн-объяснение controller-а).`,
    ` 3. Попроси у пользователя глобальный override (без скилов / direct ok / срочно / быстрый коммит / recovery / memory dump / ремонт инфраструктуры).`,
  ].join('\n');
  return { block: true, message };
}

/**
 * Normalizes one raw chain entry: trimmed, lower-cased, `#`-prefixed.
 * Returns '' for null / undefined / blank input so callers can filter it out.
 */
function normalizeChainId(raw) {
  if (raw === null || raw === undefined) return '';
  const s = String(raw).trim().toLowerCase();
  if (!s) return '';
  return s.startsWith('#') ? s : `#${s}`;
}

/** Drops a leading `superpowers:` and then a leading `skill:` prefix from a lower-cased skill name. */
function stripSkillPrefix(name) {
  return name.replace(/^superpowers:/, '').replace(/^skill:/, '');
}

/**
 * Collects every spelling under which a chain entry may show up in a
 * Skill / Task call.
 *
 * Besides the registry-derived slug / name, a chain entry that is itself a
 * slug (not a numeric `#N` id) is matched by its bare and prefix-stripped
 * forms; without this such an entry could never be satisfied by a skill call.
 */
function chainIdAliases(id, registry) {
  const aliases = new Set([id]);

  const bare = typeof id === 'string' && id.startsWith('#') ? id.slice(1) : String(id);
  if (bare && !/^\d+$/.test(bare)) {
    aliases.add(bare);
    aliases.add(stripSkillPrefix(bare));
  }

  if (!registry) return aliases;
  try {
    const node = registry.indexById && registry.indexById.get(id);
    if (!node) return aliases;
    if (node.slug) {
      const slug = String(node.slug).toLowerCase();
      aliases.add(slug);
      aliases.add(`superpowers:${slug}`);
    }
    if (node.name) aliases.add(String(node.name).toLowerCase());
  } catch { /* non-fatal: fall back to whatever aliases were collected */ }
  return aliases;
}

/**
 * Builds the set of skill identifiers invoked in this turn.
 *
 * For every `Skill` call (input.skill) and every `Task` / `Agent` call
 * (input.subagent_type) the set receives the lower-cased name, its
 * prefix-stripped form, and the chain id of every chain entry whose alias
 * set contains either of them.
 */
function extractCalledSkillIds(toolUses, normalizedChain, registry) {
  const aliasMap = new Map();
  for (const id of normalizedChain) aliasMap.set(id, chainIdAliases(id, registry));

  const called = new Set();
  if (!Array.isArray(toolUses)) return called;

  for (const u of toolUses) {
    if (!u || !u.name) continue;
    let rawName = null;
    if (u.name === 'Skill') {
      rawName = (u.input && u.input.skill) ? String(u.input.skill) : null;
    } else if (u.name === 'Task' || u.name === 'Agent') {
      rawName = (u.input && u.input.subagent_type) ? String(u.input.subagent_type) : null;
    }
    if (!rawName) continue;

    const norm = rawName.toLowerCase().trim();
    const stripped = stripSkillPrefix(norm);
    called.add(norm);
    called.add(stripped);
    for (const [chainId, aliases] of aliasMap) {
      if (aliases.has(norm) || aliases.has(stripped)) called.add(chainId);
    }
  }
  return called;
}

/**
 * Hook entry point: reads the event from stdin, gathers transcript and router
 * state, and exits with the decision from `decide()`.
 *
 * Any exception results in a non-blocking decision (fail-open).
 */
async function main() {
  try {
    const raw = await readStdin();
    const event = parseEventJson(raw);
    if (!MUTATING_TOOLS.has(event.tool_name)) { exitDecision({ block: false }); return; }

    const transcript = readTranscript(event.transcript_path);
    const userPrompt = lastUserPromptText(transcript);
    const assistantText = lastAssistantText(transcript);
    const toolUses = turnToolUses(transcript);

    // NOTE(review): the override is logged whenever the phrase is present,
    // even if no multi-step chain was recommended — confirm this is intended.
    const override = findOverride(userPrompt, RULE_KEY);
    if (override) logOverride(RULE_KEY, override, event.session_id);

    const state = readRouterState(event.session_id);
    const cls = state && state.classification;
    const rawChain = (cls && cls.recommended_chain) || [];
    const normalizedChain = Array.isArray(rawChain)
      ? rawChain.map(normalizeChainId).filter(Boolean)
      : [];

    let registry = null;
    try { registry = loadRegistry(); } catch { /* fail-quiet: alias resolution degrades to raw ids */ }

    const calledSkillIds = extractCalledSkillIds(toolUses, normalizedChain, registry);
    exitDecision(decide({ toolUses, recommendedChain: normalizedChain, calledSkillIds, assistantText, override }));
  } catch { exitDecision({ block: false }); }
}

// Run main() only when executed directly, not when imported (e.g. by the tests).
const isCli = process.argv[1] && process.argv[1].replace(/\\/g, '/').endsWith('/enforce-chain-recommendation.mjs');
if (isCli) main();
expect(r.block).toBe(true); + expect(r.message).toMatch(/#19 → #34/); + expect(r.message).toMatch(/chain-override:/); + }); + + // Test 4: chain of 2, first skill called → pass + it('chain of 2, first skill called → pass', () => { + expect(decide({ + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(['#19']), + assistantText: '', + override: null, + }).block).toBe(false); + }); + + // Test 5: chain of 2, second skill called → pass (any one is enough) + it('chain of 2, second skill called → pass (any one is enough)', () => { + expect(decide({ + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(['#34']), + assistantText: '', + override: null, + }).block).toBe(false); + }); + + // Test 6: chain of 2, valid chain-override present → pass + it('chain of 2, chain-override with reason present → pass', () => { + expect(decide({ + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(), + assistantText: 'chain-override: трёхшаговая цепочка не нужна — задача чисто читающая\nдалее обычный ответ...', + override: null, + }).block).toBe(false); + }); + + // Test 7: chain of 2, chain-override present BUT empty reason → block + it('chain of 2, chain-override with empty reason → block', () => { + const r = decide({ + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(), + assistantText: 'chain-override:\n', + override: null, + }); + expect(r.block).toBe(true); + }); + + // Test 8: chain of 2, global override → pass + it('chain of 2, global override → pass', () => { + expect(decide({ + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(), + assistantText: '', + override: { phrase: 'срочно', suppresses: ['chain-recommendation'] }, + }).block).toBe(false); + }); + + // Test 9: chain of 2, but no mutating tool (only Read/Grep) → pass + it('chain of 2, no mutating tools used → pass', () => { + expect(decide({ + toolUses: 
[READ_TOOL, GREP_TOOL], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(), + assistantText: '', + override: null, + }).block).toBe(false); + }); + + // Test 10: chain of 5 (long), one mid-chain skill called → pass + it('chain of 5, one mid-chain skill called → pass', () => { + expect(decide({ + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34', '#18', '#10', '#3'], + calledSkillIds: new Set(['#18']), + assistantText: '', + override: null, + }).block).toBe(false); + }); + + // Test 11: block message contains arrow-rendered chain + it('block message format includes arrow-rendered chain', () => { + const r = decide({ + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34', '#18'], + calledSkillIds: new Set(), + assistantText: '', + override: null, + }); + expect(r.block).toBe(true); + expect(r.message).toMatch(/#19 → #34 → #18/); + }); + + // Additional edge cases + + it('chain-override with whitespace-only reason → block', () => { + const r = decide({ + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(), + assistantText: 'chain-override: \n', + override: null, + }); + expect(r.block).toBe(true); + }); + + it('chain-override mid-text (not at line start) → block (must be line-start)', () => { + // Regex requires ^ in multiline mode, so inline text should not match + const r = decide({ + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(), + assistantText: 'some text chain-override: inline reason here', + override: null, + }); + expect(r.block).toBe(true); + }); + + it('chain-override at true line start → pass', () => { + const r = decide({ + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(), + assistantText: 'reasoning here\nchain-override: direct edit acceptable for single-file fix\nmore text', + override: null, + }); + expect(r.block).toBe(false); + }); + + it('empty toolUses → pass (no mutating tools)', () => { + expect(decide({ + 
toolUses: [], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(), + assistantText: '', + override: null, + }).block).toBe(false); + }); + + it('calledSkillIds contains by-name resolution (slug match) → pass', () => { + // If main() resolves #19 to its slug and adds it to calledSkillIds, + // decide() should accept it via the set-intersection. + expect(decide({ + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(['superpowers:writing-plans', '#19']), + assistantText: '', + override: null, + }).block).toBe(false); + }); + + it('block message mentions chain-override instruction text', () => { + const r = decide({ + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(), + assistantText: '', + override: null, + }); + expect(r.block).toBe(true); + expect(r.message).toContain('[enforce-chain-recommendation]'); + expect(r.message).toContain('chain-override:'); + }); + + it('decide() has no side-effects: calling twice returns same result', () => { + const args = { + toolUses: [EDIT_TOOL], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(), + assistantText: '', + override: null, + }; + const r1 = decide({ ...args, calledSkillIds: new Set() }); + const r2 = decide({ ...args, calledSkillIds: new Set() }); + expect(r1.block).toBe(r2.block); + }); + + it('Bash tool counts as mutating', () => { + const r = decide({ + toolUses: [{ name: 'Bash', input: { command: 'echo hi' } }], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(), + assistantText: '', + override: null, + }); + expect(r.block).toBe(true); + }); + + it('Task tool counts as mutating', () => { + const r = decide({ + toolUses: [{ name: 'Task', input: { subagent_type: 'general-purpose' } }], + recommendedChain: ['#19', '#34'], + calledSkillIds: new Set(), + assistantText: '', + override: null, + }); + expect(r.block).toBe(true); + }); +}); diff --git a/tools/enforce-classifier-match.mjs 
b/tools/enforce-classifier-match.mjs index 2cad6ef8..0d8993f9 100644 --- a/tools/enforce-classifier-match.mjs +++ b/tools/enforce-classifier-match.mjs @@ -26,7 +26,11 @@ import { } from './enforce-hook-helpers.mjs'; const RULE_KEY = 'classifier-mismatch'; -const CONFIDENCE_THRESHOLD = 0.7; +// Raised 2026-05-27 (retro #8 follow-up): 0.7 produced false-positives on +// borderline LLM classifications (e.g. recommending #3 GitHub MCP for local +// adr-judge debug, #36 adr-kit for status readouts). 0.8 only blocks when +// the classifier is genuinely confident. +const CONFIDENCE_THRESHOLD = 0.8; const MUTATING_TOOLS = new Set(['Edit', 'Write', 'MultiEdit', 'NotebookEdit', 'Bash', 'Task', 'Agent']); @@ -102,7 +106,7 @@ async function main() { const confidence = cls && typeof cls.confidence === 'number' ? cls.confidence : null; // Hole 4 fix: fall back to triggers_matched[0] when classifier silent. // Confidence stays null in fallback path — decide() accepts null (only - // numeric confidence < 0.7 blocks the rule). + // numeric confidence ≥ CONFIDENCE_THRESHOLD (0.8) blocks the rule). if (!recommendation) { const triggers = (cls && cls.triggers_matched) || []; if (Array.isArray(triggers) && triggers.length > 0 && typeof triggers[0] === 'string' && triggers[0].length > 0) { diff --git a/tools/enforce-classifier-match.test.mjs b/tools/enforce-classifier-match.test.mjs index d57b1ec6..31e01293 100644 --- a/tools/enforce-classifier-match.test.mjs +++ b/tools/enforce-classifier-match.test.mjs @@ -26,6 +26,26 @@ describe('enforce-classifier-match / decide', () => { }).block).toBe(false); }); + // Raised 2026-05-27 (retro #8 follow-up): borderline 0.7 confidence was the + // source of false-positive blocks (#3 GitHub MCP for local debug, #36 + // adr-kit for status readouts). Threshold raised 0.7 → 0.8 so 0.7 and 0.75 + // no longer block. 
+ it('allows when confidence exactly 0.7 (raised threshold)', () => { + expect(decide({ + toolUses: [{ name: 'Edit', input: {} }], + recommendation: 'superpowers:writing-plans', + confidence: 0.7, + }).block).toBe(false); + }); + + it('allows when confidence 0.75 (still under raised threshold)', () => { + expect(decide({ + toolUses: [{ name: 'Edit', input: {} }], + recommendation: 'superpowers:writing-plans', + confidence: 0.75, + }).block).toBe(false); + }); + it('blocks when recommendation high-confidence + no matching tool', () => { const r = decide({ toolUses: [{ name: 'Edit', input: { file_path: 'x.mjs' } }], diff --git a/tools/registry-load.test.mjs b/tools/registry-load.test.mjs index fd43d958..3ade5f3d 100644 --- a/tools/registry-load.test.mjs +++ b/tools/registry-load.test.mjs @@ -5,9 +5,9 @@ import { loadRegistry, clearCache, findByClassification, findByKeyword, findActi describe('registry-load', () => { beforeEach(() => clearCache()); - it('loads registry (85 nodes after #84/#85 project-agents added 24.05.2026)', () => { + it('loads registry (86 nodes after #86 graphifyy added 27.05.2026)', () => { const r = loadRegistry(); - expect(r.nodes).toHaveLength(85); + expect(r.nodes).toHaveLength(86); expect(r.version).toBe('0.1.0'); }); @@ -46,9 +46,9 @@ describe('registry-load', () => { it('findActiveNodes excludes non-active', () => { const r = loadRegistry(); const active = findActiveNodes(r); - // 85 nodes total; #1 historic, #17 dormant, #44/#50/#54/#67/#82/#83 deferred, - // #84/#85 (project-agents added 24.05.2026) are active → 75 + 2 = 77 active - expect(active).toHaveLength(77); + // 86 nodes total; #1 historic, #17 dormant, #44/#50/#54/#67/#82/#83 deferred, + // #84/#85/#86 (project-agents 24.05 + graphifyy 27.05) are active → 75 + 3 = 78 active + expect(active).toHaveLength(78); expect(active.map(n => n.id)).toContain('#18'); expect(active.map(n => n.id)).toContain('#19'); expect(active.map(n => n.id)).not.toContain('#1');