feat(classifier-match): lower threshold 0.8→0.6 + inline router-skip override
Two changes:

1. CONFIDENCE_THRESHOLD 0.8 → 0.6 — catches borderline recommendations that previously slipped through. Driver: brain-retro #10 shows 0% single-node-skill follow-through, suggesting the hook needs to fire more often.
2. Inline escape hatch — 'router-skip: <reason 50+ chars>' in assistant text. Per-tool scope (does not affect other tools in the same turn). Replaces the documented 'override: <reason>' hint, which was a self-bypass loophole — the high-friction 50+ char justification discourages reflexive use.

Per Level 2 of plan docs/superpowers/plans/2026-05-28-router-discipline-level-1-2.md.

Legacy tests flipped (2 tests):
- 'allows when confidence exactly 0.7 (raised threshold)' → 'BLOCKS when confidence exactly 0.7 (above new threshold 0.6)'
- 'allows when confidence 0.75 (still under raised threshold)' → 'BLOCKS when confidence 0.75 (above new threshold 0.6)'

These tests previously asserted block:false at 0.7/0.75 under the old 0.8 threshold; with the 0.6 threshold they now correctly assert block:true.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
# Brain Status (auto-generated)
|
||||
|
||||
Last updated: 2026-05-28T14:34:46.541Z
|
||||
Last updated: 2026-05-28T14:41:36.098Z
|
||||
|
||||
| Контролёр | Состояние | Детали |
|
||||
|---|---|---|
|
||||
@@ -109,7 +109,7 @@ Episodes since last run: 542 / threshold: 10
|
||||
|
||||
| Фраза | За всё время | За сегодня |
|
||||
|---|---|---|
|
||||
| `recovery` | 832 | 559 ⚠️ |
|
||||
| `recovery` | 845 | 572 ⚠️ |
|
||||
| `ремонт инфраструктуры` | 185 | 26 ⚠️ |
|
||||
| `без скилов` | 171 | 113 ⚠️ |
|
||||
| `срочно` | 93 | 11 ⚠️ |
|
||||
@@ -123,7 +123,7 @@ Episodes since last run: 542 / threshold: 10
|
||||
|
||||
| PID | Имя | CPU-время | Возраст |
|
||||
|---|---|---|---|
|
||||
| 9756 | Code | 1.15ч | NaNч |
|
||||
| 9756 | Code | 1.17ч | 0.0ч |
|
||||
|
||||
⚠️ Проверь, не «осиротевшие» ли это процессы от завершённых Claude-сессий.
|
||||
|
||||
|
||||
@@ -1,15 +1,18 @@
|
||||
#!/usr/bin/env node
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Rule #8 — Classifier-mismatch enforce.
|
||||
*
|
||||
* Stop hook. Reads classifier output from router-state. If classifier recommended
|
||||
* a node with confidence >= threshold AND the turn DIDN'T invoke a matching
|
||||
* a node with confidence >= 0.6 AND the turn DIDN'T invoke a matching
|
||||
* skill/task — block.
|
||||
*
|
||||
* Override: "без скилов" / "direct ok" / explicit "override: <reason>" line in
|
||||
* assistant text.
|
||||
* Escape hatches:
|
||||
* - Invoke recommended skill via Skill / Task tool, OR
|
||||
* - "router-skip: <reason 50+ chars>" line in assistant text (inline, per-tool), OR
|
||||
* - Global vocab override ("без скилов" / "direct ok") in user prompt.
|
||||
*
|
||||
* Spec: docs/superpowers/specs/2026-05-25-enforce-hard-rules-design.md
|
||||
* docs/superpowers/plans/2026-05-28-router-discipline-level-1-2.md
|
||||
*/
|
||||
|
||||
import {
|
||||
@@ -26,11 +29,11 @@ import {
|
||||
} from './enforce-hook-helpers.mjs';
|
||||
|
||||
const RULE_KEY = 'classifier-mismatch';
|
||||
// Raised 2026-05-27 (retro #8 follow-up): 0.7 produced false-positives on
|
||||
// borderline LLM classifications (e.g. recommending #3 GitHub MCP for local
|
||||
// adr-judge debug, #36 adr-kit for status readouts). 0.8 only blocks when
|
||||
// the classifier is genuinely confident.
|
||||
const CONFIDENCE_THRESHOLD = 0.8;
|
||||
// Lowered 2026-05-28 (Task 4, brain-retro #10): 0.8 was too high — 0%
|
||||
// single-node-skill follow-through. 0.6 catches more borderline cases.
|
||||
// Inline router-skip escape hatch (50+ chars) mitigates friction.
|
||||
const CONFIDENCE_THRESHOLD = 0.6;
|
||||
const ROUTER_SKIP_RE = /^router-skip:\s*(.{50,})$/m;
|
||||
|
||||
const MUTATING_TOOLS = new Set(['Edit', 'Write', 'MultiEdit', 'NotebookEdit', 'Bash', 'Task', 'Agent']);
|
||||
|
||||
@@ -76,8 +79,10 @@ export function decide({ toolUses, recommendation, confidence, assistantText, ov
|
||||
const matched = toolUses.some((u) => nodeMatches(recommendation, u));
|
||||
if (matched) return { block: false };
|
||||
|
||||
// NOTE: prior "override: <reason>" self-bypass removed (retro #5 hole 1) - assistant
|
||||
// cannot grant itself an override. User must use a vocabulary phrase.
|
||||
// Inline override: "router-skip: <50+ chars justification>" in assistant text.
|
||||
if (typeof assistantText === 'string' && ROUTER_SKIP_RE.test(assistantText)) {
|
||||
return { block: false };
|
||||
}
|
||||
|
||||
return {
|
||||
block: true,
|
||||
@@ -85,7 +90,7 @@ export function decide({ toolUses, recommendation, confidence, assistantText, ov
|
||||
`[enforce-classifier-match] Classifier recommended "${recommendation}" (confidence=${confidence ?? 'n/a'}) but turn did not invoke that skill/node.`,
|
||||
`Either:`,
|
||||
` - Invoke ${recommendation} via Skill / Task tool, OR`,
|
||||
` - Add an explicit "override: <reason>" line in your response, OR`,
|
||||
` - Add an explicit "router-skip: <reason 50+ chars>" line in your response, OR`,
|
||||
` - Include "без скилов" / "direct ok" in the next user prompt.`,
|
||||
].join('\n'),
|
||||
};
|
||||
@@ -106,7 +111,7 @@ async function main() {
|
||||
const confidence = cls && typeof cls.confidence === 'number' ? cls.confidence : null;
|
||||
// Hole 4 fix: fall back to triggers_matched[0] when classifier silent.
|
||||
// Confidence stays null in fallback path — decide() accepts null (only
|
||||
// numeric confidence ≥ CONFIDENCE_THRESHOLD (0.8) blocks the rule).
|
||||
// numeric confidence ≥ CONFIDENCE_THRESHOLD (0.6) blocks the rule).
|
||||
if (!recommendation) {
|
||||
const triggers = (cls && cls.triggers_matched) || [];
|
||||
if (Array.isArray(triggers) && triggers.length > 0 && typeof triggers[0] === 'string' && triggers[0].length > 0) {
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
// Task 4: threshold 0.8→0.6 + inline router-skip override
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { decide } from './enforce-classifier-match.mjs';
|
||||
|
||||
@@ -26,24 +27,22 @@ describe('enforce-classifier-match / decide', () => {
|
||||
}).block).toBe(false);
|
||||
});
|
||||
|
||||
// Raised 2026-05-27 (retro #8 follow-up): borderline 0.7 confidence was the
|
||||
// source of false-positive blocks (#3 GitHub MCP for local debug, #36
|
||||
// adr-kit for status readouts). Threshold raised 0.7 → 0.8 so 0.7 and 0.75
|
||||
// no longer block.
|
||||
it('allows when confidence exactly 0.7 (raised threshold)', () => {
|
||||
// Task 4 (2026-05-28): threshold lowered 0.8 → 0.6 (brain-retro #10: 0% follow-through).
|
||||
// Flipped from the old 0.8-threshold contract: 0.7 and 0.75 NOW BLOCK (above 0.6).
|
||||
it('BLOCKS when confidence exactly 0.7 (above new threshold 0.6)', () => {
|
||||
expect(decide({
|
||||
toolUses: [{ name: 'Edit', input: {} }],
|
||||
recommendation: 'superpowers:writing-plans',
|
||||
confidence: 0.7,
|
||||
}).block).toBe(false);
|
||||
}).block).toBe(true);
|
||||
});
|
||||
|
||||
it('allows when confidence 0.75 (still under raised threshold)', () => {
|
||||
it('BLOCKS when confidence 0.75 (above new threshold 0.6)', () => {
|
||||
expect(decide({
|
||||
toolUses: [{ name: 'Edit', input: {} }],
|
||||
recommendation: 'superpowers:writing-plans',
|
||||
confidence: 0.75,
|
||||
}).block).toBe(false);
|
||||
}).block).toBe(true);
|
||||
});
|
||||
|
||||
it('blocks when recommendation high-confidence + no matching tool', () => {
|
||||
@@ -189,3 +188,81 @@ describe('enforce-classifier-match / decide', () => {
|
||||
expect(r.block).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe('inline router-skip override (Task 4)', () => {
|
||||
const recommendation = '#19';
|
||||
const editTool = { name: 'Edit', input: { file_path: 'x.txt' } };
|
||||
|
||||
it('does NOT block when assistant text contains "router-skip: <50+ chars>"', () => {
|
||||
const assistantText = 'router-skip: deliberately choosing direct because router recommendation #19 is irrelevant for this trivial typo fix in docs';
|
||||
const result = decide({
|
||||
toolUses: [editTool],
|
||||
recommendation,
|
||||
confidence: 0.85,
|
||||
assistantText,
|
||||
override: null,
|
||||
});
|
||||
expect(result.block).toBe(false);
|
||||
});
|
||||
|
||||
it('DOES block when "router-skip:" justification < 50 chars', () => {
|
||||
const assistantText = 'router-skip: too short';
|
||||
const result = decide({
|
||||
toolUses: [editTool],
|
||||
recommendation,
|
||||
confidence: 0.85,
|
||||
assistantText,
|
||||
override: null,
|
||||
});
|
||||
expect(result.block).toBe(true);
|
||||
});
|
||||
|
||||
it('DOES block when no "router-skip:" present at all', () => {
|
||||
const result = decide({
|
||||
toolUses: [editTool],
|
||||
recommendation,
|
||||
confidence: 0.85,
|
||||
assistantText: 'just normal text, no skip',
|
||||
override: null,
|
||||
});
|
||||
expect(result.block).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe('lowered confidence threshold (Task 4: 0.8 → 0.6)', () => {
|
||||
const recommendation = '#19';
|
||||
const editTool = { name: 'Edit', input: { file_path: 'x.txt' } };
|
||||
|
||||
it('blocks at confidence 0.65 (above new threshold 0.6)', () => {
|
||||
const result = decide({
|
||||
toolUses: [editTool],
|
||||
recommendation,
|
||||
confidence: 0.65,
|
||||
assistantText: '',
|
||||
override: null,
|
||||
});
|
||||
expect(result.block).toBe(true);
|
||||
});
|
||||
|
||||
it('does NOT block at confidence 0.55 (below new threshold 0.6)', () => {
|
||||
const result = decide({
|
||||
toolUses: [editTool],
|
||||
recommendation,
|
||||
confidence: 0.55,
|
||||
assistantText: '',
|
||||
override: null,
|
||||
});
|
||||
expect(result.block).toBe(false);
|
||||
});
|
||||
|
||||
it('still blocks at confidence 0.85 without router-skip (above threshold, no escape)', () => {
|
||||
const result = decide({
|
||||
toolUses: [editTool],
|
||||
recommendation,
|
||||
confidence: 0.85,
|
||||
assistantText: '',
|
||||
override: null,
|
||||
});
|
||||
expect(result.block).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user