fix(enforce): hole 1 — remove self-override via assistant text

Brain-retro #5 candidate C, hole 1: enforce-classifier-match.mjs allowed
the agent to bypass the rule by writing 'override: <reason>' in its own
response (self-override = no enforcement). The user-vocabulary override
phrases in enforce-override-vocab.json remain the only legitimate path.

Added a regression test asserting that decide() blocks when "override:" appears
only in assistantText and the user prompt contains no override phrase.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Дмитрий
2026-05-26 11:07:03 +03:00
parent 51966328c5
commit ce02d1adad
2 changed files with 17 additions and 5 deletions
+3 -3
View File
@@ -1,4 +1,4 @@
#!/usr/bin/env node
#!/usr/bin/env node
/**
* Rule #8 — Classifier-mismatch enforce.
*
@@ -63,8 +63,8 @@ export function decide({ toolUses, recommendation, confidence, assistantText, ov
const matched = toolUses.some((u) => nodeMatches(recommendation, u));
if (matched) return { block: false };
// Allow explicit override: lines like "override: <reason>" in assistant text.
if (assistantText && /\boverride:\s+\S/i.test(assistantText)) return { block: false };
// NOTE: the prior "override:" self-bypass was removed (retro #5 hole 1) - the
// assistant cannot grant itself an override. User must use a vocabulary phrase.
return {
block: true,
+14 -2
View File
@@ -72,14 +72,26 @@ describe('enforce-classifier-match / decide', () => {
expect(r.block).toBe(false);
});
it('allows when explicit "override:" in assistant text', () => {
it('blocks (not allows) when only "override:" in assistant text — self-override removed (hole 1)', () => {
const r = decide({
toolUses: [{ name: 'Edit', input: {} }],
recommendation: 'foo:bar',
confidence: 0.9,
assistantText: 'override: simpler direct edit, foo:bar overkill here\n',
override: null,
});
expect(r.block).toBe(false);
expect(r.block).toBe(true);
});
// Regression test (hole 1): an "override: <reason>" line that appears only in
// assistantText, with `override: null`, is expected to leave the decision blocked.
it('blocks when assistant text has "override: reason" but user prompt has no override phrase (hole 1)', () => {
const r = decide({
toolUses: [{ name: 'Edit', input: {} }],
recommendation: 'superpowers:writing-plans',
confidence: 0.9,
assistantText: 'override: just doing it quick',
override: null,
});
expect(r.block).toBe(true);
});
it('allows when override phrase present', () => {