fix(enforce): hole 1 — remove self-override via assistant text
Brain-retro #5, candidate C, hole 1: enforce-classifier-match.mjs allowed the agent to bypass the rule by writing 'override: <reason>' in its own response (a self-granted override amounts to no enforcement). The user-vocabulary override phrases in enforce-override-vocab.json remain the only legitimate path. Added a regression test asserting that decide() blocks when assistantText contains an override but the user prompt has no override phrase. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/env node
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Rule #8 — Classifier-mismatch enforce.
|
||||
*
|
||||
@@ -63,8 +63,8 @@ export function decide({ toolUses, recommendation, confidence, assistantText, ov
|
||||
const matched = toolUses.some((u) => nodeMatches(recommendation, u));
|
||||
if (matched) return { block: false };
|
||||
|
||||
// Allow explicit override: lines like "override: <reason>" in assistant text.
|
||||
if (assistantText && /\boverride:\s+\S/i.test(assistantText)) return { block: false };
|
||||
// NOTE: prior "override:" self-bypass removed (retro #5 hole 1) - assistant
|
||||
// cannot grant itself an override. User must use a vocabulary phrase.
|
||||
|
||||
return {
|
||||
block: true,
|
||||
|
||||
@@ -72,14 +72,26 @@ describe('enforce-classifier-match / decide', () => {
|
||||
expect(r.block).toBe(false);
|
||||
});
|
||||
|
||||
it('allows when explicit "override:" in assistant text', () => {
|
||||
it('blocks (not allows) when only "override:" in assistant text — self-override removed (hole 1)', () => {
|
||||
const r = decide({
|
||||
toolUses: [{ name: 'Edit', input: {} }],
|
||||
recommendation: 'foo:bar',
|
||||
confidence: 0.9,
|
||||
assistantText: 'override: simpler direct edit, foo:bar overkill here\n',
|
||||
override: null,
|
||||
});
|
||||
expect(r.block).toBe(false);
|
||||
expect(r.block).toBe(true);
|
||||
});
|
||||
|
||||
it('blocks when assistant text has "override: reason" but user prompt has no override phrase (hole 1)', () => {
|
||||
const r = decide({
|
||||
toolUses: [{ name: 'Edit', input: {} }],
|
||||
recommendation: 'superpowers:writing-plans',
|
||||
confidence: 0.9,
|
||||
assistantText: 'override: just doing it quick',
|
||||
override: null,
|
||||
});
|
||||
expect(r.block).toBe(true);
|
||||
});
|
||||
|
||||
it('allows when override phrase present', () => {
|
||||
|
||||
Reference in New Issue
Block a user