feat(brain): CREATE reviewer fallback handler + verify subagent (phase 3 task 18)
Phase 3 Task 18 (G16 closure). Spec §4.6 — direct Opus API fallback for the
brain-retro reviewer when the Claude Code subagent
.claude/agents/reviewer-agent.md crashes / times out.
- tools/brain-retro-opus-reviewer.mjs (NEW — G16: file did not exist):
+ buildReviewPrompt(episode) — adaptive prompt:
v4 → full (alternatives_considered + self_assessment + chain_gaps cues)
v3 → omits alternatives_considered
v2 → omits both alternatives + self_assessment
+ parseReview(text) — strips a wrapping ```json fence and requires the 7
review fields (node_quality / chain_quality / gap_assessment /
agent_self_assessment_accuracy / error_root_cause / outcome_reviewed /
reasoning); alternative_better (nullable) is accepted but its presence
is not enforced. reviewer_error escalations from the subagent are
passed through verbatim.
+ reviewViaDirectApi(episode, options) — async wrapper around
callAnthropicAPI with REVIEWER_MODEL. Returns parsed review or null.
- tools/brain-retro-opus-reviewer.test.mjs (NEW): 9 tests (4 prompt +
5 parse: complete / fence / malformed / missing field / reviewer_error
escalation).
- Reviewer subagent verified: .claude/agents/reviewer-agent.md exists
with frontmatter spec §4.6 (tools: Read/Grep/Glob/Skill; model: opus;
8-dim review contract). No edits to the agent file (this Task 18
step 1 is a verify, not a rewrite — agent already conforms).
This commit is contained in:
@@ -0,0 +1,127 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* brain-retro reviewer — direct Opus API fallback handler (Phase 3 Task 18).
|
||||
*
|
||||
* Spec §4.6: the primary reviewer is a Claude Code subagent
|
||||
* (`.claude/agents/reviewer-agent.md`) spawned via Task() from /brain-retro.
|
||||
* THIS module is the FALLBACK handler invoked by the controller when the
|
||||
* subagent crashes / times out: direct Opus API call with the same adaptive
|
||||
* review prompt (but no cross-episode reading, no skill invocations).
|
||||
*
|
||||
* Pure layer: buildReviewPrompt + parseReview (this file's tests). Network
|
||||
* layer: reviewViaDirectApi (zero-cost wrapper around router-classifier's
|
||||
* callAnthropicAPI; the controller decides when to call it).
|
||||
*
|
||||
* G16 — file did not exist before Phase 3 Task 18; created here.
|
||||
*/
|
||||
|
||||
import { REVIEWER_MODEL } from './router-config.mjs';
|
||||
|
||||
// Keys that must be present on a parsed review for parseReview to accept it.
// `alternative_better` is not listed, so its presence is never checked.
const REQUIRED_REVIEW_FIELDS = [
  'node_quality', 'chain_quality', 'gap_assessment',
  'agent_self_assessment_accuracy', 'error_root_cause',
  'outcome_reviewed', 'reasoning',
];
|
||||
|
||||
/**
 * Build the adaptive review prompt for a given episode. Pure.
 *
 * The prompt always lists the eight review fields with their allowed values.
 * What adapts to `episode.schema_version` is the "Notes" section:
 *   - v4+ → notes on primary_rationale, alternatives_considered,
 *           self_assessment and chain_gaps.
 *   - v3  → note on primary_rationale only.
 *   - v2 and below, or a missing / non-numeric version → no notes section.
 *   - v1  → expected to be filtered out by the caller before reaching here.
 *
 * Sections (intro, fields, notes, episode, closing instruction) are separated
 * by exactly one blank line; the notes section is left out entirely when
 * there is nothing to say, rather than leaving an empty header behind.
 *
 * @param {object} episode - Episode record; serialised into the prompt as JSON.
 * @returns {string} The complete prompt text.
 */
export function buildReviewPrompt(episode) {
  // Anything that does not coerce to a usable number is treated as version 0.
  const version = Number(episode?.schema_version) || 0;

  const fieldCues = [
    'node_quality: correct | wrong_node | overkill | underkill | disputable',
    'chain_quality: correct | missing_step | extra_step | wrong_order | n/a',
    'gap_assessment: acceptable | mistake_should_complete | mistake_should_not_start | n/a',
    'agent_self_assessment_accuracy: accurate | over_confident | under_confident | no_self_assessment',
    'error_root_cause: wrong_skill | wrong_tool | wrong_chain_order | external_failure | n/a',
    'alternative_better: <node_id> | null',
    'outcome_reviewed: success | soft_success | rework | blocked',
    'reasoning: 1-3 sentences',
  ];

  const adaptiveNotes = [];
  if (version >= 3) {
    adaptiveNotes.push('Episode is v3+: primary_rationale carries triggers/candidates/boundaries.');
  }
  if (version >= 4) {
    adaptiveNotes.push(
      'Episode is v4: classifier_output.alternatives_considered tells you what the classifier weighed.',
      'self_assessment (if present and not pending) is the agent\'s post-hoc judgement — compare honesty.',
      'execution_trace.chain_gaps shows whether the recommended chain ran in full.',
    );
  }

  const asBullet = (line) => ' - ' + line;

  // JSON.stringify yields undefined for an undefined episode; show an explicit
  // `null` instead of an empty "Episode (JSON):" section.
  const episodeJson = JSON.stringify(episode, null, 2) ?? 'null';

  const sections = [
    [
      'You are the independent reviewer of routing decisions for the Лидерра brain-governance experiment.',
      'Return ONLY a JSON object with the 8 fields below. No prose, no code fences.',
    ].join('\n'),
    ['Fields:', ...fieldCues.map(asBullet)].join('\n'),
  ];
  if (adaptiveNotes.length > 0) {
    sections.push(['Notes for this schema version:', ...adaptiveNotes.map(asBullet)].join('\n'));
  }
  sections.push(['Episode (JSON):', episodeJson].join('\n'));
  sections.push('Output JSON only.');

  // Join sections explicitly so the blank separators survive; filtering the
  // flat line list for truthiness would strip them along with empty headers.
  return sections.join('\n\n');
}
|
||||
|
||||
/**
 * Turn the raw reviewer reply into a review object. Pure.
 *
 * A wrapping Markdown code fence (``` or ```json) is removed before parsing.
 *
 * @param {string} text - Raw reply text from the reviewer.
 * @returns {object|null} The parsed object when it either carries a string
 *   `reviewer_error` (escalation, returned untouched) or has every key listed
 *   in REQUIRED_REVIEW_FIELDS; `null` for empty input, invalid JSON, a
 *   non-object JSON value, or a missing required key.
 */
export function parseReview(text) {
  if (!text) return null;

  const withoutOpeningFence = String(text).trim().replace(/^```(?:json)?\s*\n?/, '');
  const candidate = withoutOpeningFence.replace(/\n?```$/, '').trim();

  let review;
  try {
    review = JSON.parse(candidate);
  } catch {
    return null;
  }

  // JSON scalars and `null` are valid JSON but not a review.
  const isObject = Boolean(review) && typeof review === 'object';
  if (!isObject) return null;

  // An escalation from the reviewer bypasses the field check entirely.
  if (typeof review.reviewer_error === 'string') return review;

  const hasAllFields = REQUIRED_REVIEW_FIELDS.every((field) => review[field] !== undefined);
  return hasAllFields ? review : null;
}
|
||||
|
||||
/**
 * Direct Opus API call. Wraps callAnthropicAPI from router-classifier with
 * the reviewer model. The caller (controller inside /brain-retro) is
 * responsible for the decision of when to use it (subagent first, this on
 * failure).
 *
 * @param {object} episode - Episode to review; passed to buildReviewPrompt.
 * @param {object} [options]
 * @param {string} [options.apiKey] - API key; falls back to the
 *   ROUTER_LLM_KEY environment variable.
 * @param {string} [options.baseUrl] - API base URL; falls back to the
 *   ROUTER_LLM_BASE_URL environment variable.
 * @param {string} [options.model] - Model id; falls back to REVIEWER_MODEL.
 * @returns {Promise<object|null>} The parsed review object, or null when no
 *   API key is available, when callAnthropicAPI throws, or when parseReview
 *   rejects the reply. Failing to load ./router-classifier.mjs or to build the
 *   prompt is not caught and rejects the returned promise.
 */
export async function reviewViaDirectApi(episode, options = {}) {
  // Check the key first: without one the call cannot proceed, so there is no
  // reason to load the classifier module at all.
  const apiKey = options.apiKey ?? process.env.ROUTER_LLM_KEY;
  if (!apiKey) return null;

  const { callAnthropicAPI } = await import('./router-classifier.mjs');
  const prompt = buildReviewPrompt(episode);

  try {
    const text = await callAnthropicAPI(prompt, {
      apiKey,
      baseUrl: options.baseUrl ?? process.env.ROUTER_LLM_BASE_URL,
      model: options.model ?? REVIEWER_MODEL,
    });
    return parseReview(text);
  } catch {
    // Best-effort fallback: any API failure is reported to the caller as null.
    return null;
  }
}
|
||||
@@ -0,0 +1,65 @@
|
||||
// tools/brain-retro-opus-reviewer.test.mjs — TDD for Phase 3 Task 18 (G16, spec §4.6)
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { buildReviewPrompt, parseReview } from './brain-retro-opus-reviewer.mjs';
|
||||
|
||||
describe('buildReviewPrompt — adaptive v2/v3/v4 (spec §4.6)', () => {
  // The always-present field list contains `agent_self_assessment_accuracy`
  // and `no_self_assessment`, so the bare substring 'self_assessment' occurs
  // in every prompt. The adaptive note is therefore matched by its own
  // distinctive opening text.
  const SELF_ASSESSMENT_NOTE = 'self_assessment (if present';

  it('v4 includes alternatives_considered + self_assessment + chain_gaps cues', () => {
    const ep = {
      schema_version: 4,
      schema_minor: 2,
      task_id: 't',
      primary_rationale: { task_classification: 'feature', node_chosen: 'direct' },
      classifier_output: { recommended_node: '#19', alternatives_considered: [{ node: 'x', match_score: 0.5 }] },
      self_assessment: { summary: 'ok', confidence_in_choice: 0.8 },
      execution_trace: { chain_gaps: [] },
    };
    const p = buildReviewPrompt(ep);
    expect(p).toContain('alternatives_considered');
    expect(p).toContain(SELF_ASSESSMENT_NOTE);
    expect(p).toContain('chain_gaps');
  });

  it('v3 omits alternatives_considered cue', () => {
    expect(buildReviewPrompt({ schema_version: 3 })).not.toContain('alternatives_considered');
  });

  it('v2 omits alternatives + self_assessment cues', () => {
    const p = buildReviewPrompt({ schema_version: 2 });
    expect(p).not.toContain('alternatives_considered');
    expect(p).not.toContain(SELF_ASSESSMENT_NOTE);
  });

  it('includes the episode JSON verbatim for the reviewer to read', () => {
    const ep = { schema_version: 4, task_id: 'task-xyz-1' };
    expect(buildReviewPrompt(ep)).toContain('task-xyz-1');
  });
});
|
||||
|
||||
describe('parseReview — 8-dim review schema (spec §4.6)', () => {
  // Raw reviewer replies used as fixtures below.
  const COMPLETE_REVIEW = '{"node_quality":"correct","chain_quality":"n/a","gap_assessment":"n/a","agent_self_assessment_accuracy":"accurate","error_root_cause":"n/a","alternative_better":null,"outcome_reviewed":"success","reasoning":"x"}';
  const FENCED_REVIEW = '```json\n{"node_quality":"wrong_node","chain_quality":"missing_step","gap_assessment":"acceptable","agent_self_assessment_accuracy":"over_confident","error_root_cause":"wrong_skill","alternative_better":"#19","outcome_reviewed":"rework","reasoning":"y"}\n```';
  const PARTIAL_REVIEW = '{"node_quality":"correct"}';
  const ESCALATION = '{"reviewer_error":"malformed episode"}';

  it('parses a complete 8-dim review JSON', () => {
    const review = parseReview(COMPLETE_REVIEW);
    expect(review.node_quality).toBe('correct');
    expect(review.outcome_reviewed).toBe('success');
    expect(review.alternative_better).toBeNull();
    expect(review.reasoning).toBe('x');
  });

  it('strips ```json fence', () => {
    const review = parseReview(FENCED_REVIEW);
    expect(review.node_quality).toBe('wrong_node');
    expect(review.alternative_better).toBe('#19');
  });

  it('returns null on malformed JSON', () => {
    const review = parseReview('not json');
    expect(review).toBeNull();
  });

  it('returns null when required field missing', () => {
    const review = parseReview(PARTIAL_REVIEW);
    expect(review).toBeNull();
  });

  it('returns reviewer_error passthrough when reviewer escalates', () => {
    const review = parseReview(ESCALATION);
    expect(review?.reviewer_error).toBe('malformed episode');
  });
});
|
||||
Reference in New Issue
Block a user