diff --git a/tools/brain-retro-opus-reviewer.mjs b/tools/brain-retro-opus-reviewer.mjs new file mode 100644 index 00000000..5ad8a778 --- /dev/null +++ b/tools/brain-retro-opus-reviewer.mjs @@ -0,0 +1,127 @@ +#!/usr/bin/env node +/** + * brain-retro reviewer — direct Opus API fallback handler (Phase 3 Task 18). + * + * Spec §4.6: the primary reviewer is a Claude Code subagent + * (`.claude/agents/reviewer-agent.md`) spawned via Task() from /brain-retro. + * THIS module is the FALLBACK handler invoked by the controller when the + * subagent crashes / times out: direct Opus API call with the same adaptive + * review prompt (but no cross-episode reading, no skill invocations). + * + * Pure layer: buildReviewPrompt + parseReview (this file's tests). Network + * layer: reviewViaDirectApi (zero-cost wrapper around router-classifier's + * callAnthropicAPI; the controller decides when to call it). + * + * G16 — file did not exist before Phase 3 Task 18; created here. + */ + +import { REVIEWER_MODEL } from './router-config.mjs'; + +const REQUIRED_REVIEW_FIELDS = [ + 'node_quality', + 'chain_quality', + 'gap_assessment', + 'agent_self_assessment_accuracy', + 'error_root_cause', + 'outcome_reviewed', + 'reasoning', +]; + +/** + * Build the adaptive review prompt for a given episode. Pure. + * + * Adaptive prompt template (spec §4.6): + * - v4 → full prompt including alternatives_considered, self_assessment, + * chain_gaps cues. + * - v3 → omits alternatives_considered. + * - v2 → omits both alternatives_considered and self_assessment. + * - v1 → skipped upstream (caller filters them out). 
/**
 * Build the adaptive review prompt for a given episode. Pure: performs no I/O
 * and does not mutate `episode`.
 *
 * The eight field cues are always emitted. Schema-version notes are additive:
 *   - schema_version >= 3 adds a note about `primary_rationale`.
 *   - schema_version >= 4 additionally adds notes about
 *     `classifier_output.alternatives_considered`, `self_assessment` and
 *     `execution_trace.chain_gaps`.
 *   - Lower, missing, or non-numeric versions get no notes section at all.
 *
 * Sections (intro, fields, optional notes, episode JSON, closing instruction)
 * are separated by exactly one blank line.
 *
 * @param {object} episode - Episode record; serialised into the prompt with
 *   `JSON.stringify(episode, null, 2)`.
 * @returns {string} The prompt text.
 * @throws {TypeError} If `episode` cannot be serialised by `JSON.stringify`
 *   (circular references, BigInt values).
 */
export function buildReviewPrompt(episode) {
  // Missing / non-numeric versions collapse to 0, i.e. "no adaptive notes".
  const version = Number(episode?.schema_version) || 0;

  const cues = [
    'node_quality: correct | wrong_node | overkill | underkill | disputable',
    'chain_quality: correct | missing_step | extra_step | wrong_order | n/a',
    'gap_assessment: acceptable | mistake_should_complete | mistake_should_not_start | n/a',
    'agent_self_assessment_accuracy: accurate | over_confident | under_confident | no_self_assessment',
    'error_root_cause: wrong_skill | wrong_tool | wrong_chain_order | external_failure | n/a',
    // The value placeholder was missing here, leaving the reviewer with no
    // hint about what a non-null answer should look like.
    'alternative_better: <node id> | null',
    'outcome_reviewed: success | soft_success | rework | blocked',
    'reasoning: 1-3 sentences',
  ];

  const adaptiveNotes = [];
  if (version >= 3) {
    adaptiveNotes.push('Episode is v3+: primary_rationale carries triggers/candidates/boundaries.');
  }
  if (version >= 4) {
    adaptiveNotes.push(
      'Episode is v4: classifier_output.alternatives_considered tells you what the classifier weighed.',
      'self_assessment (if present and not pending) is the agent\'s post-hoc judgement — compare honesty.',
      'execution_trace.chain_gaps shows whether the recommended chain ran in full.',
    );
  }

  // JSON.stringify yields undefined for undefined / function input; emit an
  // explicit `null` instead of silently dropping the episode body.
  const episodeJson = JSON.stringify(episode, null, 2) ?? 'null';

  // Build the prompt section by section so the optional notes block can be
  // left out without also filtering away the blank separator lines.
  const sections = [
    [
      'You are the independent reviewer of routing decisions for the Лидерра brain-governance experiment.',
      'Return ONLY a JSON object with the 8 fields below. No prose, no code fences.',
    ],
    ['Fields:', ...cues.map((cue) => ` - ${cue}`)],
  ];
  if (adaptiveNotes.length > 0) {
    sections.push(['Notes for this schema version:', ...adaptiveNotes.map((note) => ` - ${note}`)]);
  }
  sections.push(['Episode (JSON):', episodeJson]);
  sections.push(['Output JSON only.']);

  return sections.map((lines) => lines.join('\n')).join('\n\n');
}
/**
 * Attempt `JSON.parse` without throwing.
 *
 * @param {string} candidate - Text to parse.
 * @returns {*} The parsed value, or `undefined` when `candidate` is not valid
 *   JSON (`JSON.parse` itself never yields `undefined`, so it is a safe
 *   failure marker).
 */
function tryParseJson(candidate) {
  try {
    return JSON.parse(candidate);
  } catch {
    return undefined;
  }
}

/**
 * Parse the reviewer response. Pure.
 *
 * Accepts bare JSON or JSON wrapped in a leading/trailing ``` / ```json fence.
 * If that fails to parse, falls back to the outermost `{ ... }` span so a
 * reply with stray prose around the object is still usable.
 *
 * @param {*} text - Raw response text; coerced with `String()`.
 * @returns {object|null} The parsed object when it either carries a string
 *   `reviewer_error` (escalation, returned verbatim) or defines every entry of
 *   `REQUIRED_REVIEW_FIELDS`; `null` for empty input, unparseable input,
 *   non-object JSON, or a missing required field.
 */
export function parseReview(text) {
  if (!text) return null;

  const raw = String(text).trim();
  const unfenced = raw
    .replace(/^```(?:json)?\s*\n?/, '')
    .replace(/\n?```$/, '')
    .trim();

  let parsed = tryParseJson(unfenced);
  if (parsed === undefined) {
    // Models sometimes add a preamble or trailing remark despite instructions;
    // salvage the outermost object rather than discarding the whole review.
    const start = raw.indexOf('{');
    const end = raw.lastIndexOf('}');
    if (start !== -1 && end > start) {
      parsed = tryParseJson(raw.slice(start, end + 1));
    }
  }
  if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) return null;

  // Reviewer-agent escalation: pass through verbatim.
  if (typeof parsed.reviewer_error === 'string') return parsed;

  const hasMissingField = REQUIRED_REVIEW_FIELDS.some((field) => parsed[field] === undefined);
  return hasMissingField ? null : parsed;
}

/**
 * Direct reviewer API call, used as the fallback path. Wraps `callAnthropicAPI`
 * from `./router-classifier.mjs` with the reviewer model. The caller decides
 * when to use it (subagent first, this on failure).
 *
 * @param {object} episode - Episode handed to `buildReviewPrompt`.
 * @param {object} [options]
 * @param {string} [options.apiKey] - Defaults to `process.env.ROUTER_LLM_KEY`.
 * @param {string} [options.baseUrl] - Defaults to `process.env.ROUTER_LLM_BASE_URL`.
 * @param {string} [options.model] - Defaults to `REVIEWER_MODEL`.
 * @returns {Promise<object|null>} The parsed review, or `null` when no API key
 *   is available, the API call throws, or the response fails `parseReview`.
 */
export async function reviewViaDirectApi(episode, options = {}) {
  // Check the key first so a missing key costs nothing: no module load and no
  // prompt construction.
  const apiKey = options.apiKey ?? process.env.ROUTER_LLM_KEY;
  if (!apiKey) return null;

  // Loaded lazily so the pure helpers in this module stay usable without the
  // network layer.
  const { callAnthropicAPI } = await import('./router-classifier.mjs');
  const prompt = buildReviewPrompt(episode);

  try {
    // NOTE(review): assumes callAnthropicAPI resolves to the response text —
    // confirm against router-classifier.mjs.
    const text = await callAnthropicAPI(prompt, {
      apiKey,
      baseUrl: options.baseUrl ?? process.env.ROUTER_LLM_BASE_URL,
      model: options.model ?? REVIEWER_MODEL,
    });
    return parseReview(text);
  } catch {
    // Deliberate best-effort: this is already the fallback path, so a transport
    // failure is reported as "no review" rather than propagated.
    return null;
  }
}
parseReview('{"node_quality":"correct","chain_quality":"n/a","gap_assessment":"n/a","agent_self_assessment_accuracy":"accurate","error_root_cause":"n/a","alternative_better":null,"outcome_reviewed":"success","reasoning":"x"}'); + expect(r.node_quality).toBe('correct'); + expect(r.outcome_reviewed).toBe('success'); + expect(r.alternative_better).toBeNull(); + expect(r.reasoning).toBe('x'); + }); + + it('strips ```json fence', () => { + const r = parseReview('```json\n{"node_quality":"wrong_node","chain_quality":"missing_step","gap_assessment":"acceptable","agent_self_assessment_accuracy":"over_confident","error_root_cause":"wrong_skill","alternative_better":"#19","outcome_reviewed":"rework","reasoning":"y"}\n```'); + expect(r.node_quality).toBe('wrong_node'); + expect(r.alternative_better).toBe('#19'); + }); + + it('returns null on malformed JSON', () => { + expect(parseReview('not json')).toBeNull(); + }); + + it('returns null when required field missing', () => { + expect(parseReview('{"node_quality":"correct"}')).toBeNull(); + }); + + it('returns reviewer_error passthrough when reviewer escalates', () => { + const r = parseReview('{"reviewer_error":"malformed episode"}'); + expect(r?.reviewer_error).toBe('malformed episode'); + }); +});