/**
 * tools/observer-self-assessment-api.mjs
 *
 * Phase 3 deferred follow-up #5: real LLM self-assessment API call.
 *
 * Exports:
 *   buildSelfAssessmentPrompt({ prompt, recommendedNode, actualNode, chainExecuted })
 *   callSelfAssessmentApi({ prompt, recommendedNode, actualNode, chainExecuted,
 *                           apiKey, baseUrl, model, fetchImpl, timeoutMs, abortSignal })
 *   readRuntimeFlag(name, { homedir, fsImpl })
 *
 * buildSelfAssessmentPrompt is pure. readRuntimeFlag reads one file and
 * callSelfAssessmentApi performs one HTTP request; both are fail-quiet:
 * readRuntimeFlag falls back to 'off' and callSelfAssessmentApi resolves to
 * null (null = skip self-assessment) instead of throwing.
 */

import { join } from 'path';
import { existsSync, readFileSync } from 'fs';
import { homedir as osHomedir } from 'os';

// ---------------------------------------------------------------------------
// Prompt builder (pure)
// ---------------------------------------------------------------------------

/**
 * Build the self-assessment prompt pair sent to the model.
 *
 * The system message (in Russian) instructs the model to judge the routing
 * decision and answer with a bare JSON object of exactly four fields:
 * summary, confidence_in_choice, what_could_be_better, lesson_learned.
 * The user message interpolates the four context values.
 *
 * Fallbacks: a nullish prompt renders as '(пусто)', a nullish recommendedNode
 * as 'не определён', a nullish actualNode as 'direct', and a missing or empty
 * chain as '[]'.
 *
 * @param {object} [opts]
 * @param {string|null|undefined} opts.prompt — the user's original prompt text
 * @param {string|null|undefined} opts.recommendedNode — node recommended by the router
 * @param {string|null|undefined} opts.actualNode — node actually chosen / 'direct'
 * @param {string[]|null|undefined} opts.chainExecuted — chain steps, joined with ' → '
 * @returns {{ system: string, user: string }}
 */
export function buildSelfAssessmentPrompt({ prompt, recommendedNode, actualNode, chainExecuted } = {}) {
  const safePrompt = prompt ?? '';
  const safeRecommended = recommendedNode ?? 'не определён';
  const safeActual = actualNode ?? 'direct';
  const safeChain = Array.isArray(chainExecuted) && chainExecuted.length > 0
    ? chainExecuted.join(' → ')
    : '[]';

  const system = [
    'Ты — внутренний наблюдатель роутинговой системы Claude Code.',
    'Твоя задача — честно оценить качество роутингового решения, принятого в этой сессии.',
    'Отвечай ТОЛЬКО валидным JSON-объектом без markdown-обёрток, ровно 4 поля:',
    ' "summary": строка — краткое описание принятого решения (до 120 символов)',
    ' "confidence_in_choice": число от 0.0 до 1.0 — насколько оптимальным был выбор',
    ' "what_could_be_better": строка или null — что можно было сделать иначе',
    ' "lesson_learned": строка или null — чему учит этот эпизод для будущих сессий',
    'Не добавляй лишних полей. Не используй markdown. Только JSON.',
  ].join('\n');

  const user = [
    'Контекст роутингового решения:',
    '',
    `Запрос пользователя: ${safePrompt || '(пусто)'}`,
    `Рекомендованный узел роутером: ${safeRecommended}`,
    `Фактически выбранный узел: ${safeActual}`,
    `Выполненная цепочка: ${safeChain}`,
    '',
    'Оцени это решение. Верни JSON с 4 полями.',
  ].join('\n');

  return { system, user };
}

// ---------------------------------------------------------------------------
// Runtime flag reader
// ---------------------------------------------------------------------------

/**
 * Read a runtime flag from ~/.claude/runtime/<name>.json.
 *
 * Returns the file's "value" field when it is a string. Returns 'off' when the
 * file is absent, unreadable, not valid JSON, or has no string "value".
 *
 * @param {string} name — flag file basename without the .json extension
 * @param {object} [opts]
 * @param {string} [opts.homedir] — override home dir (for tests)
 * @param {{ existsSync: Function, readFileSync: Function }} [opts.fsImpl] — override fs (for tests)
 * @returns {string}
 */
export function readRuntimeFlag(name, { homedir, fsImpl } = {}) {
  const fs = fsImpl ?? { existsSync, readFileSync };

  try {
    // Home-dir resolution sits inside the try so a failing os.homedir()
    // degrades to 'off' like every other error.
    const home = homedir ?? osHomedir();
    const filePath = join(home, '.claude', 'runtime', `${name}.json`);
    if (!fs.existsSync(filePath)) return 'off';
    const parsed = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
    return typeof parsed?.value === 'string' ? parsed.value : 'off';
  } catch {
    return 'off';
  }
}

// ---------------------------------------------------------------------------
// API caller (async, fail-quiet)
// ---------------------------------------------------------------------------

const DEFAULT_BASE_URL = 'https://api.proxyapi.ru/anthropic';
const DEFAULT_MODEL = 'claude-sonnet-4-6';
const DEFAULT_TIMEOUT_MS = 10000;
const MAX_TOKENS = 512;

/**
 * POST the self-assessment prompt to `<baseUrl>/v1/messages` and return the
 * `text` of the first content block of the response.
 *
 * Fail-quiet contract: a missing key, missing fetch implementation, network
 * error, non-ok response, unparsable body, timeout or abort all resolve to
 * null. The function never rejects.
 *
 * Cancellation: one internal AbortController is always handed to fetch. It is
 * aborted when the timeout elapses or when the caller's `abortSignal` fires,
 * and either event also settles the call with null, so a fetchImpl that
 * ignores the signal cannot make the call hang. The timer and the listener on
 * the caller's signal are always released before returning.
 *
 * @param {object} [opts]
 * @param {string|null|undefined} opts.prompt
 * @param {string|null|undefined} opts.recommendedNode
 * @param {string|null|undefined} opts.actualNode
 * @param {string[]|null|undefined} opts.chainExecuted
 * @param {string|null|undefined} opts.apiKey — API key; falsy → null without calling fetch
 * @param {string} [opts.baseUrl] — API base URL; trailing slashes are ignored
 * @param {string} [opts.model] — model alias
 * @param {Function} [opts.fetchImpl] — override fetch (for tests); defaults to globalThis.fetch
 * @param {number} [opts.timeoutMs] — timeout in ms; non-finite or negative values use the default
 * @param {AbortSignal} [opts.abortSignal] — external abort signal
 * @returns {Promise<string|null>}
 */
export async function callSelfAssessmentApi({
  prompt,
  recommendedNode,
  actualNode,
  chainExecuted,
  apiKey,
  baseUrl = DEFAULT_BASE_URL,
  model = DEFAULT_MODEL,
  fetchImpl,
  timeoutMs = DEFAULT_TIMEOUT_MS,
  abortSignal,
} = {}) {
  // Guard: no key → skip silently.
  if (!apiKey) return null;

  const fetchFn = fetchImpl ?? globalThis.fetch;
  if (typeof fetchFn !== 'function') return null;

  // Caller cancelled before we started: do not issue the request at all.
  if (abortSignal?.aborted) return null;

  const effectiveTimeoutMs = Number.isFinite(timeoutMs) && timeoutMs >= 0
    ? timeoutMs
    : DEFAULT_TIMEOUT_MS;

  const controller = new AbortController();
  const abortRequest = () => controller.abort();
  let timeoutId;

  try {
    const { system, user } = buildSelfAssessmentPrompt({
      prompt,
      recommendedNode,
      actualNode,
      chainExecuted,
    });

    const root = String(baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, '');
    const url = `${root}/v1/messages`;
    const body = JSON.stringify({
      model: model ?? DEFAULT_MODEL,
      max_tokens: MAX_TOKENS,
      system,
      messages: [{ role: 'user', content: user }],
    });

    // Resolves to null as soon as the internal controller aborts, whatever the
    // cause. Racing against it keeps the deadline effective even when
    // fetchImpl ignores the signal (e.g. test doubles).
    const aborted = new Promise((resolve) => {
      controller.signal.addEventListener('abort', () => resolve(null), { once: true });
    });

    abortSignal?.addEventListener?.('abort', abortRequest, { once: true });
    // Single timer: it both cancels the request and, through `aborted`,
    // settles the race.
    timeoutId = setTimeout(abortRequest, effectiveTimeoutMs);

    const request = (async () => {
      const response = await fetchFn(url, {
        method: 'POST',
        headers: {
          'content-type': 'application/json',
          'x-api-key': apiKey,
          'authorization': `Bearer ${apiKey}`,
          'anthropic-version': '2023-06-01',
        },
        body,
        signal: controller.signal,
      });
      if (!response?.ok) return null;
      const data = await response.json();
      const text = data?.content?.[0]?.text;
      return typeof text === 'string' ? text : null;
    })().catch(() => null);

    return await Promise.race([request, aborted]);
  } catch {
    // Unexpected synchronous failure (prompt building, serialisation, …).
    return null;
  } finally {
    // Release everything that could keep the process alive or leak listeners.
    clearTimeout(timeoutId);
    abortSignal?.removeEventListener?.('abort', abortRequest);
  }
}
// ---------------------------------------------------------------------------
// 2. buildSelfAssessmentPrompt — handles missing/null inputs gracefully
// ---------------------------------------------------------------------------
describe('buildSelfAssessmentPrompt — null/undefined inputs', () => {
  it('returns valid strings when all inputs are undefined/null', () => {
    const { system, user } = buildSelfAssessmentPrompt({});
    expect(typeof system).toBe('string');
    expect(typeof user).toBe('string');
    // Fallback placeholders must be rendered instead of raw JS values.
    expect(user).not.toContain('undefined');
    expect(user).not.toContain('null');
    expect(user).not.toContain('[object Object]');
  });

  it('handles null recommendedNode and empty chainExecuted', () => {
    const { user } = buildSelfAssessmentPrompt({
      prompt: 'test',
      recommendedNode: null,
      actualNode: 'direct',
      chainExecuted: [],
    });
    expect(user).toContain('test');
    // Pin the two fallbacks this case is named after.
    expect(user).toContain('Рекомендованный узел роутером: не определён');
    expect(user).toContain('Выполненная цепочка: []');
    expect(user).toContain('Фактически выбранный узел: direct');
  });
});

// ---------------------------------------------------------------------------
// 3. callSelfAssessmentApi — returns null when apiKey is missing/empty
// ---------------------------------------------------------------------------
describe('callSelfAssessmentApi — missing apiKey', () => {
  it('returns null immediately when apiKey is falsy (no fetch call)', async () => {
    let fetchCalled = false;
    const fakeFetch = async () => { fetchCalled = true; };

    const result = await callSelfAssessmentApi({
      prompt: 'x', recommendedNode: '#1', actualNode: '#1', chainExecuted: [],
      apiKey: '',
      fetchImpl: fakeFetch,
    });

    expect(result).toBeNull();
    expect(fetchCalled).toBe(false);
  });

  it('returns null when apiKey is undefined', async () => {
    // A spy is injected so a regression in the guard fails this test instead
    // of silently reaching the real network through globalThis.fetch.
    let fetchCalled = false;
    const fakeFetch = async () => { fetchCalled = true; };

    const result = await callSelfAssessmentApi({
      prompt: 'x', recommendedNode: '#1', actualNode: '#1', chainExecuted: [],
      apiKey: undefined,
      fetchImpl: fakeFetch,
    });

    expect(result).toBeNull();
    expect(fetchCalled).toBe(false);
  });
});
// ---------------------------------------------------------------------------
// 4. callSelfAssessmentApi — returns text on 200 + content[0].text
// ---------------------------------------------------------------------------
describe('callSelfAssessmentApi — successful 200 response', () => {
  it('returns content[0].text on ok response', async () => {
    // The model's reply is itself a JSON document; the caller must hand it
    // back verbatim as a string.
    const expectedText = '{"summary":"chose correctly","confidence_in_choice":0.9,"what_could_be_better":null,"lesson_learned":null}';

    async function okFetch() {
      return {
        ok: true,
        async json() {
          return { content: [{ type: 'text', text: expectedText }] };
        },
      };
    }

    const callOptions = {
      prompt: 'do something',
      recommendedNode: '#19',
      actualNode: '#19',
      chainExecuted: ['#19'],
      apiKey: 'test-key',
      baseUrl: 'https://api.example.com/anthropic',
      model: 'claude-sonnet-4-6',
      fetchImpl: okFetch,
      timeoutMs: 5000,
    };

    const actual = await callSelfAssessmentApi(callOptions);

    expect(actual).toBe(expectedText);
  });
});

// ---------------------------------------------------------------------------
// 5. callSelfAssessmentApi — returns null on non-2xx (r.ok=false)
// ---------------------------------------------------------------------------
describe('callSelfAssessmentApi — non-2xx response', () => {
  it('returns null when response.ok is false', async () => {
    // A rate-limited reply: the body is well-formed but ok is false.
    async function rateLimitedFetch() {
      return {
        ok: false,
        status: 429,
        async json() {
          return { error: { message: 'rate limited' } };
        },
      };
    }

    const actual = await callSelfAssessmentApi({
      prompt: 'x',
      recommendedNode: '#1',
      actualNode: '#1',
      chainExecuted: [],
      apiKey: 'test-key',
      fetchImpl: rateLimitedFetch,
      timeoutMs: 5000,
    });

    expect(actual).toBeNull();
  });
});
// ---------------------------------------------------------------------------
// 6. callSelfAssessmentApi — returns null on fetch throw
// ---------------------------------------------------------------------------
describe('callSelfAssessmentApi — fetch throws', () => {
  it('returns null (fail-quiet) when fetch throws a network error', async () => {
    // Rejecting fetch stands in for DNS failures, resets, and similar errors.
    async function brokenFetch() {
      throw new Error('network error');
    }

    const actual = await callSelfAssessmentApi({
      prompt: 'x',
      recommendedNode: '#1',
      actualNode: '#1',
      chainExecuted: [],
      apiKey: 'test-key',
      fetchImpl: brokenFetch,
      timeoutMs: 5000,
    });

    expect(actual).toBeNull();
  });
});

// ---------------------------------------------------------------------------
// 7. callSelfAssessmentApi — returns null on timeout
// ---------------------------------------------------------------------------
describe('callSelfAssessmentApi — timeout', () => {
  it('returns null when fetch never resolves within timeoutMs', async () => {
    // The stub ignores its arguments and yields a promise that stays pending
    // forever, so only the internal timeout can end the call.
    async function hangingFetch(_url, _opts) {
      return new Promise(() => { /* never */ });
    }

    const startedAt = Date.now();
    const actual = await callSelfAssessmentApi({
      prompt: 'x',
      recommendedNode: '#1',
      actualNode: '#1',
      chainExecuted: [],
      apiKey: 'test-key',
      fetchImpl: hangingFetch,
      timeoutMs: 30, // 30 ms timeout — very fast for test
    });
    const elapsedMs = Date.now() - startedAt;

    expect(actual).toBeNull();
    // Must finish near the timeout rather than hanging indefinitely.
    expect(elapsedMs).toBeLessThan(500);
  });
});
// ---------------------------------------------------------------------------
// 8. callSelfAssessmentApi — sends correct headers and body
// ---------------------------------------------------------------------------
describe('callSelfAssessmentApi — request format', () => {
  it('sends correct headers and body shape (spy fetchImpl)', async () => {
    let capturedUrl, capturedOpts;
    const fakeFetch = async (url, opts) => {
      capturedUrl = url;
      capturedOpts = opts;
      return {
        ok: true,
        json: async () => ({ content: [{ type: 'text', text: 'ok' }] }),
      };
    };

    await callSelfAssessmentApi({
      prompt: 'test prompt',
      recommendedNode: '#62',
      actualNode: '#62',
      chainExecuted: ['#62'],
      apiKey: 'my-secret-key',
      baseUrl: 'https://api.proxyapi.ru/anthropic',
      model: 'claude-sonnet-4-6',
      fetchImpl: fakeFetch,
      timeoutMs: 5000,
    });

    // Endpoint and verb.
    expect(capturedUrl).toBe('https://api.proxyapi.ru/anthropic/v1/messages');
    expect(capturedOpts.method).toBe('POST');

    // Each header is checked by value: an `a || b` truthiness check would
    // pass with one of the two auth headers missing or wrong.
    const headers = capturedOpts.headers;
    expect(headers['x-api-key']).toBe('my-secret-key');
    expect(headers['authorization']).toBe('Bearer my-secret-key');
    expect(headers['anthropic-version']).toBe('2023-06-01');
    expect(headers['content-type']).toBe('application/json');

    // The request must be cancellable.
    expect(capturedOpts.signal).toBeInstanceOf(AbortSignal);

    // Body shape and payload.
    const body = JSON.parse(capturedOpts.body);
    expect(body.model).toBe('claude-sonnet-4-6');
    expect(body.max_tokens).toBeGreaterThan(0);
    expect(typeof body.system).toBe('string');
    expect(body.system.length).toBeGreaterThan(0);
    expect(Array.isArray(body.messages)).toBe(true);
    expect(body.messages).toHaveLength(1);
    expect(body.messages[0].role).toBe('user');
    expect(body.messages[0].content).toContain('test prompt');
    expect(body.messages[0].content).toContain('#62');
  });
});
readRuntimeFlag — reads value from file; returns 'off' on missing/malformed +// --------------------------------------------------------------------------- +describe('readRuntimeFlag', () => { + it('returns the value from {"value":"on"} when file exists', () => { + const fakeHomedir = '/fake/home'; + const fakeFsImpl = { + existsSync: (p) => p.endsWith('self-assessment-mode.json'), + readFileSync: (_p, _enc) => '{"value":"on"}', + }; + + const result = readRuntimeFlag('self-assessment-mode', { homedir: fakeHomedir, fsImpl: fakeFsImpl }); + expect(result).toBe('on'); + }); + + it('returns "off" when file does not exist', () => { + const fakeFsImpl = { + existsSync: () => false, + readFileSync: () => { throw new Error('no file'); }, + }; + + const result = readRuntimeFlag('self-assessment-mode', { homedir: '/fake', fsImpl: fakeFsImpl }); + expect(result).toBe('off'); + }); + + it('returns "off" on malformed JSON', () => { + const fakeFsImpl = { + existsSync: () => true, + readFileSync: () => 'NOT JSON', + }; + + const result = readRuntimeFlag('self-assessment-mode', { homedir: '/fake', fsImpl: fakeFsImpl }); + expect(result).toBe('off'); + }); + + it('returns "off" when value field is missing', () => { + const fakeFsImpl = { + existsSync: () => true, + readFileSync: () => '{"mode":"on"}', // no "value" key + }; + + const result = readRuntimeFlag('self-assessment-mode', { homedir: '/fake', fsImpl: fakeFsImpl }); + expect(result).toBe('off'); + }); +}); diff --git a/tools/observer-stop-hook.mjs b/tools/observer-stop-hook.mjs index 295fc8b5..d3389e69 100644 --- a/tools/observer-stop-hook.mjs +++ b/tools/observer-stop-hook.mjs @@ -19,6 +19,7 @@ import { join } from 'path'; import { sanitize, sanitizeWithCount } from './observer-pii-filter.mjs'; import { parseTranscript, extractLastUserPromptText } from './observer-transcript-parser.mjs'; import { detectMethodDirected, loadKnownNodes } from './observer-routing-detector.mjs'; +import { callSelfAssessmentApi, 
readRuntimeFlag } from './observer-self-assessment-api.mjs'; const REQUIRED_FIELDS = ['task_id', 'timestamps', 'path_type', 'outcome', 'primary_rationale']; const V2_FIELDS = [ @@ -294,7 +295,7 @@ function currentMonth() { if (process.argv[1] && process.argv[1].replace(/\\/g, '/').endsWith('/observer-stop-hook.mjs')) { const chunks = []; process.stdin.on('data', (c) => chunks.push(c)); - process.stdin.on('end', () => { + process.stdin.on('end', async () => { let ctx = {}; try { const raw = Buffer.concat(chunks).toString('utf-8'); @@ -315,6 +316,23 @@ if (process.argv[1] && process.argv[1].replace(/\\/g, '/').endsWith('/observer-s } try { const ep = buildEpisodeFromContext(ctx, transcriptText); + + // Step 3.5: self-assessment API call (fail-quiet). + // Only runs when the runtime flag is 'on' and ROUTER_LLM_KEY is set. + const saMode = readRuntimeFlag('self-assessment-mode'); + const saApiKey = process.env.ROUTER_LLM_KEY || null; + if (saMode === 'on' && saApiKey) { + const rat = ep.primary_rationale ?? {}; + const apiResult = await callSelfAssessmentApi({ + prompt: ctx.prompt || null, + recommendedNode: rat.recommended_node || null, + actualNode: rat.node_chosen || null, + chainExecuted: rat.chain_executed || [], + apiKey: saApiKey, + }); + ep.self_assessment = buildSelfAssessment({ apiResult }); + } + // Always write the episode first — exit-0-safe (spec §5.1 step 1). appendEpisode(ep); // Then the routing-gate (spec §5.1 steps 2-4).