feat(observer): wire real LLM self-assessment API call — phase 3 deferred #5

- NEW tools/observer-self-assessment-api.mjs
  buildSelfAssessmentPrompt({ prompt, recommendedNode, actualNode, chainExecuted })
  pure, handles nulls/undefined, returns { system, user } strings
  callSelfAssessmentApi(opts) async, fail-quiet — returns string|null
  AbortController + timeout race (works even when fetchImpl ignores signal)
  guards: !apiKey -> return null immediately (no fetch call)
  guards: !response.ok, fetch throw, JSON parse error -> return null
  passes x-api-key + authorization headers per ProxyAPI two-header pattern
  readRuntimeFlag(name, { homedir, fsImpl }) reads ~/.claude/runtime/<name>.json
  returns value field string or 'off' on missing/malformed

- NEW tools/observer-self-assessment-api.test.mjs: 14 tests, 0 failed
  1. buildSelfAssessmentPrompt all 4 fields interpolated
  2. buildSelfAssessmentPrompt null/undefined inputs (2 tests)
  3. callSelfAssessmentApi returns null when apiKey falsy (2 tests)
  4. returns content[0].text on 200 ok (fake fetchImpl)
  5. returns null on non-2xx (response.ok=false)
  6. returns null on fetch throw
  7. returns null on timeout (never-resolving fake fetchImpl, timeoutMs=30ms)
  8. sends correct headers+body shape (spy fetchImpl)
  9. readRuntimeFlag reads {"value":"on"}, returns 'off' on missing/malformed (4 tests)

- EDIT tools/observer-stop-hook.mjs
  import { callSelfAssessmentApi, readRuntimeFlag } added
  stdin 'end' handler made async
  step 3.5 inserted between buildEpisodeFromContext and appendEpisode:
  reads self-assessment-mode runtime flag; if 'on' and ROUTER_LLM_KEY set,
  calls callSelfAssessmentApi and attaches ep.self_assessment via buildSelfAssessment()
  fail-quiet: on any error apiResult=null -> self_assessment_pending: true

Regression: 628/628 tests passed (35 test files), 0 failed
gitleaks: 0 leaks on all 3 files

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Дмитрий
2026-05-25 13:20:29 +03:00
parent a4e30622cf
commit c1ec61fa49
3 changed files with 486 additions and 1 deletion
+207
View File
@@ -0,0 +1,207 @@
/**
* tools/observer-self-assessment-api.mjs
*
* Phase 3 deferred follow-up #5: real LLM self-assessment API call.
*
* Exports:
* buildSelfAssessmentPrompt({ prompt, recommendedNode, actualNode, chainExecuted })
* callSelfAssessmentApi({ prompt, recommendedNode, actualNode, chainExecuted,
* apiKey, baseUrl, model, fetchImpl, timeoutMs, abortSignal })
* readRuntimeFlag(name, { homedir, fsImpl })
*
* buildSelfAssessmentPrompt is pure; readRuntimeFlag (file read) and
* callSelfAssessmentApi (network call) perform I/O but are fail-quiet.
* callSelfAssessmentApi always returns string | null (null = skip self-assessment).
*/
import { join } from 'path';
import { existsSync, readFileSync } from 'fs';
import { homedir as osHomedir } from 'os';
// ---------------------------------------------------------------------------
// Prompt builder (pure)
// ---------------------------------------------------------------------------
/**
 * Compose the two-part (system + user) prompt used for a routing
 * self-assessment request.
 *
 * The system part is a fixed Russian instruction asking for a bare JSON object
 * with exactly four fields (summary, confidence_in_choice,
 * what_could_be_better, lesson_learned). The user part lists the four context
 * values, one per labelled line.
 *
 * Fallbacks applied to the user part:
 *   - falsy prompt (including null/undefined/'') → '(пусто)'
 *   - null/undefined recommendedNode            → 'не определён'
 *   - null/undefined actualNode                 → 'direct'
 *   - chainExecuted that is not a non-empty array → '[]'
 *
 * @param {object} [opts]
 * @param {string|null|undefined} opts.prompt — the user's original prompt text
 * @param {string|null|undefined} opts.recommendedNode — node recommended by the router
 * @param {string|null|undefined} opts.actualNode — node actually chosen
 * @param {string[]|null|undefined} opts.chainExecuted — chain steps executed, joined with ' → '
 * @returns {{ system: string, user: string }}
 */
export function buildSelfAssessmentPrompt({ prompt, recommendedNode, actualNode, chainExecuted } = {}) {
  // One "label: value" line of the user message.
  const labelled = (label, value) => `${label}: ${value}`;

  let chainText = '[]';
  if (Array.isArray(chainExecuted) && chainExecuted.length > 0) {
    chainText = chainExecuted.join(' → ');
  }

  const preamble = [
    'Ты — внутренний наблюдатель роутинговой системы Claude Code.',
    'Твоя задача — честно оценить качество роутингового решения, принятого в этой сессии.',
    'Отвечай ТОЛЬКО валидным JSON-объектом без markdown-обёрток, ровно 4 поля:',
  ];
  const fieldSpecs = [
    ' "summary": строка — краткое описание принятого решения (до 120 символов)',
    ' "confidence_in_choice": число от 0.0 до 1.0 — насколько оптимальным был выбор',
    ' "what_could_be_better": строка или null — что можно было сделать иначе',
    ' "lesson_learned": строка или null — чему учит этот эпизод для будущих сессий',
  ];
  const closing = 'Не добавляй лишних полей. Не используй markdown. Только JSON.';

  const userLines = [
    'Контекст роутингового решения:',
    '',
    labelled('Запрос пользователя', prompt || '(пусто)'),
    labelled('Рекомендованный узел роутером', recommendedNode ?? 'не определён'),
    labelled('Фактически выбранный узел', actualNode ?? 'direct'),
    labelled('Выполненная цепочка', chainText),
    '',
    'Оцени это решение. Верни JSON с 4 полями.',
  ];

  return {
    system: [...preamble, ...fieldSpecs, closing].join('\n'),
    user: userLines.join('\n'),
  };
}
// ---------------------------------------------------------------------------
// Runtime flag reader
// ---------------------------------------------------------------------------
/**
 * Look up a runtime flag stored as JSON at <home>/.claude/runtime/<name>.json.
 *
 * The flag is the string found under the file's "value" key. Anything that
 * prevents reading such a string — file absent, read error, invalid JSON,
 * "value" missing or not a string — yields the fallback 'off'.
 *
 * @param {string} name — flag file basename without the .json extension
 * @param {object} [opts]
 * @param {string} [opts.homedir] — home directory override (used by tests)
 * @param {{ existsSync: Function, readFileSync: Function }} [opts.fsImpl] — fs override (used by tests)
 * @returns {string} the stored value, or 'off'
 */
export function readRuntimeFlag(name, { homedir, fsImpl } = {}) {
  const FALLBACK = 'off';
  const baseDir = homedir ?? osHomedir();
  const io = fsImpl ?? { existsSync, readFileSync };
  try {
    const flagPath = join(baseDir, '.claude', 'runtime', `${name}.json`);
    if (io.existsSync(flagPath)) {
      // Destructuring a null/primitive JSON document either throws (caught
      // below) or yields undefined — both end up as the fallback.
      const { value } = JSON.parse(io.readFileSync(flagPath, 'utf-8'));
      return typeof value === 'string' ? value : FALLBACK;
    }
    return FALLBACK;
  } catch {
    return FALLBACK;
  }
}
// ---------------------------------------------------------------------------
// API caller (async, fail-quiet)
// ---------------------------------------------------------------------------
const DEFAULT_BASE_URL = 'https://api.proxyapi.ru/anthropic';
const DEFAULT_MODEL = 'claude-sonnet-4-6';
const DEFAULT_TIMEOUT_MS = 10000;
const MAX_TOKENS = 512;

/**
 * POST the self-assessment prompt to `<baseUrl>/v1/messages` and return the
 * text of the first content block of the response.
 *
 * Fail-quiet contract: a missing key, a prompt/body construction error, a
 * network error, a non-2xx status, a JSON parse error, an unexpected response
 * shape, a timeout or an external abort all resolve to null. The returned
 * promise never rejects.
 *
 * Cancellation: the request always runs on an internal AbortController. Both
 * the timeout and the caller's `abortSignal` (when given) abort that
 * controller AND settle the call with null, so the call ends on time even
 * when `fetchImpl` ignores the signal. The timer and the abort listener are
 * always released before returning, so no pending timer outlives the call.
 *
 * @param {object} [opts]
 * @param {string|null|undefined} opts.prompt
 * @param {string|null|undefined} opts.recommendedNode
 * @param {string|null|undefined} opts.actualNode
 * @param {string[]|null|undefined} opts.chainExecuted
 * @param {string|null|undefined} opts.apiKey — API key; falsy → null without any request
 * @param {string} [opts.baseUrl] — API base URL (trailing slashes are ignored)
 * @param {string} [opts.model] — model alias
 * @param {Function} [opts.fetchImpl] — override fetch (for tests)
 * @param {number} [opts.timeoutMs] — timeout in ms
 * @param {AbortSignal} [opts.abortSignal] — external abort signal
 * @returns {Promise<string|null>}
 */
export async function callSelfAssessmentApi({
  prompt,
  recommendedNode,
  actualNode,
  chainExecuted,
  apiKey,
  baseUrl = DEFAULT_BASE_URL,
  model = DEFAULT_MODEL,
  fetchImpl,
  timeoutMs = DEFAULT_TIMEOUT_MS,
  abortSignal,
} = {}) {
  // Guard: no key → skip silently, before doing any other work.
  if (!apiKey) return null;
  // Guard: caller has already cancelled → never start a request.
  if (abortSignal?.aborted) return null;

  // One controller owns the request; timeout and external abort both feed it.
  const controller = new AbortController();
  let resolveCancelled;
  const cancelled = new Promise((resolve) => {
    resolveCancelled = resolve;
  });
  const cancel = () => {
    controller.abort(); // lets a real fetch() tear the connection down
    resolveCancelled(null); // wins the race even if fetchImpl ignores the signal
  };
  let timeoutId;

  try {
    const fetchFn = fetchImpl ?? globalThis.fetch;
    const { system, user } = buildSelfAssessmentPrompt({ prompt, recommendedNode, actualNode, chainExecuted });
    // Drop trailing slashes so 'https://host/anthropic/' does not yield '//v1/messages'.
    const url = `${String(baseUrl).replace(/\/+$/, '')}/v1/messages`;
    const body = JSON.stringify({
      model,
      max_tokens: MAX_TOKENS,
      system,
      messages: [{ role: 'user', content: user }],
    });

    abortSignal?.addEventListener('abort', cancel, { once: true });
    timeoutId = setTimeout(cancel, timeoutMs);

    // Async wrapper: a synchronous throw from fetchFn becomes a rejection too.
    const request = async () => {
      const response = await fetchFn(url, {
        method: 'POST',
        headers: {
          'content-type': 'application/json',
          'x-api-key': apiKey,
          'authorization': `Bearer ${apiKey}`,
          'anthropic-version': '2023-06-01',
        },
        body,
        signal: controller.signal,
      });
      if (!response.ok) return null;
      const data = await response.json();
      const text = data?.content?.[0]?.text;
      return typeof text === 'string' ? text : null;
    };

    // Race: first settlement wins; request failures are mapped to null.
    const result = await Promise.race([request().catch(() => null), cancelled]);
    return result ?? null;
  } catch {
    // Unexpected error while preparing the request → fail-quiet.
    return null;
  } finally {
    // Release the timer and listener so nothing keeps the process alive.
    clearTimeout(timeoutId);
    abortSignal?.removeEventListener('abort', cancel);
  }
}
+260
View File
@@ -0,0 +1,260 @@
/**
* Tests for tools/observer-self-assessment-api.mjs
* Phase 3 deferred follow-up #5: real LLM self-assessment API call.
* Written test-first (TDD): these tests were authored before the implementation existed.
*/
import { describe, it, expect } from 'vitest';
import {
buildSelfAssessmentPrompt,
callSelfAssessmentApi,
readRuntimeFlag,
} from './observer-self-assessment-api.mjs';
// ---------------------------------------------------------------------------
// 1. buildSelfAssessmentPrompt — all 4 fields interpolated
// ---------------------------------------------------------------------------
// Verifies that each of the four context fields lands on its own labelled
// line of the user message.
describe('buildSelfAssessmentPrompt — all fields interpolated', () => {
  it('returns system+user strings with all 4 fields present in user string', () => {
    const { system, user } = buildSelfAssessmentPrompt({
      prompt: 'напиши тест для биллинга',
      recommendedNode: '#62',
      actualNode: '#19',
      chainExecuted: ['#19', '#62'],
    });
    expect(typeof system).toBe('string');
    expect(system.length).toBeGreaterThan(0);
    expect(typeof user).toBe('string');
    expect(user).toContain('напиши тест для биллинга');
    // Bare '#62' / '#19' checks cannot tell the fields apart (both ids also
    // appear in the chain), so pin every value together with its label.
    expect(user).toContain('Рекомендованный узел роутером: #62');
    expect(user).toContain('Фактически выбранный узел: #19');
    expect(user).toContain('Выполненная цепочка: #19 → #62');
  });
});
// ---------------------------------------------------------------------------
// 2. buildSelfAssessmentPrompt — handles missing/null inputs gracefully
// ---------------------------------------------------------------------------
// Verifies the fallback placeholders used when context fields are absent.
describe('buildSelfAssessmentPrompt — null/undefined inputs', () => {
  it('returns valid strings when all inputs are undefined/null', () => {
    const { system, user } = buildSelfAssessmentPrompt({});
    expect(typeof system).toBe('string');
    expect(typeof user).toBe('string');
    // No raw stringification of missing values may leak into the prompt.
    expect(user).not.toContain('undefined');
    expect(user).not.toContain('[object Object]');
    // Every field must fall back to its documented placeholder.
    expect(user).toContain('Запрос пользователя: (пусто)');
    expect(user).toContain('Рекомендованный узел роутером: не определён');
    expect(user).toContain('Фактически выбранный узел: direct');
    expect(user).toContain('Выполненная цепочка: []');
  });
  it('handles null recommendedNode and empty chainExecuted', () => {
    const { user } = buildSelfAssessmentPrompt({
      prompt: 'test',
      recommendedNode: null,
      actualNode: 'direct',
      chainExecuted: [],
    });
    expect(user).toContain('Запрос пользователя: test');
    // The two inputs named in the test title must hit their placeholders.
    expect(user).toContain('Рекомендованный узел роутером: не определён');
    expect(user).toContain('Выполненная цепочка: []');
    expect(user).not.toContain('null');
  });
});
// ---------------------------------------------------------------------------
// 3. callSelfAssessmentApi — returns null when apiKey is missing/empty
// ---------------------------------------------------------------------------
// Verifies the apiKey guard: a falsy key returns null without any fetch call.
describe('callSelfAssessmentApi — missing apiKey', () => {
  it('returns null immediately when apiKey is falsy (no fetch call)', async () => {
    let fetchCalled = false;
    const fakeFetch = async () => { fetchCalled = true; };
    const result = await callSelfAssessmentApi({
      prompt: 'x', recommendedNode: '#1', actualNode: '#1', chainExecuted: [],
      apiKey: '',
      fetchImpl: fakeFetch,
    });
    expect(result).toBeNull();
    expect(fetchCalled).toBe(false);
  });
  it('returns null when apiKey is undefined', async () => {
    // Inject a spy here as well: without it a regression in the guard would
    // fall through to the real global fetch and hit the network.
    let fetchCalled = false;
    const fakeFetch = async () => { fetchCalled = true; };
    const result = await callSelfAssessmentApi({
      prompt: 'x', recommendedNode: '#1', actualNode: '#1', chainExecuted: [],
      apiKey: undefined,
      fetchImpl: fakeFetch,
    });
    expect(result).toBeNull();
    expect(fetchCalled).toBe(false);
  });
});
// ---------------------------------------------------------------------------
// 4. callSelfAssessmentApi — returns text on 200 + content[0].text
// ---------------------------------------------------------------------------
// Happy path: an ok response whose first content block carries text.
describe('callSelfAssessmentApi — successful 200 response', () => {
  it('returns content[0].text on ok response', async () => {
    const expectedText = '{"summary":"chose correctly","confidence_in_choice":0.9,"what_could_be_better":null,"lesson_learned":null}';
    // Minimal stand-in for a fetch Response: only `ok` and `json()` are read.
    const payload = { content: [{ type: 'text', text: expectedText }] };
    const stubFetch = () => Promise.resolve({
      ok: true,
      json: () => Promise.resolve(payload),
    });
    const callOptions = {
      prompt: 'do something',
      recommendedNode: '#19',
      actualNode: '#19',
      chainExecuted: ['#19'],
      apiKey: 'test-key',
      baseUrl: 'https://api.example.com/anthropic',
      model: 'claude-sonnet-4-6',
      fetchImpl: stubFetch,
      timeoutMs: 5000,
    };
    const actual = await callSelfAssessmentApi(callOptions);
    expect(actual).toBe(expectedText);
  });
});
// ---------------------------------------------------------------------------
// 5. callSelfAssessmentApi — returns null on non-2xx (r.ok=false)
// ---------------------------------------------------------------------------
// A response with ok=false must be mapped to null.
describe('callSelfAssessmentApi — non-2xx response', () => {
  it('returns null when response.ok is false', async () => {
    // Stand-in for a 429 reply carrying an error body.
    const rateLimited = {
      ok: false,
      status: 429,
      json: () => Promise.resolve({ error: { message: 'rate limited' } }),
    };
    const stubFetch = () => Promise.resolve(rateLimited);
    const outcome = await callSelfAssessmentApi({
      prompt: 'x',
      recommendedNode: '#1',
      actualNode: '#1',
      chainExecuted: [],
      apiKey: 'test-key',
      fetchImpl: stubFetch,
      timeoutMs: 5000,
    });
    expect(outcome).toBeNull();
  });
});
// ---------------------------------------------------------------------------
// 6. callSelfAssessmentApi — returns null on fetch throw
// ---------------------------------------------------------------------------
// A rejected fetch promise must be swallowed and reported as null.
describe('callSelfAssessmentApi — fetch throws', () => {
  it('returns null (fail-quiet) when fetch throws a network error', async () => {
    const rejectingFetch = () => Promise.reject(new Error('network error'));
    const outcome = await callSelfAssessmentApi({
      prompt: 'x',
      recommendedNode: '#1',
      actualNode: '#1',
      chainExecuted: [],
      apiKey: 'test-key',
      fetchImpl: rejectingFetch,
      timeoutMs: 5000,
    });
    expect(outcome).toBeNull();
  });
});
// ---------------------------------------------------------------------------
// 7. callSelfAssessmentApi — returns null on timeout
// ---------------------------------------------------------------------------
// The timeout must win the race against a fetch that never settles.
describe('callSelfAssessmentApi — timeout', () => {
  it('returns null when fetch never resolves within timeoutMs', async () => {
    // This fetch stand-in ignores its arguments (including the signal) and
    // hands back a promise that stays pending forever.
    const hangingFetch = () => new Promise(() => {});
    const startedAt = Date.now();
    const outcome = await callSelfAssessmentApi({
      prompt: 'x',
      recommendedNode: '#1',
      actualNode: '#1',
      chainExecuted: [],
      apiKey: 'test-key',
      fetchImpl: hangingFetch,
      timeoutMs: 30, // short timeout keeps the test fast
    });
    const elapsedMs = Date.now() - startedAt;
    expect(outcome).toBeNull();
    // Generous upper bound: the call must end near the timeout, not hang.
    expect(elapsedMs).toBeLessThan(500);
  });
});
// ---------------------------------------------------------------------------
// 8. callSelfAssessmentApi — sends correct headers and body
// ---------------------------------------------------------------------------
// Pins the wire format of the outgoing request: URL, method, all four
// headers and the JSON body.
describe('callSelfAssessmentApi — request format', () => {
  it('sends correct headers and body shape (spy fetchImpl)', async () => {
    let capturedUrl, capturedOpts;
    const fakeFetch = async (url, opts) => {
      capturedUrl = url;
      capturedOpts = opts;
      return {
        ok: true,
        json: async () => ({ content: [{ type: 'text', text: 'ok' }] }),
      };
    };
    const result = await callSelfAssessmentApi({
      prompt: 'test prompt',
      recommendedNode: '#62',
      actualNode: '#62',
      chainExecuted: ['#62'],
      apiKey: 'my-secret-key',
      baseUrl: 'https://api.proxyapi.ru/anthropic',
      model: 'claude-sonnet-4-6',
      fetchImpl: fakeFetch,
      timeoutMs: 5000,
    });
    expect(result).toBe('ok');
    expect(capturedUrl).toBe('https://api.proxyapi.ru/anthropic/v1/messages');
    expect(capturedOpts.method).toBe('POST');
    const headers = capturedOpts.headers;
    // Both auth headers are required (two-header pattern); an `||` check
    // would still pass if one of them were dropped.
    expect(headers['x-api-key']).toBe('my-secret-key');
    expect(headers['authorization']).toBe('Bearer my-secret-key');
    expect(headers['content-type']).toBe('application/json');
    expect(headers['anthropic-version']).toBe('2023-06-01');
    const body = JSON.parse(capturedOpts.body);
    expect(body.model).toBe('claude-sonnet-4-6');
    expect(typeof body.system).toBe('string');
    expect(body.system.length).toBeGreaterThan(0);
    expect(Array.isArray(body.messages)).toBe(true);
    expect(body.messages).toHaveLength(1);
    expect(body.messages[0].role).toBe('user');
    expect(body.messages[0].content).toContain('test prompt');
    expect(body.max_tokens).toBeGreaterThan(0);
  });
});
// ---------------------------------------------------------------------------
// 9. readRuntimeFlag — reads value from file; returns 'off' on missing/malformed
// ---------------------------------------------------------------------------
// readRuntimeFlag: returns the stored string, or 'off' whenever the flag file
// is absent, unparsable, or lacks a "value" key.
describe('readRuntimeFlag', () => {
  const FLAG_NAME = 'self-assessment-mode';
  // Assemble an fs stand-in from an "exists" predicate and a "read" function.
  const stubFs = (existsSync, readFileSync) => ({ existsSync, readFileSync });

  it('returns the value from {"value":"on"} when file exists', () => {
    const fsImpl = stubFs(
      (filePath) => filePath.endsWith('self-assessment-mode.json'),
      () => '{"value":"on"}',
    );
    expect(readRuntimeFlag(FLAG_NAME, { homedir: '/fake/home', fsImpl })).toBe('on');
  });

  it('returns "off" when file does not exist', () => {
    const fsImpl = stubFs(
      () => false,
      () => { throw new Error('no file'); },
    );
    expect(readRuntimeFlag(FLAG_NAME, { homedir: '/fake', fsImpl })).toBe('off');
  });

  it('returns "off" on malformed JSON', () => {
    const fsImpl = stubFs(() => true, () => 'NOT JSON');
    expect(readRuntimeFlag(FLAG_NAME, { homedir: '/fake', fsImpl })).toBe('off');
  });

  it('returns "off" when value field is missing', () => {
    // The document parses fine but has no "value" key.
    const fsImpl = stubFs(() => true, () => '{"mode":"on"}');
    expect(readRuntimeFlag(FLAG_NAME, { homedir: '/fake', fsImpl })).toBe('off');
  });
});
+19 -1
View File
@@ -19,6 +19,7 @@ import { join } from 'path';
import { sanitize, sanitizeWithCount } from './observer-pii-filter.mjs';
import { parseTranscript, extractLastUserPromptText } from './observer-transcript-parser.mjs';
import { detectMethodDirected, loadKnownNodes } from './observer-routing-detector.mjs';
import { callSelfAssessmentApi, readRuntimeFlag } from './observer-self-assessment-api.mjs';
const REQUIRED_FIELDS = ['task_id', 'timestamps', 'path_type', 'outcome', 'primary_rationale'];
const V2_FIELDS = [
@@ -294,7 +295,7 @@ function currentMonth() {
if (process.argv[1] && process.argv[1].replace(/\\/g, '/').endsWith('/observer-stop-hook.mjs')) {
const chunks = [];
process.stdin.on('data', (c) => chunks.push(c));
process.stdin.on('end', () => {
process.stdin.on('end', async () => {
let ctx = {};
try {
const raw = Buffer.concat(chunks).toString('utf-8');
@@ -315,6 +316,23 @@ if (process.argv[1] && process.argv[1].replace(/\\/g, '/').endsWith('/observer-s
}
try {
const ep = buildEpisodeFromContext(ctx, transcriptText);
// Step 3.5: self-assessment API call (fail-quiet).
// Only runs when the runtime flag is 'on' and ROUTER_LLM_KEY is set.
const saMode = readRuntimeFlag('self-assessment-mode');
const saApiKey = process.env.ROUTER_LLM_KEY || null;
if (saMode === 'on' && saApiKey) {
const rat = ep.primary_rationale ?? {};
const apiResult = await callSelfAssessmentApi({
prompt: ctx.prompt || null,
recommendedNode: rat.recommended_node || null,
actualNode: rat.node_chosen || null,
chainExecuted: rat.chain_executed || [],
apiKey: saApiKey,
});
ep.self_assessment = buildSelfAssessment({ apiResult });
}
// Always write the episode first — exit-0-safe (spec §5.1 step 1).
appendEpisode(ep);
// Then the routing-gate (spec §5.1 steps 2-4).