7b4da1477e
Brain-retro #6 follow-up #2 (consolidated). Eight independent fixes: A1 — task_cost wiring (cost tracking) - router-prehook.mjs: capture classifier LLM usage via onUsage callback, persist to state.task_cost.classifier_input_tokens / output_tokens. - observer-transcript-parser.mjs: merge router-state.task_cost on top of extractTokenUsage(turn). State-file values win for classifier/ self_assessment/reviewer fields. - New buildCostFromClassifierUsage() exported from router-prehook. - Verified live: state file now shows real input_tokens=190 / output_tokens=598 / cache_read=10075 (was 0 before). A2 — self-assessment coverage - observer-self-assessment-api.mjs: DEFAULT_TIMEOUT_MS 10s -> 30s. - .claude/settings.json: Stop-hook timeout 15s -> 60s. - Same Windows TLS handshake issue. Was 85% no_self_assessment in retro #6. B3 — brain-retro SKILL.md reconciliation - Step 5b: batch=default for N>=20, subagent for N<20. C1 — dead-code cleanup - Removed recommendNode import + getClassificationMap + getDormancy from observer-transcript-parser.mjs. G — parseClassifierResponse Pass 3 (fixLLMJsonQuirks) - Root cause: real Sonnet output sometimes contains raw newlines inside string values (multi-line reason_for_choice) and trailing commas, which strict JSON.parse rejects. Result was llm_error_type=parse_null on every other call, falling back to regex with task_type=unknown. - Fix: after Pass 1 (clean) and Pass 2 (brace-extract) fail, try Pass 3 that escapes raw newline/tab inside string values and strips trailing commas before final JSON.parse attempt. Pure char-walk, no JSON5 dep. H — 'unknown' added to NON_BLOCKING_TASK_TYPES in router-tool-gate.mjs - Until G fully proves itself, blocking Bash/Edit on unknown is too strict. With G in place, parse_null should be rare; H gives a safety net. Tests added: +9 across 5 test files. Regression: 913 vitest tests in tools/. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
303 lines
12 KiB
JavaScript
303 lines
12 KiB
JavaScript
/**
|
|
* Tests for tools/observer-self-assessment-api.mjs
|
|
* Phase 3 deferred follow-up #5: real LLM self-assessment API call.
|
|
* TDD — these tests are written BEFORE the implementation exists.
|
|
*/
|
|
|
|
import { describe, it, expect } from 'vitest';
|
|
import {
|
|
buildSelfAssessmentPrompt,
|
|
callSelfAssessmentApi,
|
|
readRuntimeFlag,
|
|
} from './observer-self-assessment-api.mjs';
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// 1. buildSelfAssessmentPrompt — all 4 fields interpolated
|
|
// ---------------------------------------------------------------------------
|
|
/**
 * buildSelfAssessmentPrompt must surface every input field (prompt,
 * recommendedNode, actualNode, chainExecuted) inside the `user` message and
 * return a non-empty `system` message.
 */
describe('buildSelfAssessmentPrompt — all fields interpolated', () => {
  it('returns system+user strings with all 4 fields present in user string', () => {
    const { system, user } = buildSelfAssessmentPrompt({
      prompt: 'напиши тест для биллинга',
      recommendedNode: '#62',
      actualNode: '#19',
      // '#33' occurs ONLY in the chain. Without it, every chain id would also
      // be a recommended/actual node id and the chain assertion below would
      // pass even if chainExecuted were dropped from the prompt entirely.
      // NOTE(review): assumes chain ids are rendered verbatim in the user
      // string — confirm against the implementation.
      chainExecuted: ['#19', '#33', '#62'],
    });

    expect(typeof system).toBe('string');
    expect(system.length).toBeGreaterThan(0);
    expect(typeof user).toBe('string');
    expect(user).toContain('напиши тест для биллинга');
    expect(user).toContain('#62'); // recommendedNode
    expect(user).toContain('#19'); // actualNode
    expect(user).toContain('#33'); // only reachable via chainExecuted serialisation
  });
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// 2. buildSelfAssessmentPrompt — handles missing/null inputs gracefully
|
|
// ---------------------------------------------------------------------------
|
|
/**
 * buildSelfAssessmentPrompt must not throw or leak raw JS placeholders when
 * some or all inputs are absent.
 */
describe('buildSelfAssessmentPrompt — null/undefined inputs', () => {
  it('returns valid strings when all inputs are undefined/null', () => {
    const built = buildSelfAssessmentPrompt({});

    expect(typeof built.system).toBe('string');
    expect(typeof built.user).toBe('string');

    // Missing fields must be replaced by fallbacks, never stringified as-is.
    for (const leakedToken of ['undefined', '[object Object]']) {
      expect(built.user).not.toContain(leakedToken);
    }
  });

  it('handles null recommendedNode and empty chainExecuted', () => {
    const input = {
      prompt: 'test',
      recommendedNode: null,
      actualNode: 'direct',
      chainExecuted: [],
    };

    const built = buildSelfAssessmentPrompt(input);

    expect(built.user).toContain('test');
  });
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// 3. callSelfAssessmentApi — returns null when apiKey is missing/empty
|
|
// ---------------------------------------------------------------------------
|
|
/**
 * Without an API key callSelfAssessmentApi must resolve to null and must not
 * touch the network.
 */
describe('callSelfAssessmentApi — missing apiKey', () => {
  // Fresh argument object per test so no state is shared between cases.
  const minimalArgs = () => ({
    prompt: 'x',
    recommendedNode: '#1',
    actualNode: '#1',
    chainExecuted: [],
  });

  it('returns null immediately when apiKey is falsy (no fetch call)', async () => {
    const spy = { called: false };
    const fakeFetch = async () => {
      spy.called = true;
    };

    const outcome = await callSelfAssessmentApi({
      ...minimalArgs(),
      apiKey: '',
      fetchImpl: fakeFetch,
    });

    expect(outcome).toBeNull();
    expect(spy.called).toBe(false);
  });

  it('returns null when apiKey is undefined', async () => {
    const outcome = await callSelfAssessmentApi({
      ...minimalArgs(),
      apiKey: undefined,
    });

    expect(outcome).toBeNull();
  });
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// 4. callSelfAssessmentApi — returns text on 200 + content[0].text
|
|
// ---------------------------------------------------------------------------
|
|
/**
 * On an ok response the function resolves to the text of the first content
 * item, unchanged.
 */
describe('callSelfAssessmentApi — successful 200 response', () => {
  it('returns content[0].text on ok response', async () => {
    const responseText = '{"summary":"chose correctly","confidence_in_choice":0.9,"what_could_be_better":null,"lesson_learned":null}';

    // Minimal fetch double: a new ok-response object on every call.
    async function fakeFetch() {
      return {
        ok: true,
        async json() {
          return { content: [{ type: 'text', text: responseText }] };
        },
      };
    }

    const callArgs = {
      prompt: 'do something',
      recommendedNode: '#19',
      actualNode: '#19',
      chainExecuted: ['#19'],
      apiKey: 'test-key',
      baseUrl: 'https://api.example.com/anthropic',
      model: 'claude-sonnet-4-6',
      fetchImpl: fakeFetch,
      timeoutMs: 5000,
    };

    const outcome = await callSelfAssessmentApi(callArgs);

    expect(outcome).toBe(responseText);
  });
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// 5. callSelfAssessmentApi — returns null on non-2xx (r.ok=false)
|
|
// ---------------------------------------------------------------------------
|
|
/**
 * A response whose `ok` flag is false (here: HTTP 429) must yield null.
 */
describe('callSelfAssessmentApi — non-2xx response', () => {
  it('returns null when response.ok is false', async () => {
    // Fetch double that reports a rate-limit failure on every call.
    async function fakeFetch() {
      return {
        ok: false,
        status: 429,
        async json() {
          return { error: { message: 'rate limited' } };
        },
      };
    }

    const callArgs = {
      prompt: 'x',
      recommendedNode: '#1',
      actualNode: '#1',
      chainExecuted: [],
      apiKey: 'test-key',
      fetchImpl: fakeFetch,
      timeoutMs: 5000,
    };

    const outcome = await callSelfAssessmentApi(callArgs);

    expect(outcome).toBeNull();
  });
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// 6. callSelfAssessmentApi — returns null on fetch throw
|
|
// ---------------------------------------------------------------------------
|
|
/**
 * A rejected fetch (network failure) must be absorbed: the call resolves to
 * null instead of propagating the error.
 */
describe('callSelfAssessmentApi — fetch throws', () => {
  it('returns null (fail-quiet) when fetch throws a network error', async () => {
    // Fetch double whose returned promise always rejects.
    const fakeFetch = () => Promise.reject(new Error('network error'));

    const callArgs = {
      prompt: 'x',
      recommendedNode: '#1',
      actualNode: '#1',
      chainExecuted: [],
      apiKey: 'test-key',
      fetchImpl: fakeFetch,
      timeoutMs: 5000,
    };

    const outcome = await callSelfAssessmentApi(callArgs);

    expect(outcome).toBeNull();
  });
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// 7. callSelfAssessmentApi — default timeout headroom (A2) and null on timeout
|
|
// ---------------------------------------------------------------------------
|
|
/**
 * Smoke test for the default timeout used when the caller passes no
 * `timeoutMs`.
 *
 * NOTE(review): despite the title, this test cannot distinguish a 10 s default
 * from a 30 s one. Waiting tens of seconds in a unit test is not acceptable,
 * so the fake fetch answers after 50 ms and the test only proves that the
 * default does not abort a fast request and that the success path still
 * returns a string. Pinning the exact default would need fake timers or an
 * exported constant.
 */
describe('callSelfAssessmentApi — A2 default timeout bumped 10s → 30s (2026-05-26)', () => {
  it('default timeoutMs is >= 30000 ms (TLS handshake budget on Windows)', async () => {
    let abortedEarly = false;

    const fakeFetch = (_url, opts) => new Promise((resolve) => {
      // Answer after 50 ms — far below any plausible default timeout.
      const timer = setTimeout(
        () => resolve({ ok: true, json: () => Promise.resolve({ content: [{ text: '{}' }] }) }),
        50,
      );

      if (opts.signal) {
        // If the implementation aborts before the 50 ms answer, record it and
        // cancel the pending timer so it does not outlive the test.
        opts.signal.addEventListener('abort', () => {
          abortedEarly = true;
          clearTimeout(timer);
          resolve(null);
        });
      }
    });

    const result = await callSelfAssessmentApi({
      prompt: 'x', recommendedNode: '#1', actualNode: '#1', chainExecuted: [],
      apiKey: 'test-key',
      fetchImpl: fakeFetch,
      // no explicit timeoutMs — use default
    });

    // A 50 ms fetch must not be cut off by the default timeout.
    expect(abortedEarly).toBe(false);
    // The success path returned the text of the first content item.
    expect(typeof result).toBe('string');
  });
});
|
|
|
|
/**
 * When fetch never settles, the call must give up after `timeoutMs` and
 * resolve to null rather than hang.
 */
describe('callSelfAssessmentApi — timeout', () => {
  it('returns null when fetch never resolves within timeoutMs', async () => {
    // Fetch double whose promise stays pending forever.
    const fakeFetch = () => new Promise(() => {});

    const callArgs = {
      prompt: 'x',
      recommendedNode: '#1',
      actualNode: '#1',
      chainExecuted: [],
      apiKey: 'test-key',
      fetchImpl: fakeFetch,
      timeoutMs: 30, // deliberately tiny so the test stays fast
    };

    const startedAt = Date.now();
    const outcome = await callSelfAssessmentApi(callArgs);
    const elapsedMs = Date.now() - startedAt;

    expect(outcome).toBeNull();
    // Generous upper bound: finished near the 30 ms timeout, not indefinitely.
    expect(elapsedMs).toBeLessThan(500);
  });
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// 8. callSelfAssessmentApi — sends correct headers and body
|
|
// ---------------------------------------------------------------------------
|
|
/**
 * Inspects the outgoing request: URL path, presence of an auth header, and
 * the JSON body shape (model, messages, max_tokens).
 */
describe('callSelfAssessmentApi — request format', () => {
  it('sends correct headers and body shape (spy fetchImpl)', async () => {
    // Records the arguments of the most recent fetch call.
    const seen = { url: undefined, opts: undefined };

    async function fakeFetch(url, opts) {
      seen.url = url;
      seen.opts = opts;
      return {
        ok: true,
        async json() {
          return { content: [{ type: 'text', text: 'ok' }] };
        },
      };
    }

    await callSelfAssessmentApi({
      prompt: 'test prompt',
      recommendedNode: '#62',
      actualNode: '#62',
      chainExecuted: ['#62'],
      apiKey: 'my-secret-key',
      baseUrl: 'https://api.proxyapi.ru/anthropic',
      model: 'claude-sonnet-4-6',
      fetchImpl: fakeFetch,
      timeoutMs: 5000,
    });

    expect(seen.url).toContain('/v1/messages');

    // Either auth header style is accepted.
    const { headers } = seen.opts;
    expect(headers.authorization || headers['x-api-key']).toBeTruthy();

    const payload = JSON.parse(seen.opts.body);
    expect(payload.model).toBe('claude-sonnet-4-6');
    expect(Array.isArray(payload.messages)).toBe(true);
    expect(payload.messages[0].role).toBe('user');
    expect(payload.max_tokens).toBeGreaterThan(0);
  });
});
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// 9. readRuntimeFlag — reads value from file; returns 'off' on missing/malformed
|
|
// ---------------------------------------------------------------------------
|
|
/**
 * readRuntimeFlag reads a flag from a JSON file through an injected fs double.
 * Expected results: the "value" or "mode" field when present, otherwise 'off'
 * (missing file, malformed JSON, or neither field present).
 */
describe('readRuntimeFlag', () => {
  const FLAG_NAME = 'self-assessment-mode';

  // fs double for "file exists and contains `content`".
  const fsWithContent = (content) => ({
    existsSync: () => true,
    readFileSync: () => content,
  });

  // Reads the flag under test using the given fs double and home directory.
  const readFlag = (fsImpl, homedir = '/fake') => readRuntimeFlag(FLAG_NAME, { homedir, fsImpl });

  it('returns the value from {"value":"on"} when file exists', () => {
    const fsImpl = {
      // Only the flag's own file is reported as existing.
      existsSync: (p) => p.endsWith('self-assessment-mode.json'),
      readFileSync: (_p, _enc) => '{"value":"on"}',
    };

    expect(readFlag(fsImpl, '/fake/home')).toBe('on');
  });

  it('returns "off" when file does not exist', () => {
    const fsImpl = {
      existsSync: () => false,
      readFileSync: () => {
        throw new Error('no file');
      },
    };

    expect(readFlag(fsImpl)).toBe('off');
  });

  it('returns "off" on malformed JSON', () => {
    expect(readFlag(fsWithContent('NOT JSON'))).toBe('off');
  });

  it('reads "mode" field when "value" is absent (post-050b349a fix)', () => {
    // Since 050b349a, runtime files use {mode: "on"} as the canonical shape;
    // the legacy "value" key remains an accepted fallback. Here only "mode"
    // is present and must be honoured.
    expect(readFlag(fsWithContent('{"mode":"on"}'))).toBe('on');
  });

  it('returns "off" when neither "mode" nor "value" present', () => {
    expect(readFlag(fsWithContent('{"other":"thing"}'))).toBe('off');
  });
});
|