// Pass 2 — classifier latency bucket. <500ms = fast (cache hit territory),
// 500-2000 = medium (cold call), 2000-10000 = slow (network jitter / overflow),
// >10000 = very_slow (retries fired). Null on non-LLM paths.
/**
 * Map a classifier latency (milliseconds) onto a coarse factor-matrix bucket.
 *
 * @param {number|string|null|undefined} latency - Latency in ms. Finite
 *   numbers and non-blank numeric strings are accepted; anything else is
 *   treated as "no measurement".
 * @returns {'fast'|'medium'|'slow'|'very_slow'|'null'} Bucket label; the
 *   string 'null' marks a missing, non-numeric, non-finite or negative value.
 */
function latencyBucket(latency) {
  // Guard BEFORE coercion: Number(null), Number(''), Number(false) and
  // Number([]) are all 0, which would file "no LLM call" episodes (stored as
  // latency_ms: null) under 'fast' and skew the axis.
  const isNumericString = typeof latency === 'string' && latency.trim() !== '';
  if (typeof latency !== 'number' && !isNumericString) return 'null';

  const n = Number(latency);
  if (!Number.isFinite(n) || n < 0) return 'null';
  if (n < 500) return 'fast';
  if (n < 2000) return 'medium';
  if (n < 10000) return 'slow';
  return 'very_slow';
}
*/ diff --git a/tools/brain-retro-analyzer.test.mjs b/tools/brain-retro-analyzer.test.mjs index 9a5f3c97..e81371b5 100644 --- a/tools/brain-retro-analyzer.test.mjs +++ b/tools/brain-retro-analyzer.test.mjs @@ -516,3 +516,32 @@ describe('buildFactorMatrix — Pass 1 cheap axes (project-brain-factor-analysis } }); }); + +describe('buildFactorMatrix — Pass 2 classifier-metric axes', () => { + it('latency_bucket axis: fast / medium / slow / very_slow / null', () => { + const m = buildFactorMatrix([ + { ...ep(), _inferredOutcome: 'success', classifier_output: { latency_ms: 250 } }, + { ...ep(), _inferredOutcome: 'success', classifier_output: { latency_ms: 1500 } }, + { ...ep(), _inferredOutcome: 'rework', classifier_output: { latency_ms: 5000 } }, + { ...ep(), _inferredOutcome: 'blocked', classifier_output: { latency_ms: 15000 } }, + { ...ep(), _inferredOutcome: 'unknown', classifier_output: null }, + ]); + expect(m.latency_bucket.fast.success).toBe(1); + expect(m.latency_bucket.medium.success).toBe(1); + expect(m.latency_bucket.slow.rework).toBe(1); + expect(m.latency_bucket.very_slow.blocked).toBe(1); + expect(m.latency_bucket.null.unknown).toBe(1); + }); + + it('error_type axis: reads classifier_output.llm_error verbatim with null default', () => { + const m = buildFactorMatrix([ + { ...ep(), _inferredOutcome: 'rework', classifier_output: { llm_error: 'timeout' } }, + { ...ep(), _inferredOutcome: 'rework', classifier_output: { llm_error: 'econnreset' } }, + { ...ep(), _inferredOutcome: 'success', classifier_output: { llm_error: null } }, + { ...ep(), _inferredOutcome: 'success', classifier_output: null }, + ]); + expect(m.error_type.timeout.rework).toBe(1); + expect(m.error_type.econnreset.rework).toBe(1); + expect(m.error_type.null.success).toBe(2); + }); +}); diff --git a/tools/observer-state-enricher.mjs b/tools/observer-state-enricher.mjs index bf10914f..7117f842 100644 --- a/tools/observer-state-enricher.mjs +++ b/tools/observer-state-enricher.mjs @@ -65,6 +65,14 
/**
 * Extract the classifier's alternative-node list, keeping at most the first
 * three entries so the episode JSONL line stays bounded.
 *
 * @param {object} cls - Classification object; `alternatives_considered` is read from it.
 * @returns {Array|null} A new array holding up to three leading entries, or
 *   null when `alternatives_considered` is not an array.
 */
function pickAlternatives(cls) {
  // Sonnet sometimes returns 5+ alternatives; only the top three are kept.
  const keepAtMost = 3;
  const { alternatives_considered: candidates } = cls;
  return Array.isArray(candidates) ? candidates.slice(0, keepAtMost) : null;
}
source: 'prefilter' }, + }); + expect(out.latency_ms).toBeNull(); + expect(out.retry_count_internal).toBeNull(); + expect(out.llm_error).toBeNull(); + expect(out.alternatives_considered).toBeNull(); + }); + + it('captures llm_error category on degraded LLM path', async () => { + const { extractClassifierOutput } = await import('./observer-state-enricher.mjs'); + const out = extractClassifierOutput({ + classification: { + task_type: 'feature', source: 'regex', + llm_error_type: 'timeout', latency_ms: 30000, retry_count_internal: 4, + }, + }); + expect(out.llm_error).toBe('timeout'); + expect(out.latency_ms).toBe(30000); + expect(out.retry_count_internal).toBe(4); + }); +}); diff --git a/tools/router-classifier.mjs b/tools/router-classifier.mjs index 3e3244e9..83f933b3 100644 --- a/tools/router-classifier.mjs +++ b/tools/router-classifier.mjs @@ -407,6 +407,7 @@ export async function callAnthropicAPI(promptOrMessages, { perAttemptTimeoutMs = 30_000, sleepImpl = (ms) => new Promise((res) => setTimeout(res, ms)), onUsage, + onMetrics, }) { const url = `${String(baseUrl).replace(/\/+$/, '')}/v1/messages`; let body; @@ -432,40 +433,65 @@ export async function callAnthropicAPI(promptOrMessages, { 'content-type': 'application/json', }; + // Pass 2 metric capture (project-brain-factor-analysis-4passes). 
+ const started = Date.now(); + let attempt = 0; + const emitMetrics = () => { + if (!onMetrics) return; + try { onMetrics({ latency_ms: Date.now() - started, retry_count_internal: attempt }); } catch { /* swallow */ } + }; + let lastError; - for (let attempt = 0; attempt <= maxRetries; attempt++) { - const ctrl = new AbortController(); - const timer = setTimeout(() => ctrl.abort(new Error(`per-attempt timeout ${perAttemptTimeoutMs}ms`)), perAttemptTimeoutMs); - try { - const r = await fetchImpl(url, { method: 'POST', headers, body, signal: ctrl.signal }); - if (r.ok) { - const data = await r.json(); - if (onUsage && data.usage) { - try { onUsage(data.usage); } catch { /* swallow callback errors */ } + try { + for (attempt = 0; attempt <= maxRetries; attempt++) { + const ctrl = new AbortController(); + const timer = setTimeout(() => ctrl.abort(new Error(`per-attempt timeout ${perAttemptTimeoutMs}ms`)), perAttemptTimeoutMs); + try { + const r = await fetchImpl(url, { method: 'POST', headers, body, signal: ctrl.signal }); + if (r.ok) { + const data = await r.json(); + if (onUsage && data.usage) { + try { onUsage(data.usage); } catch { /* swallow callback errors */ } + } + return data.content?.[0]?.text || ''; } - return data.content?.[0]?.text || ''; + // Retry on 5xx and 429; fail fast on 4xx (auth/quota/bad request — retry won't help). + if (r.status >= 500 || r.status === 429) { + lastError = new Error(`Router LLM ${r.status}: ${await r.text()}`); + } else { + const fatal = new Error(`Router LLM ${r.status}: ${await r.text()}`); + fatal.fatal = true; + throw fatal; + } + } catch (err) { + // Re-throw fatal errors (4xx) instead of retrying them. + if (err && err.fatal) { clearTimeout(timer); throw err; } + // Network-level failure (fetch failed / ECONNRESET / TLS / per-attempt timeout). Retry-eligible. + lastError = err; + } finally { + clearTimeout(timer); } - // Retry on 5xx and 429; fail fast on 4xx (auth/quota/bad request — retry won't help). 
// Pass 2 — categorize the LLM transport failure for the factor-analysis
// error_type axis.
/**
 * Reduce an error thrown by the LLM call to one coarse category.
 *
 * Signals are checked from most to least specific:
 *   1. the canonical "Router LLM <status>: <body>" message prefix — the status
 *      is read from the prefix only, so digits inside the response body
 *      cannot change the category;
 *   2. transport error codes / keywords found in the error's message, `code`,
 *      or anywhere along its `cause` chain (Node's fetch rejects with a bare
 *      `TypeError: fetch failed` and carries the real reason in `cause`);
 *   3. a loose three-digit status scan of the top-level message, kept for
 *      errors that do not use the canonical prefix;
 *   4. abort / timeout names and the word "timeout".
 *
 * @param {*} err - The thrown value (normally an Error; strings are tolerated).
 * @returns {'http_4xx'|'http_5xx'|'econnreset'|'timeout'|'other'} Category.
 *   HTTP 429 is grouped with 'http_5xx'; every network-level failure keyword
 *   (ECONNRESET, ECONNREFUSED, ENOTFOUND, EAI_AGAIN, socket hang up) is
 *   reported as 'econnreset'.
 */
export function classifyLLMError(err) {
  if (!err) return 'other';
  const topMessage = String(err.message || err);

  // 1. Authoritative status from the canonical prefix.
  const prefixed = /^Router LLM (\d{3}):/.exec(topMessage);
  if (prefixed) {
    const status = Number(prefixed[1]);
    if (status >= 500 || status === 429) return 'http_5xx';
    if (status >= 400) return 'http_4xx';
  }

  // 2. Gather message / code / name along the cause chain. Depth is bounded
  //    so a self-referencing cause cannot loop forever.
  const parts = [];
  const names = [];
  let link = err;
  for (let depth = 0; link != null && depth < 5; depth += 1) {
    if (typeof link !== 'object') {
      parts.push(String(link));
      break;
    }
    parts.push(String(link.message ?? ''));
    if (link.code != null) parts.push(String(link.code));
    if (link.name != null) names.push(String(link.name));
    link = link.cause;
  }
  const detail = parts.join(' | ');

  // Checked before the loose status scan: a message such as
  // "connect ECONNREFUSED 127.0.0.1:502" is a network failure, not an HTTP 5xx.
  if (/ECONNRESET|ECONNREFUSED|ENOTFOUND|EAI_AGAIN|socket hang up/i.test(detail)) return 'econnreset';
  if (/ETIMEDOUT|UND_ERR_\w*TIMEOUT/i.test(detail)) return 'timeout';

  // 3. Legacy loose scan for messages without the canonical prefix.
  if (err.fatal && /\b4\d\d\b/.test(topMessage)) return 'http_4xx';
  if (/\b5\d\d\b/.test(topMessage) || /\b429\b/.test(topMessage)) return 'http_5xx';

  // 4. Abort / timeout by error name or wording.
  if (names.includes('AbortError') || names.includes('TimeoutError') || /\btimeout\b/i.test(detail)) {
    return 'timeout';
  }
  return 'other';
}
+ const llmCall = options.llmCall || (async ({ onMetrics } = {}) => { const apiKey = process.env.ROUTER_LLM_KEY; if (!apiKey) return null; const structured = buildClassifierPromptStructured(prompt, registry, { @@ -518,26 +546,48 @@ export async function classify(prompt, registry, options = {}) { baseUrl: process.env.ROUTER_LLM_BASE_URL || undefined, model: options.model || CLASSIFIER_MODEL, onUsage: options.onUsage, + onMetrics, }); return parseClassifierResponse(text); }); + let metrics = null; + const captureMetrics = (m) => { metrics = m; }; let llmResult; try { - llmResult = await llmCall(); + llmResult = await llmCall({ onMetrics: captureMetrics }); } catch (err) { // Layer 3 — regex fallback on LLM transport error. const r = classifyByRegex(prompt, registry); - return { ...r, llmError: err.message, degraded: true }; + return { + ...r, + llmError: err.message, + llm_error_type: classifyLLMError(err), + latency_ms: metrics?.latency_ms ?? null, + retry_count_internal: metrics?.retry_count_internal ?? null, + degraded: true, + }; } if (!llmResult) { - // Layer 3 — regex fallback on no key / unparseable. + // Layer 3 — regex fallback on no key (metrics null) / unparseable response + // (metrics set, classify as parse_null so the analyzer error_type axis + // distinguishes "API never called" from "API returned garbage"). const r = classifyByRegex(prompt, registry); - return r; + return { + ...r, + llm_error_type: metrics ? 'parse_null' : 'no_key', + latency_ms: metrics?.latency_ms ?? null, + retry_count_internal: metrics?.retry_count_internal ?? null, + }; } - const finalResult = { ...llmResult, source: 'llm' }; + const finalResult = { + ...llmResult, + source: 'llm', + latency_ms: metrics?.latency_ms ?? null, + retry_count_internal: metrics?.retry_count_internal ?? 
null, + }; if (cache) cache.set(key, finalResult); return finalResult; } diff --git a/tools/router-classifier.test.mjs b/tools/router-classifier.test.mjs index 623563d4..0d2c29e3 100644 --- a/tools/router-classifier.test.mjs +++ b/tools/router-classifier.test.mjs @@ -341,3 +341,106 @@ describe('classify — isolation from Claude Code auth', () => { } }); }); + +describe('callAnthropicAPI — Pass 2 metrics (project-brain-factor-analysis-4passes)', () => { + it('emits onMetrics({latency_ms, retry_count_internal}) on success', async () => { + const fetchImpl = async () => ({ ok: true, json: async () => ({ content: [{ text: '{"task_type":"question"}' }] }) }); + let captured = null; + await callAnthropicAPI('hi', { apiKey: 'k', fetchImpl, onMetrics: (m) => { captured = m; } }); + expect(captured).not.toBeNull(); + expect(typeof captured.latency_ms).toBe('number'); + expect(captured.latency_ms).toBeGreaterThanOrEqual(0); + expect(captured.retry_count_internal).toBe(0); + }); + + it('emits onMetrics with retry_count_internal>0 after 5xx retries', async () => { + let calls = 0; + const fetchImpl = async () => { + calls += 1; + if (calls < 3) return { ok: false, status: 503, text: async () => 'unavailable' }; + return { ok: true, json: async () => ({ content: [{ text: '{"task_type":"question"}' }] }) }; + }; + let captured = null; + const sleepImpl = () => Promise.resolve(); // skip backoff in tests + await callAnthropicAPI('hi', { apiKey: 'k', fetchImpl, sleepImpl, onMetrics: (m) => { captured = m; } }); + expect(captured.retry_count_internal).toBe(2); + }); + + it('emits onMetrics even on fatal 4xx (so latency / retry count reach the classifier state)', async () => { + const fetchImpl = async () => ({ ok: false, status: 401, text: async () => 'invalid key' }); + let captured = null; + await expect(callAnthropicAPI('hi', { apiKey: 'k', fetchImpl, onMetrics: (m) => { captured = m; } })).rejects.toThrow(/401/); + expect(captured).not.toBeNull(); + expect(typeof 
captured.latency_ms).toBe('number'); + expect(captured.retry_count_internal).toBe(0); + }); +}); + +describe('classify — Pass 2 metrics surface to result', () => { + const fakeRegistry = { nodes: [{ id: '#19', status: 'active', triggers: [] }], chains: {} }; + + it('attaches latency_ms / retry_count_internal on LLM success', async () => { + const llmCall = async ({ onMetrics } = {}) => { + if (onMetrics) onMetrics({ latency_ms: 432, retry_count_internal: 1 }); + return { task_type: 'feature', recommended_node: '#19', recommended_chain: null, recommended_chain_id: null, alternatives_considered: [] }; + }; + const r = await classify('новая фича: добавь endpoint X', fakeRegistry, { llmCall }); + expect(r.source).toBe('llm'); + expect(r.latency_ms).toBe(432); + expect(r.retry_count_internal).toBe(1); + }); + + it('passes through alternatives_considered from Sonnet (truncated to top-3 by enricher, not by classify)', async () => { + const llmCall = async () => ({ + task_type: 'feature', recommended_node: '#19', recommended_chain: null, recommended_chain_id: null, + alternatives_considered: [{ node: '#19', score: 0.8 }, { node: '#62', score: 0.4 }], + }); + const r = await classify('новая фича X', fakeRegistry, { llmCall }); + expect(r.alternatives_considered).toBeDefined(); + expect(r.alternatives_considered).toHaveLength(2); + }); + + it('sets llm_error_type=econnreset / latency / retry_count on transport error', async () => { + const llmCall = async ({ onMetrics } = {}) => { + if (onMetrics) onMetrics({ latency_ms: 1234, retry_count_internal: 4 }); + const e = new Error('fetch failed: ECONNRESET'); throw e; + }; + const r = await classify('что-то непонятное вообще', fakeRegistry, { llmCall }); + expect(r.source).toBe('regex'); + expect(r.llm_error_type).toBe('econnreset'); + expect(r.latency_ms).toBe(1234); + expect(r.retry_count_internal).toBe(4); + }); + + it('sets llm_error_type=timeout on AbortError or per-attempt timeout', async () => { + const llmCall = async () 
=> { + const e = new Error('per-attempt timeout 30000ms'); throw e; + }; + const r = await classify('что-то непонятное вообще', fakeRegistry, { llmCall }); + expect(r.llm_error_type).toBe('timeout'); + }); + + it('sets llm_error_type=http_4xx on fatal upstream 4xx', async () => { + const llmCall = async () => { const e = new Error('Router LLM 401: invalid key'); e.fatal = true; throw e; }; + const r = await classify('что-то непонятное вообще', fakeRegistry, { llmCall }); + expect(r.llm_error_type).toBe('http_4xx'); + }); + + it('sets llm_error_type=http_5xx on exhausted retries', async () => { + const llmCall = async () => { const e = new Error('Router LLM 503: bad gateway'); throw e; }; + const r = await classify('что-то непонятное вообще', fakeRegistry, { llmCall }); + expect(r.llm_error_type).toBe('http_5xx'); + }); + + it('sets llm_error_type=parse_null when llmCall returns null (LLM produced unparseable response)', async () => { + // Mocked llmCall returns null without throwing — simulates upstream parse failure + // after a successful HTTP exchange. onMetrics still fires from the mocked path. + const llmCall = async ({ onMetrics } = {}) => { + if (onMetrics) onMetrics({ latency_ms: 800, retry_count_internal: 0 }); + return null; + }; + const r = await classify('что-то непонятное вообще', fakeRegistry, { llmCall }); + expect(r.llm_error_type).toBe('parse_null'); + expect(r.latency_ms).toBe(800); + }); +});