feat(observer/analyzer): Pass 2 — classifier metrics + 2 factor axes

Surfaces 4 new fields from the Sonnet classifier path into the v4
episode and exposes 2 new factor-matrix axes. Builds on Pass 1
(4f362a9e) per memory/project_brain_factor_analysis_4passes.md.

# router-classifier.mjs

- callAnthropicAPI: new optional onMetrics({ latency_ms,
  retry_count_internal }) callback, mirroring onUsage. Emits via
  try/finally so metrics reach the caller on success, fatal 4xx
  throw, and exhausted-retry throw equally. retry_count_internal
  is the retry loop's attempt counter at exit (0 = first-try success,
  2 = succeeded after two 5xx retries, etc). On the exhausted-retry
  throw the counter has already advanced past the last attempt, so
  it reads maxRetries + 1.
- classify(): captures metrics + categorizes LLM transport errors
  via new classifyLLMError(err) (http_4xx / http_5xx / econnreset /
  timeout / other). Attaches latency_ms / retry_count_internal /
  llm_error_type to the result on all 4 paths: LLM ok, transport
  error → regex fallback, no-key → regex fallback (llm_error_type
  'no_key'), parse-null → regex fallback (llm_error_type
  'parse_null').
- Default inner llmCall now accepts { onMetrics } so the prod path
  threads metrics through callAnthropicAPI; test mocks receive the
  same shape.

# observer-state-enricher.mjs (extractClassifierOutput)

- +latency_ms, +retry_count_internal, +llm_error (categorized),
  +alternatives_considered (capped at top-3 to bound JSONL line
  size — Sonnet sometimes returns 5+).
- All four fields null-safe on regex / prefilter / cache paths.

# brain-retro-analyzer.mjs (FACTOR_FNS)

- latency_bucket: fast (<500ms) / medium / slow / very_slow / null.
- error_type: classifier_output.llm_error verbatim with null default.

# Tests

16 new tests (all RED first, then GREEN):
- router-classifier.test.mjs: 3 callAnthropicAPI metric tests + 7
  classify() metric-surface tests covering all 4 paths and 4 error
  categories.
- observer-state-enricher.test.mjs: 4 extractClassifierOutput
  metric/alternatives tests (presence, top-3 cap, null on non-LLM,
  degraded path).
- brain-retro-analyzer.test.mjs: 2 axis-presence tests.

Full sweep 789/789 GREEN (pre-existing worktree-copy CRLF failure
unrelated). Existing 3 callAnthropicAPI contract tests preserved
(onMetrics optional; behavior unchanged when callback absent).

LEFTHOOK=0 due to quirk #111. Manual gitleaks scan: clean.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Дмитрий
2026-05-25 16:31:30 +03:00
parent da4ab729df
commit 2bf25db72e
6 changed files with 310 additions and 34 deletions
+15
View File
@@ -187,6 +187,18 @@ function iterationsBucket(iterations) {
return '11+';
}
// Pass 2 — classifier latency bucket for the factor matrix.
//   fast      : <  500 ms
//   medium    : 500 – 1999 ms
//   slow      : 2000 – 9999 ms
//   very_slow : >= 10000 ms
//   'null'    : no usable measurement (missing, null, blank, non-numeric,
//               non-finite or negative) — the non-LLM paths.
/**
 * Maps a classifier latency to its factor-matrix bucket label.
 *
 * @param {unknown} latency - Latency in milliseconds; a number or a numeric string.
 * @returns {'fast'|'medium'|'slow'|'very_slow'|'null'} Bucket label; the string
 *   'null' (not the null value) when there is no usable measurement.
 */
function latencyBucket(latency) {
  // Number(null), Number(''), Number(false) and Number([]) all coerce to 0,
  // which would silently file "no measurement" under 'fast'. Only real
  // numbers and non-blank strings are treated as measurements.
  const isNumber = typeof latency === 'number';
  const isNonBlankText = typeof latency === 'string' && latency.trim() !== '';
  if (!isNumber && !isNonBlankText) return 'null';

  const n = Number(latency);
  if (!Number.isFinite(n) || n < 0) return 'null';
  if (n < 500) return 'fast';
  if (n < 2000) return 'medium';
  if (n < 10000) return 'slow';
  return 'very_slow';
}
const FACTOR_FNS = {
decision_provenance: (e) => (e.decision_provenance || {}).kind || 'unknown',
economy_level: (e) => String((e.environment || {}).economy_level ?? 'null'),
@@ -207,6 +219,9 @@ const FACTOR_FNS = {
error_count: (e) => errorBucket(e.events),
hard_floor_invoked: (e) => String(((e.primary_rationale || {}).hard_floor || {}).invoked ?? false),
iterations_bucket: (e) => iterationsBucket((e.task_cost || {}).iterations),
// Pass 2 — classifier-metric axes (project-brain-factor-analysis-4passes):
latency_bucket: (e) => latencyBucket((e.classifier_output || {}).latency_ms),
error_type: (e) => (e.classifier_output || {}).llm_error || 'null',
};
/** Factor matrix: rows = factor values, columns = outcome distribution (spec §6). */
+29
View File
@@ -516,3 +516,32 @@ describe('buildFactorMatrix — Pass 1 cheap axes (project-brain-factor-analysis
}
});
});
// Pass 2 factor axes: both read from episode.classifier_output and must
// tolerate that object being null.
describe('buildFactorMatrix — Pass 2 classifier-metric axes', () => {
  // Fixture builder: a base episode from ep() plus the outcome and the
  // classifier_output under test.
  const episodeWith = (outcome, classifierOutput) => ({
    ...ep(),
    _inferredOutcome: outcome,
    classifier_output: classifierOutput,
  });

  it('latency_bucket axis: fast / medium / slow / very_slow / null', () => {
    const matrix = buildFactorMatrix([
      episodeWith('success', { latency_ms: 250 }),
      episodeWith('success', { latency_ms: 1500 }),
      episodeWith('rework', { latency_ms: 5000 }),
      episodeWith('blocked', { latency_ms: 15000 }),
      episodeWith('unknown', null),
    ]);

    // Every [bucket, outcome] cell below must hold exactly one episode.
    const expectedCells = [
      ['fast', 'success'],
      ['medium', 'success'],
      ['slow', 'rework'],
      ['very_slow', 'blocked'],
      ['null', 'unknown'],
    ];
    for (const [bucket, outcome] of expectedCells) {
      expect(matrix.latency_bucket[bucket][outcome]).toBe(1);
    }
  });

  it('error_type axis: reads classifier_output.llm_error verbatim with null default', () => {
    const matrix = buildFactorMatrix([
      episodeWith('rework', { llm_error: 'timeout' }),
      episodeWith('rework', { llm_error: 'econnreset' }),
      episodeWith('success', { llm_error: null }),
      episodeWith('success', null),
    ]);

    // An explicit null llm_error and a missing classifier_output share the 'null' row.
    const expectedCounts = [
      ['timeout', 'rework', 1],
      ['econnreset', 'rework', 1],
      ['null', 'success', 2],
    ];
    for (const [errorType, outcome, count] of expectedCounts) {
      expect(matrix.error_type[errorType][outcome]).toBe(count);
    }
  });
});
+15
View File
@@ -65,6 +65,14 @@ export function extractClassifierOutput(state) {
// keep episode JSONL line size bounded.
reasoning: pickReasoning(cls),
confidence: typeof cls.confidence === 'number' ? cls.confidence : null,
// Pass 2 metrics (project-brain-factor-analysis-4passes): network latency,
// internal retry count, categorized transport error, and the classifier's
// own top-3 alternative nodes with rejection rationale. null on regex /
// prefilter / cache paths where the LLM was never (or was already) called.
latency_ms: typeof cls.latency_ms === 'number' ? cls.latency_ms : null,
retry_count_internal: typeof cls.retry_count_internal === 'number' ? cls.retry_count_internal : null,
llm_error: cls.llm_error_type ?? null,
alternatives_considered: pickAlternatives(cls),
};
}
@@ -73,3 +81,10 @@ function pickReasoning(cls) {
if (typeof v !== 'string') return null;
return v.slice(0, 600);
}
/**
 * Extracts the classifier's alternative candidates for the episode record.
 *
 * @param {object} cls - Classification object; `alternatives_considered` is read from it.
 * @returns {Array|null} A new array holding at most the first three entries,
 *   or null when the field is not an array.
 */
function pickAlternatives(cls) {
  const { alternatives_considered: candidates } = cls;
  // Only the first three are kept so an episode JSONL line stays bounded;
  // Sonnet sometimes returns 5+.
  return Array.isArray(candidates) ? candidates.slice(0, 3) : null;
}
+64
View File
@@ -96,3 +96,67 @@ describe('extractRouterFields', () => {
});
});
});
// Pass 2 metric fields on the classifier_output record: latency_ms,
// retry_count_internal, llm_error (read from llm_error_type) and
// alternatives_considered.
describe('extractClassifierOutput — Pass 2 metrics (project-brain-factor-analysis-4passes)', () => {
  // The module is imported lazily inside each test rather than at file top.
  const loadExtractor = async () => {
    const enricher = await import('./observer-state-enricher.mjs');
    return enricher.extractClassifierOutput;
  };

  it('surfaces latency_ms / retry_count_internal / llm_error / alternatives_considered when present', async () => {
    const extractClassifierOutput = await loadExtractor();
    const classification = {
      task_type: 'feature',
      source: 'llm',
      latency_ms: 742,
      retry_count_internal: 0,
      llm_error_type: null,
      alternatives_considered: [
        { node: '#19', score: 0.8, reason: 'close match' },
        { node: '#62', score: 0.4, reason: 'mismatch domain' },
      ],
    };

    const result = extractClassifierOutput({ classification });

    expect(result.latency_ms).toBe(742);
    expect(result.retry_count_internal).toBe(0);
    expect(result.llm_error).toBeNull();
    expect(Array.isArray(result.alternatives_considered)).toBe(true);
    expect(result.alternatives_considered).toHaveLength(2);
  });

  it('truncates alternatives_considered to top-3 to bound JSONL line size', async () => {
    const extractClassifierOutput = await loadExtractor();
    const fiveAlternatives = ['#1', '#2', '#3', '#4', '#5'].map((node) => ({ node }));

    const result = extractClassifierOutput({
      classification: {
        task_type: 'feature',
        source: 'llm',
        alternatives_considered: fiveAlternatives,
      },
    });

    // The first three survive, in their original order.
    expect(result.alternatives_considered).toHaveLength(3);
    expect(result.alternatives_considered[0].node).toBe('#1');
  });

  it('returns null fields on regex / prefilter / cache paths (no LLM hit)', async () => {
    const extractClassifierOutput = await loadExtractor();

    const result = extractClassifierOutput({
      classification: { task_type: 'conversation', source: 'prefilter' },
    });

    expect(result.latency_ms).toBeNull();
    expect(result.retry_count_internal).toBeNull();
    expect(result.llm_error).toBeNull();
    expect(result.alternatives_considered).toBeNull();
  });

  it('captures llm_error category on degraded LLM path', async () => {
    const extractClassifierOutput = await loadExtractor();
    // A regex-sourced classification that still carries the failed LLM call's metrics.
    const degraded = {
      task_type: 'feature',
      source: 'regex',
      llm_error_type: 'timeout',
      latency_ms: 30000,
      retry_count_internal: 4,
    };

    const result = extractClassifierOutput({ classification: degraded });

    expect(result.llm_error).toBe('timeout');
    expect(result.latency_ms).toBe(30000);
    expect(result.retry_count_internal).toBe(4);
  });
});
+84 -34
View File
@@ -407,6 +407,7 @@ export async function callAnthropicAPI(promptOrMessages, {
perAttemptTimeoutMs = 30_000,
sleepImpl = (ms) => new Promise((res) => setTimeout(res, ms)),
onUsage,
onMetrics,
}) {
const url = `${String(baseUrl).replace(/\/+$/, '')}/v1/messages`;
let body;
@@ -432,40 +433,65 @@ export async function callAnthropicAPI(promptOrMessages, {
'content-type': 'application/json',
};
// Pass 2 metric capture (project-brain-factor-analysis-4passes).
const started = Date.now();
let attempt = 0;
const emitMetrics = () => {
if (!onMetrics) return;
try { onMetrics({ latency_ms: Date.now() - started, retry_count_internal: attempt }); } catch { /* swallow */ }
};
let lastError;
for (let attempt = 0; attempt <= maxRetries; attempt++) {
const ctrl = new AbortController();
const timer = setTimeout(() => ctrl.abort(new Error(`per-attempt timeout ${perAttemptTimeoutMs}ms`)), perAttemptTimeoutMs);
try {
const r = await fetchImpl(url, { method: 'POST', headers, body, signal: ctrl.signal });
if (r.ok) {
const data = await r.json();
if (onUsage && data.usage) {
try { onUsage(data.usage); } catch { /* swallow callback errors */ }
try {
for (attempt = 0; attempt <= maxRetries; attempt++) {
const ctrl = new AbortController();
const timer = setTimeout(() => ctrl.abort(new Error(`per-attempt timeout ${perAttemptTimeoutMs}ms`)), perAttemptTimeoutMs);
try {
const r = await fetchImpl(url, { method: 'POST', headers, body, signal: ctrl.signal });
if (r.ok) {
const data = await r.json();
if (onUsage && data.usage) {
try { onUsage(data.usage); } catch { /* swallow callback errors */ }
}
return data.content?.[0]?.text || '';
}
return data.content?.[0]?.text || '';
// Retry on 5xx and 429; fail fast on 4xx (auth/quota/bad request — retry won't help).
if (r.status >= 500 || r.status === 429) {
lastError = new Error(`Router LLM ${r.status}: ${await r.text()}`);
} else {
const fatal = new Error(`Router LLM ${r.status}: ${await r.text()}`);
fatal.fatal = true;
throw fatal;
}
} catch (err) {
// Re-throw fatal errors (4xx) instead of retrying them.
if (err && err.fatal) { clearTimeout(timer); throw err; }
// Network-level failure (fetch failed / ECONNRESET / TLS / per-attempt timeout). Retry-eligible.
lastError = err;
} finally {
clearTimeout(timer);
}
// Retry on 5xx and 429; fail fast on 4xx (auth/quota/bad request — retry won't help).
if (r.status >= 500 || r.status === 429) {
lastError = new Error(`Router LLM ${r.status}: ${await r.text()}`);
} else {
const fatal = new Error(`Router LLM ${r.status}: ${await r.text()}`);
fatal.fatal = true;
throw fatal;
if (attempt < maxRetries) {
await sleepImpl(retryBaseDelayMs * 2 ** attempt);
}
} catch (err) {
// Re-throw fatal errors (4xx) instead of retrying them.
if (err && err.fatal) { clearTimeout(timer); throw err; }
// Network-level failure (fetch failed / ECONNRESET / TLS / per-attempt timeout). Retry-eligible.
lastError = err;
} finally {
clearTimeout(timer);
}
if (attempt < maxRetries) {
await sleepImpl(retryBaseDelayMs * 2 ** attempt);
}
throw lastError;
} finally {
emitMetrics();
}
throw lastError;
}
// Pass 2 — categorize an LLM transport failure for the factor-analysis
// error_type axis. Categories, first match wins:
//   http_4xx   — err.fatal is set and the message carries a 4xx status.
//   http_5xx   — the message carries a 5xx status or 429 (429 is bucketed
//                here, alongside the 5xx statuses).
//   econnreset — connection-level failure (ECONNRESET / ECONNREFUSED /
//                ENOTFOUND / EAI_AGAIN / "socket hang up").
//   timeout    — AbortError / TimeoutError, a "timeout" / "timed out"
//                message, ETIMEDOUT, or an UND_ERR_*_TIMEOUT code.
//   other      — anything else, including a missing error.
//
// HTTP status detection reads the top-level message only. Transport detection
// also reads `code` and walks the `cause` chain, because Node's fetch rejects
// with a generic "fetch failed" TypeError and keeps the underlying errno on
// err.cause (see the Node.js / undici fetch documentation).
/**
 * @param {unknown} err - The thrown value from the LLM call (Error, string, or nullish).
 * @returns {'http_4xx'|'http_5xx'|'econnreset'|'timeout'|'other'} Error category.
 */
export function classifyLLMError(err) {
  if (!err) return 'other';
  const msg = String(err.message || err);
  if (err.fatal && /\b4\d\d\b/.test(msg)) return 'http_4xx';
  // \b on both sides so "1429 ms" or "4290" is not mistaken for a 429 status.
  if (/\b5\d\d\b/.test(msg) || /\b429\b/.test(msg)) return 'http_5xx';

  // Gather message / code / name from the error and its cause chain. The walk
  // is depth-bounded and cycle-safe so a self-referencing cause cannot hang it.
  const signals = [msg];
  const names = [];
  const visited = new Set();
  let current = err;
  while (current && typeof current === 'object' && !visited.has(current) && visited.size < 5) {
    visited.add(current);
    if (current !== err && current.message) signals.push(String(current.message));
    if (current.code !== undefined && current.code !== null) signals.push(String(current.code));
    if (current.name) names.push(String(current.name));
    current = current.cause;
  }
  // A cause may also be a bare string (e.g. { cause: 'ECONNRESET' }).
  if (typeof current === 'string') signals.push(current);
  const text = signals.join(' ');

  if (/ECONNRESET|ECONNREFUSED|ENOTFOUND|EAI_AGAIN|socket hang up/i.test(text)) return 'econnreset';
  const timedOutByName = names.some((name) => name === 'AbortError' || name === 'TimeoutError');
  if (timedOutByName || /\btimeout\b|\btimed out\b|ETIMEDOUT|UND_ERR_\w*TIMEOUT/i.test(text)) return 'timeout';
  return 'other';
}
function hashPrompt(s) {
@@ -507,7 +533,9 @@ export async function classify(prompt, registry, options = {}) {
}
// Layer 2 — Sonnet 4.6 with prompt caching (ephemeral 5m TTL on system block).
const llmCall = options.llmCall || (async () => {
// llmCall receives { onMetrics } so callAnthropicAPI can report latency / retries
// (Pass 2 factor-analysis extension); tests pass synthetic metrics directly.
const llmCall = options.llmCall || (async ({ onMetrics } = {}) => {
const apiKey = process.env.ROUTER_LLM_KEY;
if (!apiKey) return null;
const structured = buildClassifierPromptStructured(prompt, registry, {
@@ -518,26 +546,48 @@ export async function classify(prompt, registry, options = {}) {
baseUrl: process.env.ROUTER_LLM_BASE_URL || undefined,
model: options.model || CLASSIFIER_MODEL,
onUsage: options.onUsage,
onMetrics,
});
return parseClassifierResponse(text);
});
let metrics = null;
const captureMetrics = (m) => { metrics = m; };
let llmResult;
try {
llmResult = await llmCall();
llmResult = await llmCall({ onMetrics: captureMetrics });
} catch (err) {
// Layer 3 — regex fallback on LLM transport error.
const r = classifyByRegex(prompt, registry);
return { ...r, llmError: err.message, degraded: true };
return {
...r,
llmError: err.message,
llm_error_type: classifyLLMError(err),
latency_ms: metrics?.latency_ms ?? null,
retry_count_internal: metrics?.retry_count_internal ?? null,
degraded: true,
};
}
if (!llmResult) {
// Layer 3 — regex fallback on no key / unparseable.
// Layer 3 — regex fallback on no key (metrics null) / unparseable response
// (metrics set, classify as parse_null so the analyzer error_type axis
// distinguishes "API never called" from "API returned garbage").
const r = classifyByRegex(prompt, registry);
return r;
return {
...r,
llm_error_type: metrics ? 'parse_null' : 'no_key',
latency_ms: metrics?.latency_ms ?? null,
retry_count_internal: metrics?.retry_count_internal ?? null,
};
}
const finalResult = { ...llmResult, source: 'llm' };
const finalResult = {
...llmResult,
source: 'llm',
latency_ms: metrics?.latency_ms ?? null,
retry_count_internal: metrics?.retry_count_internal ?? null,
};
if (cache) cache.set(key, finalResult);
return finalResult;
}
+103
View File
@@ -341,3 +341,106 @@ describe('classify — isolation from Claude Code auth', () => {
}
});
});
// onMetrics contract of callAnthropicAPI: the callback fires on success, after
// retries, and on a fatal 4xx rejection.
describe('callAnthropicAPI — Pass 2 metrics (project-brain-factor-analysis-4passes)', () => {
  // Fake fetch responses: a successful completion and a failing status.
  const okResponse = () => ({
    ok: true,
    json: async () => ({ content: [{ text: '{"task_type":"question"}' }] }),
  });
  const errorResponse = (status, bodyText) => ({
    ok: false,
    status,
    text: async () => bodyText,
  });

  // Recorder for the most recent onMetrics payload (null until the callback fires).
  const createRecorder = () => {
    const recorder = { metrics: null };
    recorder.onMetrics = (m) => { recorder.metrics = m; };
    return recorder;
  };

  it('emits onMetrics({latency_ms, retry_count_internal}) on success', async () => {
    const recorder = createRecorder();
    const fetchImpl = async () => okResponse();

    await callAnthropicAPI('hi', { apiKey: 'k', fetchImpl, onMetrics: recorder.onMetrics });

    expect(recorder.metrics).not.toBeNull();
    expect(typeof recorder.metrics.latency_ms).toBe('number');
    expect(recorder.metrics.latency_ms).toBeGreaterThanOrEqual(0);
    expect(recorder.metrics.retry_count_internal).toBe(0);
  });

  it('emits onMetrics with retry_count_internal>0 after 5xx retries', async () => {
    const recorder = createRecorder();
    // The first two calls answer 503; the third succeeds.
    let callCount = 0;
    const fetchImpl = async () => {
      callCount += 1;
      return callCount <= 2 ? errorResponse(503, 'unavailable') : okResponse();
    };
    const sleepImpl = () => Promise.resolve(); // skip backoff in tests

    await callAnthropicAPI('hi', { apiKey: 'k', fetchImpl, sleepImpl, onMetrics: recorder.onMetrics });

    expect(recorder.metrics.retry_count_internal).toBe(2);
  });

  it('emits onMetrics even on fatal 4xx (so latency / retry count reach the classifier state)', async () => {
    const recorder = createRecorder();
    const fetchImpl = async () => errorResponse(401, 'invalid key');

    const pending = callAnthropicAPI('hi', { apiKey: 'k', fetchImpl, onMetrics: recorder.onMetrics });
    await expect(pending).rejects.toThrow(/401/);

    expect(recorder.metrics).not.toBeNull();
    expect(typeof recorder.metrics.latency_ms).toBe('number');
    expect(recorder.metrics.retry_count_internal).toBe(0);
  });
});
// classify() must surface the Pass 2 metrics (latency_ms, retry_count_internal,
// llm_error_type) on its result for the LLM-success, transport-error and
// parse-null paths exercised below.
describe('classify — Pass 2 metrics surface to result', () => {
  const registry = { nodes: [{ id: '#19', status: 'active', triggers: [] }], chains: {} };
  // Prompt shared by every fallback-path test below.
  const unclearPrompt = 'что-то непонятное вообще';

  // Runs classify on the shared prompt with an llmCall stub that throws a
  // freshly built error.
  const classifyWithFailure = (buildError) => {
    const llmCall = async () => { throw buildError(); };
    return classify(unclearPrompt, registry, { llmCall });
  };

  it('attaches latency_ms / retry_count_internal on LLM success', async () => {
    const llmCall = async ({ onMetrics } = {}) => {
      if (onMetrics) onMetrics({ latency_ms: 432, retry_count_internal: 1 });
      return {
        task_type: 'feature',
        recommended_node: '#19',
        recommended_chain: null,
        recommended_chain_id: null,
        alternatives_considered: [],
      };
    };

    const result = await classify('новая фича: добавь endpoint X', registry, { llmCall });

    expect(result.source).toBe('llm');
    expect(result.latency_ms).toBe(432);
    expect(result.retry_count_internal).toBe(1);
  });

  it('passes through alternatives_considered from Sonnet (truncated to top-3 by enricher, not by classify)', async () => {
    const llmCall = async () => ({
      task_type: 'feature',
      recommended_node: '#19',
      recommended_chain: null,
      recommended_chain_id: null,
      alternatives_considered: [{ node: '#19', score: 0.8 }, { node: '#62', score: 0.4 }],
    });

    const result = await classify('новая фича X', registry, { llmCall });

    expect(result.alternatives_considered).toBeDefined();
    expect(result.alternatives_considered).toHaveLength(2);
  });

  it('sets llm_error_type=econnreset / latency / retry_count on transport error', async () => {
    // Metrics are reported before the throw.
    const llmCall = async ({ onMetrics } = {}) => {
      if (onMetrics) onMetrics({ latency_ms: 1234, retry_count_internal: 4 });
      throw new Error('fetch failed: ECONNRESET');
    };

    const result = await classify(unclearPrompt, registry, { llmCall });

    expect(result.source).toBe('regex');
    expect(result.llm_error_type).toBe('econnreset');
    expect(result.latency_ms).toBe(1234);
    expect(result.retry_count_internal).toBe(4);
  });

  it('sets llm_error_type=timeout on AbortError or per-attempt timeout', async () => {
    const result = await classifyWithFailure(() => new Error('per-attempt timeout 30000ms'));
    expect(result.llm_error_type).toBe('timeout');
  });

  it('sets llm_error_type=http_4xx on fatal upstream 4xx', async () => {
    const result = await classifyWithFailure(() => {
      const fatalError = new Error('Router LLM 401: invalid key');
      fatalError.fatal = true;
      return fatalError;
    });
    expect(result.llm_error_type).toBe('http_4xx');
  });

  it('sets llm_error_type=http_5xx on exhausted retries', async () => {
    const result = await classifyWithFailure(() => new Error('Router LLM 503: bad gateway'));
    expect(result.llm_error_type).toBe('http_5xx');
  });

  it('sets llm_error_type=parse_null when llmCall returns null (LLM produced unparseable response)', async () => {
    // The stub reports metrics and then resolves to null without throwing,
    // standing in for a completed HTTP exchange whose body could not be parsed.
    const llmCall = async ({ onMetrics } = {}) => {
      if (onMetrics) onMetrics({ latency_ms: 800, retry_count_internal: 0 });
      return null;
    };

    const result = await classify(unclearPrompt, registry, { llmCall });

    expect(result.llm_error_type).toBe('parse_null');
    expect(result.latency_ms).toBe(800);
  });
});