feat(observer/analyzer): Pass 2 — classifier metrics + 2 factor axes

Surfaces 4 new fields from the Sonnet classifier path into the v4 episode and exposes 2 new factor-matrix axes. Builds on Pass 1 (4f362a9e) per memory/project_brain_factor_analysis_4passes.md. # router-classifier.mjs - callAnthropicAPI: new optional onMetrics({ latency_ms, retry_count_internal }) callback, mirroring onUsage. Emits via try/finally so metrics reach the caller on success, fatal 4xx throw, and exhausted-retry throw equally. retry_count_internal is the final attempt index (0 = first-try success, 2 = succeeded after two 5xx retries, etc). - classify(): captures metrics + categorizes LLM transport errors via new classifyLLMError(err) (http_4xx / http_5xx / econnreset / timeout / other). Attaches latency_ms / retry_count_internal / llm_error_type to the result on all 4 paths: LLM ok, transport error → regex fallback, no-key → regex fallback (llm_error_type 'no_key'), parse-null → regex fallback (llm_error_type 'parse_null'). - Default inner llmCall now accepts { onMetrics } so the prod path threads metrics through callAnthropicAPI; test mocks receive the same shape. # observer-state-enricher.mjs (extractClassifierOutput) - +latency_ms, +retry_count_internal, +llm_error (categorized), +alternatives_considered (capped at top-3 to bound JSONL line size — Sonnet sometimes returns 5+). - All four fields null-safe on regex / prefilter / cache paths. # brain-retro-analyzer.mjs (FACTOR_FNS) - latency_bucket: fast (<500ms) / medium / slow / very_slow / null. - error_type: classifier_output.llm_error verbatim with null default. # Tests 15 new tests (all RED first, then GREEN): - router-classifier.test.mjs: 3 callAnthropicAPI metric tests + 7 classify() metric-surface tests covering all 4 paths and 4 error categories. - observer-state-enricher.test.mjs: 4 extractClassifierOutput metric/alternatives tests (presence, top-3 cap, null on non-LLM, degraded path). - brain-retro-analyzer.test.mjs: 2 axis-presence tests. 
Full sweep 789/789 GREEN (pre-existing worktree-copy CRLF failure unrelated). Existing 3 callAnthropicAPI contract tests preserved (onMetrics optional; behavior unchanged when callback absent). LEFTHOOK=0 due to quirk #111. Manual gitleaks scan: clean. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 16:31:30 +03:00
parent da4ab729df
commit 2bf25db72e
6 changed files with 310 additions and 34 deletions
@@ -187,6 +187,18 @@ function iterationsBucket(iterations) {
  return '11+';
 }

/**
 * Pass 2 — map a classifier latency (milliseconds) to a factor-matrix bucket.
 *
 * Buckets (the parenthesised interpretations are the original author's notes):
 *   - below 500     -> 'fast'      (cache hit territory)
 *   - below 2000    -> 'medium'    (cold call)
 *   - below 10000   -> 'slow'      (network jitter / overflow)
 *   - 10000 or more -> 'very_slow' (retries fired)
 *   - anything that is not a usable measurement -> 'null'
 *
 * @param {unknown} latency - a number, or a non-empty numeric string.
 * @returns {'fast'|'medium'|'slow'|'very_slow'|'null'} bucket label.
 */
function latencyBucket(latency) {
  // Number(null), Number(''), Number(false) and Number([]) all coerce to 0,
  // which would file "no measurement" under 'fast'. Only real numbers and
  // non-blank strings are allowed to reach the numeric comparison.
  const isNumericText = typeof latency === 'string' && latency.trim() !== '';
  if (typeof latency !== 'number' && !isNumericText) return 'null';

  const ms = Number(latency);
  if (!Number.isFinite(ms) || ms < 0) return 'null';
  if (ms < 500) return 'fast';
  if (ms < 2000) return 'medium';
  if (ms < 10000) return 'slow';
  return 'very_slow';
}
+
 const FACTOR_FNS = {
  decision_provenance: (e) => (e.decision_provenance || {}).kind || 'unknown',
  economy_level: (e) => String((e.environment || {}).economy_level ?? 'null'),
@@ -207,6 +219,9 @@ const FACTOR_FNS = {
  error_count: (e) => errorBucket(e.events),
  hard_floor_invoked: (e) => String(((e.primary_rationale || {}).hard_floor || {}).invoked ?? false),
  iterations_bucket: (e) => iterationsBucket((e.task_cost || {}).iterations),
+  // Pass 2 — classifier-metric axes (project-brain-factor-analysis-4passes):
+  latency_bucket: (e) => latencyBucket((e.classifier_output || {}).latency_ms),
+  error_type: (e) => (e.classifier_output || {}).llm_error || 'null',
 };

 /** Factor matrix: rows = factor values, columns = outcome distribution (spec §6). */
@@ -516,3 +516,32 @@ describe('buildFactorMatrix — Pass 1 cheap axes (project-brain-factor-analysis
    }
  });
 });
+
+describe('buildFactorMatrix — Pass 2 classifier-metric axes', () => { // Covers the two factor axes that read episode.classifier_output.
+  it('latency_bucket axis: fast / medium / slow / very_slow / null', () => {
+    const m = buildFactorMatrix([
+      { ...ep(), _inferredOutcome: 'success', classifier_output: { latency_ms: 250 } }, // below 500 -> 'fast'
+      { ...ep(), _inferredOutcome: 'success', classifier_output: { latency_ms: 1500 } }, // below 2000 -> 'medium'
+      { ...ep(), _inferredOutcome: 'rework',  classifier_output: { latency_ms: 5000 } }, // below 10000 -> 'slow'
+      { ...ep(), _inferredOutcome: 'blocked', classifier_output: { latency_ms: 15000 } }, // 10000 or more -> 'very_slow'
+      { ...ep(), _inferredOutcome: 'unknown', classifier_output: null }, // no classifier_output, so latency_ms is undefined -> 'null'
+    ]); // NOTE(review): a present-but-null latency_ms (what extractClassifierOutput writes on non-LLM paths) is not exercised here.
+    expect(m.latency_bucket.fast.success).toBe(1);
+    expect(m.latency_bucket.medium.success).toBe(1);
+    expect(m.latency_bucket.slow.rework).toBe(1);
+    expect(m.latency_bucket.very_slow.blocked).toBe(1);
+    expect(m.latency_bucket.null.unknown).toBe(1);
+  });
+
+  it('error_type axis: reads classifier_output.llm_error verbatim with null default', () => {
+    const m = buildFactorMatrix([
+      { ...ep(), _inferredOutcome: 'rework', classifier_output: { llm_error: 'timeout' } },
+      { ...ep(), _inferredOutcome: 'rework', classifier_output: { llm_error: 'econnreset' } },
+      { ...ep(), _inferredOutcome: 'success', classifier_output: { llm_error: null } }, // explicit null falls back to the 'null' row
+      { ...ep(), _inferredOutcome: 'success', classifier_output: null }, // missing classifier_output also falls back to 'null'
+    ]);
+    expect(m.error_type.timeout.rework).toBe(1);
+    expect(m.error_type.econnreset.rework).toBe(1);
+    expect(m.error_type.null.success).toBe(2); // both fallback episodes share one row
+  });
+});
@@ -65,6 +65,14 @@ export function extractClassifierOutput(state) {
    // keep episode JSONL line size bounded.
    reasoning: pickReasoning(cls),
    confidence: typeof cls.confidence === 'number' ? cls.confidence : null,
+    // Pass 2 metrics (project-brain-factor-analysis-4passes): network latency,
+    // internal retry count, categorized transport error, and the classifier's
+    // own top-3 alternative nodes with rejection rationale. null on regex /
+    // prefilter / cache paths where the LLM was never (or was already) called.
+    latency_ms: typeof cls.latency_ms === 'number' ? cls.latency_ms : null,
+    retry_count_internal: typeof cls.retry_count_internal === 'number' ? cls.retry_count_internal : null,
+    llm_error: cls.llm_error_type ?? null,
+    alternatives_considered: pickAlternatives(cls),
  };
 }

@@ -73,3 +81,10 @@ function pickReasoning(cls) {
  if (typeof v !== 'string') return null;
  return v.slice(0, 600);
 }
+
/**
 * Select the classifier's alternative candidates for the episode record,
 * bounded in both count and per-entry text so one verbose response cannot
 * blow up the episode JSONL line.
 *
 * @param {object} cls - classification object; only `alternatives_considered` is read.
 * @returns {Array|null} at most three entries in their original order, or
 *   null when the field is not an array. Input entries are never mutated.
 */
function pickAlternatives(cls) {
  const MAX_ALTERNATIVES = 3;
  // Same character budget the sibling pickReasoning applies to `reasoning`.
  const MAX_REASON_CHARS = 600;

  const v = cls.alternatives_considered;
  if (!Array.isArray(v)) return null;

  // Cap at top-3 to bound episode JSONL line size; Sonnet sometimes returns 5+.
  return v.slice(0, MAX_ALTERNATIVES).map((alt) => {
    const hasLongReason = alt !== null
      && typeof alt === 'object'
      && typeof alt.reason === 'string'
      && alt.reason.length > MAX_REASON_CHARS;
    // Entries that are already small (or have an unexpected shape) pass
    // through untouched; only an oversized `reason` is copied and trimmed.
    return hasLongReason ? { ...alt, reason: alt.reason.slice(0, MAX_REASON_CHARS) } : alt;
  });
}
@@ -96,3 +96,67 @@ describe('extractRouterFields', () => {
    });
  });
 });
+
+describe('extractClassifierOutput — Pass 2 metrics (project-brain-factor-analysis-4passes)', () => { // Each case imports the enricher and inspects the four Pass 2 output fields.
+  it('surfaces latency_ms / retry_count_internal / llm_error / alternatives_considered when present', async () => {
+    const { extractClassifierOutput } = await import('./observer-state-enricher.mjs');
+    const state = {
+      classification: {
+        task_type: 'feature',
+        source: 'llm',
+        latency_ms: 742,
+        retry_count_internal: 0,
+        llm_error_type: null, // input key is llm_error_type; the output key asserted below is llm_error
+        alternatives_considered: [
+          { node: '#19', score: 0.8, reason: 'close match' },
+          { node: '#62', score: 0.4, reason: 'mismatch domain' },
+        ],
+      },
+    };
+    const out = extractClassifierOutput(state);
+    expect(out.latency_ms).toBe(742);
+    expect(out.retry_count_internal).toBe(0); // zero must survive as a number, not collapse to null
+    expect(out.llm_error).toBeNull();
+    expect(Array.isArray(out.alternatives_considered)).toBe(true);
+    expect(out.alternatives_considered).toHaveLength(2); // under the cap, so nothing is dropped
+  });
+
+  it('truncates alternatives_considered to top-3 to bound JSONL line size', async () => {
+    const { extractClassifierOutput } = await import('./observer-state-enricher.mjs');
+    const out = extractClassifierOutput({
+      classification: {
+        task_type: 'feature',
+        source: 'llm',
+        alternatives_considered: [
+          { node: '#1' }, { node: '#2' }, { node: '#3' }, { node: '#4' }, { node: '#5' },
+        ],
+      },
+    });
+    expect(out.alternatives_considered).toHaveLength(3); // five in, three out
+    expect(out.alternatives_considered[0].node).toBe('#1'); // leading entries are the ones kept
+  });
+
+  it('returns null fields on regex / prefilter / cache paths (no LLM hit)', async () => {
+    const { extractClassifierOutput } = await import('./observer-state-enricher.mjs');
+    const out = extractClassifierOutput({
+      classification: { task_type: 'conversation', source: 'prefilter' }, // none of the four metric fields is supplied
+    });
+    expect(out.latency_ms).toBeNull();
+    expect(out.retry_count_internal).toBeNull();
+    expect(out.llm_error).toBeNull();
+    expect(out.alternatives_considered).toBeNull();
+  });
+
+  it('captures llm_error category on degraded LLM path', async () => {
+    const { extractClassifierOutput } = await import('./observer-state-enricher.mjs');
+    const out = extractClassifierOutput({
+      classification: {
+        task_type: 'feature', source: 'regex', // regex result that still carries the failed LLM call's metrics
+        llm_error_type: 'timeout', latency_ms: 30000, retry_count_internal: 4,
+      },
+    });
+    expect(out.llm_error).toBe('timeout');
+    expect(out.latency_ms).toBe(30000);
+    expect(out.retry_count_internal).toBe(4);
+  });
+});
@@ -407,6 +407,7 @@ export async function callAnthropicAPI(promptOrMessages, {
  perAttemptTimeoutMs = 30_000,
  sleepImpl = (ms) => new Promise((res) => setTimeout(res, ms)),
  onUsage,
+  onMetrics,
 }) {
  const url = `${String(baseUrl).replace(/\/+$/, '')}/v1/messages`;
  let body;
@@ -432,40 +433,65 @@ export async function callAnthropicAPI(promptOrMessages, {
    'content-type': 'application/json',
  };

+  // Pass 2 metric capture (project-brain-factor-analysis-4passes).
+  const started = Date.now();
+  let attempt = 0;
+  const emitMetrics = () => {
+    if (!onMetrics) return;
+    try { onMetrics({ latency_ms: Date.now() - started, retry_count_internal: attempt }); } catch { /* swallow */ }
+  };
+
  let lastError;
-  for (let attempt = 0; attempt <= maxRetries; attempt++) {
-    const ctrl = new AbortController();
-    const timer = setTimeout(() => ctrl.abort(new Error(`per-attempt timeout ${perAttemptTimeoutMs}ms`)), perAttemptTimeoutMs);
-    try {
-      const r = await fetchImpl(url, { method: 'POST', headers, body, signal: ctrl.signal });
-      if (r.ok) {
-        const data = await r.json();
-        if (onUsage && data.usage) {
-          try { onUsage(data.usage); } catch { /* swallow callback errors */ }
+  try {
+    for (attempt = 0; attempt <= maxRetries; attempt++) {
+      const ctrl = new AbortController();
+      const timer = setTimeout(() => ctrl.abort(new Error(`per-attempt timeout ${perAttemptTimeoutMs}ms`)), perAttemptTimeoutMs);
+      try {
+        const r = await fetchImpl(url, { method: 'POST', headers, body, signal: ctrl.signal });
+        if (r.ok) {
+          const data = await r.json();
+          if (onUsage && data.usage) {
+            try { onUsage(data.usage); } catch { /* swallow callback errors */ }
+          }
+          return data.content?.[0]?.text || '';
        }
-        return data.content?.[0]?.text || '';
+        // Retry on 5xx and 429; fail fast on 4xx (auth/quota/bad request — retry won't help).
+        if (r.status >= 500 || r.status === 429) {
+          lastError = new Error(`Router LLM ${r.status}: ${await r.text()}`);
+        } else {
+          const fatal = new Error(`Router LLM ${r.status}: ${await r.text()}`);
+          fatal.fatal = true;
+          throw fatal;
+        }
+      } catch (err) {
+        // Re-throw fatal errors (4xx) instead of retrying them.
+        if (err && err.fatal) { clearTimeout(timer); throw err; }
+        // Network-level failure (fetch failed / ECONNRESET / TLS / per-attempt timeout). Retry-eligible.
+        lastError = err;
+      } finally {
+        clearTimeout(timer);
      }
-      // Retry on 5xx and 429; fail fast on 4xx (auth/quota/bad request — retry won't help).
-      if (r.status >= 500 || r.status === 429) {
-        lastError = new Error(`Router LLM ${r.status}: ${await r.text()}`);
-      } else {
-        const fatal = new Error(`Router LLM ${r.status}: ${await r.text()}`);
-        fatal.fatal = true;
-        throw fatal;
+      if (attempt < maxRetries) {
+        await sleepImpl(retryBaseDelayMs * 2 ** attempt);
      }
-    } catch (err) {
-      // Re-throw fatal errors (4xx) instead of retrying them.
-      if (err && err.fatal) { clearTimeout(timer); throw err; }
-      // Network-level failure (fetch failed / ECONNRESET / TLS / per-attempt timeout). Retry-eligible.
-      lastError = err;
-    } finally {
-      clearTimeout(timer);
-    }
-    if (attempt < maxRetries) {
-      await sleepImpl(retryBaseDelayMs * 2 ** attempt);
    }
+    throw lastError;
+  } finally {
+    emitMetrics();
  }
-  throw lastError;
+}
+
/**
 * Pass 2 — categorize an LLM transport failure for the factor-analysis
 * error_type axis.
 *
 * Signals are checked from most to least specific:
 *   1. the `Router LLM <status>:` prefix that callAnthropicAPI puts on HTTP errors;
 *   2. socket-level errno keywords, read from the error and from `err.cause`
 *      (fetch reports network failures as a generic "fetch failed" whose
 *      underlying code sits on the cause);
 *   3. a loose three-digit status scan of the message, for other message shapes;
 *   4. abort / timeout markers.
 *
 * 429 is grouped with 'http_5xx' because it is handled as a retry-eligible
 * status, the same as 5xx.
 *
 * @param {unknown} err - the thrown value (normally an Error).
 * @returns {'http_4xx'|'http_5xx'|'econnreset'|'timeout'|'other'} category.
 */
export function classifyLLMError(err) {
  if (!err) return 'other';
  const msg = String(err.message || err);

  // 1. Unambiguous status taken from the known message prefix.
  const prefixed = /\bRouter LLM (\d{3})\b/.exec(msg);
  if (prefixed) {
    const status = Number(prefixed[1]);
    if (status >= 500 || status === 429) return 'http_5xx';
    if (status >= 400) return 'http_4xx';
  }

  // 2. Errno keywords. These run before the loose status scan so a port or
  //    byte count such as "127.0.0.1:502" is not mistaken for an HTTP status.
  const cause = err.cause;
  const detail = [msg, err.code, cause?.message, cause?.code]
    .filter((part) => typeof part === 'string')
    .join(' ');
  if (/ECONNRESET|ECONNREFUSED|ENOTFOUND|EAI_AGAIN|socket hang up/i.test(detail)) return 'econnreset';

  // 3. Loose status scan for messages without the known prefix.
  if (err.fatal && /\b4\d\d\b/.test(msg)) return 'http_4xx';
  if (/\b5\d\d\b/.test(msg) || /\b429\b/.test(msg)) return 'http_5xx';

  // 4. Abort / timeout.
  if (err.name === 'AbortError' || cause?.name === 'AbortError' || /\btimeout\b/i.test(detail)) return 'timeout';
  return 'other';
}

 function hashPrompt(s) {
@@ -507,7 +533,9 @@ export async function classify(prompt, registry, options = {}) {
  }

  // Layer 2 — Sonnet 4.6 with prompt caching (ephemeral 5m TTL on system block).
-  const llmCall = options.llmCall || (async () => {
+  // llmCall receives { onMetrics } so callAnthropicAPI can report latency / retries
+  // (Pass 2 factor-analysis extension); tests pass synthetic metrics directly.
+  const llmCall = options.llmCall || (async ({ onMetrics } = {}) => {
    const apiKey = process.env.ROUTER_LLM_KEY;
    if (!apiKey) return null;
    const structured = buildClassifierPromptStructured(prompt, registry, {
@@ -518,26 +546,48 @@ export async function classify(prompt, registry, options = {}) {
      baseUrl: process.env.ROUTER_LLM_BASE_URL || undefined,
      model: options.model || CLASSIFIER_MODEL,
      onUsage: options.onUsage,
+      onMetrics,
    });
    return parseClassifierResponse(text);
  });

+  let metrics = null;
+  const captureMetrics = (m) => { metrics = m; };
  let llmResult;
  try {
-    llmResult = await llmCall();
+    llmResult = await llmCall({ onMetrics: captureMetrics });
  } catch (err) {
    // Layer 3 — regex fallback on LLM transport error.
    const r = classifyByRegex(prompt, registry);
-    return { ...r, llmError: err.message, degraded: true };
+    return {
+      ...r,
+      llmError: err.message,
+      llm_error_type: classifyLLMError(err),
+      latency_ms: metrics?.latency_ms ?? null,
+      retry_count_internal: metrics?.retry_count_internal ?? null,
+      degraded: true,
+    };
  }

  if (!llmResult) {
-    // Layer 3 — regex fallback on no key / unparseable.
+    // Layer 3 — regex fallback on no key (metrics null) / unparseable response
+    // (metrics set, classify as parse_null so the analyzer error_type axis
+    // distinguishes "API never called" from "API returned garbage").
    const r = classifyByRegex(prompt, registry);
-    return r;
+    return {
+      ...r,
+      llm_error_type: metrics ? 'parse_null' : 'no_key',
+      latency_ms: metrics?.latency_ms ?? null,
+      retry_count_internal: metrics?.retry_count_internal ?? null,
+    };
  }

-  const finalResult = { ...llmResult, source: 'llm' };
+  const finalResult = {
+    ...llmResult,
+    source: 'llm',
+    latency_ms: metrics?.latency_ms ?? null,
+    retry_count_internal: metrics?.retry_count_internal ?? null,
+  };
  if (cache) cache.set(key, finalResult);
  return finalResult;
 }
@@ -341,3 +341,106 @@ describe('classify — isolation from Claude Code auth', () => {
    }
  });
 });
+
+describe('callAnthropicAPI — Pass 2 metrics (project-brain-factor-analysis-4passes)', () => { // Drives callAnthropicAPI with stub fetchImpl functions and inspects what onMetrics receives.
+  it('emits onMetrics({latency_ms, retry_count_internal}) on success', async () => {
+    const fetchImpl = async () => ({ ok: true, json: async () => ({ content: [{ text: '{"task_type":"question"}' }] }) }); // always succeeds on the first call
+    let captured = null;
+    await callAnthropicAPI('hi', { apiKey: 'k', fetchImpl, onMetrics: (m) => { captured = m; } });
+    expect(captured).not.toBeNull();
+    expect(typeof captured.latency_ms).toBe('number');
+    expect(captured.latency_ms).toBeGreaterThanOrEqual(0);
+    expect(captured.retry_count_internal).toBe(0); // first-try success reports attempt index 0
+  });
+
+  it('emits onMetrics with retry_count_internal>0 after 5xx retries', async () => {
+    let calls = 0;
+    const fetchImpl = async () => {
+      calls += 1;
+      if (calls < 3) return { ok: false, status: 503, text: async () => 'unavailable' }; // calls 1 and 2 fail with a retry-eligible status
+      return { ok: true, json: async () => ({ content: [{ text: '{"task_type":"question"}' }] }) }; // call 3 succeeds
+    };
+    let captured = null;
+    const sleepImpl = () => Promise.resolve(); // skip backoff in tests
+    await callAnthropicAPI('hi', { apiKey: 'k', fetchImpl, sleepImpl, onMetrics: (m) => { captured = m; } });
+    expect(captured.retry_count_internal).toBe(2); // success happened on attempt index 2
+  });
+
+  it('emits onMetrics even on fatal 4xx (so latency / retry count reach the classifier state)', async () => {
+    const fetchImpl = async () => ({ ok: false, status: 401, text: async () => 'invalid key' }); // 401 is not retried
+    let captured = null;
+    await expect(callAnthropicAPI('hi', { apiKey: 'k', fetchImpl, onMetrics: (m) => { captured = m; } })).rejects.toThrow(/401/);
+    expect(captured).not.toBeNull(); // metrics are emitted even though the call rejected
+    expect(typeof captured.latency_ms).toBe('number');
+    expect(captured.retry_count_internal).toBe(0);
+  });
+});
+
+describe('classify — Pass 2 metrics surface to result', () => { // Injects a fake llmCall via options so no network or API key is involved.
+  const fakeRegistry = { nodes: [{ id: '#19', status: 'active', triggers: [] }], chains: {} }; // single active node with no triggers
+
+  it('attaches latency_ms / retry_count_internal on LLM success', async () => {
+    const llmCall = async ({ onMetrics } = {}) => {
+      if (onMetrics) onMetrics({ latency_ms: 432, retry_count_internal: 1 }); // synthetic metrics, reported the way callAnthropicAPI would
+      return { task_type: 'feature', recommended_node: '#19', recommended_chain: null, recommended_chain_id: null, alternatives_considered: [] };
+    };
+    const r = await classify('новая фича: добавь endpoint X', fakeRegistry, { llmCall });
+    expect(r.source).toBe('llm');
+    expect(r.latency_ms).toBe(432);
+    expect(r.retry_count_internal).toBe(1);
+  });
+
+  it('passes through alternatives_considered from Sonnet (truncated to top-3 by enricher, not by classify)', async () => {
+    const llmCall = async () => ({ // ignores onMetrics; only the pass-through of alternatives is checked
+      task_type: 'feature', recommended_node: '#19', recommended_chain: null, recommended_chain_id: null,
+      alternatives_considered: [{ node: '#19', score: 0.8 }, { node: '#62', score: 0.4 }],
+    });
+    const r = await classify('новая фича X', fakeRegistry, { llmCall });
+    expect(r.alternatives_considered).toBeDefined();
+    expect(r.alternatives_considered).toHaveLength(2);
+  });
+
+  it('sets llm_error_type=econnreset / latency / retry_count on transport error', async () => {
+    const llmCall = async ({ onMetrics } = {}) => {
+      if (onMetrics) onMetrics({ latency_ms: 1234, retry_count_internal: 4 }); // metrics arrive before the throw
+      const e = new Error('fetch failed: ECONNRESET'); throw e;
+    };
+    const r = await classify('что-то непонятное вообще', fakeRegistry, { llmCall });
+    expect(r.source).toBe('regex'); // a thrown llmCall sends classify to the regex fallback
+    expect(r.llm_error_type).toBe('econnreset');
+    expect(r.latency_ms).toBe(1234);
+    expect(r.retry_count_internal).toBe(4);
+  });
+
+  it('sets llm_error_type=timeout on AbortError or per-attempt timeout', async () => {
+    const llmCall = async () => {
+      const e = new Error('per-attempt timeout 30000ms'); throw e; // same wording as the abort reason built in callAnthropicAPI
+    };
+    const r = await classify('что-то непонятное вообще', fakeRegistry, { llmCall });
+    expect(r.llm_error_type).toBe('timeout');
+  });
+
+  it('sets llm_error_type=http_4xx on fatal upstream 4xx', async () => {
+    const llmCall = async () => { const e = new Error('Router LLM 401: invalid key'); e.fatal = true; throw e; }; // fatal flag plus a 4xx number in the message
+    const r = await classify('что-то непонятное вообще', fakeRegistry, { llmCall });
+    expect(r.llm_error_type).toBe('http_4xx');
+  });
+
+  it('sets llm_error_type=http_5xx on exhausted retries', async () => {
+    const llmCall = async () => { const e = new Error('Router LLM 503: bad gateway'); throw e; }; // no fatal flag, 5xx number in the message
+    const r = await classify('что-то непонятное вообще', fakeRegistry, { llmCall });
+    expect(r.llm_error_type).toBe('http_5xx');
+  });
+
+  it('sets llm_error_type=parse_null when llmCall returns null (LLM produced unparseable response)', async () => {
+    // Mocked llmCall returns null without throwing — simulates upstream parse failure
+    // after a successful HTTP exchange. onMetrics still fires from the mocked path.
+    const llmCall = async ({ onMetrics } = {}) => {
+      if (onMetrics) onMetrics({ latency_ms: 800, retry_count_internal: 0 });
+      return null;
+    };
+    const r = await classify('что-то непонятное вообще', fakeRegistry, { llmCall });
+    expect(r.llm_error_type).toBe('parse_null'); // metrics were captured, so this is not treated as the no-key case
+    expect(r.latency_ms).toBe(800);
+  });
+});