397777089e
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
811 lines
31 KiB
JavaScript
811 lines
31 KiB
JavaScript
#!/usr/bin/env node
|
||
/**
|
||
* Brain-retro analyzer (brain governance, observer factor-analysis spec §6).
|
||
* Pure, deterministic Layer-4 aggregation over observer episodes for the
|
||
* /brain-retro skill. Read-only — never writes JSONL. No LLM.
|
||
*
|
||
* Security Guidance #40: pure parsing — no exec/execSync.
|
||
*/
|
||
import { Buffer } from 'buffer';
|
||
import { readFileSync, existsSync } from 'fs';
|
||
import { join as pathJoin } from 'path';
|
||
import { homedir } from 'os';
|
||
import { detectMissedActivations } from './missed-activations.mjs';
|
||
import {
|
||
disciplinePercentByClassification,
|
||
routerStepReached,
|
||
boundariesAppliedRate,
|
||
} from './discipline-metrics.mjs';
|
||
import { loadRegistry } from './registry-load.mjs';
|
||
import { buildClassificationMap, buildDormancyMap } from './registry-to-classification-map.mjs';
|
||
import {
|
||
buildIndex as buildEmbeddingIndex,
|
||
findNearestNeighbors,
|
||
majorityOutcome,
|
||
} from './observer-embedding-index.mjs';
|
||
|
||
/**
 * Outcome labels recognised for `chain-recommendation` ledger entries.
 * The order of this list is the order in which buckets are created and in
 * which the Cut 11 table rows are rendered.
 */
export const CHAIN_OUTCOME_BUCKETS = [
  'blocked',
  'passed-with-skill',
  'passed-inline-override',
  'passed-global-override',
  'passed-short-chain',
  'passed-no-mutating',
];
|
||
|
||
/**
 * Tally `chain-recommendation` entries of the hook-outcomes JSONL ledger per
 * outcome bucket. Read-only.
 *
 * @param {object} [args]
 * @param {string} [args.ledgerPath] - Ledger file; defaults to
 *   `~/.claude/runtime/hook-outcomes.jsonl`.
 * @param {string} [args.periodStart] - Inclusive lower bound, parsed with `Date.parse`.
 * @param {string} [args.periodEnd] - Inclusive upper bound, parsed with `Date.parse`.
 * @returns {{total: number, buckets: Record<string, number>}} Zeroed buckets and
 *   `total: 0` when the ledger cannot be read.
 */
export function analyzeChainHookEffectiveness({ ledgerPath, periodStart, periodEnd } = {}) {
  const source = ledgerPath || pathJoin(homedir(), '.claude', 'runtime', 'hook-outcomes.jsonl');
  const buckets = {};
  for (const name of CHAIN_OUTCOME_BUCKETS) buckets[name] = 0;

  let contents;
  try {
    contents = readFileSync(source, 'utf-8');
  } catch {
    // Missing or unreadable ledger is treated as "no data".
    return { total: 0, buckets };
  }

  const lowerBound = periodStart ? Date.parse(periodStart) : -Infinity;
  const upperBound = periodEnd ? Date.parse(periodEnd) : Infinity;
  let total = 0;

  for (const rawLine of contents.split('\n')) {
    if (rawLine.trim() === '') continue;

    let record;
    try {
      record = JSON.parse(rawLine);
    } catch {
      continue; // malformed line — skip
    }
    if (!record || record.rule !== 'chain-recommendation') continue;

    const when = Date.parse(record.ts || '');
    if (Number.isNaN(when)) continue;
    // An unparseable bound is NaN; comparisons against NaN are false, so such
    // a bound does not exclude anything.
    if (when < lowerBound || when > upperBound) continue;

    if (!CHAIN_OUTCOME_BUCKETS.includes(record.outcome)) continue;
    buckets[record.outcome] += 1;
    total += 1;
  }

  return { total, buckets };
}
|
||
|
||
/**
 * Render the Cut 11 markdown table from a `{ total, buckets }` tally.
 *
 * @param {{total: number, buckets: Record<string, number>}} tally
 * @returns {string} A placeholder line when `total` is falsy; otherwise a
 *   markdown table with one row per known bucket plus a TOTAL row, ending in
 *   a newline.
 */
export function buildChainHookEffectiveness({ total, buckets }) {
  if (!total) return '_(нет данных за период — хук не срабатывал или ledger пуст)_\n';

  const header = [
    '### Cut 11: Chain-hook effectiveness',
    '',
    '| Outcome | Count | % |',
    '|---|---:|---:|',
  ];
  const rows = CHAIN_OUTCOME_BUCKETS.map((name) => {
    const count = (buckets && buckets[name]) || 0;
    const share = Math.round((count / total) * 100);
    return `| ${name} | ${count} | ${share}% |`;
  });
  const footer = `| **TOTAL** | **${total}** | **100%** |`;

  return `${[...header, ...rows, footer].join('\n')}\n`;
}
|
||
|
||
// Tool-call thresholds for the task_size axis: fewer than SIZE_SMALL calls is
// 'small', more than SIZE_LARGE is 'large', anything in between is 'medium'.
const SIZE_SMALL = 20;
const SIZE_LARGE = 60;
|
||
|
||
/**
 * Collapse the routing-gate double-write: a turn that was blocked and then
 * re-stopped is recorded twice with the same task_id + started_at. Only the
 * last record per key survives (it is the most complete one), placed where the
 * key first appeared. Records flagged `observer_error` are never collapsed and
 * are appended after the normal episodes. Falsy entries are dropped.
 *
 * @param {Array<object>} episodes
 * @returns {Array<object>} Deduplicated normal episodes followed by error markers.
 */
export function dedupeEpisodes(episodes) {
  const latestByKey = new Map();
  const errorMarkers = [];

  episodes.forEach((episode) => {
    if (!episode) return;
    if (episode.observer_error) {
      errorMarkers.push(episode);
      return;
    }
    const startedAt = (episode.timestamps || {}).started_at;
    latestByKey.set(`${episode.task_id}|${startedAt}`, episode);
  });

  return [...latestByKey.values(), ...errorMarkers];
}
|
||
|
||
/**
 * Infer the outcome of an episode from its own events plus the prompt signal
 * of the episode that followed it.
 *
 * @param {object} episode - Episode under inspection; `events` is optional.
 * @param {object} [nextEpisode] - Next episode of the same session, if any.
 * @returns {'partial'|'blocked'|'rework'|'success'|'soft_success'|'unknown'}
 */
export function inferOutcome(episode, nextEpisode) {
  const events = episode && Array.isArray(episode.events) ? episode.events : [];
  // Null-safe on individual entries (same tolerance as countEventKind): a
  // stray null in a parsed JSONL events array must not crash the analysis.
  const hasEvent = (kind) => events.some((ev) => ev?.kind === kind);

  if (hasEvent('interrupt')) return 'partial';

  // A turn is `blocked` only when it ENDED on an unrecovered tool failure —
  // emitted by the parser as a single `unrecovered_error` event when the
  // LAST tool_result of the turn was is_error=true. Raw error/retry counts
  // do NOT imply blocked: a TDD red→green cycle or a grep that returns
  // nothing both surface as `error` events but are intentional and
  // recovered — counting them as blocked over-reports failures (A-1 fix).
  if (hasEvent('unrecovered_error')) return 'blocked';

  // 'failure' (work wrong AND never corrected) is a judgment, not
  // deterministically recoverable from a transcript — deferred to the phase-2
  // agent-judge. Until then a wrong-then-corrected turn surfaces as 'rework'.
  if (!nextEpisode) return 'unknown';

  switch (nextEpisode.prompt_signal) {
    case 'correction':
      return 'rework';
    case 'approval':
    case 'new_task':
      return 'success';
    case 'neutral':
      // Task 16: a neutral next prompt is a silent success. If the operator
      // moved on to the next instruction without correction markers, that is
      // "no objection" — a slightly weaker signal than explicit approval,
      // hence the separate `soft_success` label.
      return 'soft_success';
    default:
      return 'unknown';
  }
}
|
||
|
||
/**
 * Group non-error episodes by session (`task_id`, or 'unknown' when absent)
 * and order each group by the string form of `timestamps.started_at`.
 *
 * @param {Array<object>} episodes
 * @returns {Map<string, Array<object>>} Session id → episodes in start order.
 */
function bySessionSorted(episodes) {
  const startKey = (episode) => String((episode.timestamps || {}).started_at);
  const sessions = new Map();

  for (const episode of episodes) {
    if (episode.observer_error) continue;
    const sessionId = episode.task_id || 'unknown';
    const bucket = sessions.get(sessionId);
    if (bucket) {
      bucket.push(episode);
    } else {
      sessions.set(sessionId, [episode]);
    }
  }

  sessions.forEach((list) => {
    list.sort((left, right) => startKey(left).localeCompare(startKey(right)));
  });
  return sessions;
}
|
||
|
||
/**
 * Group episodes into tasks, session by session. Within a session a new task
 * begins at the first episode, after an episode whose inferred outcome is
 * 'success', or on an episode whose own prompt signal is 'new_task'.
 *
 * @param {Array<object>} episodes
 * @returns {Array<{task_ref: string, episodes: Array<object>}>} `task_ref` is
 *   `<session id>#<1-based position in the returned list>`.
 */
export function groupEpisodesToTasks(episodes) {
  const tasks = [];

  bySessionSorted(episodes).forEach((sessionEpisodes, sessionId) => {
    let openTask = null;
    for (let idx = 0; idx < sessionEpisodes.length; idx++) {
      const episode = sessionEpisodes[idx];
      const startsNewTask =
        idx === 0 ||
        inferOutcome(sessionEpisodes[idx - 1], episode) === 'success' ||
        episode.prompt_signal === 'new_task';

      if (startsNewTask) {
        openTask = { task_ref: `${sessionId}#${tasks.length + 1}`, episodes: [] };
        tasks.push(openTask);
      }
      openTask.episodes.push(episode);
    }
  });

  return tasks;
}
|
||
|
||
// Hot/normative files — touched by almost every turn (memory store, CLAUDE.md,
// STATUS.md, episodes JSONL). Sharing one of these is not evidence of a causal
// chain; it just means both turns brushed the same hot file. Excluded from the
// shared-file signal (A-3 fix).
const HOT_FILE_PATTERNS = [
  /(?:^|[\\/])CLAUDE\.md$/i,
  /(?:^|[\\/])MEMORY\.md$/i,
  /(?:^|[\\/])STATUS\.md$/i,
  /[\\/]episodes-\d{4}-\d{2}\.jsonl$/i,
  /[\\/]memory[\\/][^\\/]+\.md$/i,
];

/**
 * Tell whether a path matches one of the hot-file patterns.
 *
 * @param {*} path - Path to test; falsy values are treated as the empty string.
 * @returns {boolean}
 */
export function isHotFile(path) {
  const candidate = String(path || '');
  for (const pattern of HOT_FILE_PATTERNS) {
    if (pattern.test(candidate)) return true;
  }
  return false;
}
|
||
|
||
/**
 * Causal-chain candidates: for every episode that recorded an `error` event,
 * find the first later episode (by start time) that touched at least one of
 * the same non-hot files.
 *
 * @param {Array<object>} episodes
 * @returns {Array<{from: string, to: string, sharedFiles: string[]}>} `from`
 *   and `to` are `task_id|started_at` references.
 */
export function findCausalChains(episodes) {
  const startKey = (ep) => String((ep.timestamps || {}).started_at);
  const refOf = (ep) => `${ep.task_id}|${(ep.timestamps || {}).started_at}`;
  const coldFiles = (ep) => (((ep.task_size || {}).files) || []).filter((file) => !isHotFile(file));

  // filter() already yields a fresh array, so sorting it leaves the input untouched.
  const timeline = episodes
    .filter((ep) => !ep.observer_error)
    .sort((left, right) => startKey(left).localeCompare(startKey(right)));

  const chains = [];
  for (let from = 0; from + 1 < timeline.length; from++) {
    const origin = timeline[from];
    const errored = Array.isArray(origin.events) && origin.events.some((ev) => ev.kind === 'error');
    if (!errored) continue;

    const originFiles = new Set(coldFiles(origin));
    if (originFiles.size === 0) continue;

    for (let to = from + 1; to < timeline.length; to++) {
      const later = timeline[to];
      const sharedFiles = coldFiles(later).filter((file) => originFiles.has(file));
      if (sharedFiles.length === 0) continue;
      chains.push({ from: refOf(origin), to: refOf(later), sharedFiles });
      break; // only the first follow-up per errored episode
    }
  }
  return chains;
}
|
||
|
||
/**
 * Bucket a tool-call count for the task_size axis.
 *
 * @param {*} toolCalls - Count; non-numeric or missing values count as 0.
 * @returns {'small'|'medium'|'large'}
 */
function sizeBucket(toolCalls) {
  const calls = Number(toolCalls) || 0;
  if (calls < SIZE_SMALL) return 'small';
  if (calls > SIZE_LARGE) return 'large';
  return 'medium';
}
|
||
|
||
// Session-turn thresholds: turns below SESSION_TURN_EARLY are 'early', turns
// above SESSION_TURN_LATE are 'late', the rest are 'mid'.
const SESSION_TURN_EARLY = 10;
const SESSION_TURN_LATE = 40;

/**
 * Bucket the position of a turn within its session.
 *
 * @param {*} turn - Turn number; may be missing.
 * @returns {'early'|'mid'|'late'|'null'} 'null' when the turn is absent or not
 *   a finite number.
 */
function sessionTurnBucket(turn) {
  // Reject null / undefined / empty BEFORE numeric coercion: Number(null) and
  // Number('') are both 0, which would falsely bucket a missing turn as 'early'.
  if (turn == null || turn === '') return 'null';
  const n = Number(turn);
  if (!Number.isFinite(n)) return 'null';
  if (n < SESSION_TURN_EARLY) return 'early';
  return n <= SESSION_TURN_LATE ? 'mid' : 'late';
}
|
||
|
||
// Pass 1 cheap-axis helpers (project-brain-factor-analysis-4passes).

/**
 * Count the events of a given kind; falsy entries are ignored.
 *
 * @param {*} events - Event list; anything that is not an array counts as empty.
 * @param {string} kind - Value compared against each event's `kind`.
 * @returns {number}
 */
function countEventKind(events, kind) {
  if (!Array.isArray(events)) return 0;
  return events.filter((ev) => ev && ev.kind === kind).length;
}
|
||
|
||
/**
 * Bucket the number of `retry` events of an episode.
 *
 * @param {*} events - Episode event list.
 * @returns {'0'|'1-2'|'3+'}
 */
function retryBucket(events) {
  const retries = countEventKind(events, 'retry');
  if (retries === 0) return '0';
  return retries <= 2 ? '1-2' : '3+';
}
|
||
|
||
/**
 * Bucket the number of `error` events of an episode.
 *
 * @param {*} events - Episode event list.
 * @returns {'0'|'1'|'2+'}
 */
function errorBucket(events) {
  const errors = countEventKind(events, 'error');
  if (errors === 0) return '0';
  return errors === 1 ? '1' : '2+';
}
|
||
|
||
/**
 * Bucket an iteration count.
 *
 * @param {*} iterations - Count; non-finite or non-positive values map to '0'.
 * @returns {'0'|'1-3'|'4-10'|'11+'}
 */
function iterationsBucket(iterations) {
  const count = Number(iterations);
  if (!(Number.isFinite(count) && count > 0)) return '0';
  if (count > 10) return '11+';
  return count > 3 ? '4-10' : '1-3';
}
|
||
|
||
// Pass 2 — classifier latency bucket. <500ms = fast (cache hit territory),
// 500-2000 = medium (cold call), 2000-10000 = slow (network jitter / overflow),
// >=10000 = very_slow (retries fired). Null on non-LLM paths.
/**
 * Bucket a classifier latency in milliseconds.
 *
 * @param {*} latency - Latency; may be null/undefined when no LLM call was made.
 * @returns {'fast'|'medium'|'slow'|'very_slow'|'null'} 'null' when the latency
 *   is absent, negative, or not a finite number.
 */
function latencyBucket(latency) {
  // Reject null / undefined / empty BEFORE numeric coercion: Number(null) and
  // Number('') are 0, which would report every non-LLM episode as 'fast'.
  if (latency == null || latency === '') return 'null';
  const n = Number(latency);
  if (!Number.isFinite(n) || n < 0) return 'null';
  if (n < 500) return 'fast';
  if (n < 2000) return 'medium';
  if (n < 10000) return 'slow';
  return 'very_slow';
}
|
||
|
||
// Pass 3 helpers (project-brain-factor-analysis-4passes).

/**
 * Bucket a prompt length given in characters.
 *
 * @param {*} n - Length; non-finite or non-positive values map to 'null'.
 * @returns {'short'|'medium'|'long'|'huge'|'null'}
 */
function promptLengthBucket(n) {
  const chars = Number(n);
  if (!(Number.isFinite(chars) && chars > 0)) return 'null';
  if (chars >= 2500) return 'huge';
  if (chars >= 1000) return 'long';
  return chars >= 100 ? 'medium' : 'short';
}
|
||
|
||
/**
 * Bucket a timestamp into a six-hour slice of the UTC day.
 *
 * @param {*} iso - Timestamp accepted by the `Date` constructor.
 * @returns {'night'|'morning'|'afternoon'|'evening'|'null'} 'null' when the
 *   timestamp is missing or unparseable.
 */
function timeOfDayBucket(iso) {
  // Missing values are rejected up front: `new Date(null)` is the epoch
  // (1970-01-01), not an invalid date, and would otherwise land in 'night'.
  if (iso == null || iso === '') return 'null';
  const hour = new Date(iso).getUTCHours(); // NaN for an invalid date
  if (Number.isNaN(hour)) return 'null';
  const slices = ['night', 'morning', 'afternoon', 'evening'];
  return slices[Math.floor(hour / 6)];
}
|
||
|
||
// Indexed by Date#getUTCDay(), where 0 is Sunday.
const WEEKDAY_NAMES = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];

/**
 * Short English weekday name (UTC) of a timestamp.
 *
 * @param {*} iso - Timestamp accepted by the `Date` constructor.
 * @returns {string} Weekday abbreviation, or 'null' when the timestamp is
 *   missing or unparseable.
 */
function dayOfWeekLabel(iso) {
  if (iso == null || iso === '') return 'null';
  const weekday = new Date(iso).getUTCDay(); // NaN for an invalid date
  return Number.isNaN(weekday) ? 'null' : WEEKDAY_NAMES[weekday];
}
|
||
|
||
/**
 * Bucket the gap between two prompts, given in minutes.
 *
 * @param {*} min - Gap in minutes; non-finite or negative values map to 'null'.
 * @returns {'<1m'|'1-10m'|'10-60m'|'60m+'|'null'}
 */
function interPromptGapBucket(min) {
  const minutes = Number(min);
  if (!Number.isFinite(minutes) || minutes < 0) return 'null';

  // Exclusive upper limits, checked in ascending order.
  const steps = [
    [1, '<1m'],
    [10, '1-10m'],
    [60, '10-60m'],
  ];
  for (const [limit, label] of steps) {
    if (minutes < limit) return label;
  }
  return '60m+';
}
|
||
|
||
/**
 * Pick the dominant file type of a `{ type: count }` distribution.
 *
 * @param {*} dist - Distribution object; counts may be numbers or numeric strings.
 * @returns {string} The type with the highest positive count, 'mixed' when the
 *   top count is shared, or 'none' when there is no positive count.
 */
function fileTypeMain(dist) {
  if (!dist || typeof dist !== 'object') return 'none';

  // Coerce every count exactly once so that the filter, the maximum and the
  // tie check all compare numbers. Comparing raw values would order numeric
  // strings lexicographically ('3' > '10') and miss ties such as '2' vs 2.
  const counted = Object.entries(dist)
    .map(([type, count]) => [type, Number(count)])
    .filter(([, count]) => count > 0);
  if (counted.length === 0) return 'none';

  let top = 0;
  for (const [, count] of counted) {
    if (count > top) top = count;
  }
  const winners = counted.filter(([, count]) => count === top);
  return winners.length > 1 ? 'mixed' : winners[0][0];
}
|
||
|
||
/**
 * Read the count of one tool from the first `tool_summary` event that carries
 * a `counts` object. Later summaries are not consulted.
 *
 * @param {*} events - Episode event list.
 * @param {string} toolName - Key looked up in `counts`.
 * @returns {number} The count, or 0 when there is no summary or no usable value.
 */
function eventToolCount(events, toolName) {
  if (!Array.isArray(events)) return 0;
  const summary = events.find((ev) => ev && ev.kind === 'tool_summary' && ev.counts);
  if (!summary) return 0;
  return Number(summary.counts[toolName]) || 0;
}
|
||
|
||
/**
 * Bucket a count as exactly zero, exactly one, or anything else.
 *
 * @param {*} n - Count; non-numeric or missing values count as 0.
 * @returns {'0'|'1'|'2+'}
 */
function countBucket012(n) {
  const count = Number(n) || 0;
  if (count === 0) return '0';
  return count === 1 ? '1' : '2+';
}
|
||
|
||
/**
 * Bucket the number of judge calls.
 *
 * @param {*} n - Count; non-numeric or missing values count as 0.
 * @returns {'0'|'1-9'|'10+'}
 */
function judgeCallsBucket(n) {
  const calls = Number(n) || 0;
  if (calls === 0) return '0';
  return calls < 10 ? '1-9' : '10+';
}
|
||
|
||
/**
 * Single-value factor extractors: factor name → function mapping an episode
 * to the row label used for that factor in the factor matrix. Every extractor
 * returns a string and tolerates missing nested objects.
 */
const FACTOR_FNS = {
  decision_provenance: (e) => e.decision_provenance?.kind || 'unknown',
  economy_level: (e) => String(e.environment?.economy_level ?? 'null'),
  model: (e) => e.environment?.model || 'null',
  post_compaction: (e) => String(e.environment?.post_compaction ?? false),
  session_segment_turn: (e) => sessionTurnBucket(e.environment?.session_turn),
  parallel_session: (e) => String(e.environment?.parallel_session ?? false),
  task_size: (e) => sizeBucket(e.task_size?.tool_calls),
  node_chosen: (e) => e.primary_rationale?.node_chosen || 'direct',
  task_classification: (e) => e.primary_rationale?.task_classification || 'other',
  recommended_node_for_direct: (e) => e.primary_rationale?.recommended_node || 'none',

  // Pass 1 — 8 cheap axes (data already in v4 episode, just expose):
  prompt_signal: (e) => e.prompt_signal || 'null',
  classifier_source: (e) => e.classifier_output?.source || 'null',
  degraded_mode: (e) => String(e.degraded_mode ?? false),
  path_type: (e) => e.path_type || 'null',
  retry_count: (e) => retryBucket(e.events),
  error_count: (e) => errorBucket(e.events),
  hard_floor_invoked: (e) => String(e.primary_rationale?.hard_floor?.invoked ?? false),
  iterations_bucket: (e) => iterationsBucket(e.task_cost?.iterations),

  // Pass 2 — classifier-metric axes (project-brain-factor-analysis-4passes):
  latency_bucket: (e) => latencyBucket(e.classifier_output?.latency_ms),
  error_type: (e) => e.classifier_output?.llm_error || 'null',

  // Pass 3 — dynamics axes (project-brain-factor-analysis-4passes):
  prompt_length_bucket: (e) => promptLengthBucket(e.task_meta?.prompt_length_chars),
  time_of_day_bucket: (e) => timeOfDayBucket(e.timestamps?.started_at),
  day_of_week: (e) => dayOfWeekLabel(e.timestamps?.started_at),
  inter_prompt_gap_bucket: (e) => interPromptGapBucket(e._interPromptGapMin),
  mcp_server_used: (e) => ((e.task_meta?.mcp_servers_used || []).length > 0 ? 'any' : 'none'),
  file_type_main: (e) => fileTypeMain(e.task_meta?.file_type_distribution),
  skill_invocations_bucket: (e) => countBucket012(eventToolCount(e.events, 'Skill')),
  subagent_spawns_bucket: (e) => {
    const spawns = eventToolCount(e.events, 'Agent') + eventToolCount(e.events, 'Task');
    return countBucket012(spawns);
  },

  // Pass 4 — semantic NN axis (project-brain-factor-analysis-4passes).
  // Reads the pre-computed family label stamped on the episode by analyze()
  // (cross-episode pass via observer-embedding-index). Episodes without an
  // embedding or with no resolved neighbours bucket as 'no_neighbors'.
  similar_past_outcome_majority: (e) => e._similarPastOutcomeMajority || 'no_neighbors',

  // Pass 5 — router-gate v4 signal axes (brain-data-catalog section F → factors).
  rationalization_flag_count: (e) => countBucket012(e.v4_signals?.rationalization_flag_count),
  judge_verdict: (e) => e.v4_signals?.judge_verdict || 'null',
  safe_baseline_action: (e) => e.v4_signals?.safe_baseline_action || 'null',
  judge_calls_bucket: (e) => judgeCallsBucket(e.v4_signals?.judge_calls),
};
|
||
|
||
// Pass 4 — decode prompt_embedding_base64 to Float32Array. Mirrors
// observer-embedding-index safeDecode but kept private here to avoid
// circular surface; analyzer only needs the target-embedding decode path.
/**
 * Decode a base64 string into a vector of 32-bit floats.
 *
 * @param {*} b64 - Base64 text of the raw float bytes.
 * @returns {Float32Array|null} The decoded vector, or null when the input is
 *   not a non-empty string, the byte length is zero or not a multiple of 4, or
 *   any component is not finite.
 */
function decodeTargetEmbedding(b64) {
  if (!b64 || typeof b64 !== 'string') return null;
  try {
    const bytes = Buffer.from(b64, 'base64');
    if (bytes.byteLength === 0 || bytes.byteLength % 4 !== 0) return null;

    // Copy the bytes into a dedicated ArrayBuffer instead of viewing
    // `bytes.buffer` in place. Small Buffers may be slices of a shared pool,
    // and a Float32Array view requires a 4-byte-aligned offset; an unaligned
    // slice would throw RangeError and be swallowed below, silently turning a
    // valid embedding into null.
    const copy = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength);
    const vector = new Float32Array(copy);

    for (let i = 0; i < vector.length; i++) {
      if (!Number.isFinite(vector[i])) return null;
    }
    return vector;
  } catch {
    return null;
  }
}
|
||
|
||
/**
 * Factor matrix: rows = factor values, columns = outcome distribution (spec §6).
 *
 * @param {Array<object>} episodesWithOutcome - Episodes carrying
 *   `_inferredOutcome`; a missing outcome is counted as 'unknown'.
 * @returns {Record<string, Record<string, Record<string, number>>>}
 *   factor → value → outcome → count, for every FACTOR_FNS entry plus `chain_ref`.
 */
export function buildFactorMatrix(episodesWithOutcome) {
  // Increment table[value][outcome], creating the row on first use.
  const tally = (table, value, outcome) => {
    const row = table[value] || {};
    table[value] = row;
    row[outcome] = (row[outcome] || 0) + 1;
  };

  const matrix = {};
  for (const [factor, pick] of Object.entries(FACTOR_FNS)) {
    const table = {};
    matrix[factor] = table;
    for (const episode of episodesWithOutcome) {
      tally(table, pick(episode), episode._inferredOutcome || 'unknown');
    }
  }

  // chain_ref is multi-value: a multi-chain episode counts once per chain;
  // null/absent/empty → key "null". That is why it lives outside FACTOR_FNS,
  // whose extractors yield a single value each.
  const chainTable = {};
  matrix.chain_ref = chainTable;
  for (const episode of episodesWithOutcome) {
    const refs = (episode.primary_rationale || {}).chain_ref;
    const outcome = episode._inferredOutcome || 'unknown';
    const keys = Array.isArray(refs) && refs.length ? refs : ['null'];
    keys.forEach((key) => tally(chainTable, key, outcome));
  }

  return matrix;
}
|
||
|
||
|
||
// ────────────────────────────────────────────────────────────────
// New cut helpers — normalize recommended id to '#N' form for canon
// comparison regardless of whether the source stored 19 or '#19'.
// ────────────────────────────────────────────────────────────────
/**
 * Bring a node id to its '#'-prefixed string form.
 *
 * @param {*} id - Node id such as 19, '19' or '#19'.
 * @returns {string|null} Trimmed id with a leading '#', or null for null/undefined.
 */
function normalizeNodeId(id) {
  if (id == null) return null;
  const text = String(id).trim();
  return text[0] === '#' ? text : `#${text}`;
}
|
||
|
||
/**
 * Tell whether an episode carries a router recommendation: a recommended node
 * or a non-empty recommended chain, read from `primary_rationale` first and
 * `classifier_output` second.
 *
 * @param {object} ep - Episode.
 * @returns {boolean}
 */
function hasRecommendation(ep) {
  const rationale = ep.primary_rationale || {};
  const classifier = ep.classifier_output || {};
  if (rationale.recommended_node || classifier.recommended_node) return true;
  const chain = rationale.recommended_chain || classifier.recommended_chain;
  return Array.isArray(chain) && chain.length > 0;
}
|
||
|
||
/**
 * Recommended node of an episode: the `primary_rationale` value when truthy,
 * otherwise the `classifier_output` value.
 *
 * @param {object} ep - Episode.
 * @returns {*} The recommended node, or null when neither source has one.
 */
function getRecommendedNode(ep) {
  const fromRationale = ep.primary_rationale?.recommended_node;
  if (fromRationale) return fromRationale;
  return ep.classifier_output?.recommended_node || null;
}
|
||
|
||
/**
 * Recommended chain of an episode: the `primary_rationale` value when truthy,
 * otherwise the `classifier_output` value.
 *
 * @param {object} ep - Episode.
 * @returns {Array} The chain, or an empty array when the chosen value is not an array.
 */
function getRecommendedChain(ep) {
  const candidate = ep.primary_rationale?.recommended_chain || ep.classifier_output?.recommended_chain;
  if (!Array.isArray(candidate)) return [];
  return candidate;
}
|
||
|
||
|
||
/**
 * Cut 8 — Class × canon coverage.
 * Returns one row per task_classification appearing in the episodes, sorted by count desc.
 * classificationMap shape: { [classification]: string[] } — canonical node IDs (e.g. '#34').
 *
 * Row fields: `count` (episodes of the class), `canonicalNodes` (copy of the
 * map entry, or []), `routerRecommended` (episodes with a recommended node or
 * chain), `recWithinCanon` (of those, at least one recommended id is
 * canonical), `claudeTook` (node_chosen set and not 'direct'), `rework`
 * (outcome_reviewed === 'rework').
 *
 * @param {Array<object>} episodes
 * @param {Record<string, string[]>} [classificationMap]
 * @returns {Array<object>} Rows sorted by `count`, descending.
 */
export function buildClassCanonCoverage(episodes, classificationMap) {
  const map = classificationMap || {};
  const rows = new Map();
  // Normalized canonical ids per class, built once per row rather than once
  // per episode.
  const canonSets = new Map();

  for (const ep of episodes) {
    const rationale = ep.primary_rationale || {};
    const classification = rationale.task_classification || 'other';

    let row = rows.get(classification);
    if (!row) {
      // Own-property lookup only: a classification named like an inherited
      // member ('constructor', 'toString', …) must read as "no canon" instead
      // of spreading a built-in function and throwing.
      const canon = Object.prototype.hasOwnProperty.call(map, classification)
        ? map[classification]
        : null;
      row = {
        classification,
        count: 0,
        canonicalNodes: canon ? [...canon] : [],
        routerRecommended: 0,
        claudeTook: 0,
        recWithinCanon: 0,
        rework: 0,
      };
      rows.set(classification, row);
      canonSets.set(classification, new Set(row.canonicalNodes.map(normalizeNodeId)));
    }
    row.count += 1;

    const recNode = getRecommendedNode(ep);
    const recChain = getRecommendedChain(ep);
    if (recNode || recChain.length > 0) {
      row.routerRecommended += 1;
      // Does any recommended id fall within the canonical set?
      const canonSet = canonSets.get(classification);
      const recommendedIds = (recNode ? [recNode, ...recChain] : recChain).map(normalizeNodeId);
      if (recommendedIds.some((id) => id && canonSet.has(id))) {
        row.recWithinCanon += 1;
      }
    }

    const nodeChosen = rationale.node_chosen;
    if (nodeChosen && nodeChosen !== 'direct') row.claudeTook += 1;
    if (ep.outcome_reviewed === 'rework') row.rework += 1;
  }

  return [...rows.values()].sort((a, b) => b.count - a.count);
}
|
||
|
||
/**
 * Cut 9 — Router vs Opus three-section breakdown.
 * Returns { sectionA, sectionB, sectionC } — each an array of structured items.
 * Episodes without a usable `review` object (missing, non-object, or flagged
 * `reviewer_error`) are excluded from all sections.
 *
 * - sectionA: router recommended, review disagrees or names an alternative.
 * - sectionB: router silent, review names a better alternative.
 * - sectionC: router recommended, review says 'correct' with no alternative.
 *
 * @param {Array<object>} episodes
 * @returns {{sectionA: object[], sectionB: object[], sectionC: object[]}}
 */
export function buildRouterVsOpus(episodes) {
  const sectionA = [];
  const sectionB = [];
  const sectionC = [];

  for (const ep of episodes) {
    const review = ep.review;
    if (!review || typeof review !== 'object' || review.reviewer_error) continue;

    const rationale = ep.primary_rationale || {};
    // Fields shared by every section, in the order they are emitted.
    const common = {
      time: (ep.timestamps || {}).started_at || null,
      taskId: String(ep.task_id || '').slice(0, 8),
      classification: rationale.task_classification || 'other',
    };
    const outcomeReviewed = ep.outcome_reviewed || 'unknown';

    if (!hasRecommendation(ep)) {
      // Section B: router silent, Opus identified a better node.
      if (review.alternative_better) {
        sectionB.push({
          ...common,
          opusSuggests: review.alternative_better,
          outcomeReviewed,
          opusReasoning: String(review.reasoning || '').slice(0, 200),
        });
      }
      continue;
    }

    const chain = getRecommendedChain(ep);
    const routerRecommendation = chain.length > 0 ? chain : getRecommendedNode(ep);
    const agreed = review.node_quality === 'correct' && !review.alternative_better;

    if (agreed) {
      // Section C: router gave + Opus agreed it was fine (correct, no better alternative).
      sectionC.push({ ...common, routerRecommendation, outcomeReviewed });
    } else {
      // Section A: router gave + some disagreement or uncertainty
      // (wrong_node / disputable / has alternative).
      sectionA.push({
        ...common,
        routerRecommendation,
        claudeChose: rationale.node_chosen || 'direct',
        opusNodeQuality: review.node_quality || 'n/a',
        opusChainQuality: review.chain_quality || 'n/a',
        outcomeReviewed,
        opusAlternative: review.alternative_better || null,
        opusRootCause: review.error_root_cause || 'n/a',
      });
    }
  }

  return { sectionA, sectionB, sectionC };
}
|
||
|
||
/**
 * Cut 10 — Chain-ignore breakdown.
 * Separates chain recommendations from node-only recommendations and reports
 * how often each was ignored (node_chosen missing or 'direct') and how often an
 * ignored recommendation ended in rework. Chain figures are additionally
 * bucketed by chain length ('1', '2', '3+').
 *
 * @param {Array<object>} episodes
 * @returns {object} Counters as laid out in the literal below.
 */
export function buildChainIgnoreBreakdown(episodes) {
  const emptyBucket = () => ({ count: 0, ignored: 0, rework: 0 });
  const result = {
    totalChainRecommendations: 0,
    ignoredChainCount: 0,
    ignoredChainRework: 0,
    totalNodeOnlyRecommendations: 0,
    ignoredNodeOnlyCount: 0,
    ignoredNodeOnlyRework: 0,
    breakdownByChainLength: {
      '1': emptyBucket(),
      '2': emptyBucket(),
      '3+': emptyBucket(),
    },
  };

  for (const ep of episodes) {
    const chain = getRecommendedChain(ep);
    const node = getRecommendedNode(ep);
    const ignored = ((ep.primary_rationale || {}).node_chosen || 'direct') === 'direct';
    const reworked = ep.outcome_reviewed === 'rework';

    if (chain.length > 0) {
      const bucket = result.breakdownByChainLength[chain.length >= 3 ? '3+' : String(chain.length)];
      result.totalChainRecommendations += 1;
      bucket.count += 1;
      if (!ignored) continue;
      result.ignoredChainCount += 1;
      bucket.ignored += 1;
      if (reworked) {
        result.ignoredChainRework += 1;
        bucket.rework += 1;
      }
      continue;
    }

    if (!node) continue; // no recommendation of either kind
    result.totalNodeOnlyRecommendations += 1;
    if (!ignored) continue;
    result.ignoredNodeOnlyCount += 1;
    if (reworked) result.ignoredNodeOnlyRework += 1;
  }

  return result;
}
|
||
|
||
/**
 * Stream H Task 8 — Table 16: per-rule router-gate hook effectiveness.
 *
 * Aggregates episode.hook_fired records by `rule` name, counting total fires
 * and how many ended with `outcome === 'block'`. Episodes without a
 * `hook_fired` object carrying a string `rule` are ignored.
 *
 * @param {Array<object>} episodes - Anything that is not an array yields no rules.
 * @returns {{rules: Record<string, {fires: number, blocks: number}>}}
 */
export function buildRouterGateHookEffectiveness(episodes) {
  if (!Array.isArray(episodes)) return { rules: {} };

  // Tally in a Map: indexing a plain object by an arbitrary rule name would
  // pick up inherited members ('constructor', 'toString', '__proto__') as if
  // they were existing slots, corrupting the counts and mutating built-ins.
  const tally = new Map();
  for (const ep of episodes) {
    const fired = ep && ep.hook_fired;
    if (!fired || typeof fired !== 'object' || typeof fired.rule !== 'string') continue;

    let slot = tally.get(fired.rule);
    if (!slot) {
      slot = { fires: 0, blocks: 0 };
      tally.set(fired.rule, slot);
    }
    slot.fires += 1;
    if (fired.outcome === 'block') slot.blocks += 1;
  }

  // Object.fromEntries defines each key as an own data property, so every
  // rule name is reported safely.
  return { rules: Object.fromEntries(tally) };
}
|
||
|
||
/**
 * Stream H Task 8 — Table 17: self-fabrication signal detection.
 *
 * An episode whose `controller_claim` is a non-empty string is a fabrication
 * when `tool_uses` is missing, not an array, or empty (the controller said it
 * acted but no recorded tool_use backs it), and legit when at least one
 * tool_use is recorded. Episodes without such a claim are not counted.
 *
 * @param {Array<object>} episodes - Anything that is not an array yields empty lists.
 * @returns {{fabrications: Array<object>, legit: Array<object>}}
 */
export function buildSelfFabricationSignals(episodes) {
  const signals = { fabrications: [], legit: [] };
  if (!Array.isArray(episodes)) return signals;

  for (const ep of episodes) {
    const claim = ep ? ep.controller_claim : undefined;
    if (typeof claim !== 'string' || claim === '') continue;

    const backedByToolUse = Array.isArray(ep.tool_uses) && ep.tool_uses.length > 0;
    if (backedByToolUse) {
      signals.legit.push(ep);
    } else {
      signals.fabrications.push(ep);
    }
  }
  return signals;
}
|
||
|
||
/**
 * Full deterministic aggregation: dedup → infer outcomes → group → chains →
 * matrix → missed activations, plus the v4 review/cost totals and cuts 8–11.
 *
 * Side effect: the analyzed episode objects are annotated in place with
 * `_inferredOutcome`, `_interPromptGapMin` (when computable) and
 * `_similarPastOutcomeMajority`; the factor matrix reads these stamps.
 *
 * @param {Array<object>} episodes - Raw episodes, including observer_error markers.
 * @param {object} [options]
 * @param {object} [options.classificationMap] - Classification → canonical node ids.
 * @param {object} [options.dormancy] - Passed through to detectMissedActivations.
 * @param {string} [options.hookOutcomesLedgerPath] - Ledger for the chain-hook cut.
 * @param {string} [options.periodStart] - Lower bound for ledger entries.
 * @param {string} [options.periodEnd] - Upper bound for ledger entries.
 * @returns {object} Aggregation result; see the returned literal for its fields.
 */
export function analyze(episodes, options = {}) {
  // The default parameter only covers `undefined`; an explicit `null` must be
  // tolerated everywhere, not just in the ledger arguments at the end.
  const opts = options || {};

  const deduped = dedupeEpisodes(episodes);
  const allNormal = deduped.filter((e) => !e.observer_error);
  // v1 episodes lack environment / prompt_signal / decision_provenance — they
  // pollute the factor matrix and break outcome inference. Analyze v2+ only.
  const normal = allNormal.filter((e) => e.schema_version >= 2);
  const v1SkippedCount = allNormal.length - normal.length;

  for (const eps of bySessionSorted(normal).values()) {
    eps.forEach((episode, i) => {
      episode._inferredOutcome = inferOutcome(episode, eps[i + 1]);

      // Pass 3 — inter-prompt gap (project-brain-factor-analysis-4passes).
      // Cross-episode signal: minutes between this episode's start and the
      // previous (same-session) episode's end. First episode of a session
      // has no prev → stays undefined → bucket 'null'.
      if (i === 0) return;
      const prevEnded = (eps[i - 1].timestamps || {}).ended_at;
      const curStarted = (episode.timestamps || {}).started_at;
      // Reject missing timestamps BEFORE Date construction: `new Date(null)`
      // is the epoch, not an invalid date, so a null `ended_at` would yield a
      // decades-long bogus gap and land in the '60m+' bucket.
      if (prevEnded == null || curStarted == null) return;
      const ms = new Date(curStarted) - new Date(prevEnded);
      if (Number.isFinite(ms) && ms >= 0) episode._interPromptGapMin = ms / 60000;
    });
  }

  // Pass 4 — semantic NN lookup (project-brain-factor-analysis-4passes).
  // Build a single global index from episodes with resolved outcomes +
  // embeddings, then for EACH episode (resolved or not) find its top-3
  // nearest neighbours and stamp the majority family on _similarPastOutcomeMajority.
  // O(N²) is fine: typical session has ~50-500 episodes, k=3, embedding=384-dim.
  // Future: switch to HNSW / faiss when episode count crosses ~10k.
  const embeddingIndex = buildEmbeddingIndex(normal);
  for (const episode of normal) {
    const target = decodeTargetEmbedding(episode.prompt_embedding_base64);
    if (!target) {
      episode._similarPastOutcomeMajority = 'no_neighbors';
      continue;
    }
    // task_id is the SESSION id (shared across turns), not a turn id —
    // exclude self by (task_id|started_at), the same dedupe key buildIndex uses.
    const excludeKey = `${episode.task_id || ''}|${(episode.timestamps || {}).started_at || ''}`;
    const neighbours = findNearestNeighbors(target, embeddingIndex, 3, { excludeKey });
    episode._similarPastOutcomeMajority = majorityOutcome(neighbours);
  }

  // Cuts 8/9/10 use this classificationMap, derived from nodes.yaml
  // (registry-to-classification-map.mjs).
  // Archive-fallback REMOVED 2026-05-28 — was stale source of #37/deploy noise.
  const classificationMap = opts.classificationMap || {};
  const dormancy = opts.dormancy || {};
  const disciplineByClassification = disciplinePercentByClassification(normal, classificationMap);
  const routerStep = routerStepReached(normal);
  const boundariesRate = boundariesAppliedRate(normal);

  // Phase 3 Task 20 — v4 aggregation: inheritance count + reviewer outcome
  // distribution + cost totals. Reads schema_version >=4 fields gracefully.
  let inheritanceCount = 0;
  let degradedCount = 0;
  const reviewQuality = { correct: 0, wrong_node: 0, overkill: 0, underkill: 0, disputable: 0 };
  const reviewerCoverage = { reviewed: 0, pending: 0, errored: 0 };
  const costTotals = {
    classifier_input_tokens: 0,
    classifier_output_tokens: 0,
    self_assessment_input_tokens: 0,
    self_assessment_output_tokens: 0,
    reviewer_input_tokens: 0,
    reviewer_output_tokens: 0,
  };
  for (const e of normal) {
    if (e?.inheritance?.inherited_from_task_id) inheritanceCount += 1;
    if (e?.degraded_mode === true) degradedCount += 1;

    const r = e?.review;
    if (r && typeof r === 'object') {
      if (r.reviewer_error) {
        reviewerCoverage.errored += 1;
      } else if (typeof r.node_quality === 'string') {
        reviewerCoverage.reviewed += 1;
        // Only the known quality labels are tallied; others still count as reviewed.
        if (reviewQuality[r.node_quality] !== undefined) reviewQuality[r.node_quality] += 1;
      }
    } else if (e?.schema_version >= 4) {
      // A v4+ episode without a review object is still awaiting review.
      reviewerCoverage.pending += 1;
    }

    const tc = e?.task_cost;
    if (tc && typeof tc === 'object') {
      for (const k of Object.keys(costTotals)) {
        const v = tc[k];
        if (typeof v === 'number' && Number.isFinite(v)) costTotals[k] += v;
      }
    }
  }

  return {
    episodeCount: normal.length,
    v1SkippedCount,
    observerErrorCount: deduped.length - allNormal.length,
    tasks: groupEpisodesToTasks(normal),
    causalChains: findCausalChains(normal),
    factorMatrix: buildFactorMatrix(normal),
    missedActivations: detectMissedActivations(normal, classificationMap, dormancy),
    disciplineByClassification,
    routerStep,
    boundariesRate,
    inheritanceCount,
    reviewQuality,
    reviewerCoverage,
    degradedCount,
    costTotals,
    classCanonCoverage: buildClassCanonCoverage(normal, classificationMap),
    routerVsOpus: buildRouterVsOpus(normal),
    chainIgnoreBreakdown: buildChainIgnoreBreakdown(normal),
    chainHookEffectiveness: analyzeChainHookEffectiveness({
      ledgerPath: opts.hookOutcomesLedgerPath,
      periodStart: opts.periodStart,
      periodEnd: opts.periodEnd,
    }),
    routerGateHookEffectiveness: buildRouterGateHookEffectiveness(normal),
    selfFabricationSignals: buildSelfFabricationSignals(normal),
  };
}
|
||
|
||
/**
 * Read episodes from JSONL files. Files that do not exist, blank lines and
 * lines that are not valid JSON are skipped.
 *
 * @param {Iterable<string>} files - Paths of JSONL files.
 * @returns {Array<*>} Parsed values in file order, then line order.
 */
function loadEpisodes(files) {
  const episodes = [];
  for (const file of files) {
    if (!existsSync(file)) continue;

    const records = readFileSync(file, 'utf-8')
      .split('\n')
      .map((line) => line.trim())
      .filter((text) => text !== '');

    for (const text of records) {
      try {
        episodes.push(JSON.parse(text));
      } catch {
        // skip broken line
      }
    }
  }
  return episodes;
}
|
||
|
||
// CLI entry point: runs only when this file is the script being executed.
// Backslashes are normalized so the check also matches Windows-style paths.
// Usage: node brain-retro-analyzer.mjs <episodes.jsonl>...
if (process.argv[1] && process.argv[1].replace(/\\/g, '/').endsWith('/brain-retro-analyzer.mjs')) {
  const registry = loadRegistry({ useCache: false });
  const classificationMap = buildClassificationMap(registry);
  const dormancy = buildDormancyMap(registry);
  const result = analyze(loadEpisodes(process.argv.slice(2)), { classificationMap, dormancy });

  // Exit only after stdout has accepted the whole report. Calling
  // process.exit() right after console.log() can cut the JSON short when
  // stdout is written asynchronously (e.g. a pipe on Windows).
  process.stdout.write(`${JSON.stringify(result, null, 2)}\n`, () => {
    process.exit(0);
  });
}
|