Files
portal/tools/brain-retro-analyzer.mjs
T
Дмитрий 58784b182d feat(observer/analyzer): Pass 4 — embedding-NN axis (similar_past_outcome_majority)
Closes the 4-pass factor-analysis expansion plan in
memory/project_brain_factor_analysis_4passes.md. Adds semantic-search
context to the brain-retro analyzer: for each episode, look up its
top-3 prompt-embedding neighbours among historical (resolved-outcome)
episodes and report the majority outcome family. Lets the matrix
answer "do prompts that look like THIS one usually succeed or rework?"

# New module: tools/observer-embedding-index.mjs (pure, fs-free)

- mapOutcomeToFamily(outcome): success / soft_success → 'success',
  rework → 'retry', blocked / partial → 'failure', else null.
- cosineSimilarity(a, b): generic formula (defends against non-
  normalised vectors); 0 on null / empty / mismatched lengths.
- buildIndex(episodes): keeps only episodes with both a base64
  embedding AND a resolved outcome family. Decodes base64 safely
  (rejects garbage where byteLength % 4 ≠ 0 — Node's
  Buffer.from('garbage', 'base64') silently strips invalid chars).
- findNearestNeighbors(target, index, k, opts): top-k by descending
  cosine. Supports `excludeKey` (composite task_id|started_at) and
  legacy `excludeTaskId`.
- majorityOutcome(neighbours): 'mixed' on top-rank tie, 'no_neighbors'
  on empty input.
- episodeKey(ep): the same task_id|started_at shape that
  dedupeEpisodes uses — needed because task_id is the SESSION id,
  shared across turns. task_id alone cannot identify a single turn.

# brain-retro-analyzer.mjs

- New FACTOR_FNS axis similar_past_outcome_majority reading the
  pre-computed episode._similarPastOutcomeMajority field.
- analyze() builds a single global embedding index from normal
  (post-inferOutcome), then for every episode decodes its own embedding,
  looks up top-3 neighbours excluding self by composite key, and
  stamps the majority family on the episode (O(N^2), fine up to ~10k
  episodes; HNSW migration deferred per memory plan).
- Local decodeTargetEmbedding mirrors the embedding-index safeDecode.

# Tests

20 new tests (RED -> GREEN):
- observer-embedding-index.test.mjs (new file, 18 tests):
  cosineSimilarity (5), mapOutcomeToFamily (4), buildIndex (4),
  findNearestNeighbors (4 incl. self-exclusion), majorityOutcome (3).
- brain-retro-analyzer.test.mjs (2 integration tests):
  similar_past_outcome_majority lands on factor matrix; no_neighbors
  bucket when no episode has embeddings.

Targeted sweep: 632/632 PASS on the 2 directly-affected suites.
Broader tools/ sweep: 7968/7969 PASS. Pre-existing 1 test failure in
observer-self-assessment-api.test.mjs:258 (contract change from prior
session's readRuntimeFlag fix in 050b349a; out of scope for this commit).
95 pre-existing test-file load failures in worktree copies + ruflo /
subagent-prompt-prefix — unrelated.

Factor matrix grew 11 -> 19 -> 21 -> 29 -> 30 axes across Pass 1+2+3+4.

LEFTHOOK=0 due to quirk #111. Manual gitleaks scan: clean.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 17:07:23 +03:00

488 lines
19 KiB
JavaScript

#!/usr/bin/env node
/**
* Brain-retro analyzer (brain governance, observer factor-analysis spec §6).
* Pure, deterministic Layer-4 aggregation over observer episodes for the
* /brain-retro skill. Read-only — never writes JSONL. No LLM.
*
* Security Guidance #40: pure parsing — no exec/execSync.
*/
import { Buffer } from 'buffer';
import { readFileSync, existsSync } from 'fs';
import { detectMissedActivations } from './missed-activations.mjs';
import {
disciplinePercentByClassification,
routerStepReached,
boundariesAppliedRate,
} from './discipline-metrics.mjs';
import { loadRegistry } from './registry-load.mjs';
import { buildClassificationMap, buildDormancyMap } from './registry-to-classification-map.mjs';
import {
buildIndex as buildEmbeddingIndex,
findNearestNeighbors,
majorityOutcome,
} from './observer-embedding-index.mjs';
// Tool-call thresholds used by sizeBucket for the task_size factor:
// fewer than SIZE_SMALL calls is 'small', up to and including SIZE_LARGE is
// 'medium', anything above is 'large'.
const SIZE_SMALL = 20;
const SIZE_LARGE = 60;
/**
 * Collapse the routing-gate double-write. A turn that was blocked and then
 * re-stopped is recorded twice under the same task_id + started_at; only the
 * later record (the most complete one) survives. Episodes flagged with
 * observer_error are never deduplicated and are appended after the normal ones.
 * Falsy entries (null / undefined) are dropped.
 *
 * @param {object[]} episodes raw episodes, in file order
 * @returns {object[]} deduplicated normal episodes followed by all error markers
 */
export function dedupeEpisodes(episodes) {
  const markers = [];
  const latestByTurn = new Map();
  for (const episode of episodes) {
    if (!episode) continue;
    if (episode.observer_error) {
      markers.push(episode);
      continue;
    }
    const startedAt = (episode.timestamps || {}).started_at;
    // Map keeps the slot of the first write but the value of the last one.
    latestByTurn.set(`${episode.task_id}|${startedAt}`, episode);
  }
  return Array.from(latestByTurn.values()).concat(markers);
}
/**
 * Infer the true outcome of an episode from its events + the next episode's prompt.
 *
 * Precedence: an `interrupt` event wins ('partial'), then an
 * `unrecovered_error` event ('blocked'), then the next episode's
 * `prompt_signal`. Null / undefined entries in `events` are ignored rather
 * than throwing.
 *
 * @param {object} episode the episode being judged (may be null/undefined)
 * @param {object} [nextEpisode] the following episode of the same session, if any
 * @returns {'partial'|'blocked'|'rework'|'success'|'soft_success'|'unknown'}
 */
export function inferOutcome(episode, nextEpisode) {
  const events = episode && Array.isArray(episode.events) ? episode.events : [];
  const hasEvent = (kind) => events.some((ev) => ev?.kind === kind);
  if (hasEvent('interrupt')) {
    return 'partial';
  }
  // A turn is `blocked` only when it ENDED on an unrecovered tool failure —
  // emitted by the parser as a single `unrecovered_error` event when the
  // LAST tool_result of the turn was is_error=true. Raw error/retry counts
  // do NOT imply blocked: a TDD red→green cycle or a grep that returns
  // nothing both surface as `error` events but are intentional and
  // recovered — counting them as blocked over-reports failures (A-1 fix).
  if (hasEvent('unrecovered_error')) {
    return 'blocked';
  }
  // 'failure' (work wrong AND never corrected) is a judgment, not
  // deterministically recoverable from a transcript — deferred to the phase-2
  // agent-judge. Until then a wrong-then-corrected turn surfaces as 'rework'.
  if (!nextEpisode) return 'unknown';
  switch (nextEpisode.prompt_signal) {
    case 'correction':
      return 'rework';
    case 'approval':
    case 'new_task':
      return 'success';
    case 'neutral':
      // Task 16: neutral next-prompt = silent success. If the operator carried
      // on with the next instruction without any correction markers, that is
      // "no objection". Slightly weaker signal than explicit approval —
      // labelled `soft_success`.
      return 'soft_success';
    default:
      return 'unknown';
  }
}
/**
 * Bucket non-error episodes by session (task_id, or 'unknown' when absent)
 * and order each bucket by the string form of timestamps.started_at.
 *
 * @param {object[]} episodes
 * @returns {Map<string, object[]>} session id → episodes sorted by start time
 */
function bySessionSorted(episodes) {
  const startKey = (ep) => String((ep.timestamps || {}).started_at);
  const sessions = new Map();
  for (const episode of episodes) {
    if (episode.observer_error) continue;
    const sessionId = episode.task_id || 'unknown';
    const bucket = sessions.get(sessionId);
    if (bucket) {
      bucket.push(episode);
    } else {
      sessions.set(sessionId, [episode]);
    }
  }
  sessions.forEach((bucket) => {
    bucket.sort((left, right) => startKey(left).localeCompare(startKey(right)));
  });
  return sessions;
}
/**
 * Group episodes into tasks, session by session. Within a session a new task
 * opens on the first episode, after an episode whose inferred outcome is
 * 'success', or when the episode's own prompt_signal is 'new_task'.
 * task_ref is `<session id>#<running task number across all sessions>`.
 *
 * @param {object[]} episodes
 * @returns {{task_ref: string, episodes: object[]}[]}
 */
export function groupEpisodesToTasks(episodes) {
  const tasks = [];
  for (const [sessionId, turns] of bySessionSorted(episodes)) {
    let openTask = null;
    for (let idx = 0; idx < turns.length; idx++) {
      const turn = turns[idx];
      const previous = turns[idx - 1];
      const previousOutcome = previous ? inferOutcome(previous, turn) : null;
      const startsTask =
        idx === 0 || previousOutcome === 'success' || turn.prompt_signal === 'new_task';
      if (startsTask) {
        openTask = { task_ref: `${sessionId}#${tasks.length + 1}`, episodes: [] };
        tasks.push(openTask);
      }
      openTask.episodes.push(turn);
    }
  }
  return tasks;
}
// Hot/normative files — touched by almost every turn (memory store, CLAUDE.md,
// STATUS.md, episodes JSONL). Two turns sharing one of these says nothing about
// a causal chain between them, so they are left out of the shared-file signal
// (A-3 fix).
const HOT_FILE_PATTERNS = [
  /(?:^|[\\/])CLAUDE\.md$/i,
  /(?:^|[\\/])MEMORY\.md$/i,
  /(?:^|[\\/])STATUS\.md$/i,
  /[\\/]episodes-\d{4}-\d{2}\.jsonl$/i,
  /[\\/]memory[\\/][^\\/]+\.md$/i,
];
/**
 * Whether a path matches any hot-file pattern. Accepts both `/` and `\`
 * separators; a falsy path is treated as the empty string.
 *
 * @param {string} path
 * @returns {boolean}
 */
export function isHotFile(path) {
  const candidate = String(path || '');
  for (const pattern of HOT_FILE_PATTERNS) {
    if (pattern.test(candidate)) return true;
  }
  return false;
}
/**
 * Causal-chain candidates: for every episode that contains an `error` event,
 * link it to the first later episode (by started_at order) that touched at
 * least one of the same non-hot files. Each errored episode yields at most
 * one chain.
 *
 * @param {object[]} episodes
 * @returns {{from: string, to: string, sharedFiles: string[]}[]}
 *   `from` / `to` are `task_id|started_at` references.
 */
export function findCausalChains(episodes) {
  const startOf = (ep) => String((ep.timestamps || {}).started_at);
  const refOf = (ep) => `${ep.task_id}|${(ep.timestamps || {}).started_at}`;
  const coldFilesOf = (ep) => (((ep.task_size || {}).files) || []).filter((f) => !isHotFile(f));

  // filter() already returns a fresh array, so sorting it leaves the input untouched.
  const timeline = episodes
    .filter((ep) => !ep.observer_error)
    .sort((x, y) => startOf(x).localeCompare(startOf(y)));

  const chains = [];
  // The last episode has no successor, so it can never be a chain source.
  timeline.slice(0, -1).forEach((source, idx) => {
    const errored = Array.isArray(source.events) && source.events.some((ev) => ev.kind === 'error');
    if (!errored) return;
    const sourceFiles = new Set(coldFilesOf(source));
    if (sourceFiles.size === 0) return;
    for (let j = idx + 1; j < timeline.length; j++) {
      const later = timeline[j];
      const overlap = coldFilesOf(later).filter((f) => sourceFiles.has(f));
      if (overlap.length === 0) continue;
      chains.push({ from: refOf(source), to: refOf(later), sharedFiles: overlap });
      break;
    }
  });
  return chains;
}
/**
 * Bucket a tool-call count for the task_size factor. Non-numeric or missing
 * input counts as 0.
 *
 * @param {*} toolCalls
 * @returns {'small'|'medium'|'large'}
 */
function sizeBucket(toolCalls) {
  const calls = Number(toolCalls) || 0;
  if (calls < SIZE_SMALL) return 'small';
  if (calls <= SIZE_LARGE) return 'medium';
  return 'large';
}
// Turn-index thresholds for the session_segment_turn factor: below EARLY is
// 'early', up to and including LATE is 'mid', above LATE is 'late'.
const SESSION_TURN_EARLY = 10;
const SESSION_TURN_LATE = 40;
/**
 * Bucket a session turn index.
 *
 * @param {*} turn turn index (number or numeric string)
 * @returns {'early'|'mid'|'late'|'null'} 'null' when the turn is missing or not a finite number
 */
function sessionTurnBucket(turn) {
  // Reject null / '' BEFORE numeric coercion: Number(null) and Number('') are
  // both 0, which would silently bucket a missing turn index as 'early'.
  if (turn == null || turn === '') return 'null';
  const n = Number(turn);
  if (!Number.isFinite(n)) return 'null';
  if (n < SESSION_TURN_EARLY) return 'early';
  return n <= SESSION_TURN_LATE ? 'mid' : 'late';
}
// Pass 1 cheap-axis helpers (project-brain-factor-analysis-4passes).
/**
 * Count the events whose `kind` equals the given kind. Falsy entries are
 * skipped; a non-array `events` counts as zero.
 *
 * @param {*} events
 * @param {string} kind
 * @returns {number}
 */
function countEventKind(events, kind) {
  if (!Array.isArray(events)) return 0;
  return events.reduce((total, ev) => (ev && ev.kind === kind ? total + 1 : total), 0);
}
/**
 * Bucket the number of `retry` events in an episode.
 *
 * @param {*} events
 * @returns {'0'|'1-2'|'3+'}
 */
function retryBucket(events) {
  const retries = countEventKind(events, 'retry');
  if (retries === 0) return '0';
  if (retries <= 2) return '1-2';
  return '3+';
}
/**
 * Bucket the number of `error` events in an episode.
 *
 * @param {*} events
 * @returns {'0'|'1'|'2+'}
 */
function errorBucket(events) {
  const errors = countEventKind(events, 'error');
  if (errors === 0) return '0';
  if (errors === 1) return '1';
  return '2+';
}
/**
 * Bucket an iteration count. Anything that is not a positive finite number
 * (missing, NaN, Infinity, zero, negative) lands in '0'.
 *
 * @param {*} iterations
 * @returns {'0'|'1-3'|'4-10'|'11+'}
 */
function iterationsBucket(iterations) {
  const count = Number(iterations);
  const isPositive = Number.isFinite(count) && count > 0;
  if (!isPositive) return '0';
  return count <= 3 ? '1-3' : count <= 10 ? '4-10' : '11+';
}
// Pass 2 — classifier latency bucket. <500ms = fast (cache hit territory),
// 500-2000 = medium (cold call), 2000-10000 = slow (network jitter / overflow),
// >=10000 = very_slow (retries fired). Null on non-LLM paths.
/**
 * Bucket a classifier latency in milliseconds.
 *
 * @param {*} latency latency in ms (number or numeric string)
 * @returns {'fast'|'medium'|'slow'|'very_slow'|'null'}
 *   'null' when the latency is missing, negative, or not a finite number
 */
function latencyBucket(latency) {
  // Reject null / '' BEFORE numeric coercion: Number(null) and Number('') are
  // both 0, which would report a missing latency (non-LLM path) as 'fast'.
  if (latency == null || latency === '') return 'null';
  const n = Number(latency);
  if (!Number.isFinite(n) || n < 0) return 'null';
  if (n < 500) return 'fast';
  if (n < 2000) return 'medium';
  if (n < 10000) return 'slow';
  return 'very_slow';
}
// Pass 3 helpers (project-brain-factor-analysis-4passes).
/**
 * Bucket a prompt length in characters. Anything that is not a positive
 * finite number lands in 'null'.
 *
 * @param {*} n
 * @returns {'short'|'medium'|'long'|'huge'|'null'}
 */
function promptLengthBucket(n) {
  const chars = Number(n);
  if (!(Number.isFinite(chars) && chars > 0)) return 'null';
  // Exclusive upper bounds, checked in ascending order.
  const tiers = [
    [100, 'short'],
    [1000, 'medium'],
    [2500, 'long'],
  ];
  const tier = tiers.find(([limit]) => chars < limit);
  return tier ? tier[1] : 'huge';
}
/**
 * Bucket a timestamp into a six-hour UTC window.
 *
 * @param {*} iso timestamp accepted by the Date constructor
 * @returns {'night'|'morning'|'afternoon'|'evening'|'null'}
 *   'null' when the timestamp is missing or unparseable
 */
function timeOfDayBucket(iso) {
  // Missing values must be rejected up front: `new Date(null)` is the epoch
  // (1970-01-01), not an invalid date, and would be bucketed as 'night'.
  if (iso === null || iso === undefined || iso === '') return 'null';
  // getUTCHours() yields NaN for an invalid date.
  const hour = new Date(iso).getUTCHours();
  if (Number.isNaN(hour)) return 'null';
  const windows = ['night', 'morning', 'afternoon', 'evening'];
  return windows[Math.floor(hour / 6)];
}
// Labels indexed by Date#getUTCDay() (0 = Sunday).
const WEEKDAY_NAMES = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'];
/**
 * Three-letter UTC weekday label for a timestamp.
 *
 * @param {*} iso timestamp accepted by the Date constructor
 * @returns {string} one of WEEKDAY_NAMES, or 'null' when missing / unparseable
 */
function dayOfWeekLabel(iso) {
  // null / undefined / '' are rejected before Date construction because
  // `new Date(null)` is the epoch rather than an invalid date.
  const missing = iso === null || iso === undefined || iso === '';
  const weekday = missing ? NaN : new Date(iso).getUTCDay();
  return Number.isNaN(weekday) ? 'null' : WEEKDAY_NAMES[weekday];
}
/**
 * Bucket the gap (in minutes) between two consecutive prompts.
 *
 * @param {*} min gap in minutes (number or numeric string)
 * @returns {'<1m'|'1-10m'|'10-60m'|'60m+'|'null'}
 *   'null' when the gap is missing, negative, or not a finite number
 */
function interPromptGapBucket(min) {
  // Reject null / '' BEFORE numeric coercion: Number(null) and Number('') are
  // both 0, which would report a missing gap as '<1m'.
  if (min == null || min === '') return 'null';
  const v = Number(min);
  if (!Number.isFinite(v) || v < 0) return 'null';
  if (v < 1) return '<1m';
  if (v < 10) return '1-10m';
  if (v < 60) return '10-60m';
  return '60m+';
}
/**
 * Dominant file type of a `{ type: count }` distribution.
 *
 * Counts are compared numerically, so numeric strings behave like numbers
 * ('10' beats '9', and 3 ties with '3'). Entries whose count is not a
 * positive number are ignored.
 *
 * @param {*} dist file-type distribution object
 * @returns {string} the single type with the highest count, 'mixed' when the
 *   top count is shared, or 'none' when there is no positive count
 */
function fileTypeMain(dist) {
  if (!dist || typeof dist !== 'object') return 'none';
  let topCount = 0;
  let leader = null;
  let tied = false;
  for (const [type, raw] of Object.entries(dist)) {
    const count = Number(raw);
    // `!(count > 0)` also discards NaN.
    if (!(count > 0)) continue;
    if (count > topCount) {
      topCount = count;
      leader = type;
      tied = false;
    } else if (count === topCount) {
      tied = true;
    }
  }
  if (leader === null) return 'none';
  return tied ? 'mixed' : leader;
}
/**
 * Read a tool's call count from the FIRST `tool_summary` event that carries a
 * `counts` object. Later summaries are not consulted. Missing or non-numeric
 * counts yield 0.
 *
 * @param {*} events
 * @param {string} toolName
 * @returns {number}
 */
function eventToolCount(events, toolName) {
  if (!Array.isArray(events)) return 0;
  const summary = events.find((ev) => ev && ev.kind === 'tool_summary' && ev.counts);
  if (!summary) return 0;
  return Number(summary.counts[toolName]) || 0;
}
/**
 * Bucket a count into '0', '1' or '2+'. Non-numeric or missing input counts
 * as 0; every value other than exactly 0 or 1 falls into '2+'.
 *
 * @param {*} n
 * @returns {'0'|'1'|'2+'}
 */
function countBucket012(n) {
  const count = Number(n) || 0;
  if (count === 0) return '0';
  if (count === 1) return '1';
  return '2+';
}
// Single-value factor axes of the factor matrix. Each entry maps an episode to
// the row label it contributes to; key order is the order the axes appear in
// the matrix. (chain_ref is multi-value and is handled by buildFactorMatrix.)
const FACTOR_FNS = {
  decision_provenance: (e) => e.decision_provenance?.kind || 'unknown',
  economy_level: (e) => String(e.environment?.economy_level ?? 'null'),
  model: (e) => e.environment?.model || 'null',
  post_compaction: (e) => String(e.environment?.post_compaction ?? false),
  session_segment_turn: (e) => sessionTurnBucket(e.environment?.session_turn),
  parallel_session: (e) => String(e.environment?.parallel_session ?? false),
  task_size: (e) => sizeBucket(e.task_size?.tool_calls),
  node_chosen: (e) => e.primary_rationale?.node_chosen || 'direct',
  task_classification: (e) => e.primary_rationale?.task_classification || 'other',
  recommended_node_for_direct: (e) => e.primary_rationale?.recommended_node || 'none',

  // Pass 1 — 8 cheap axes (data already in v4 episode, just expose):
  prompt_signal: (e) => e.prompt_signal || 'null',
  classifier_source: (e) => e.classifier_output?.source || 'null',
  degraded_mode: (e) => String(e.degraded_mode ?? false),
  path_type: (e) => e.path_type || 'null',
  retry_count: (e) => retryBucket(e.events),
  error_count: (e) => errorBucket(e.events),
  hard_floor_invoked: (e) => String(e.primary_rationale?.hard_floor?.invoked ?? false),
  iterations_bucket: (e) => iterationsBucket(e.task_cost?.iterations),

  // Pass 2 — classifier-metric axes (project-brain-factor-analysis-4passes):
  latency_bucket: (e) => latencyBucket(e.classifier_output?.latency_ms),
  error_type: (e) => e.classifier_output?.llm_error || 'null',

  // Pass 3 — dynamics axes (project-brain-factor-analysis-4passes):
  prompt_length_bucket: (e) => promptLengthBucket(e.task_meta?.prompt_length_chars),
  time_of_day_bucket: (e) => timeOfDayBucket(e.timestamps?.started_at),
  day_of_week: (e) => dayOfWeekLabel(e.timestamps?.started_at),
  inter_prompt_gap_bucket: (e) => interPromptGapBucket(e._interPromptGapMin),
  mcp_server_used: (e) => ((e.task_meta?.mcp_servers_used || []).length > 0 ? 'any' : 'none'),
  file_type_main: (e) => fileTypeMain(e.task_meta?.file_type_distribution),
  skill_invocations_bucket: (e) => countBucket012(eventToolCount(e.events, 'Skill')),
  subagent_spawns_bucket: (e) => {
    const spawns = eventToolCount(e.events, 'Agent') + eventToolCount(e.events, 'Task');
    return countBucket012(spawns);
  },

  // Pass 4 — semantic NN axis (project-brain-factor-analysis-4passes).
  // The family label is pre-computed and stamped on the episode by analyze()
  // (cross-episode pass via observer-embedding-index). An episode with no
  // label — no embedding, or no resolved neighbours — buckets as 'no_neighbors'.
  similar_past_outcome_majority: (e) => e._similarPastOutcomeMajority || 'no_neighbors',
};
// Pass 4 — decode prompt_embedding_base64 to Float32Array. Mirrors
// observer-embedding-index safeDecode but kept private here to avoid
// circular surface; analyzer only needs the target-embedding decode path.
/**
 * Decode a base64 string of packed 32-bit floats (platform byte order).
 *
 * @param {*} b64 base64-encoded float32 payload
 * @returns {Float32Array|null} a standalone vector, or null when the input is
 *   not a non-empty string, decodes to zero bytes, decodes to a byte length
 *   that is not a multiple of 4, or contains a non-finite component
 */
function decodeTargetEmbedding(b64) {
  if (!b64 || typeof b64 !== 'string') return null;
  try {
    const bytes = Buffer.from(b64, 'base64');
    if (bytes.byteLength === 0 || bytes.byteLength % 4 !== 0) return null;
    // Copy the payload into its own ArrayBuffer instead of viewing
    // `bytes.buffer` in place. Small Buffers may be slices of a shared
    // allocation pool: a Float32Array view needs a 4-byte-aligned byteOffset
    // (a misaligned one throws RangeError, which the catch below would turn
    // into a silent null), and a view keeps the whole pool slab alive.
    const start = bytes.byteOffset;
    const vector = new Float32Array(bytes.buffer.slice(start, start + bytes.byteLength));
    // Reject NaN / ±Infinity components — they would poison cosine similarity.
    if (!vector.every((x) => Number.isFinite(x))) return null;
    return vector;
  } catch {
    // Best-effort decode: an undecodable embedding is treated as absent.
    return null;
  }
}
/**
 * Factor matrix (spec §6): for every factor axis, rows are the factor values
 * and columns are outcome counts — `matrix[factor][value][outcome] = n`.
 * An episode without `_inferredOutcome` is counted under 'unknown'.
 *
 * @param {object[]} episodesWithOutcome episodes already stamped with `_inferredOutcome`
 * @returns {object} nested count table, one key per FACTOR_FNS axis plus `chain_ref`
 */
export function buildFactorMatrix(episodesWithOutcome) {
  const outcomeOf = (ep) => ep._inferredOutcome || 'unknown';
  // Increment table[rowKey][outcome], creating the row on first use.
  const tally = (table, rowKey, outcome) => {
    const row = table[rowKey] || (table[rowKey] = {});
    row[outcome] = (row[outcome] || 0) + 1;
  };

  const matrix = {};
  for (const [factorName, factorFn] of Object.entries(FACTOR_FNS)) {
    const table = {};
    matrix[factorName] = table;
    for (const ep of episodesWithOutcome) {
      tally(table, factorFn(ep), outcomeOf(ep));
    }
  }

  // chain_ref is multi-value: an episode on several chains counts once per
  // chain; a missing / empty / non-array chain_ref counts under the key "null".
  // That is why it lives outside the single-value FACTOR_FNS loop.
  const chainTable = {};
  matrix.chain_ref = chainTable;
  for (const ep of episodesWithOutcome) {
    const refs = (ep.primary_rationale || {}).chain_ref;
    const outcome = outcomeOf(ep);
    const rowKeys = Array.isArray(refs) && refs.length ? refs : ['null'];
    for (const rowKey of rowKeys) {
      tally(chainTable, rowKey, outcome);
    }
  }
  return matrix;
}
/**
 * Minutes between the previous turn's end and the current turn's start.
 *
 * @param {*} prevEndedAt `ended_at` of the previous same-session episode
 * @param {*} curStartedAt `started_at` of the current episode
 * @returns {number|undefined} the non-negative gap in minutes, or undefined
 *   when either timestamp is missing / unparseable or the gap is negative
 */
function interPromptGapMinutes(prevEndedAt, curStartedAt) {
  // Reject null / undefined / '' BEFORE Date construction: `new Date(null)` is
  // the epoch (1970-01-01), not an invalid date, so a null `ended_at` would
  // otherwise produce a decades-long gap that buckets as '60m+'.
  if (prevEndedAt == null || prevEndedAt === '') return undefined;
  if (curStartedAt == null || curStartedAt === '') return undefined;
  const ms = new Date(curStartedAt) - new Date(prevEndedAt);
  return Number.isFinite(ms) && ms >= 0 ? ms / 60000 : undefined;
}

/**
 * Full deterministic aggregation: dedup → infer outcomes → group → chains →
 * matrix → missed activations.
 *
 * Side effect: the analysed episode objects are stamped in place with
 * `_inferredOutcome`, `_similarPastOutcomeMajority` and, when computable,
 * `_interPromptGapMin`.
 *
 * @param {object[]} episodes raw episodes (normal episodes and observer_error markers)
 * @param {{classificationMap?: object, dormancy?: object}} [options]
 *   registry-derived maps; both default to an empty object
 * @returns {object} aggregate report: counts, tasks, causal chains, factor
 *   matrix, missed activations, discipline metrics and v4 review/cost totals
 */
export function analyze(episodes, options = {}) {
  const deduped = dedupeEpisodes(episodes);
  const allNormal = deduped.filter((e) => !e.observer_error);
  // v1 episodes lack environment / prompt_signal / decision_provenance — they
  // pollute the factor matrix and break outcome inference. Analyze v2+ only.
  const normal = allNormal.filter((e) => e.schema_version >= 2);
  const v1SkippedCount = allNormal.length - normal.length;

  for (const eps of bySessionSorted(normal).values()) {
    eps.forEach((episode, i) => {
      episode._inferredOutcome = inferOutcome(episode, eps[i + 1]);
      // Pass 3 — inter-prompt gap (project-brain-factor-analysis-4passes).
      // Cross-episode signal: minutes between this episode's start and the
      // previous (same-session) episode's end. The first episode of a session
      // has no predecessor, and an uncomputable gap is left unset — both stay
      // undefined → bucket 'null'.
      if (i > 0) {
        const gap = interPromptGapMinutes(
          (eps[i - 1].timestamps || {}).ended_at,
          (episode.timestamps || {}).started_at,
        );
        if (gap !== undefined) episode._interPromptGapMin = gap;
      }
    });
  }

  // Pass 4 — semantic NN lookup (project-brain-factor-analysis-4passes).
  // Build a single global index from episodes with resolved outcomes +
  // embeddings, then for EACH episode (resolved or not) find its top-3
  // nearest neighbours and stamp the majority family on _similarPastOutcomeMajority.
  // O(N²) is fine: typical session has ~50-500 episodes, k=3, embedding=384-dim.
  // Future: switch to HNSW / faiss when episode count crosses ~10k.
  const embeddingIndex = buildEmbeddingIndex(normal);
  for (const episode of normal) {
    const target = decodeTargetEmbedding(episode.prompt_embedding_base64);
    if (!target) {
      episode._similarPastOutcomeMajority = 'no_neighbors';
      continue;
    }
    // task_id is the SESSION id (shared across turns), not a turn id —
    // exclude self by (task_id|started_at), the same dedupe key buildIndex uses.
    const excludeKey = `${episode.task_id || ''}|${(episode.timestamps || {}).started_at || ''}`;
    const neighbours = findNearestNeighbors(target, embeddingIndex, 3, { excludeKey });
    episode._similarPastOutcomeMajority = majorityOutcome(neighbours);
  }

  const classificationMap = options.classificationMap || {};
  const dormancy = options.dormancy || {};
  const disciplineByClassification = disciplinePercentByClassification(normal, classificationMap);
  const routerStep = routerStepReached(normal);
  const boundariesRate = boundariesAppliedRate(normal);

  // Phase 3 Task 20 — v4 aggregation: inheritance count + reviewer outcome
  // distribution + cost totals. Reads schema_version >=4 fields gracefully.
  let inheritanceCount = 0;
  const reviewQuality = { correct: 0, wrong_node: 0, overkill: 0, underkill: 0, disputable: 0 };
  const reviewerCoverage = { reviewed: 0, pending: 0, errored: 0 };
  let degradedCount = 0;
  const costTotals = {
    classifier_input_tokens: 0,
    classifier_output_tokens: 0,
    self_assessment_input_tokens: 0,
    self_assessment_output_tokens: 0,
    reviewer_input_tokens: 0,
    reviewer_output_tokens: 0,
  };
  for (const e of normal) {
    if (e?.inheritance?.inherited_from_task_id) inheritanceCount += 1;
    if (e?.degraded_mode === true) degradedCount += 1;
    const r = e?.review;
    if (r && typeof r === 'object') {
      if (r.reviewer_error) reviewerCoverage.errored += 1;
      else if (typeof r.node_quality === 'string') {
        reviewerCoverage.reviewed += 1;
        // Only the known quality labels are tallied; unknown labels still
        // count as reviewed.
        if (reviewQuality[r.node_quality] !== undefined) reviewQuality[r.node_quality] += 1;
      }
    } else if (e?.schema_version >= 4) {
      // A v4+ episode with no review object is still awaiting review.
      reviewerCoverage.pending += 1;
    }
    const tc = e?.task_cost;
    if (tc && typeof tc === 'object') {
      for (const k of Object.keys(costTotals)) {
        const v = tc[k];
        if (typeof v === 'number' && Number.isFinite(v)) costTotals[k] += v;
      }
    }
  }

  return {
    episodeCount: normal.length,
    v1SkippedCount,
    observerErrorCount: deduped.length - allNormal.length,
    tasks: groupEpisodesToTasks(normal),
    causalChains: findCausalChains(normal),
    factorMatrix: buildFactorMatrix(normal),
    missedActivations: detectMissedActivations(normal, classificationMap, dormancy),
    disciplineByClassification,
    routerStep,
    boundariesRate,
    inheritanceCount,
    reviewQuality,
    reviewerCoverage,
    degradedCount,
    costTotals,
  };
}
/**
 * Read episodes from JSONL files. Files that do not exist are skipped, as are
 * blank lines and lines that fail to parse as JSON (best-effort load).
 *
 * @param {string[]} files paths to JSONL files
 * @returns {Array} parsed values in file order, then line order
 */
function loadEpisodes(files) {
  const episodes = [];
  for (const file of files) {
    if (!existsSync(file)) continue;
    const lines = readFileSync(file, 'utf-8').split('\n');
    for (const rawLine of lines) {
      const text = rawLine.trim();
      if (text === '') continue;
      let parsed;
      try {
        parsed = JSON.parse(text);
      } catch {
        // Broken line: deliberately skipped so one bad record does not abort the load.
        continue;
      }
      episodes.push(parsed);
    }
  }
  return episodes;
}
// CLI entry point: runs only when this file is the script node was started
// with (path compared with forward slashes so Windows paths match too).
// Every remaining argument is an episodes JSONL file; the analysis result is
// printed as pretty JSON and the process exits with status 0.
const cliScriptPath = process.argv[1] ? process.argv[1].replace(/\\/g, '/') : '';
if (cliScriptPath.endsWith('/brain-retro-analyzer.mjs')) {
  const [, , ...episodeFiles] = process.argv;
  const registry = loadRegistry({ useCache: false });
  const cliOptions = {
    classificationMap: buildClassificationMap(registry),
    dormancy: buildDormancyMap(registry),
  };
  const report = analyze(loadEpisodes(episodeFiles), cliOptions);
  console.log(JSON.stringify(report, null, 2));
  process.exit(0);
}