7ac18d1103
Stage 2 Task 4 -- analyze() расширен: disciplineByClassification, routerStep, boundariesRate. CLI (tools/brain-retro-analyzer.mjs source-of-truth) теперь читает classificationMap и dormancy из docs/registry/nodes.yaml через registry-to-classification-map.mjs (вместо observer-classification-map.json и .node-dormancy.json). Sanity-check na 124 эпизодах: missed_before=17 -> missed_after=17 (delta=0). disciplineKeys: bugfix, feature, refactor, planning, cleanup, monitoring, analysis. step dist: all step=1 (suspicious=true -- expected baseline). boundaries rate: 0.105. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
261 lines
10 KiB
JavaScript
261 lines
10 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
 * Brain-retro analyzer (brain governance, observer factor-analysis spec §6).
 * Pure, deterministic Layer-4 aggregation over observer episodes for the
 * /brain-retro skill. Read-only — never writes JSONL. No LLM.
 *
 * Security Guidance #40: pure parsing — no exec/execSync.
 */
|
|
import { readFileSync, existsSync } from 'fs';
|
|
import { detectMissedActivations } from './missed-activations.mjs';
|
|
import {
|
|
disciplinePercentByClassification,
|
|
routerStepReached,
|
|
boundariesAppliedRate,
|
|
} from './discipline-metrics.mjs';
|
|
import { loadRegistry } from './registry-load.mjs';
|
|
import { buildClassificationMap, buildDormancyMap } from './registry-to-classification-map.mjs';
|
|
|
|
/** Tool-call count below which an episode's task size is bucketed as `small`. */
const SIZE_SMALL = 20;
/** Tool-call count above which an episode's task size is bucketed as `large`. */
const SIZE_LARGE = 60;
|
|
|
|
/**
 * Collapse the routing-gate double-write: a turn that was blocked and then
 * re-stopped yields two episodes with the same task_id + started_at. Only the
 * last one seen (the most complete) is kept, in the position where the key
 * first appeared. observer_error markers are never merged; they are all kept
 * and appended after the deduplicated episodes. Falsy entries are dropped.
 *
 * @param {Array<object>} episodes Raw episode records; a non-array is treated as empty.
 * @returns {Array<object>} Deduplicated episodes followed by every observer_error marker.
 */
export function dedupeEpisodes(episodes) {
  const input = Array.isArray(episodes) ? episodes : [];
  const errorMarkers = [];
  const latestByKey = new Map();
  for (const episode of input) {
    if (!episode) continue;
    if (episode.observer_error) {
      errorMarkers.push(episode);
      continue;
    }
    const startedAt = (episode.timestamps || {}).started_at;
    // Map#set on an existing key overwrites the value but keeps its slot,
    // so the surviving episode is the last duplicate written.
    latestByKey.set(`${episode.task_id}|${startedAt}`, episode);
  }
  return [...latestByKey.values(), ...errorMarkers];
}
|
|
|
|
/**
 * Infer the true outcome of an episode from its own events and from the
 * prompt signal of the episode that follows it.
 *
 * Precedence: an `interrupt` event → 'partial'; an `unrecovered_error` event →
 * 'blocked'; otherwise the next episode's `prompt_signal` decides.
 *
 * @param {object} episode Episode being judged; missing or non-array `events` counts as no events.
 * @param {object} [nextEpisode] The episode that follows it, if any.
 * @returns {'partial'|'blocked'|'rework'|'success'|'soft_success'|'unknown'} Inferred outcome label.
 */
export function inferOutcome(episode, nextEpisode) {
  const events = episode && Array.isArray(episode.events) ? episode.events : [];
  // Null/undefined entries in `events` are ignored rather than crashing the scan.
  const hasEvent = (kind) => events.some((event) => event != null && event.kind === kind);

  if (hasEvent('interrupt')) {
    return 'partial';
  }

  // A turn is `blocked` only when it ENDED on an unrecovered tool failure —
  // emitted by the parser as a single `unrecovered_error` event when the
  // LAST tool_result of the turn was is_error=true. Raw error/retry counts
  // do NOT imply blocked: a TDD red→green cycle or a grep that returns
  // nothing both surface as `error` events but are intentional and
  // recovered — counting them as blocked over-reports failures (A-1 fix).
  if (hasEvent('unrecovered_error')) {
    return 'blocked';
  }

  // 'failure' (work wrong AND never corrected) is a judgment, not
  // deterministically recoverable from a transcript — deferred to the phase-2
  // agent-judge. Until then a wrong-then-corrected turn surfaces as 'rework'.
  if (!nextEpisode) {
    return 'unknown';
  }

  switch (nextEpisode.prompt_signal) {
    case 'correction':
      return 'rework';
    case 'approval':
    case 'new_task':
      return 'success';
    case 'neutral':
      // Task 16: a neutral next prompt is a silent success. If the operator
      // simply continued with the next instruction, without correction
      // markers, that is "no objection". Slightly weaker signal than explicit
      // approval, hence the separate `soft_success` label.
      return 'soft_success';
    default:
      return 'unknown';
  }
}
|
|
|
|
/**
 * Bucket episodes by `task_id` (falling back to 'unknown' when it is falsy)
 * and order each bucket by the string form of `timestamps.started_at`.
 * observer_error markers and null/undefined entries are skipped.
 *
 * @param {Iterable<object>} episodes Episode records.
 * @returns {Map<string, Array<object>>} Buckets in first-seen key order, each sorted ascending.
 */
function bySessionSorted(episodes) {
  const sessions = new Map();
  for (const episode of episodes) {
    if (!episode || episode.observer_error) continue;
    const sessionId = episode.task_id || 'unknown';
    const bucket = sessions.get(sessionId);
    if (bucket) {
      bucket.push(episode);
    } else {
      sessions.set(sessionId, [episode]);
    }
  }

  // A missing timestamp compares as the literal string "undefined".
  const startedAt = (episode) => String((episode.timestamps || {}).started_at);
  for (const bucket of sessions.values()) {
    bucket.sort((a, b) => startedAt(a).localeCompare(startedAt(b)));
  }
  return sessions;
}
|
|
|
|
/**
 * Group episodes into tasks. Within each session (as produced by
 * `bySessionSorted`) a new task starts at the first episode, whenever the
 * previous episode's inferred outcome is 'success', or whenever the episode
 * itself carries `prompt_signal === 'new_task'`.
 *
 * `task_ref` is `<session id>#<n>`, where n is the 1-based position of the
 * task in the overall result (the counter is not reset per session).
 *
 * @param {Array<object>} episodes Episode records.
 * @returns {Array<{task_ref: string, episodes: Array<object>}>} Tasks in creation order.
 */
export function groupEpisodesToTasks(episodes) {
  const tasks = [];
  for (const [sessionId, sessionEpisodes] of bySessionSorted(episodes)) {
    let current = null;
    let previous = null;
    for (const episode of sessionEpisodes) {
      const startsNewTask =
        previous === null ||
        episode.prompt_signal === 'new_task' ||
        inferOutcome(previous, episode) === 'success';
      if (startsNewTask) {
        current = { task_ref: `${sessionId}#${tasks.length + 1}`, episodes: [] };
        tasks.push(current);
      }
      current.episodes.push(episode);
      previous = episode;
    }
  }
  return tasks;
}
|
|
|
|
// Hot/normative files — touched by almost every turn (memory store, CLAUDE.md,
// STATUS.md, episodes JSONL). Sharing one of these is not evidence of a causal
// chain; it just means both turns brushed the same hot file. Excluded from the
// shared-file signal (A-3 fix).
//
// Every pattern is anchored on "start of string or a path separator" so that
// bare and relative paths (`memory/notes.md`, `episodes-2024-05.jsonl`) are
// recognised the same way as absolute ones. Both `/` and `\` separators match.
const HOT_FILE_PATTERNS = [
  /(?:^|[\\/])CLAUDE\.md$/i,
  /(?:^|[\\/])MEMORY\.md$/i,
  /(?:^|[\\/])STATUS\.md$/i,
  /(?:^|[\\/])episodes-\d{4}-\d{2}\.jsonl$/i,
  // Only files directly inside a `memory` directory, not nested deeper.
  /(?:^|[\\/])memory[\\/][^\\/]+\.md$/i,
];

/**
 * Tell whether a path names one of the hot/normative files.
 *
 * @param {*} path File path; falsy values are treated as the empty string.
 * @returns {boolean} True when any hot-file pattern matches.
 */
export function isHotFile(path) {
  const candidate = String(path || '');
  return HOT_FILE_PATTERNS.some((pattern) => pattern.test(candidate));
}
|
|
|
|
/**
 * Causal-chain candidates: an errored episode → the first later episode that
 * shares at least one non-hot file with it.
 *
 * Episodes are ordered by the string form of `timestamps.started_at`;
 * observer_error markers and null entries are ignored. An episode counts as
 * errored when its `events` array contains an event of kind 'error'. At most
 * one chain is reported per errored episode.
 *
 * @param {Array<object>} episodes Episode records (not mutated).
 * @returns {Array<{from: string, to: string, sharedFiles: Array<string>}>}
 *   Chains whose `from`/`to` are `<task_id>|<started_at>` keys.
 */
export function findCausalChains(episodes) {
  const startedAt = (episode) => String((episode.timestamps || {}).started_at);
  const refOf = (episode) => `${episode.task_id}|${(episode.timestamps || {}).started_at}`;
  const filesOf = (episode) => {
    const files = (episode.task_size || {}).files;
    return Array.isArray(files) ? files : [];
  };

  // filter() already returns a fresh array, so sorting it leaves the input untouched.
  const ordered = episodes
    .filter((episode) => episode && !episode.observer_error)
    .sort((a, b) => startedAt(a).localeCompare(startedAt(b)));

  const chains = [];
  for (let i = 0; i < ordered.length - 1; i++) {
    const source = ordered[i];
    const errored =
      Array.isArray(source.events) && source.events.some((event) => event && event.kind === 'error');
    if (!errored) continue;

    const sourceFiles = new Set(filesOf(source).filter((file) => !isHotFile(file)));
    if (sourceFiles.size === 0) continue;

    for (let j = i + 1; j < ordered.length; j++) {
      const target = ordered[j];
      // sourceFiles holds no hot files, so membership alone excludes them.
      const shared = filesOf(target).filter((file) => sourceFiles.has(file));
      if (shared.length > 0) {
        chains.push({ from: refOf(source), to: refOf(target), sharedFiles: shared });
        break;
      }
    }
  }
  return chains;
}
|
|
|
|
/**
 * Bucket a tool-call count into a task-size label.
 *
 * @param {*} toolCalls Tool-call count; anything that does not coerce to a non-zero number counts as 0.
 * @returns {'small'|'medium'|'large'} 'small' below SIZE_SMALL, 'medium' up to and including SIZE_LARGE, else 'large'.
 */
function sizeBucket(toolCalls) {
  const count = Number(toolCalls) || 0;
  if (count < SIZE_SMALL) return 'small';
  if (count <= SIZE_LARGE) return 'medium';
  return 'large';
}
|
|
|
|
/** Session turn below which an episode is bucketed as `early`. */
const SESSION_TURN_EARLY = 10;
/** Session turn above which an episode is bucketed as `late`. */
const SESSION_TURN_LATE = 40;

/**
 * Bucket a session turn number into 'early' / 'mid' / 'late'.
 *
 * Only finite numbers and non-blank numeric strings count as a turn. Anything
 * else (undefined, null, '', booleans, objects, NaN, ±Infinity) yields the
 * 'null' bucket. This avoids `Number(null) === 0` silently filing an unknown
 * turn under 'early'.
 *
 * @param {*} turn Turn number of the episode within its session.
 * @returns {'early'|'mid'|'late'|'null'} Bucket label.
 */
function sessionTurnBucket(turn) {
  const value = typeof turn === 'string' && turn.trim() !== '' ? Number(turn) : turn;
  if (typeof value !== 'number' || !Number.isFinite(value)) return 'null';
  if (value < SESSION_TURN_EARLY) return 'early';
  if (value <= SESSION_TURN_LATE) return 'mid';
  return 'late';
}
|
|
|
|
/**
 * Single-valued factor extractors: factor name → function mapping an episode
 * to that factor's row key. Each extractor falls back to a fixed label when
 * the field is absent, so every episode lands in exactly one row per factor.
 * Key order here is the factor order of the resulting matrix.
 */
const FACTOR_FNS = {
  decision_provenance: (e) => e.decision_provenance?.kind || 'unknown',
  // `??` (not `||`) so that a legitimate 0 / false value keeps its own row.
  economy_level: (e) => String(e.environment?.economy_level ?? 'null'),
  model: (e) => e.environment?.model || 'null',
  post_compaction: (e) => String(e.environment?.post_compaction ?? false),
  session_segment_turn: (e) => sessionTurnBucket(e.environment?.session_turn),
  parallel_session: (e) => String(e.environment?.parallel_session ?? false),
  task_size: (e) => sizeBucket(e.task_size?.tool_calls),
  node_chosen: (e) => e.primary_rationale?.node_chosen || 'direct',
  task_classification: (e) => e.primary_rationale?.task_classification || 'other',
  recommended_node_for_direct: (e) => e.primary_rationale?.recommended_node || 'none',
};
|
|
|
|
/**
 * Factor matrix: rows = factor values, columns = outcome distribution (spec §6).
 *
 * For every factor in FACTOR_FNS, and for the multi-valued `chain_ref` factor,
 * counts how many episodes fall into each (factor value, outcome) cell. The
 * outcome is read from `episode._inferredOutcome`, defaulting to 'unknown'.
 *
 * Row and outcome keys come from episode data, so cells are created and read
 * strictly as OWN properties. A value such as "constructor" or "__proto__"
 * therefore gets a normal row instead of colliding with an inherited member
 * (which would lose the count and mutate `Object` / `Object.prototype`).
 *
 * @param {Array<object>} episodesWithOutcome Episodes, normally annotated with `_inferredOutcome`.
 * @returns {Object<string, Object<string, Object<string, number>>>} matrix[factor][value][outcome] = count.
 */
export function buildFactorMatrix(episodesWithOutcome) {
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  // defineProperty (not assignment) so that even '__proto__' becomes a plain
  // own, enumerable data property.
  const ensureOwn = (obj, key, initial) => {
    if (!hasOwn(obj, key)) {
      Object.defineProperty(obj, key, {
        value: initial,
        writable: true,
        enumerable: true,
        configurable: true,
      });
    }
    return obj[key];
  };
  const bump = (table, rowKey, outcome) => {
    const row = ensureOwn(table, rowKey, {});
    ensureOwn(row, outcome, 0);
    row[outcome] += 1;
  };

  const matrix = {};
  for (const [factorName, extract] of Object.entries(FACTOR_FNS)) {
    const table = {};
    matrix[factorName] = table;
    for (const episode of episodesWithOutcome) {
      bump(table, extract(episode), episode._inferredOutcome || 'unknown');
    }
  }

  // chain_ref is multi-value: a multi-chain episode counts once per chain;
  // null/absent/empty → key "null". Handled outside FACTOR_FNS (single-value loop).
  const chainTable = {};
  matrix.chain_ref = chainTable;
  for (const episode of episodesWithOutcome) {
    const chainRef = (episode.primary_rationale || {}).chain_ref;
    const keys = Array.isArray(chainRef) && chainRef.length ? chainRef : ['null'];
    const outcome = episode._inferredOutcome || 'unknown';
    for (const key of keys) {
      bump(chainTable, key, outcome);
    }
  }
  return matrix;
}
|
|
|
|
/**
 * Full deterministic aggregation: dedup → infer outcomes → group → chains →
 * matrix → missed activations, plus the discipline / router-step / boundaries
 * metrics.
 *
 * Side effect: every analysed (schema v2+) episode object is annotated in
 * place with `_inferredOutcome`; the downstream aggregations read it.
 *
 * @param {Array<object>} episodes Raw episode records (may contain duplicates and observer_error markers).
 * @param {object} [options] Optional inputs; null is treated like an empty object.
 * @param {object} [options.classificationMap] Passed to the discipline and missed-activation aggregations; defaults to {}.
 * @param {object} [options.dormancy] Passed to missed-activation detection; defaults to {}.
 * @returns {object} Report with `episodeCount` (analysed v2+ episodes),
 *   `v1SkippedCount` (non-error episodes dropped for schema_version < 2),
 *   `observerErrorCount`, `tasks`, `causalChains`, `factorMatrix`,
 *   `missedActivations`, `disciplineByClassification`, `routerStep` and
 *   `boundariesRate`.
 */
export function analyze(episodes, options = {}) {
  // The default parameter only covers `undefined`; tolerate an explicit null too.
  const opts = options || {};
  const classificationMap = opts.classificationMap || {};
  const dormancy = opts.dormancy || {};

  const deduped = dedupeEpisodes(episodes);
  const allNormal = deduped.filter((e) => !e.observer_error);
  // v1 episodes lack environment / prompt_signal / decision_provenance — they
  // pollute the factor matrix and break outcome inference. Analyze v2 only.
  const normal = allNormal.filter((e) => e.schema_version >= 2);
  const v1SkippedCount = allNormal.length - normal.length;

  // Outcome of an episode depends on the prompt of the NEXT episode in the
  // same session, so inference walks each session in start order.
  for (const sessionEpisodes of bySessionSorted(normal).values()) {
    sessionEpisodes.forEach((episode, index) => {
      episode._inferredOutcome = inferOutcome(episode, sessionEpisodes[index + 1]);
    });
  }

  const disciplineByClassification = disciplinePercentByClassification(normal, classificationMap);
  const routerStep = routerStepReached(normal);
  const boundariesRate = boundariesAppliedRate(normal);

  return {
    episodeCount: normal.length,
    v1SkippedCount,
    observerErrorCount: deduped.length - allNormal.length,
    tasks: groupEpisodesToTasks(normal),
    causalChains: findCausalChains(normal),
    factorMatrix: buildFactorMatrix(normal),
    missedActivations: detectMissedActivations(normal, classificationMap, dormancy),
    disciplineByClassification,
    routerStep,
    boundariesRate,
  };
}
|
|
|
|
/**
 * Read episode records from JSONL files, one JSON object per line.
 *
 * Best-effort by design: files that do not exist, blank lines, lines that are
 * not valid JSON, and lines whose JSON value is not a plain object (numbers,
 * strings, null, arrays) are skipped without raising.
 *
 * @param {Iterable<string>} files Paths of JSONL files to read, in order.
 * @returns {Array<object>} Parsed episode objects in file/line order.
 */
function loadEpisodes(files) {
  const episodes = [];
  for (const file of files) {
    if (!existsSync(file)) continue;
    for (const rawLine of readFileSync(file, 'utf-8').split(/\r?\n/)) {
      const line = rawLine.trim();
      if (line === '') continue;
      let parsed;
      try {
        parsed = JSON.parse(line);
      } catch {
        // Deliberately tolerant: a torn or partially written line must not
        // abort the whole retro run.
        continue;
      }
      // An episode is a JSON object; scalars and arrays are not episodes.
      if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
        episodes.push(parsed);
      }
    }
  }
  return episodes;
}
|
|
|
|
// CLI entry point: runs only when the script path given to Node ends with
// /brain-retro-analyzer.mjs (backslashes normalised so Windows paths match).
// Remaining argv entries are the episode JSONL files to analyse; the report is
// printed to stdout as pretty-printed JSON.
if (process.argv[1] && process.argv[1].replace(/\\/g, '/').endsWith('/brain-retro-analyzer.mjs')) {
  const registry = loadRegistry({ useCache: false });
  const classificationMap = buildClassificationMap(registry);
  const dormancy = buildDormancyMap(registry);
  const result = analyze(loadEpisodes(process.argv.slice(2)), { classificationMap, dormancy });
  // process.exit() does not wait for pending stdout writes, which can truncate
  // a large report when stdout is a pipe. Exit from the write callback so the
  // whole JSON document is handed off first.
  process.stdout.write(`${JSON.stringify(result, null, 2)}\n`, () => process.exit(0));
}
|