/**
 * Observer episode embedding index (Pass 4 of project-brain-factor-analysis-4passes).
 *
 * Pure module: given a list of episodes carrying `prompt_embedding_base64` and a
 * resolved `_inferredOutcome`, build an in-memory index, find top-k cosine
 * neighbours for a target embedding, and report the majority outcome family
 * (success / retry / failure / mixed / no_neighbors).
 *
 * Embeddings produced by router-embedding.mjs are mean-pooled AND L2-normalized,
 * so cosine similarity collapses to a plain dot product. We still defend the
 * generic formula (denominator) here for robustness against legacy / hand-crafted
 * test vectors.
 *
 * Security Guidance #40: pure parsing — no exec/execSync.
 */
import { Buffer } from 'buffer';
import { decodeBase64 } from './router-embedding.mjs';

// Raw episode outcome -> coarse outcome family used by the neighbour vote.
const OUTCOME_TO_FAMILY = {
  success: 'success',
  soft_success: 'success',
  rework: 'retry',
  blocked: 'failure',
  partial: 'failure',
};

/**
 * Map a raw outcome label to its outcome family.
 *
 * @param {*} outcome - Raw outcome label (e.g. 'soft_success').
 * @returns {string|null} 'success' | 'retry' | 'failure', or null when the
 *   value is not a non-empty string or is not a known outcome.
 */
export function mapOutcomeToFamily(outcome) {
  if (!outcome || typeof outcome !== 'string') return null;
  // Own-property check: a bare `OUTCOME_TO_FAMILY[outcome]` also resolves
  // members inherited from Object.prototype ('constructor', 'toString',
  // '__proto__', ...), which would leak a function/object out as a "family".
  if (!Object.prototype.hasOwnProperty.call(OUTCOME_TO_FAMILY, outcome)) return null;
  return OUTCOME_TO_FAMILY[outcome];
}

/**
 * Cosine similarity of two equal-length numeric vectors.
 *
 * @param {ArrayLike<number>} a
 * @param {ArrayLike<number>} b
 * @returns {number} The cosine similarity, or 0 for any degenerate input:
 *   missing / empty vectors, length mismatch, a zero-norm vector, or a
 *   non-finite result.
 */
export function cosineSimilarity(a, b) {
  if (!a || !b) return 0;
  if (a.length === 0 || b.length === 0) return 0;
  if (a.length !== b.length) return 0;

  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  if (normA === 0 || normB === 0) return 0;

  const similarity = dot / (Math.sqrt(normA) * Math.sqrt(normB));
  // NaN / Infinity components would otherwise yield NaN here, and a NaN score
  // makes the ranking comparator in findNearestNeighbors inconsistent.
  return Number.isFinite(similarity) ? similarity : 0;
}

/**
 * Decode a base64 embedding, returning null instead of throwing on bad input.
 *
 * @param {*} b64 - Candidate base64 string.
 * @returns {Float32Array|null} A non-empty vector of finite values, or null
 *   when the input is not a usable embedding.
 */
function safeDecode(b64) {
  if (!b64 || typeof b64 !== 'string') return null;
  try {
    // Node's Buffer.from('garbage', 'base64') silently strips invalid chars and
    // truncates — won't throw on 'not-base64!!!' style input. Guard explicitly
    // by checking the byte length is a positive multiple of 4 (Float32 width).
    const buf = Buffer.from(b64, 'base64');
    if (buf.byteLength === 0 || buf.byteLength % 4 !== 0) return null;

    const vector = decodeBase64(b64);
    if (!(vector instanceof Float32Array) || vector.length === 0) return null;
    for (let i = 0; i < vector.length; i++) {
      if (!Number.isFinite(vector[i])) return null;
    }
    return vector;
  } catch {
    // Deliberate best-effort: an undecodable embedding just means "skip".
    return null;
  }
}

/**
 * Episode dedupe key — identifies a single turn uniquely. In real episodes
 * `task_id` is the SESSION id (shared across turns), so task_id alone is not
 * a turn identifier. Pairing with started_at gives the same key shape that
 * dedupeEpisodes uses in the analyzer.
 *
 * @param {object|null|undefined} ep - Episode record.
 * @returns {string} `task_id|started_at` (missing parts become ''), or '' when
 *   `ep` is falsy.
 */
export function episodeKey(ep) {
  if (!ep) return '';
  const startedAt = (ep.timestamps || {}).started_at || '';
  return `${ep.task_id || ''}|${startedAt}`;
}

/**
 * Build an index from episodes carrying a base64 embedding AND a resolved
 * outcome family. Episodes lacking either are silently skipped — they
 * cannot teach the neighbour lookup anything.
 *
 * @param {Iterable<object>|null|undefined} episodes - Episode records.
 * @returns {Array<{task_id: *, started_at: *, key: string, family: string,
 *   embedding: Float32Array}>} One entry per usable episode, in input order.
 */
export function buildIndex(episodes) {
  const index = [];
  for (const ep of episodes || []) {
    const family = mapOutcomeToFamily(ep && ep._inferredOutcome);
    if (!family) continue;

    const embedding = safeDecode(ep && ep.prompt_embedding_base64);
    if (!embedding) continue;

    index.push({
      task_id: ep.task_id || null,
      started_at: (ep.timestamps || {}).started_at || null,
      key: episodeKey(ep),
      family,
      embedding,
    });
  }
  return index;
}

/**
 * Return the top-k index entries by cosine similarity to `target`, in
 * descending order. Self-exclusion is by composite key (task_id|started_at)
 * since task_id alone is the session id (shared across turns). Legacy
 * `excludeTaskId` option kept for callers that still pass task-unique ids;
 * `excludeKey` overrides it. Empty / null inputs → [].
 *
 * @param {Float32Array} target - Query embedding.
 * @param {Array<object>} index - Entries as produced by buildIndex.
 * @param {number} [k] - Maximum number of neighbours. Omitted → every
 *   candidate is returned; zero, negative or NaN → [].
 * @param {{excludeKey?: string, excludeTaskId?: string}} [options]
 * @returns {Array<object>} Shallow copies of the matching entries, each with
 *   an added numeric `similarity` field.
 */
export function findNearestNeighbors(target, index, k, options = {}) {
  if (!(target instanceof Float32Array) || target.length === 0) return [];
  if (!Array.isArray(index) || index.length === 0) return [];

  // `k === undefined` keeps the historical "return everything" behaviour of
  // slice(0, undefined). Any other value is clamped to a non-negative integer
  // so a negative k can no longer become slice(0, -n) and hand back most of
  // the index.
  const limit = k === undefined ? Infinity : Math.max(0, Math.trunc(Number(k)) || 0);
  if (limit === 0) return [];

  // Tolerate an explicit `null` options argument (the default only covers
  // `undefined`).
  const opts = options || {};
  const excludeKey = opts.excludeKey || null;
  const excludeTaskId = opts.excludeTaskId || null;

  const scored = [];
  for (const entry of index) {
    if (excludeKey) {
      if (entry.key === excludeKey) continue;
    } else if (excludeTaskId && entry.task_id === excludeTaskId) {
      continue;
    }
    scored.push({ ...entry, similarity: cosineSimilarity(target, entry.embedding) });
  }

  scored.sort((a, b) => b.similarity - a.similarity);
  return scored.slice(0, limit);
}

/**
 * Return the dominant family across `neighbors`, or 'mixed' on a tie at the
 * top, or 'no_neighbors' on empty input. The three known families are
 * success / retry / failure (plus the synthetic mixed / no_neighbors).
 *
 * @param {Array<{family?: string}>} neighbors - Entries carrying a `family`.
 * @returns {string} The winning family, 'mixed', or 'no_neighbors'.
 */
export function majorityOutcome(neighbors) {
  if (!Array.isArray(neighbors) || neighbors.length === 0) return 'no_neighbors';

  // Null-prototype accumulator: with a plain `{}` a family such as
  // 'constructor' would read an inherited member as its starting count and
  // corrupt the tally.
  const counts = Object.create(null);
  for (const neighbor of neighbors) {
    const family = neighbor && neighbor.family;
    if (!family) continue;
    counts[family] = (counts[family] || 0) + 1;
  }

  const entries = Object.entries(counts);
  if (entries.length === 0) return 'no_neighbors';

  const top = Math.max(...entries.map(([, n]) => n));
  const winners = entries.filter(([, n]) => n === top);
  return winners.length > 1 ? 'mixed' : winners[0][0];
}