// portal/tools/observer-embedding-index.mjs

/**
 * Observer episode embedding index (Pass 4 of project-brain-factor-analysis-4passes).
 *
 * Pure module: given a list of episodes carrying `prompt_embedding_base64` and a
 * resolved `_inferredOutcome`, build an in-memory index, find top-k cosine
 * neighbours for a target embedding, and report the majority outcome family
 * (success / retry / failure / mixed / no_neighbors).
 *
 * Embeddings produced by router-embedding.mjs are mean-pooled AND L2-normalized,
 * so cosine similarity collapses to a plain dot product. We still defend the
 * generic formula (denominator) here for robustness against legacy / hand-crafted
 * test vectors.
 *
 * Security Guidance #40: pure parsing — no exec/execSync.
 */
import { Buffer } from 'buffer';
import { decodeBase64 } from './router-embedding.mjs';

/**
 * Raw inferred-outcome label → coarse outcome family. Labels that are not
 * listed here have no family.
 */
const OUTCOME_TO_FAMILY = {
  success: 'success',
  soft_success: 'success',
  rework: 'retry',
  blocked: 'failure',
  partial: 'failure',
};

/**
 * Translate an inferred outcome label into its outcome family.
 *
 * @param {*} outcome - Outcome label. Anything that is not a non-empty
 *   string yields null.
 * @returns {string|null} 'success', 'retry' or 'failure'; null when the
 *   label is missing, not a string, or not a known outcome.
 */
export function mapOutcomeToFamily(outcome) {
  if (!outcome || typeof outcome !== 'string') return null;
  // Own-property check: a bare `OUTCOME_TO_FAMILY[outcome]` lookup also sees
  // inherited Object.prototype members, so labels such as 'constructor',
  // 'toString' or '__proto__' would come back as functions/objects rather
  // than null.
  if (!Object.prototype.hasOwnProperty.call(OUTCOME_TO_FAMILY, outcome)) return null;
  return OUTCOME_TO_FAMILY[outcome];
}

/**
 * Cosine similarity of two equal-length numeric vectors.
 *
 * The full formula (dot product over the product of the two norms) is used,
 * so the inputs do not have to be unit-length.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} The similarity, or 0 when either vector is missing,
 *   empty, of a different length than the other, or has zero magnitude.
 */
export function cosineSimilarity(a, b) {
  if (!a || !b || a.length === 0 || b.length === 0 || a.length !== b.length) {
    return 0;
  }

  const dims = a.length;
  let dotProduct = 0;
  let squaredA = 0;
  let squaredB = 0;
  let i = 0;
  while (i < dims) {
    const x = a[i];
    const y = b[i];
    dotProduct += x * y;
    squaredA += x * x;
    squaredB += y * y;
    i += 1;
  }

  // A zero-magnitude vector has no direction; avoid dividing by zero.
  if (squaredA === 0 || squaredB === 0) return 0;

  const denominator = Math.sqrt(squaredA) * Math.sqrt(squaredB);
  return dotProduct / denominator;
}

/**
 * Decode a base64 embedding string, returning null instead of throwing.
 *
 * @param {*} b64 - Candidate base64 payload.
 * @returns {Float32Array|null} The decoded vector, or null when the input is
 *   not a non-empty string, the decoded byte count is zero or not a multiple
 *   of 4, `decodeBase64` does not produce a non-empty Float32Array, any
 *   component is NaN/±Infinity, or decoding throws.
 */
function safeDecode(b64) {
  if (typeof b64 !== 'string' || b64 === '') return null;
  try {
    // Buffer.from(..., 'base64') is lenient: it drops characters outside the
    // alphabet rather than throwing. As a sanity check, require the decoded
    // payload to be a whole, non-zero number of 4-byte (Float32) words.
    const rawByteCount = Buffer.from(b64, 'base64').byteLength;
    const isWholeFloat32Run = rawByteCount > 0 && rawByteCount % 4 === 0;
    if (!isWholeFloat32Run) return null;

    const vector = decodeBase64(b64);
    if (!(vector instanceof Float32Array)) return null;
    if (vector.length === 0) return null;

    // Reject vectors carrying NaN or ±Infinity components.
    return vector.every((component) => Number.isFinite(component)) ? vector : null;
  } catch {
    // Best-effort decode: any failure means "no usable embedding".
    return null;
  }
}

/**
 * Composite dedupe key for one episode: `<task_id>|<started_at>`.
 *
 * `task_id` on real episodes is the SESSION id and is shared by every turn
 * of that session, so it cannot identify a turn by itself. Combining it with
 * `timestamps.started_at` yields the same key shape that dedupeEpisodes uses
 * in the analyzer.
 *
 * @param {object|null|undefined} ep - Episode record.
 * @returns {string} '' for a missing episode; otherwise the two parts joined
 *   by '|', with each missing/falsy part rendered as an empty string.
 */
export function episodeKey(ep) {
  if (!ep) return '';
  const sessionId = ep.task_id || '';
  const timestamps = ep.timestamps || {};
  const startedAt = timestamps.started_at || '';
  return `${sessionId}|${startedAt}`;
}

/**
 * Turn a list of episodes into neighbour-lookup index entries.
 *
 * An episode contributes an entry only when BOTH hold:
 *   - `_inferredOutcome` maps to an outcome family, and
 *   - `prompt_embedding_base64` decodes to a usable vector.
 * Everything else (including null items) is skipped without error, since it
 * has nothing to offer the neighbour lookup.
 *
 * @param {Iterable<object>|null|undefined} episodes - Episode records.
 * @returns {Array<{task_id: *, started_at: *, key: string, family: string,
 *   embedding: Float32Array}>} Entries in input order; `task_id` and
 *   `started_at` are null when absent.
 */
export function buildIndex(episodes) {
  const entries = [];
  const source = episodes || [];
  for (const episode of source) {
    const outcomeFamily = mapOutcomeToFamily(episode && episode._inferredOutcome);
    if (outcomeFamily) {
      const vector = safeDecode(episode && episode.prompt_embedding_base64);
      if (vector) {
        const taskId = episode.task_id || null;
        const timestamps = episode.timestamps || {};
        const startedAt = timestamps.started_at || null;
        entries.push({
          task_id: taskId,
          started_at: startedAt,
          key: episodeKey(episode),
          family: outcomeFamily,
          embedding: vector,
        });
      }
    }
  }
  return entries;
}

/**
 * Return the top-k index entries by cosine similarity to `target`, in
 * descending similarity order. Each result is a shallow copy of the index
 * entry with an added `similarity` field; the index itself is not modified.
 *
 * Self-exclusion is by composite key (task_id|started_at), since task_id
 * alone is the session id (shared across turns). The legacy `excludeTaskId`
 * option is kept for callers that still pass task-unique ids; when
 * `excludeKey` is given it takes precedence and `excludeTaskId` is ignored.
 *
 * @param {Float32Array} target - Query embedding.
 * @param {Array<object>} index - Entries as produced by buildIndex.
 * @param {number} [k] - Maximum number of neighbours. A negative or zero
 *   value yields []; leaving it undefined returns every scored entry.
 * @param {{excludeKey?: string, excludeTaskId?: string}|null} [options]
 *   - Exclusion filters; null is treated like an empty object.
 * @returns {Array<object>} Up to k scored entries; [] when `target` is not a
 *   non-empty Float32Array or `index` is not a non-empty array.
 */
export function findNearestNeighbors(target, index, k, options = {}) {
  if (!(target instanceof Float32Array) || target.length === 0) return [];
  if (!Array.isArray(index) || index.length === 0) return [];
  // Array.prototype.slice treats a negative end as an offset from the tail,
  // so passing a negative k straight through would return "all but the last
  // |k|" neighbours. A non-positive limit always means "no neighbours".
  if (typeof k === 'number' && k <= 0) return [];

  // The `= {}` default only covers undefined; tolerate an explicit null too.
  const opts = options || {};
  const excludeKey = opts.excludeKey || null;
  const excludeTaskId = opts.excludeTaskId || null;

  const scored = [];
  for (const entry of index) {
    if (excludeKey) {
      if (entry.key === excludeKey) continue;
    } else if (excludeTaskId && entry.task_id === excludeTaskId) {
      // Legacy session-level exclusion, only honoured without excludeKey.
      continue;
    }
    scored.push({ ...entry, similarity: cosineSimilarity(target, entry.embedding) });
  }
  scored.sort((left, right) => right.similarity - left.similarity);
  return scored.slice(0, k);
}

/**
 * Return the dominant family across `neighbors`.
 *
 * The three real families are success / retry / failure; two synthetic
 * results are also possible: 'mixed' when two or more families tie for the
 * highest count, and 'no_neighbors' when the input is empty, not an array,
 * or no neighbour carries a family.
 *
 * @param {Array<{family?: string}|null>} neighbors - Scored neighbours.
 * @returns {string} The winning family, 'mixed', or 'no_neighbors'.
 */
export function majorityOutcome(neighbors) {
  if (!Array.isArray(neighbors) || neighbors.length === 0) return 'no_neighbors';

  // Null-prototype tally: on a plain `{}` a family named like an inherited
  // Object.prototype member ('constructor', 'toString', '__proto__', ...)
  // would read a function/object as its starting count, corrupting the vote
  // and, when it is the only family, crashing on the winner lookup.
  const counts = Object.create(null);
  for (const neighbor of neighbors) {
    const family = neighbor && neighbor.family;
    if (!family) continue;
    counts[family] = (counts[family] || 0) + 1;
  }

  const tallies = Object.entries(counts);
  if (tallies.length === 0) return 'no_neighbors';

  let topCount = 0;
  for (const [, count] of tallies) {
    if (count > topCount) topCount = count;
  }
  const winners = tallies.filter(([, count]) => count === topCount);
  return winners.length === 1 ? winners[0][0] : 'mixed';
}