portal/tools/router-embedding.mjs

#!/usr/bin/env node
/**
 * Router embedding layer (Phase 2 Task 12, spec §4.3).
 *
 * Computes 384-dim sentence embeddings via Xenova/all-MiniLM-L6-v2 for
 * NON-trivial classified episodes. Trivial task types (conversation / micro /
 * manual_override) are skipped — semantic search on "да" or "спасибо" is
 * wasted compute.
 *
 * Storage: base64-encoded Float32Array (~2050 chars per 384-dim vector).
 * Stored on the episode as `prompt_embedding_base64` (Phase 3 parser writes).
 *
 * Fallback: model load or inference failure → embed() returns null. Caller
 * marks `environment.embedding_unavailable = true` on the episode (parser).
 *
 * Lazy load: @xenova/transformers is heavy (native ONNX runtime, ~50 MB). The
 * pipeline is created on the first embed() call and cached; the dedicated
 * `tools/router-embedding-warmup.mjs` hook fires this on SessionStart so the
 * first real prompt doesn't pay the cold-start cost.
 */

import { Buffer } from 'buffer';

// Task-type labels for which shouldEmbed() answers "no": these are treated as
// too trivial to be worth running the embedding model on.
const EMBED_EXEMPT_TASK_TYPES = new Set([
  'conversation',
  'micro',
  'manual_override',
]);

// Model identifier handed to the feature-extraction pipeline.
const EMBEDDING_MODEL = 'Xenova/all-MiniLM-L6-v2';

/**
 * Decide whether an episode with the given task type should get an embedding.
 *
 * @param {unknown} taskType - Classified task-type label.
 * @returns {boolean} `false` for anything that is not a non-empty string, and
 *   for labels listed in EMBED_EXEMPT_TASK_TYPES; `true` for every other label.
 */
export function shouldEmbed(taskType) {
  // Checked first so non-string / empty input never reaches the Set lookup.
  const isUsableLabel = typeof taskType === 'string' && taskType.length > 0;
  return isUsableLabel && !EMBED_EXEMPT_TASK_TYPES.has(taskType);
}

/**
 * Serialize a typed array's raw bytes as a base64 string.
 *
 * Only the window described by the view's `byteOffset` / `byteLength` is
 * encoded, so a subarray of a larger buffer encodes just its own elements.
 *
 * @param {Float32Array} arr - Typed-array view to serialize.
 * @returns {string} Base64 text of the view's bytes.
 */
export function encodeBase64(arr) {
  const { buffer, byteOffset, byteLength } = arr;
  // Buffer.from(ArrayBuffer, offset, length) wraps the memory without copying.
  const bytes = Buffer.from(buffer, byteOffset, byteLength);
  return bytes.toString('base64');
}

/**
 * Decode a base64 string (as produced by encodeBase64) into a Float32Array.
 *
 * The decoded bytes are copied into a fresh ArrayBuffer sized exactly to the
 * result. `Buffer.from(string, 'base64')` may hand back a slice of Node's
 * shared internal Buffer pool; building the Float32Array directly over that
 * memory would (a) expose unrelated pool bytes through `result.buffer`,
 * (b) keep the whole pool slab alive for as long as the vector is held, and
 * (c) depend on the pool offset happening to be 4-byte aligned.
 *
 * Trailing bytes that do not make up a whole 32-bit float are dropped.
 *
 * @param {string} b64 - Base64-encoded float32 bytes.
 * @returns {Float32Array} Vector backed by its own buffer (`byteOffset === 0`).
 */
export function decodeBase64(b64) {
  const bytes = Buffer.from(b64, 'base64');
  const elementCount = Math.floor(bytes.byteLength / Float32Array.BYTES_PER_ELEMENT);
  const usableBytes = elementCount * Float32Array.BYTES_PER_ELEMENT;
  // ArrayBuffer#slice copies, giving an owned buffer whose offset 0 is always aligned.
  const owned = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + usableBytes);
  return new Float32Array(owned);
}

// Memoized pipeline load. null means no load is cached: either none has been
// started yet, or the most recent attempt rejected and was cleared.
let _pipelinePromise = null;

/**
 * Return the feature-extraction pipeline, starting the load on first use.
 *
 * The transformers module is imported dynamically so the cost is paid only
 * when an embedding is actually requested. Concurrent callers share the same
 * underlying load.
 *
 * @returns {Promise<Function>} Resolves to the pipeline callable; rejects if
 *   the module import or pipeline construction fails.
 */
async function getPipeline() {
  if (!_pipelinePromise) {
    const loading = import('@xenova/transformers').then(
      (mod) => mod.pipeline('feature-extraction', EMBEDDING_MODEL),
    );
    _pipelinePromise = loading;
    // A failed load clears the cache so the next call retries instead of
    // replaying the same rejection forever.
    loading.catch(() => {
      _pipelinePromise = null;
    });
  }
  return _pipelinePromise;
}

/**
 * Embed a prompt with the lazily loaded feature-extraction pipeline.
 *
 * The pipeline is invoked with mean pooling and normalization enabled, and
 * its `data` is copied into a new Float32Array.
 *
 * @param {string} prompt - Text to embed.
 * @returns {Promise<Float32Array|null>} The embedding, or null when loading
 *   the pipeline or running inference throws. Never rejects; callers must
 *   handle the null case.
 */
export async function embed(prompt) {
  let vector = null;
  try {
    const extractor = await getPipeline();
    const { data } = await extractor(prompt, { pooling: 'mean', normalize: true });
    vector = new Float32Array(data);
  } catch {
    // Deliberate best-effort: any failure is reported to the caller as null.
    vector = null;
  }
  return vector;
}