fix(observer): parser candidates_considered — whitelist filter

extractCandidates грузила в primary_rationale.candidates_considered ЛЮБОЙ нумерованный/маркированный список из ассистентского текста — без семантического фильтра. В топе оказывались куски прозы («Hard-floor работает только для §12 Superpowers …»), шаги процедуры («1. Hard-floor check, 2. Классификация …»), фрагменты кода (regex-паттерны) — не имена узлов реестра. Фикс: при загрузке модуля собираю KNOWN_NODES из tools/observer-known-nodes.txt + ключей observer-chain-map.json + сентинела «direct». После regex-извлечения item нормализуется (срезаются обвязки **, `, _, * и хвостовая пунктуация) и проверяется по: точное имя в реестре ИЛИ #NN (Tooling ID) ИЛИ plugin:skill форма. Если после фильтра осталось <2 элементов — return []. Opt-in тег <!-- reasoning --> остаётся authoritative и идёт мимо фильтра. Триггеры/границы не трогал — их regex уже узкий (Pravila §N / ADR-N / PSR_v1 RN / L-цепочки). Repro-кейсы из живого episodes-2026-05.jsonl добавлены в тесты: prose-bullets, procedure-steps, code-snippet bullets, mixed list, single survivor.
2026-05-23 13:16:42 +03:00
parent c7d61a6adc
commit 4665c537e8
2 changed files with 146 additions and 15 deletions
@@ -15,9 +15,14 @@
 * Per ADR-011 §6 + spec v1.1 §5.2.1.
 */

+import { readFileSync } from 'node:fs';
+import { fileURLToPath } from 'node:url';
+import { dirname, join } from 'node:path';
 import { detectChoiceProvenance, detectAskUserQuestionChoice } from './observer-choice-detector.mjs';
 import { loadChainMap, chainsFor } from './observer-chain-detector.mjs';

+const __dirname = dirname(fileURLToPath(import.meta.url));
+
 let CHAIN_MAP = null;
 try {
  CHAIN_MAP = loadChainMap();
@@ -25,6 +30,67 @@ try {
  CHAIN_MAP = new Map(); // битый/отсутствующий JSON -> chainsFor вернёт null, observer не падает
 }

+/**
+ * Whitelist of router-node names, built once at module load. extractCandidates
+ * uses it (via isKnownNode) to reject free-form prose bullets — analysis text,
+ * procedure steps, code snippets — that the list regexes alone would accept.
+ * Contents:
+ *   - sentinel "direct" (no-skill marker used by node_chosen), always present
+ *   - observer-known-nodes.txt beside this module: one name per line, "#" starts a comment
+ *   - every key of CHAIN_MAP (incl. plugin:skill form)
+ * Tooling IDs (#NN) and plugin:skill shapes are matched by regex, not stored here.
+ */
+const KNOWN_NODES = (() => {
+  const set = new Set(['direct']);
+  try {
+    const txt = readFileSync(join(__dirname, 'observer-known-nodes.txt'), 'utf8');
+    for (const line of txt.split('\n')) {
+      const t = line.replace(/#.*$/, '').trim(); // drop "#…" comment tail and padding
+      if (t) set.add(t);
+    }
+  } catch {
+    // best-effort: any read error (e.g. file missing in a test sandbox) leaves "direct" + chain-map keys
+  }
+  if (CHAIN_MAP) for (const node of CHAIN_MAP.keys()) set.add(node);
+  return set;
+})();
+
+const TOOLING_ID_RE = /^#\d+$/; // "#" followed by one or more digits, nothing else
+const NAMESPACED_SKILL_RE = /^[a-z][a-z0-9-]*:[a-z][a-z0-9-]*(?::[a-z][a-z0-9-]*)?$/; // a:b or a:b:c, lowercase segments
+
+/**
+ * Coerce `s` to a trimmed string, peel matching outer markdown wrappers
+ * (**x**, *x*, `x`, _x_; nested ones too), then strip trailing punctuation.
+ * NOTE(review): single pass, not idempotent — "**x**," comes back as "**x**", not "x".
+ */
+function normalizeCandidate(s) {
+  let t = String(s || '').trim();
+  // peel outer markdown markers while both ends match: **x**, `x`, *x*, _x_
+  while (
+    (t.startsWith('**') && t.endsWith('**') && t.length > 4) ||
+    (t.startsWith('`') && t.endsWith('`') && t.length > 2) ||
+    (t.startsWith('*') && t.endsWith('*') && t.length > 2) ||
+    (t.startsWith('_') && t.endsWith('_') && t.length > 2)
+  ) {
+    if (t.startsWith('**')) t = t.slice(2, -2).trim();
+    else t = t.slice(1, -1).trim();
+  }
+  // drop a trailing run of . , ; : ! ? — – - ; this happens after peeling, so wrappers hidden behind it stay
+  t = t.replace(/[.,;:!?—–-]+$/u, '').trim();
+  return t;
+}
+
+function isKnownNode(raw) { // true when the normalized item looks like a router-node identifier
+  const t = normalizeCandidate(raw);
+  if (!t) return false; // nothing left after normalization
+  if (KNOWN_NODES.has(t)) return true; // exact whitelist hit
+  if (TOOLING_ID_RE.test(t)) return true; // "#NN" tooling ID
+  // namespaced plugin:skill not in the whitelist — accept on shape alone; the regex is
+  // anchored and admits only [a-z0-9-] and ":", so prose containing a colon cannot match.
+  if (NAMESPACED_SKILL_RE.test(t)) return true;
+  return false;
+}
+
 const SUPERPOWERS_PREFIX = 'superpowers:';

 function parseLines(text) {
@@ -375,13 +441,25 @@ export function extractTriggers(turn) {
 const CANDIDATE_NUMBERED_RE = /^\s*\d+[.\)]\s+([^\n]+)$/gm;
 const CANDIDATE_BULLET_RE = /^\s*[-*]\s+([^\n]+)$/gm;

-/** Heuristic candidates: ≥2 numbered (preferred) or bulleted items. */
+/**
+ * Heuristic candidates: numbered items first, bulleted items as fallback; each
+ * list is normalized, filtered through isKnownNode, and returned only if ≥2
+ * items survive (otherwise []). Free-form prose bullets are thereby rejected.
+ * Per the author's note, the opt-in <!-- reasoning --> tag (parseReasoningTag)
+ * is authoritative and bypasses this filter — that path is not in this function.
+ */
 export function extractCandidates(turn) {
  const text = assistantTextOfTurn(turn);
  const numbered = [...text.matchAll(CANDIDATE_NUMBERED_RE)].map((m) => m[1].trim());
-  if (numbered.length >= 2) return numbered;
+  if (numbered.length >= 2) {
+    const filtered = numbered.map(normalizeCandidate).filter(isKnownNode); // NOTE(review): isKnownNode normalizes again; returned strings are only once-normalized
+    if (filtered.length >= 2) return filtered;
+  }
  const bulleted = [...text.matchAll(CANDIDATE_BULLET_RE)].map((m) => m[1].trim());
-  if (bulleted.length >= 2) return bulleted;
+  if (bulleted.length >= 2) {
+    const filtered = bulleted.map(normalizeCandidate).filter(isKnownNode); // same double-normalization caveat as the numbered branch
+    if (filtered.length >= 2) return filtered;
+  }
  return [];
 }