fix(observer): parser candidates_considered — whitelist filter

extractCandidates грузила в primary_rationale.candidates_considered ЛЮБОЙ
нумерованный/маркированный список из ассистентского текста — без
семантического фильтра. В топе оказывались куски прозы («Hard-floor работает
только для §12 Superpowers …»), шаги процедуры («1. Hard-floor check, 2.
Классификация …»), фрагменты кода (regex-паттерны) — не имена узлов реестра.

Фикс: при загрузке модуля собираю KNOWN_NODES из tools/observer-known-nodes.txt
+ ключей observer-chain-map.json + сентинела «direct». После regex-извлечения
item нормализуется (срезаются **/`/_/* обвязки + хвостовая пунктуация) и
проверяется по: точное имя в реестре ИЛИ #NN (Tooling ID) ИЛИ plugin:skill
форма. Если после фильтра <2 элементов — return []. Opt-in <!-- reasoning -->
тег остаётся authoritative и идёт мимо фильтра.

Триггеры/границы не трогал — их regex уже узкий (Pravila §N / ADR-N / PSR_v1
RN / L-цепочки).

Repro-кейсы из живого episodes-2026-05.jsonl добавлены в тесты: prose-bullets,
procedure-steps, code-snippet bullets, mixed list, single survivor.
This commit is contained in:
Дмитрий
2026-05-23 13:16:42 +03:00
parent c7d61a6adc
commit 4665c537e8
2 changed files with 146 additions and 15 deletions
+81 -3
View File
@@ -15,9 +15,14 @@
* Per ADR-011 §6 + spec v1.1 §5.2.1.
*/
import { readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import { dirname, join } from 'node:path';
import { detectChoiceProvenance, detectAskUserQuestionChoice } from './observer-choice-detector.mjs';
import { loadChainMap, chainsFor } from './observer-chain-detector.mjs';
const __dirname = dirname(fileURLToPath(import.meta.url));
let CHAIN_MAP = null;
try {
CHAIN_MAP = loadChainMap();
@@ -25,6 +30,67 @@ try {
CHAIN_MAP = new Map(); // битый/отсутствующий JSON -> chainsFor вернёт null, observer не падает
}
/**
 * Set of router-node names that extractCandidates accepts verbatim. It exists
 * so that free-form list items (analysis prose, procedure steps, code
 * snippets) can be told apart from real node identifiers.
 *
 * Populated once at module load, in this order:
 *   1. the sentinel "direct";
 *   2. every non-empty line of observer-known-nodes.txt (located next to this
 *      module), with anything from "#" to end of line removed and the rest
 *      trimmed;
 *   3. every key of CHAIN_MAP, when CHAIN_MAP is set.
 *
 * Tooling IDs (#NN) and unseen plugin:skill names are not stored here; they
 * are matched separately by regex.
 */
const KNOWN_NODES = (() => {
  const names = new Set(['direct']);

  let listing = '';
  try {
    listing = readFileSync(join(__dirname, 'observer-known-nodes.txt'), 'utf8');
  } catch {
    // The list file may be absent (e.g. in test sandboxes); in that case only
    // the sentinel and the chain-map keys end up in the set.
  }

  listing
    .split('\n')
    .map((row) => row.replace(/#.*$/, '').trim())
    .filter((entry) => entry)
    .forEach((entry) => names.add(entry));

  if (CHAIN_MAP) {
    for (const key of CHAIN_MAP.keys()) {
      names.add(key);
    }
  }
  return names;
})();
// Whole-string match for a Tooling ID: "#" followed by one or more digits.
const TOOLING_ID_RE = /^#\d+$/;
// Whole-string match for a namespaced name of two or three colon-separated
// segments (a:b or a:b:c). Each segment starts with a lowercase ASCII letter
// and continues with lowercase letters, digits or hyphens; whitespace and
// uppercase characters never match.
const NAMESPACED_SKILL_RE = /^[a-z][a-z0-9-]*:[a-z][a-z0-9-]*(?::[a-z][a-z0-9-]*)?$/;
/**
 * Reduce a raw list item to a bare identifier before it is tested against the
 * node whitelist.
 *
 * Two clean-up steps are applied repeatedly until the string stops changing:
 *   - drop trailing punctuation (. , ; : ! ? em/en dash, hyphen);
 *   - peel one layer of matching outer markdown markers: **x**, *x*, `x`, _x_.
 *
 * Running both steps in a loop matters because list items mix them in either
 * order: "**name**." has punctuation outside the markers, "**name.**" has it
 * inside, and "**`name`** —" nests markers under punctuation. A single
 * peel-then-strip pass leaves the markers on the first and third forms.
 *
 * A marker is only peeled when the same marker is present on BOTH ends and
 * there is content between them, so mismatched input such as "**ab*" never
 * loses characters of the name itself.
 *
 * @param {unknown} s - Raw list item; any falsy value is treated as ''.
 * @returns {string} The cleaned-up text (possibly empty).
 */
function normalizeCandidate(s) {
  const trailingPunctuation = /[.,;:!?—–-]+$/u;
  const singleMarkers = ['`', '*', '_'];

  let t = String(s || '').trim();
  let previous;
  do {
    previous = t;

    // Punctuation first, so that markers hidden behind it become the suffix.
    t = t.replace(trailingPunctuation, '').trim();

    if (t.length > 4 && t.startsWith('**') && t.endsWith('**')) {
      // Bold: both ends must carry the two-character marker.
      t = t.slice(2, -2).trim();
    } else if (
      t.length > 2 &&
      singleMarkers.some((mark) => t.startsWith(mark) && t.endsWith(mark))
    ) {
      // Italic / code / underscore emphasis: one character on each side.
      t = t.slice(1, -1).trim();
    }
  } while (t !== previous);

  return t;
}
/**
 * Decide whether a raw list item names a router node.
 *
 * The item is normalized first; an empty result is rejected. Otherwise it is
 * accepted when any of the following holds:
 *   - it is a member of KNOWN_NODES;
 *   - it matches TOOLING_ID_RE (a "#NN" Tooling ID);
 *   - it matches NAMESPACED_SKILL_RE — a plugin:skill shaped name that is not
 *     in the registry yet. The pattern allows no whitespace, so a prose bullet
 *     that merely contains a colon does not qualify.
 *
 * @param {unknown} raw - List item text as extracted from the assistant turn.
 * @returns {boolean} true when the item is accepted as a node identifier.
 */
function isKnownNode(raw) {
  const name = normalizeCandidate(raw);
  return (
    name !== '' &&
    (KNOWN_NODES.has(name) ||
      TOOLING_ID_RE.test(name) ||
      NAMESPACED_SKILL_RE.test(name))
  );
}
const SUPERPOWERS_PREFIX = 'superpowers:';
function parseLines(text) {
@@ -375,13 +441,25 @@ export function extractTriggers(turn) {
// Numbered list item, matched per line (g + m flags): optional leading
// whitespace, one or more digits, "." or ")", whitespace, then the item text
// up to the end of the line in capture group 1.
const CANDIDATE_NUMBERED_RE = /^\s*\d+[.\)]\s+([^\n]+)$/gm;
// Bulleted list item, matched per line (g + m flags): optional leading
// whitespace, "-" or "*", whitespace, then the item text up to the end of the
// line in capture group 1.
const CANDIDATE_BULLET_RE = /^\s*[-*]\s+([^\n]+)$/gm;
/**
 * Heuristic candidates: ≥2 numbered (preferred) or bulleted list items from
 * the assistant text of the turn, filtered to router-node identifiers (see
 * isKnownNode). Free-form prose bullets are rejected — they belong in the
 * assistant's narrative, not in primary_rationale.candidates_considered.
 *
 * The numbered list is tried first. If it has fewer than two items, or fewer
 * than two of its items survive the filter, the bulleted list is tried the
 * same way. Items are returned in normalized form (markdown wrappers and
 * trailing punctuation removed).
 *
 * The unfiltered list is never returned: an early `return numbered` /
 * `return bulleted` ahead of the filter would make the whitelist unreachable.
 *
 * The opt-in <!-- reasoning --> tag (parseReasoningTag) bypasses this filter;
 * that channel is authoritative.
 *
 * @param {unknown} turn - Turn object; passed as-is to assistantTextOfTurn.
 * @returns {string[]} At least two normalized node identifiers, or [].
 */
export function extractCandidates(turn) {
  const text = assistantTextOfTurn(turn);

  // Order encodes preference: numbered lists win over bulleted ones.
  for (const pattern of [CANDIDATE_NUMBERED_RE, CANDIDATE_BULLET_RE]) {
    const items = [...text.matchAll(pattern)].map((m) => m[1].trim());
    if (items.length < 2) continue;

    const filtered = items.map(normalizeCandidate).filter(isKnownNode);
    if (filtered.length >= 2) return filtered;
  }
  return [];
}