Files
portal/tools/router-accuracy-runner.mjs
T
Дмитрий 808461295a feat(router): Sonnet classifier + памятка + regex-fallback module (phase 2 task 10)
Phase 2 Task 10 of LLM-first router overhaul. Spec §4.2 — Layer 2 Sonnet 4.6
classifier with 4-pattern памятка enrichment, JSON output per spec, fallback
chain Sonnet → regex → degraded. Phase 1 regex Layer 1 extracted to its own
module so it can be called only as a fallback.

- tools/router-classifier-regex-fallback.mjs (NEW): self-contained regex
  fallback. Extracts TASK_TYPE_KEYWORDS, HARD_KEYWORD_STEMS, detectTaskType,
  keywordMatches, detectRecommendedNode, computeConfidence, classifyByRegex
  verbatim from the prior classifier. Self-contained (own MICRO_KEYWORDS,
  detectMicro, lower) — no circular imports.
- tools/router-classifier.mjs (REWRITE):
  + import { CLASSIFIER_MODEL } from router-config.mjs
  + re-export { classifyByRegex } from regex-fallback (back-compat surface)
  + buildClassifierPrompt(prompt, registry, { enrichment=true }) — spec §4.2
    format with 4-pattern памятка (brainstorming / discovery-interview /
    writing-plans / systematic-debugging) togglable via enrichment flag.
  + parseClassifierResponse(text) — strict task_type required, ```json fence
    aware, accepts null recommended_chain_id.
  + classify() rewritten: prefilter → cache → Sonnet (CLASSIFIER_MODEL) →
    regex fallback (transport error OR no key/unparseable).
  + callAnthropicAPI default model = CLASSIFIER_MODEL; max_tokens 300 → 1500
    (full classifier output with alternatives & памятка needs the budget).
  - removed: shouldEscalate, TASK_TYPE_KEYWORDS, detectTaskType,
    keywordMatches, detectRecommendedNode, HARD_KEYWORD_STEMS, computeConfidence
    (all live in regex-fallback now).
  Kept legacy: buildLLMPrompt / parseLLMResponse (back-compat surface).
- tools/router-accuracy-runner.mjs: import classifyByRegex from regex-fallback
  module (G11 from plan). Runner functionality unchanged.
- tools/router-classifier.test.mjs: +8 tests for buildClassifierPrompt (4) and
  parseClassifierResponse (4); removed obsolete shouldEscalate block (3);
  rewrote classify integration block (4 tests) to reflect new flow
  (prefilter-first, LLM-always-on-fallthrough, regex on error).

Tests: tools/router-classifier.test.mjs 44/44 PASS. Full tools/ suite:
557 tests passed, 0 failed (4 pre-existing empty test files report
"no test suite found" — unrelated: ruflo-recall-hook, subagent-prompt-prefix,
plus 2 others — not touched in this commit).
accuracy-runner smoke: type=85%/node=55%/micro=100% on the 20-prompt set,
unchanged from pre-Task-10 baseline (regex path semantics preserved).
2026-05-25 14:28:25 +03:00

56 lines
2.3 KiB
JavaScript

#!/usr/bin/env node
/**
* Accuracy runner — прогоняет 20 промптов через classifier (без LLM, regex only)
* и выдаёт отчёт «правильно/неправильно» по каждому пункту.
*
* Использовать перед регистрацией router-prehook в settings.json.
*/
import { readFileSync } from 'fs';
import { classifyByRegex } from './router-classifier-regex-fallback.mjs';
import { loadRegistry } from './registry-load.mjs';
/**
 * Runs the regex-only accuracy check and terminates the process with its verdict.
 *
 * Reads a JSON prompt set from the path given as the first CLI argument
 * (default: `tools/router-test-prompts.json`), classifies every prompt with
 * `classifyByRegex`, and compares the result's `taskType`, `recommendedNode`
 * and `micro` with the prompt's `expectedType`, `expectedNode` and
 * `expectedMicro`. Per-field accuracy and the list of mismatching prompts are
 * printed to stdout.
 *
 * Exit code: 0 when the combined accuracy over all three fields is at least
 * 75%, otherwise 1. A prompt file that does not contain a non-empty `prompts`
 * array is reported on stderr and also exits with 1, instead of printing a
 * report full of `NaN%` or crashing with an unrelated TypeError.
 *
 * @returns {void}
 * @throws {Error} When the prompt file cannot be read.
 * @throws {SyntaxError} When the prompt file is not valid JSON.
 */
function main() {
  const passThreshold = 0.75;
  const promptsFile = process.argv[2] || 'tools/router-test-prompts.json';
  const data = JSON.parse(readFileSync(promptsFile, 'utf-8'));
  const prompts = data?.prompts;

  // Without at least one prompt every ratio below would be 0/0; fail loudly
  // with the same exit code (1) the script produced for this input before.
  if (!Array.isArray(prompts) || prompts.length === 0) {
    console.error(`No prompts to evaluate in ${promptsFile}: expected a non-empty "prompts" array.`);
    process.exit(1);
    return; // only reached when process.exit is stubbed
  }

  // The registry is loaded with the cache option explicitly disabled.
  const registry = loadRegistry({ useCache: false });
  const total = prompts.length;
  let correctType = 0;
  let correctNode = 0;
  let correctMicro = 0;
  const failures = [];

  for (const p of prompts) {
    const r = classifyByRegex(p.text, registry);
    const typeOk = r.taskType === p.expectedType;
    const nodeOk = r.recommendedNode === p.expectedNode;
    const microOk = r.micro === p.expectedMicro;
    if (typeOk) correctType++;
    if (nodeOk) correctNode++;
    if (microOk) correctMicro++;
    // A prompt is listed as a failure when any of the three fields differs.
    if (!typeOk || !nodeOk || !microOk) {
      failures.push({
        prompt: p.text,
        expected: { type: p.expectedType, node: p.expectedNode, micro: p.expectedMicro },
        actual: { type: r.taskType, node: r.recommendedNode, micro: r.micro },
      });
    }
  }

  // Formats a hit count as "hits/total = NN.N%".
  const ratio = (hits) => `${hits}/${total} = ${(hits / total * 100).toFixed(1)}%`;

  console.log('=== Accuracy Report ===');
  console.log(`Type accuracy: ${ratio(correctType)}`);
  console.log(`Node accuracy: ${ratio(correctNode)}`);
  console.log(`Micro accuracy: ${ratio(correctMicro)}`);
  console.log('');
  console.log(`Failures (${failures.length}):`);
  for (const f of failures) {
    console.log(` «${f.prompt}»`);
    console.log(` expected: type=${f.expected.type}, node=${f.expected.node}, micro=${f.expected.micro}`);
    console.log(` actual: type=${f.actual.type}, node=${f.actual.node}, micro=${f.actual.micro}`);
  }

  // The verdict pools all three fields: share of correct answers out of
  // 3 * total comparisons.
  const passOverall = (correctType + correctNode + correctMicro) / (total * 3);
  process.exit(passOverall >= passThreshold ? 0 : 1);
}
// Script entry point: run the accuracy check as soon as the module is executed.
main();