Files
portal/tools/llm-judge-per-tool.mjs
T
Дмитрий 8a7144892c fix(router-gate-v4): calibrate per-tool LLM-judge — calibration 4 soft user-prompt fallback
The per-tool judge compares each mutating tool call against the classifier's
distilled task summary read from router-state. That summary is lossy and
frequently "(unknown)" even for a perfectly explicit user request — and with an
unknown task the judge has nothing to compare against, so "Сомнения → NO"
blocked every real edit. This was reproduced repeatedly this session: an explicit
"реализуй ... main() ..." prompt was still classified as unknown, so all edits
were blocked, including the fix to the judge itself. Calibration 2 (allow on
unknown) was rejected by the owner as a discipline hole.

Calibration 4 (soft, scope-preserving): when — and only when — the classifier
summary is "(unknown)"/empty, fall back to judging against the user's actual
last prompt (the ground-truth request) instead of nothing. The judge still runs
and still blocks on doubt; it just uses better evidence. When the summary is
meaningful, behaviour is unchanged (the user-prompt reader is not consulted).
When both summary and prompt are unavailable, the task stays "(unknown)" and
doubt→block is preserved.

NOT calibration 2: this does not blindly allow on unknown — it re-grounds the
judge in the literal user request, which the controller cannot fabricate (the
user writes it; it is read locally from the session transcript).

- tools/llm-judge-per-tool.mjs: resolveEffectiveTask(declaredTask, lastUserPrompt).
- tools/enforce-llm-judge-per-tool.mjs: runPerTool reads the last user prompt
  (helpers.lastUserPromptText + readTranscript) only on an unknown summary;
  main() binds it.

Regression: judge tests 57/57 GREEN; full tools suite 1951 passed | 2 skipped.
The 6 remaining failures are uncommitted point-2 WIP in
enforce-parallel-session-lock.test.mjs — not part of this change, not committed.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 10:34:27 +03:00

172 lines
6.7 KiB
JavaScript

// tools/llm-judge-per-tool.mjs
/**
* llm-judge-per-tool — PreToolUse judge on mutating tools (v4.1 §4.7).
* Asks a single Sonnet judge: "is this tool call consistent with the declared
* user task and recommended skill scope?" NO → block. Doubt (null) → block.
* No key / budget exhausted → degraded allow + flag (fail-open on LLM layer).
*/
import { JUDGE_MODELS } from './llm-judge.mjs';
/**
 * Render a value as text capped at `n` UTF-16 code units.
 *
 * Strings are used as-is; any other value is JSON-serialized, with
 * null/undefined rendered as "{}". Values JSON cannot represent (circular
 * structures and BigInt throw; functions and symbols serialize to `undefined`)
 * fall back to their `Object.prototype.toString` tag so this best-effort
 * summarizer never throws.
 *
 * @param {*} s - value to summarize.
 * @param {number} n - maximum length kept before the ellipsis is appended.
 * @returns {string} the text, followed by '…' when it was cut.
 */
function truncate(s, n) {
  let str;
  if (typeof s === 'string') {
    str = s;
  } else {
    try {
      str = JSON.stringify(s ?? {});
    } catch {
      // Circular structure or BigInt: handled by the fallback below.
      str = undefined;
    }
    // JSON.stringify yields undefined (not a string) for functions and symbols.
    if (typeof str !== 'string') str = Object.prototype.toString.call(s);
  }
  return str.length > n ? str.slice(0, n) + '…' : str;
}
/**
 * Compose the prompt the judge receives for one mutating tool call.
 *
 * The text lists the declared task summary, the recommended skill and chain,
 * the tool name and a summary of the tool input capped at 500 characters,
 * followed by the YES/NO consistency question. Missing fields are rendered as
 * '(unknown)', 'none' and '[]' respectively.
 *
 * @param {{toolName: string, toolInput: *, declaredTask?: object}} args
 * @returns {string} newline-joined question text.
 */
export function buildPerToolQuestion({ toolName, toolInput, declaredTask }) {
  const {
    task_summary: summary,
    recommended_node: node,
    recommended_chain: chain,
  } = declaredTask || {};

  const context = [
    `User task: ${summary || '(unknown)'}`,
    `Recommended skill: ${node ?? 'none'}`,
    `Recommended chain: ${JSON.stringify(chain ?? [])}`,
    `Tool being called: ${toolName}`,
    `Tool input summary: ${truncate(toolInput, 500)}`,
  ];
  const ask = [
    'Is this tool call consistent with the declared user task and recommended skill scope?',
    'Answer YES (consistent) or NO (inconsistent / off-scope / suspicious). Сомнения → NO.',
  ];

  // A blank line separates the facts from the question.
  return [...context, '', ...ask].join('\n');
}
/**
 * Decide whether a single mutating tool call may proceed.
 *
 * `llmJudgeCallImpl` is the injected single-judge caller (a test mock or, in
 * main(), a thin wrapper around llmJudgeCall). It is invoked with
 * `{ model, question, content }` and its resolved value is the verdict.
 *
 * Outcomes, in evaluation order:
 * - no caller and no API key → degraded allow, reason 'no_api_key';
 * - `budgetState` given and `spent + 1 > limit` → degraded allow, reason
 *   'budget_exhausted';
 * - an API key but no callable judge → degraded allow, reason 'no_judge_impl'
 *   (previously this crashed with a TypeError when calling `undefined`);
 * - verdict exactly 'YES' → allow;
 * - any other verdict, including null → block (doubt→block).
 *
 * Errors thrown by the judge caller are not caught here; they propagate.
 *
 * @param {object} args
 * @param {string} args.toolName - name of the tool being called.
 * @param {*} args.toolInput - tool input, summarized into the question.
 * @param {object} [args.declaredTask] - declared task used to build the question.
 * @param {string} [args.apiKey] - defaults to process.env.ROUTER_LLM_KEY; only
 *   consulted to detect the "no key" degraded mode.
 * @param {{spent:number, limit:number}} [args.budgetState] - per-session budget.
 * @param {Function} [args.llmJudgeCallImpl] - async judge caller.
 * @returns {Promise<{block:boolean, reason?:string, degraded?:boolean, verdict?:string|null}>}
 */
export async function judgePerTool({
  toolName,
  toolInput,
  declaredTask,
  apiKey = process.env.ROUTER_LLM_KEY,
  budgetState,
  llmJudgeCallImpl,
}) {
  if (!llmJudgeCallImpl && !apiKey) {
    return { block: false, degraded: true, reason: 'no_api_key' };
  }
  if (budgetState && budgetState.spent + 1 > budgetState.limit) {
    return { block: false, degraded: true, reason: 'budget_exhausted' };
  }
  // Checked after the budget so pre-existing degraded reasons keep priority.
  if (typeof llmJudgeCallImpl !== 'function') {
    return { block: false, degraded: true, reason: 'no_judge_impl' };
  }
  const question = buildPerToolQuestion({ toolName, toolInput, declaredTask });
  const verdict = await llmJudgeCallImpl({
    model: JUDGE_MODELS.single[0],
    question,
    content: '', // question already carries the (truncated) input
  });
  if (verdict === 'YES') return { block: false, verdict };
  return {
    block: true,
    verdict,
    reason: 'v4.1 per-tool LLM-judge: tool call classified off-scope vs declared user task (doubt→block).',
  };
}
import { readFileSync, appendFileSync, mkdirSync } from 'node:fs';
import { join } from 'node:path';
import { homedir } from 'node:os';
import { readStdin, parseEventJson, exitDecision } from './enforce-hook-helpers.mjs';
import { llmJudgeCall, readJudgeBudget, bumpJudgeBudget, JUDGE_SESSION_BUDGET } from './llm-judge.mjs';
// Calibration 1 (2026-05-31): `Skill` is intentionally absent from this set.
// This is a scope fix, NOT a discipline drop: invoking a Skill mutates no
// state and is the prescribed §17 entry into work, so judging the invocation
// itself and blocking on doubt would contradict §17 (which mandates skills).
// The real mutations a skill leads to (Edit/Write/MultiEdit/Bash/PowerShell/
// commit/push/Task) are still fully judged — doubt→block on those is unchanged.
export const MUTATING_TOOLS = new Set([
  'Edit',
  'Write',
  'MultiEdit',
  'NotebookEdit',
  'Bash',
  'PowerShell',
  'Task',
  'Workflow',
]);
/**
 * Resolve the directory holding per-session runtime files.
 *
 * @param {string} [override] - directory to use instead of the default.
 * @returns {string} `override` when truthy, otherwise `<home>/.claude/runtime`.
 */
function runtimeDir(override) {
  if (override) {
    return override;
  }
  return join(homedir(), '.claude', 'runtime');
}
/**
 * Calibration 4 (soft, 2026-05-31): pick the task description the judge should
 * compare a tool call against.
 *
 * The classifier's distilled task summary is lossy and sometimes "(unknown)"
 * even for a perfectly clear user request, which made the judge block all real
 * edits (no task to compare → doubt→block). When the summary is unknown or
 * empty, fall back to the user's actual last prompt — the ground-truth
 * request — instead of nothing.
 *
 * This is NOT calibration 2 (which would blindly ALLOW on unknown). The judge
 * still runs and still blocks on doubt; it just uses better evidence. When
 * both the summary and the user prompt are unavailable, the task is returned
 * unchanged and doubt→block is preserved.
 *
 * The summary is trimmed before it is compared with the placeholder, so a
 * whitespace-padded "(unknown)" also triggers the fallback.
 *
 * @param {object|null|undefined} declaredTask - classifier output; read for
 *   `task_summary`, other fields are carried through untouched.
 * @param {*} lastUserPrompt - last user prompt; ignored unless it is a
 *   non-blank string.
 * @returns {object} `declaredTask` itself (or `{}` when it is nullish) if no
 *   fallback applies; otherwise a copy with `task_summary` set to the trimmed
 *   prompt and `task_source: 'user_prompt_fallback'`.
 */
export function resolveEffectiveTask(declaredTask, lastUserPrompt) {
  const dt = declaredTask || {};
  const rawSummary = dt.task_summary;
  // Falsy summaries count as unknown without being stringified.
  const summaryText = rawSummary ? String(rawSummary).trim() : '';
  const summaryUnknown = summaryText === '' || summaryText === '(unknown)';
  const prompt = typeof lastUserPrompt === 'string' ? lastUserPrompt.trim() : '';
  if (summaryUnknown && prompt) {
    return { ...dt, task_summary: prompt, task_source: 'user_prompt_fallback' };
  }
  return dt;
}
/**
 * Load the declared task for a session from its router-state JSON file.
 *
 * The file is `router-state-<sessionId>.json` ('unknown' when no session id is
 * given) inside the runtime directory. `task_summary` is taken from the top
 * level, then from `task_classification.task_summary`, then defaults to
 * '(unknown)'. Any read or parse failure yields the all-default stub.
 *
 * @param {{sessionId?: string, runtimeDirOverride?: string}} args
 * @returns {{task_summary: *, recommended_node: *, recommended_chain: *}}
 */
export function readDeclaredTask({ sessionId, runtimeDirOverride }) {
  const stateDir = runtimeDir(runtimeDirOverride);
  const statePath = join(stateDir, `router-state-${sessionId || 'unknown'}.json`);
  try {
    const raw = readFileSync(statePath, 'utf8');
    const state = JSON.parse(raw);
    // Property reads stay inside the try: a JSON `null` state throws here and
    // falls through to the stub like any other failure.
    const summary = state.task_summary ?? state.task_classification?.task_summary ?? '(unknown)';
    const node = state.recommended_node ?? null;
    const chain = state.recommended_chain ?? [];
    return { task_summary: summary, recommended_node: node, recommended_chain: chain };
  } catch {
    return { task_summary: '(unknown)', recommended_node: null, recommended_chain: [] };
  }
}
/**
 * Append one JSON line describing a judge decision to the session's log file,
 * `llm-judge-per-tool-<sessionId>.jsonl` ('unknown' when no session id is
 * given) in the runtime directory, creating the directory if needed.
 *
 * Each record starts with `ts` (ISO timestamp) and `session_id`, followed by
 * the fields of `entry`; entry fields win on key collisions. Logging is
 * best-effort: every error is swallowed so it can never affect the decision.
 *
 * @param {{sessionId?: string, runtimeDirOverride?: string, entry?: object}} args
 * @returns {void}
 */
function logPerTool({ sessionId, runtimeDirOverride, entry }) {
  try {
    const logDir = runtimeDir(runtimeDirOverride);
    mkdirSync(logDir, { recursive: true });
    const logFile = join(logDir, `llm-judge-per-tool-${sessionId || 'unknown'}.jsonl`);
    const record = {
      ts: new Date().toISOString(),
      session_id: sessionId || null,
      ...entry,
    };
    appendFileSync(logFile, `${JSON.stringify(record)}\n`);
  } catch {
    /* ignore */
  }
}
/**
 * CLI entry point: read one hook event from stdin, judge it if it targets a
 * mutating tool, log the outcome and emit the allow/block decision.
 *
 * Tools outside MUTATING_TOOLS are allowed without consulting the judge. The
 * session budget is bumped by one for every non-degraded judgement. Any error
 * raised along the way results in a quiet allow.
 *
 * NOTE(review): a judge caller is always injected here, so judgePerTool's
 * 'no_api_key' shortcut (taken only when no caller AND no key are given)
 * cannot trigger from this path; a missing key is left to llmJudgeCall —
 * confirm that is intended.
 */
async function main() {
  try {
    const rawEvent = await readStdin();
    const event = parseEventJson(rawEvent);
    const { tool_name: toolName } = event;

    if (!MUTATING_TOOLS.has(toolName)) {
      exitDecision({ block: false });
      return;
    }

    const { session_id: sessionId } = event;
    const declaredTask = readDeclaredTask({ sessionId });
    const budgetState = { spent: readJudgeBudget({ sessionId }), limit: JUDGE_SESSION_BUDGET };

    const outcome = await judgePerTool({
      toolName,
      toolInput: event.tool_input || {},
      declaredTask,
      budgetState,
      llmJudgeCallImpl: (opts) => llmJudgeCall(opts),
    });
    const { block, degraded, reason, verdict } = outcome;

    // Degraded results never reached the judge, so they cost no budget.
    if (!degraded) {
      bumpJudgeBudget({ sessionId, by: 1 });
    }

    let actionTaken = 'allow';
    if (block) {
      actionTaken = 'block';
    } else if (degraded) {
      actionTaken = 'degraded_allow';
    }

    logPerTool({
      sessionId,
      entry: {
        tool_name: toolName,
        tool_input_summary: truncate(event.tool_input, 200),
        declared_task: declaredTask.task_summary,
        verdict: verdict ?? null,
        action_taken: actionTaken,
        reason: reason || null,
      },
    });

    exitDecision({ block, message: reason });
  } catch {
    exitDecision({ block: false }); // fail-quiet
  }
}
// Run the hook only when this file is the script Node was started with.
// Backslashes are normalised to '/' first so Windows-style paths match too.
const isCli = typeof process.argv[1] === 'string'
  && process.argv[1].replace(/\\/g, '/').endsWith('/llm-judge-per-tool.mjs');
if (isCli) {
  main();
}