8a7144892c
The per-tool judge compares each mutating tool call against the classifier's distilled task summary read from router-state. That summary is lossy and frequently "(unknown)" even for a perfectly explicit user request — and with an unknown task the judge has nothing to compare against, so "Сомнения → NO" blocked every real edit. Reproduced repeatedly this session: an explicit "реализуй ... main() ..." prompt still classified unknown → all edits blocked, including the judge's own fix. Calibration 2 (allow on unknown) was rejected by the owner as a discipline hole. Calibration 4 (soft, scope-preserving): when — and only when — the classifier summary is "(unknown)"/empty, fall back to judging against the user's actual last prompt (the ground-truth request) instead of nothing. The judge still runs and still blocks on doubt; it just uses better evidence. When the summary is meaningful, behaviour is unchanged (the user-prompt reader is not consulted). When both summary and prompt are unavailable, the task stays "(unknown)" and doubt→block is preserved. NOT calibration 2: this does not blindly allow on unknown — it re-grounds the judge in the literal user request, which the controller cannot fabricate (the user writes it; it is read locally from the session transcript). - tools/llm-judge-per-tool.mjs: resolveEffectiveTask(declaredTask, lastUserPrompt). - tools/enforce-llm-judge-per-tool.mjs: runPerTool reads the last user prompt (helpers.lastUserPromptText + readTranscript) only on an unknown summary; main() binds it. Regression: judge tests 57/57 GREEN; full tools suite 1951 passed | 2 skipped. The 6 remaining failures are uncommitted point-2 WIP in enforce-parallel-session-lock.test.mjs — not part of this change, not committed. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
172 lines
6.7 KiB
JavaScript
172 lines
6.7 KiB
JavaScript
// tools/llm-judge-per-tool.mjs
/**
 * llm-judge-per-tool — PreToolUse judge on mutating tools (v4.1 §4.7).
 * Asks a single Sonnet judge: "is this tool call consistent with the declared
 * user task and recommended skill scope?" NO → block. Doubt (null) → block.
 * No key / budget exhausted → degraded allow + flag (fail-open on LLM layer).
 */
|
|
|
|
import { JUDGE_MODELS } from './llm-judge.mjs';
|
|
|
|
/**
 * Render a value as text capped at `n` UTF-16 code units.
 *
 * Strings are used as-is; any other value is JSON-serialised, with
 * `null`/`undefined` rendered as `{}`. When the text is longer than `n` it is
 * cut and a single ellipsis (`…`) is appended, so the result can be one
 * character longer than `n`.
 *
 * @param {*} s - Value to render.
 * @param {number} n - Maximum number of code units kept from the rendered text.
 * @returns {string} The rendered text, shortened when necessary.
 */
function truncate(s, n) {
  // JSON.stringify yields undefined for functions and symbols; String() keeps
  // the result a string so the length check below cannot throw.
  const str = typeof s === 'string' ? s : (JSON.stringify(s ?? {}) ?? String(s));
  if (!(str.length > n)) return str;

  // Never end the kept part on a high surrogate: cutting an astral character
  // (e.g. an emoji) in half would leave a lone surrogate in the output.
  let end = n;
  const lastKept = str.charCodeAt(end - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) end -= 1;

  return str.slice(0, end) + '…';
}
|
|
|
|
/**
 * Compose the prompt sent to the judge for one mutating tool call.
 *
 * The prompt lists the declared task summary, the recommended skill and chain,
 * the tool name and a summary of the tool input capped at 500 code units,
 * followed by the YES/NO consistency question.
 *
 * @param {object} args
 * @param {string} args.toolName - Name of the tool being called.
 * @param {*} args.toolInput - Tool input; rendered through `truncate`.
 * @param {object} [args.declaredTask] - May carry `task_summary`,
 *   `recommended_node` and `recommended_chain`; missing fields fall back to
 *   `(unknown)`, `none` and `[]` respectively.
 * @returns {string} The newline-joined question text.
 */
export function buildPerToolQuestion({ toolName, toolInput, declaredTask }) {
  const {
    task_summary: summary,
    recommended_node: node,
    recommended_chain: chain,
  } = declaredTask || {};

  const lines = [
    `User task: ${summary || '(unknown)'}`,
    `Recommended skill: ${node ?? 'none'}`,
    `Recommended chain: ${JSON.stringify(chain ?? [])}`,
    `Tool being called: ${toolName}`,
    `Tool input summary: ${truncate(toolInput, 500)}`,
    '',
    'Is this tool call consistent with the declared user task and recommended skill scope?',
    'Answer YES (consistent) or NO (inconsistent / off-scope / suspicious). Сомнения → NO.',
  ];
  return lines.join('\n');
}
|
|
|
|
/**
 * Pure decision. `llmJudgeCallImpl` is the injected single-judge caller (a test
 * mock or, in main(), a wrapper around the real llmJudgeCall).
 *
 * Outcomes, checked in this order:
 *  - no injected caller and no API key → degraded allow (`no_api_key`);
 *  - `budgetState.spent + 1 > budgetState.limit` → degraded allow
 *    (`budget_exhausted`);
 *  - verdict is exactly `'YES'` → allow;
 *  - any other verdict, including `null` → block (doubt→block).
 *
 * @param {object} args
 * @param {string} args.toolName - Name of the tool being called.
 * @param {*} args.toolInput - Tool input, summarised into the question.
 * @param {object} [args.declaredTask] - Declared task passed to the question builder.
 * @param {string} [args.apiKey] - Defaults to `process.env.ROUTER_LLM_KEY`; only
 *   consulted to decide whether the judge is reachable at all.
 * @param {{spent:number, limit:number}} [args.budgetState] - Optional budget.
 * @param {Function} [args.llmJudgeCallImpl] - Called with `{model, question, content}`.
 * @returns {Promise<{block:boolean, reason?:string, degraded?:boolean, verdict?:string|null}>}
 * @throws {TypeError} When an API key is present but no caller was injected.
 */
export async function judgePerTool({
  toolName,
  toolInput,
  declaredTask,
  apiKey = process.env.ROUTER_LLM_KEY,
  budgetState,
  llmJudgeCallImpl,
}) {
  if (!llmJudgeCallImpl && !apiKey) {
    return { block: false, degraded: true, reason: 'no_api_key' };
  }
  if (budgetState && budgetState.spent + 1 > budgetState.limit) {
    return { block: false, degraded: true, reason: 'budget_exhausted' };
  }
  // A key alone gives this function nothing to call; fail with a clear message
  // instead of an opaque "llmJudgeCallImpl is not a function".
  if (typeof llmJudgeCallImpl !== 'function') {
    throw new TypeError(
      'judgePerTool: llmJudgeCallImpl must be injected (an API key alone cannot reach the judge).',
    );
  }

  const question = buildPerToolQuestion({ toolName, toolInput, declaredTask });
  const verdict = await llmJudgeCallImpl({
    model: JUDGE_MODELS.single[0],
    question,
    content: '', // question already carries the (truncated) input
  });

  if (verdict === 'YES') return { block: false, verdict };
  return {
    block: true,
    verdict,
    reason: 'v4.1 per-tool LLM-judge: tool call classified off-scope vs declared user task (doubt→block).',
  };
}
|
|
|
|
import { readFileSync, appendFileSync, mkdirSync } from 'node:fs';
|
|
import { join } from 'node:path';
|
|
import { homedir } from 'node:os';
|
|
import { readStdin, parseEventJson, exitDecision } from './enforce-hook-helpers.mjs';
|
|
import { llmJudgeCall, readJudgeBudget, bumpJudgeBudget, JUDGE_SESSION_BUDGET } from './llm-judge.mjs';
|
|
|
|
// Tool names that are sent to the per-tool judge.
//
// Calibration 1 (2026-05-31): `Skill` is deliberately absent. This is a scope
// fix, not a discipline drop — invoking a Skill mutates no state and is the
// prescribed §17 entry into work, so judging the invocation itself and blocking
// on doubt would contradict §17. The mutations a skill leads to still go
// through the tools listed here and remain fully judged (doubt→block unchanged).
export const MUTATING_TOOLS = new Set([
  'Edit',
  'Write',
  'MultiEdit',
  'NotebookEdit',
  'Bash',
  'PowerShell',
  'Task',
  'Workflow',
]);
|
|
|
|
/**
 * Resolve the directory that holds per-session runtime files.
 *
 * @param {string} [override] - Used verbatim when truthy.
 * @returns {string} `override`, or `<home>/.claude/runtime` when none is given.
 */
function runtimeDir(override) {
  if (override) return override;
  return join(homedir(), '.claude', 'runtime');
}
|
|
|
|
/**
 * Calibration 4 (soft, 2026-05-31): the classifier's distilled task summary is
 * lossy and sometimes "(unknown)" even for a perfectly clear user request,
 * which made the judge block all real edits (no task to compare → doubt→block).
 * When the summary is unknown/empty, fall back to judging against the user's
 * actual last prompt — the ground-truth request — instead of nothing.
 *
 * This is NOT calibration 2 (which would blindly ALLOW on unknown). The judge
 * still runs and still blocks on doubt; it just uses better evidence. When both
 * the summary and the user prompt are unavailable, the task stays "(unknown)"
 * and doubt→block is preserved.
 *
 * The summary counts as unknown when it is falsy, blank, or — ignoring
 * surrounding whitespace — exactly "(unknown)".
 *
 * @param {object} [declaredTask] - Declared task; `task_summary` is inspected.
 * @param {*} lastUserPrompt - Last user prompt; ignored unless it is a
 *   non-blank string.
 * @returns {object} The declared task unchanged (or `{}` when none was given),
 *   or a copy with `task_summary` set to the trimmed prompt and
 *   `task_source: 'user_prompt_fallback'`.
 */
export function resolveEffectiveTask(declaredTask, lastUserPrompt) {
  const dt = declaredTask || {};

  // Trim before comparing so a padded sentinel such as " (unknown) " is not
  // mistaken for a meaningful task description.
  const normalizedSummary = dt.task_summary ? String(dt.task_summary).trim() : '';
  const summaryUnknown = normalizedSummary === '' || normalizedSummary === '(unknown)';

  const prompt = typeof lastUserPrompt === 'string' ? lastUserPrompt.trim() : '';
  if (summaryUnknown && prompt) {
    return { ...dt, task_summary: prompt, task_source: 'user_prompt_fallback' };
  }
  return dt;
}
|
|
|
|
/**
 * Load the declared task for a session from `router-state-<sessionId>.json`
 * inside the runtime directory (`unknown` is used when no session id is given).
 *
 * The summary is taken from the top-level `task_summary`, then from
 * `task_classification.task_summary`, then defaults to `(unknown)`. If the file
 * cannot be read or parsed, a stub with an unknown summary is returned.
 *
 * @param {object} args
 * @param {string} [args.sessionId]
 * @param {string} [args.runtimeDirOverride] - Directory to read from instead of the default.
 * @returns {{task_summary: *, recommended_node: *, recommended_chain: *}}
 */
export function readDeclaredTask({ sessionId, runtimeDirOverride }) {
  const stateFile = join(
    runtimeDir(runtimeDirOverride),
    `router-state-${sessionId || 'unknown'}.json`,
  );

  try {
    const state = JSON.parse(readFileSync(stateFile, 'utf8'));
    const nestedSummary = state.task_classification?.task_summary;
    const task_summary = state.task_summary ?? nestedSummary ?? '(unknown)';
    const recommended_node = state.recommended_node ?? null;
    const recommended_chain = state.recommended_chain ?? [];
    return { task_summary, recommended_node, recommended_chain };
  } catch {
    // Missing, unreadable or malformed state file: report an unknown task.
    return { task_summary: '(unknown)', recommended_node: null, recommended_chain: [] };
  }
}
|
|
|
|
/**
 * Best-effort audit log: appends one JSON line to
 * `llm-judge-per-tool-<sessionId>.jsonl` in the runtime directory, creating the
 * directory if needed. Each line carries `ts`, `session_id` and the fields of
 * `entry` (which win on a name clash). Any failure is swallowed.
 *
 * @param {object} args
 * @param {string} [args.sessionId] - `unknown` is used in the file name when absent.
 * @param {string} [args.runtimeDirOverride] - Directory to write to instead of the default.
 * @param {object} [args.entry] - Fields to record.
 * @returns {void}
 */
function logPerTool({ sessionId, runtimeDirOverride, entry }) {
  try {
    const targetDir = runtimeDir(runtimeDirOverride);
    mkdirSync(targetDir, { recursive: true });

    const logFile = join(targetDir, `llm-judge-per-tool-${sessionId || 'unknown'}.jsonl`);
    const record = { ts: new Date().toISOString(), session_id: sessionId || null, ...entry };
    appendFileSync(logFile, `${JSON.stringify(record)}\n`);
  } catch {
    /* ignore */
  }
}
|
|
|
|
/**
 * Hook entry point. Reads one event from stdin; tools outside MUTATING_TOOLS
 * are allowed immediately. For the rest it loads the declared task and the
 * spent judge budget, asks the per-tool judge, bumps the budget unless the
 * result was degraded, appends a log entry, and exits with the decision.
 * Any error along the way ends in a non-blocking exit (fail-quiet).
 */
async function main() {
  try {
    const event = parseEventJson(await readStdin());
    const { tool_name: toolName } = event;
    if (!MUTATING_TOOLS.has(toolName)) {
      exitDecision({ block: false });
      return;
    }

    const { session_id: sessionId } = event;
    const declaredTask = readDeclaredTask({ sessionId });
    const budgetState = {
      spent: readJudgeBudget({ sessionId }),
      limit: JUDGE_SESSION_BUDGET,
    };

    // NOTE(review): a caller is always injected here, so judgePerTool's
    // `no_api_key` branch (which requires a missing caller) cannot fire from
    // this path — confirm llmJudgeCall copes with a missing key on its own.
    const result = await judgePerTool({
      toolName,
      toolInput: event.tool_input || {},
      declaredTask,
      budgetState,
      llmJudgeCallImpl: (opts) => llmJudgeCall(opts),
    });

    // Degraded results come back before the judge is called, so they are not
    // counted against the session budget.
    if (!result.degraded) bumpJudgeBudget({ sessionId, by: 1 });

    let actionTaken = 'allow';
    if (result.block) actionTaken = 'block';
    else if (result.degraded) actionTaken = 'degraded_allow';

    const entry = {
      tool_name: toolName,
      tool_input_summary: truncate(event.tool_input, 200),
      declared_task: declaredTask.task_summary,
      verdict: result.verdict ?? null,
      action_taken: actionTaken,
      reason: result.reason || null,
    };
    logPerTool({ sessionId, entry });

    exitDecision({ block: result.block, message: result.reason });
  } catch {
    exitDecision({ block: false }); // fail-quiet
  }
}
|
|
|
|
// Run the hook only when this file is the script Node was started with.
// Backslashes are normalised first so Windows-style paths match as well.
const entryScript = (process.argv[1] || '').split('\\').join('/');
const isCli = entryScript.endsWith('/llm-judge-per-tool.mjs');
if (isCli) {
  main();
}
|