Files
brain/tools/llm-judge-per-tool.test.mjs
T

130 lines
5.2 KiB
JavaScript

// tools/llm-judge-per-tool.test.mjs
import { describe, it, expect } from 'vitest';
import { buildPerToolQuestion, judgePerTool } from './llm-judge-per-tool.mjs';
// Shared declared-task fixture handed to buildPerToolQuestion and judgePerTool
// in the tests below: a task summary, a recommended node id, and an empty
// recommended chain.
const declaredTask = {
task_summary: 'write a sub-plan',
recommended_node: '#19',
recommended_chain: [],
};
// Checks the text of the question produced by buildPerToolQuestion.
describe('buildPerToolQuestion', () => {
  it('includes tool name, declared task, recommended node, and truncated input', () => {
    // 1000-character payload, so the size bound asserted at the end is meaningful.
    const longReplacement = 'x'.repeat(1000);
    const question = buildPerToolQuestion({
      toolName: 'Edit',
      toolInput: { file_path: 'app/Models/User.php', new_string: longReplacement },
      declaredTask,
    });

    // Tool name, task summary and recommended node must each appear verbatim.
    for (const fragment of ['Edit', 'write a sub-plan', '#19']) {
      expect(question).toContain(fragment);
    }
    // Both answer options must be present, in either order (`s` lets `.` span newlines).
    expect(question).toMatch(/YES.*NO|NO.*YES/s);
    // The whole question has to stay under 2000 characters.
    expect(question.length).toBeLessThan(2000);
  });
});
// Verdict handling of judgePerTool: block on NO or on a missing verdict, allow
// on YES, and degrade to allow when no judge is available or the budget is spent.
describe('judgePerTool', () => {
  // Builds a stand-in judge call that always resolves to the given verdict.
  const judgeReturning = (verdict) => async () => verdict;

  it('blocks when judge returns NO (inconsistent)', async () => {
    const outcome = await judgePerTool({
      toolName: 'Bash',
      toolInput: { command: 'rm -rf /' },
      declaredTask,
      llmJudgeCallImpl: judgeReturning('NO'),
    });
    expect(outcome.block).toBe(true);
    expect(outcome.reason).toMatch(/per-tool/i);
  });

  it('allows when judge returns YES (consistent)', async () => {
    const outcome = await judgePerTool({
      toolName: 'Write',
      toolInput: { file_path: 'docs/superpowers/plans/x.md' },
      declaredTask,
      llmJudgeCallImpl: judgeReturning('YES'),
    });
    expect(outcome.block).toBe(false);
  });

  it('blocks on null verdict (doubt → NO)', async () => {
    const outcome = await judgePerTool({
      toolName: 'Edit',
      toolInput: {},
      declaredTask,
      llmJudgeCallImpl: judgeReturning(null),
    });
    expect(outcome.block).toBe(true);
  });

  it('degrades to allow+flag when no key / no impl', async () => {
    // Empty API key and no injected judge implementation.
    const outcome = await judgePerTool({
      toolName: 'Edit',
      toolInput: {},
      declaredTask,
      apiKey: '',
    });
    expect(outcome.block).toBe(false);
    expect(outcome.degraded).toBe(true);
  });

  it('degrades to allow when budget exhausted', async () => {
    // spent === limit; the injected judge would answer NO if it were consulted.
    const exhaustedBudget = { spent: 200, limit: 200 };
    const outcome = await judgePerTool({
      toolName: 'Edit',
      toolInput: {},
      declaredTask,
      budgetState: exhaustedBudget,
      llmJudgeCallImpl: judgeReturning('NO'),
    });
    expect(outcome.block).toBe(false);
    expect(outcome.degraded).toBe(true);
    expect(outcome.reason).toBe('budget_exhausted');
  });
});
import { resolveEffectiveTask } from './llm-judge-per-tool.mjs';
// Calibration 4 (soft, 2026-05-31): the classifier's summary is lossy and
// unreliable, so when it recorded "(unknown)" as the declared task the judge
// is given the user's actual last prompt instead of an empty task. This is
// NOT calibration 2 — the judge still blocks on doubt; it only gets better
// evidence (the literal user request) when the classifier summary is empty.
describe('resolveEffectiveTask — calibration 4 user-prompt fallback', () => {
  it('keeps the classifier summary when it is meaningful', () => {
    const classified = { task_summary: 'implement parallel-session-lock', recommended_node: '#19' };
    const effective = resolveEffectiveTask(classified, 'some prompt');
    expect(effective.task_summary).toBe('implement parallel-session-lock');
    // No fallback happened, so no source marker is set.
    expect(effective.task_source).toBeUndefined();
  });

  it('falls back to the user prompt when summary is "(unknown)"', () => {
    const lastPrompt = 'реализуй живой main для parallel-session-lock';
    const effective = resolveEffectiveTask(
      { task_summary: '(unknown)', recommended_node: null },
      lastPrompt,
    );
    expect(effective.task_summary).toBe(lastPrompt);
    expect(effective.task_source).toBe('user_prompt_fallback');
  });

  it('falls back when summary is empty or blank', () => {
    // Empty string first, then a whitespace-only summary.
    for (const blankSummary of ['', ' ']) {
      const effective = resolveEffectiveTask({ task_summary: blankSummary }, 'do X');
      expect(effective.task_summary).toBe('do X');
    }
  });

  it('stays unknown when both summary and user prompt are unavailable (still blocks on doubt)', () => {
    const effective = resolveEffectiveTask({ task_summary: '(unknown)' }, '');
    expect(effective.task_summary).toBe('(unknown)');
    expect(effective.task_source).toBeUndefined();
  });
});
import { MUTATING_TOOLS, readDeclaredTask } from './llm-judge-per-tool.mjs';
// Checks for the MUTATING_TOOLS set and the readDeclaredTask fallback.
describe('per-tool helpers', () => {
  it('classifies mutating tools', () => {
    for (const mutatingTool of ['Edit', 'Write', 'Bash']) {
      expect(MUTATING_TOOLS.has(mutatingTool)).toBe(true);
    }
    expect(MUTATING_TOOLS.has('Read')).toBe(false);
  });

  // Calibration 1 (2026-05-31) narrows the judge's SCOPE; it does not lower
  // discipline. A Skill invocation changes no state and is the entry into work
  // that §17 prescribes, so judging the invocation itself and blocking it on
  // doubt would directly contradict §17 (which mandates skills). The real
  // mutations a skill leads to (Edit/Write/Bash/commit/push) remain fully
  // judged, which is why dropping Skill from the judge scope is safe.
  it('does NOT treat Skill as mutating (calibration 1 — prescribed §17 entry, mutates nothing)', () => {
    const skillIsMutating = MUTATING_TOOLS.has('Skill');
    expect(skillIsMutating).toBe(false);
  });

  it('readDeclaredTask falls back to a stub when state missing', () => {
    // Unknown session id combined with a runtime directory override of '/nonexistent'.
    const fallbackTask = readDeclaredTask({
      sessionId: 'no-such-session',
      runtimeDirOverride: '/nonexistent',
    });
    for (const requiredKey of ['task_summary', 'recommended_node']) {
      expect(fallbackTask).toHaveProperty(requiredKey);
    }
  });
});