// brain/tools/llm-judge-per-tool.test.mjs

// tools/llm-judge-per-tool.test.mjs
import { describe, it, expect } from 'vitest';
import { buildPerToolQuestion, judgePerTool } from './llm-judge-per-tool.mjs';

// Shared declared-task fixture handed to buildPerToolQuestion and judgePerTool
// in the tests below. The buildPerToolQuestion test asserts that the summary
// text and the recommended node both appear in the generated question.
const declaredTask = {
  task_summary: 'write a sub-plan',
  recommended_node: '#19',
  recommended_chain: [],
};

describe('buildPerToolQuestion', () => {
  // The generated question must name the tool, carry the declared task summary
  // and its recommended node, offer both YES and NO as answers, and stay under
  // 2000 characters even when one input field is 1000 characters long.
  it('includes tool name, declared task, recommended node, and truncated input', () => {
    const oversizedPayload = 'x'.repeat(1000);
    const toolInput = { file_path: 'app/Models/User.php', new_string: oversizedPayload };

    const question = buildPerToolQuestion({ toolName: 'Edit', toolInput, declaredTask });

    for (const fragment of ['Edit', 'write a sub-plan', '#19']) {
      expect(question).toContain(fragment);
    }
    expect(question).toMatch(/YES.*NO|NO.*YES/s);
    expect(question.length).toBeLessThan(2000);
  });
});

describe('judgePerTool', () => {
  // Produces a stub to inject as `llmJudgeCallImpl`: an async function that
  // always resolves to the given verdict.
  const judgeAnswering = (verdict) => async () => verdict;

  it('blocks when judge returns NO (inconsistent)', async () => {
    const { block, reason } = await judgePerTool({
      toolName: 'Bash',
      toolInput: { command: 'rm -rf /' },
      declaredTask,
      llmJudgeCallImpl: judgeAnswering('NO'),
    });

    expect(block).toBe(true);
    expect(reason).toMatch(/per-tool/i);
  });

  it('allows when judge returns YES (consistent)', async () => {
    const { block } = await judgePerTool({
      toolName: 'Write',
      toolInput: { file_path: 'docs/superpowers/plans/x.md' },
      declaredTask,
      llmJudgeCallImpl: judgeAnswering('YES'),
    });

    expect(block).toBe(false);
  });

  // A judge that yields no verdict at all is treated like a NO.
  it('blocks on null verdict (doubt → NO)', async () => {
    const { block } = await judgePerTool({
      toolName: 'Edit',
      toolInput: {},
      declaredTask,
      llmJudgeCallImpl: judgeAnswering(null),
    });

    expect(block).toBe(true);
  });

  // No judge implementation is injected here and the API key is empty.
  it('degrades to allow+flag when no key / no impl', async () => {
    const { block, degraded } = await judgePerTool({
      toolName: 'Edit',
      toolInput: {},
      declaredTask,
      apiKey: '',
    });

    expect(block).toBe(false);
    expect(degraded).toBe(true);
  });

  // Spent equals the limit, so the call must be allowed even though the
  // injected judge would have answered NO.
  it('degrades to allow when budget exhausted', async () => {
    const { block, degraded, reason } = await judgePerTool({
      toolName: 'Edit',
      toolInput: {},
      declaredTask,
      budgetState: { spent: 200, limit: 200 },
      llmJudgeCallImpl: judgeAnswering('NO'),
    });

    expect(block).toBe(false);
    expect(degraded).toBe(true);
    expect(reason).toBe('budget_exhausted');
  });
});

import { resolveEffectiveTask } from './llm-judge-per-tool.mjs';

// Calibration 4 (soft, 2026-05-31) — when the classifier wrote "(unknown)" as
// the declared task (its summary is lossy/unreliable), fall back to judging
// against the user's actual last prompt instead of an empty task. NOT
// calibration 2: the judge still blocks on doubt — it just uses better
// evidence (the literal user request) when the classifier summary is empty.
describe('resolveEffectiveTask — calibration 4 user-prompt fallback', () => {
  it('keeps the classifier summary when it is meaningful', () => {
    const classified = { task_summary: 'implement parallel-session-lock', recommended_node: '#19' };

    const effective = resolveEffectiveTask(classified, 'some prompt');

    expect(effective.task_summary).toBe('implement parallel-session-lock');
    expect(effective.task_source).toBeUndefined();
  });

  it('falls back to the user prompt when summary is "(unknown)"', () => {
    const userPrompt = 'реализуй живой main для parallel-session-lock';
    const classified = { task_summary: '(unknown)', recommended_node: null };

    const effective = resolveEffectiveTask(classified, userPrompt);

    expect(effective.task_summary).toBe('реализуй живой main для parallel-session-lock');
    expect(effective.task_source).toBe('user_prompt_fallback');
  });

  it('falls back when summary is empty or blank', () => {
    // Both a zero-length summary and a whitespace-only one must yield the prompt.
    for (const blankSummary of ['', '   ']) {
      const effective = resolveEffectiveTask({ task_summary: blankSummary }, 'do X');
      expect(effective.task_summary).toBe('do X');
    }
  });

  it('stays unknown when both summary and user prompt are unavailable (still blocks on doubt)', () => {
    const effective = resolveEffectiveTask({ task_summary: '(unknown)' }, '');

    expect(effective.task_summary).toBe('(unknown)');
    expect(effective.task_source).toBeUndefined();
  });
});

import { MUTATING_TOOLS, readDeclaredTask } from './llm-judge-per-tool.mjs';

describe('per-tool helpers', () => {
  it('classifies mutating tools', () => {
    // Each row pairs a tool name with its expected membership in MUTATING_TOOLS.
    const expectedMembership = [
      ['Edit', true],
      ['Write', true],
      ['Bash', true],
      ['Read', false],
    ];

    for (const [toolName, isMutating] of expectedMembership) {
      expect(MUTATING_TOOLS.has(toolName)).toBe(isMutating);
    }
  });

  // Calibration 1 (2026-05-31) — a SCOPE fix; discipline is NOT lowered.
  // Invoking a Skill changes no state: it is the prescribed §17 entry into
  // work. Judging the skill invocation itself and blocking on doubt would
  // directly contradict §17, which mandates skills. The real mutations a skill
  // leads to (Edit/Write/Bash/commit/push) remain fully judged, so taking
  // Skill out of the judge scope does not lower discipline.
  it('does NOT treat Skill as mutating (calibration 1 — prescribed §17 entry, mutates nothing)', () => {
    const skillIsMutating = MUTATING_TOOLS.has('Skill');
    expect(skillIsMutating).toBe(false);
  });

  it('readDeclaredTask falls back to a stub when state missing', () => {
    // Neither the session id nor the runtime directory is expected to exist.
    const fallbackTask = readDeclaredTask({
      sessionId: 'no-such-session',
      runtimeDirOverride: '/nonexistent',
    });

    for (const requiredKey of ['task_summary', 'recommended_node']) {
      expect(fallbackTask).toHaveProperty(requiredKey);
    }
  });
});