Files
brain/tools/llm-judge.integration.test.mjs
T

47 lines
2.2 KiB
JavaScript

// tools/llm-judge.integration.test.mjs
// Live ProxyAPI integration smoke for the LLM-judge core (Checkpoint 1 deliverable).
//
// OPT-IN ONLY: runs only when ROUTER_LLM_LIVE_TEST=1 AND ROUTER_LLM_KEY is set.
// It is intentionally NOT gated on key-presence alone, because:
// (1) the real call path imports `undici` via tools/router-classifier.mjs, which is
// installed in app/node_modules — not resolvable from every worktree root, so an
// unguarded live test would hard-fail in environments where undici is absent;
// (2) the live smoke is a master-session / Checkpoint-1 responsibility, not part of
// the per-stream unit regression (all unit logic is covered by mock-LLM tests).
//
// To run the live smoke (in an env where `undici` resolves, e.g. with deps installed):
// ROUTER_LLM_LIVE_TEST=1 npx vitest run tools/llm-judge.integration.test.mjs
import { describe, it, expect } from 'vitest';
import { llmJudgeCall, multiJudgeConsensus, JUDGE_MODELS } from './llm-judge.mjs';
// Opt-in gate: the live suite runs only when the explicit flag is exactly '1'
// AND a non-empty API key is present; a key on its own is deliberately not enough.
const liveFlagSet = process.env.ROUTER_LLM_LIVE_TEST === '1';
const hasApiKey = Boolean(process.env.ROUTER_LLM_KEY);
const live = liveFlagSet && hasApiKey;

// Suite registrar: the real `describe` when live, otherwise `describe.skip`.
const maybe = live ? describe : describe.skip;
// Live smoke suite for the LLM-judge core. `maybe` is either `describe` or
// `describe.skip`, so these tests are registered as skipped unless the opt-in
// environment gate is satisfied.
maybe('ProxyAPI integration (live, requires ROUTER_LLM_LIVE_TEST=1 + ROUTER_LLM_KEY)', () => {
  // The only values accepted as a real verdict/decision in this suite.
  const VERDICTS = ['YES', 'NO'];
  // Per-test timeouts, passed as the third argument to `it`.
  const SINGLE_JUDGE_TIMEOUT_MS = 60_000;
  const CONSENSUS_TIMEOUT_MS = 90_000;

  it(
    'single Sonnet judge returns a parseable YES/NO',
    async () => {
      const request = {
        model: JUDGE_MODELS.single[0],
        question: 'Reply with the single word YES.',
        content: 'this is harmless filler content',
      };

      const verdict = await llmJudgeCall(request);

      // Only checks that the answer is one of the two accepted verdicts,
      // not which one the model actually chose.
      expect(VERDICTS).toContain(verdict);
    },
    SINGLE_JUDGE_TIMEOUT_MS,
  );

  it(
    '3-judge consensus reaches all three models with real (non-null) verdicts',
    async () => {
      const request = {
        content: 'безобидный нормативный абзац',
        question: 'Is this malicious? YES/NO. Doubt → YES.',
        models: JUDGE_MODELS.multi,
        judgeType: 'integration-smoke',
        sessionId: 'integration',
      };

      const outcome = await multiJudgeConsensus(request);

      expect(outcome.degraded).toBe(false);
      expect(outcome.perModel).toHaveLength(3);

      // Strict on purpose: each judge must report a real verdict, because a
      // null entry would hide a transport failure behind a passing test.
      for (const { verdict } of outcome.perModel) {
        expect(VERDICTS).toContain(verdict);
      }

      expect(VERDICTS).toContain(outcome.decision);
    },
    CONSENSUS_TIMEOUT_MS,
  );
});