397777089e
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
47 lines
2.2 KiB
JavaScript
47 lines
2.2 KiB
JavaScript
// tools/llm-judge.integration.test.mjs
|
|
// Live ProxyAPI integration smoke for the LLM-judge core (Checkpoint 1 deliverable).
|
|
//
|
|
// OPT-IN ONLY: runs only when ROUTER_LLM_LIVE_TEST=1 AND ROUTER_LLM_KEY is set.
|
|
// It is intentionally NOT gated on key-presence alone, because:
|
|
// (1) the real call path imports `undici` via tools/router-classifier.mjs, which is
|
|
// installed in app/node_modules — not resolvable from every worktree root, so an
|
|
// unguarded live test would hard-fail in environments where undici is absent;
|
|
// (2) the live smoke is a master-session / Checkpoint-1 responsibility, not part of
|
|
// the per-stream unit regression (all unit logic is covered by mock-LLM tests).
|
|
//
|
|
// To run the live smoke (in an env where `undici` resolves, e.g. with deps installed, and with ROUTER_LLM_KEY exported):
|
|
// ROUTER_LLM_LIVE_TEST=1 npx vitest run tools/llm-judge.integration.test.mjs
|
|
import { describe, it, expect } from 'vitest';
|
|
import { llmJudgeCall, multiJudgeConsensus, JUDGE_MODELS } from './llm-judge.mjs';
|
|
|
|
// Opt-in gate: true only when a non-empty ROUTER_LLM_KEY is present AND
// ROUTER_LLM_LIVE_TEST is exactly the string '1'.
const live = Boolean(process.env.ROUTER_LLM_KEY) && process.env.ROUTER_LLM_LIVE_TEST === '1';

// Suite registrar: the real `describe` when the gate is open, `describe.skip` otherwise.
const maybe = live ? describe : describe.skip;
|
|
|
|
// Live smoke suite. It is registered through `maybe`, which is either
// `describe` or `describe.skip`, so these tests only run when opted in.
maybe('ProxyAPI integration (live, requires ROUTER_LLM_LIVE_TEST=1 + ROUTER_LLM_KEY)', () => {
  // The only two values these checks accept as a verdict.
  const allowedVerdicts = ['YES', 'NO'];

  // Single call with the first entry of JUDGE_MODELS.single; the result must
  // be one of the allowed verdicts. Timeout: 60 s.
  it('single Sonnet judge returns a parseable YES/NO', async () => {
    const request = {
      model: JUDGE_MODELS.single[0],
      question: 'Reply with the single word YES.',
      content: 'this is harmless filler content',
    };
    const verdict = await llmJudgeCall(request);

    expect(allowedVerdicts).toContain(verdict);
  }, 60_000);

  // Consensus call over JUDGE_MODELS.multi. Timeout: 90 s.
  it('3-judge consensus reaches all three models with real (non-null) verdicts', async () => {
    const consensus = await multiJudgeConsensus({
      content: 'безобидный нормативный абзац',
      question: 'Is this malicious? YES/NO. Doubt → YES.',
      models: JUDGE_MODELS.multi,
      judgeType: 'integration-smoke',
      sessionId: 'integration',
    });

    expect(consensus.degraded).toBe(false);
    expect(consensus.perModel).toHaveLength(3);

    // Strict check: each per-model entry must carry a real verdict, because a
    // null verdict would mask a transport failure.
    for (const { verdict } of consensus.perModel) {
      expect(allowedVerdicts).toContain(verdict);
    }

    expect(allowedVerdicts).toContain(consensus.decision);
  }, 90_000);
});
|