397777089e
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
51 lines
2.2 KiB
JavaScript
51 lines
2.2 KiB
JavaScript
// tools/judge-evaluator.test.mjs
|
|
import { describe, it, expect } from 'vitest';
|
|
import { evaluatePostfactum } from './judge-evaluator.mjs';
|
|
|
|
/**
 * Suite for `evaluatePostfactum` (D31, §9.4): earlier GO verdicts are compared
 * with the problems that surfaced later on.
 *
 * Each case feeds a `{ verdicts, surfaced, threshold? }` object and inspects the
 * returned `misses`, `totalGo`, `missRate`, `flag` and `bySource` fields.
 */
describe('evaluatePostfactum (D31, §9.4): прошлые GO vs что вылезло ниже', () => {
  // Fixture builder; key order (verdict_id, decision, source) mirrors the
  // literal records the cases are specified with.
  const makeVerdict = (verdict_id, decision = 'GO', source = 'judge') => ({
    verdict_id,
    decision,
    source,
  });

  // Nothing surfaced: no misses are counted and the flag stays down.
  it('нет всплывших проблем → промахов нет, флага нет', () => {
    const input = {
      verdicts: [makeVerdict('v1')],
      surfaced: [],
    };

    const outcome = evaluatePostfactum(input);

    expect(outcome.misses).toBe(0);
    expect(outcome.flag).toBe(false);
  });

  // One of two GO verdicts is blamed by a surfaced problem: 1 miss out of 2 GO,
  // rate 0.5, which raises the flag with a 0.4 threshold.
  it('проблема вылезла ниже и её пропустил GO-вердикт → промах', () => {
    const input = {
      verdicts: [makeVerdict('v1'), makeVerdict('v2')],
      surfaced: [{ missed_by: 'v1', where: 'gate3' }],
      threshold: 0.4,
    };

    const outcome = evaluatePostfactum(input);

    expect(outcome.misses).toBe(1);
    expect(outcome.totalGo).toBe(2);
    expect(outcome.missRate).toBeCloseTo(0.5);
    expect(outcome.flag).toBe(true);
  });

  // A surfaced problem pointing at a NO-GO verdict must not count as a miss.
  it('проблема указывает на NO-GO вердикт (судья поймал) → не промах того вердикта', () => {
    const input = {
      verdicts: [makeVerdict('v1', 'NO-GO')],
      surfaced: [{ missed_by: 'v1', where: 'owner' }],
    };

    const outcome = evaluatePostfactum(input);

    expect(outcome.misses).toBe(0);
  });

  // Misses are reported per verdict source: one for `judge`, one for `router`.
  it('разбивка по источнику (судья vs роутер)', () => {
    const input = {
      verdicts: [makeVerdict('j1', 'GO', 'judge'), makeVerdict('r1', 'GO', 'router')],
      surfaced: [{ missed_by: 'j1' }, { missed_by: 'r1' }],
      threshold: 0.9,
    };

    const outcome = evaluatePostfactum(input);
    const perSource = outcome.bySource;

    expect(perSource.judge.misses).toBe(1);
    expect(perSource.router.misses).toBe(1);
  });

  // One miss among ten GO verdicts gives a 0.1 rate, below the 0.2 threshold.
  it('ниже порога → флага нет', () => {
    const verdicts = Array.from({ length: 10 }, (_, index) => makeVerdict(`v${index}`));

    const outcome = evaluatePostfactum({
      verdicts,
      surfaced: [{ missed_by: 'v0' }],
      threshold: 0.2,
    });

    expect(outcome.missRate).toBeCloseTo(0.1);
    expect(outcome.flag).toBe(false);
  });
});
|