From 8e5eaecf6aa9eb4a12232c88018c6fbd542e3b4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9?=
Date: Wed, 20 May 2026 12:40:41 +0300
Subject: [PATCH] =?UTF-8?q?feat(observer):=20Task=202=20=E2=80=94=20extrac?=
 =?UTF-8?q?tTokenUsage=20+=20task=5Fcost=20in=20parseTranscript?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- export extractTokenUsage(turn): sums input/output/cache/iterations/
  web_search/web_fetch across all assistant messages in a turn
- extractTokenUsage counts only finite numbers (a string counter no longer
  turns the sum into string concatenation) and tolerates a non-array turn
- parseTranscript now includes task_cost field (zero-filled when no usage)
- 7 new tests (5 unit + 2 integration); total 248/248 GREEN
- V2_FIELDS in observer-stop-hook.mjs NOT changed (backward compat)
---
 tools/observer-transcript-parser.mjs      | 59 ++++++++++++++++
 tools/observer-transcript-parser.test.mjs | 83 +++++++++++++++++++++++
 2 files changed, 142 insertions(+)

diff --git a/tools/observer-transcript-parser.mjs b/tools/observer-transcript-parser.mjs
index f7ae582b..198ebc9c 100644
--- a/tools/observer-transcript-parser.mjs
+++ b/tools/observer-transcript-parser.mjs
@@ -240,6 +240,64 @@ export function extractTaskSize(turn) {
   return { tool_calls, files_touched: files.size, files: [...files] };
 }
 
+/**
+ * Token-usage aggregation across all assistant messages in the turn.
+ *
+ * DESIGN: returns zero-filled object (NOT null) when no `usage` data was
+ * captured. Consumers cannot currently distinguish "actually 0 tokens" from
+ * "no usage data" — accepted trade-off because (a) every assistant message
+ * in real Claude Code transcripts has `usage` (verified B1 brain-retro
+ * 2026-05-20: 6265/6265 messages with usage, 0 partial-stream), and
+ * (b) `task_cost` is not yet read by analyzer/STATUS.md, so the semantic
+ * gap is a future-only concern. Re-evaluate when factor matrix adds cost.
+ *
+ * Captures: 4 base token fields + `iterations` (extended-thinking detector)
+ *   + `server_tool_use.{web_search,web_fetch}_requests` counts.
+ * Other usage fields (cache_creation object, inference_geo, service_tier,
+ * speed) — out-of-scope for current analyzer.
+ *
+ * Defensive: skips entries where `usage` is not a plain object (handles
+ * malformed transcript edge cases like `"usage": 42`), and counts only
+ * finite numbers — a string such as `"5"` would otherwise turn `+=` into
+ * string concatenation and corrupt every later sum. A non-array `turn`
+ * yields the zero-filled result instead of throwing.
+ *
+ * @param {Array<object>|null|undefined} turn - transcript entries of one turn
+ * @returns {{input_tokens: number, output_tokens: number,
+ *   cache_read_input_tokens: number, cache_creation_input_tokens: number,
+ *   web_search_requests: number, web_fetch_requests: number,
+ *   iterations: number}} summed counters, all zero when nothing was captured
+ */
+export function extractTokenUsage(turn) {
+  // Malformed counters (string, boolean, NaN, Infinity, object) count as 0.
+  const num = (v) => (typeof v === 'number' && Number.isFinite(v) ? v : 0);
+  const isRecord = (v) => v !== null && typeof v === 'object' && !Array.isArray(v);
+  const totals = {
+    input_tokens: 0,
+    output_tokens: 0,
+    cache_read_input_tokens: 0,
+    cache_creation_input_tokens: 0,
+    web_search_requests: 0,
+    web_fetch_requests: 0,
+    iterations: 0,
+  };
+  for (const e of Array.isArray(turn) ? turn : []) {
+    const u = e && e.message && e.message.usage;
+    if (!isRecord(u)) continue;
+    totals.input_tokens += num(u.input_tokens);
+    totals.output_tokens += num(u.output_tokens);
+    totals.cache_read_input_tokens += num(u.cache_read_input_tokens);
+    totals.cache_creation_input_tokens += num(u.cache_creation_input_tokens);
+    totals.iterations += num(u.iterations);
+    const tools = u.server_tool_use;
+    if (isRecord(tools)) {
+      totals.web_search_requests += num(tools.web_search_requests);
+      totals.web_fetch_requests += num(tools.web_fetch_requests);
+    }
+  }
+  return totals;
+}
+
 /** Classify the opening user-prompt sentiment (per spec §6 / gap-resolution 1). */
 export function classifyPromptSignal(text) {
   const t = String(text || '').toLowerCase().trim();
@@ -454,6 +512,7 @@ export function parseTranscript(transcriptText, fallbackSessionId = null) {
     decision_provenance,
     environment: extractEnvironment(entries, start),
     task_size: extractTaskSize(turn),
+    task_cost: extractTokenUsage(turn),
     primary_rationale: {
       step: 1,
       node_chosen: skills.length > 0 ? skills[0] : 'direct',
diff --git a/tools/observer-transcript-parser.test.mjs b/tools/observer-transcript-parser.test.mjs
index 5d9a23c9..1482030a 100644
--- a/tools/observer-transcript-parser.test.mjs
+++ b/tools/observer-transcript-parser.test.mjs
@@ -8,6 +8,7 @@ import {
   parseRoutingTag,
   extractLastUserPromptText,
   classifyTask,
+  extractTokenUsage,
 } from './observer-transcript-parser.mjs';
 
 // Build a JSONL transcript string from entry objects.
@@ -920,3 +921,85 @@ describe('classifyTask — extended dictionary (Task 1)', () => {
     expect(classifyTask('почини баг в logger')).toBe('bugfix');
   });
 });
+
+describe('extractTokenUsage (Task 2)', () => {
+  it('sums input/output/cache fields across multiple assistant messages', () => {
+    const turn = [
+      { message: { usage: { input_tokens: 10, output_tokens: 5, cache_read_input_tokens: 100, cache_creation_input_tokens: 50 } } },
+      { message: { usage: { input_tokens: 8, output_tokens: 3, cache_read_input_tokens: 80, cache_creation_input_tokens: 20 } } },
+    ];
+    expect(extractTokenUsage(turn)).toEqual({
+      input_tokens: 18, output_tokens: 8, cache_read_input_tokens: 180,
+      cache_creation_input_tokens: 70, web_search_requests: 0, web_fetch_requests: 0, iterations: 0,
+    });
+  });
+  it('captures server_tool_use bonus fields (web_search/web_fetch)', () => {
+    const turn = [
+      { message: { usage: { input_tokens: 5, output_tokens: 2, server_tool_use: { web_search_requests: 3, web_fetch_requests: 1 } } } },
+    ];
+    const result = extractTokenUsage(turn);
+    expect(result.web_search_requests).toBe(3);
+    expect(result.web_fetch_requests).toBe(1);
+  });
+  it('captures iterations (extended-thinking detector)', () => {
+    const turn = [
+      { message: { usage: { input_tokens: 100, output_tokens: 50, iterations: 4 } } },
+    ];
+    expect(extractTokenUsage(turn).iterations).toBe(4);
+  });
+  it('returns zero-filled object when no usage present', () => {
+    const turn = [
+      { message: {} },
+      { message: { usage: null } },
+    ];
+    expect(extractTokenUsage(turn)).toEqual({
+      input_tokens: 0, output_tokens: 0, cache_read_input_tokens: 0,
+      cache_creation_input_tokens: 0, web_search_requests: 0, web_fetch_requests: 0, iterations: 0,
+    });
+  });
+  it('handles empty/null turn safely', () => {
+    expect(extractTokenUsage([])).toEqual({
+      input_tokens: 0, output_tokens: 0, cache_read_input_tokens: 0,
+      cache_creation_input_tokens: 0, web_search_requests: 0, web_fetch_requests: 0, iterations: 0,
+    });
+    expect(extractTokenUsage(null)).toEqual({
+      input_tokens: 0, output_tokens: 0, cache_read_input_tokens: 0,
+      cache_creation_input_tokens: 0, web_search_requests: 0, web_fetch_requests: 0, iterations: 0,
+    });
+  });
+  it('safely skips entries where usage is a non-object primitive (defensive guard)', () => {
+    const turn = [
+      { message: { usage: 42 } }, // malformed — usage as primitive
+      { message: { usage: { input_tokens: 5, output_tokens: 3 } } },
+    ];
+    const r = extractTokenUsage(turn);
+    expect(r.input_tokens).toBe(5);
+    expect(r.output_tokens).toBe(3);
+  });
+});
+
+describe('parseTranscript — task_cost integration (Task 2)', () => {
+  it('attaches task_cost to a v2 episode', () => {
+    const lines = [
+      JSON.stringify({ type: 'user', message: { role: 'user', content: [{ type: 'text', text: 'implement feature X' }] } }),
+      JSON.stringify({ type: 'assistant', message: { role: 'assistant', content: [{ type: 'text', text: 'done' }], usage: { input_tokens: 42, output_tokens: 7 } } }),
+    ];
+    const result = parseTranscript(lines.join('\n'));
+    expect(result).not.toBeNull();
+    expect(result.task_cost).toBeDefined();
+    expect(result.task_cost.input_tokens).toBe(42);
+    expect(result.task_cost.output_tokens).toBe(7);
+  });
+  it('attaches zero-filled task_cost when no usage in transcript', () => {
+    const lines = [
+      JSON.stringify({ type: 'user', message: { role: 'user', content: [{ type: 'text', text: 'do something' }] } }),
+      JSON.stringify({ type: 'assistant', message: { role: 'assistant', content: [{ type: 'text', text: 'ok' }] } }),
+    ];
+    const result = parseTranscript(lines.join('\n'));
+    expect(result).not.toBeNull();
+    expect(result.task_cost).toEqual({
+      input_tokens: 0, output_tokens: 0, cache_read_input_tokens: 0,
+      cache_creation_input_tokens: 0, web_search_requests: 0, web_fetch_requests: 0, iterations: 0,
+    });
+  });
+});