/**
 * UTF-8 safe stdin reader for hooks.
 *
 * A stream hands over its bytes in arbitrarily sized chunks, so a chunk
 * boundary can land inside a multi-byte UTF-8 sequence (for example between
 * the two bytes of a Cyrillic letter). Stringifying every chunk on its own —
 * which is what `input += chunk` does through `Buffer.prototype.toString()` —
 * decodes each half of the split character separately and yields U+FFFD
 * replacement characters instead of the original letter.
 *
 * `StringDecoder` keeps the incomplete tail of one chunk and completes it with
 * the head of the next, so the text survives any chunking.
 */

import { StringDecoder } from 'string_decoder';

/**
 * Reads a stream to its end and returns everything it produced, decoded as
 * UTF-8.
 *
 * @param {AsyncIterable<Buffer|Uint8Array|string>|Iterable<Buffer|Uint8Array|string>} [stdin=process.stdin]
 *   Source of chunks. Anything `for await` can iterate works: a Node readable
 *   stream, an async generator, or a plain array of buffers.
 * @returns {Promise<string>} The decoded text; `''` when the source yields
 *   nothing.
 */
export async function readStdinAsUtf8(stdin = process.stdin) {
  const decoder = new StringDecoder('utf-8');
  let out = '';
  for await (const chunk of stdin) {
    // write() returns only the characters that are complete so far and holds
    // back the trailing bytes of an unfinished sequence.
    out += decoder.write(chunk);
  }
  // Flush whatever is still held back; a sequence that never completed is
  // emitted as a substitution character rather than dropped silently.
  out += decoder.end();
  return out;
}