feat(router): UTF-8 safe stdin helper for three hooks
StringDecoder correctly assembles multi-byte characters (e.g. Cyrillic) across stdin chunk boundaries. Fixes a Windows Node quirk where Russian prompts were turned into mojibake before being sent to the Anthropic API (Layer 2 escalation). Stage 3 follow-up fix 1/3 (helper). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* UTF-8 safe stdin reader for hooks.
|
||||
* Fixes Windows Node stdin quirk: default `for await (chunk of stdin)` interprets
|
||||
* chunks as Buffer, and `input += chunk` calls .toString() which uses utf-8 BUT
|
||||
* fails on chunk boundaries that fall inside multi-byte sequences (e.g. cyrillic
|
||||
* 2-byte chars split across chunks).
|
||||
*
|
||||
* Uses StringDecoder to handle multi-byte chars across chunks correctly.
|
||||
*/
|
||||
|
||||
import { StringDecoder } from 'string_decoder';
|
||||
|
||||
/**
 * Drain an async-iterable stream and return its contents as one UTF-8 string.
 *
 * One StringDecoder instance is kept for the whole read so that a multi-byte
 * sequence split between two chunks is held back until its remaining bytes
 * arrive, rather than each chunk being decoded on its own.
 *
 * @param {AsyncIterable<Buffer>} stdin - Stream to consume (e.g. `process.stdin`).
 * @returns {Promise<string>} The decoded text; `''` when the stream yields nothing.
 */
export async function readStdinAsUtf8(stdin) {
  const utf8 = new StringDecoder('utf-8');
  const pieces = [];
  for await (const bytes of stdin) {
    pieces.push(utf8.write(bytes));
  }
  // end() flushes whatever the decoder is still holding from the last chunk.
  pieces.push(utf8.end());
  return pieces.join('');
}
|
||||
@@ -0,0 +1,40 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { readStdinAsUtf8 } from './router-stdin-helper.mjs';
|
||||
|
||||
/**
 * Test fixture: expose an iterable of chunks as an async iterable, yielding
 * each chunk in order, so it can stand in for a stdin stream.
 *
 * @param {Iterable<Buffer>} buffers - Chunks to emit.
 */
async function* fromBuffers(buffers) {
  yield* buffers;
}
|
||||
|
||||
describe('readStdinAsUtf8', () => {
  // Feed `bytes` to the helper as exactly two chunks, cut at byte offset `cut`.
  const readInTwoChunks = (bytes, cut) =>
    readStdinAsUtf8(fromBuffers([bytes.subarray(0, cut), bytes.subarray(cut)]));

  it('decodes UTF-8 cyrillic correctly across chunk boundaries', async () => {
    const expected = 'посмотри сторожа достаточно ему информации?';
    const encoded = Buffer.from(expected, 'utf-8');
    // Each Cyrillic letter here encodes to 2 bytes, so the odd offset 9 falls
    // between the two bytes of the fifth letter of 'посмотри'.
    const cut = 9;
    const decoded = await readInTwoChunks(encoded, cut);
    expect(decoded).toBe(expected);
  });

  it('handles ASCII without modification', async () => {
    const expected = 'hello world';
    const decoded = await readStdinAsUtf8(fromBuffers([Buffer.from(expected)]));
    expect(decoded).toBe(expected);
  });

  it('returns empty string on empty stream', async () => {
    const decoded = await readStdinAsUtf8(fromBuffers([]));
    expect(decoded).toBe('');
  });

  it('does NOT mangle byte-level concatenation (regression guard)', async () => {
    // Regression being guarded: `for await (const c of stdin) input += c`
    // stringifies every Buffer chunk on its own, so a character whose bytes
    // straddle two chunks is decoded from incomplete halves. The helper has
    // to carry the partial sequence over via a StringDecoder instead.
    const expected = 'тест';
    const encoded = Buffer.from(expected, 'utf-8');
    // Offset 1 separates the two bytes of the leading 'т'.
    const decoded = await readInTwoChunks(encoded, 1);
    expect(decoded).toBe(expected);
  });
});
|
||||
Reference in New Issue
Block a user