/**
 * PII filter for brain governance observer (B2).
 * Used by Stop-hook before JSONL write — per Pravila §16.2 + ADR-011 + spec §5.4.
 *
 * Patterns covered (applied in exactly this order):
 *   RU_PHONE       — +7XXXXXXXXXX OR bare 7XXXXXXXXXX (11 digits starting with 7,
 *                    word-boundary on left). Real-leak regression (gitleaks
 *                    2026-05-23): bare format slipped past `\+7\d{10}`.
 *   EMAIL          — any user@domain.tld
 *   JWT            — eyJ<header>.<payload>.<signature> (must run BEFORE the
 *                    OPENAI/Bearer fallbacks to avoid partial matches)
 *   AWS_KEY        — AKIA[A-Z0-9]{16} (Access Key ID prefix)
 *   YC_STATIC      — AQVN[\w-]{15,} (Yandex Cloud IAM static key)
 *   YC_SESSION     — t1.<40+ token chars> (Yandex IAM session token)
 *   YC_OAUTH       — y0_<40+ token chars> (Yandex OAuth token)
 *   SENTRY_TOKEN   — sntrys?_<12+ alphanum>
 *   OPENAI_TOKEN   — sk-<20+ alphanum>, or sk-proj- / sk-svcacct- / sk-admin-
 *                    followed by 20+ token chars (those key bodies may contain
 *                    `_` and `-`, which the plain form does not accept)
 *   GENERIC_BEARER — Bearer <20+ token chars>
 *   IPV4           — dotted-quad (over-redacts 4-segment build numbers — accepted
 *                    tradeoff; under-redaction is the worse failure)
 *   WIN_USER_PATH  — C:\Users\<name> → C:\Users\*** (forward slashes and
 *                    repeated/JSON-escaped separators are accepted too)
 *   POSIX_HOME     — /home/<name> → /home/***
 *
 * Security Guidance #40: pure regex — no exec/execSync.
 */

const RU_PHONE = /(?:\+7|\b7)\d{10}/g;
const EMAIL = /[\w.+-]+@[\w-]+\.[\w.-]+/g;
const JWT = /\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b/g;
const AWS_KEY = /\bAKIA[A-Z0-9]{16}\b/g;
const YC_STATIC = /\bAQVN[A-Za-z0-9_-]{15,}\b/g;
const YC_SESSION = /\bt1\.[A-Za-z0-9_-]{40,}\b/g;
const YC_OAUTH = /\by0_[A-Za-z0-9_-]{40,}\b/g;
const SENTRY_TOKEN = /sntrys?_[A-Za-z0-9]{12,}/g;
// The prefixed alternative is tried first so that the `-`/`_` characters inside
// a prefixed key body are consumed; the second alternative is the legacy form.
const OPENAI_TOKEN = /sk-(?:(?:proj|svcacct|admin)-[A-Za-z0-9_-]{20,}|[A-Za-z0-9]{20,})/g;
const GENERIC_BEARER = /Bearer\s+[A-Za-z0-9._-]{20,}/g;
const IPV4 = /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g;
// `[\\/]+` (one OR MORE separators) so that paths embedded in JSON-escaped text
// (`C:\\Users\\name`, i.e. doubled backslashes) are redacted as well.
const WIN_USER_PATH = /([Cc]:[\\/]+Users[\\/]+)[^\\/\s"'<>:|?*]+/g;
const POSIX_HOME = /(\/home\/)[^/\s"'<>:?*]+/g;

/**
 * Ordered redaction table: [pattern name, regex, replacement].
 * Order matters: the highly-specific token patterns (JWT, AWS, Yandex, Sentry,
 * OpenAI) come before GENERIC_BEARER, which would otherwise eat them.
 */
const REDACTION_RULES = [
  ['RU_PHONE', RU_PHONE, '+7XXXXXXXXXX'],
  ['EMAIL', EMAIL, '***@***'],
  ['JWT', JWT, '[REDACTED:jwt]'],
  ['AWS_KEY', AWS_KEY, '[REDACTED:aws]'],
  ['YC_STATIC', YC_STATIC, '[REDACTED:yandex]'],
  ['YC_SESSION', YC_SESSION, '[REDACTED:yandex]'],
  ['YC_OAUTH', YC_OAUTH, '[REDACTED:yandex]'],
  ['SENTRY_TOKEN', SENTRY_TOKEN, '[REDACTED:sentry]'],
  ['OPENAI_TOKEN', OPENAI_TOKEN, '[REDACTED:openai]'],
  ['GENERIC_BEARER', GENERIC_BEARER, '[REDACTED:bearer]'],
  ['IPV4', IPV4, '[REDACTED:ipv4]'],
  ['WIN_USER_PATH', WIN_USER_PATH, '$1***'],
  ['POSIX_HOME', POSIX_HOME, '$1***'],
];

/**
 * Redact one string by applying every rule in REDACTION_RULES order.
 *
 * @param {*} s - value to redact; non-strings are returned unchanged.
 * @param {Object<string, number>|null} [counts] - optional tally, keyed by
 *   pattern name, incremented by the number of matches each rule replaces.
 * @returns {*} the redacted string (or the untouched non-string value).
 */
function sanitizeString(s, counts) {
  if (typeof s !== 'string') return s;
  let text = s;
  for (const [name, pattern, replacement] of REDACTION_RULES) {
    if (counts) {
      // Count against the text as it stands right before this rule runs, so
      // the tally equals the number of replacements this rule performs.
      // String.prototype.match/replace reset lastIndex on /g regexes, so the
      // shared module-level regex objects are safe to reuse.
      const hits = text.match(pattern);
      if (hits) counts[name] += hits.length;
    }
    text = text.replace(pattern, replacement);
  }
  return text;
}

/**
 * Recursively redact strings inside arrays and objects.
 * null/undefined and other non-string scalars are returned as-is; arrays and
 * objects are rebuilt (the input is not mutated). Object keys are kept verbatim.
 *
 * @param {*} value - value to walk.
 * @param {Object<string, number>|null} counts - optional tally, see sanitizeString.
 * @returns {*} redacted copy of the value.
 */
function redactValue(value, counts) {
  if (typeof value === 'string') return sanitizeString(value, counts);
  if (value === null || value === undefined) return value;
  if (Array.isArray(value)) return value.map((item) => redactValue(item, counts));
  if (typeof value === 'object') {
    // Object.fromEntries defines own data properties, so an own "__proto__"
    // key (e.g. from JSON.parse) is preserved as data instead of being routed
    // through the __proto__ setter and vanishing from the output.
    return Object.fromEntries(
      Object.entries(value).map(([key, child]) => [key, redactValue(child, counts)]),
    );
  }
  return value;
}

/**
 * Sanitize input AND count matches per pattern type, in a single traversal.
 * Returns { sanitized, counts: { PATTERN_NAME: N, ... } }.
 * counts is pre-initialised to 0 for all 13 known patterns, and each entry is
 * the number of redactions that pattern actually performed (a secret consumed
 * by an earlier, more specific pattern is not counted again by a later one).
 *
 * @param {*} input - string, array, object or scalar to sanitize.
 * @returns {{sanitized: *, counts: Object<string, number>}}
 */
export function sanitizeWithCount(input) {
  const counts = {};
  for (const [name] of REDACTION_RULES) counts[name] = 0;
  const sanitized = redactValue(input, counts);
  return { sanitized, counts };
}

/**
 * Return a redacted copy of input (strings, and strings nested in arrays/objects).
 * Takes a single parameter on purpose so it stays safe to pass directly as an
 * array-method callback (extra callback arguments are ignored).
 *
 * @param {*} input - string, array, object or scalar to sanitize.
 * @returns {*} redacted copy; non-string scalars are returned unchanged.
 */
export function sanitize(input) {
  return redactValue(input, null);
}