2476dd3c1b
PII filter previously covered only RU phone, email, Sentry, OpenAI token, and generic Bearer. Several common surface leaks were uncovered: - JWT tokens (eyJ<base64>.<base64>.<base64>) — auth/session tokens. - AWS access key IDs (AKIA<16 alphanum>) — IAM static creds. - Yandex Cloud IAM static keys (AQVN<base64>), session tokens (t1.<base64>), OAuth tokens (y0_<base64>) — primary cloud-provider for this project. - IPv4 addresses (dotted-quad) — over-redacts 4-segment build numbers as an accepted tradeoff (under-redaction is the worse failure). - Windows user-paths (C:\Users\<name>) → C:\Users\***. Otherwise the OS username `Administrator` leaks via task_size.files in every episode. - POSIX /home/<name>/ → /home/***/. Same rationale for Linux dev hosts. Pattern order: highly-specific token patterns (JWT/AWS/YC) run BEFORE OPENAI_TOKEN/GENERIC_BEARER fallbacks; otherwise partial overlaps would strip the wrong segments. Tests: 9 new (each new pattern + idempotency over the expanded redaction markers). 27/27 PII tests green. .gitleaks.toml: added the test fixture to the path allowlist — the file contains synthetic JWT/AWS/Yandex tokens (the filter is supposed to redact them), not real secrets. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
70 lines
2.9 KiB
JavaScript
70 lines
2.9 KiB
JavaScript
/**
 * PII filter for brain governance observer (B2).
 * Used by Stop-hook before JSONL write — per Pravila §16.2 + ADR-011 + spec §5.4.
 *
 * Patterns covered:
 *   RU_PHONE       — +7XXXXXXXXXX (10 digits after +7)
 *   EMAIL          — any user@domain.tld
 *   JWT            — eyJ<base64>.<base64>.<base64> (must run BEFORE OPENAI/Bearer
 *                    fallbacks to avoid partial matches)
 *   AWS_KEY        — AKIA[A-Z0-9]{16} (Access Key ID prefix)
 *   YC_STATIC      — AQVN[\w-]{15,} (Yandex Cloud IAM static key)
 *   YC_SESSION     — t1.<base64>[.<base64 signature>] (Yandex IAM session token;
 *                    the signature segment is redacted together with the body)
 *   YC_OAUTH       — y0_<base64> (Yandex OAuth token)
 *   SENTRY_TOKEN   — sntry<optional type letter>_<12+ base64/underscore chars>
 *                    (covers the whole sntrys_<payload>_<secret> token, not just
 *                    the payload, and sntryu_ user tokens)
 *   OPENAI_TOKEN   — sk-<20+ alphanum>, or sk-<prefix>-<20+ chars of [A-Za-z0-9_-]>
 *                    (e.g. sk-proj-…)
 *   GENERIC_BEARER — Bearer <20+ token chars>
 *   IPV4           — dotted-quad (over-redacts 4-segment build numbers — accepted
 *                    tradeoff; under-redaction is the worse failure)
 *   WIN_USER_PATH  — <drive>:\Users\<name> → <drive>:\Users\*** (any drive letter,
 *                    either slash, repeated/escaped separators, users or Users)
 *   POSIX_HOME     — /home/<name> → /home/***
 *
 * Security Guidance #40: pure regex — no exec/execSync.
 */

const RU_PHONE = /\+7\d{10}/g;
const EMAIL = /[\w.+-]+@[\w-]+\.[\w.-]+/g;
const JWT = /\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b/g;
const AWS_KEY = /\bAKIA[A-Z0-9]{16}\b/g;
const YC_STATIC = /\bAQVN[A-Za-z0-9_-]{15,}\b/g;
// First alternative swallows the optional "=" padding plus the ".signature"
// segment; the bare \b alternative keeps the single-segment match as a fallback.
const YC_SESSION = /\bt1\.[A-Za-z0-9_-]{40,}(?:={0,2}\.[A-Za-z0-9_-]{40,}={0,2}|\b)/g;
const YC_OAUTH = /\by0_[A-Za-z0-9_-]{40,}\b/g;
// The body class includes "_", "-", "+", "/" and "=" so the match does not stop
// at the underscore (or base64 padding) that separates payload and secret.
const SENTRY_TOKEN = /sntry[a-z]?_[A-Za-z0-9_+\/=-]{12,}/g;
// Prefixed form is tried first so the whole key is consumed; its leading \b
// keeps words that merely contain "sk-" (disk-…, task-…) from matching.
const OPENAI_TOKEN = /\bsk-[a-z][a-z0-9]*-[A-Za-z0-9_-]{20,}|sk-[A-Za-z0-9]{20,}/g;
const GENERIC_BEARER = /Bearer\s+[A-Za-z0-9._-]{20,}/g;
const IPV4 = /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g;
// Group 1 (drive + "Users" + separators) is kept by the "$1***" replacement.
// "[\\/]+" tolerates doubled separators as found in JSON-escaped paths.
const WIN_USER_PATH = /([A-Za-z]:[\\/]+[Uu]sers[\\/]+)[^\\/\s"'<>:|?*]+/g;
const POSIX_HOME = /(\/home\/)[^/\s"'<>:?*]+/g;
|
|
|
|
/**
 * Redact PII and credential-like substrings from a single string.
 *
 * Non-string arguments are returned untouched. For strings, every redaction
 * pattern is applied in a fixed order and the fully redacted string is returned.
 *
 * @param {*} s - Value to redact.
 * @returns {*} The redacted string, or `s` itself when it is not a string.
 */
function sanitizeString(s) {
  if (typeof s !== 'string') {
    return s;
  }

  // Ordered [pattern, replacement] pairs. Order matters: the provider-specific
  // token shapes (JWT / AWS / Yandex / Sentry) are applied ahead of the
  // OPENAI_TOKEN and GENERIC_BEARER fallbacks so the fallbacks cannot claim
  // part of a more specific token.
  const steps = [
    [RU_PHONE, '+7XXXXXXXXXX'],
    [EMAIL, '***@***'],
    [JWT, '[REDACTED:jwt]'],
    [AWS_KEY, '[REDACTED:aws]'],
    [YC_STATIC, '[REDACTED:yandex]'],
    [YC_SESSION, '[REDACTED:yandex]'],
    [YC_OAUTH, '[REDACTED:yandex]'],
    [SENTRY_TOKEN, '[REDACTED:sentry]'],
    [OPENAI_TOKEN, '[REDACTED:openai]'],
    [GENERIC_BEARER, '[REDACTED:bearer]'],
    [IPV4, '[REDACTED:ipv4]'],
    // "$1" re-emits the captured path prefix; only the user name becomes "***".
    [WIN_USER_PATH, '$1***'],
    [POSIX_HOME, '$1***'],
  ];

  return steps.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    s,
  );
}
|
|
|
|
/**
 * Recursively redact PII from an arbitrary JSON-like value.
 *
 * - strings are passed through `sanitizeString`;
 * - `null` / `undefined` are returned as-is;
 * - arrays are mapped element-wise into a new array;
 * - any other object is rebuilt as a new plain object from its own enumerable
 *   string-keyed properties (keys are copied verbatim, values are sanitized);
 * - every other value (number, boolean, function, …) is returned unchanged.
 *
 * The input is not mutated. Recursion is unbounded, so cyclic structures are
 * not supported.
 *
 * @param {*} input - Value to sanitize.
 * @returns {*} A sanitized copy for strings/arrays/objects, otherwise `input`.
 */
export function sanitize(input) {
  if (typeof input === 'string') return sanitizeString(input);
  if (input === null || input === undefined) return input;
  if (Array.isArray(input)) return input.map((item) => sanitize(item));
  if (typeof input === 'object') {
    // Object.fromEntries defines each key as an own data property, so a key
    // literally named "__proto__" (e.g. produced by JSON.parse) is kept in the
    // output. Plain `out[k] = v` on `{}` would instead invoke the prototype
    // setter: the key would vanish and the result's prototype would change.
    return Object.fromEntries(
      Object.entries(input).map(([key, value]) => [key, sanitize(value)]),
    );
  }
  return input;
}
|