Files
portal/tools/observer-pii-filter.mjs
T
Дмитрий 11822e3803 fix(observer): RU_PHONE regex catches bare 7XXXXXXXXXX (DO-PII-1)
Bug: gitleaks (rule `ru-phone-unmasked`) caught `79135191264` in 3 lines
of docs/observer/episodes-2026-05.jsonl during brain-retro #3 push
(963379c3). Stop-hook PII-filter was not masking bare-format Russian
phone numbers (without the `+` prefix).

Root cause:
  const RU_PHONE = /\+7\d{10}/g;   // requires literal '+7'

Free-text observer episodes captured phone `79135191264` in field-value
context (`call client 79135191264` / `phone 79135191264 in payload`),
slipping past the existing filter.

Fix:
  const RU_PHONE = /(?:\+7|\b7)\d{10}/g;

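The `\b7` branch catches bare format with a word-boundary on the left,
avoiding false-positives when the 7 sits inside a longer digit sequence
(timestamps, IDs, hashes). No right-hand boundary is enforced, so a digit
run that itself begins with 7 still has its first 11 digits masked.
False-positive guard verified via test:
  'id 1796133619135191264999 not a phone' → unchanged.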

TDD cycle:
  - RED: 3 new tests + 1 sanitizeWithCount test (4 fails on bare phone)
  - GREEN: regex extended, 24/24 file tests pass, 373/373 full tools
    suite GREEN (0 regressions across 18 files).

Cleanup: applied sanitize() to docs/observer/episodes-2026-05.jsonl;
11 lines touched (3 phone-leak lines + 8 with other PII patterns).
gitleaks now finds 0 leaks in the file.

Pravila §5.2 (no PII in commits) + 152-FZ (phone is regulated PD).
Closes DO-PII-1 (see memory observer-pii-leak-2026-05-23).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 12:26:24 +03:00

107 lines
4.2 KiB
JavaScript

/**
* PII filter for brain governance observer (B2).
* Used by Stop-hook before JSONL write — per Pravila §16.2 + ADR-011 + spec §5.4.
*
* Patterns covered:
* RU_PHONE — +7XXXXXXXXXX OR bare 7XXXXXXXXXX (11 digits starting with 7,
* word-boundary on left). Real-leak regression (gitleaks
* 2026-05-23): bare format slipped past `\+7\d{10}`.
* EMAIL — any user@domain.tld
* JWT — eyJ<base64>.<base64>.<base64> (must run BEFORE OPENAI/Bearer
* fallbacks to avoid partial matches)
* AWS_KEY — AKIA[A-Z0-9]{16} (Access Key ID prefix)
* YC_STATIC — AQVN[\w-]{15,} (Yandex Cloud IAM static key)
* YC_SESSION — t1.<base64> (Yandex IAM session token)
* YC_OAUTH — y0_<base64> (Yandex OAuth token)
* SENTRY_TOKEN — sntrys?_<12+ alphanum>
* OPENAI_TOKEN — sk-<20+ alphanum>
* GENERIC_BEARER — Bearer <20+ token chars>
* IPV4 — dotted-quad (over-redacts 4-segment build numbers — accepted
* tradeoff; under-redaction is the worse failure)
* WIN_USER_PATH — C:\Users\<name> → C:\Users\***
* POSIX_HOME — /home/<name> → /home/***
*
* Security Guidance #40: pure regex — no exec/execSync.
*/
// Russian phone: "+7" or a bare "7" sitting on a left word boundary, followed
// by exactly ten digits. No right-hand boundary is enforced.
const RU_PHONE = /(?:\+7|\b7)\d{10}/g;
// Anything shaped like local@domain.tld.
const EMAIL = /[\w.+-]+@[\w-]+\.[\w.-]+/g;
// Three dot-separated base64url segments (10+ chars each), the first starting
// with "eyJ".
const JWT = /\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b/g;
// "AKIA" plus exactly 16 uppercase letters/digits (AWS access key ID).
const AWS_KEY = /\bAKIA[A-Z0-9]{16}\b/g;
// "AQVN" plus 15+ base64url chars (Yandex Cloud static key).
const YC_STATIC = /\bAQVN[A-Za-z0-9_-]{15,}\b/g;
// "t1." plus 40+ base64url chars (Yandex IAM session token).
const YC_SESSION = /\bt1\.[A-Za-z0-9_-]{40,}\b/g;
// "y0_" plus 40+ base64url chars (Yandex OAuth token).
const YC_OAUTH = /\by0_[A-Za-z0-9_-]{40,}\b/g;
// "sntry_" or "sntrys_" plus 12+ alphanumerics.
const SENTRY_TOKEN = /sntrys?_[A-Za-z0-9]{12,}/g;
// "sk-" plus 20+ alphanumerics.
const OPENAI_TOKEN = /sk-[A-Za-z0-9]{20,}/g;
// "Bearer", whitespace, then 20+ chars from [A-Za-z0-9._-].
const GENERIC_BEARER = /Bearer\s+[A-Za-z0-9._-]{20,}/g;
// Four dot-separated groups of 1-3 digits; octet ranges are not validated.
const IPV4 = /\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g;
// "C:\Users\" (either slash, either case of the drive letter) is captured so
// it can be kept; the path segment that follows is what gets masked.
const WIN_USER_PATH = /([Cc]:[\\/]Users[\\/])[^\\/\s"'<>:|?*]+/g;
// "/home/" is captured and kept; the path segment that follows is masked.
const POSIX_HOME = /(\/home\/)[^/\s"'<>:?*]+/g;

/**
 * Mask every known PII / secret pattern inside a single string.
 *
 * Replacement order is significant:
 *   1. EMAIL and the token patterns run on the untouched text, most specific
 *      token shapes before the generic OPENAI / Bearer ones.
 *   2. RU_PHONE runs only after them. Its replacement text contains "+", which
 *      is outside every token character class and outside the EMAIL domain
 *      class; masking a phone-shaped digit run first could split a token or
 *      address in two so that the later pattern no longer matched and the
 *      remainder leaked.
 *   3. IPV4 and the home-directory patterns run last.
 *
 * @param {*} s - Value to scrub.
 * @returns {*} The scrubbed string, or `s` itself when it is not a string.
 */
function sanitizeString(s) {
  if (typeof s !== 'string') return s;
  return s
    .replace(EMAIL, '***@***')
    // Highly-specific token patterns first — they would otherwise be eaten
    // by GENERIC_BEARER / OPENAI_TOKEN partial matches.
    .replace(JWT, '[REDACTED:jwt]')
    .replace(AWS_KEY, '[REDACTED:aws]')
    .replace(YC_STATIC, '[REDACTED:yandex]')
    .replace(YC_SESSION, '[REDACTED:yandex]')
    .replace(YC_OAUTH, '[REDACTED:yandex]')
    .replace(SENTRY_TOKEN, '[REDACTED:sentry]')
    .replace(OPENAI_TOKEN, '[REDACTED:openai]')
    .replace(GENERIC_BEARER, '[REDACTED:bearer]')
    // Phones after tokens/e-mails so the "+7XXXXXXXXXX" marker can never
    // break up a secret that has not been redacted yet.
    .replace(RU_PHONE, '+7XXXXXXXXXX')
    .replace(IPV4, '[REDACTED:ipv4]')
    .replace(WIN_USER_PATH, '$1***')
    .replace(POSIX_HOME, '$1***');
}
// Name → regex table iterated when counting matches per pattern. Key order
// here is the key order of any object built from it.
const PATTERNS = {
  RU_PHONE,
  EMAIL,
  JWT,
  AWS_KEY,
  YC_STATIC,
  YC_SESSION,
  YC_OAUTH,
  SENTRY_TOKEN,
  OPENAI_TOKEN,
  GENERIC_BEARER,
  IPV4,
  WIN_USER_PATH,
  POSIX_HOME,
};
/**
 * Add, for every entry in PATTERNS, the number of matches found in `s` to the
 * running tally in `counts`.
 *
 * Each pattern is matched independently against the same raw text, so a
 * substring that satisfies several patterns is counted once under each.
 *
 * @param {*} s - Text to inspect; anything that is not a string is ignored.
 * @param {Object<string, number>} counts - Tally object, mutated in place.
 */
function countString(s, counts) {
  if (typeof s !== 'string') return;
  Object.keys(PATTERNS).forEach((label) => {
    const template = PATTERNS[label];
    // Work on a private copy so the shared regex object is never touched.
    const probe = new RegExp(template.source, template.flags);
    const found = s.match(probe);
    const hits = found ? found.length : 0;
    counts[label] = (counts[label] || 0) + hits;
  });
}
/**
 * Depth-first walk over a value, feeding every string it contains to
 * countString. Array elements and object property values are visited;
 * property names are not. Other primitives, null and undefined are skipped.
 *
 * @param {*} input - Value to traverse.
 * @param {Object<string, number>} counts - Tally object passed through to
 *   countString.
 */
function walkAndCount(input, counts) {
  if (typeof input === 'string') {
    countString(input, counts);
    return;
  }
  if (input === null || typeof input !== 'object') return;

  const children = Array.isArray(input) ? input : Object.values(input);
  for (const child of children) {
    walkAndCount(child, counts);
  }
}
/**
 * Scrub a value and report how many matches each pattern had in it.
 *
 * The tally starts at 0 for every key of PATTERNS, is filled from the
 * original (unsanitized) input, and is returned next to the sanitized copy.
 *
 * @param {*} input - String, array, object or primitive to process.
 * @returns {{ sanitized: *, counts: Object<string, number> }}
 */
export function sanitizeWithCount(input) {
  const counts = {};
  Object.keys(PATTERNS).forEach((name) => {
    counts[name] = 0;
  });

  walkAndCount(input, counts);
  const sanitized = sanitize(input);

  return { sanitized, counts };
}
/**
 * Recursively scrub PII from a JSON-like value without mutating it.
 *
 * - strings are passed through sanitizeString;
 * - arrays are mapped element by element;
 * - any other object is rebuilt as a plain object from its own enumerable
 *   properties, with each value sanitized (property names are copied as-is);
 * - null, undefined and all remaining primitives/functions are returned
 *   unchanged.
 *
 * The result object is assembled with Object.fromEntries rather than by
 * assigning onto `{}`: assignment would route an own "__proto__" key (as
 * produced by JSON.parse) through the prototype setter, silently dropping the
 * key and replacing the result's prototype.
 *
 * @param {*} input - Value to sanitize.
 * @returns {*} A sanitized copy for strings/arrays/objects, otherwise `input`.
 */
export function sanitize(input) {
  if (typeof input === 'string') return sanitizeString(input);
  if (input === null || typeof input !== 'object') return input;
  if (Array.isArray(input)) return input.map((item) => sanitize(item));
  return Object.fromEntries(
    Object.entries(input).map(([key, value]) => [key, sanitize(value)]),
  );
}