brain/tools/mcp-tool-classifier.mjs

#!/usr/bin/env node
/**
 * MCP tool classifier (router-gate v4 Stream C, spec §5.3 + v4.1 G1/G12).
 *
 * Classifies an MCP / built-in tool call against a path-deny / URL-whitelist /
 * SQL-statement overlay. Pure — path normalization & protected-path check are
 * injected (Stream A); LLM-judge for WebSearch query is flagged for the consumer
 * (Stream D). Unknown tools -> default 'block' (fail-CLOSE).
 */

// §5.3 + v4.1 G1/G12 classification map. Glob keys use `*`. `default` is the
// fallback category for unmatched tools.
//
// Each entry carries a `category` of 'read_only', 'hard_blacklist' or
// 'conditional'. Conditional entries name the argument to inspect via
// `args_key_to_scan` and attach the overlay that judges it (SQL statement scan,
// URL whitelist/blocklist, or an LLM-judge flag). All pattern strings are
// compiled case-insensitively by the classifier.
export const DEFAULT_MCP_CLASSIFICATION = Object.freeze({
  'mcp__redis__get': { category: 'read_only' },
  'mcp__redis__list': { category: 'read_only' },
  'mcp__redis__set': { category: 'hard_blacklist' },
  'mcp__redis__delete': { category: 'hard_blacklist' },
  'mcp__github__get_me': { category: 'read_only' },
  'mcp__github__list_*': { category: 'read_only' },
  'mcp__github__search_*': { category: 'read_only' },
  'mcp__github__pull_request_read': { category: 'read_only' },
  'mcp__github__issue_read': { category: 'read_only' },
  'mcp__laravel-boost__database-query': {
    category: 'conditional',
    args_key_to_scan: 'query',
    // v4.1 G12 — full-statement scan (mutating verb anywhere, not just prefix).
    query_full_statement_scan: {
      read_only_only_patterns: [
        '^\\s*(?:SELECT|EXPLAIN|SHOW|DESCRIBE|DESC|WITH\\s+\\w+\\s+AS\\s*\\(\\s*SELECT)\\b',
      ],
      blocked_anywhere_patterns: [
        '\\b(?:UPDATE|INSERT|DELETE|DROP|TRUNCATE|ALTER|CREATE|GRANT|REVOKE|COMMIT|ROLLBACK|MERGE|REPLACE|LOAD)\\b',
        ';\\s*(?:UPDATE|INSERT|DELETE|DROP|TRUNCATE|ALTER|CREATE|GRANT|REVOKE)\\b',
      ],
      comment_strip: true,
    },
  },
  'mcp__laravel-boost__*': { category: 'read_only', exception: 'database-query handled above' },
  'mcp__github__create_*': { category: 'hard_blacklist' },
  'mcp__github__update_*': { category: 'hard_blacklist' },
  'mcp__github__merge_*': { category: 'hard_blacklist' },
  'mcp__github__delete_*': { category: 'hard_blacklist' },
  'mcp__github__push_files': { category: 'hard_blacklist' },
  'mcp__github__create_or_update_file': { category: 'hard_blacklist', path_args: ['path'] },
  'mcp__github__add_*comment*': { category: 'hard_blacklist' },
  'mcp__github__add_reply*': { category: 'hard_blacklist' },
  'mcp__github__star_repository': { category: 'hard_blacklist' },
  'mcp__github__unstar_repository': { category: 'hard_blacklist' },
  'mcp__github__manage_*subscription': { category: 'hard_blacklist' },
  'mcp__github__mark_*read': { category: 'hard_blacklist' },
  'mcp__github__dismiss_*': { category: 'hard_blacklist' },
  'mcp__github__discussion_comment_write': { category: 'hard_blacklist' },
  'mcp__github__sub_issue_write': { category: 'hard_blacklist' },
  'mcp__github__actions_run_trigger': { category: 'hard_blacklist' },
  'mcp__playwright__browser_snapshot': { category: 'read_only' },
  'mcp__playwright__browser_take_screenshot': { category: 'read_only' },
  'mcp__playwright__browser_network_requests': { category: 'read_only' },
  'mcp__playwright__browser_console_messages': { category: 'read_only' },
  'mcp__playwright__browser_navigate': {
    category: 'conditional',
    args_key_to_scan: 'url',
    // Host token MUST be followed by an optional numeric port and then a
    // path/query/fragment delimiter or end — otherwise a subdomain-suffix spoof
    // (liderra.ru.evil.com / localhost.evil.com) or a userinfo spoof
    // (localhost:x@evil.com, whose real host is evil.com) slips past. A bare `:`
    // is therefore NOT accepted as a boundary; it must introduce digits only.
    // (The v4.0 design §5.3 regex omitted this boundary; corrected here,
    // spec to be synced in Stream H.)
    url_whitelist_patterns: ['^https?://(?:localhost|127\\.0\\.0\\.1|liderra\\.ru)(?::\\d*)?(?:[/?#]|$)'],
    url_blocked_patterns: ['^https?://(?!(?:localhost|127\\.0\\.0\\.1|liderra\\.ru)(?::\\d*)?(?:[/?#]|$))'],
  },
  'mcp__playwright__browser_click': { category: 'hard_blacklist' },
  'mcp__playwright__browser_fill_form': { category: 'hard_blacklist' },
  'mcp__playwright__browser_type': { category: 'hard_blacklist' },
  'mcp__playwright__browser_press_key': { category: 'hard_blacklist' },
  'mcp__playwright__browser_drag': { category: 'hard_blacklist' },
  'mcp__playwright__browser_drop': { category: 'hard_blacklist' },
  'mcp__playwright__browser_evaluate': { category: 'hard_blacklist' },
  'mcp__playwright__browser_file_upload': { category: 'hard_blacklist' },
  'mcp__playwright__browser_handle_dialog': { category: 'hard_blacklist' },
  'mcp__playwright__browser_hover': { category: 'hard_blacklist' },
  'mcp__playwright__browser_resize': { category: 'hard_blacklist' },
  'mcp__playwright__browser_run_code_unsafe': { category: 'hard_blacklist' },
  'mcp__playwright__browser_select_option': { category: 'hard_blacklist' },
  'mcp__plugin_brand-voice_*__authenticate': { category: 'hard_blacklist' },
  'mcp__plugin_brand-voice_*__complete_authentication': { category: 'hard_blacklist' },
  'mcp__plugin_*_*__authenticate': { category: 'hard_blacklist' },
  'mcp__plugin_*_*__complete_authentication': { category: 'hard_blacklist' },
  'mcp__openapi__deals-store': { category: 'hard_blacklist' },
  'mcp__openapi__deals-update': { category: 'hard_blacklist' },
  'mcp__openapi__deals-bulk-*': { category: 'hard_blacklist' },
  'mcp__openapi__deals-export': { category: 'hard_blacklist' },
  'mcp__plugin_context7_context7__*': { category: 'read_only' },
  'mcp__universal-icons__*': { category: 'read_only' },
  // Off-phase research-tooling (Perplexity Pack #87/#88/#89): read_only posture per
  // ADR-019 (owner decision 2026-06-14). Web research reads external sources and does
  // not mutate project state; egress arg scan (enforce-mcp-classification) still runs.
  'mcp__perplexity__*': { category: 'read_only' },
  'mcp__exa__*': { category: 'read_only' },
  'mcp__firecrawl__*': { category: 'read_only' },
  // v4.1 G1 — WebSearch / WebFetch.
  'WebSearch': {
    category: 'conditional',
    args_key_to_scan: 'query',
    llm_judge_required: true,
    rationale: 'search query observable in engine logs; potential exfil channel',
  },
  'WebFetch': {
    category: 'conditional',
    args_key_to_scan: 'url',
    url_whitelist_patterns: [
      '^https?://docs\\.anthropic\\.com/',
      '^https?://github\\.com/(?:liderra|anthropics|deck|deck-platform)/',
      '^https?://liderra\\.ru/',
      '^https?://(?:www\\.)?npmjs\\.com/package/',
      '^https?://stackoverflow\\.com/questions/',
    ],
    url_blocked_patterns: [
      '^data:',
      '^javascript:',
      // Catch-all for hosts outside the whitelist. The blocklist is evaluated
      // before the whitelist, so every host form the whitelist accepts (including
      // the optional `www.` on npmjs.com) must be exempted here as well.
      '^https?://(?!docs\\.anthropic\\.com|github\\.com|liderra\\.ru|(?:www\\.)?npmjs\\.com|stackoverflow\\.com)',
    ],
    fetched_content_scan: true,
  },
  'default': 'block',
});

/**
 * Convert a glob key (`*` wildcards) to an anchored, case-sensitive regex.
 *
 * Every regex metacharacter other than `*` is escaped so it matches literally —
 * including `?`, which would otherwise act as a quantifier on the preceding
 * character. A run of consecutive `*` collapses to a single `.*`, which matches
 * the same strings without stacking redundant quantifiers.
 *
 * @param {string} key glob key, e.g. `mcp__github__list_*`
 * @returns {RegExp} regex matching the whole tool name (`^...$`)
 */
function globKeyToRegex(key) {
  const escaped = key
    .replace(/[.+?^${}()|[\]\\]/g, '\\$&')
    .replace(/\*+/g, '.*');
  return new RegExp(`^${escaped}$`);
}

/**
 * Look up the classification entry that governs a tool name.
 *
 * Resolution order:
 *   1. An own key equal to the tool name, when its value is an object.
 *   2. Otherwise, among glob keys (keys containing `*`) that match the name, the
 *      one with the most literal (non-`*`) characters. On equal counts the key
 *      that appears first in the map wins.
 * The reserved key "default" is never treated as a tool, in either phase.
 *
 * Note that specificity is judged by literal length alone, not by category: a
 * longer `read_only` glob outranks a shorter `hard_blacklist` glob.
 *
 * @param {string} toolName
 * @param {object} [classification] map of tool name / glob -> entry
 * @returns {object|null} the entry, or null if nothing matches.
 */
export function matchClassificationKey(toolName, classification = DEFAULT_MCP_CLASSIFICATION) {
  const isEntry = (value) => Boolean(value) && typeof value === 'object';

  if (typeof toolName !== 'string' || !classification || toolName === 'default') {
    return null;
  }

  // Phase 1 — exact key.
  if (Object.prototype.hasOwnProperty.call(classification, toolName)) {
    const exact = classification[toolName];
    if (isEntry(exact)) {
      return exact;
    }
  }

  // Phase 2 — best-scoring glob key; a strict `>` keeps the earliest key on ties.
  const winner = Object.keys(classification)
    .filter((key) => key !== 'default' && key !== toolName)
    .filter((key) => key.includes('*') && globKeyToRegex(key).test(toolName))
    .reduce((top, key) => {
      const literalLength = key.replace(/\*/g, '').length;
      if (top === null || literalLength > top.literalLength) {
        return { key, literalLength };
      }
      return top;
    }, null);

  if (winner === null) {
    return null;
  }
  const matched = classification[winner.key];
  return isEntry(matched) ? matched : null;
}

/**
 * Fallback path normalizer used when the caller injects none: turns every
 * backslash into a forward slash and lower-cases the result.
 *
 * @param {*} target candidate path
 * @returns {string} normalized path, or '' when `target` is not a string
 */
function defaultNormalize(target) {
  const isText = typeof target === 'string';
  return isText ? target.split('\\').join('/').toLowerCase() : '';
}

/**
 * Remove SQL comments so the statement scan sees only executable text.
 *
 * Block comments and `-- ...` line comments are each replaced by a single
 * space. MySQL / MariaDB executable comments (`/*! ... *\/`, `/*M! ... *\/`) are
 * the exception: the server runs their body, so the body is kept (only the
 * comment wrapper and the `!` / `M!` marker are dropped). Stripping them would
 * hide a mutating verb from the scan.
 *
 * NOTE(review): the scan is not string-literal aware — comment markers that sit
 * inside a quoted SQL literal are treated as comments too.
 *
 * @param {*} sql statement text (coerced with String())
 * @returns {string} statement with comments blanked out
 */
function stripSqlComments(sql) {
  return String(sql)
    // Lazy body match: each comment ends at the first `*/` that follows it.
    .replace(/\/\*([\s\S]*?)\*\//g, (comment, body) => {
      const marker = /^M?!/.exec(body);
      return marker ? ` ${body.slice(marker[0].length)} ` : ' ';
    })
    .replace(/--[^\n]*/g, ' ');
}

/**
 * Tell whether `text` matches at least one pattern source. Each source is
 * compiled case-insensitively; evaluation stops at the first hit. A missing
 * pattern list counts as "no patterns" and yields false.
 *
 * @param {string[]|null|undefined} patterns regex sources
 * @param {string} text text to probe
 * @returns {boolean}
 */
function testAny(patterns, text) {
  const sources = patterns || [];
  const noneMatch = sources.every((source) => {
    const matcher = new RegExp(source, 'i');
    return !matcher.test(text);
  });
  return !noneMatch;
}

/**
 * Report whether a URL string carries userinfo (`user[:password]@host`).
 *
 * In `https://localhost:x@evil.com/` the real host is `evil.com`, while a
 * pattern anchored on the text after `://` sees `localhost`. Parsing with the
 * platform URL parser exposes the userinfo regardless of how it is disguised.
 *
 * @param {string} value candidate URL
 * @returns {boolean} true when the parsed URL has a username or password;
 *   false when it has neither or cannot be parsed as an absolute URL
 */
function hasUrlCredentials(value) {
  try {
    const parsed = new URL(value);
    return parsed.username !== '' || parsed.password !== '';
  } catch (err) {
    // Not an absolute URL — nothing to inspect here; the whitelist/blocklist
    // patterns still decide the outcome.
    return false;
  }
}

/**
 * Classify an MCP / built-in tool call into an actionable decision.
 *
 * The entry for `toolName` is resolved via matchClassificationKey; no entry, or
 * an entry with an unrecognized category, blocks. `read_only` allows and
 * `hard_blacklist` blocks outright. `conditional` entries are judged by the
 * first overlay that applies, in this order: protected `path_args`, SQL
 * full-statement scan, legacy SQL prefix scan, URL blocklist/whitelist (URLs
 * carrying userinfo are always blocked), LLM-judge flag; if none applies the
 * result is 'ask'.
 *
 * @param {string} toolName
 * @param {object} toolInput
 * @param {{classification?: object, normalize?: Function, isProtectedPath?: Function}} [deps]
 * @returns {{decision: 'allow'|'block'|'ask', category?: string, reason?: string,
 *            needsLlmJudge?: boolean, needsContentScan?: boolean, scanArg?: string}}
 */
export function classifyMcpTool(toolName, toolInput = {}, deps = {}) {
  const classification = deps.classification || DEFAULT_MCP_CLASSIFICATION;
  const normalize = typeof deps.normalize === 'function' ? deps.normalize : defaultNormalize;
  const isProtectedPath = typeof deps.isProtectedPath === 'function' ? deps.isProtectedPath : () => false;

  const entry = matchClassificationKey(toolName, classification);
  if (!entry) {
    return { decision: 'block', category: 'default', reason: `MCP tool ${toolName} not in gate-config classification. Add to mcp_tool_classification.` };
  }

  const category = entry.category;

  if (category === 'read_only') return { decision: 'allow', category };

  if (category === 'hard_blacklist') {
    return { decision: 'block', category, reason: `MCP tool ${toolName} classified hard-blacklist.` };
  }

  if (category === 'conditional') {
    // 1. path_args — normalize + protected check.
    if (Array.isArray(entry.path_args)) {
      for (const key of entry.path_args) {
        const raw = toolInput && toolInput[key];
        if (typeof raw === 'string' && isProtectedPath(normalize(raw))) {
          return { decision: 'block', category, reason: `MCP tool ${toolName} targets protected path "${raw}".` };
        }
      }
    }
    const scanKey = entry.args_key_to_scan;
    const argVal = scanKey && toolInput ? toolInput[scanKey] : undefined;
    // 2. SQL full-statement scan (G12). Blocklist is consulted before the
    //    read-only prefix so a mutating verb anywhere wins.
    if (entry.query_full_statement_scan && typeof argVal === 'string') {
      const cfg = entry.query_full_statement_scan;
      const sql = cfg.comment_strip ? stripSqlComments(argVal) : argVal;
      if (testAny(cfg.blocked_anywhere_patterns, sql)) {
        return { decision: 'block', category, reason: `database-query contains a mutating verb (full-statement scan).` };
      }
      if (testAny(cfg.read_only_only_patterns, sql)) {
        return { decision: 'allow', category };
      }
      return { decision: 'ask', category, reason: `database-query did not match read-only nor blocked patterns — needs approval.`, scanArg: argVal };
    }
    // 2b. SQL prefix scan (legacy v4.0 style).
    if (entry.query_prefix_scan && typeof argVal === 'string') {
      const cfg = entry.query_prefix_scan;
      if (testAny(cfg.blocked_patterns, argVal)) return { decision: 'block', category };
      if (testAny(cfg.read_only_patterns, argVal)) return { decision: 'allow', category };
      return { decision: 'ask', category, scanArg: argVal };
    }
    // 3. URL whitelist / blocklist (WebFetch / browser_navigate).
    if (typeof argVal === 'string' && (entry.url_whitelist_patterns || entry.url_blocked_patterns)) {
      // Userinfo makes the text after `://` differ from the real host, so a
      // host-prefix pattern cannot be trusted on such URLs — refuse them outright.
      // The URL is deliberately not echoed: it may contain a password.
      if (hasUrlCredentials(argVal)) {
        return { decision: 'block', category, reason: `MCP tool ${toolName} URL carries embedded credentials (userinfo) — possible host spoof.` };
      }
      if (testAny(entry.url_blocked_patterns, argVal)) {
        return { decision: 'block', category, reason: `MCP tool ${toolName} URL "${argVal}" is blocked.` };
      }
      if (testAny(entry.url_whitelist_patterns, argVal)) {
        return { decision: 'allow', category, needsContentScan: !!entry.fetched_content_scan };
      }
      // Neither listed nor explicitly blocked -> fail-CLOSE.
      return { decision: 'block', category, reason: `MCP tool ${toolName} URL "${argVal}" not in whitelist.` };
    }
    // 4. LLM-judge required (WebSearch) — flag for the consumer (Stream D).
    if (entry.llm_judge_required) {
      return { decision: 'ask', category, needsLlmJudge: true, scanArg: typeof argVal === 'string' ? argVal : undefined };
    }
    // Conditional with no resolvable signal -> ask.
    return { decision: 'ask', category, reason: `MCP tool ${toolName} conditional — needs approval.` };
  }

  // Unknown category string -> fail-CLOSE.
  return { decision: 'block', category: category || 'unknown', reason: `MCP tool ${toolName} unknown category.` };
}