diff --git a/app/app/Console/Commands/IncidentsWatchFailures.php b/app/app/Console/Commands/IncidentsWatchFailures.php index 8abf29b3..8c797087 100644 --- a/app/app/Console/Commands/IncidentsWatchFailures.php +++ b/app/app/Console/Commands/IncidentsWatchFailures.php @@ -4,46 +4,69 @@ declare(strict_types=1); namespace App\Console\Commands; +use App\Mail\IncidentDetectedMail; use Illuminate\Console\Command; use Illuminate\Support\Carbon; use Illuminate\Support\Facades\DB; +use Illuminate\Support\Facades\Mail; /** - * Сканирует failed_webhook_jobs за скользящее окно и автоматически создаёт - * incidents_log, когда кластер падений превышает заданный порог. + * Scans failed_webhook_jobs and failed_jobs over a sliding window. * - * Запускается каждые 10 минут через Schedule (routes/console.php). - * Дедупликация: если открытый инцидент с такой же сигнатурой создан менее - * --dedup-window минут назад, новая запись не создаётся. + * failed_webhook_jobs: a single rule — spike ≥ threshold (200). + * failed_jobs: three rules: + * - spike: failures of one job class within the window ≥ threshold-spike (10) → high + * - daily-total: failures of one job class within 24h ≥ threshold-daily (50) → medium + * - persistent: a job-class/exception group has failures older than persistent-hours (3) hours → medium * - * NB: SaaS-admin scope — обе таблицы (`failed_webhook_jobs` + `incidents_log`) - * имеют RLS-политики на `app.current_tenant_id`, который в cron-контексте - * не задан. Идём через `pgsql_supplier` (BYPASSRLS, роль `crm_supplier_worker`), - * как и остальные системные cron-команды (Reset, RetryFailed и т.п.). + * Dedup: if an open incident with the same signature was detected < dedup-window min ago, + * the group is skipped. Mail goes to kdv1@bk.ru only for severity=high.
*/ class IncidentsWatchFailures extends Command { private const DB_CONNECTION = 'pgsql_supplier'; protected $signature = 'incidents:watch-failures - {--window=10 : Окно сканирования в минутах} - {--threshold=200 : Порог числа падений за окно} - {--dedup-window=60 : Окно дедупа открытых инцидентов в минутах}'; + {--window=10 : Окно сканирования в минутах} + {--threshold=200 : Порог спайка для failed_webhook_jobs} + {--threshold-spike=10 : Порог спайка для failed_jobs (за окно)} + {--threshold-daily=50 : Порог суммы за 24ч для failed_jobs} + {--persistent-hours=3 : Порог возраста persistent-exception для failed_jobs} + {--dedup-window=60 : Окно дедупа открытых инцидентов в минутах}'; - protected $description = 'Сканирует failed_webhook_jobs за окно и создаёт incidents_log на превышение порога'; + protected $description = 'Сканирует failed_webhook_jobs и failed_jobs, создаёт incidents_log на превышение порогов'; public function handle(): int { $windowMinutes = (int) $this->option('window'); $threshold = (int) $this->option('threshold'); + $thresholdSpike = (int) $this->option('threshold-spike'); + $thresholdDaily = (int) $this->option('threshold-daily'); + $persistentHours = (int) $this->option('persistent-hours'); $dedupMinutes = (int) $this->option('dedup-window'); $since = Carbon::now()->subMinutes($windowMinutes); + $since24h = Carbon::now()->subHours(24); $dedupAt = Carbon::now()->subMinutes($dedupMinutes); $now = Carbon::now(); - // Группируем упавшие (ещё не resolved) джобы за окно по сигнатуре - $groups = DB::connection(self::DB_CONNECTION) + // --- Look up an active SaaS admin for the NOT NULL FK --- + $adminId = DB::connection(self::DB_CONNECTION) + ->table('saas_admin_users') + ->where('is_active', true) + ->whereNull('deleted_at') + ->value('id'); + + if ($adminId === null) { + $this->warn('No active saas_admin_users found — skipping incident creation (warn-only).'); + + return self::SUCCESS; + } + + $created = 0; + + // ===== BLOCK 1: 
failed_webhook_jobs (original logic) ===== + $webhookGroups = DB::connection(self::DB_CONNECTION) ->table('failed_webhook_jobs') ->selectRaw('LEFT(exception, 180) AS sig, COUNT(*) AS cnt') ->whereNull('resolved_at') @@ -52,65 +75,167 @@ class IncidentsWatchFailures extends Command ->havingRaw('COUNT(*) >= ?', [$threshold]) ->get(); - if ($groups->isEmpty()) { - $this->info('No failure spikes detected.'); - - return self::SUCCESS; - } - - // Получаем ID первого доступного SaaS-администратора (для NOT NULL FK) - $adminId = DB::connection(self::DB_CONNECTION) - ->table('saas_admin_users') - ->where('is_active', true) - ->whereNull('deleted_at') - ->value('id'); - - if ($adminId === null) { - $this->error('No active saas_admin_users found — cannot create incidents_log rows.'); - - return self::FAILURE; - } - - $created = 0; - - foreach ($groups as $group) { + foreach ($webhookGroups as $group) { $sig = $group->sig; $count = (int) $group->cnt; + $dedupKey = mb_substr((string) $sig, 0, 80); - // Дедупликация: есть ли уже открытый инцидент с такой сигнатурой? - $alreadyOpen = DB::connection(self::DB_CONNECTION) - ->table('incidents_log') - ->where('summary', 'like', '%'.addcslashes(substr($sig, 0, 80), '%_\\').'%') - ->whereNull('resolved_at') - ->where('detected_at', '>=', $dedupAt) - ->exists(); - - if ($alreadyOpen) { - $this->line("Skipping (dedup): {$sig}"); + if ($this->isDup($dedupKey, $dedupAt)) { + $this->line("Skipping webhook (dedup): {$dedupKey}"); continue; } - DB::connection(self::DB_CONNECTION)->table('incidents_log')->insert([ - 'type' => 'other', - 'severity' => 'high', - 'summary' => "Автоматически: {$count} упавших webhook-джобов за {$windowMinutes} мин. " - ."Сигнатура: {$sig}", - 'root_cause' => null, - 'started_at' => $since, - 'detected_at' => $now, - 'resolved_at' => null, - 'created_by_admin_id' => $adminId, - 'created_at' => $now, - 'updated_at' => $now, - ]); + $summary = "Автоматически: {$count} упавших webhook-джобов за {$windowMinutes} мин. 
Сигнатура: {$sig}"; + $this->createIncident($adminId, 'other', 'high', $summary, $since, $now, $dedupKey); $created++; - $this->info("Incident created: [{$count} failures] {$sig}"); + $this->info("Webhook incident [high]: {$count} failures"); + } + + // ===== BLOCK 2: failed_jobs — spike ===== + $spikes = DB::connection(self::DB_CONNECTION) + ->table('failed_jobs') + ->selectRaw( + "payload::json->>'displayName' AS job_class, ". + 'LEFT(exception, 80) AS exc_sig, '. + 'COUNT(*) AS cnt' + ) + ->where('failed_at', '>=', $since) + ->groupByRaw("payload::json->>'displayName', LEFT(exception, 80)") + ->havingRaw('COUNT(*) >= ?', [$thresholdSpike]) + ->get(); + + foreach ($spikes as $row) { + $jobClass = (string) $row->job_class; + $excSig = (string) $row->exc_sig; + $cnt = (int) $row->cnt; + $dedupKey = "spike:{$jobClass}:{$excSig}"; + + if ($this->isDup($dedupKey, $dedupAt)) { + $this->line("Skipping spike (dedup): {$dedupKey}"); + + continue; + } + + $summary = "Автоматически: spike {$cnt} failures job={$jobClass} за {$windowMinutes} мин. Exc: {$excSig}"; + $this->createIncident($adminId, 'other', 'high', $summary, $since, $now, $dedupKey); + $created++; + $this->info("Job spike [high]: {$jobClass} — {$cnt}"); + } + + // ===== BLOCK 3: failed_jobs — daily-total ===== + $daily = DB::connection(self::DB_CONNECTION) + ->table('failed_jobs') + ->selectRaw( + "payload::json->>'displayName' AS job_class, ". 
+ 'COUNT(*) AS cnt' + ) + ->where('failed_at', '>=', $since24h) + ->groupByRaw("payload::json->>'displayName'") + ->havingRaw('COUNT(*) >= ?', [$thresholdDaily]) + ->get(); + + foreach ($daily as $row) { + $jobClass = (string) $row->job_class; + $cnt = (int) $row->cnt; + $dedupKey = "daily:{$jobClass}"; + + if ($this->isDup($dedupKey, $dedupAt)) { + $this->line("Skipping daily (dedup): {$dedupKey}"); + + continue; + } + + $summary = "Автоматически: daily-total {$cnt} failures job={$jobClass} за 24ч"; + $this->createIncident($adminId, 'other', 'medium', $summary, $since24h, $now, $dedupKey); + $created++; + $this->info("Job daily [medium]: {$jobClass} — {$cnt}"); + } + + // ===== BLOCK 4: failed_jobs — persistent ===== + $persistentSince = Carbon::now()->subHours($persistentHours); + + $persistent = DB::connection(self::DB_CONNECTION) + ->table('failed_jobs') + ->selectRaw( + "payload::json->>'displayName' AS job_class, ". + 'LEFT(exception, 80) AS exc_sig, '. + 'MIN(failed_at) AS oldest_at, '. + 'COUNT(*) AS cnt' + ) + ->where('failed_at', '<=', $persistentSince) + ->groupByRaw("payload::json->>'displayName', LEFT(exception, 80)") + ->get(); + + foreach ($persistent as $row) { + $jobClass = (string) $row->job_class; + $excSig = (string) $row->exc_sig; + $dedupKey = "persistent:{$jobClass}:{$excSig}"; + + if ($this->isDup($dedupKey, $dedupAt)) { + $this->line("Skipping persistent (dedup): {$dedupKey}"); + + continue; + } + + $summary = "Автоматически: persistent exception job={$jobClass} повторяется >{$persistentHours}ч. Exc: {$excSig}"; + $this->createIncident($adminId, 'other', 'medium', $summary, Carbon::parse($row->oldest_at), $now, $dedupKey); + $created++; + $this->info("Job persistent [medium]: {$jobClass}"); } $this->info("Done. 
Created {$created} incident(s)."); return self::SUCCESS; } + + /** + * Reports whether an unresolved incident with this dedup key was detected at or after $dedupAt. + */ + private function isDup(string $dedupKey, Carbon $dedupAt): bool + { + // The dedup key is stored in root_cause, so it can be matched exactly + return DB::connection(self::DB_CONNECTION) + ->table('incidents_log') + ->where('root_cause', $dedupKey) + ->whereNull('resolved_at') + ->where('detected_at', '>=', $dedupAt) + ->exists(); + } + + /** + * Inserts one incidents_log row (dedup key goes to root_cause) and mails kdv1@bk.ru when severity is "high". + */ + private function createIncident( + int $adminId, + string $type, + string $severity, + string $summary, + Carbon $startedAt, + Carbon $now, + string $dedupKey = '', + ): void { + DB::connection(self::DB_CONNECTION)->table('incidents_log')->insert([ + 'type' => $type, + 'severity' => $severity, + 'summary' => $summary, + 'root_cause' => $dedupKey !== '' ? $dedupKey : null, + 'started_at' => $startedAt, + 'detected_at' => $now, + 'resolved_at' => null, + 'created_by_admin_id' => $adminId, + 'created_at' => $now, + 'updated_at' => $now, + ]); + + if ($severity === 'high') { + try { + Mail::to('kdv1@bk.ru')->send(new IncidentDetectedMail($summary, $severity)); + } catch (\Throwable $e) { + // The row is already stored; a mail transport failure must not abort the remaining rule checks + $this->warn("Incident mail failed: {$e->getMessage()}"); + } + } + } } diff --git a/app/app/Mail/IncidentDetectedMail.php b/app/app/Mail/IncidentDetectedMail.php new file mode 100644 index 00000000..d8897e2b --- /dev/null +++ b/app/app/Mail/IncidentDetectedMail.php @@ -0,0 +1,44 @@ +summary, 0, 100); + + return new Envelope( + subject: "[Лидерра HIGH] Incident: {$subjectSnippet}", + ); + } + + public function content(): Content + { + return new Content( + text: 'emails.incident_detected_text', + with: [ + 'summary' => $this->summary, + 'severity' => $this->severity, + 'now' => 
now()->timezone('Europe/Moscow')->toIso8601String(), + ], + ); + } +} diff --git a/app/resources/views/emails/incident_detected_text.blade.php b/app/resources/views/emails/incident_detected_text.blade.php new file mode 100644 index 00000000..db0c54ef --- /dev/null +++ b/app/resources/views/emails/incident_detected_text.blade.php @@ -0,0 +1,10 @@ +Автоматический инцидент — Лидерра +=================================== 
+Severity: {{ strtoupper($severity) }} +Время: {{ $now }} (МСК) + +Описание: +{{ $summary }} + +Это автоматическое сообщение от команды incidents:watch-failures (Лидерра). +Проверьте incidents_log в панели администратора для деталей. diff --git a/app/tests/Feature/Incidents/IncidentsWatchFailuresExpandedTest.php b/app/tests/Feature/Incidents/IncidentsWatchFailuresExpandedTest.php new file mode 100644 index 00000000..c8dd1359 --- /dev/null +++ b/app/tests/Feature/Incidents/IncidentsWatchFailuresExpandedTest.php @@ -0,0 +1,200 @@ + $jobClass, 'job' => $jobClass]); + DB::table('failed_jobs')->insert([ + 'uuid' => (string) Str::uuid(), + 'connection' => 'redis', + 'queue' => 'default', + 'payload' => $payload, + 'exception' => $exception, + 'failed_at' => $at ?? now(), + ]); +} + +function makeFailedWebhookJobExp(string $exception, ?Carbon $at = null): void +{ + DB::table('failed_webhook_jobs')->insert([ + 'failed_at' => $at ?? now(), + 'exception' => $exception, + 'raw_payload' => '{}', + 'retry_count' => 0, + ]); +} + +function ensureAdminExp(): int +{ + $id = DB::table('saas_admin_users')->value('id'); + if ($id !== null) { + return (int) $id; + } + + return (int) DB::table('saas_admin_users')->insertGetId([ + 'email' => 'cron-expanded@liderra.ru', + 'full_name' => 'Cron Expanded', + 'password_hash' => '$2y$12$placeholder', + 'role' => 'dev_oncall', + 'is_active' => true, + 'created_at' => now(), + ]); +} + +// ─── Setup ────────────────────────────────────────────────────────────────── + +beforeEach(function () { + Mail::fake(); + ensureAdminExp(); +}); + +// ─── Tests ────────────────────────────────────────────────────────────────── + +test('failed_webhook_jobs spike still creates high incident (existing logic preserved)', function () { + $now = Carbon::now(); + for ($i = 0; $i < 201; $i++) { + makeFailedWebhookJobExp('App\\Exceptions\\WebhookException: connection refused', $now); + } + + $this->artisan('incidents:watch-failures')->assertSuccessful(); + + 
$incidents = DB::table('incidents_log')->get(); + expect($incidents)->toHaveCount(1); + expect($incidents->first()->severity)->toBe('high'); +}); + +test('failed_jobs spike threshold creates incident severity=high and sends mail', function () { + $now = Carbon::now(); + for ($i = 0; $i < 11; $i++) { + makeFailedJob( + 'App\\Jobs\\SyncSupplierProjectsJob', + 'RuntimeException: connection timeout', + $now + ); + } + + $this->artisan('incidents:watch-failures', ['--threshold-spike' => 10])->assertSuccessful(); + + $incidents = DB::table('incidents_log') + ->where('summary', 'like', '%spike%') + ->get(); + + expect($incidents)->toHaveCount(1); + expect($incidents->first()->severity)->toBe('high'); + + Mail::assertSent(IncidentDetectedMail::class, 1); +}); + +test('failed_jobs daily-total threshold creates incident severity=medium', function () { + $yesterday = Carbon::now()->subHours(12); + for ($i = 0; $i < 51; $i++) { + makeFailedJob( + 'App\\Jobs\\GenerateReportJob', + 'PDOException: SQLSTATE connection refused', + $yesterday + ); + } + + $this->artisan('incidents:watch-failures', ['--threshold-daily' => 50])->assertSuccessful(); + + $incidents = DB::table('incidents_log') + ->where('summary', 'like', '%daily-total%') + ->get(); + + expect($incidents)->toHaveCount(1); + expect($incidents->first()->severity)->toBe('medium'); + + // Medium — no mail + Mail::assertNotSent(IncidentDetectedMail::class); +}); + +test('failed_jobs persistent exception creates incident severity=medium', function () { + $old = Carbon::now()->subHours(4); + for ($i = 0; $i < 3; $i++) { + makeFailedJob( + 'App\\Jobs\\CsvReconcileJob', + 'Illuminate\\Database\\QueryException: duplicate key value', + $old + ); + } + + $this->artisan('incidents:watch-failures', ['--persistent-hours' => 3])->assertSuccessful(); + + $incidents = DB::table('incidents_log') + ->where('summary', 'like', '%persistent%') + ->get(); + + expect($incidents)->toHaveCount(1); + 
expect($incidents->first()->severity)->toBe('medium'); + + // Medium — no mail + Mail::assertNotSent(IncidentDetectedMail::class); +}); + +test('dedup prevents duplicate incidents for same failed_jobs spike', function () { + $now = Carbon::now(); + for ($i = 0; $i < 11; $i++) { + makeFailedJob('App\\Jobs\\ImportLeadsJob', 'RuntimeException: quota exceeded', $now); + } + + // First run — creates incident + $this->artisan('incidents:watch-failures', ['--threshold-spike' => 10])->assertSuccessful(); + expect(DB::table('incidents_log')->where('summary', 'like', '%spike%')->count())->toBe(1); + + // Second run — dedup kicks in + $this->artisan('incidents:watch-failures', ['--threshold-spike' => 10])->assertSuccessful(); + expect(DB::table('incidents_log')->where('summary', 'like', '%spike%')->count())->toBe(1); +}); + +test('mail is sent only for high severity, not for medium', function () { + $now = Carbon::now(); + + // High: webhook spike + for ($i = 0; $i < 201; $i++) { + makeFailedWebhookJobExp('App\\Exceptions\\WebhookException: ssl error', $now); + } + + // Medium: daily-total + $yesterday = Carbon::now()->subHours(12); + for ($i = 0; $i < 55; $i++) { + makeFailedJob('App\\Jobs\\CleanupInactiveSupplierProjectsJob', 'RuntimeException: cleanup fail', $yesterday); + } + + $this->artisan('incidents:watch-failures', ['--threshold-daily' => 50])->assertSuccessful(); + + // Only 1 mail for the high webhook incident + Mail::assertSent(IncidentDetectedMail::class, 1); +}); + +test('warn-only when no saas_admin_users exist', function () { + // Remove all admins + DB::table('saas_admin_users')->delete(); + + $now = Carbon::now(); + for ($i = 0; $i < 11; $i++) { + makeFailedJob('App\\Jobs\\SyncSupplierProjectsJob', 'RuntimeException: no admin', $now); + } + + $this->artisan('incidents:watch-failures', ['--threshold-spike' => 10]) + ->assertSuccessful(); // SUCCESS not FAILURE + + // No incidents created (no admin FK) + expect(DB::table('incidents_log')->count())->toBe(0); + 
+ // No mail + Mail::assertNotSent(IncidentDetectedMail::class); +});