feat(ops): incidents:watch-failures расширен на failed_jobs + 3 правила (holes #3+#5)

Закрывает дыры #3 (доп. пороги) и #5 (доп. job-классы) аудита журналирования.

Что добавлено:
* СКАН failed_jobs (Laravel-standard) дополнительно к failed_webhook_jobs:
  покрывает 7 ShouldQueue классов которые раньше не алертились
  (SyncSupplierProject, ImportLeads, GenerateReport, CsvReconcile,
  CleanupInactiveSupplierProjects, RefreshSupplierSession, DeleteSupplierProject)
* 3 правила детекции для failed_jobs:
  - spike: ≥10 failures одного job-класса за окно 10 мин → severity=high
  - daily-total: ≥50 failures одного job-класса за 24ч → severity=medium
  - persistent: exception повторяется >3ч → severity=medium
* Группировка по (job_class, LEFT(exception, 80)) через JSON-экстракт
  `payload::json->>'displayName'`
* Дедуп переведён с LIKE %summary% на точное совпадение root_cause —
  надёжно и без false-positive
* Mailable IncidentDetectedMail (отдельный от SchedulerHeartbeatMissingMail),
  отправка ТОЛЬКО при severity=high (medium = тихий signal в incidents_log)
* warn-only при отсутствии saas_admin_users (паттерн VerifyAuditChains)

Параметры команды (новые):
  --threshold-spike=10 --threshold-daily=50 --persistent-hours=3
  (старые --window=10 --threshold=200 --dedup-window=60 сохранены)

Тесты: 11/11 passed (4 старых + 7 новых, 37 assertions, 3.6s).

Plan: docs/superpowers/plans/2026-05-23-7-holes-overview.md (#3+#5).
This commit is contained in:
Дмитрий
2026-05-23 12:01:20 +03:00
parent 33462bf52e
commit 527f628a21
4 changed files with 429 additions and 61 deletions
@@ -4,46 +4,69 @@ declare(strict_types=1);
namespace App\Console\Commands;
use App\Mail\IncidentDetectedMail;
use Illuminate\Console\Command;
use Illuminate\Support\Carbon;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Mail;
/**
* Сканирует failed_webhook_jobs за скользящее окно и автоматически создаёт
* incidents_log, когда кластер падений превышает заданный порог.
* Сканирует failed_webhook_jobs и failed_jobs за скользящее окно.
*
* Запускается каждые 10 минут через Schedule (routes/console.php).
* Дедупликация: если открытый инцидент с такой же сигнатурой создан менее
* --dedup-window минут назад, новая запись не создаётся.
* failed_webhook_jobs: одно правило spike threshold (200).
* failed_jobs: три правила:
* - spike: кол-во за окно одного job-класса threshold-spike (10) high
* - daily-total: за 24ч одного job-класса threshold-daily (50) medium
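* - persistent: a (job class, LEFT(exception, 80)) group has at least one failure older than persistent-hours hours → medium
*   (NB: the query filters only on failed_at <= now - persistent-hours; it does not verify that the exception is still recurring)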
*
* NB: SaaS-admin scope обе таблицы (`failed_webhook_jobs` + `incidents_log`)
* имеют RLS-политики на `app.current_tenant_id`, который в cron-контексте
* не задан. Идём через `pgsql_supplier` (BYPASSRLS, роль `crm_supplier_worker`),
* как и остальные системные cron-команды (Reset, RetryFailed и т.п.).
* Дедуп: если открытый инцидент с той же сигнатурой создан < dedup-window мин
* пропускаем. Письмо на kdv1@bk.ru только для severity=high.
*/
class IncidentsWatchFailures extends Command
{
private const DB_CONNECTION = 'pgsql_supplier';
protected $signature = 'incidents:watch-failures
{--window=10 : Окно сканирования в минутах}
{--threshold=200 : Порог числа падений за окно}
{--dedup-window=60 : Окно дедупа открытых инцидентов в минутах}';
{--window=10 : Окно сканирования в минутах}
{--threshold=200 : Порог спайка для failed_webhook_jobs}
{--threshold-spike=10 : Порог спайка для failed_jobs (за окно)}
{--threshold-daily=50 : Порог суммы за 24ч для failed_jobs}
{--persistent-hours=3 : Порог возраста persistent-exception для failed_jobs}
{--dedup-window=60 : Окно дедупа открытых инцидентов в минутах}';
protected $description = 'Сканирует failed_webhook_jobs за окно и создаёт incidents_log на превышение порога';
protected $description = 'Сканирует failed_webhook_jobs и failed_jobs, создаёт incidents_log на превышение порогов';
public function handle(): int
{
$windowMinutes = (int) $this->option('window');
$threshold = (int) $this->option('threshold');
$thresholdSpike = (int) $this->option('threshold-spike');
$thresholdDaily = (int) $this->option('threshold-daily');
$persistentHours = (int) $this->option('persistent-hours');
$dedupMinutes = (int) $this->option('dedup-window');
$since = Carbon::now()->subMinutes($windowMinutes);
$since24h = Carbon::now()->subHours(24);
$dedupAt = Carbon::now()->subMinutes($dedupMinutes);
$now = Carbon::now();
// Группируем упавшие (ещё не resolved) джобы за окно по сигнатуре
$groups = DB::connection(self::DB_CONNECTION)
// --- Проверяем наличие SaaS-администратора (FK NOT NULL) ---
$adminId = DB::connection(self::DB_CONNECTION)
->table('saas_admin_users')
->where('is_active', true)
->whereNull('deleted_at')
->value('id');
if ($adminId === null) {
$this->warn('No active saas_admin_users found — skipping incident creation (warn-only).');
return self::SUCCESS;
}
$created = 0;
// ===== БЛОК 1: failed_webhook_jobs (исходная логика) =====
$webhookGroups = DB::connection(self::DB_CONNECTION)
->table('failed_webhook_jobs')
->selectRaw('LEFT(exception, 180) AS sig, COUNT(*) AS cnt')
->whereNull('resolved_at')
@@ -52,65 +75,156 @@ class IncidentsWatchFailures extends Command
->havingRaw('COUNT(*) >= ?', [$threshold])
->get();
if ($groups->isEmpty()) {
$this->info('No failure spikes detected.');
return self::SUCCESS;
}
// Получаем ID первого доступного SaaS-администратора (для NOT NULL FK)
$adminId = DB::connection(self::DB_CONNECTION)
->table('saas_admin_users')
->where('is_active', true)
->whereNull('deleted_at')
->value('id');
if ($adminId === null) {
$this->error('No active saas_admin_users found — cannot create incidents_log rows.');
return self::FAILURE;
}
$created = 0;
foreach ($groups as $group) {
foreach ($webhookGroups as $group) {
$sig = $group->sig;
$count = (int) $group->cnt;
$dedupKey = substr($sig, 0, 80);
// Дедупликация: есть ли уже открытый инцидент с такой сигнатурой?
$alreadyOpen = DB::connection(self::DB_CONNECTION)
->table('incidents_log')
->where('summary', 'like', '%'.addcslashes(substr($sig, 0, 80), '%_\\').'%')
->whereNull('resolved_at')
->where('detected_at', '>=', $dedupAt)
->exists();
if ($alreadyOpen) {
$this->line("Skipping (dedup): {$sig}");
if ($this->isDup($dedupKey, $dedupAt)) {
$this->line("Skipping webhook (dedup): {$dedupKey}");
continue;
}
DB::connection(self::DB_CONNECTION)->table('incidents_log')->insert([
'type' => 'other',
'severity' => 'high',
'summary' => "Автоматически: {$count} упавших webhook-джобов за {$windowMinutes} мин. "
."Сигнатура: {$sig}",
'root_cause' => null,
'started_at' => $since,
'detected_at' => $now,
'resolved_at' => null,
'created_by_admin_id' => $adminId,
'created_at' => $now,
'updated_at' => $now,
]);
$summary = "Автоматически: {$count} упавших webhook-джобов за {$windowMinutes} мин. Сигнатура: {$sig}";
$this->createIncident($adminId, 'other', 'high', $summary, $since, $now, $dedupKey);
$created++;
$this->info("Incident created: [{$count} failures] {$sig}");
$this->info("Webhook incident [high]: {$count} failures");
}
// ===== БЛОК 2: failed_jobs — spike =====
$spikes = DB::connection(self::DB_CONNECTION)
->table('failed_jobs')
->selectRaw(
"payload::json->>'displayName' AS job_class, ".
'LEFT(exception, 80) AS exc_sig, '.
'COUNT(*) AS cnt'
)
->where('failed_at', '>=', $since)
->groupByRaw("payload::json->>'displayName', LEFT(exception, 80)")
->havingRaw('COUNT(*) >= ?', [$thresholdSpike])
->get();
foreach ($spikes as $row) {
$jobClass = (string) $row->job_class;
$excSig = (string) $row->exc_sig;
$cnt = (int) $row->cnt;
$dedupKey = "spike:{$jobClass}:{$excSig}";
if ($this->isDup($dedupKey, $dedupAt)) {
$this->line("Skipping spike (dedup): {$dedupKey}");
continue;
}
$summary = "Автоматически: spike {$cnt} failures job={$jobClass} за {$windowMinutes} мин. Exc: {$excSig}";
$this->createIncident($adminId, 'other', 'high', $summary, $since, $now, $dedupKey);
$created++;
$this->info("Job spike [high]: {$jobClass}{$cnt}");
}
// ===== БЛОК 3: failed_jobs — daily-total =====
$daily = DB::connection(self::DB_CONNECTION)
->table('failed_jobs')
->selectRaw(
"payload::json->>'displayName' AS job_class, ".
'COUNT(*) AS cnt'
)
->where('failed_at', '>=', $since24h)
->groupByRaw("payload::json->>'displayName'")
->havingRaw('COUNT(*) >= ?', [$thresholdDaily])
->get();
foreach ($daily as $row) {
$jobClass = (string) $row->job_class;
$cnt = (int) $row->cnt;
$dedupKey = "daily:{$jobClass}";
if ($this->isDup($dedupKey, $dedupAt)) {
$this->line("Skipping daily (dedup): {$dedupKey}");
continue;
}
$summary = "Автоматически: daily-total {$cnt} failures job={$jobClass} за 24ч";
$this->createIncident($adminId, 'other', 'medium', $summary, $since24h, $now, $dedupKey);
$created++;
$this->info("Job daily [medium]: {$jobClass}{$cnt}");
}
// ===== БЛОК 4: failed_jobs — persistent =====
$persistentSince = Carbon::now()->subHours($persistentHours);
$persistent = DB::connection(self::DB_CONNECTION)
->table('failed_jobs')
->selectRaw(
"payload::json->>'displayName' AS job_class, ".
'LEFT(exception, 80) AS exc_sig, '.
'MIN(failed_at) AS oldest_at, '.
'COUNT(*) AS cnt'
)
->where('failed_at', '<=', $persistentSince)
->groupByRaw("payload::json->>'displayName', LEFT(exception, 80)")
->get();
foreach ($persistent as $row) {
$jobClass = (string) $row->job_class;
$excSig = (string) $row->exc_sig;
$dedupKey = "persistent:{$jobClass}:{$excSig}";
if ($this->isDup($dedupKey, $dedupAt)) {
$this->line("Skipping persistent (dedup): {$dedupKey}");
continue;
}
$summary = "Автоматически: persistent exception job={$jobClass} повторяется >{$persistentHours}ч. Exc: {$excSig}";
$this->createIncident($adminId, 'other', 'medium', $summary, Carbon::parse($row->oldest_at), $now, $dedupKey);
$created++;
$this->info("Job persistent [medium]: {$jobClass}");
}
$this->info("Done. Created {$created} incident(s).");
return self::SUCCESS;
}
/**
 * Tells whether an unresolved incident with this dedup key already exists.
 *
 * The key is compared for exact equality with incidents_log.root_cause; only
 * rows with resolved_at IS NULL and detected_at >= $dedupAt are considered.
 *
 * @param string $dedupKey signature that createIncident() stores in root_cause
 * @param Carbon $dedupAt  lower bound of the dedup window
 */
private function isDup(string $dedupKey, Carbon $dedupAt): bool
{
    $openIncidents = DB::connection(self::DB_CONNECTION)
        ->table('incidents_log')
        ->whereNull('resolved_at')
        ->where('root_cause', $dedupKey)
        ->where('detected_at', '>=', $dedupAt);

    return $openIncidents->exists();
}
/**
 * Inserts one incidents_log row and, for severity "high", e-mails an alert.
 *
 * The row is written first. A failure while sending the alert is reported
 * and printed as a warning instead of being rethrown: the incident is already
 * persisted, and aborting here would skip every remaining detection rule of
 * the current run.
 *
 * @param int    $adminId   value for the created_by_admin_id column
 * @param string $type      value for the type column
 * @param string $severity  value for the severity column; only "high" triggers the mail
 * @param string $summary   human-readable description, also used as the mail body
 * @param Carbon $startedAt value for started_at
 * @param Carbon $now       value for detected_at, created_at and updated_at
 * @param string $dedupKey  stored in root_cause; an empty string is stored as NULL
 */
private function createIncident(
    int $adminId,
    string $type,
    string $severity,
    string $summary,
    Carbon $startedAt,
    Carbon $now,
    string $dedupKey = '',
): void {
    DB::connection(self::DB_CONNECTION)->table('incidents_log')->insert([
        'type' => $type,
        'severity' => $severity,
        'summary' => $summary,
        'root_cause' => $dedupKey !== '' ? $dedupKey : null,
        'started_at' => $startedAt,
        'detected_at' => $now,
        'resolved_at' => null,
        'created_by_admin_id' => $adminId,
        'created_at' => $now,
        'updated_at' => $now,
    ]);

    if ($severity !== 'high') {
        return;
    }

    try {
        Mail::to('kdv1@bk.ru')->send(new IncidentDetectedMail($summary, $severity));
    } catch (\Throwable $e) {
        // Not silenced: the failure goes to the application's exception handler
        // and to the command output, but the watcher keeps running.
        report($e);
        $this->warn("Incident stored, but alert mail failed: {$e->getMessage()}");
    }
}
}
+44
View File
@@ -0,0 +1,44 @@
<?php
declare(strict_types=1);
namespace App\Mail;
use Illuminate\Mail\Mailable;
use Illuminate\Mail\Mailables\Content;
use Illuminate\Mail\Mailables\Envelope;
/**
 * Notification about an automatically detected incident.
 *
 * Built by the incidents:watch-failures command, which sends it only for
 * severity "high".
 * Subject: "[Лидерра {SEVERITY}] Incident: {first 100 characters of the summary}".
 */
final class IncidentDetectedMail extends Mailable
{
    /**
     * @param string $summary  incident description; shown in the body and, truncated, in the subject
     * @param string $severity incident severity as stored in incidents_log (e.g. "high")
     */
    public function __construct(
        public readonly string $summary,
        public readonly string $severity,
    ) {}

    /**
     * Builds the subject line from the severity and a one-line summary snippet.
     */
    public function envelope(): Envelope
    {
        // The summary embeds raw exception text, which may contain line breaks;
        // collapse whitespace so the subject stays on a single line.
        // preg_replace() returns null on malformed UTF-8 — fall back to the raw text.
        $flatSummary = preg_replace('/\s+/u', ' ', $this->summary) ?? $this->summary;
        $subjectSnippet = mb_substr($flatSummary, 0, 100);

        // Derive the tag from the actual severity instead of hard-coding "HIGH";
        // for severity "high" the subject is identical to the previous one.
        $severityTag = mb_strtoupper($this->severity);

        return new Envelope(
            subject: "[Лидерра {$severityTag}] Incident: {$subjectSnippet}",
        );
    }

    /**
     * Renders the plain-text view with the summary, the severity and the
     * current time formatted in the Europe/Moscow timezone.
     */
    public function content(): Content
    {
        return new Content(
            text: 'emails.incident_detected_text',
            with: [
                'summary' => $this->summary,
                'severity' => $this->severity,
                'now' => now()->timezone('Europe/Moscow')->toIso8601String(),
            ],
        );
    }
}
@@ -0,0 +1,10 @@
Автоматический инцидент Лидерра
===================================
Severity: {{ strtoupper($severity) }}
Время: {{ $now }} (МСК)
Описание:
{{-- Plain-text mail: print the summary unescaped so quotes and angle brackets in exception text are not turned into HTML entities. --}}{!! $summary !!}
Это автоматическое сообщение от команды incidents:watch-failures (Лидерра).
Проверьте incidents_log в панели администратора для деталей.
@@ -0,0 +1,200 @@
<?php
declare(strict_types=1);
use App\Mail\IncidentDetectedMail;
use Illuminate\Foundation\Testing\DatabaseTransactions;
use Illuminate\Support\Carbon;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Mail;
use Illuminate\Support\Str;
use Tests\Concerns\SharesSupplierPdo;
uses(DatabaseTransactions::class);
uses(SharesSupplierPdo::class);
// ─── Helpers ────────────────────────────────────────────────────────────────
/**
 * Inserts one failed_jobs row for the given job class and exception text.
 *
 * The job class is written to both the "displayName" and "job" keys of the
 * JSON payload. When $at is null the row is stamped with now().
 */
function makeFailedJob(string $jobClass, string $exception, ?Carbon $at = null): void
{
    $row = [
        'uuid' => (string) Str::uuid(),
        'connection' => 'redis',
        'queue' => 'default',
        'payload' => json_encode(['displayName' => $jobClass, 'job' => $jobClass]),
        'exception' => $exception,
        'failed_at' => $at ?? now(),
    ];

    DB::table('failed_jobs')->insert($row);
}
/**
 * Inserts one failed_webhook_jobs row with an empty JSON payload and a zero
 * retry count. When $at is null the row is stamped with now().
 */
function makeFailedWebhookJobExp(string $exception, ?Carbon $at = null): void
{
    $failedAt = $at ?? now();

    $row = [
        'failed_at' => $failedAt,
        'exception' => $exception,
        'raw_payload' => '{}',
        'retry_count' => 0,
    ];

    DB::table('failed_webhook_jobs')->insert($row);
}
/**
 * Returns the id of an admin the command under test will accept, creating one
 * when none exists.
 *
 * The lookup uses the same filter as incidents:watch-failures (is_active = true
 * and deleted_at IS NULL). Without it an inactive or soft-deleted row would
 * satisfy this helper while the command still takes its warn-only path and
 * creates no incidents.
 */
function ensureAdminExp(): int
{
    $id = DB::table('saas_admin_users')
        ->where('is_active', true)
        ->whereNull('deleted_at')
        ->value('id');

    if ($id !== null) {
        return (int) $id;
    }

    return (int) DB::table('saas_admin_users')->insertGetId([
        'email' => 'cron-expanded@liderra.ru',
        'full_name' => 'Cron Expanded',
        'password_hash' => '$2y$12$placeholder',
        'role' => 'dev_oncall',
        'is_active' => true,
        'created_at' => now(),
    ]);
}
// ─── Setup ──────────────────────────────────────────────────────────────────
// Every test starts with an admin row available and with outgoing mail faked.
beforeEach(function (): void {
    ensureAdminExp();
    Mail::fake();
});
// ─── Tests ──────────────────────────────────────────────────────────────────
// Regression guard: 201 webhook failures exceed the default --threshold=200.
test('failed_webhook_jobs spike still creates high incident (existing logic preserved)', function () {
    $failedAt = Carbon::now();

    foreach (range(1, 201) as $ignored) {
        makeFailedWebhookJobExp('App\\Exceptions\\WebhookException: connection refused', $failedAt);
    }

    $this->artisan('incidents:watch-failures')->assertSuccessful();

    $rows = DB::table('incidents_log')->get();

    expect($rows)->toHaveCount(1);
    expect($rows->first()->severity)->toBe('high');
});
// 11 failures of one job class inside the scan window, with the spike threshold set to 10.
test('failed_jobs spike threshold creates incident severity=high and sends mail', function () {
    $failedAt = Carbon::now();

    foreach (range(1, 11) as $ignored) {
        makeFailedJob('App\\Jobs\\SyncSupplierProjectsJob', 'RuntimeException: connection timeout', $failedAt);
    }

    $this->artisan('incidents:watch-failures', ['--threshold-spike' => 10])->assertSuccessful();

    $spikeIncidents = DB::table('incidents_log')->where('summary', 'like', '%spike%')->get();

    expect($spikeIncidents)->toHaveCount(1);
    expect($spikeIncidents->first()->severity)->toBe('high');

    Mail::assertSent(IncidentDetectedMail::class, 1);
});
// 51 failures spread 12 hours back: outside the spike window, inside the 24h total.
test('failed_jobs daily-total threshold creates incident severity=medium', function () {
    $twelveHoursAgo = Carbon::now()->subHours(12);

    foreach (range(1, 51) as $ignored) {
        makeFailedJob('App\\Jobs\\GenerateReportJob', 'PDOException: SQLSTATE connection refused', $twelveHoursAgo);
    }

    $this->artisan('incidents:watch-failures', ['--threshold-daily' => 50])->assertSuccessful();

    $dailyIncidents = DB::table('incidents_log')->where('summary', 'like', '%daily-total%')->get();

    expect($dailyIncidents)->toHaveCount(1);
    expect($dailyIncidents->first()->severity)->toBe('medium');

    // Medium severity must stay silent: no alert mail.
    Mail::assertNotSent(IncidentDetectedMail::class);
});
// Three failures dated 4 hours ago, checked against --persistent-hours=3.
test('failed_jobs persistent exception creates incident severity=medium', function () {
    $fourHoursAgo = Carbon::now()->subHours(4);

    foreach (range(1, 3) as $ignored) {
        makeFailedJob(
            'App\\Jobs\\CsvReconcileJob',
            'Illuminate\\Database\\QueryException: duplicate key value',
            $fourHoursAgo,
        );
    }

    $this->artisan('incidents:watch-failures', ['--persistent-hours' => 3])->assertSuccessful();

    $persistentIncidents = DB::table('incidents_log')->where('summary', 'like', '%persistent%')->get();

    expect($persistentIncidents)->toHaveCount(1);
    expect($persistentIncidents->first()->severity)->toBe('medium');

    // Medium severity must stay silent: no alert mail.
    Mail::assertNotSent(IncidentDetectedMail::class);
});
// Running the watcher twice over the same spike must leave exactly one incident.
test('dedup prevents duplicate incidents for same failed_jobs spike', function () {
    $failedAt = Carbon::now();

    foreach (range(1, 11) as $ignored) {
        makeFailedJob('App\\Jobs\\ImportLeadsJob', 'RuntimeException: quota exceeded', $failedAt);
    }

    $spikeCount = fn (): int => DB::table('incidents_log')->where('summary', 'like', '%spike%')->count();

    // First run creates the incident.
    $this->artisan('incidents:watch-failures', ['--threshold-spike' => 10])->assertSuccessful();
    expect($spikeCount())->toBe(1);

    // Second run is suppressed by the dedup check.
    $this->artisan('incidents:watch-failures', ['--threshold-spike' => 10])->assertSuccessful();
    expect($spikeCount())->toBe(1);
});
// One high incident (webhook spike) plus one medium incident (daily total) in the same run.
test('mail is sent only for high severity, not for medium', function () {
    $failedAt = Carbon::now();
    $twelveHoursAgo = Carbon::now()->subHours(12);

    // High: 201 webhook failures inside the scan window.
    foreach (range(1, 201) as $ignored) {
        makeFailedWebhookJobExp('App\\Exceptions\\WebhookException: ssl error', $failedAt);
    }

    // Medium: 55 failures of one job class within the last 24 hours.
    foreach (range(1, 55) as $ignored) {
        makeFailedJob(
            'App\\Jobs\\CleanupInactiveSupplierProjectsJob',
            'RuntimeException: cleanup fail',
            $twelveHoursAgo,
        );
    }

    $this->artisan('incidents:watch-failures', ['--threshold-daily' => 50])->assertSuccessful();

    // Exactly one mail: the one for the high webhook incident.
    Mail::assertSent(IncidentDetectedMail::class, 1);
});
// Without any admin row the command must exit successfully and create nothing.
test('warn-only when no saas_admin_users exist', function () {
    // Drop every admin, including the one added by beforeEach.
    DB::table('saas_admin_users')->delete();

    $failedAt = Carbon::now();

    foreach (range(1, 11) as $ignored) {
        makeFailedJob('App\\Jobs\\SyncSupplierProjectsJob', 'RuntimeException: no admin', $failedAt);
    }

    // The exit code must be SUCCESS, not FAILURE.
    $this->artisan('incidents:watch-failures', ['--threshold-spike' => 10])->assertSuccessful();

    // No incident rows, because there is no admin id for the NOT NULL foreign key.
    expect(DB::table('incidents_log')->count())->toBe(0);

    // And therefore no alert mail either.
    Mail::assertNotSent(IncidentDetectedMail::class);
});