Files
portal/app/app/Console/Commands/IncidentsWatchFailures.php
T
Дмитрий 527f628a21 feat(ops): incidents:watch-failures расширен на failed_jobs + 3 правила (holes #3+#5)
Закрывает дыры #3 (доп. пороги) и #5 (доп. job-классы) аудита журналирования.

Что добавлено:
* СКАН failed_jobs (Laravel-standard) дополнительно к failed_webhook_jobs:
  покрывает 7 ShouldQueue классов которые раньше не алертились
  (SyncSupplierProject, ImportLeads, GenerateReport, CsvReconcile,
  CleanupInactiveSupplierProjects, RefreshSupplierSession, DeleteSupplierProject)
* 3 правила детекции для failed_jobs:
  - spike: ≥10 failures одного job-класса за окно 10 мин → severity=high
  - daily-total: ≥50 failures одного job-класса за 24ч → severity=medium
  - persistent: exception повторяется >3ч → severity=medium
* Группировка по (job_class, LEFT(exception, 80)) через JSON-экстракт
  `payload::json->>'displayName'`
* Дедуп переведён с LIKE %summary% на точное совпадение root_cause —
  надёжно и без false-positive
* Mailable IncidentDetectedMail (отдельный от SchedulerHeartbeatMissingMail),
  отправка ТОЛЬКО при severity=high (medium = тихий signal в incidents_log)
* warn-only при отсутствии saas_admin_users (паттерн VerifyAuditChains)

Параметры команды (новые):
  --threshold-spike=10 --threshold-daily=50 --persistent-hours=3
  (старые --window=10 --threshold=200 --dedup-window=60 сохранены)

Тесты: 11/11 passed (4 старых + 7 новых, 37 assertions, 3.6s).

Plan: docs/superpowers/plans/2026-05-23-7-holes-overview.md (#3+#5).
2026-05-23 12:01:20 +03:00

231 lines
9.1 KiB
PHP

<?php
declare(strict_types=1);
namespace App\Console\Commands;
use App\Mail\IncidentDetectedMail;
use Illuminate\Console\Command;
use Illuminate\Support\Carbon;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\Mail;
/**
 * Scans failed_webhook_jobs and failed_jobs and records an incidents_log row
 * for every group of failures that crosses a threshold.
 *
 * failed_webhook_jobs — one rule:
 *  - spike: unresolved failures sharing the first 180 characters of the exception,
 *    failed within --window minutes, count >= --threshold (default 200) → high
 *
 * failed_jobs — three rules:
 *  - spike: failures of one (job class, first 80 exception characters) within
 *    --window minutes, count >= --threshold-spike (default 10) → high
 *  - daily-total: failures of one job class over the last 24 hours,
 *    count >= --threshold-daily (default 50) → medium
 *  - persistent: a (job class, first 80 exception characters) group that has
 *    a failure at least --persistent-hours (default 3) old → medium
 *
 * Dedup: a rule is skipped when an unresolved incident with the same signature
 * (stored in root_cause) was detected less than --dedup-window minutes ago.
 * An e-mail is sent only for severity=high incidents.
 */
class IncidentsWatchFailures extends Command
{
    private const DB_CONNECTION = 'pgsql_supplier';

    /** Recipient of the alert e-mail sent for severity=high incidents. */
    private const ALERT_EMAIL = 'kdv1@bk.ru';

    /** Number of leading exception characters used as the webhook dedup key. */
    private const WEBHOOK_DEDUP_KEY_LENGTH = 80;

    protected $signature = 'incidents:watch-failures
        {--window=10 : Окно сканирования в минутах}
        {--threshold=200 : Порог спайка для failed_webhook_jobs}
        {--threshold-spike=10 : Порог спайка для failed_jobs (за окно)}
        {--threshold-daily=50 : Порог суммы за 24ч для failed_jobs}
        {--persistent-hours=3 : Порог возраста persistent-exception для failed_jobs}
        {--dedup-window=60 : Окно дедупа открытых инцидентов в минутах}';

    protected $description = 'Сканирует failed_webhook_jobs и failed_jobs, создаёт incidents_log на превышение порогов';

    /**
     * Runs all four detection rules and reports how many incidents were created.
     *
     * @return int always self::SUCCESS; a missing active admin is reported as a warning only
     */
    public function handle(): int
    {
        $windowMinutes = (int) $this->option('window');
        $threshold = (int) $this->option('threshold');
        $thresholdSpike = (int) $this->option('threshold-spike');
        $thresholdDaily = (int) $this->option('threshold-daily');
        $persistentHours = (int) $this->option('persistent-hours');
        $dedupMinutes = (int) $this->option('dedup-window');

        // One clock reading for the whole run; copy() keeps $now untouched by the subtractions.
        $now = Carbon::now();
        $since = $now->copy()->subMinutes($windowMinutes);
        $since24h = $now->copy()->subHours(24);
        $dedupAt = $now->copy()->subMinutes($dedupMinutes);
        $persistentSince = $now->copy()->subHours($persistentHours);

        // Every incident row references an admin (created_by_admin_id), so one must exist.
        $adminId = DB::connection(self::DB_CONNECTION)
            ->table('saas_admin_users')
            ->where('is_active', true)
            ->whereNull('deleted_at')
            ->value('id');

        if ($adminId === null) {
            $this->warn('No active saas_admin_users found — skipping incident creation (warn-only).');

            return self::SUCCESS;
        }

        $created = 0;
        $created += $this->scanWebhookFailures($adminId, $threshold, $windowMinutes, $since, $dedupAt, $now);
        $created += $this->scanJobSpikes($adminId, $thresholdSpike, $windowMinutes, $since, $dedupAt, $now);
        $created += $this->scanJobDailyTotals($adminId, $thresholdDaily, $since24h, $dedupAt, $now);
        $created += $this->scanPersistentJobFailures($adminId, $persistentHours, $persistentSince, $dedupAt, $now);

        $this->info("Done. Created {$created} incident(s).");

        return self::SUCCESS;
    }

    /**
     * Rule 1: spike of unresolved failed_webhook_jobs sharing an exception prefix.
     *
     * @return int number of incidents created
     */
    private function scanWebhookFailures(
        int $adminId,
        int $threshold,
        int $windowMinutes,
        Carbon $since,
        Carbon $dedupAt,
        Carbon $now,
    ): int {
        $groups = DB::connection(self::DB_CONNECTION)
            ->table('failed_webhook_jobs')
            ->selectRaw('LEFT(exception, 180) AS sig, COUNT(*) AS cnt')
            ->whereNull('resolved_at')
            ->where('failed_at', '>=', $since)
            ->groupByRaw('LEFT(exception, 180)')
            ->havingRaw('COUNT(*) >= ?', [$threshold])
            ->get();

        $created = 0;

        foreach ($groups as $group) {
            $sig = (string) $group->sig;
            $count = (int) $group->cnt;

            // Character-based cut: a byte-based substr() can split a multi-byte character
            // and yield an invalid UTF-8 string for the root_cause lookup and insert.
            $dedupKey = mb_substr($sig, 0, self::WEBHOOK_DEDUP_KEY_LENGTH, 'UTF-8');

            if ($this->isDup($dedupKey, $dedupAt)) {
                $this->line("Skipping webhook (dedup): {$dedupKey}");
                continue;
            }

            $summary = "Автоматически: {$count} упавших webhook-джобов за {$windowMinutes} мин. Сигнатура: {$sig}";
            $this->createIncident($adminId, 'other', 'high', $summary, $since, $now, $dedupKey);
            $created++;
            $this->info("Webhook incident [high]: {$count} failures");
        }

        return $created;
    }

    /**
     * Rule 2: spike of failed_jobs for one (job class, exception prefix) within the window.
     *
     * @return int number of incidents created
     */
    private function scanJobSpikes(
        int $adminId,
        int $thresholdSpike,
        int $windowMinutes,
        Carbon $since,
        Carbon $dedupAt,
        Carbon $now,
    ): int {
        $groups = DB::connection(self::DB_CONNECTION)
            ->table('failed_jobs')
            ->selectRaw(
                "payload::json->>'displayName' AS job_class, ".
                'LEFT(exception, 80) AS exc_sig, '.
                'COUNT(*) AS cnt'
            )
            ->where('failed_at', '>=', $since)
            ->groupByRaw("payload::json->>'displayName', LEFT(exception, 80)")
            ->havingRaw('COUNT(*) >= ?', [$thresholdSpike])
            ->get();

        $created = 0;

        foreach ($groups as $row) {
            $jobClass = (string) $row->job_class;
            $excSig = (string) $row->exc_sig;
            $cnt = (int) $row->cnt;
            $dedupKey = "spike:{$jobClass}:{$excSig}";

            if ($this->isDup($dedupKey, $dedupAt)) {
                $this->line("Skipping spike (dedup): {$dedupKey}");
                continue;
            }

            $summary = "Автоматически: spike {$cnt} failures job={$jobClass} за {$windowMinutes} мин. Exc: {$excSig}";
            $this->createIncident($adminId, 'other', 'high', $summary, $since, $now, $dedupKey);
            $created++;
            $this->info("Job spike [high]: {$jobClass} ({$cnt} failures)");
        }

        return $created;
    }

    /**
     * Rule 3: total failed_jobs of one job class over the last 24 hours.
     *
     * @return int number of incidents created
     */
    private function scanJobDailyTotals(
        int $adminId,
        int $thresholdDaily,
        Carbon $since24h,
        Carbon $dedupAt,
        Carbon $now,
    ): int {
        $groups = DB::connection(self::DB_CONNECTION)
            ->table('failed_jobs')
            ->selectRaw(
                "payload::json->>'displayName' AS job_class, ".
                'COUNT(*) AS cnt'
            )
            ->where('failed_at', '>=', $since24h)
            ->groupByRaw("payload::json->>'displayName'")
            ->havingRaw('COUNT(*) >= ?', [$thresholdDaily])
            ->get();

        $created = 0;

        foreach ($groups as $row) {
            $jobClass = (string) $row->job_class;
            $cnt = (int) $row->cnt;
            $dedupKey = "daily:{$jobClass}";

            if ($this->isDup($dedupKey, $dedupAt)) {
                $this->line("Skipping daily (dedup): {$dedupKey}");
                continue;
            }

            $summary = "Автоматически: daily-total {$cnt} failures job={$jobClass} за 24ч";
            $this->createIncident($adminId, 'other', 'medium', $summary, $since24h, $now, $dedupKey);
            $created++;
            $this->info("Job daily [medium]: {$jobClass} ({$cnt} failures)");
        }

        return $created;
    }

    /**
     * Rule 4: (job class, exception prefix) groups that have failed_jobs rows older than the cutoff.
     *
     * NOTE(review): the query matches every group with at least one row older than
     * $persistentSince, even if the exception never recurred afterwards. Confirm that
     * "persistent" is meant as "left in failed_jobs for N hours" rather than "still recurring".
     *
     * @return int number of incidents created
     */
    private function scanPersistentJobFailures(
        int $adminId,
        int $persistentHours,
        Carbon $persistentSince,
        Carbon $dedupAt,
        Carbon $now,
    ): int {
        $groups = DB::connection(self::DB_CONNECTION)
            ->table('failed_jobs')
            ->selectRaw(
                "payload::json->>'displayName' AS job_class, ".
                'LEFT(exception, 80) AS exc_sig, '.
                'MIN(failed_at) AS oldest_at, '.
                'COUNT(*) AS cnt'
            )
            ->where('failed_at', '<=', $persistentSince)
            ->groupByRaw("payload::json->>'displayName', LEFT(exception, 80)")
            ->get();

        $created = 0;

        foreach ($groups as $row) {
            $jobClass = (string) $row->job_class;
            $excSig = (string) $row->exc_sig;
            $dedupKey = "persistent:{$jobClass}:{$excSig}";

            if ($this->isDup($dedupKey, $dedupAt)) {
                $this->line("Skipping persistent (dedup): {$dedupKey}");
                continue;
            }

            $summary = "Автоматически: persistent exception job={$jobClass} повторяется >{$persistentHours}ч. Exc: {$excSig}";
            // The incident starts at the oldest matching failure, not at the scan time.
            $this->createIncident($adminId, 'other', 'medium', $summary, Carbon::parse($row->oldest_at), $now, $dedupKey);
            $created++;
            $this->info("Job persistent [medium]: {$jobClass}");
        }

        return $created;
    }

    /**
     * Tells whether an unresolved incident with this signature was detected at or after $dedupAt.
     *
     * @param string $dedupKey signature compared by exact match against incidents_log.root_cause
     * @param Carbon $dedupAt  lower bound for detected_at
     */
    private function isDup(string $dedupKey, Carbon $dedupAt): bool
    {
        return DB::connection(self::DB_CONNECTION)
            ->table('incidents_log')
            ->where('root_cause', $dedupKey)
            ->whereNull('resolved_at')
            ->where('detected_at', '>=', $dedupAt)
            ->exists();
    }

    /**
     * Inserts an open incident into incidents_log and e-mails the alert for severity=high.
     *
     * @param int    $adminId   value stored in created_by_admin_id
     * @param string $type      value stored in the type column
     * @param string $severity  value stored in the severity column; 'high' also triggers the e-mail
     * @param string $summary   human-readable description, also used as the e-mail content
     * @param Carbon $startedAt value stored in started_at
     * @param Carbon $now       value stored in detected_at, created_at and updated_at
     * @param string $dedupKey  signature stored in root_cause; an empty string is stored as NULL
     */
    private function createIncident(
        int $adminId,
        string $type,
        string $severity,
        string $summary,
        Carbon $startedAt,
        Carbon $now,
        string $dedupKey = '',
    ): void {
        DB::connection(self::DB_CONNECTION)->table('incidents_log')->insert([
            'type' => $type,
            'severity' => $severity,
            'summary' => $summary,
            'root_cause' => $dedupKey !== '' ? $dedupKey : null,
            'started_at' => $startedAt,
            'detected_at' => $now,
            'resolved_at' => null,
            'created_by_admin_id' => $adminId,
            'created_at' => $now,
            'updated_at' => $now,
        ]);

        if ($severity !== 'high') {
            return;
        }

        try {
            Mail::to(self::ALERT_EMAIL)->send(new IncidentDetectedMail($summary, $severity));
        } catch (\Throwable $e) {
            // The incident row is already stored, so a mail failure must not abort the
            // remaining rules; surface it through the exception handler and the console.
            report($e);
            $this->error("Incident e-mail was not sent: {$e->getMessage()}");
        }
    }
}