527f628a21
Закрывает дыры #3 (доп. пороги) и #5 (доп. job-классы) аудита журналирования. Что добавлено: * СКАН failed_jobs (Laravel-standard) дополнительно к failed_webhook_jobs: покрывает 7 ShouldQueue классов которые раньше не алертились (SyncSupplierProject, ImportLeads, GenerateReport, CsvReconcile, CleanupInactiveSupplierProjects, RefreshSupplierSession, DeleteSupplierProject) * 3 правила детекции для failed_jobs: - spike: ≥10 failures одного job-класса за окно 10 мин → severity=high - daily-total: ≥50 failures одного job-класса за 24ч → severity=medium - persistent: exception повторяется >3ч → severity=medium * Группировка по (job_class, LEFT(exception, 80)) через JSON-экстракт `payload::json->>'displayName'` * Дедуп переведён с LIKE %summary% на точное совпадение root_cause — надёжно и без false-positive * Mailable IncidentDetectedMail (отдельный от SchedulerHeartbeatMissingMail), отправка ТОЛЬКО при severity=high (medium = тихий signal в incidents_log) * warn-only при отсутствии saas_admin_users (паттерн VerifyAuditChains) Параметры команды (новые): --threshold-spike=10 --threshold-daily=50 --persistent-hours=3 (старые --window=10 --threshold=200 --dedup-window=60 сохранены) Тесты: 11/11 passed (4 старых + 7 новых, 37 assertions, 3.6s). Plan: docs/superpowers/plans/2026-05-23-7-holes-overview.md (#3+#5).
231 lines
9.1 KiB
PHP
231 lines
9.1 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Console\Commands;
|
|
|
|
use App\Mail\IncidentDetectedMail;
|
|
use Illuminate\Console\Command;
|
|
use Illuminate\Support\Carbon;
|
|
use Illuminate\Support\Facades\DB;
|
|
use Illuminate\Support\Facades\Mail;
|
|
|
|
/**
 * Scans failed_webhook_jobs and failed_jobs and records incidents in
 * incidents_log when failure thresholds are exceeded.
 *
 * failed_webhook_jobs: one rule — spike >= threshold (default 200) → high.
 * failed_jobs: three rules:
 *  - spike: failures of one job class / exception signature within the
 *    window >= threshold-spike (default 10) → high
 *  - daily-total: failures of one job class within 24h >= threshold-daily
 *    (default 50) → medium
 *  - persistent: a job class / exception signature that has a failure older
 *    than persistent-hours (default 3) → medium
 *
 * Dedup: a finding is skipped when an unresolved incident with the same
 * signature (stored in root_cause) was detected within the last dedup-window
 * minutes. An e-mail is sent only for severity=high.
 */
class IncidentsWatchFailures extends Command
{
    private const DB_CONNECTION = 'pgsql_supplier';

    /** Recipient of high-severity incident notifications. */
    private const ALERT_RECIPIENT = 'kdv1@bk.ru';

    /** Number of characters of the webhook exception signature used as the dedup key. */
    private const WEBHOOK_DEDUP_KEY_LENGTH = 80;

    protected $signature = 'incidents:watch-failures
        {--window=10 : Окно сканирования в минутах}
        {--threshold=200 : Порог спайка для failed_webhook_jobs}
        {--threshold-spike=10 : Порог спайка для failed_jobs (за окно)}
        {--threshold-daily=50 : Порог суммы за 24ч для failed_jobs}
        {--persistent-hours=3 : Порог возраста persistent-exception для failed_jobs}
        {--dedup-window=60 : Окно дедупа открытых инцидентов в минутах}';

    protected $description = 'Сканирует failed_webhook_jobs и failed_jobs, создаёт incidents_log на превышение порогов';

    /**
     * Runs all detection rules and reports how many incidents were created.
     *
     * Returns SUCCESS even when no active SaaS admin exists: in that case the
     * command only warns, because incidents_log rows need an admin id.
     */
    public function handle(): int
    {
        $windowMinutes = (int) $this->option('window');
        $threshold = (int) $this->option('threshold');
        $thresholdSpike = (int) $this->option('threshold-spike');
        $thresholdDaily = (int) $this->option('threshold-daily');
        $persistentHours = (int) $this->option('persistent-hours');
        $dedupMinutes = (int) $this->option('dedup-window');

        // Derive every time bound from a single "now" so all rules of one run
        // share the same reference point. Carbon is mutable, hence copy().
        $now = Carbon::now();
        $since = $now->copy()->subMinutes($windowMinutes);
        $since24h = $now->copy()->subHours(24);
        $dedupAt = $now->copy()->subMinutes($dedupMinutes);
        $persistentSince = $now->copy()->subHours($persistentHours);

        // Incidents are attributed to an active SaaS admin (created_by_admin_id).
        // Ordering makes the choice deterministic when several admins exist.
        $adminId = DB::connection(self::DB_CONNECTION)
            ->table('saas_admin_users')
            ->where('is_active', true)
            ->whereNull('deleted_at')
            ->orderBy('id')
            ->value('id');

        if ($adminId === null) {
            $this->warn('No active saas_admin_users found — skipping incident creation (warn-only).');

            return self::SUCCESS;
        }

        $created = 0;
        $created += $this->scanWebhookSpikes($adminId, $threshold, $windowMinutes, $since, $dedupAt, $now);
        $created += $this->scanJobSpikes($adminId, $thresholdSpike, $windowMinutes, $since, $dedupAt, $now);
        $created += $this->scanJobDailyTotals($adminId, $thresholdDaily, $since24h, $dedupAt, $now);
        $created += $this->scanPersistentJobFailures($adminId, $persistentHours, $persistentSince, $dedupAt, $now);

        $this->info("Done. Created {$created} incident(s).");

        return self::SUCCESS;
    }

    /**
     * Rule for failed_webhook_jobs: unresolved failures sharing the same
     * 180-character exception prefix, at least $threshold within the window.
     *
     * @return int number of incidents created
     */
    private function scanWebhookSpikes(
        int $adminId,
        int $threshold,
        int $windowMinutes,
        Carbon $since,
        Carbon $dedupAt,
        Carbon $now,
    ): int {
        $groups = DB::connection(self::DB_CONNECTION)
            ->table('failed_webhook_jobs')
            ->selectRaw('LEFT(exception, 180) AS sig, COUNT(*) AS cnt')
            ->whereNull('resolved_at')
            ->where('failed_at', '>=', $since)
            ->groupByRaw('LEFT(exception, 180)')
            ->havingRaw('COUNT(*) >= ?', [$threshold])
            ->get();

        $created = 0;

        foreach ($groups as $group) {
            $sig = (string) $group->sig;
            $count = (int) $group->cnt;

            // Character-wise truncation (like SQL LEFT): a byte-wise substr()
            // can split a multi-byte character and yield invalid UTF-8, which
            // PostgreSQL rejects both in the dedup lookup and in the insert.
            $dedupKey = mb_substr($sig, 0, self::WEBHOOK_DEDUP_KEY_LENGTH);

            if ($this->isDup($dedupKey, $dedupAt)) {
                $this->line("Skipping webhook (dedup): {$dedupKey}");

                continue;
            }

            $summary = "Автоматически: {$count} упавших webhook-джобов за {$windowMinutes} мин. Сигнатура: {$sig}";

            $this->createIncident($adminId, 'other', 'high', $summary, $since, $now, $dedupKey);
            $created++;
            $this->info("Webhook incident [high]: {$count} failures");
        }

        return $created;
    }

    /**
     * Rule "spike" for failed_jobs: at least $thresholdSpike failures of one
     * (job class, 80-character exception prefix) pair within the window.
     *
     * @return int number of incidents created
     */
    private function scanJobSpikes(
        int $adminId,
        int $thresholdSpike,
        int $windowMinutes,
        Carbon $since,
        Carbon $dedupAt,
        Carbon $now,
    ): int {
        $rows = DB::connection(self::DB_CONNECTION)
            ->table('failed_jobs')
            ->selectRaw(
                "payload::json->>'displayName' AS job_class, ".
                'LEFT(exception, 80) AS exc_sig, '.
                'COUNT(*) AS cnt'
            )
            ->where('failed_at', '>=', $since)
            ->groupByRaw("payload::json->>'displayName', LEFT(exception, 80)")
            ->havingRaw('COUNT(*) >= ?', [$thresholdSpike])
            ->get();

        $created = 0;

        foreach ($rows as $row) {
            $jobClass = (string) $row->job_class;
            $excSig = (string) $row->exc_sig;
            $cnt = (int) $row->cnt;
            $dedupKey = "spike:{$jobClass}:{$excSig}";

            if ($this->isDup($dedupKey, $dedupAt)) {
                $this->line("Skipping spike (dedup): {$dedupKey}");

                continue;
            }

            $summary = "Автоматически: spike {$cnt} failures job={$jobClass} за {$windowMinutes} мин. Exc: {$excSig}";

            $this->createIncident($adminId, 'other', 'high', $summary, $since, $now, $dedupKey);
            $created++;
            $this->info("Job spike [high]: {$jobClass} — {$cnt}");
        }

        return $created;
    }

    /**
     * Rule "daily-total" for failed_jobs: at least $thresholdDaily failures
     * of one job class since $since24h.
     *
     * @return int number of incidents created
     */
    private function scanJobDailyTotals(
        int $adminId,
        int $thresholdDaily,
        Carbon $since24h,
        Carbon $dedupAt,
        Carbon $now,
    ): int {
        $rows = DB::connection(self::DB_CONNECTION)
            ->table('failed_jobs')
            ->selectRaw(
                "payload::json->>'displayName' AS job_class, ".
                'COUNT(*) AS cnt'
            )
            ->where('failed_at', '>=', $since24h)
            ->groupByRaw("payload::json->>'displayName'")
            ->havingRaw('COUNT(*) >= ?', [$thresholdDaily])
            ->get();

        $created = 0;

        foreach ($rows as $row) {
            $jobClass = (string) $row->job_class;
            $cnt = (int) $row->cnt;
            $dedupKey = "daily:{$jobClass}";

            if ($this->isDup($dedupKey, $dedupAt)) {
                $this->line("Skipping daily (dedup): {$dedupKey}");

                continue;
            }

            $summary = "Автоматически: daily-total {$cnt} failures job={$jobClass} за 24ч";

            $this->createIncident($adminId, 'other', 'medium', $summary, $since24h, $now, $dedupKey);
            $created++;
            $this->info("Job daily [medium]: {$jobClass} — {$cnt}");
        }

        return $created;
    }

    /**
     * Rule "persistent" for failed_jobs: every (job class, 80-character
     * exception prefix) pair that has a failure at or before $persistentSince.
     * The incident's started_at is the oldest such failure.
     *
     * NOTE(review): the query only looks at failures older than the cutoff; it
     * does not check that the same exception is still occurring recently.
     * Confirm that "any old failed job" is the intended meaning of persistent.
     *
     * @return int number of incidents created
     */
    private function scanPersistentJobFailures(
        int $adminId,
        int $persistentHours,
        Carbon $persistentSince,
        Carbon $dedupAt,
        Carbon $now,
    ): int {
        $rows = DB::connection(self::DB_CONNECTION)
            ->table('failed_jobs')
            ->selectRaw(
                "payload::json->>'displayName' AS job_class, ".
                'LEFT(exception, 80) AS exc_sig, '.
                'MIN(failed_at) AS oldest_at, '.
                'COUNT(*) AS cnt'
            )
            ->where('failed_at', '<=', $persistentSince)
            ->groupByRaw("payload::json->>'displayName', LEFT(exception, 80)")
            ->get();

        $created = 0;

        foreach ($rows as $row) {
            $jobClass = (string) $row->job_class;
            $excSig = (string) $row->exc_sig;
            $dedupKey = "persistent:{$jobClass}:{$excSig}";

            if ($this->isDup($dedupKey, $dedupAt)) {
                $this->line("Skipping persistent (dedup): {$dedupKey}");

                continue;
            }

            $summary = "Автоматически: persistent exception job={$jobClass} повторяется >{$persistentHours}ч. Exc: {$excSig}";

            $this->createIncident($adminId, 'other', 'medium', $summary, Carbon::parse($row->oldest_at), $now, $dedupKey);
            $created++;
            $this->info("Job persistent [medium]: {$jobClass}");
        }

        return $created;
    }

    /**
     * Tells whether an unresolved incident with the given signature was
     * already detected at or after $dedupAt.
     *
     * The signature is matched exactly against incidents_log.root_cause,
     * which is where createIncident() stores it.
     */
    private function isDup(string $dedupKey, Carbon $dedupAt): bool
    {
        return DB::connection(self::DB_CONNECTION)
            ->table('incidents_log')
            ->where('root_cause', $dedupKey)
            ->whereNull('resolved_at')
            ->where('detected_at', '>=', $dedupAt)
            ->exists();
    }

    /**
     * Inserts an incidents_log row and, for severity "high", e-mails the
     * alert recipient.
     *
     * @param int    $adminId   value for created_by_admin_id
     * @param string $type      incident type
     * @param string $severity  incident severity; only "high" triggers an e-mail
     * @param string $summary   human-readable description, also used as the mail text
     * @param Carbon $startedAt when the problem started
     * @param Carbon $now       used for detected_at, created_at and updated_at
     * @param string $dedupKey  signature stored in root_cause; empty string stores NULL
     */
    private function createIncident(
        int $adminId,
        string $type,
        string $severity,
        string $summary,
        Carbon $startedAt,
        Carbon $now,
        string $dedupKey = '',
    ): void {
        DB::connection(self::DB_CONNECTION)->table('incidents_log')->insert([
            'type' => $type,
            'severity' => $severity,
            'summary' => $summary,
            'root_cause' => $dedupKey !== '' ? $dedupKey : null,
            'started_at' => $startedAt,
            'detected_at' => $now,
            'resolved_at' => null,
            'created_by_admin_id' => $adminId,
            'created_at' => $now,
            'updated_at' => $now,
        ]);

        if ($severity !== 'high') {
            return;
        }

        // The incident row is already stored. A mail transport failure must
        // not abort the scan, otherwise the remaining rules never run and the
        // dedup check hides this incident on the next run.
        try {
            Mail::to(self::ALERT_RECIPIENT)->send(new IncidentDetectedMail($summary, $severity));
        } catch (\Throwable $e) {
            report($e);
            $this->error("Failed to send incident notification: {$e->getMessage()}");
        }
    }
}
|