diff --git a/.github/workflows/sql-runner.yml b/.github/workflows/sql-runner.yml
new file mode 100644
index 00000000..481e3fcd
--- /dev/null
+++ b/.github/workflows/sql-runner.yml
@@ -0,0 +1,148 @@
+name: Run whitelisted SQL on liderra.ru
+
+on:
+  workflow_dispatch:
+    inputs:
+      sql:
+        description: 'SQL query (SELECT only by default; UPDATE/DELETE need confirm_mutating=true)'
+        required: true
+        type: string
+      confirm_mutating:
+        description: 'Подтверждаю UPDATE/DELETE на проде'
+        required: false
+        default: false
+        type: boolean
+
+# The job never calls the GitHub API, so GITHUB_TOKEN gets no scopes.
+permissions: {}
+
+jobs:
+  run:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    env:
+      LIDERRA_HOST: 111.88.246.137
+      LIDERRA_USER: ubuntu
+      # Inputs reach the scripts as env vars only (never spliced into the
+      # script text), so the SQL cannot inject shell code on the runner.
+      SQL: ${{ github.event.inputs.sql }}
+      CONFIRM_MUT: ${{ github.event.inputs.confirm_mutating }}
+
+    steps:
+      - name: Whitelist check
+        id: whitelist
+        run: |
+          set -euo pipefail
+          # Lowercased, trimmed copy used for matching only; the untouched
+          # $SQL is what runs later. printf, unlike echo, never treats
+          # input such as "-n" as an option.
+          SQL_LOWER=$(printf '%s' "$SQL" | tr '[:upper:]' '[:lower:]' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
+
+          # The regexes below anchor only the START of the input, so a
+          # stacked statement ("select 1; drop table x") would pass and
+          # psql -c would run all of it. Accept one optional trailing ';'
+          # and nothing else; a ';' inside a string literal is rejected
+          # too (fail closed).
+          SQL_BODY="${SQL_LOWER%;}"
+          if [[ "$SQL_BODY" == *";"* ]]; then
+            echo "::error::Only one statement is allowed (found ';' before the end)."
+            exit 1
+          fi
+
+          # Allow: SELECT / WITH (CTE) / EXPLAIN / psql \d* describe commands.
+          # '\\d' is a prefix match, so it also covers \df, \di, \dt.
+          READ_RE='^(select |with |explain |\\d)'
+
+          # Mutating allowed if confirm=true: targeted UPDATE/DELETE on specific tables
+          MUTATING_RE='^(update supplier_leads|update failed_webhook_jobs|update scheduler_heartbeats|delete from failed_webhook_jobs|delete from incidents_log) '
+          # A mutating statement must be narrowed by a WHERE clause.
+          WHERE_RE='[[:space:]]where[[:space:]]'
+
+          if [[ "$SQL_BODY" =~ $READ_RE ]]; then
+            echo "::notice::SELECT/read-only — allowed."
+            echo "mode=read" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          if [[ "$SQL_BODY" =~ $MUTATING_RE ]]; then
+            if [[ "$CONFIRM_MUT" != "true" ]]; then
+              echo "::error::Mutating SQL requires confirm_mutating=true."
+              exit 1
+            fi
+            if [[ ! "$SQL_BODY" =~ $WHERE_RE ]]; then
+              echo "::error::Mutating SQL must contain a WHERE clause."
+              exit 1
+            fi
+            echo "::warning::Mutating SQL authorized."
+            echo "mode=mutating" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          echo "::error::SQL not in whitelist: $SQL_LOWER"
+          exit 1
+
+      - name: Setup SSH key
+        env:
+          # Pass the key through the environment instead of pasting it
+          # into the script text, where quotes or '$' would be re-parsed.
+          SSH_KEY: ${{ secrets.LIDERRA_SSH_KEY }}
+        run: |
+          set -euo pipefail
+          # Restrictive umask: the key file is private from creation on.
+          umask 077
+          mkdir -p ~/.ssh
+          printf '%s\n' "$SSH_KEY" > ~/.ssh/liderra_deploy
+          chmod 600 ~/.ssh/liderra_deploy
+          # NOTE(review): ssh-keyscan trusts whichever host answers at run
+          # time; pinning the server's host key would remove that gap.
+          ssh-keyscan -H "$LIDERRA_HOST" >> ~/.ssh/known_hosts 2>/dev/null
+
+      - name: Run on prod
+        env:
+          # "read" or "mutating", as decided by the whitelist step.
+          SQL_MODE: ${{ steps.whitelist.outputs.mode }}
+        run: |
+          set -o pipefail
+          # Base64 carries quotes and newlines intact through the remote
+          # shell's command line.
+          SQL_B64=$(printf '%s' "$SQL" | base64 -w0)
+          ssh -i ~/.ssh/liderra_deploy "$LIDERRA_USER@$LIDERRA_HOST" \
+            "SQL_B64='$SQL_B64' SQL_MODE='$SQL_MODE' bash -s" <<'REMOTE' | tee /tmp/sql.log
+          SQL=$(echo "$SQL_B64" | base64 -d)
+          echo "=== Running on $(hostname) at $(date -u) ==="
+          echo "SQL: $SQL"
+          echo
+          if [ "$SQL_MODE" = "mutating" ]; then
+            sudo -u postgres psql -d liderra -c "$SQL"
+          else
+            # Read path: the session defaults to read-only transactions, so
+            # a write hidden in a CTE or EXPLAIN ANALYZE fails server-side.
+            sudo -u postgres psql -d "dbname=liderra options='-c default_transaction_read_only=on'" -c "$SQL"
+          fi
+          RC=$?
+          echo
+          echo "=== Exit code: $RC ==="
+          exit $RC
+          REMOTE
+
+      - name: Summary
+        if: always()
+        run: |
+          {
+            echo "## SQL on prod"
+            echo
+            echo '```sql'
+            printf '%s\n' "$SQL"
+            echo '```'
+            echo
+            echo '```'
+            # The log is absent when an earlier step failed; "|| echo"
+            # keeps the default "bash -e" shell from aborting here and
+            # leaving the code fence open.
+            cat /tmp/sql.log 2>/dev/null || echo "(no output: the query was not executed)"
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Cleanup
+        if: always()
+        run: rm -f ~/.ssh/liderra_deploy
diff --git a/docs/ops/2026-05-29-stage5-stuck-leads-cleanup.md b/docs/ops/2026-05-29-stage5-stuck-leads-cleanup.md
new file mode 100644
index 00000000..72d74277
--- /dev/null
+++ b/docs/ops/2026-05-29-stage5-stuck-leads-cleanup.md
@@ -0,0 +1,100 @@
+# Ops: Cleanup застрявших supplier_leads 1110, 1157
+
+**Дата операции:** 2026-05-29
+**Причина:** Finding 2 Stage 5 monitoring — 256 782 строк в `failed_webhook_jobs` от 2 лидов
+за 24ч. Root cause: поставщик crm.bp-gr.ru шлёт B1+SMS combo, constraint
+`chk_supplier_projects_b1_not_for_sms` запрещает → DomainException → 3 retries → storm.
+
+**Связанные артефакты:**
+- `docs/superpowers/plans/2026-05-29-supplier-webhook-fast-fail-and-stuck-cleanup.md`
+- `docs/superpowers/plans/2026-05-29-stage5-monitoring-checklist.md` → day 1 → Finding 2
+- GitHub Actions run `26616602381` (investigate-day1-round3 artifact)
+
+---
+
+## Step 2 — Snapshot before mutation (заполнить перед выполнением Step 3)
+
+```
+# Команда:
+# gh workflow run sql-runner.yml \
+#   -f sql="SELECT id, phone, error, processed_at FROM supplier_leads WHERE id IN (1110, 1157);"
+#
+# Вставить вывод сюда:
+[PLACEHOLDER — вставить снимок строк из gh workflow run output]
+```
+
+**Ожидаемые данные:**
+- `id=1110`: phone=+79333*4038, error LIKE '%does not support SMS%', processed_at=NULL
+- `id=1157`: phone=+79333*4038 (тот же), error LIKE '%does not support SMS%', processed_at=NULL
+
+---
+
+## Операции (выполнить по порядку)
+
+### Step 3 — UPDATE supplier_leads (2 rows)
+
+```bash
+gh workflow run sql-runner.yml \
+  -f sql="UPDATE supplier_leads SET processed_at = NOW(), error = COALESCE(error,'') || ' [admin-resolved 2026-05-29: B1+SMS unsupported, see plan 2026-05-29-supplier-webhook-fast-fail]' WHERE id IN (1110, 1157) AND processed_at IS NULL;" \
+  -f confirm_mutating=true
+```
+
+Expected: **2 rows updated**.
+
+### Step 4 — UPDATE failed_webhook_jobs (~256k rows)
+
+```bash
+gh workflow run sql-runner.yml \
+  -f sql="UPDATE failed_webhook_jobs SET resolved_at = NOW(), retried_by = 'admin-cleanup-2026-05-29' WHERE raw_payload->>'supplier_lead_id' IN ('1110','1157') AND resolved_at IS NULL;" \
+  -f confirm_mutating=true
+```
+
+Expected: ~256 000 rows updated. Может занять 1-3 минуты.
+
+### Step 5 — Verify storm остановлен
+
+```bash
+gh workflow run sql-runner.yml \
+  -f sql="SELECT COUNT(*) FROM failed_webhook_jobs WHERE failed_at > NOW() - INTERVAL '1 hour' AND resolved_at IS NULL;"
+```
+
+Через 1 час после Step 4: ожидаем count < 100 (норма — единичные ошибки).
+
+---
+
+## Rollback Instructions
+
+Если что-то пошло не так и нужно откатить Step 3:
+
+```sql
+-- Откатить processed_at обратно на NULL для обоих лидов
+-- ТОЛЬКО если processed_at был проставлен ошибочно (убедиться, что deals НЕ создались)
+UPDATE supplier_leads
+SET processed_at = NULL,
+    error = LEFT(error, STRPOS(error, ' [admin-resolved 2026-05-29') - 1)
+WHERE id IN (1110, 1157)
+  AND error LIKE '%[admin-resolved 2026-05-29%';
+```
+
+Откат Step 4 (`failed_webhook_jobs`) не нужен — resolved_at = логическое закрытие строк,
+обратная операция смысла не имеет. При необходимости реоткрыть:
+
+```sql
+UPDATE failed_webhook_jobs
+SET resolved_at = NULL
+WHERE retried_by = 'admin-cleanup-2026-05-29'
+  AND resolved_at IS NOT NULL;
+```
+
+---
+
+## Log фактического выполнения
+
+| Шаг | Время UTC | Результат | Комментарий |
+|-----|-----------|-----------|-------------|
+| Step 2 snapshot | [PLACEHOLDER] | [rows] | — |
+| Step 3 UPDATE leads | [PLACEHOLDER] | [N rows updated] | — |
+| Step 4 UPDATE failed_jobs | [PLACEHOLDER] | [N rows updated] | — |
+| Step 5 verify | [PLACEHOLDER] | [count] | — |
+
+**Operator:** [PLACEHOLDER — имя/логин того, кто выполнял]