From d50a3d5108b4af685d9d0c19a66bf48fa3bb5489 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=BC=D0=B8=D1=82=D1=80=D0=B8=D0=B9?= Date: Tue, 30 Jun 2026 10:16:22 +0300 Subject: [PATCH] =?UTF-8?q?feat(=D0=B0=D0=B2=D1=82=D0=BE=D0=BF=D0=BE=D0=B4?= =?UTF-8?q?=D0=B1=D0=BE=D1=80):=20HtmlPhoneScanner=20=E2=80=94=20=D0=BD?= =?UTF-8?q?=D0=BE=D0=BC=D0=B5=D1=80=D0=B0=20=D0=B8=D0=B7=20=D0=BA=D0=BE?= =?UTF-8?q?=D0=B4=D0=B0=20(tel/schema/microdata/=D1=82=D0=B5=D0=BB=D0=BE/e?= =?UTF-8?q?mail)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 --- .../Agent/Extract/HtmlPhoneScanner.php | 75 +++++++++++++++++++ .../Extract/HtmlPhoneScannerTest.php | 26 +++++++ 2 files changed, 101 insertions(+) create mode 100644 app/app/Services/Autopodbor/Agent/Extract/HtmlPhoneScanner.php create mode 100644 app/tests/Unit/Autopodbor/Extract/HtmlPhoneScannerTest.php diff --git a/app/app/Services/Autopodbor/Agent/Extract/HtmlPhoneScanner.php b/app/app/Services/Autopodbor/Agent/Extract/HtmlPhoneScanner.php new file mode 100644 index 00000000..1d9e7dc0 --- /dev/null +++ b/app/app/Services/Autopodbor/Agent/Extract/HtmlPhoneScanner.php @@ -0,0 +1,75 @@ +>, body: array, emails: list} */ + public function scan(string $html): array + { + $code = []; + $add = function (string $raw, string $slot) use (&$code): void { + $n = $this->normalizeMaybe($raw); + if ($n === null) { + return; + } + $code[$n] ??= []; + if (! in_array($slot, $code[$n], true)) { + $code[$n][] = $slot; + } + }; + + if (preg_match_all('/tel:([+0-9()\s-]{7,})/i', $html, $m)) { + foreach ($m[1] as $x) { + $add($x, 'tel'); + } + } + if (preg_match_all('/"telephone"\s*:\s*"([^"]+)"/i', $html, $m)) { + foreach ($m[1] as $x) { + $add($x, 'schema'); + } + } + if (preg_match_all('/itemprop=["\']telephone["\'][^>]*content=["\']([^"\']+)/i', $html, $m)) { + foreach ($m[1] as $x) { + $add($x, 'microdata'); + } + } + + $body = []; + if (preg_match_all('/(?:\+7|8)[\s(\-]*\d{3}[\s)\-]*\d{3}[\s\-]*\d{2}[\s\-]*\d{2}/', $html, $m)) { + foreach ($m[0] as $x) { + $n = $this->normalizeMaybe($x); + if ($n !== null) { + $body[$n] = ($body[$n] ?? 0) + 1; + } + } + } + + $emails = []; + if (preg_match_all('/([a-z0-9._%+-]+)@[a-z0-9.-]+\.[a-z]{2,}/i', $html, $m)) { + foreach ($m[1] as $local) { + $d = preg_replace('/\D+/', '', $local) ?? ''; + if (strlen($d) >= 7) { + $emails[] = $d; + } + } + } + + return ['code' => $code, 'body' => $body, 'emails' => $emails]; + } + + private function normalizeMaybe(string $raw): ?string + { + $digits = preg_replace('/\D+/', '', $raw) ?? ''; + if (strlen($digits) === 11 && ($digits[0] === '8' || $digits[0] === '7')) { + return '7'.substr($digits, 1); + } + if (strlen($digits) === 10) { + return '7'.$digits; + } + + return null; + } +} diff --git a/app/tests/Unit/Autopodbor/Extract/HtmlPhoneScannerTest.php b/app/tests/Unit/Autopodbor/Extract/HtmlPhoneScannerTest.php new file mode 100644 index 00000000..a1b68890 --- /dev/null +++ b/app/tests/Unit/Autopodbor/Extract/HtmlPhoneScannerTest.php @@ -0,0 +1,26 @@ +звонок2'; + $r = (new HtmlPhoneScanner)->scan($html); + expect($r['code'])->toHaveKey('78432032533') + ->and($r['code']['78432032533'])->toContain('tel') + ->and($r['code'])->toHaveKey('78432452533'); +}); + +it('берёт номера из schema.org и microdata', function () { + $html = '' + .'x'; + $r = (new HtmlPhoneScanner)->scan($html); + expect($r['code']['78432032533'])->toContain('schema') + ->and($r['code']['78432452533'])->toContain('microdata'); +}); + +it('считает вхождения в тело и берёт e-mail-цифры', function () { + $html = 'тел 8(843)203-25-33, ещё 8(843)203-25-33. почта 2032533@mail.ru'; + $r = (new HtmlPhoneScanner)->scan($html); + expect($r['body']['78432032533'])->toBe(2) + ->and($r['emails'])->toContain('2032533'); +});