feat(автоподбор): HtmlPhoneScanner — номера из кода (tel/schema/microdata/тело/email)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Дмитрий
2026-06-30 10:16:22 +03:00
parent bc462d25fa
commit d50a3d5108
2 changed files with 101 additions and 0 deletions
@@ -0,0 +1,75 @@
<?php
declare(strict_types=1);
namespace App\Services\Autopodbor\Agent\Extract;
/**
 * Extracts Russian phone-number candidates from raw HTML markup.
 *
 * Three independent passes run over the same markup:
 *  - "code":   numbers found in machine-readable places (tel: links, JSON-LD
 *              "telephone" fields, microdata itemprop="telephone" attributes);
 *  - "body":   numbers written as +7/8 followed by ten digits anywhere in the markup;
 *  - "emails": digit runs taken from the local part of e-mail addresses.
 *
 * Phones in "code" and "body" are normalised to the 11-digit form 7XXXXXXXXXX.
 */
final class HtmlPhoneScanner
{
    /**
     * Regexes whose first capture group holds a phone candidate, grouped by the
     * slot name recorded for a hit. Order matters: slots are appended in this order.
     */
    private const CODE_PATTERNS = [
        'tel' => [
            '/tel:([+0-9()\s-]{7,})/i',
        ],
        'schema' => [
            '/"telephone"\s*:\s*"([^"]+)"/i',
        ],
        'microdata' => [
            // itemprop before content: <span itemprop="telephone" content="...">
            '/itemprop=["\']telephone["\'][^>]*content=["\']([^"\']+)/i',
            // content before itemprop: <meta content="..." itemprop="telephone">
            '/content=["\']([^"\']+)["\'][^>]*itemprop=["\']telephone["\']/i',
        ],
    ];

    /**
     * +7 / 8 followed by 3-3-2-2 digit groups with optional separators.
     * The lookarounds keep the match from starting or ending inside a longer
     * digit run (order ids, timestamps, ...), which would otherwise be reported
     * as a phone number.
     */
    private const BODY_PATTERN = '/(?<!\d)(?:\+7|8)[\s(\-]*\d{3}[\s)\-]*\d{3}[\s\-]*\d{2}[\s\-]*\d{2}(?!\d)/';

    /** E-mail address; the first capture group is the local part. */
    private const EMAIL_PATTERN = '/([a-z0-9._%+-]+)@[a-z0-9.-]+\.[a-z]{2,}/i';

    /**
     * Scans the markup and returns every phone candidate grouped by origin.
     *
     * - code:   normalised phone => distinct slots ('tel', 'schema', 'microdata') it was seen in;
     * - body:   normalised phone => number of textual occurrences in the whole markup
     *           (attributes and scripts included, since the raw string is scanned);
     * - emails: digit strings (7+ digits, not normalised) from e-mail local parts, duplicates kept.
     *
     * NOTE: PHP stores decimal-integer string keys as int, so when iterating "code"
     * or "body" the keys arrive as int wherever the value fits a native int; cast
     * with (string) before passing them to string-typed code under strict_types.
     *
     * @return array{code: array<string,list<string>>, body: array<string,int>, emails: list<string>}
     */
    public function scan(string $html): array
    {
        return [
            'code' => $this->collectCodePhones($html),
            'body' => $this->countBodyPhones($html),
            'emails' => $this->collectEmailDigits($html),
        ];
    }

    /**
     * Collects phones from tel: links, JSON-LD and microdata, remembering where each was seen.
     *
     * @return array<string,list<string>>
     */
    private function collectCodePhones(string $html): array
    {
        $found = [];
        foreach (self::CODE_PATTERNS as $slot => $patterns) {
            foreach ($patterns as $pattern) {
                if (! preg_match_all($pattern, $html, $matches)) {
                    continue;
                }
                foreach ($matches[1] as $raw) {
                    $phone = $this->normalizeMaybe($raw);
                    if ($phone === null) {
                        continue;
                    }
                    $found[$phone] ??= [];
                    // Each slot is listed once per phone, however often it repeats.
                    if (! in_array($slot, $found[$phone], true)) {
                        $found[$phone][] = $slot;
                    }
                }
            }
        }

        return $found;
    }

    /**
     * Counts how many times each +7/8 phone is written out in the markup.
     *
     * @return array<string,int>
     */
    private function countBodyPhones(string $html): array
    {
        if (! preg_match_all(self::BODY_PATTERN, $html, $matches)) {
            return [];
        }

        $counts = [];
        foreach ($matches[0] as $raw) {
            $phone = $this->normalizeMaybe($raw);
            if ($phone !== null) {
                $counts[$phone] = ($counts[$phone] ?? 0) + 1;
            }
        }

        return $counts;
    }

    /**
     * Returns the digits of every e-mail local part that contains at least seven of them.
     *
     * @return list<string>
     */
    private function collectEmailDigits(string $html): array
    {
        if (! preg_match_all(self::EMAIL_PATTERN, $html, $matches)) {
            return [];
        }

        $digitRuns = [];
        foreach ($matches[1] as $localPart) {
            $digits = preg_replace('/\D+/', '', $localPart) ?? '';
            if (strlen($digits) >= 7) {
                $digitRuns[] = $digits;
            }
        }

        return $digitRuns;
    }

    /**
     * Normalises a raw candidate to 7XXXXXXXXXX.
     *
     * Accepts 11 digits starting with 7 or 8 (the leading digit is replaced by 7)
     * or a bare 10-digit number (7 is prepended). Anything else yields null.
     */
    private function normalizeMaybe(string $raw): ?string
    {
        $digits = preg_replace('/\D+/', '', $raw) ?? '';
        $length = strlen($digits);

        if ($length === 10) {
            return '7'.$digits;
        }
        if ($length === 11 && ($digits[0] === '7' || $digits[0] === '8')) {
            return '7'.substr($digits, 1);
        }

        return null;
    }
}
@@ -0,0 +1,26 @@
<?php
use App\Services\Autopodbor\Agent\Extract\HtmlPhoneScanner;
// tel: hrefs written as "+7 (xxx) ..." and as "8xxxxxxxxxx" must both show up
// in `code` under their normalised 7XXXXXXXXXX key, tagged with the 'tel' slot.
it('берёт номера из tel:-ссылок', function () {
    $markup = '<a href="tel:+7 (843) 203-25-33">звонок</a><a href="tel:88432452533">2</a>';

    $codePhones = (new HtmlPhoneScanner)->scan($markup)['code'];

    expect($codePhones)->toHaveKey('78432032533');
    expect($codePhones['78432032533'])->toContain('tel');
    expect($codePhones)->toHaveKey('78432452533');
});
// A JSON-LD "telephone" field is tagged 'schema'; an itemprop="telephone"
// element with a content attribute is tagged 'microdata'.
it('берёт номера из schema.org и microdata', function () {
    $jsonLd = '<script type="application/ld+json">{"telephone":"+7(843)203-25-33"}</script>';
    $microdata = '<span itemprop="telephone" content="+78432452533">x</span>';

    $codePhones = (new HtmlPhoneScanner)->scan($jsonLd.$microdata)['code'];

    expect($codePhones['78432032533'])->toContain('schema');
    expect($codePhones['78432452533'])->toContain('microdata');
});
// The same number written twice in the text is counted twice in `body`;
// a digit-only e-mail local part is reported in `emails`.
it('считает вхождения в тело и берёт e-mail-цифры', function () {
    $markup = 'тел 8(843)203-25-33, ещё 8(843)203-25-33. почта 2032533@mail.ru';

    $result = (new HtmlPhoneScanner)->scan($markup);

    expect($result['body']['78432032533'])->toBe(2);
    expect($result['emails'])->toContain('2032533');
});