feat(автоподбор): HtmlPhoneScanner — номера из кода (tel/schema/microdata/тело/email)
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,75 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
namespace App\Services\Autopodbor\Agent\Extract;
|
||||
|
||||
/**
 * Pulls Russian phone numbers out of raw HTML.
 *
 * Three sources are inspected independently:
 *  - "code": machine-readable markup (tel: links, JSON "telephone" properties,
 *    itemprop="telephone" attributes);
 *  - "body": phone-shaped text anywhere in the document (markup included);
 *  - "emails": digit runs found in the local part of e-mail addresses.
 *
 * Phone numbers are normalised to 11 digits with a leading "7".
 */
final class HtmlPhoneScanner
{
    /**
     * Regexes whose first capture group holds a raw phone number, grouped by
     * the slot name reported for a hit. Slots are scanned in declaration order.
     */
    private const CODE_PATTERNS = [
        'tel' => [
            '/tel:([+0-9()\s-]{7,})/i',
        ],
        'schema' => [
            '/"telephone"\s*:\s*"([^"]+)"/i',
        ],
        'microdata' => [
            '/itemprop=["\']telephone["\'][^>]*content=["\']([^"\']+)/i',
            // HTML attribute order is arbitrary: also accept content="…" placed before itemprop.
            '/content=["\']([^"\']+)["\'][^>]*itemprop=["\']telephone["\']/i',
        ],
    ];

    /**
     * "+7" or "8" followed by 3-3-2-2 digits with optional separators.
     * The lookarounds reject candidates embedded in a longer digit run
     * (order ids, timestamps, …), which would otherwise be counted as phones.
     */
    private const BODY_PATTERN = '/(?<!\d)(?:\+7|8)[\s(\-]*\d{3}[\s)\-]*\d{3}[\s\-]*\d{2}[\s\-]*\d{2}(?!\d)/';

    /** E-mail address; group 1 is the local part (before "@"). */
    private const EMAIL_PATTERN = '/([a-z0-9._%+-]+)@[a-z0-9.-]+\.[a-z]{2,}/i';

    /** Minimum number of digits in an e-mail local part for it to be reported. */
    private const MIN_EMAIL_DIGITS = 7;

    /**
     * Scans the HTML and reports every phone candidate per source.
     *
     * - `code`: normalised number => distinct slots ("tel", "schema", "microdata")
     *   in which it was found, in scan order.
     * - `body`: normalised number => how many times the body pattern matched it.
     * - `emails`: digits of each e-mail local part that has at least seven
     *   digits, in document order; duplicates are kept.
     *
     * NOTE: the normalised numbers are all-digit strings, and PHP casts such
     * array keys to int when they fit into a native integer (always the case on
     * 64-bit builds). Cast keys back with `(string)` before handing them to
     * string-typed code under strict_types.
     *
     * @return array{code: array<int|string, list<string>>, body: array<int|string, int>, emails: list<string>}
     */
    public function scan(string $html): array
    {
        $code = [];
        foreach (self::CODE_PATTERNS as $slot => $patterns) {
            foreach ($patterns as $pattern) {
                foreach ($this->captures($pattern, $html, 1) as $raw) {
                    $phone = $this->normalizeMaybe($raw);
                    if ($phone === null) {
                        continue;
                    }
                    $code[$phone] ??= [];
                    if (! in_array($slot, $code[$phone], true)) {
                        $code[$phone][] = $slot;
                    }
                }
            }
        }

        $body = [];
        foreach ($this->captures(self::BODY_PATTERN, $html, 0) as $raw) {
            $phone = $this->normalizeMaybe($raw);
            if ($phone !== null) {
                $body[$phone] = ($body[$phone] ?? 0) + 1;
            }
        }

        $emails = [];
        foreach ($this->captures(self::EMAIL_PATTERN, $html, 1) as $local) {
            $digits = preg_replace('/\D+/', '', $local) ?? '';
            if (strlen($digits) >= self::MIN_EMAIL_DIGITS) {
                $emails[] = $digits;
            }
        }

        return ['code' => $code, 'body' => $body, 'emails' => $emails];
    }

    /**
     * Returns the given capture group of every match of $pattern in $subject,
     * or an empty list when nothing matched or the regex engine failed.
     *
     * @return list<string>
     */
    private function captures(string $pattern, string $subject, int $group): array
    {
        if (! preg_match_all($pattern, $subject, $matches)) {
            return [];
        }

        return $matches[$group];
    }

    /**
     * Normalises a raw phone string to 11 digits starting with "7".
     *
     * Accepts 11 digits starting with "7" or "8" (the leading digit is replaced
     * by "7") and bare 10-digit numbers (a "7" is prepended). All non-digit
     * characters are ignored. Anything else yields null.
     */
    private function normalizeMaybe(string $raw): ?string
    {
        $digits = preg_replace('/\D+/', '', $raw) ?? '';
        $length = strlen($digits);

        if ($length === 11 && ($digits[0] === '8' || $digits[0] === '7')) {
            return '7'.substr($digits, 1);
        }

        if ($length === 10) {
            return '7'.$digits;
        }

        return null;
    }
}
|
||||
@@ -0,0 +1,26 @@
|
||||
<?php
|
||||
|
||||
use App\Services\Autopodbor\Agent\Extract\HtmlPhoneScanner;
|
||||
|
||||
// tel: hrefs must yield normalised numbers tagged with the "tel" slot,
// for both the "+7 (…)" and the bare "8…" spelling.
it('берёт номера из tel:-ссылок', function () {
    $markup = '<a href="tel:+7 (843) 203-25-33">звонок</a><a href="tel:88432452533">2</a>';

    $found = (new HtmlPhoneScanner)->scan($markup)['code'];

    expect($found)->toHaveKey('78432032533');
    expect($found['78432032533'])->toContain('tel');
    expect($found)->toHaveKey('78432452533');
});
|
||||
|
||||
// A JSON-LD "telephone" property is reported under "schema", an
// itemprop="telephone" content attribute under "microdata".
it('берёт номера из schema.org и microdata', function () {
    $jsonLd = '<script type="application/ld+json">{"telephone":"+7(843)203-25-33"}</script>';
    $microdata = '<span itemprop="telephone" content="+78432452533">x</span>';

    $found = (new HtmlPhoneScanner)->scan($jsonLd.$microdata)['code'];

    expect($found['78432032533'])->toContain('schema');
    expect($found['78432452533'])->toContain('microdata');
});
|
||||
|
||||
// Repeated numbers in plain text are counted per occurrence, and a
// seven-digit e-mail local part is reported as a digit string.
it('считает вхождения в тело и берёт e-mail-цифры', function () {
    $text = 'тел 8(843)203-25-33, ещё 8(843)203-25-33. почта 2032533@mail.ru';

    $result = (new HtmlPhoneScanner)->scan($text);

    expect($result['body']['78432032533'])->toBe(2);
    expect($result['emails'])->toContain('2032533');
});
|
||||
Reference in New Issue
Block a user