158 lines
4.7 KiB
PHP
158 lines
4.7 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace App\Services\Traffic;
|
|
|
|
use Illuminate\Http\Request;
|
|
|
|
final class BotClassifier
{
    /**
     * User agents shorter than this many bytes are treated as suspicious.
     */
    private const MIN_USER_AGENT_LENGTH = 8;

    /**
     * Generic substrings that mark an otherwise unrecognised user agent as a crawler.
     *
     * @var array<int, string>
     */
    private const GENERIC_BOT_KEYWORDS = ['bot', 'crawler', 'spider', 'crawl', 'preview'];

    /**
     * Known bot signatures: bot type => family label => lowercase substrings.
     *
     * Order is significant at both levels: types are tested top to bottom and,
     * within a type, the first family with a matching substring wins.
     *
     * @var array<string, array<string, array<int, string>>>
     */
    private const SIGNATURES = [
        'suspicious_bot' => [
            'curl' => ['curl'],
            'wget' => ['wget'],
            'python-requests' => ['python-requests'],
            'libwww-perl' => ['libwww-perl'],
            'Go-http-client' => ['go-http-client'],
            'Java' => ['java/'],
            'scrapy' => ['scrapy'],
            'httpclient' => ['httpclient'],
            'masscan' => ['masscan'],
            'nikto' => ['nikto'],
            'sqlmap' => ['sqlmap'],
        ],
        'search_bot' => [
            'Googlebot' => ['googlebot'],
            'Bingbot' => ['bingbot'],
            'DuckDuckBot' => ['duckduckbot'],
            'YandexBot' => ['yandexbot'],
            'Baiduspider' => ['baiduspider'],
            'Applebot' => ['applebot'],
            'Slurp' => ['slurp'],
        ],
        'ai_bot' => [
            'GPTBot' => ['gptbot'],
            'ChatGPT-User' => ['chatgpt-user'],
            'OAI-SearchBot' => ['oai-searchbot'],
            'ClaudeBot' => ['claudebot'],
            'PerplexityBot' => ['perplexitybot'],
            'Bytespider' => ['bytespider'],
            'CCBot' => ['ccbot'],
            'Google-Extended' => ['google-extended'],
            'anthropic-ai' => ['anthropic-ai'],
            'cohere-ai' => ['cohere-ai'],
        ],
        'seo_bot' => [
            'AhrefsBot' => ['ahrefsbot'],
            'SemrushBot' => ['semrushbot'],
            'MJ12bot' => ['mj12bot'],
            'DotBot' => ['dotbot'],
            'PetalBot' => ['petalbot'],
            'DataForSeoBot' => ['dataforseobot'],
            'BLEXBot' => ['blexbot'],
            'MauiBot' => ['mauibot'],
            'serpstatbot' => ['serpstatbot'],
        ],
        'social_bot' => [
            'facebookexternalhit' => ['facebookexternalhit'],
            // Telegram's link-preview fetcher identifies itself as
            // "TelegramBot (like TwitterBot)", so it has to be tested before
            // Twitterbot or it would be reported as the wrong family.
            'TelegramBot' => ['telegrambot'],
            'Twitterbot' => ['twitterbot'],
            'LinkedInBot' => ['linkedinbot'],
            'Slackbot' => ['slackbot'],
            'Discordbot' => ['discordbot'],
            'WhatsApp' => ['whatsapp'],
            'Pinterestbot' => ['pinterestbot'],
        ],
        'monitoring_bot' => [
            'UptimeRobot' => ['uptimerobot'],
            'Pingdom' => ['pingdom'],
            'StatusCake' => ['statuscake'],
            'Better Stack' => ['better stack', 'betterstack'],
            'BetterUptime' => ['betteruptime'],
        ],
    ];

    /**
     * Classify the client behind a request from its User-Agent header.
     *
     * @return array{is_bot: bool, type: ?string, family: ?string}
     */
    public function classify(Request $request): array
    {
        return $this->classifyUserAgent((string) $request->userAgent());
    }

    /**
     * Classify a raw User-Agent string.
     *
     * Checks run in this order: empty UA, known signatures (see SIGNATURES),
     * too-short UA, generic crawler keywords. Anything that passes all of
     * them is reported as not a bot.
     *
     * @return array{is_bot: bool, type: ?string, family: ?string}
     */
    private function classifyUserAgent(string $userAgent): array
    {
        $userAgent = trim($userAgent);

        if ($userAgent === '') {
            return $this->bot('suspicious_bot', 'Empty UA');
        }

        // Signature keywords are stored lowercase, so match case-insensitively.
        $normalized = strtolower($userAgent);

        foreach (self::SIGNATURES as $type => $families) {
            $family = $this->matchFamily($normalized, $families);

            if ($family !== null) {
                return $this->bot($type, $family);
            }
        }

        // The length check comes after the signature lookup so that short but
        // recognised agents (e.g. "curl", "Wget") keep their specific family.
        if (strlen($userAgent) < self::MIN_USER_AGENT_LENGTH) {
            return $this->bot('suspicious_bot', 'Short UA');
        }

        if ($this->containsAny($normalized, self::GENERIC_BOT_KEYWORDS)) {
            return $this->bot('unknown_bot', 'Unknown crawler');
        }

        return [
            'is_bot' => false,
            'type' => null,
            'family' => null,
        ];
    }

    /**
     * Return the label of the first family with a keyword found in the user agent.
     *
     * @param array<string, array<int, string>> $families family label => lowercase keywords
     *
     * @return ?string the matching family label, or null when none matches
     */
    private function matchFamily(string $normalizedUserAgent, array $families): ?string
    {
        foreach ($families as $family => $keywords) {
            if ($this->containsAny($normalizedUserAgent, $keywords)) {
                // PHP turns numeric-looking array keys into ints; cast so the
                // declared ?string return type holds under strict_types.
                return (string) $family;
            }
        }

        return null;
    }

    /**
     * Tell whether the haystack contains at least one of the keywords.
     *
     * Empty keywords are skipped, because str_contains() reports a match for
     * an empty needle.
     *
     * @param array<int, string> $keywords
     */
    private function containsAny(string $haystack, array $keywords): bool
    {
        foreach ($keywords as $keyword) {
            if ($keyword !== '' && str_contains($haystack, $keyword)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Build the result array for a positive bot detection.
     *
     * @return array{is_bot: bool, type: string, family: string}
     */
    private function bot(string $type, string $family): array
    {
        return [
            'is_bot' => true,
            'type' => $type,
            'family' => $family,
        ];
    }
}