diff --git a/.htaccess b/.htaccess index 27637c2..3ba960f 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 528ba08..b675b9b 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index c7c3054..9cfb5c6 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -23,6 +23,7 @@ Diffbot DuckAssistBot EchoboxBot FacebookBot +facebookexternalhit Factset_spyderbot FirecrawlAgent FriendlyCrawler diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index c1afb05..a53333e 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 0edf11f..9d69a3a 100644 --- a/robots.txt +++ b/robots.txt @@ -23,6 +23,7 @@ User-agent: Diffbot User-agent: DuckAssistBot User-agent: EchoboxBot User-agent: FacebookBot +User-agent: facebookexternalhit User-agent: Factset_spyderbot User-agent: FirecrawlAgent User-agent: FriendlyCrawler diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 78a3408..a5ab4c7 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -25,6 +25,7 @@ | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | | EchoboxBot | [Echobox](https://echobox.com) | Unclear at this time. | Data collection to support AI-powered products. | Unclear at this time. | Supports company's AI-powered social and email management products. | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| facebookexternalhit | Meta/Facebook | [No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) | Ostensibly only for sharing, but likely used as an AI crawler as well | Unclear at this time. | Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is "to crawl the content of an app or website that was shared on one of Meta’s family of apps…". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary. | | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. | | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. |