From ae8f74c10cec97ec758bf39345ff20717302c665 Mon Sep 17 00:00:00 2001 From: Kyle Buckingham Date: Tue, 1 Apr 2025 15:22:04 -0700 Subject: [PATCH 01/53] Update robots.json --- robots.json | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/robots.json b/robots.json index e907c8b..f711d43 100644 --- a/robots.json +++ b/robots.json @@ -69,13 +69,6 @@ "frequency": "Only when prompted by a user.", "description": "Used by plugins in ChatGPT to answer queries based on user input." }, - "Claude-Web": { - "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "Unclear at this time.", - "function": "Scrapes data to train Anthropic's AI products.", - "frequency": "No information provided.", - "description": "Scrapes data to train LLMs and AI products offered by Anthropic." - }, "ClaudeBot": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", @@ -83,6 +76,20 @@ "frequency": "No information provided.", "description": "Scrapes data to train LLMs and AI products offered by Anthropic." }, + "Claude-User": { + "operator": "[Anthropic](https://www.anthropic.com)", + "respect": "Unclear at this time.", + "function": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent.", + "frequency": "No information provided.", + "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent." + }, + "Claude-SearchBot": { + "operator": "[Anthropic](https://www.anthropic.com)", + "respect": "Unclear at this time.", + "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.", + "frequency": "No information provided.", + "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses." + }, "cohere-ai": { "operator": "[Cohere](https://cohere.com)", "respect": "Unclear at this time.", @@ -328,4 +335,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 8dc36aa2e2bbc9b99bde043b593cdc6c9669f401 Mon Sep 17 00:00:00 2001 From: Kyle Buckingham Date: Tue, 1 Apr 2025 15:23:28 -0700 Subject: [PATCH 02/53] Update robots.txt --- robots.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/robots.txt b/robots.txt index 8c79fc2..e19468d 100644 --- a/robots.txt +++ b/robots.txt @@ -8,8 +8,9 @@ User-agent: Brightbot 1.0 User-agent: Bytespider User-agent: CCBot User-agent: ChatGPT-User -User-agent: Claude-Web User-agent: ClaudeBot +User-agent: Claude-User +User-agent: Claude-SearchBot User-agent: cohere-ai User-agent: cohere-training-data-crawler User-agent: Crawlspace From 4a6f37d72718aeb44d1d8cbcccb740ace3fe82d6 Mon Sep 17 00:00:00 2001 From: Kyle Buckingham Date: Wed, 16 Apr 2025 16:42:58 -0700 Subject: [PATCH 03/53] Update robots.json Co-authored-by: Glyn Normington --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index f711d43..ba052ae 100644 --- a/robots.json +++ b/robots.json @@ -78,7 +78,7 @@ }, "Claude-User": { "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "Unclear at this time.", + "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", "function": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent.", "frequency": "No information provided.", "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent." From fd41de8522536a25de71f37310a05e77d71a0792 Mon Sep 17 00:00:00 2001 From: Kyle Buckingham Date: Wed, 16 Apr 2025 16:43:03 -0700 Subject: [PATCH 04/53] Update robots.json Co-authored-by: Glyn Normington --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index ba052ae..ca6fc40 100644 --- a/robots.json +++ b/robots.json @@ -85,7 +85,7 @@ }, "Claude-SearchBot": { "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "Unclear at this time.", + "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.", "frequency": "No information provided.", "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses." From 16d1de70943e9d448d3d5e02e91d86e38dac80d7 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 16 May 2025 00:59:08 +0000 Subject: [PATCH 05/53] Update from Dark Visitors --- robots.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/robots.json b/robots.json index f518037..f2ed4c2 100644 --- a/robots.json +++ b/robots.json @@ -336,11 +336,11 @@ "respect": "Yes" }, "QualifiedBot": { - "description": "Operated by Qualified as part of their suite of AI product offerings.", - "frequency": "No explicit frequency provided.", - "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", - "operator": "[Qualified](https://www.qualified.com)", - "respect": "Unclear at this time." + "description": "Operated by Qualified as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", + "operator": "[Qualified](https://www.qualified.com)", + "respect": "Unclear at this time." }, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", @@ -405,4 +405,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 5fba0b746d550b6ae4d7c9605904b6ec102d0f98 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 15 May 2025 15:40:46 -0700 Subject: [PATCH 06/53] chore(robots.json): adds MistralAI-User/1.0 crawler --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index f518037..647e664 100644 --- a/robots.json +++ b/robots.json @@ -272,6 +272,13 @@ "frequency": "Unclear at this time.", "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" }, + "MistralAI-User/1.0": { + "operator": "Mistral AI", + "function": "Takes action based on user prompts.", + "frequency": "Only when prompted by a user.", + "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.", + "respect": "Yes" + }, "NovaAct": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", From ca918a963f735019a0c66343bf8338a9228d94f5 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 15 May 2025 21:16:49 -0700 Subject: [PATCH 07/53] chore(robots.json): adds Google-CloudVertexBot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index f2ed4c2..b459533 100644 --- a/robots.json +++ b/robots.json @@ -160,6 +160,13 @@ "operator": "Unknown", "respect": "[Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler)" }, + "Google-CloudVertexBot": { + "operator": "Google", + "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)", + "function": "Build and manage AI models for businesses employing Vertex AI", + "frequency": "No information.", + "description": "Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents." + }, "Google-Extended": { "operator": "Google", "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)", @@ -405,4 +412,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From dd1ed174b77ca2c0c4a40d6f4bce6beda4a1c296 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 16 May 2025 11:35:15 +0000 Subject: [PATCH 08/53] Merge pull request #129 from ai-robots-txt/google-cloudvertexbot chore(robots.json): adds Google-CloudVertexBot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index de88e50..b2204d7 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 43ad3bf..36fd20c 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 9770b45..7389f10 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -21,6 +21,7 @@ FacebookBot Factset_spyderbot FirecrawlAgent FriendlyCrawler +Google-CloudVertexBot Google-Extended GoogleOther GoogleOther-Image diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index afbd77c..f05f785 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index dea4dd5..a5be10b 100644 --- a/robots.txt +++ b/robots.txt @@ -21,6 +21,7 @@ User-agent: FacebookBot User-agent: Factset_spyderbot User-agent: FirecrawlAgent User-agent: FriendlyCrawler +User-agent: Google-CloudVertexBot User-agent: Google-Extended User-agent: GoogleOther User-agent: GoogleOther-Image diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 57469fa..d8542b3 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -23,6 +23,7 @@ | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. | | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | +| Google\-CloudVertexBot | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Build and manage AI models for businesses employing Vertex AI | No information. | Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents. | | Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | From 7a2e6cba52e782ab32552c2335637af761afbe51 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sat, 17 May 2025 00:57:28 +0000 Subject: [PATCH 09/53] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index b459533..1ecfcd8 100644 --- a/robots.json +++ b/robots.json @@ -412,4 +412,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 9297c7dfa3122109a6f3ae3ce18026e0e6c94ebe Mon Sep 17 00:00:00 2001 From: Mihitoko Date: Mon, 19 May 2025 23:56:57 +0200 Subject: [PATCH 10/53] Mention X-Robots-Tag header as alternative for bing --- docs/additional-steps/bing.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/additional-steps/bing.md b/docs/additional-steps/bing.md index 37c60c7..f9afb78 100644 --- a/docs/additional-steps/bing.md +++ b/docs/additional-steps/bing.md @@ -10,15 +10,19 @@ Fortunately, Bing supports a relatively simple opt-out method, requiring an addi ## How to opt-out of AI training -You must add a metatag in the `` of your webpage. This also needs to be added to every page on your website. +You must add a metatag in the `` of your webpage or set the [X-Robots-Tag](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/X-Robots-Tag) HTTP header in your response. This also needs to be added to every page or response on your website. -The line you need to add is: +If using the metatag, the line you need to add is: ```plaintext ``` +Or include the HTTP response header: +```plaintext +X-Robots-Tag: noarchive +``` -By adding this line, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models." +By adding this line or header, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models." ## Will my site be negatively affected From 8a8001cbece8a5607163bed6eb83d5bb35bb24e5 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Tue, 20 May 2025 13:55:25 -0700 Subject: [PATCH 11/53] chore(README): updates the opening line of our README to clarify the types of agents we block --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 232b3ed..307f005 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ -This is an open list of web crawlers associated with AI companies and the training of LLMs to block. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md). +This list contains AI-related crawlers of all types, regardless of purpose. Users should consult [the table of bot metrics](./table-of-bot-metrics.md) to guide the implementation of their list. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md). A number of these crawlers have been sourced from [Dark Visitors](https://darkvisitors.com) and we appreciate the ongoing effort they put in to track these crawlers. From 8b151b2cdc6ef8e949fd59d26d7456bcbb60d4e7 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 21 May 2025 06:52:36 -0700 Subject: [PATCH 12/53] Update README.md Co-authored-by: Glyn Normington --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 307f005..f427af4 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ -This list contains AI-related crawlers of all types, regardless of purpose. Users should consult [the table of bot metrics](./table-of-bot-metrics.md) to guide the implementation of their list. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md). +This list contains AI-related crawlers of all types, regardless of purpose. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md). A number of these crawlers have been sourced from [Dark Visitors](https://darkvisitors.com) and we appreciate the ongoing effort they put in to track these crawlers. From 1c2acd75b7def13d9dc85233bfb4aaca8bcafd12 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 21 May 2025 15:27:26 +0000 Subject: [PATCH 13/53] Merge pull request #126 from ai-robots-txt/mistral-bot chore(robots.json): adds MistralAI-User/1.0 crawler --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index b2204d7..cc483c7 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 36fd20c..205acbd 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 7389f10..de5b4fb 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -38,6 +38,7 @@ meta-externalagent Meta-ExternalAgent meta-externalfetcher Meta-ExternalFetcher +MistralAI-User/1.0 NovaAct OAI-SearchBot omgili diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index f05f785..3274559 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index a5be10b..b3e16f8 100644 --- a/robots.txt +++ b/robots.txt @@ -38,6 +38,7 @@ User-agent: meta-externalagent User-agent: Meta-ExternalAgent User-agent: meta-externalfetcher User-agent: Meta-ExternalFetcher +User-agent: MistralAI-User/1.0 User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index d8542b3..84c69f5 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -40,6 +40,7 @@ | Meta\-ExternalAgent | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent | | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| MistralAI\-User/1\.0 | Mistral AI | Yes | Takes action based on user prompts. | Only when prompted by a user. | MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response. | | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact | | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | From b1d9a60a38c04cac81dd156ad73ddef7eb60b50b Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 21 May 2025 11:40:33 -0700 Subject: [PATCH 14/53] chore(robots.json): adds wpbot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index bddefdd..ed7d63d 100644 --- a/robots.json +++ b/robots.json @@ -412,6 +412,13 @@ "frequency": "Unclear at this time.", "description": "Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended" }, + "wpbot": { + "operator": "[QuantumCloud](https://www.quantumcloud.com)", + "respect": "Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9)", + "function": "Live chat support and lead generation.", + "frequency": "Unclear at this time.", + "description": "wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support." + }, "YouBot": { "operator": "[You](https://about.you.com/youchat/)", "respect": "[Yes](https://about.you.com/youbot/)", @@ -419,4 +426,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 7c5389f4a0c5f60745e1a7552e142bd33a587d8f Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 21 May 2025 19:00:23 +0000 Subject: [PATCH 15/53] Merge pull request #98 from kylebuckingham/main Updating Claude Bots --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 3 ++- nginx-block-ai-bots.conf | 2 +- table-of-bot-metrics.md | 3 ++- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.htaccess b/.htaccess index cc483c7..2146722 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 205acbd..879426d 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index de5b4fb..8a9ccf9 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -9,8 +9,9 @@ Brightbot 1.0 Bytespider CCBot ChatGPT-User -Claude-Web ClaudeBot +Claude-User +Claude-SearchBot cohere-ai cohere-training-data-crawler Cotoyogi diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 3274559..5f96718 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 84c69f5..e6f35a4 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -11,8 +11,9 @@ | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | | ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude\-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | +| Claude\-SearchBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | No information provided. | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | | Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. | From fedb658cc08225d71a6b0f32c9c2859b7420f0ee Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 21 May 2025 21:06:05 +0000 Subject: [PATCH 16/53] Merge pull request #133 from ai-robots-txt/wpbot chore(robots.json): adds wpbot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 2146722..3337284 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 879426d..2001edc 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 8a9ccf9..377710b 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -58,4 +58,5 @@ TikTokSpider Timpibot VelenPublicWebCrawler Webzio-Extended +wpbot YouBot \ No newline at end of file diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 5f96718..ba1f8c6 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 8690e50..92e527b 100644 --- a/robots.txt +++ b/robots.txt @@ -58,5 +58,6 @@ User-agent: TikTokSpider User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended +User-agent: wpbot User-agent: YouBot Disallow: / diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index e6f35a4..c795559 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -60,4 +60,5 @@ | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| wpbot | [QuantumCloud](https://www.quantumcloud.com) | Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9) | Live chat support and lead generation. | Unclear at this time. | wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support. | | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | From 7bf7f9164d55a58e0d6080dff52eea3ed5d3584e Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Thu, 22 May 2025 00:58:45 +0000 Subject: [PATCH 17/53] Update from Dark Visitors --- robots.json | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/robots.json b/robots.json index 9dce781..06187ae 100644 --- a/robots.json +++ b/robots.json @@ -76,12 +76,12 @@ "frequency": "Only when prompted by a user.", "description": "Used by plugins in ChatGPT to answer queries based on user input." }, - "ClaudeBot": { + "Claude-SearchBot": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", - "function": "Scrapes data to train Anthropic's AI products.", + "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.", "frequency": "No information provided.", - "description": "Scrapes data to train LLMs and AI products offered by Anthropic." + "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses." }, "Claude-User": { "operator": "[Anthropic](https://www.anthropic.com)", @@ -90,12 +90,19 @@ "frequency": "No information provided.", "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent." }, - "Claude-SearchBot": { + "Claude-Web": { + "operator": "Anthropic", + "respect": "Unclear at this time.", + "function": "Undocumented AI Agents", + "frequency": "Unclear at this time.", + "description": "Claude-Web is an AI-related agent operated by Anthropic. It's currently unclear exactly what it's used for, since there's no official documentation. If you can provide more detail, please contact us. More info can be found at https://darkvisitors.com/agents/agents/claude-web" + }, + "ClaudeBot": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", - "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.", + "function": "Scrapes data to train Anthropic's AI products.", "frequency": "No information provided.", - "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses." + "description": "Scrapes data to train LLMs and AI products offered by Anthropic." }, "cohere-ai": { "operator": "[Cohere](https://cohere.com)", @@ -287,11 +294,11 @@ "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" }, "MistralAI-User/1.0": { - "operator": "Mistral AI", - "function": "Takes action based on user prompts.", - "frequency": "Only when prompted by a user.", - "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.", - "respect": "Yes" + "operator": "Mistral AI", + "function": "Takes action based on user prompts.", + "frequency": "Only when prompted by a user.", + "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.", + "respect": "Yes" }, "NovaAct": { "operator": "Unclear at this time.", @@ -433,4 +440,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 093ab81d789528bb5d89c2d2c708b8f157e3b795 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 23 May 2025 00:58:57 +0000 Subject: [PATCH 18/53] Update from Dark Visitors --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 5 +++-- nginx-block-ai-bots.conf | 2 +- robots.txt | 5 +++-- table-of-bot-metrics.md | 5 +++-- 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.htaccess b/.htaccess index 3337284..26c6e72 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 2001edc..7a1076c 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 377710b..8ef373b 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -9,9 +9,10 @@ Brightbot 1.0 Bytespider CCBot ChatGPT-User -ClaudeBot -Claude-User Claude-SearchBot +Claude-User +Claude-Web +ClaudeBot cohere-ai cohere-training-data-crawler Cotoyogi diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index ba1f8c6..a691c55 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 92e527b..3330b20 100644 --- a/robots.txt +++ b/robots.txt @@ -9,9 +9,10 @@ User-agent: Brightbot 1.0 User-agent: Bytespider User-agent: CCBot User-agent: ChatGPT-User -User-agent: ClaudeBot -User-agent: Claude-User User-agent: Claude-SearchBot +User-agent: Claude-User +User-agent: Claude-Web +User-agent: ClaudeBot User-agent: cohere-ai User-agent: cohere-training-data-crawler User-agent: Cotoyogi diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index c795559..0e6a88e 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -11,9 +11,10 @@ | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | | ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | | Claude\-SearchBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | No information provided. | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | +| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | +| Claude\-Web | Anthropic | Unclear at this time. | Undocumented AI Agents | Unclear at this time. | Claude-Web is an AI-related agent operated by Anthropic. It's currently unclear exactly what it's used for, since there's no official documentation. If you can provide more detail, please contact us. More info can be found at https://darkvisitors.com/agents/agents/claude-web | +| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | | Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. | From 3e8edd083e32e01bd5cf0629f108815092d5f7ec Mon Sep 17 00:00:00 2001 From: imp <80153024+not-not-the-imp@users.noreply.github.com> Date: Fri, 23 May 2025 13:03:49 +0100 Subject: [PATCH 19/53] Add AndiBot and PhindBot Fixes #75 --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 06187ae..8a8432b 100644 --- a/robots.json +++ b/robots.json @@ -20,6 +20,13 @@ "frequency": "No information provided.", "description": "Scrapes data for AI systems." }, + "Andibot": { + "operator": "[Andi](https://andisearch.com/)", + "respect": "Unclear at this time", + "function": "search engine using generative AI, AI Search Assistant", + "frequency": "No information provided.", + "description": "Scrapes website and provide genAI summary ." + }, "Amazonbot": { "operator": "Amazon", "respect": "Yes", @@ -363,6 +370,13 @@ "operator": "[Huawei](https://huawei.com/)", "respect": "Yes" }, + "PhindBot": { + "description": "Company offers AI agent that use genAI and generate extra web query on the fly", + "frequency": "No explicit frequency provided.", + "function": "AI-enhanced search engine.", + "operator": "[phind](https://www.phind.com/)", + "respect": "Unclear at this time." + }, "QualifiedBot": { "description": "Operated by Qualified as part of their suite of AI product offerings.", "frequency": "No explicit frequency provided.", @@ -440,4 +454,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From d22b9ec51ac14b0d9dfdd86a04cd566731c0e8c4 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sat, 31 May 2025 16:00:13 -0700 Subject: [PATCH 20/53] Update robots.json --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 8a8432b..9bf9fdd 100644 --- a/robots.json +++ b/robots.json @@ -23,7 +23,7 @@ "Andibot": { "operator": "[Andi](https://andisearch.com/)", "respect": "Unclear at this time", - "function": "search engine using generative AI, AI Search Assistant", + "function": "Search engine using generative AI, AI Search Assistant", "frequency": "No information provided.", "description": "Scrapes website and provide genAI summary ." }, From 4259b25cccbb4d0bff740261f1eb825fa86bf381 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sat, 31 May 2025 16:01:09 -0700 Subject: [PATCH 21/53] Update robots.json --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 9bf9fdd..50c2fae 100644 --- a/robots.json +++ b/robots.json @@ -25,7 +25,7 @@ "respect": "Unclear at this time", "function": "Search engine using generative AI, AI Search Assistant", "frequency": "No information provided.", - "description": "Scrapes website and provide genAI summary ." + "description": "Scrapes website and provides AI summary." }, "Amazonbot": { "operator": "Amazon", From 268922f8f2fa05b4eb4cc991116fce2700c09184 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sat, 31 May 2025 16:02:05 -0700 Subject: [PATCH 22/53] Update robots.json --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 50c2fae..6f3a05d 100644 --- a/robots.json +++ b/robots.json @@ -371,7 +371,7 @@ "respect": "Yes" }, "PhindBot": { - "description": "Company offers AI agent that use genAI and generate extra web query on the fly", + "description": "Company offers an AI agent that uses AI and generate extra web query on the fly", "frequency": "No explicit frequency provided.", "function": "AI-enhanced search engine.", "operator": "[phind](https://www.phind.com/)", From 1dd66b696963dd3f9a577aa4ca59d83c8bc41aef Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 2 Jun 2025 11:53:06 -0700 Subject: [PATCH 23/53] Revert "chore(robots.json): adds imgproxy crawler" This reverts commit b65f45e408461560a32f44f05860f80655737467. --- robots.json | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/robots.json b/robots.json index 6f3a05d..f730f50 100644 --- a/robots.json +++ b/robots.json @@ -251,13 +251,6 @@ "operator": "[img2dataset](https://github.com/rom1504/img2dataset)", "respect": "Unclear at this time." }, - "imgproxy": { - "frequency": "No information.", - "function": "Not documented or explained on operator's site.", - "operator": "[imgproxy](https://imgproxy.net)", - "respect": "Unclear at this time.", - "description": "AI-powered image processing." - }, "ISSCyberRiskCrawler": { "description": "Used to train machine learning based models to quantify cyber risk.", "frequency": "No information.", @@ -454,4 +447,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 899ce01c554359ecc66aa36bc2af367069175e10 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 2 Jun 2025 14:47:31 -0700 Subject: [PATCH 24/53] chore(ai_robots_update.yml): correct workflow by revising git flags + adding guard --- .github/workflows/ai_robots_update.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ai_robots_update.yml b/.github/workflows/ai_robots_update.yml index 7e11ce8..17c1cc8 100644 --- a/.github/workflows/ai_robots_update.yml +++ b/.github/workflows/ai_robots_update.yml @@ -20,7 +20,12 @@ jobs: echo "... done." git --no-pager diff git add -A - git diff --quiet && git diff --staged --quiet || (git commit -m "Update from Dark Visitors" && git push) + if ! git diff --cached --quiet; then + git commit -m "Update from Dark Visitors" + git push + else + echo "No changes to commit." + fi shell: bash convert: name: convert From 87016d15040f50630121666c40a6048df8a4169d Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 3 Jun 2025 01:00:29 +0000 Subject: [PATCH 25/53] Update from Dark Visitors --- robots.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/robots.json b/robots.json index f730f50..d8e8262 100644 --- a/robots.json +++ b/robots.json @@ -20,13 +20,6 @@ "frequency": "No information provided.", "description": "Scrapes data for AI systems." }, - "Andibot": { - "operator": "[Andi](https://andisearch.com/)", - "respect": "Unclear at this time", - "function": "Search engine using generative AI, AI Search Assistant", - "frequency": "No information provided.", - "description": "Scrapes website and provides AI summary." - }, "Amazonbot": { "operator": "Amazon", "respect": "Yes", @@ -34,6 +27,13 @@ "frequency": "No information provided.", "description": "Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses." }, + "Andibot": { + "operator": "[Andi](https://andisearch.com/)", + "respect": "Unclear at this time", + "function": "Search engine using generative AI, AI Search Assistant", + "frequency": "No information provided.", + "description": "Scrapes website and provides AI summary." + }, "anthropic-ai": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "Unclear at this time.", From d239e7e5ad0e9c72d17961c57deaf10c1feef899 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 3 Jun 2025 01:52:35 +0000 Subject: [PATCH 26/53] Merge pull request #139 from ai-robots-txt/workflow-fix chore(ai_robots_update.yml): correct workflow by revising git flags + adding guard --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 3 ++- nginx-block-ai-bots.conf | 2 +- robots.txt | 3 ++- table-of-bot-metrics.md | 3 ++- 6 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.htaccess b/.htaccess index 26c6e72..ddb7255 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 7a1076c..60ed4d3 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 8ef373b..6c23da0 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -2,6 +2,7 @@ AI2Bot Ai2Bot-Dolma aiHitBot Amazonbot +Andibot anthropic-ai Applebot Applebot-Extended @@ -33,7 +34,6 @@ iaskspider/2.0 ICC-Crawler ImagesiftBot img2dataset -imgproxy ISSCyberRiskCrawler Kangaroo Bot meta-externalagent @@ -50,6 +50,7 @@ PanguBot Perplexity-User PerplexityBot PetalBot +PhindBot QualifiedBot Scrapy SemrushBot-OCOB diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index a691c55..1c50815 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 3330b20..4b362e4 100644 --- a/robots.txt +++ b/robots.txt @@ -2,6 +2,7 @@ User-agent: AI2Bot User-agent: Ai2Bot-Dolma User-agent: aiHitBot User-agent: Amazonbot +User-agent: Andibot User-agent: anthropic-ai User-agent: Applebot User-agent: Applebot-Extended @@ -33,7 +34,6 @@ User-agent: iaskspider/2.0 User-agent: ICC-Crawler User-agent: ImagesiftBot User-agent: img2dataset -User-agent: imgproxy User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot User-agent: meta-externalagent @@ -50,6 +50,7 @@ User-agent: PanguBot User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot +User-agent: PhindBot User-agent: QualifiedBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 0e6a88e..737b217 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -4,6 +4,7 @@ | Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | | aiHitBot | [aiHit](https://www.aihitdata.com/about) | Yes | A massive, artificial intelligence/machine learning, automated system. | No information provided. | Scrapes data for AI systems. | | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | +| Andibot | [Andi](https://andisearch.com/) | Unclear at this time | Search engine using generative AI, AI Search Assistant | No information provided. | Scrapes website and provides AI summary. | | anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | | Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | @@ -35,7 +36,6 @@ | ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | -| imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. | | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | @@ -52,6 +52,7 @@ | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly | | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 3187fd8a3219620a4b0f3b7b7950f40e5aac4ad1 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Tue, 3 Jun 2025 12:24:16 -0700 Subject: [PATCH 27/53] chore(robots.json): adds YandexAdditional crawlers --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index d8e8262..45340fd 100644 --- a/robots.json +++ b/robots.json @@ -440,6 +440,20 @@ "frequency": "Unclear at this time.", "description": "wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support." }, + "YandexAdditional": { + "operator": "[Yandex](https://yandex.ru)", + "respect": "[Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en)", + "function": "Scrapes/analyzes data for the YandexGPT LLM.", + "frequency": "No information.", + "description": "Retrieves data used for YandexGPT quick answers features." + }, + "YandexAdditionalBot": { + "operator": "[Yandex](https://yandex.ru)", + "respect": "[Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en)", + "function": "Scrapes/analyzes data for the YandexGPT LLM.", + "frequency": "No information.", + "description": "Retrieves data used for YandexGPT quick answers features." + }, "YouBot": { "operator": "[You](https://about.you.com/youchat/)", "respect": "[Yes](https://about.you.com/youbot/)", @@ -447,4 +461,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 080946c360465c5e0b0af8dd475bb0248bca77a1 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 3 Jun 2025 19:51:25 +0000 Subject: [PATCH 28/53] Merge pull request #140 from ai-robots-txt/yandex-bots chore(robots.json): adds YandexAdditional crawlers --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 2 ++ nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 6 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index ddb7255..c381fce 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 60ed4d3..3527a7a 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 6c23da0..a8ba9aa 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -61,4 +61,6 @@ Timpibot VelenPublicWebCrawler Webzio-Extended wpbot +YandexAdditional +YandexAdditionalBot YouBot \ No newline at end of file diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 1c50815..5f7a0db 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 4b362e4..d26ccb4 100644 --- a/robots.txt +++ b/robots.txt @@ -61,5 +61,7 @@ User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended User-agent: wpbot +User-agent: YandexAdditional +User-agent: YandexAdditionalBot User-agent: YouBot Disallow: / diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 737b217..3275a20 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -63,4 +63,6 @@ | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | | wpbot | [QuantumCloud](https://www.quantumcloud.com) | Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9) | Live chat support and lead generation. | Unclear at this time. | wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support. | +| YandexAdditional | [Yandex](https://yandex.ru) | [Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en) | Scrapes/analyzes data for the YandexGPT LLM. | No information. | Retrieves data used for YandexGPT quick answers features. | +| YandexAdditionalBot | [Yandex](https://yandex.ru) | [Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en) | Scrapes/analyzes data for the YandexGPT LLM. | No information. | Retrieves data used for YandexGPT quick answers features. | | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | From 8f75f4a2f5f8f8381a73e6a6faab241e0ad58a14 Mon Sep 17 00:00:00 2001 From: Ivan Chupin Date: Wed, 4 Jun 2025 03:48:42 +0500 Subject: [PATCH 29/53] Add SBIntuitionsBot --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 45340fd..cd7ec4a 100644 --- a/robots.json +++ b/robots.json @@ -377,6 +377,13 @@ "operator": "[Qualified](https://www.qualified.com)", "respect": "Unclear at this time." }, + "SBIntuitionsBot": { + "description": "AI development and information analysis", + "respect": "[Yes](https://www.sbintuitions.co.jp/en/bot/)", + "frequency": "No information.", + "function": "Uses data gathered in AI development and information analysis.", + "operator": "[SB Intuitions](https://www.sbintuitions.co.jp/en/)" + }, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", "frequency": "No information.", From 3efabc603dcae235cc04d1e2f2e9113c70e6197c Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 3 Jun 2025 23:28:48 +0000 Subject: [PATCH 30/53] Merge pull request #141 from Ivan-Chupin/patch-1 Add SBIntuitionsBot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index c381fce..48971b1 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 3527a7a..117e653 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index a8ba9aa..c2ebb47 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -52,6 +52,7 @@ PerplexityBot PetalBot PhindBot QualifiedBot +SBIntuitionsBot Scrapy SemrushBot-OCOB SemrushBot-SWA diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 5f7a0db..edcf8a7 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index d26ccb4..1c5e989 100644 --- a/robots.txt +++ b/robots.txt @@ -52,6 +52,7 @@ User-agent: PerplexityBot User-agent: PetalBot User-agent: PhindBot User-agent: QualifiedBot +User-agent: SBIntuitionsBot User-agent: Scrapy User-agent: SemrushBot-OCOB User-agent: SemrushBot-SWA diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 3275a20..c9a3910 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -54,6 +54,7 @@ | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly | | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | +| SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 2b5a59a303a81847234ac11ec2f180ee9795db90 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Wed, 4 Jun 2025 01:00:07 +0000 Subject: [PATCH 31/53] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index cd7ec4a..739f579 100644 --- a/robots.json +++ b/robots.json @@ -468,4 +468,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 03831a7eb55ae3682e45a651588df13acc4a4a04 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 4 Jun 2025 10:46:58 -0700 Subject: [PATCH 32/53] chore(robots.json): adds Quillbot --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 739f579..bea7350 100644 --- a/robots.json +++ b/robots.json @@ -377,6 +377,20 @@ "operator": "[Qualified](https://www.qualified.com)", "respect": "Unclear at this time." }, + "QuillBot": { + "description": "Operated by QuillBot as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI detection, writing tools and other services.", + "operator": "[Quillbot](https://quillbot.com)", + "respect": "Unclear at this time." + }, + "quillbot.com": { + "description": "Operated by QuillBot as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI detection, writing tools and other services.", + "operator": "[Quillbot](https://quillbot.com)", + "respect": "Unclear at this time." + }, "SBIntuitionsBot": { "description": "AI development and information analysis", "respect": "[Yes](https://www.sbintuitions.co.jp/en/bot/)", @@ -468,4 +482,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 4568d69b0edad5708fe49fafd2b5c0572cd2dfa4 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 4 Jun 2025 10:54:14 -0700 Subject: [PATCH 33/53] chore(robots.json): adds Panscient --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 739f579..8d1163a 100644 --- a/robots.json +++ b/robots.json @@ -342,6 +342,20 @@ "frequency": "Unclear at this time.", "description": "PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot" }, + "Panscient": { + "operator": "[Panscient](https://panscient.com)", + "respect": "[Yes](https://panscient.com/faq.htm)", + "function": "Data collection and analysis using machine learning and AI.", + "frequency": "The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address.", + "description": "Compiles data on businesses and business professionals that is structured using AI and machine learning." + }, + "panscient.com": { + "operator": "[Panscient](https://panscient.com)", + "respect": "[Yes](https://panscient.com/faq.htm)", + "function": "Data collection and analysis using machine learning and AI.", + "frequency": "The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address.", + "description": "Compiles data on businesses and business professionals that is structured using AI and machine learning." + }, "Perplexity-User": { "operator": "[Perplexity](https://www.perplexity.ai/)", "respect": "[No](https://docs.perplexity.ai/guides/bots)", @@ -468,4 +482,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 9c28c63a0c4889d694f7ebdd278c17564f4b72a3 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 4 Jun 2025 17:54:57 +0000 Subject: [PATCH 34/53] Merge pull request #142 from ai-robots-txt/quillbot chore(robots.json): adds Quillbot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 2 ++ nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 6 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 48971b1..ee898fd 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 117e653..de3bf3b 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index c2ebb47..be27797 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -52,6 +52,8 @@ PerplexityBot PetalBot PhindBot QualifiedBot +QuillBot +quillbot.com SBIntuitionsBot Scrapy SemrushBot-OCOB diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index edcf8a7..d7decae 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 1c5e989..4ed2deb 100644 --- a/robots.txt +++ b/robots.txt @@ -52,6 +52,8 @@ User-agent: PerplexityBot User-agent: PetalBot User-agent: PhindBot User-agent: QualifiedBot +User-agent: QuillBot +User-agent: quillbot.com User-agent: SBIntuitionsBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index c9a3910..e0fcd26 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -54,6 +54,8 @@ | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly | | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | +| QuillBot | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | +| quillbot\.com | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | | SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 75ea75a95b006e68f89f8826bf84caff569a1eb8 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 4 Jun 2025 18:04:06 +0000 Subject: [PATCH 35/53] Merge pull request #143 from ai-robots-txt/panscient chore(robots.json): adds Panscient --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 2 ++ nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 6 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index ee898fd..5fefc69 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index de3bf3b..5caa249 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index be27797..fe153c8 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -47,6 +47,8 @@ omgili omgilibot Operator PanguBot +Panscient +panscient.com Perplexity-User PerplexityBot PetalBot diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index d7decae..e5d660b 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 4ed2deb..26a9d78 100644 --- a/robots.txt +++ b/robots.txt @@ -47,6 +47,8 @@ User-agent: omgili User-agent: omgilibot User-agent: Operator User-agent: PanguBot +User-agent: Panscient +User-agent: panscient.com User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index e0fcd26..69b9ec2 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -49,6 +49,8 @@ | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | Operator | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | +| Panscient | [Panscient](https://panscient.com) | [Yes](https://panscient.com/faq.htm) | Data collection and analysis using machine learning and AI. | The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address. | Compiles data on businesses and business professionals that is structured using AI and machine learning. | +| panscient\.com | [Panscient](https://panscient.com) | [Yes](https://panscient.com/faq.htm) | Data collection and analysis using machine learning and AI. | The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address. | Compiles data on businesses and business professionals that is structured using AI and machine learning. | | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | From 77393df5aa2ac7e3f2bb3e54ff919470c63e5b09 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Thu, 5 Jun 2025 00:59:28 +0000 Subject: [PATCH 36/53] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 22a1370..6df5084 100644 --- a/robots.json +++ b/robots.json @@ -496,4 +496,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 528d77bf072780a40c169cab399eb0b2139edb83 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 5 Jun 2025 09:14:23 -0700 Subject: [PATCH 37/53] chore(robots.json): adds bedrockbot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 739f579..a3f75bc 100644 --- a/robots.json +++ b/robots.json @@ -55,6 +55,13 @@ "frequency": "Unclear at this time.", "description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools." }, + "bedrockbot": { + "operator": "[Amazon](https://amazon.com)", + "respect": "[Yes](https://docs.aws.amazon.com/bedrock/latest/userguide/webcrawl-data-source-connector.html#configuration-webcrawl-connector)", + "function": "Data scraping for custom AI applications.", + "frequency": "Unclear at this time.", + "description": "Connects to and crawls URLs that have been selected for use in a user's AWS bedrock application." + }, "Brightbot 1.0": { "operator": "Browsing.ai", "respect": "Unclear at this time.", @@ -468,4 +475,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From ac7ed17e71a59a67d54279d010477abecfb15caf Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 5 Jun 2025 16:51:17 +0000 Subject: [PATCH 38/53] Merge pull request #145 from ai-robots-txt/aws-bedrockbot chore(robots.json): adds bedrockbot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 5fefc69..dbbde1e 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 5caa249..08b2fd3 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index fe153c8..cdaeecd 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -6,6 +6,7 @@ Andibot anthropic-ai Applebot Applebot-Extended +bedrockbot Brightbot 1.0 Bytespider CCBot diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index e5d660b..542ac65 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 26a9d78..a8dc655 100644 --- a/robots.txt +++ b/robots.txt @@ -6,6 +6,7 @@ User-agent: Andibot User-agent: anthropic-ai User-agent: Applebot User-agent: Applebot-Extended +User-agent: bedrockbot User-agent: Brightbot 1.0 User-agent: Bytespider User-agent: CCBot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 69b9ec2..ee324f7 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -8,6 +8,7 @@ | anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | | Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| bedrockbot | [Amazon](https://amazon.com) | [Yes](https://docs.aws.amazon.com/bedrock/latest/userguide/webcrawl-data-source-connector.html#configuration-webcrawl-connector) | Data scraping for custom AI applications. | Unclear at this time. | Connects to and crawls URLs that have been selected for use in a user's AWS bedrock application. | | Brightbot 1\.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. | | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | From e21f6ae1b6eeb0782012a4a31e6af61b6a21cb09 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 6 Jun 2025 00:59:25 +0000 Subject: [PATCH 39/53] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 8f5dadd..3b5c434 100644 --- a/robots.json +++ b/robots.json @@ -503,4 +503,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 7867c3e26c267301cc528edf89fccf5203bfb99b Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 9 Jun 2025 08:44:25 -0700 Subject: [PATCH 40/53] chore(robots.json): adds EchoboxBot (#148) --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 3b5c434..911dadc 100644 --- a/robots.json +++ b/robots.json @@ -160,6 +160,13 @@ "frequency": "Unclear at this time.", "description": "DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot" }, + "EchoboxBot": { + "operator": "[Echobox](https://echobox.com)", + "respect": "Unclear at this time.", + "function": "Data collection to support AI-powered products.", + "frequency": "Unclear at this time.", + "description": "Supports company's AI-powered social and email management products." + }, "FacebookBot": { "operator": "Meta/Facebook", "respect": "[Yes](https://developers.facebook.com/docs/sharing/bot/)", @@ -503,4 +510,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 3759a6bf146f0153bbb7ab880ff95380a4fc739e Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Mon, 9 Jun 2025 15:44:36 +0000 Subject: [PATCH 41/53] chore(robots.json): adds EchoboxBot (#148) --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index dbbde1e..74b4e5b 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 08b2fd3..9ca050b 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index cdaeecd..dc9f851 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -21,6 +21,7 @@ Cotoyogi Crawlspace Diffbot DuckAssistBot +EchoboxBot FacebookBot Factset_spyderbot FirecrawlAgent diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 542ac65..4fd7a29 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index a8dc655..8964626 100644 --- a/robots.txt +++ b/robots.txt @@ -21,6 +21,7 @@ User-agent: Cotoyogi User-agent: Crawlspace User-agent: Diffbot User-agent: DuckAssistBot +User-agent: EchoboxBot User-agent: FacebookBot User-agent: Factset_spyderbot User-agent: FirecrawlAgent diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index ee324f7..b596540 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -23,6 +23,7 @@ | Crawlspace | [Crawlspace](https://crawlspace.dev) | [Yes](https://news.ycombinator.com/item?id=42756654) | Scrapes data | Unclear at this time. | Provides crawling services for any purpose, probably including AI model training. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | +| EchoboxBot | [Echobox](https://echobox.com) | Unclear at this time. | Data collection to support AI-powered products. | Unclear at this time. | Supports company's AI-powered social and email management products. | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. | | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | From cf598b6b71e0e3c6b59a699c74218a62d43c5e16 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 10 Jun 2025 01:00:37 +0000 Subject: [PATCH 42/53] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 911dadc..19dffe8 100644 --- a/robots.json +++ b/robots.json @@ -510,4 +510,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 14d68f05ba42b04e8c33fcca5bacf7d8dee86cae Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 11 Jun 2025 13:50:53 -0700 Subject: [PATCH 43/53] chore(robots.json): adds additional SemrushBot user agents --- robots.json | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/robots.json b/robots.json index 19dffe8..0ad3731 100644 --- a/robots.json +++ b/robots.json @@ -433,6 +433,27 @@ "operator": "[Zyte](https://www.zyte.com)", "respect": "Unclear at this time." }, + "SemrushBot": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Crawls your site for ContentShake AI tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + }, + "SemrushBot-BA": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Crawls your site for ContentShake AI tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + }, + "SemrushBot-CT": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Crawls your site for ContentShake AI tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + }, "SemrushBot-OCOB": { "operator": "[Semrush](https://www.semrush.com/)", "respect": "[Yes](https://www.semrush.com/bot/)", @@ -440,6 +461,13 @@ "frequency": "Roughly once every 10 seconds.", "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." }, + "SemrushBot-SI": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Crawls your site for ContentShake AI tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + }, "SemrushBot-SWA": { "operator": "[Semrush](https://www.semrush.com/)", "respect": "[Yes](https://www.semrush.com/bot/)", From 842e2256e896e6d2904fa501df59f2ca17ea6bc0 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 12 Jun 2025 07:12:00 +0000 Subject: [PATCH 44/53] Merge pull request #150 from ai-robots-txt/semrush-bots chore(robots.json): adds additional SemrushBot user agents --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 4 ++++ nginx-block-ai-bots.conf | 2 +- robots.txt | 4 ++++ table-of-bot-metrics.md | 4 ++++ 6 files changed, 15 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 74b4e5b..a6c8fea 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 9ca050b..7ed7e03 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index dc9f851..fe6c362 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -60,7 +60,11 @@ QuillBot quillbot.com SBIntuitionsBot Scrapy +SemrushBot +SemrushBot-BA +SemrushBot-CT SemrushBot-OCOB +SemrushBot-SI SemrushBot-SWA Sidetrade indexer bot TikTokSpider diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 4fd7a29..13aac72 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 8964626..b130083 100644 --- a/robots.txt +++ b/robots.txt @@ -60,7 +60,11 @@ User-agent: QuillBot User-agent: quillbot.com User-agent: SBIntuitionsBot User-agent: Scrapy +User-agent: SemrushBot +User-agent: SemrushBot-BA +User-agent: SemrushBot-CT User-agent: SemrushBot-OCOB +User-agent: SemrushBot-SI User-agent: SemrushBot-SWA User-agent: Sidetrade indexer bot User-agent: TikTokSpider diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index b596540..a834382 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -62,7 +62,11 @@ | quillbot\.com | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | | SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | +| SemrushBot | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-BA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-CT | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-SI | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | | TikTokSpider | ByteDance | Unclear at this time. | LLM training. | Unclear at this time. | Downloads data to train LLMS, as per Bytespider. | From d760f9216f8d6295b43e00862daec913f82610ad Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 12 Jun 2025 13:08:29 -0700 Subject: [PATCH 45/53] chore(robots.json): adds MyCentralAIScraperBot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 19dffe8..94e4f79 100644 --- a/robots.json +++ b/robots.json @@ -314,6 +314,13 @@ "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.", "respect": "Yes" }, + "MyCentralAIScraperBot": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI data scraper", + "frequency": "Unclear at this time.", + "description": "Operator and data use is uncleaar at this time." + }, "NovaAct": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", @@ -510,4 +517,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 8f17718e762831498b09438835b2e1cb74f7e19d Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Fri, 13 Jun 2025 10:28:12 +0100 Subject: [PATCH 46/53] Fix typo --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 94e4f79..befdb35 100644 --- a/robots.json +++ b/robots.json @@ -319,7 +319,7 @@ "respect": "Unclear at this time.", "function": "AI data scraper", "frequency": "Unclear at this time.", - "description": "Operator and data use is uncleaar at this time." + "description": "Operator and data use is unclear at this time." }, "NovaAct": { "operator": "Unclear at this time.", From e53d81c66d353016e27e75215b22dc8557e1a82c Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 13 Jun 2025 09:28:41 +0000 Subject: [PATCH 47/53] Merge pull request #152 from ai-robots-txt/MyCentralAIScraperBot chore(robots.json): adds MyCentralAIScraperBot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index a6c8fea..a2d6a6e 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 7ed7e03..e99d69c 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index fe6c362..7d1d3a0 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -43,6 +43,7 @@ Meta-ExternalAgent meta-externalfetcher Meta-ExternalFetcher MistralAI-User/1.0 +MyCentralAIScraperBot NovaAct OAI-SearchBot omgili diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 13aac72..ef0259e 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index b130083..ab60705 100644 --- a/robots.txt +++ b/robots.txt @@ -43,6 +43,7 @@ User-agent: Meta-ExternalAgent User-agent: meta-externalfetcher User-agent: Meta-ExternalFetcher User-agent: MistralAI-User/1.0 +User-agent: MyCentralAIScraperBot User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index a834382..f8a2005 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -45,6 +45,7 @@ | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | MistralAI\-User/1\.0 | Mistral AI | Yes | Takes action based on user prompts. | Only when prompted by a user. | MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response. | +| MyCentralAIScraperBot | Unclear at this time. | Unclear at this time. | AI data scraper | Unclear at this time. | Operator and data use is unclear at this time. | | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact | | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | From b05f2fee000caf6c784135335b14c2254572754e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9rgio=20Spagnuolo?= Date: Fri, 13 Jun 2025 17:15:13 -0300 Subject: [PATCH 48/53] Update robots.json with new crawler Update with Poseidon Research Crawler as found in nytimes.com/robots.txt --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 52e196a..9bd1f85 100644 --- a/robots.json +++ b/robots.json @@ -405,6 +405,13 @@ "operator": "[phind](https://www.phind.com/)", "respect": "Unclear at this time." }, + "Poseidon Research Crawler": { + "operator": "[Poseidon Research](https://www.poseidonresearch.com)", + "description": "Lab focused on scaling the interpretability research necessary to make better AI systems possible.", + "frequency": "No explicit frequency provided.", + "function": "AI research crawler", + "respect": "Unclear at this time." + }, "QualifiedBot": { "description": "Operated by Qualified as part of their suite of AI product offerings.", "frequency": "No explicit frequency provided.", From 2b68568ac26db7a37d859fec12b0bb35e61cfedc Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sat, 14 Jun 2025 00:58:11 +0000 Subject: [PATCH 49/53] Update from Dark Visitors --- robots.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/robots.json b/robots.json index 52e196a..7c10cab 100644 --- a/robots.json +++ b/robots.json @@ -315,11 +315,11 @@ "respect": "Yes" }, "MyCentralAIScraperBot": { - "operator": "Unclear at this time.", - "respect": "Unclear at this time.", - "function": "AI data scraper", - "frequency": "Unclear at this time.", - "description": "Operator and data use is unclear at this time." + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI data scraper", + "frequency": "Unclear at this time.", + "description": "Operator and data use is unclear at this time." }, "NovaAct": { "operator": "Unclear at this time.", @@ -545,4 +545,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From eb05f2f5276ff963f786cf8fd57d8ea909e82fd9 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 14 Jun 2025 14:04:03 +0000 Subject: [PATCH 50/53] Merge pull request #153 from sergiospagnuolo/Poseidon Update robots.json with new crawler --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index a2d6a6e..27637c2 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index e99d69c..528ba08 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 7d1d3a0..c7c3054 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -56,6 +56,7 @@ Perplexity-User PerplexityBot PetalBot PhindBot +Poseidon Research Crawler QualifiedBot QuillBot quillbot.com diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index ef0259e..c1afb05 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index ab60705..0edf11f 100644 --- a/robots.txt +++ b/robots.txt @@ -56,6 +56,7 @@ User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot User-agent: PhindBot +User-agent: Poseidon Research Crawler User-agent: QualifiedBot User-agent: QuillBot User-agent: quillbot.com diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index f8a2005..78a3408 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -58,6 +58,7 @@ | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly | +| Poseidon Research Crawler | [Poseidon Research](https://www.poseidonresearch.com) | Unclear at this time. | AI research crawler | No explicit frequency provided. | Lab focused on scaling the interpretability research necessary to make better AI systems possible. | | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | | QuillBot | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | | quillbot\.com | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | From 7535893aecf2895dd09bf00de397afde4edf0f8f Mon Sep 17 00:00:00 2001 From: paulrudy <1110792+paulrudy@users.noreply.github.com> Date: Sun, 15 Jun 2025 16:39:17 -0700 Subject: [PATCH 51/53] re-add facebookexternalhit --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 5d5f692..60c431c 100644 --- a/robots.json +++ b/robots.json @@ -174,6 +174,13 @@ "frequency": "Up to 1 page per second", "description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically." }, + "facebookexternalhit": { + "operator": "Meta/Facebook", + "respect": "[No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313)", + "function": "Ostensibly only for sharing, but likely used as an AI crawler as well", + "frequency": "Unclear at this time.", + "description": "Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is \"to crawl the content of an app or website that was shared on one of Meta’s family of apps…\". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary." + }, "Factset_spyderbot": { "operator": "[Factset](https://www.factset.com/ai)", "respect": "Unclear at this time.", From 5326c202b57d87a9921810ef26fca98edce77d5f Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Mon, 16 Jun 2025 15:12:42 +0000 Subject: [PATCH 52/53] Merge pull request #154 from paulrudy/main re-add facebookexternalhit --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 27637c2..3ba960f 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 528ba08..b675b9b 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index c7c3054..9cfb5c6 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -23,6 +23,7 @@ Diffbot DuckAssistBot EchoboxBot FacebookBot +facebookexternalhit Factset_spyderbot FirecrawlAgent FriendlyCrawler diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index c1afb05..a53333e 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 0edf11f..9d69a3a 100644 --- a/robots.txt +++ b/robots.txt @@ -23,6 +23,7 @@ User-agent: Diffbot User-agent: DuckAssistBot User-agent: EchoboxBot User-agent: FacebookBot +User-agent: facebookexternalhit User-agent: Factset_spyderbot User-agent: FirecrawlAgent User-agent: FriendlyCrawler diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 78a3408..a5ab4c7 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -25,6 +25,7 @@ | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | | EchoboxBot | [Echobox](https://echobox.com) | Unclear at this time. | Data collection to support AI-powered products. | Unclear at this time. | Supports company's AI-powered social and email management products. | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| facebookexternalhit | Meta/Facebook | [No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) | Ostensibly only for sharing, but likely used as an AI crawler as well | Unclear at this time. | Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is "to crawl the content of an app or website that was shared on one of Meta’s family of apps…". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary. | | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. | | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | From 4ed17b8e4af67d347b039429eb633c96acbba72f Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 17 Jun 2025 01:00:21 +0000 Subject: [PATCH 53/53] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 60c431c..ab2a383 100644 --- a/robots.json +++ b/robots.json @@ -179,7 +179,7 @@ "respect": "[No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313)", "function": "Ostensibly only for sharing, but likely used as an AI crawler as well", "frequency": "Unclear at this time.", - "description": "Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is \"to crawl the content of an app or website that was shared on one of Meta’s family of apps…\". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary." + "description": "Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is \"to crawl the content of an app or website that was shared on one of Meta\u2019s family of apps\u2026\". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary." }, "Factset_spyderbot": { "operator": "[Factset](https://www.factset.com/ai)",