From 05b79b8a5886983c818eaad107fcf6c7de5fad3a Mon Sep 17 00:00:00 2001 From: nisbet-hubbard <87453615+nisbet-hubbard@users.noreply.github.com> Date: Mon, 27 Jan 2025 19:41:03 +0800 Subject: [PATCH 001/111] Update robots.json --- robots.json | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/robots.json b/robots.json index 4d7d582..7f3cba3 100644 --- a/robots.json +++ b/robots.json @@ -265,12 +265,19 @@ "operator": "[Zyte](https://www.zyte.com)", "respect": "Unclear at this time." }, - "SemrushBot": { + "SemrushBot-OCOB": { "operator": "[Semrush](https://www.semrush.com/)", "respect": "[Yes](https://www.semrush.com/bot/)", - "function": "Scrapes data for use in LLM article-writing tool.", + "function": "Crawls your site for ContentShake AI tool.", "frequency": "Roughly once every 10 seconds.", - "description": "SemrushBot is a bot which, among other functions, scrapes data for use in ContentShake AI tool reports." + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + }, + "SemrushBot-SWA": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Checks URLs on your site for SWA tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." }, "Sidetrade indexer bot": { "description": "AI product training.", @@ -307,4 +314,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 89d4c6e5ca03f0aedec09b9191e2aece6f2efec3 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 1 Feb 2025 10:51:01 +0000 Subject: [PATCH 002/111] Merge pull request #73 from nisbet-hubbard/patch-8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Actually block Semrush’s AI tools --- .htaccess | 2 +- robots.txt | 3 ++- table-of-bot-metrics.md | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index beaddc3..97482e2 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] RewriteRule .* - [F,L] \ No newline at end of file diff --git a/robots.txt b/robots.txt index fd388fd..3839e55 100644 --- a/robots.txt +++ b/robots.txt @@ -36,7 +36,8 @@ User-agent: PanguBot User-agent: PerplexityBot User-agent: PetalBot User-agent: Scrapy -User-agent: SemrushBot +User-agent: SemrushBot-OCOB +User-agent: SemrushBot-SWA User-agent: Sidetrade indexer bot User-agent: Timpibot User-agent: VelenPublicWebCrawler diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index f44c585..b51bbae 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -38,7 +38,8 @@ | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | -| SemrushBot | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Scrapes data for use in LLM article-writing tool. | Roughly once every 10 seconds. | SemrushBot is a bot which, among other functions, scrapes data for use in ContentShake AI tool reports. | +| SemrushBot-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | From bebffccc0ced8c420276c93f3109c2e71cd5ca0c Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sun, 2 Feb 2025 00:52:50 +0000 Subject: [PATCH 003/111] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 7f3cba3..79762a0 100644 --- a/robots.json +++ b/robots.json @@ -314,4 +314,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 261a2b83b90fe89f1d842066709c019fd1dba30f Mon Sep 17 00:00:00 2001 From: always-be-testing Date: Fri, 14 Feb 2025 12:26:19 -0500 Subject: [PATCH 004/111] update README to inclide list of ai bots Cloudflare considers verified --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 065b0b7..6758570 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,19 @@ Alternatively, you can also subscribe to new releases with your GitHub account b If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform). + +If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that make use of [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots) conditions, please note that the following AI web crawlers are considered verified bots by Cloudflare: +- Amazonbot +- Applebot +- CCBot +- ChatGPT-User +- DuckAssistBot +- GoogleOther +- GPTBot +- OAI-SearchBot +- PerplexityBot +- PetalBot + ## Additional resources - [Blocking Bots with Nginx](https://rknight.me/blog/blocking-bots-with-nginx/) by Robb Knight From e396a2ec781095c5e2659eefb99c46ab7715a664 Mon Sep 17 00:00:00 2001 From: always-be-testing Date: Fri, 14 Feb 2025 12:31:20 -0500 Subject: [PATCH 005/111] forgot to include heading --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6758570..e70d283 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ Alternatively, you can also subscribe to new releases with your GitHub account b If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform). - +## Cloudflare Verified Bots If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that make use of [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots) conditions, please note that the following AI web crawlers are considered verified bots by Cloudflare: - Amazonbot - Applebot From f99339922fa9afdbb00e18bb99105e81cd3f8e88 Mon Sep 17 00:00:00 2001 From: always-be-testing Date: Fri, 14 Feb 2025 12:36:33 -0500 Subject: [PATCH 006/111] grammar update and include syntax for verified bot condition --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e70d283..f471ede 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ Alternatively, you can also subscribe to new releases with your GitHub account b If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform). ## Cloudflare Verified Bots -If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that make use of [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots) conditions, please note that the following AI web crawlers are considered verified bots by Cloudflare: +If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that use the `cf.bot_management.verified_bot` condition based on [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots), please note that the following AI web crawlers are considered verified bots by Cloudflare: - Amazonbot - Applebot - CCBot From af87b85d7f00bc285cb414280e02d2f42284a9d8 Mon Sep 17 00:00:00 2001 From: always-be-testing Date: Fri, 14 Feb 2025 12:39:08 -0500 Subject: [PATCH 007/111] include return after heading --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f471ede..303f009 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ Alternatively, you can also subscribe to new releases with your GitHub account b If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform). ## Cloudflare Verified Bots + If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that use the `cf.bot_management.verified_bot` condition based on [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots), please note that the following AI web crawlers are considered verified bots by Cloudflare: - Amazonbot - Applebot From 5b13c2e504c843c2a95981cee1c2655d9f21c8f4 Mon Sep 17 00:00:00 2001 From: always-be-testing Date: Sat, 15 Feb 2025 11:22:10 -0500 Subject: [PATCH 008/111] add more concise message about verified bots Co-authored-by: Glyn Normington --- README.md | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/README.md b/README.md index 303f009..a206c83 100644 --- a/README.md +++ b/README.md @@ -39,21 +39,7 @@ Alternatively, you can also subscribe to new releases with your GitHub account b ## Report abusive crawlers If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform). - -## Cloudflare Verified Bots - -If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that use the `cf.bot_management.verified_bot` condition based on [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots), please note that the following AI web crawlers are considered verified bots by Cloudflare: -- Amazonbot -- Applebot -- CCBot -- ChatGPT-User -- DuckAssistBot -- GoogleOther -- GPTBot -- OAI-SearchBot -- PerplexityBot -- PetalBot - +But even if you don't use Cloudflare's hard block, their list of [verified bots](https://radar.cloudflare.com/traffic/verified-bots) may come in handy. ## Additional resources - [Blocking Bots with Nginx](https://rknight.me/blog/blocking-bots-with-nginx/) by Robb Knight From a9ec4ffa6fd1816ee6c1c146fa75983abc0b2edc Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sun, 16 Feb 2025 13:36:39 -0800 Subject: [PATCH 009/111] chore: add Brightbot 1.0 --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 79762a0..a634634 100644 --- a/robots.json +++ b/robots.json @@ -41,6 +41,13 @@ "frequency": "Unclear at this time.", "description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools." }, + "Brightbot 1.0": { + "operator": "Browsing.ai", + "respect": "Unclear at this time.", + "function": "LLM/AI training.", + "frequency": "Unclear at this time.", + "description": "Scrapes data to train LLMs and AI products focused on website customer support." + }, "Bytespider": { "operator": "ByteDance", "respect": "No", @@ -314,4 +321,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 693289bb29c42b7a526d8210d1f743ca3608690d Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sun, 16 Feb 2025 21:37:52 +0000 Subject: [PATCH 010/111] chore: add Brightbot 1.0 --- .htaccess | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.htaccess b/.htaccess index 97482e2..512c274 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot\ 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] RewriteRule .* - [F,L] \ No newline at end of file diff --git a/robots.txt b/robots.txt index 3839e55..80c40e8 100644 --- a/robots.txt +++ b/robots.txt @@ -4,6 +4,7 @@ User-agent: Amazonbot User-agent: anthropic-ai User-agent: Applebot User-agent: Applebot-Extended +User-agent: Brightbot 1.0 User-agent: Bytespider User-agent: CCBot User-agent: ChatGPT-User diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index b51bbae..af32bf2 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -6,6 +6,7 @@ | anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | | Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Brightbot 1.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. | | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | | ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | From abfd6dfcd15267ed03b5fda4cd3eac2512604ed2 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Mon, 17 Feb 2025 00:53:32 +0000 Subject: [PATCH 011/111] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index a634634..cdc7bb5 100644 --- a/robots.json +++ b/robots.json @@ -321,4 +321,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From c0d418cd875b432fd4558be57ad3c009326b631e Mon Sep 17 00:00:00 2001 From: Dennis Camera Date: Mon, 17 Feb 2025 21:00:57 +0100 Subject: [PATCH 012/111] .htaccess: Allow robots access to /robots.txt --- .htaccess | 2 +- code/robots.py | 2 +- code/test_files/.htaccess | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 512c274..c42f99e 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot\ 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] -RewriteRule .* - [F,L] \ No newline at end of file +RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/code/robots.py b/code/robots.py index 087b00b..bb18e70 100644 --- a/code/robots.py +++ b/code/robots.py @@ -142,7 +142,7 @@ def json_to_htaccess(robot_json): robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys()) htaccess += "|".join(robots) htaccess += ").*$ [NC]\n" - htaccess += "RewriteRule .* - [F,L]" + htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n" return htaccess diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess index a34bf55..2e78674 100644 --- a/code/test_files/.htaccess +++ b/code/test_files/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] -RewriteRule .* - [F,L] \ No newline at end of file +RewriteRule !^/?robots\.txt$ - [F,L] From a884a2afb9dbc7338b0faa24b3c10308adbc48e4 Mon Sep 17 00:00:00 2001 From: Dennis Camera Date: Mon, 17 Feb 2025 21:00:57 +0100 Subject: [PATCH 013/111] .htaccess: Make regex in RewriteCond safe Improve the regular expression by removing unneeded anchors and escaping special characters (not just space) to prevent false positives or a misbehaving rewrite rule. --- .htaccess | 2 +- code/robots.py | 19 ++++++++++--------- code/test_files/.htaccess | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/.htaccess b/.htaccess index c42f99e..2313293 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot\ 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/code/robots.py b/code/robots.py index bb18e70..a8a674d 100644 --- a/code/robots.py +++ b/code/robots.py @@ -1,8 +1,9 @@ import json -from pathlib import Path - +import re import requests + from bs4 import BeautifulSoup +from pathlib import Path def load_robots_json(): @@ -99,7 +100,6 @@ def updated_robots_json(soup): def ingest_darkvisitors(): - old_robots_json = load_robots_json() soup = get_agent_soup() if soup: @@ -132,16 +132,17 @@ def json_to_table(robots_json): return table +def list_to_pcre(lst): + # Python re is not 100% identical to PCRE which is used by Apache, but it + # should probably be close enough in the real world for re.escape to work. + return f"({"|".join(map(re.escape, lst))})" + + def json_to_htaccess(robot_json): # Creates a .htaccess filter file. It uses a regular expression to filter out # User agents that contain any of the blocked values. htaccess = "RewriteEngine On\n" - htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*(" - - # Escape spaces in each User Agent to build the regular expression - robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys()) - htaccess += "|".join(robots) - htaccess += ").*$ [NC]\n" + htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n" htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n" return htaccess diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess index 2e78674..90ddcf2 100644 --- a/code/test_files/.htaccess +++ b/code/test_files/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] From 0bd3fa63b832ffd8fa908675656c7007021f6654 Mon Sep 17 00:00:00 2001 From: Dennis Camera Date: Tue, 18 Feb 2025 10:12:04 +0100 Subject: [PATCH 014/111] table-of-bot-metrics.md: Escape robot names for Markdown table Some characters which could occur in a crawler's name have a special meaning in Markdown. They are escaped to prevent them from having unintended side effects. The escaping is only applied to the first (Name) column of the table. The rest of the columns is expected to already be Markdown encoded in robots.json. --- code/robots.py | 8 ++++++-- table-of-bot-metrics.md | 40 ++++++++++++++++++++-------------------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/code/robots.py b/code/robots.py index a8a674d..62fb061 100644 --- a/code/robots.py +++ b/code/robots.py @@ -121,13 +121,17 @@ def json_to_txt(robots_json): return robots_txt +def escape_md(s): + return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s) + + def json_to_table(robots_json): """Compose a markdown table with the information in robots.json""" table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n" - table += "|-----|----------|-----------------------|----------|------------------|-------------|\n" + table += "|------|----------|-----------------------|----------|------------------|-------------|\n" for name, robot in robots_json.items(): - table += f'| {name} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n' + table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n' return table diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index af32bf2..ce82047 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -1,48 +1,48 @@ | Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | -|-----|----------|-----------------------|----------|------------------|-------------| +|------|----------|-----------------------|----------|------------------|-------------| | AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | -| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | +| Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | -| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | -| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | -| Brightbot 1.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. | +| Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Brightbot 1\.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. | | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | -| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | +| Claude\-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | -| cohere-training-data-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | +| cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | +| cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | | Crawlspace | [Crawlspace](https://crawlspace.dev) | [Yes](https://news.ycombinator.com/item?id=42756654) | Scrapes data | Unclear at this time. | Provides crawling services for any purpose, probably including AI model training. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | -| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | +| Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther\-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | -| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | -| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | +| iaskspider/2\.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | +| ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | -| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | -| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | -| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | +| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | -| SemrushBot-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | -| SemrushBot-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | -| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | From 17b826a6d3868cf87fb52adf95f52872ac5c4437 Mon Sep 17 00:00:00 2001 From: Dennis Camera Date: Tue, 18 Feb 2025 10:13:27 +0100 Subject: [PATCH 015/111] Update tests and convert to stock unittest For these simple tests Python's built-in unittest framework is more than enough. No additional dependencies are required. Added some more test cases with "special" characters to test the escaping code better. --- code/test_files/.htaccess | 2 +- code/test_files/robots.json | 44 ++++++++++++++++- code/test_files/robots.txt | 6 +++ code/test_files/table-of-bot-metrics.md | 38 +++++++++------ code/tests.py | 65 ++++++++++++++++++------- 5 files changed, 120 insertions(+), 35 deletions(-) mode change 100644 => 100755 code/tests.py diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess index 90ddcf2..7e39092 100644 --- a/code/test_files/.htaccess +++ b/code/test_files/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/code/test_files/robots.json b/code/test_files/robots.json index c50d63c..b0cbfbb 100644 --- a/code/test_files/robots.json +++ b/code/test_files/robots.json @@ -278,5 +278,47 @@ "function": "Scrapes data for search engine and LLMs.", "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." + }, + "crawler.with.dots": { + "operator": "Test suite", + "respect": "No", + "function": "To ensure the code works correctly.", + "frequency": "No information.", + "description": "When used in the .htaccess regular expression dots need to be escaped." + }, + "star***crawler": { + "operator": "Test suite", + "respect": "No", + "function": "To ensure the code works correctly.", + "frequency": "No information.", + "description": "When used in the .htaccess regular expression stars need to be escaped." + }, + "Is this a crawler?": { + "operator": "Test suite", + "respect": "No", + "function": "To ensure the code works correctly.", + "frequency": "No information.", + "description": "When used in the .htaccess regular expression spaces and question marks need to be escaped." + }, + "a[mazing]{42}(robot)": { + "operator": "Test suite", + "respect": "No", + "function": "To ensure the code works correctly.", + "frequency": "No information.", + "description": "When used in the .htaccess regular expression parantheses, braces, etc. need to be escaped." + }, + "2^32$": { + "operator": "Test suite", + "respect": "No", + "function": "To ensure the code works correctly.", + "frequency": "No information.", + "description": "When used in the .htaccess regular expression RE anchor characters need to be escaped." + }, + "curl|sudo bash": { + "operator": "Test suite", + "respect": "No", + "function": "To ensure the code works correctly.", + "frequency": "No information.", + "description": "When used in the .htaccess regular expression pipes need to be escaped." } -} \ No newline at end of file +} diff --git a/code/test_files/robots.txt b/code/test_files/robots.txt index 927f6f4..03c3c25 100644 --- a/code/test_files/robots.txt +++ b/code/test_files/robots.txt @@ -38,4 +38,10 @@ User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended User-agent: YouBot +User-agent: crawler.with.dots +User-agent: star***crawler +User-agent: Is this a crawler? +User-agent: a[mazing]{42}(robot) +User-agent: 2^32$ +User-agent: curl|sudo bash Disallow: / diff --git a/code/test_files/table-of-bot-metrics.md b/code/test_files/table-of-bot-metrics.md index 257ba99..88af6c0 100644 --- a/code/test_files/table-of-bot-metrics.md +++ b/code/test_files/table-of-bot-metrics.md @@ -1,35 +1,35 @@ | Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | -|-----|----------|-----------------------|----------|------------------|-------------| +|------|----------|-----------------------|----------|------------------|-------------| | AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | -| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | +| Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | -| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | -| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | -| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | +| Claude\-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | +| cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | | facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | -| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | +| Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther\-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | -| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | -| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | +| iaskspider/2\.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | +| ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | -| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | -| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | -| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | +| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | @@ -38,5 +38,11 @@ | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | -| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | +| crawler\.with\.dots | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression dots need to be escaped. | +| star\*\*\*crawler | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression stars need to be escaped. | +| Is this a crawler? | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression spaces and question marks need to be escaped. | +| a\[mazing\]\{42\}\(robot\) | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression parantheses, braces, etc. need to be escaped. | +| 2^32$ | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression RE anchor characters need to be escaped. | +| curl\|sudo bash | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression pipes need to be escaped. | diff --git a/code/tests.py b/code/tests.py old mode 100644 new mode 100755 index 6f778c3..94cbb47 --- a/code/tests.py +++ b/code/tests.py @@ -1,27 +1,58 @@ -"""These tests can be run with pytest. -This requires pytest: pip install pytest -cd to the `code` directory and run `pytest` -""" +#!/usr/bin/env python3 +"""To run these tests just execute this script.""" import json -from pathlib import Path +import unittest from robots import json_to_txt, json_to_table, json_to_htaccess +class RobotsUnittestExtensions: + def loadJson(self, pathname): + with open(pathname, "rt") as f: + return json.load(f) -def test_robots_txt_creation(): - robots_json = json.loads(Path("test_files/robots.json").read_text()) - robots_txt = json_to_txt(robots_json) - assert Path("test_files/robots.txt").read_text() == robots_txt + def assertEqualsFile(self, f, s): + with open(f, "rt") as f: + f_contents = f.read() + + return self.assertMultiLineEqual(f_contents, s) -def test_table_of_bot_metrices_md(): - robots_json = json.loads(Path("test_files/robots.json").read_text()) - robots_table = json_to_table(robots_json) - assert Path("test_files/table-of-bot-metrics.md").read_text() == robots_table +class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_robots_txt_generation(self): + robots_txt = json_to_txt(self.robots_dict) + self.assertEqualsFile("test_files/robots.txt", robots_txt) -def test_htaccess_creation(): - robots_json = json.loads(Path("test_files/robots.json").read_text()) - robots_htaccess = json_to_htaccess(robots_json) - assert Path("test_files/.htaccess").read_text() == robots_htaccess +class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 32768 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_table_generation(self): + robots_table = json_to_table(self.robots_dict) + self.assertEqualsFile("test_files/table-of-bot-metrics.md", robots_table) + + +class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_htaccess_generation(self): + robots_htaccess = json_to_htaccess(self.robots_dict) + self.assertEqualsFile("test_files/.htaccess", robots_htaccess) + + +if __name__ == "__main__": + import os + os.chdir(os.path.dirname(__file__)) + + unittest.main(verbosity=2) From c7c1e7b96fe74f90590f4d375c1bab4be53a4044 Mon Sep 17 00:00:00 2001 From: Dennis Camera Date: Tue, 18 Feb 2025 10:15:10 +0100 Subject: [PATCH 016/111] robots.py: Make executable --- code/robots.py | 2 ++ 1 file changed, 2 insertions(+) mode change 100644 => 100755 code/robots.py diff --git a/code/robots.py b/code/robots.py old mode 100644 new mode 100755 index 62fb061..6bf7920 --- a/code/robots.py +++ b/code/robots.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import json import re import requests From 1d55a205e4c8447829abdd34098ef9b0fedefee1 Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Tue, 18 Feb 2025 05:08:28 +0000 Subject: [PATCH 017/111] Document testing in README Fixes: https://github.com/ai-robots-txt/ai.robots.txt/issues/81 --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index a206c83..30a85da 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,11 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, and `.htaccess`. +You can run the tests by [installing](https://www.python.org/about/gettingstarted/) Python 3 and issuing: +```console +code/tests.py +``` + ## Subscribe to updates You can subscribe to list updates via RSS/Atom with the releases feed: From 8a7489633326465fd7e83fecece6740440d38eb6 Mon Sep 17 00:00:00 2001 From: Dennis Camera Date: Tue, 18 Feb 2025 10:23:40 +0100 Subject: [PATCH 018/111] Add workflow to run tests on pull request or push to main --- .github/workflows/run-tests.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/run-tests.yml diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml new file mode 100644 index 0000000..c98861f --- /dev/null +++ b/.github/workflows/run-tests.yml @@ -0,0 +1,21 @@ +on: + pull_request: + branches: + - main + push: + branches: + - main +jobs: + run-tests: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + with: + fetch-depth: 2 + - name: Install dependencies + run: | + pip install -U requests beautifulsoup4 + - name: Run tests + run: | + code/tests.py From 6ecfcdfcbfd1bd36da1982b7a4f9f95cbeb8101a Mon Sep 17 00:00:00 2001 From: deyigifts Date: Mon, 24 Mar 2025 14:16:57 +0800 Subject: [PATCH 019/111] Update perplexity bot Update based on perplexity bot docs --- robots.json | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/robots.json b/robots.json index cdc7bb5..eaac816 100644 --- a/robots.json +++ b/robots.json @@ -253,10 +253,17 @@ }, "PerplexityBot": { "operator": "[Perplexity](https://www.perplexity.ai/)", - "respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)", + "respect": "[Yes](https://docs.perplexity.ai/guides/bots)", + "function": "Search result generation.", + "frequency": "No information.", + "description": "Crawls sites to surface as results in Perplexity." + }, + "Perplexity‑User": { + "operator": "[Perplexity](https://www.perplexity.ai/)", + "respect": "[No](https://docs.perplexity.ai/guides/bots)", "function": "Used to answer queries at the request of users.", - "frequency": "Takes action based on user prompts.", - "description": "Operated by Perplexity to obtain results in response to user queries." + "frequency": "Only when prompted by a user.", + "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response." }, "PetalBot": { "description": "Operated by Huawei to provide search and AI assistant services.", @@ -321,4 +328,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From da85207314724c02d151a7bdfcdca3ef3fd056a1 Mon Sep 17 00:00:00 2001 From: Thomas Leister Date: Thu, 27 Mar 2025 12:27:09 +0100 Subject: [PATCH 020/111] Implement new function "json_to_nginx" which outputs an Nginx configuration snippet --- code/robots.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/code/robots.py b/code/robots.py index 6bf7920..f58f2b8 100755 --- a/code/robots.py +++ b/code/robots.py @@ -152,6 +152,12 @@ def json_to_htaccess(robot_json): htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n" return htaccess +def json_to_nginx(robot_json): + # Creates an Nginx config file. This config snippet can be included in + # nginx server{} blocks to block AI bots. + config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}" + return config + def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" @@ -178,6 +184,10 @@ def conversions(): file_name="./.htaccess", converter=json_to_htaccess, ) + update_file_if_changed( + file_name="./nginx-block-ai-bots.conf", + converter=json_to_nginx, + ) if __name__ == "__main__": From 5a312c5f4d1fcd89c17f4d6cb360ad7230857402 Mon Sep 17 00:00:00 2001 From: Thomas Leister Date: Thu, 27 Mar 2025 12:28:11 +0100 Subject: [PATCH 021/111] Mention Nginx config feature in README --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 30a85da..b984672 100644 --- a/README.md +++ b/README.md @@ -13,16 +13,19 @@ If you'd like to add information about a crawler to the list, please make a pull This repository provides the following files: - `robots.txt` - `.htaccess` +- `nginx-block-ai-bots.conf` `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). `.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server. Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. +`nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive. + ## Contributing -A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, and `.htaccess`. +A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess` and `nginx-block-ai-bots.conf`. You can run the tests by [installing](https://www.python.org/about/gettingstarted/) Python 3 and issuing: ```console From 4f3f4cd0dd0f421c2787b1336d37b8da06998882 Mon Sep 17 00:00:00 2001 From: Thomas Leister Date: Thu, 27 Mar 2025 12:28:50 +0100 Subject: [PATCH 022/111] Add assembled version of nginx-block-ai-bots.conf file --- nginx-block-ai-bots.conf | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 nginx-block-ai-bots.conf diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf new file mode 100644 index 0000000..ce30520 --- /dev/null +++ b/nginx-block-ai-bots.conf @@ -0,0 +1,3 @@ +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { + return 403; +} \ No newline at end of file From 7c3b5a2cb21f5404cf4e2af1acf8689ba77d7b06 Mon Sep 17 00:00:00 2001 From: Thomas Leister Date: Thu, 27 Mar 2025 16:12:18 +0100 Subject: [PATCH 023/111] Add tests for Nginx config generator --- code/test_files/nginx-block-ai-bots.conf | 3 +++ code/tests.py | 12 +++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 code/test_files/nginx-block-ai-bots.conf diff --git a/code/test_files/nginx-block-ai-bots.conf b/code/test_files/nginx-block-ai-bots.conf new file mode 100644 index 0000000..d1b559e --- /dev/null +++ b/code/test_files/nginx-block-ai-bots.conf @@ -0,0 +1,3 @@ +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") { + return 403; +} \ No newline at end of file diff --git a/code/tests.py b/code/tests.py index 94cbb47..61d69b4 100755 --- a/code/tests.py +++ b/code/tests.py @@ -4,7 +4,7 @@ import json import unittest -from robots import json_to_txt, json_to_table, json_to_htaccess +from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx class RobotsUnittestExtensions: def loadJson(self, pathname): @@ -50,6 +50,16 @@ class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions): robots_htaccess = json_to_htaccess(self.robots_dict) self.assertEqualsFile("test_files/.htaccess", robots_htaccess) +class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_nginx_generation(self): + robots_nginx = json_to_nginx(self.robots_dict) + self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx) + if __name__ == "__main__": import os From 68d1d93714bbe4931811f301c7030ca979d95b39 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 27 Mar 2025 19:29:30 +0000 Subject: [PATCH 024/111] Merge pull request #91 from deyigifts/perplexity-user Update perplexity bots --- .htaccess | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index 2313293..2f5d0e4 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/robots.txt b/robots.txt index 80c40e8..8c79fc2 100644 --- a/robots.txt +++ b/robots.txt @@ -35,6 +35,7 @@ User-agent: omgili User-agent: omgilibot User-agent: PanguBot User-agent: PerplexityBot +User-agent: Perplexity‑User User-agent: PetalBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index ce82047..0cc2264 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -36,7 +36,8 @@ | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | -| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | +| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 6851413c52b91b9729bbbfd75f84af364b490bde Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 27 Mar 2025 19:49:15 +0000 Subject: [PATCH 025/111] Merge pull request #94 from ThomasLeister/feature/implement-nginx-configuration-snippet-export Implement Nginx configuration snippet export --- nginx-block-ai-bots.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index ce30520..72d65ec 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file From ec18af76242c1b62bbbfc7e1df72098b423402a6 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 27 Mar 2025 12:51:22 -0700 Subject: [PATCH 026/111] Revert "Merge pull request #91 from deyigifts/perplexity-user" This reverts commit 68d1d93714bbe4931811f301c7030ca979d95b39. --- .htaccess | 2 +- robots.txt | 1 - table-of-bot-metrics.md | 3 +-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.htaccess b/.htaccess index 2f5d0e4..2313293 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/robots.txt b/robots.txt index 8c79fc2..80c40e8 100644 --- a/robots.txt +++ b/robots.txt @@ -35,7 +35,6 @@ User-agent: omgili User-agent: omgilibot User-agent: PanguBot User-agent: PerplexityBot -User-agent: Perplexity‑User User-agent: PetalBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 0cc2264..ce82047 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -36,8 +36,7 @@ | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | -| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | -| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From c249de99a317b54e8891f1682dbf514e7763986e Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 28 Mar 2025 00:54:28 +0000 Subject: [PATCH 027/111] Update from Dark Visitors --- robots.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/robots.json b/robots.json index eaac816..e907c8b 100644 --- a/robots.json +++ b/robots.json @@ -258,7 +258,7 @@ "frequency": "No information.", "description": "Crawls sites to surface as results in Perplexity." }, - "Perplexity‑User": { + "Perplexity\u2011User": { "operator": "[Perplexity](https://www.perplexity.ai/)", "respect": "[No](https://docs.perplexity.ai/guides/bots)", "function": "Used to answer queries at the request of users.", @@ -328,4 +328,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 5b8650b99b35ff2aa1aa9ae26183b312edc48d45 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 29 Mar 2025 00:54:10 +0000 Subject: [PATCH 028/111] Update from Dark Visitors --- .htaccess | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index 2313293..2f5d0e4 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/robots.txt b/robots.txt index 80c40e8..8c79fc2 100644 --- a/robots.txt +++ b/robots.txt @@ -35,6 +35,7 @@ User-agent: omgili User-agent: omgilibot User-agent: PanguBot User-agent: PerplexityBot +User-agent: Perplexity‑User User-agent: PetalBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index ce82047..0cc2264 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -36,7 +36,8 @@ | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | -| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | +| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From ae8f74c10cec97ec758bf39345ff20717302c665 Mon Sep 17 00:00:00 2001 From: Kyle Buckingham Date: Tue, 1 Apr 2025 15:22:04 -0700 Subject: [PATCH 029/111] Update robots.json --- robots.json | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/robots.json b/robots.json index e907c8b..f711d43 100644 --- a/robots.json +++ b/robots.json @@ -69,13 +69,6 @@ "frequency": "Only when prompted by a user.", "description": "Used by plugins in ChatGPT to answer queries based on user input." }, - "Claude-Web": { - "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "Unclear at this time.", - "function": "Scrapes data to train Anthropic's AI products.", - "frequency": "No information provided.", - "description": "Scrapes data to train LLMs and AI products offered by Anthropic." - }, "ClaudeBot": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", @@ -83,6 +76,20 @@ "frequency": "No information provided.", "description": "Scrapes data to train LLMs and AI products offered by Anthropic." }, + "Claude-User": { + "operator": "[Anthropic](https://www.anthropic.com)", + "respect": "Unclear at this time.", + "function": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent.", + "frequency": "No information provided.", + "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent." + }, + "Claude-SearchBot": { + "operator": "[Anthropic](https://www.anthropic.com)", + "respect": "Unclear at this time.", + "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.", + "frequency": "No information provided.", + "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses." + }, "cohere-ai": { "operator": "[Cohere](https://cohere.com)", "respect": "Unclear at this time.", @@ -328,4 +335,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 8dc36aa2e2bbc9b99bde043b593cdc6c9669f401 Mon Sep 17 00:00:00 2001 From: Kyle Buckingham Date: Tue, 1 Apr 2025 15:23:28 -0700 Subject: [PATCH 030/111] Update robots.txt --- robots.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/robots.txt b/robots.txt index 8c79fc2..e19468d 100644 --- a/robots.txt +++ b/robots.txt @@ -8,8 +8,9 @@ User-agent: Brightbot 1.0 User-agent: Bytespider User-agent: CCBot User-agent: ChatGPT-User -User-agent: Claude-Web User-agent: ClaudeBot +User-agent: Claude-User +User-agent: Claude-SearchBot User-agent: cohere-ai User-agent: cohere-training-data-crawler User-agent: Crawlspace From 6b0349f37ddf69ef9ec0e09a884b351f4a0e4b43 Mon Sep 17 00:00:00 2001 From: Frederic Barthelemy Date: Fri, 4 Apr 2025 15:20:30 -0700 Subject: [PATCH 031/111] fix python complaining about f-string syntax ``` python code/tests.py Traceback (most recent call last): File "/Users/fbarthelemy/Code/ai.robots.txt/code/tests.py", line 7, in from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx File "/Users/fbarthelemy/Code/ai.robots.txt/code/robots.py", line 144 return f"({"|".join(map(re.escape, lst))})" ^ SyntaxError: f-string: expecting '}' ``` --- code/robots.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code/robots.py b/code/robots.py index f58f2b8..90c0e8c 100755 --- a/code/robots.py +++ b/code/robots.py @@ -141,7 +141,8 @@ def json_to_table(robots_json): def list_to_pcre(lst): # Python re is not 100% identical to PCRE which is used by Apache, but it # should probably be close enough in the real world for re.escape to work. - return f"({"|".join(map(re.escape, lst))})" + formatted = "|".join(map(re.escape, lst)) + return f"({formatted})" def json_to_htaccess(robot_json): From 5f5a89c38c27b676c3212f6ea3895d31f315f37e Mon Sep 17 00:00:00 2001 From: Frederic Barthelemy Date: Fri, 4 Apr 2025 17:34:14 -0700 Subject: [PATCH 032/111] Fix html-mangled hyphen in Perplexity-Users Fixes: #99 --- .htaccess | 2 +- code/robots.py | 15 +++++++++++++++ code/test_files/.htaccess | 2 +- code/test_files/nginx-block-ai-bots.conf | 2 +- code/test_files/robots.json | 7 +++++++ code/test_files/robots.txt | 1 + code/test_files/table-of-bot-metrics.md | 1 + code/tests.py | 5 +++++ nginx-block-ai-bots.conf | 2 +- robots.json | 14 +++++++------- robots.txt | 2 +- table-of-bot-metrics.md | 2 +- 12 files changed, 42 insertions(+), 13 deletions(-) diff --git a/.htaccess b/.htaccess index 2f5d0e4..27a7e11 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/code/robots.py b/code/robots.py index 90c0e8c..d158b36 100755 --- a/code/robots.py +++ b/code/robots.py @@ -50,6 +50,7 @@ def updated_robots_json(soup): continue for agent in section.find_all("a", href=True): name = agent.find("div", {"class": "agent-name"}).get_text().strip() + name = clean_robot_name(name) desc = agent.find("p").get_text().strip() default_values = { @@ -101,6 +102,20 @@ def updated_robots_json(soup): return sorted_robots +def clean_robot_name(name): + """ Clean the robot name by removing some characters that were mangled by html software once. """ + # This was specifically spotted in "Perplexity-User" + # Looks like a non-breaking hyphen introduced by the HTML rendering software + # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots + # You can see the bot is listed several times as "Perplexity‑User" with a normal hyphen, + # and it's only the Row-Heading that has the special hyphen + # + # Technically, there's no reason there wouldn't someday be a bot that + # actually uses a non-breaking hyphen, but that seems unlikely, + # so this solution should be fine for now. + return re.sub(r"\u2011", "-", name) + + def ingest_darkvisitors(): old_robots_json = load_robots_json() soup = get_agent_soup() diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess index 7e39092..f0d6783 100644 --- a/code/test_files/.htaccess +++ b/code/test_files/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/code/test_files/nginx-block-ai-bots.conf b/code/test_files/nginx-block-ai-bots.conf index d1b559e..c569b15 100644 --- a/code/test_files/nginx-block-ai-bots.conf +++ b/code/test_files/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") { return 403; } \ No newline at end of file diff --git a/code/test_files/robots.json b/code/test_files/robots.json index b0cbfbb..385f284 100644 --- a/code/test_files/robots.json +++ b/code/test_files/robots.json @@ -223,6 +223,13 @@ "operator": "[Webz.io](https://webz.io/)", "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)" }, + "Perplexity-User": { + "operator": "[Perplexity](https://www.perplexity.ai/)", + "respect": "[No](https://docs.perplexity.ai/guides/bots)", + "function": "Used to answer queries at the request of users.", + "frequency": "Only when prompted by a user.", + "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response." + }, "PerplexityBot": { "operator": "[Perplexity](https://www.perplexity.ai/)", "respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)", diff --git a/code/test_files/robots.txt b/code/test_files/robots.txt index 03c3c25..ee201f8 100644 --- a/code/test_files/robots.txt +++ b/code/test_files/robots.txt @@ -30,6 +30,7 @@ User-agent: Meta-ExternalFetcher User-agent: OAI-SearchBot User-agent: omgili User-agent: omgilibot +User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot User-agent: Scrapy diff --git a/code/test_files/table-of-bot-metrics.md b/code/test_files/table-of-bot-metrics.md index 88af6c0..9b280aa 100644 --- a/code/test_files/table-of-bot-metrics.md +++ b/code/test_files/table-of-bot-metrics.md @@ -32,6 +32,7 @@ | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | +| Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | diff --git a/code/tests.py b/code/tests.py index 61d69b4..f58b445 100755 --- a/code/tests.py +++ b/code/tests.py @@ -60,6 +60,11 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): robots_nginx = json_to_nginx(self.robots_dict) self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx) +class TestRobotsNameCleaning(unittest.TestCase): + def test_clean_name(self): + from robots import clean_robot_name + + self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User") if __name__ == "__main__": import os diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 72d65ec..0577bd9 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.json b/robots.json index e907c8b..8fd7572 100644 --- a/robots.json +++ b/robots.json @@ -251,6 +251,13 @@ "frequency": "Unclear at this time.", "description": "PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot" }, + "Perplexity-User": { + "operator": "[Perplexity](https://www.perplexity.ai/)", + "respect": "[No](https://docs.perplexity.ai/guides/bots)", + "function": "Used to answer queries at the request of users.", + "frequency": "Only when prompted by a user.", + "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response." + }, "PerplexityBot": { "operator": "[Perplexity](https://www.perplexity.ai/)", "respect": "[Yes](https://docs.perplexity.ai/guides/bots)", @@ -258,13 +265,6 @@ "frequency": "No information.", "description": "Crawls sites to surface as results in Perplexity." }, - "Perplexity\u2011User": { - "operator": "[Perplexity](https://www.perplexity.ai/)", - "respect": "[No](https://docs.perplexity.ai/guides/bots)", - "function": "Used to answer queries at the request of users.", - "frequency": "Only when prompted by a user.", - "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response." - }, "PetalBot": { "description": "Operated by Huawei to provide search and AI assistant services.", "frequency": "No explicit frequency provided.", diff --git a/robots.txt b/robots.txt index 8c79fc2..c531918 100644 --- a/robots.txt +++ b/robots.txt @@ -34,8 +34,8 @@ User-agent: OAI-SearchBot User-agent: omgili User-agent: omgilibot User-agent: PanguBot +User-agent: Perplexity-User User-agent: PerplexityBot -User-agent: Perplexity‑User User-agent: PetalBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 0cc2264..d92df34 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -36,8 +36,8 @@ | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | +| Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | -| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From c6f308cbd0a00166f5085fa4adc98630c767e11e Mon Sep 17 00:00:00 2001 From: Frederic Barthelemy Date: Sat, 5 Apr 2025 09:01:52 -0700 Subject: [PATCH 033/111] PR Feedback: log special-case, comment consistency --- code/robots.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/code/robots.py b/code/robots.py index d158b36..86ea413 100755 --- a/code/robots.py +++ b/code/robots.py @@ -107,13 +107,16 @@ def clean_robot_name(name): # This was specifically spotted in "Perplexity-User" # Looks like a non-breaking hyphen introduced by the HTML rendering software # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots - # You can see the bot is listed several times as "Perplexity‑User" with a normal hyphen, + # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen, # and it's only the Row-Heading that has the special hyphen # # Technically, there's no reason there wouldn't someday be a bot that # actually uses a non-breaking hyphen, but that seems unlikely, # so this solution should be fine for now. - return re.sub(r"\u2011", "-", name) + result = re.sub(r"\u2011", "-", name) + if result != name: + print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.") + return result def ingest_darkvisitors(): From b65f45e408461560a32f44f05860f80655737467 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 10 Apr 2025 10:12:51 -0700 Subject: [PATCH 034/111] chore(robots.json): adds imgproxy crawler --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 8fd7572..4c9f7d7 100644 --- a/robots.json +++ b/robots.json @@ -195,6 +195,13 @@ "operator": "[img2dataset](https://github.com/rom1504/img2dataset)", "respect": "Unclear at this time." }, + "imgproxy": { + "frequency": "No information.", + "function": "Not documented or explained on operator's site.", + "operator": "[imgproxy](https://imgproxy.net)", + "respect": "Unclear at this time.", + "description": "AI-powered image processing." + }, "ISSCyberRiskCrawler": { "description": "Used to train machine learning based models to quantify cyber risk.", "frequency": "No information.", @@ -328,4 +335,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 4a764bba18f10167cb5f7107c8721e5dc208100f Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 10 Apr 2025 19:22:34 +0000 Subject: [PATCH 035/111] Merge pull request #102 from ai-robots-txt/imgproxy-bot chore(robots.json): adds imgproxy crawler --- .htaccess | 2 +- nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index 27a7e11..c0e5fbb 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 0577bd9..a6bbfa2 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index c531918..de25a56 100644 --- a/robots.txt +++ b/robots.txt @@ -26,6 +26,7 @@ User-agent: iaskspider/2.0 User-agent: ICC-Crawler User-agent: ImagesiftBot User-agent: img2dataset +User-agent: imgproxy User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot User-agent: Meta-ExternalAgent diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index d92df34..b3e51fe 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -28,6 +28,7 @@ | ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | +| imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. | | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | From 305188b2e78855d4e7193f29a3e7205f96fa86f6 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 11 Apr 2025 00:55:52 +0000 Subject: [PATCH 036/111] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 4c9f7d7..eff38ac 100644 --- a/robots.json +++ b/robots.json @@ -335,4 +335,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From d9f882a9b21170754c4b37ff1bbc237171876684 Mon Sep 17 00:00:00 2001 From: Joshua Sheard Date: Mon, 14 Apr 2025 15:46:01 +0100 Subject: [PATCH 037/111] Include "AI Agents" from Dark Visitors --- code/robots.py | 1 + 1 file changed, 1 insertion(+) diff --git a/code/robots.py b/code/robots.py index 86ea413..8a06b55 100755 --- a/code/robots.py +++ b/code/robots.py @@ -30,6 +30,7 @@ def updated_robots_json(soup): """Update AI scraper information with data from darkvisitors.""" existing_content = load_robots_json() to_include = [ + "AI Agents", "AI Assistants", "AI Data Scrapers", "AI Search Crawlers", From a96e33098975edf1c05c8d9684b36b9fa31f7ef2 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 15 Apr 2025 00:57:01 +0000 Subject: [PATCH 038/111] Update from Dark Visitors --- robots.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/robots.json b/robots.json index eff38ac..8bba6b2 100644 --- a/robots.json +++ b/robots.json @@ -230,6 +230,13 @@ "frequency": "Unclear at this time.", "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" }, + "NovaAct": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Agents", + "frequency": "Unclear at this time.", + "description": "Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact" + }, "OAI-SearchBot": { "operator": "[OpenAI](https://openai.com)", "respect": "[Yes](https://platform.openai.com/docs/bots)", @@ -251,6 +258,13 @@ "operator": "[Webz.io](https://webz.io/)", "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)" }, + "Operator": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Agents", + "frequency": "Unclear at this time.", + "description": "Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator" + }, "PanguBot": { "operator": "the Chinese company Huawei", "respect": "Unclear at this time.", From e0cdb278fbd243f554579fe5050850f124b286a8 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 16 Apr 2025 00:57:11 +0000 Subject: [PATCH 039/111] Update from Dark Visitors --- .htaccess | 2 +- nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index c0e5fbb..d10e796 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index a6bbfa2..c37cef5 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index de25a56..1e3aa80 100644 --- a/robots.txt +++ b/robots.txt @@ -31,9 +31,11 @@ User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot User-agent: Meta-ExternalAgent User-agent: Meta-ExternalFetcher +User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili User-agent: omgilibot +User-agent: Operator User-agent: PanguBot User-agent: Perplexity-User User-agent: PerplexityBot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index b3e51fe..4c87b41 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -33,9 +33,11 @@ | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact | | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | +| Operator | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | From 4a6f37d72718aeb44d1d8cbcccb740ace3fe82d6 Mon Sep 17 00:00:00 2001 From: Kyle Buckingham Date: Wed, 16 Apr 2025 16:42:58 -0700 Subject: [PATCH 040/111] Update robots.json Co-authored-by: Glyn Normington --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index f711d43..ba052ae 100644 --- a/robots.json +++ b/robots.json @@ -78,7 +78,7 @@ }, "Claude-User": { "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "Unclear at this time.", + "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", "function": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent.", "frequency": "No information provided.", "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent." From fd41de8522536a25de71f37310a05e77d71a0792 Mon Sep 17 00:00:00 2001 From: Kyle Buckingham Date: Wed, 16 Apr 2025 16:43:03 -0700 Subject: [PATCH 041/111] Update robots.json Co-authored-by: Glyn Normington --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index ba052ae..ca6fc40 100644 --- a/robots.json +++ b/robots.json @@ -85,7 +85,7 @@ }, "Claude-SearchBot": { "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "Unclear at this time.", + "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.", "frequency": "No information provided.", "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses." From d05ede8fe164c5c3f47acecc1343e6d2ea5c294b Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Fri, 18 Apr 2025 17:46:56 +0100 Subject: [PATCH 042/111] Clarify our position on sponsorship Some firms, including those with .ai domains, have offered to sponsor this project. So make our position clear. --- FAQ.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/FAQ.md b/FAQ.md index 967cf41..487f784 100644 --- a/FAQ.md +++ b/FAQ.md @@ -55,3 +55,7 @@ That depends on your stack. ## How can I contribute? Open a pull request. It will be reviewed and acted upon appropriately. **We really appreciate contributions** — this is a community effort. + +## Can my company sponsor ai.robots.txt? + +No, thank you. We do not accept sponsorship of any kind. We prefer to maintain our independence. Our costs are negligible as we are entirely volunteer-based and community-driven. From b1856e6988a93bd834b228f121fa3524d11c7be7 Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Fri, 18 Apr 2025 18:40:44 +0100 Subject: [PATCH 043/111] Donations --- FAQ.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/FAQ.md b/FAQ.md index 487f784..7264819 100644 --- a/FAQ.md +++ b/FAQ.md @@ -56,6 +56,10 @@ That depends on your stack. Open a pull request. It will be reviewed and acted upon appropriately. **We really appreciate contributions** — this is a community effort. +## I'd like to donate money + +That's kind of you, but we don't need your money. If you insist, we'd love you to make a donation to the [American Civil Liberties Union](https://www.aclu.org/), the [Disasters Emergency Committee](https://www.dec.org.uk/), or a similar organisation. + ## Can my company sponsor ai.robots.txt? No, thank you. We do not accept sponsorship of any kind. We prefer to maintain our independence. Our costs are negligible as we are entirely volunteer-based and community-driven. From 33c5ce1326367abecccad23742779783c10c36a1 Mon Sep 17 00:00:00 2001 From: Dennis Lee Date: Mon, 21 Apr 2025 18:55:11 +0100 Subject: [PATCH 044/111] Update robots.json Updated robots list with five new proposed AI bots: aiHitBot Cotoyogi Factset_spyderbot FirecrawlAgent TikTokSpider --- robots.json | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 8bba6b2..698b31e 100644 --- a/robots.json +++ b/robots.json @@ -13,6 +13,13 @@ "operator": "[Ai2](https://allenai.org/crawler)", "respect": "Yes" }, + "aiHitBot": { + "operator": "[aiHit](https://www.aihitdata.com/about)", + "respect": "Yes", + "function": "A massive, artificial intelligence/machine learning, automated system.", + "frequency": "No information provided.", + "description": "Scrapes data for AI systems." + }, "Amazonbot": { "operator": "Amazon", "respect": "Yes", @@ -97,6 +104,13 @@ "frequency": "Unclear at this time.", "description": "cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler" }, + "Cotoyogi": { + "operator": "[ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/)", + "respect": "Yes", + "function": "AI LLM Scraper.", + "frequency": "No information provided.", + "description": "Scrapes data for AI training in Japanese language." + }, "Crawlspace": { "operator": "[Crawlspace](https://crawlspace.dev)", "respect": "[Yes](https://news.ycombinator.com/item?id=42756654)", @@ -125,6 +139,20 @@ "frequency": "Up to 1 page per second", "description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically." }, + "Factset_spyderbot": { + "operator": "[Factset](https://www.factset.com/ai)", + "respect": "Unclear at this time.", + "function": "AI model training.", + "frequency": "No information provided.", + "description": "Scrapes data for AI training." + }, + "FirecrawlAgent": { + "operator": "[Firecrawl](https://www.firecrawl.dev/)", + "respect": "Yes", + "function": "AI scraper and LLM training", + "frequency": "No information provided.", + "description": "Scrapes data for AI systems and LLM training." + }, "FriendlyCrawler": { "description": "Unclear who the operator is; but data is used for training/machine learning.", "frequency": "Unclear at this time.", @@ -321,6 +349,13 @@ "operator": "[Sidetrade](https://www.sidetrade.com)", "respect": "Unclear at this time." }, + "TikTokSpider": { + "operator": "ByteDance", + "respect": "Unclear at this time.", + "function": "LLM training.", + "frequency": "Unclear at this time.", + "description": "Downloads data to train LLMS, as per Bytespider." + }, "Timpibot": { "operator": "[Timpi](https://timpi.io)", "respect": "Unclear at this time.", @@ -349,4 +384,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From bbec639c14f3e7258729718dd2c6fc0b1734a9b1 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 22 Apr 2025 14:50:26 +0000 Subject: [PATCH 045/111] Merge pull request #109 from dennislee1/patch-1 AI bots to consider adding --- .htaccess | 2 +- nginx-block-ai-bots.conf | 2 +- robots.txt | 5 +++++ table-of-bot-metrics.md | 5 +++++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index d10e796..b4ab72f 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index c37cef5..090275a 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 1e3aa80..1f8eaf2 100644 --- a/robots.txt +++ b/robots.txt @@ -1,5 +1,6 @@ User-agent: AI2Bot User-agent: Ai2Bot-Dolma +User-agent: aiHitBot User-agent: Amazonbot User-agent: anthropic-ai User-agent: Applebot @@ -12,10 +13,13 @@ User-agent: Claude-Web User-agent: ClaudeBot User-agent: cohere-ai User-agent: cohere-training-data-crawler +User-agent: Cotoyogi User-agent: Crawlspace User-agent: Diffbot User-agent: DuckAssistBot User-agent: FacebookBot +User-agent: Factset_spyderbot +User-agent: FirecrawlAgent User-agent: FriendlyCrawler User-agent: Google-Extended User-agent: GoogleOther @@ -44,6 +48,7 @@ User-agent: Scrapy User-agent: SemrushBot-OCOB User-agent: SemrushBot-SWA User-agent: Sidetrade indexer bot +User-agent: TikTokSpider User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 4c87b41..0249766 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -2,6 +2,7 @@ |------|----------|-----------------------|----------|------------------|-------------| | AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | | Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | +| aiHitBot | [aiHit](https://www.aihitdata.com/about) | Yes | A massive, artificial intelligence/machine learning, automated system. | No information provided. | Scrapes data for AI systems. | | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | | anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | @@ -14,10 +15,13 @@ | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | +| Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. | | Crawlspace | [Crawlspace](https://crawlspace.dev) | [Yes](https://news.ycombinator.com/item?id=42756654) | Scrapes data | Unclear at this time. | Provides crawling services for any purpose, probably including AI model training. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. | +| FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | | Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | @@ -46,6 +50,7 @@ | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | +| TikTokSpider | ByteDance | Unclear at this time. | LLM training. | Unclear at this time. | Downloads data to train LLMS, as per Bytespider. | | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | From 8d25a424d96dbf4a3cb12d4bb51929a764aa8f89 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Wed, 23 Apr 2025 00:56:52 +0000 Subject: [PATCH 046/111] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 698b31e..df9dcda 100644 --- a/robots.json +++ b/robots.json @@ -384,4 +384,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 9d846ced45cdd13c7ecc03353c6ec554b5f9015d Mon Sep 17 00:00:00 2001 From: maia Date: Thu, 24 Apr 2025 04:08:20 +0200 Subject: [PATCH 047/111] Update robots.json Lowercase meta-external* as that was not technically the UA for the bots, also removed a period in the "respect" for consistency --- robots.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/robots.json b/robots.json index df9dcda..f2cec91 100644 --- a/robots.json +++ b/robots.json @@ -244,14 +244,14 @@ "frequency": "Unclear at this time.", "description": "Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot" }, - "Meta-ExternalAgent": { + "meta-externalagent": { "operator": "[Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)", - "respect": "Yes.", + "respect": "Yes", "function": "Used to train models and improve products.", "frequency": "No information.", "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\"" }, - "Meta-ExternalFetcher": { + "meta-externalfetcher": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", "function": "AI Assistants", @@ -384,4 +384,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 4654e14e9c857a228289d3258835182838202503 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 24 Apr 2025 07:00:34 +0000 Subject: [PATCH 048/111] Merge pull request #112 from maiavixen/main Fixed meta-external* being titlecase, and removed period for consistency --- .htaccess | 2 +- nginx-block-ai-bots.conf | 2 +- robots.txt | 4 ++-- table-of-bot-metrics.md | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.htaccess b/.htaccess index b4ab72f..a97f98a 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 090275a..3320071 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 1f8eaf2..53291ca 100644 --- a/robots.txt +++ b/robots.txt @@ -33,8 +33,8 @@ User-agent: img2dataset User-agent: imgproxy User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot -User-agent: Meta-ExternalAgent -User-agent: Meta-ExternalFetcher +User-agent: meta-externalagent +User-agent: meta-externalfetcher User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 0249766..5c093b8 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -35,8 +35,8 @@ | imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. | | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | -| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | -| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact | | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | From 934ac7b31864d0cf0b21bb1580ddea97ec8b4994 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 25 Apr 2025 00:56:57 +0000 Subject: [PATCH 049/111] Update from Dark Visitors --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index f2cec91..e15a196 100644 --- a/robots.json +++ b/robots.json @@ -251,6 +251,13 @@ "frequency": "No information.", "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\"" }, + "Meta-ExternalAgent": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent" + }, "meta-externalfetcher": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", @@ -258,6 +265,13 @@ "frequency": "Unclear at this time.", "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" }, + "Meta-ExternalFetcher": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Assistants", + "frequency": "Unclear at this time.", + "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" + }, "NovaAct": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", @@ -384,4 +398,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From c6c7f1748f1e28053184539a70a6a08f5aeabc37 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 26 Apr 2025 00:55:12 +0000 Subject: [PATCH 050/111] Update from Dark Visitors --- .htaccess | 2 +- nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index a97f98a..586adab 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 3320071..fc58d61 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 53291ca..232e119 100644 --- a/robots.txt +++ b/robots.txt @@ -34,7 +34,9 @@ User-agent: imgproxy User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot User-agent: meta-externalagent +User-agent: Meta-ExternalAgent User-agent: meta-externalfetcher +User-agent: Meta-ExternalFetcher User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 5c093b8..4dd6076 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -36,7 +36,9 @@ | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta\-ExternalAgent | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent | | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact | | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | From 50e739dd738bb821018a863491b770dd8ee61155 Mon Sep 17 00:00:00 2001 From: Rik Wijnen Date: Mon, 28 Apr 2025 08:42:52 +0200 Subject: [PATCH 051/111] HAProxy converter added. --- README.md | 9 +++++++ code/robots.py | 9 +++++++ haproxy-block-ai-bots.txt | 57 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 haproxy-block-ai-bots.txt diff --git a/README.md b/README.md index b984672..1f1eff6 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ This repository provides the following files: - `robots.txt` - `.htaccess` - `nginx-block-ai-bots.conf` +- `haproxy-block-ai-bots.txt` `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). @@ -22,6 +23,14 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive. +`haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it; +1. Add the file to the config directory of HAProxy +2. Add the following lines in the `frontend` section; + ``` + acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt + http-request deny if ai_robot + ``` + (Note that the path of the `haproxy-block-ai-bots.txt` may be different on your environment.) ## Contributing diff --git a/code/robots.py b/code/robots.py index 8a06b55..da157c1 100755 --- a/code/robots.py +++ b/code/robots.py @@ -178,6 +178,11 @@ def json_to_nginx(robot_json): config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}" return config +def json_to_haproxy(robots_json): + # Creates a source file for HAProxy. Follow instructions in the README to implement it. + txt = "\n".join(f"{k}" for k in robots_json.keys()) + return txt + def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" @@ -208,6 +213,10 @@ def conversions(): file_name="./nginx-block-ai-bots.conf", converter=json_to_nginx, ) + update_file_if_changed( + file_name="./haproxy-block-ai-bots.txt", + converter=json_to_haproxy, + ) if __name__ == "__main__": diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt new file mode 100644 index 0000000..3c326bd --- /dev/null +++ b/haproxy-block-ai-bots.txt @@ -0,0 +1,57 @@ +AI2Bot +Ai2Bot-Dolma +aiHitBot +Amazonbot +anthropic-ai +Applebot +Applebot-Extended +Brightbot 1.0 +Bytespider +CCBot +ChatGPT-User +Claude-Web +ClaudeBot +cohere-ai +cohere-training-data-crawler +Cotoyogi +Crawlspace +Diffbot +DuckAssistBot +FacebookBot +Factset_spyderbot +FirecrawlAgent +FriendlyCrawler +Google-Extended +GoogleOther +GoogleOther-Image +GoogleOther-Video +GPTBot +iaskspider/2.0 +ICC-Crawler +ImagesiftBot +img2dataset +imgproxy +ISSCyberRiskCrawler +Kangaroo Bot +meta-externalagent +Meta-ExternalAgent +meta-externalfetcher +Meta-ExternalFetcher +NovaAct +OAI-SearchBot +omgili +omgilibot +Operator +PanguBot +Perplexity-User +PerplexityBot +PetalBot +Scrapy +SemrushBot-OCOB +SemrushBot-SWA +Sidetrade indexer bot +TikTokSpider +Timpibot +VelenPublicWebCrawler +Webzio-Extended +YouBot \ No newline at end of file From 66da70905f503239faeb0e49204776f508928048 Mon Sep 17 00:00:00 2001 From: Rik Wijnen Date: Mon, 28 Apr 2025 09:09:40 +0200 Subject: [PATCH 052/111] Fixed incorrect English sentence. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1f1eff6..ff124e3 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt http-request deny if ai_robot ``` - (Note that the path of the `haproxy-block-ai-bots.txt` may be different on your environment.) + (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.) ## Contributing From a4a9f2ac2b9116d104789664231af4017d3828a7 Mon Sep 17 00:00:00 2001 From: Rik Wijnen Date: Mon, 28 Apr 2025 09:30:26 +0200 Subject: [PATCH 053/111] Tests for HAProxy file added. --- code/test_files/haproxy-block-ai-bots.txt | 47 +++++++++++++++++++++++ code/tests.py | 12 +++++- 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 code/test_files/haproxy-block-ai-bots.txt diff --git a/code/test_files/haproxy-block-ai-bots.txt b/code/test_files/haproxy-block-ai-bots.txt new file mode 100644 index 0000000..5ed6939 --- /dev/null +++ b/code/test_files/haproxy-block-ai-bots.txt @@ -0,0 +1,47 @@ +AI2Bot +Ai2Bot-Dolma +Amazonbot +anthropic-ai +Applebot +Applebot-Extended +Bytespider +CCBot +ChatGPT-User +Claude-Web +ClaudeBot +cohere-ai +Diffbot +FacebookBot +facebookexternalhit +FriendlyCrawler +Google-Extended +GoogleOther +GoogleOther-Image +GoogleOther-Video +GPTBot +iaskspider/2.0 +ICC-Crawler +ImagesiftBot +img2dataset +ISSCyberRiskCrawler +Kangaroo Bot +Meta-ExternalAgent +Meta-ExternalFetcher +OAI-SearchBot +omgili +omgilibot +Perplexity-User +PerplexityBot +PetalBot +Scrapy +Sidetrade indexer bot +Timpibot +VelenPublicWebCrawler +Webzio-Extended +YouBot +crawler.with.dots +star***crawler +Is this a crawler? +a[mazing]{42}(robot) +2^32$ +curl|sudo bash \ No newline at end of file diff --git a/code/tests.py b/code/tests.py index f58b445..e179c44 100755 --- a/code/tests.py +++ b/code/tests.py @@ -4,7 +4,7 @@ import json import unittest -from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx +from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy class RobotsUnittestExtensions: def loadJson(self, pathname): @@ -60,6 +60,16 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): robots_nginx = json_to_nginx(self.robots_dict) self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx) +class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_haproxy_generation(self): + robots_haproxy = json_to_haproxy(self.robots_dict) + self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy) + class TestRobotsNameCleaning(unittest.TestCase): def test_clean_name(self): from robots import clean_robot_name From 1310dbae4656e212ff01e7d8530d78c76dfd5a9f Mon Sep 17 00:00:00 2001 From: Crazyroostereye <63781667+Crazyroostereye1@users.noreply.github.com> Date: Thu, 1 May 2025 12:21:32 +0200 Subject: [PATCH 054/111] Added a Caddyfile converter (#110) Co-authored-by: Julian Beittel Co-authored-by: Glyn Normington --- Caddyfile | 3 +++ README.md | 3 +++ code/robots.py | 12 ++++++++++++ code/test_files/Caddyfile | 3 +++ code/tests.py | 13 ++++++++++++- 5 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 Caddyfile create mode 100644 code/test_files/Caddyfile diff --git a/Caddyfile b/Caddyfile new file mode 100644 index 0000000..1857d75 --- /dev/null +++ b/Caddyfile @@ -0,0 +1,3 @@ +@aibots { + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" +} \ No newline at end of file diff --git a/README.md b/README.md index ff124e3..8d7bfb1 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ This repository provides the following files: - `robots.txt` - `.htaccess` - `nginx-block-ai-bots.conf` +- `Caddyfile` - `haproxy-block-ai-bots.txt` `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). @@ -23,6 +24,8 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive. +`Caddyfile` includes a Header Regex matcher group you can copy or import into your Caddyfile, the rejection can then be handled as followed `abort @aibots` + `haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it; 1. Add the file to the config directory of HAProxy 2. Add the following lines in the `frontend` section; diff --git a/code/robots.py b/code/robots.py index da157c1..054c2be 100755 --- a/code/robots.py +++ b/code/robots.py @@ -178,12 +178,20 @@ def json_to_nginx(robot_json): config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}" return config + +def json_to_caddy(robot_json): + caddyfile = "@aibots {\n " + caddyfile += f' header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"' + caddyfile += "\n}" + return caddyfile + def json_to_haproxy(robots_json): # Creates a source file for HAProxy. Follow instructions in the README to implement it. txt = "\n".join(f"{k}" for k in robots_json.keys()) return txt + def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" new_content = converter(load_robots_json()) @@ -213,6 +221,10 @@ def conversions(): file_name="./nginx-block-ai-bots.conf", converter=json_to_nginx, ) + update_file_if_changed( + file_name="./Caddyfile", + converter=json_to_caddy + update_file_if_changed( file_name="./haproxy-block-ai-bots.txt", converter=json_to_haproxy, diff --git a/code/test_files/Caddyfile b/code/test_files/Caddyfile new file mode 100644 index 0000000..82f365a --- /dev/null +++ b/code/test_files/Caddyfile @@ -0,0 +1,3 @@ +@aibots { + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)" +} \ No newline at end of file diff --git a/code/tests.py b/code/tests.py index e179c44..434406f 100755 --- a/code/tests.py +++ b/code/tests.py @@ -4,7 +4,7 @@ import json import unittest -from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy +from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy class RobotsUnittestExtensions: def loadJson(self, pathname): @@ -76,6 +76,17 @@ class TestRobotsNameCleaning(unittest.TestCase): self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User") +class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_caddyfile_generation(self): + robots_caddyfile = json_to_caddy(self.robots_dict) + self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile) + + if __name__ == "__main__": import os os.chdir(os.path.dirname(__file__)) From ec995cd686a09b4af1c6a59d95e1ced122f1d5fc Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Thu, 1 May 2025 11:27:40 +0100 Subject: [PATCH 055/111] Fix Python syntax error --- code/robots.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code/robots.py b/code/robots.py index 054c2be..c795649 100755 --- a/code/robots.py +++ b/code/robots.py @@ -223,7 +223,8 @@ def conversions(): ) update_file_if_changed( file_name="./Caddyfile", - converter=json_to_caddy + converter=json_to_caddy, + ) update_file_if_changed( file_name="./haproxy-block-ai-bots.txt", From 678380727e8685af8c5311bcfa1f55c7aa866d3b Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 1 May 2025 10:29:06 +0000 Subject: [PATCH 056/111] Merge pull request #115 from glyn/syntax Fix Python syntax error --- Caddyfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Caddyfile b/Caddyfile index 1857d75..0e10cfa 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file From 36a52a88d8e3832091d73062ef268acb46f6e031 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 12 May 2025 20:20:18 -0700 Subject: [PATCH 057/111] Bing AI opt-out instructions --- README.md | 2 ++ docs/additional-steps/bing.md | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 docs/additional-steps/bing.md diff --git a/README.md b/README.md index 8d7bfb1..28ef743 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,8 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ ``` (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.) +[Bing uses the data it crawls for AI and training, you may opt out by adding a `meta` tag to the `head` of your site.]((./docs/additional-steps/bing.md)) + ## Contributing A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess` and `nginx-block-ai-bots.conf`. diff --git a/docs/additional-steps/bing.md b/docs/additional-steps/bing.md new file mode 100644 index 0000000..37c60c7 --- /dev/null +++ b/docs/additional-steps/bing.md @@ -0,0 +1,36 @@ +# Bing (bingbot) + +It's not well publicised, but Bing uses the data it crawls for AI and training. + +However, the current thinking is, blocking a search engine of this size using `robots.txt` seems a quite drastic approach as it is second only to Google and could significantly impact your website in search results. + +Additionally, Bing powers a number of search engines such as Yahoo and AOL, and its search results are also used in Duck Duck Go, amongst others. + +Fortunately, Bing supports a relatively simple opt-out method, requiring an additional step. + +## How to opt-out of AI training + +You must add a metatag in the `` of your webpage. This also needs to be added to every page on your website. + +The line you need to add is: + +```plaintext + +``` + +By adding this line, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models." + +## Will my site be negatively affected + +Simple answer, no. +The original use of "noarchive" has been retired by all search engines. Google retired its use in 2024. + +The use of this metatag will not impact your site in search engines or in any other meaningful way if you add it to your page(s). + +It is now solely used by a handful of crawlers, such as Bingbot and Amazonbot, to signify to them not to use your data for AI/training. + +## Resources + +Bing Blog AI opt-out announcement: https://blogs.bing.com/webmaster/september-2023/Announcing-new-options-for-webmasters-to-control-usage-of-their-content-in-Bing-Chat + +Bing metatag information, including AI opt-out: https://www.bing.com/webmasters/help/which-robots-metatags-does-bing-support-5198d240 From b4610a725cac409b5c686d0f68383ea3f6daa818 Mon Sep 17 00:00:00 2001 From: Florent Poinsaut Date: Wed, 14 May 2025 14:11:56 +0200 Subject: [PATCH 058/111] Add Traefik plugin --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 8d7bfb1..80de135 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,12 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ ``` (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.) +### Related + +- [Robots.txt Traefik plugin](https://plugins.traefik.io/plugins/681b2f3fba3486128fc34fae/robots-txt-plugin): +middleware plugin for [Traefik](https://traefik.io/traefik/) to automatically add rules of [robots.txt](./robots.txt) +file on-the-fly. + ## Contributing A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess` and `nginx-block-ai-bots.conf`. From 9539256cb3116b626439bf79a776ea67b7aa2edd Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 14 May 2025 16:46:32 -0700 Subject: [PATCH 059/111] chore(robots.json): adds QualifiedBot crawler --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index e15a196..99e28d3 100644 --- a/robots.json +++ b/robots.json @@ -335,6 +335,13 @@ "operator": "[Huawei](https://huawei.com/)", "respect": "Yes" }, + "QualifiedBot": { + "description": "Operated by Qualified as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", + "operator": "[Qualified](https://www.qualified.com)", + "respect": "Unclear at this time.", + }, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", "frequency": "No information.", @@ -398,4 +405,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 0c56b96fd99bfcc736c4f64c0df9bb87a1fc6075 Mon Sep 17 00:00:00 2001 From: Joe Hoyle Date: Thu, 15 May 2025 11:26:47 -0400 Subject: [PATCH 060/111] Fix JSON syntax error --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 99e28d3..f518037 100644 --- a/robots.json +++ b/robots.json @@ -340,7 +340,7 @@ "frequency": "No explicit frequency provided.", "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", "operator": "[Qualified](https://www.qualified.com)", - "respect": "Unclear at this time.", + "respect": "Unclear at this time." }, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", From 1c470babbefed7b470443f6dd834e721c58481d6 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 15 May 2025 16:12:30 +0000 Subject: [PATCH 061/111] Merge pull request #123 from joehoyle/patch-1 Fix JSON syntax error --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 586adab..de88e50 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 0e10cfa..43ad3bf 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 3c326bd..9770b45 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -46,6 +46,7 @@ PanguBot Perplexity-User PerplexityBot PetalBot +QualifiedBot Scrapy SemrushBot-OCOB SemrushBot-SWA diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index fc58d61..afbd77c 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 232e119..dea4dd5 100644 --- a/robots.txt +++ b/robots.txt @@ -46,6 +46,7 @@ User-agent: PanguBot User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot +User-agent: QualifiedBot User-agent: Scrapy User-agent: SemrushBot-OCOB User-agent: SemrushBot-SWA diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 4dd6076..57469fa 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -48,6 +48,7 @@ | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 498aa50760fe3850820b46933bf87b82918e5803 Mon Sep 17 00:00:00 2001 From: Patrick Evans Date: Thu, 15 May 2025 11:10:06 -0500 Subject: [PATCH 062/111] lint robots.json during pull requests --- .github/workflows/run-tests.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index c98861f..042cc13 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -19,3 +19,10 @@ jobs: - name: Run tests run: | code/tests.py + lint-json: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + - name: JQ Json Lint + run: jq . robots.json From 16d1de70943e9d448d3d5e02e91d86e38dac80d7 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 16 May 2025 00:59:08 +0000 Subject: [PATCH 063/111] Update from Dark Visitors --- robots.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/robots.json b/robots.json index f518037..f2ed4c2 100644 --- a/robots.json +++ b/robots.json @@ -336,11 +336,11 @@ "respect": "Yes" }, "QualifiedBot": { - "description": "Operated by Qualified as part of their suite of AI product offerings.", - "frequency": "No explicit frequency provided.", - "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", - "operator": "[Qualified](https://www.qualified.com)", - "respect": "Unclear at this time." + "description": "Operated by Qualified as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", + "operator": "[Qualified](https://www.qualified.com)", + "respect": "Unclear at this time." }, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", @@ -405,4 +405,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 5fba0b746d550b6ae4d7c9605904b6ec102d0f98 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 15 May 2025 15:40:46 -0700 Subject: [PATCH 064/111] chore(robots.json): adds MistralAI-User/1.0 crawler --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index f518037..647e664 100644 --- a/robots.json +++ b/robots.json @@ -272,6 +272,13 @@ "frequency": "Unclear at this time.", "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" }, + "MistralAI-User/1.0": { + "operator": "Mistral AI", + "function": "Takes action based on user prompts.", + "frequency": "Only when prompted by a user.", + "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.", + "respect": "Yes" + }, "NovaAct": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", From ca918a963f735019a0c66343bf8338a9228d94f5 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 15 May 2025 21:16:49 -0700 Subject: [PATCH 065/111] chore(robots.json): adds Google-CloudVertexBot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index f2ed4c2..b459533 100644 --- a/robots.json +++ b/robots.json @@ -160,6 +160,13 @@ "operator": "Unknown", "respect": "[Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler)" }, + "Google-CloudVertexBot": { + "operator": "Google", + "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)", + "function": "Build and manage AI models for businesses employing Vertex AI", + "frequency": "No information.", + "description": "Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents." + }, "Google-Extended": { "operator": "Google", "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)", @@ -405,4 +412,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From dd1ed174b77ca2c0c4a40d6f4bce6beda4a1c296 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 16 May 2025 11:35:15 +0000 Subject: [PATCH 066/111] Merge pull request #129 from ai-robots-txt/google-cloudvertexbot chore(robots.json): adds Google-CloudVertexBot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index de88e50..b2204d7 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 43ad3bf..36fd20c 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 9770b45..7389f10 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -21,6 +21,7 @@ FacebookBot Factset_spyderbot FirecrawlAgent FriendlyCrawler +Google-CloudVertexBot Google-Extended GoogleOther GoogleOther-Image diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index afbd77c..f05f785 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index dea4dd5..a5be10b 100644 --- a/robots.txt +++ b/robots.txt @@ -21,6 +21,7 @@ User-agent: FacebookBot User-agent: Factset_spyderbot User-agent: FirecrawlAgent User-agent: FriendlyCrawler +User-agent: Google-CloudVertexBot User-agent: Google-Extended User-agent: GoogleOther User-agent: GoogleOther-Image diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 57469fa..d8542b3 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -23,6 +23,7 @@ | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. | | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | +| Google\-CloudVertexBot | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Build and manage AI models for businesses employing Vertex AI | No information. | Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents. | | Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | From 7a2e6cba52e782ab32552c2335637af761afbe51 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sat, 17 May 2025 00:57:28 +0000 Subject: [PATCH 067/111] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index b459533..1ecfcd8 100644 --- a/robots.json +++ b/robots.json @@ -412,4 +412,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 9297c7dfa3122109a6f3ae3ce18026e0e6c94ebe Mon Sep 17 00:00:00 2001 From: Mihitoko Date: Mon, 19 May 2025 23:56:57 +0200 Subject: [PATCH 068/111] Mention X-Robots-Tag header as alternative for bing --- docs/additional-steps/bing.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/additional-steps/bing.md b/docs/additional-steps/bing.md index 37c60c7..f9afb78 100644 --- a/docs/additional-steps/bing.md +++ b/docs/additional-steps/bing.md @@ -10,15 +10,19 @@ Fortunately, Bing supports a relatively simple opt-out method, requiring an addi ## How to opt-out of AI training -You must add a metatag in the `` of your webpage. This also needs to be added to every page on your website. +You must add a metatag in the `` of your webpage or set the [X-Robots-Tag](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/X-Robots-Tag) HTTP header in your response. This also needs to be added to every page or response on your website. -The line you need to add is: +If using the metatag, the line you need to add is: ```plaintext ``` +Or include the HTTP response header: +```plaintext +X-Robots-Tag: noarchive +``` -By adding this line, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models." +By adding this line or header, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models." ## Will my site be negatively affected From 8a8001cbece8a5607163bed6eb83d5bb35bb24e5 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Tue, 20 May 2025 13:55:25 -0700 Subject: [PATCH 069/111] chore(README): updates the opening line of our README to clarify the types of agents we block --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 232b3ed..307f005 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ -This is an open list of web crawlers associated with AI companies and the training of LLMs to block. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md). +This list contains AI-related crawlers of all types, regardless of purpose. Users should consult [the table of bot metrics](./table-of-bot-metrics.md) to guide the implementation of their list. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md). A number of these crawlers have been sourced from [Dark Visitors](https://darkvisitors.com) and we appreciate the ongoing effort they put in to track these crawlers. From 8b151b2cdc6ef8e949fd59d26d7456bcbb60d4e7 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 21 May 2025 06:52:36 -0700 Subject: [PATCH 070/111] Update README.md Co-authored-by: Glyn Normington --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 307f005..f427af4 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ -This list contains AI-related crawlers of all types, regardless of purpose. Users should consult [the table of bot metrics](./table-of-bot-metrics.md) to guide the implementation of their list. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md). +This list contains AI-related crawlers of all types, regardless of purpose. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md). A number of these crawlers have been sourced from [Dark Visitors](https://darkvisitors.com) and we appreciate the ongoing effort they put in to track these crawlers. From 1c2acd75b7def13d9dc85233bfb4aaca8bcafd12 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 21 May 2025 15:27:26 +0000 Subject: [PATCH 071/111] Merge pull request #126 from ai-robots-txt/mistral-bot chore(robots.json): adds MistralAI-User/1.0 crawler --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index b2204d7..cc483c7 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 36fd20c..205acbd 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 7389f10..de5b4fb 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -38,6 +38,7 @@ meta-externalagent Meta-ExternalAgent meta-externalfetcher Meta-ExternalFetcher +MistralAI-User/1.0 NovaAct OAI-SearchBot omgili diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index f05f785..3274559 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index a5be10b..b3e16f8 100644 --- a/robots.txt +++ b/robots.txt @@ -38,6 +38,7 @@ User-agent: meta-externalagent User-agent: Meta-ExternalAgent User-agent: meta-externalfetcher User-agent: Meta-ExternalFetcher +User-agent: MistralAI-User/1.0 User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index d8542b3..84c69f5 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -40,6 +40,7 @@ | Meta\-ExternalAgent | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent | | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| MistralAI\-User/1\.0 | Mistral AI | Yes | Takes action based on user prompts. | Only when prompted by a user. | MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response. | | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact | | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | From b1d9a60a38c04cac81dd156ad73ddef7eb60b50b Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 21 May 2025 11:40:33 -0700 Subject: [PATCH 072/111] chore(robots.json): adds wpbot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index bddefdd..ed7d63d 100644 --- a/robots.json +++ b/robots.json @@ -412,6 +412,13 @@ "frequency": "Unclear at this time.", "description": "Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended" }, + "wpbot": { + "operator": "[QuantumCloud](https://www.quantumcloud.com)", + "respect": "Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9)", + "function": "Live chat support and lead generation.", + "frequency": "Unclear at this time.", + "description": "wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support." + }, "YouBot": { "operator": "[You](https://about.you.com/youchat/)", "respect": "[Yes](https://about.you.com/youbot/)", @@ -419,4 +426,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 7c5389f4a0c5f60745e1a7552e142bd33a587d8f Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 21 May 2025 19:00:23 +0000 Subject: [PATCH 073/111] Merge pull request #98 from kylebuckingham/main Updating Claude Bots --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 3 ++- nginx-block-ai-bots.conf | 2 +- table-of-bot-metrics.md | 3 ++- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.htaccess b/.htaccess index cc483c7..2146722 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 205acbd..879426d 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index de5b4fb..8a9ccf9 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -9,8 +9,9 @@ Brightbot 1.0 Bytespider CCBot ChatGPT-User -Claude-Web ClaudeBot +Claude-User +Claude-SearchBot cohere-ai cohere-training-data-crawler Cotoyogi diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 3274559..5f96718 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 84c69f5..e6f35a4 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -11,8 +11,9 @@ | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | | ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude\-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | +| Claude\-SearchBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | No information provided. | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | | Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. | From fedb658cc08225d71a6b0f32c9c2859b7420f0ee Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 21 May 2025 21:06:05 +0000 Subject: [PATCH 074/111] Merge pull request #133 from ai-robots-txt/wpbot chore(robots.json): adds wpbot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 2146722..3337284 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 879426d..2001edc 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 8a9ccf9..377710b 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -58,4 +58,5 @@ TikTokSpider Timpibot VelenPublicWebCrawler Webzio-Extended +wpbot YouBot \ No newline at end of file diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 5f96718..ba1f8c6 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 8690e50..92e527b 100644 --- a/robots.txt +++ b/robots.txt @@ -58,5 +58,6 @@ User-agent: TikTokSpider User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended +User-agent: wpbot User-agent: YouBot Disallow: / diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index e6f35a4..c795559 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -60,4 +60,5 @@ | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| wpbot | [QuantumCloud](https://www.quantumcloud.com) | Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9) | Live chat support and lead generation. | Unclear at this time. | wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support. | | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | From 7bf7f9164d55a58e0d6080dff52eea3ed5d3584e Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Thu, 22 May 2025 00:58:45 +0000 Subject: [PATCH 075/111] Update from Dark Visitors --- robots.json | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/robots.json b/robots.json index 9dce781..06187ae 100644 --- a/robots.json +++ b/robots.json @@ -76,12 +76,12 @@ "frequency": "Only when prompted by a user.", "description": "Used by plugins in ChatGPT to answer queries based on user input." }, - "ClaudeBot": { + "Claude-SearchBot": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", - "function": "Scrapes data to train Anthropic's AI products.", + "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.", "frequency": "No information provided.", - "description": "Scrapes data to train LLMs and AI products offered by Anthropic." + "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses." }, "Claude-User": { "operator": "[Anthropic](https://www.anthropic.com)", @@ -90,12 +90,19 @@ "frequency": "No information provided.", "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent." }, - "Claude-SearchBot": { + "Claude-Web": { + "operator": "Anthropic", + "respect": "Unclear at this time.", + "function": "Undocumented AI Agents", + "frequency": "Unclear at this time.", + "description": "Claude-Web is an AI-related agent operated by Anthropic. It's currently unclear exactly what it's used for, since there's no official documentation. If you can provide more detail, please contact us. More info can be found at https://darkvisitors.com/agents/agents/claude-web" + }, + "ClaudeBot": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", - "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.", + "function": "Scrapes data to train Anthropic's AI products.", "frequency": "No information provided.", - "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses." + "description": "Scrapes data to train LLMs and AI products offered by Anthropic." }, "cohere-ai": { "operator": "[Cohere](https://cohere.com)", @@ -287,11 +294,11 @@ "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" }, "MistralAI-User/1.0": { - "operator": "Mistral AI", - "function": "Takes action based on user prompts.", - "frequency": "Only when prompted by a user.", - "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.", - "respect": "Yes" + "operator": "Mistral AI", + "function": "Takes action based on user prompts.", + "frequency": "Only when prompted by a user.", + "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.", + "respect": "Yes" }, "NovaAct": { "operator": "Unclear at this time.", @@ -433,4 +440,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 093ab81d789528bb5d89c2d2c708b8f157e3b795 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 23 May 2025 00:58:57 +0000 Subject: [PATCH 076/111] Update from Dark Visitors --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 5 +++-- nginx-block-ai-bots.conf | 2 +- robots.txt | 5 +++-- table-of-bot-metrics.md | 5 +++-- 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.htaccess b/.htaccess index 3337284..26c6e72 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 2001edc..7a1076c 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 377710b..8ef373b 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -9,9 +9,10 @@ Brightbot 1.0 Bytespider CCBot ChatGPT-User -ClaudeBot -Claude-User Claude-SearchBot +Claude-User +Claude-Web +ClaudeBot cohere-ai cohere-training-data-crawler Cotoyogi diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index ba1f8c6..a691c55 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 92e527b..3330b20 100644 --- a/robots.txt +++ b/robots.txt @@ -9,9 +9,10 @@ User-agent: Brightbot 1.0 User-agent: Bytespider User-agent: CCBot User-agent: ChatGPT-User -User-agent: ClaudeBot -User-agent: Claude-User User-agent: Claude-SearchBot +User-agent: Claude-User +User-agent: Claude-Web +User-agent: ClaudeBot User-agent: cohere-ai User-agent: cohere-training-data-crawler User-agent: Cotoyogi diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index c795559..0e6a88e 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -11,9 +11,10 @@ | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | | ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | | Claude\-SearchBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | No information provided. | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | +| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | +| Claude\-Web | Anthropic | Unclear at this time. | Undocumented AI Agents | Unclear at this time. | Claude-Web is an AI-related agent operated by Anthropic. It's currently unclear exactly what it's used for, since there's no official documentation. If you can provide more detail, please contact us. More info can be found at https://darkvisitors.com/agents/agents/claude-web | +| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | | Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. | From 3e8edd083e32e01bd5cf0629f108815092d5f7ec Mon Sep 17 00:00:00 2001 From: imp <80153024+not-not-the-imp@users.noreply.github.com> Date: Fri, 23 May 2025 13:03:49 +0100 Subject: [PATCH 077/111] Add AndiBot and PhindBot Fixes #75 --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 06187ae..8a8432b 100644 --- a/robots.json +++ b/robots.json @@ -20,6 +20,13 @@ "frequency": "No information provided.", "description": "Scrapes data for AI systems." }, + "Andibot": { + "operator": "[Andi](https://andisearch.com/)", + "respect": "Unclear at this time", + "function": "search engine using generative AI, AI Search Assistant", + "frequency": "No information provided.", + "description": "Scrapes website and provide genAI summary ." + }, "Amazonbot": { "operator": "Amazon", "respect": "Yes", @@ -363,6 +370,13 @@ "operator": "[Huawei](https://huawei.com/)", "respect": "Yes" }, + "PhindBot": { + "description": "Company offers AI agent that use genAI and generate extra web query on the fly", + "frequency": "No explicit frequency provided.", + "function": "AI-enhanced search engine.", + "operator": "[phind](https://www.phind.com/)", + "respect": "Unclear at this time." + }, "QualifiedBot": { "description": "Operated by Qualified as part of their suite of AI product offerings.", "frequency": "No explicit frequency provided.", @@ -440,4 +454,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From d22b9ec51ac14b0d9dfdd86a04cd566731c0e8c4 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sat, 31 May 2025 16:00:13 -0700 Subject: [PATCH 078/111] Update robots.json --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 8a8432b..9bf9fdd 100644 --- a/robots.json +++ b/robots.json @@ -23,7 +23,7 @@ "Andibot": { "operator": "[Andi](https://andisearch.com/)", "respect": "Unclear at this time", - "function": "search engine using generative AI, AI Search Assistant", + "function": "Search engine using generative AI, AI Search Assistant", "frequency": "No information provided.", "description": "Scrapes website and provide genAI summary ." }, From 4259b25cccbb4d0bff740261f1eb825fa86bf381 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sat, 31 May 2025 16:01:09 -0700 Subject: [PATCH 079/111] Update robots.json --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 9bf9fdd..50c2fae 100644 --- a/robots.json +++ b/robots.json @@ -25,7 +25,7 @@ "respect": "Unclear at this time", "function": "Search engine using generative AI, AI Search Assistant", "frequency": "No information provided.", - "description": "Scrapes website and provide genAI summary ." + "description": "Scrapes website and provides AI summary." }, "Amazonbot": { "operator": "Amazon", From 268922f8f2fa05b4eb4cc991116fce2700c09184 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sat, 31 May 2025 16:02:05 -0700 Subject: [PATCH 080/111] Update robots.json --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 50c2fae..6f3a05d 100644 --- a/robots.json +++ b/robots.json @@ -371,7 +371,7 @@ "respect": "Yes" }, "PhindBot": { - "description": "Company offers AI agent that use genAI and generate extra web query on the fly", + "description": "Company offers an AI agent that uses AI and generate extra web query on the fly", "frequency": "No explicit frequency provided.", "function": "AI-enhanced search engine.", "operator": "[phind](https://www.phind.com/)", From 1dd66b696963dd3f9a577aa4ca59d83c8bc41aef Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 2 Jun 2025 11:53:06 -0700 Subject: [PATCH 081/111] Revert "chore(robots.json): adds imgproxy crawler" This reverts commit b65f45e408461560a32f44f05860f80655737467. --- robots.json | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/robots.json b/robots.json index 6f3a05d..f730f50 100644 --- a/robots.json +++ b/robots.json @@ -251,13 +251,6 @@ "operator": "[img2dataset](https://github.com/rom1504/img2dataset)", "respect": "Unclear at this time." }, - "imgproxy": { - "frequency": "No information.", - "function": "Not documented or explained on operator's site.", - "operator": "[imgproxy](https://imgproxy.net)", - "respect": "Unclear at this time.", - "description": "AI-powered image processing." - }, "ISSCyberRiskCrawler": { "description": "Used to train machine learning based models to quantify cyber risk.", "frequency": "No information.", @@ -454,4 +447,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 899ce01c554359ecc66aa36bc2af367069175e10 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 2 Jun 2025 14:47:31 -0700 Subject: [PATCH 082/111] chore(ai_robots_update.yml): correct workflow by revising git flags + adding guard --- .github/workflows/ai_robots_update.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ai_robots_update.yml b/.github/workflows/ai_robots_update.yml index 7e11ce8..17c1cc8 100644 --- a/.github/workflows/ai_robots_update.yml +++ b/.github/workflows/ai_robots_update.yml @@ -20,7 +20,12 @@ jobs: echo "... done." git --no-pager diff git add -A - git diff --quiet && git diff --staged --quiet || (git commit -m "Update from Dark Visitors" && git push) + if ! git diff --cached --quiet; then + git commit -m "Update from Dark Visitors" + git push + else + echo "No changes to commit." + fi shell: bash convert: name: convert From 87016d15040f50630121666c40a6048df8a4169d Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 3 Jun 2025 01:00:29 +0000 Subject: [PATCH 083/111] Update from Dark Visitors --- robots.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/robots.json b/robots.json index f730f50..d8e8262 100644 --- a/robots.json +++ b/robots.json @@ -20,13 +20,6 @@ "frequency": "No information provided.", "description": "Scrapes data for AI systems." }, - "Andibot": { - "operator": "[Andi](https://andisearch.com/)", - "respect": "Unclear at this time", - "function": "Search engine using generative AI, AI Search Assistant", - "frequency": "No information provided.", - "description": "Scrapes website and provides AI summary." - }, "Amazonbot": { "operator": "Amazon", "respect": "Yes", @@ -34,6 +27,13 @@ "frequency": "No information provided.", "description": "Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses." }, + "Andibot": { + "operator": "[Andi](https://andisearch.com/)", + "respect": "Unclear at this time", + "function": "Search engine using generative AI, AI Search Assistant", + "frequency": "No information provided.", + "description": "Scrapes website and provides AI summary." + }, "anthropic-ai": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "Unclear at this time.", From d239e7e5ad0e9c72d17961c57deaf10c1feef899 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 3 Jun 2025 01:52:35 +0000 Subject: [PATCH 084/111] Merge pull request #139 from ai-robots-txt/workflow-fix chore(ai_robots_update.yml): correct workflow by revising git flags + adding guard --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 3 ++- nginx-block-ai-bots.conf | 2 +- robots.txt | 3 ++- table-of-bot-metrics.md | 3 ++- 6 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.htaccess b/.htaccess index 26c6e72..ddb7255 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 7a1076c..60ed4d3 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 8ef373b..6c23da0 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -2,6 +2,7 @@ AI2Bot Ai2Bot-Dolma aiHitBot Amazonbot +Andibot anthropic-ai Applebot Applebot-Extended @@ -33,7 +34,6 @@ iaskspider/2.0 ICC-Crawler ImagesiftBot img2dataset -imgproxy ISSCyberRiskCrawler Kangaroo Bot meta-externalagent @@ -50,6 +50,7 @@ PanguBot Perplexity-User PerplexityBot PetalBot +PhindBot QualifiedBot Scrapy SemrushBot-OCOB diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index a691c55..1c50815 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 3330b20..4b362e4 100644 --- a/robots.txt +++ b/robots.txt @@ -2,6 +2,7 @@ User-agent: AI2Bot User-agent: Ai2Bot-Dolma User-agent: aiHitBot User-agent: Amazonbot +User-agent: Andibot User-agent: anthropic-ai User-agent: Applebot User-agent: Applebot-Extended @@ -33,7 +34,6 @@ User-agent: iaskspider/2.0 User-agent: ICC-Crawler User-agent: ImagesiftBot User-agent: img2dataset -User-agent: imgproxy User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot User-agent: meta-externalagent @@ -50,6 +50,7 @@ User-agent: PanguBot User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot +User-agent: PhindBot User-agent: QualifiedBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 0e6a88e..737b217 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -4,6 +4,7 @@ | Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | | aiHitBot | [aiHit](https://www.aihitdata.com/about) | Yes | A massive, artificial intelligence/machine learning, automated system. | No information provided. | Scrapes data for AI systems. | | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | +| Andibot | [Andi](https://andisearch.com/) | Unclear at this time | Search engine using generative AI, AI Search Assistant | No information provided. | Scrapes website and provides AI summary. | | anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | | Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | @@ -35,7 +36,6 @@ | ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | -| imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. | | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | @@ -52,6 +52,7 @@ | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly | | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 3187fd8a3219620a4b0f3b7b7950f40e5aac4ad1 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Tue, 3 Jun 2025 12:24:16 -0700 Subject: [PATCH 085/111] chore(robots.json): adds YandexAdditional crawlers --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index d8e8262..45340fd 100644 --- a/robots.json +++ b/robots.json @@ -440,6 +440,20 @@ "frequency": "Unclear at this time.", "description": "wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support." }, + "YandexAdditional": { + "operator": "[Yandex](https://yandex.ru)", + "respect": "[Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en)", + "function": "Scrapes/analyzes data for the YandexGPT LLM.", + "frequency": "No information.", + "description": "Retrieves data used for YandexGPT quick answers features." + }, + "YandexAdditionalBot": { + "operator": "[Yandex](https://yandex.ru)", + "respect": "[Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en)", + "function": "Scrapes/analyzes data for the YandexGPT LLM.", + "frequency": "No information.", + "description": "Retrieves data used for YandexGPT quick answers features." + }, "YouBot": { "operator": "[You](https://about.you.com/youchat/)", "respect": "[Yes](https://about.you.com/youbot/)", @@ -447,4 +461,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 080946c360465c5e0b0af8dd475bb0248bca77a1 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 3 Jun 2025 19:51:25 +0000 Subject: [PATCH 086/111] Merge pull request #140 from ai-robots-txt/yandex-bots chore(robots.json): adds YandexAdditional crawlers --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 2 ++ nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 6 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index ddb7255..c381fce 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 60ed4d3..3527a7a 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 6c23da0..a8ba9aa 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -61,4 +61,6 @@ Timpibot VelenPublicWebCrawler Webzio-Extended wpbot +YandexAdditional +YandexAdditionalBot YouBot \ No newline at end of file diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 1c50815..5f7a0db 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 4b362e4..d26ccb4 100644 --- a/robots.txt +++ b/robots.txt @@ -61,5 +61,7 @@ User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended User-agent: wpbot +User-agent: YandexAdditional +User-agent: YandexAdditionalBot User-agent: YouBot Disallow: / diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 737b217..3275a20 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -63,4 +63,6 @@ | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | | wpbot | [QuantumCloud](https://www.quantumcloud.com) | Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9) | Live chat support and lead generation. | Unclear at this time. | wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support. | +| YandexAdditional | [Yandex](https://yandex.ru) | [Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en) | Scrapes/analyzes data for the YandexGPT LLM. | No information. | Retrieves data used for YandexGPT quick answers features. | +| YandexAdditionalBot | [Yandex](https://yandex.ru) | [Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en) | Scrapes/analyzes data for the YandexGPT LLM. | No information. | Retrieves data used for YandexGPT quick answers features. | | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | From 8f75f4a2f5f8f8381a73e6a6faab241e0ad58a14 Mon Sep 17 00:00:00 2001 From: Ivan Chupin Date: Wed, 4 Jun 2025 03:48:42 +0500 Subject: [PATCH 087/111] Add SBIntuitionsBot --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 45340fd..cd7ec4a 100644 --- a/robots.json +++ b/robots.json @@ -377,6 +377,13 @@ "operator": "[Qualified](https://www.qualified.com)", "respect": "Unclear at this time." }, + "SBIntuitionsBot": { + "description": "AI development and information analysis", + "respect": "[Yes](https://www.sbintuitions.co.jp/en/bot/)", + "frequency": "No information.", + "function": "Uses data gathered in AI development and information analysis.", + "operator": "[SB Intuitions](https://www.sbintuitions.co.jp/en/)" + }, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", "frequency": "No information.", From 3efabc603dcae235cc04d1e2f2e9113c70e6197c Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 3 Jun 2025 23:28:48 +0000 Subject: [PATCH 088/111] Merge pull request #141 from Ivan-Chupin/patch-1 Add SBIntuitionsBot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index c381fce..48971b1 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 3527a7a..117e653 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index a8ba9aa..c2ebb47 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -52,6 +52,7 @@ PerplexityBot PetalBot PhindBot QualifiedBot +SBIntuitionsBot Scrapy SemrushBot-OCOB SemrushBot-SWA diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 5f7a0db..edcf8a7 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index d26ccb4..1c5e989 100644 --- a/robots.txt +++ b/robots.txt @@ -52,6 +52,7 @@ User-agent: PerplexityBot User-agent: PetalBot User-agent: PhindBot User-agent: QualifiedBot +User-agent: SBIntuitionsBot User-agent: Scrapy User-agent: SemrushBot-OCOB User-agent: SemrushBot-SWA diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 3275a20..c9a3910 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -54,6 +54,7 @@ | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly | | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | +| SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 2b5a59a303a81847234ac11ec2f180ee9795db90 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Wed, 4 Jun 2025 01:00:07 +0000 Subject: [PATCH 089/111] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index cd7ec4a..739f579 100644 --- a/robots.json +++ b/robots.json @@ -468,4 +468,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 03831a7eb55ae3682e45a651588df13acc4a4a04 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 4 Jun 2025 10:46:58 -0700 Subject: [PATCH 090/111] chore(robots.json): adds Quillbot --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 739f579..bea7350 100644 --- a/robots.json +++ b/robots.json @@ -377,6 +377,20 @@ "operator": "[Qualified](https://www.qualified.com)", "respect": "Unclear at this time." }, + "QuillBot": { + "description": "Operated by QuillBot as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI detection, writing tools and other services.", + "operator": "[Quillbot](https://quillbot.com)", + "respect": "Unclear at this time." + }, + "quillbot.com": { + "description": "Operated by QuillBot as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI detection, writing tools and other services.", + "operator": "[Quillbot](https://quillbot.com)", + "respect": "Unclear at this time." + }, "SBIntuitionsBot": { "description": "AI development and information analysis", "respect": "[Yes](https://www.sbintuitions.co.jp/en/bot/)", @@ -468,4 +482,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 4568d69b0edad5708fe49fafd2b5c0572cd2dfa4 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 4 Jun 2025 10:54:14 -0700 Subject: [PATCH 091/111] chore(robots.json): adds Panscient --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 739f579..8d1163a 100644 --- a/robots.json +++ b/robots.json @@ -342,6 +342,20 @@ "frequency": "Unclear at this time.", "description": "PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot" }, + "Panscient": { + "operator": "[Panscient](https://panscient.com)", + "respect": "[Yes](https://panscient.com/faq.htm)", + "function": "Data collection and analysis using machine learning and AI.", + "frequency": "The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address.", + "description": "Compiles data on businesses and business professionals that is structured using AI and machine learning." + }, + "panscient.com": { + "operator": "[Panscient](https://panscient.com)", + "respect": "[Yes](https://panscient.com/faq.htm)", + "function": "Data collection and analysis using machine learning and AI.", + "frequency": "The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address.", + "description": "Compiles data on businesses and business professionals that is structured using AI and machine learning." + }, "Perplexity-User": { "operator": "[Perplexity](https://www.perplexity.ai/)", "respect": "[No](https://docs.perplexity.ai/guides/bots)", @@ -468,4 +482,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 9c28c63a0c4889d694f7ebdd278c17564f4b72a3 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 4 Jun 2025 17:54:57 +0000 Subject: [PATCH 092/111] Merge pull request #142 from ai-robots-txt/quillbot chore(robots.json): adds Quillbot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 2 ++ nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 6 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 48971b1..ee898fd 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 117e653..de3bf3b 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index c2ebb47..be27797 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -52,6 +52,8 @@ PerplexityBot PetalBot PhindBot QualifiedBot +QuillBot +quillbot.com SBIntuitionsBot Scrapy SemrushBot-OCOB diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index edcf8a7..d7decae 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 1c5e989..4ed2deb 100644 --- a/robots.txt +++ b/robots.txt @@ -52,6 +52,8 @@ User-agent: PerplexityBot User-agent: PetalBot User-agent: PhindBot User-agent: QualifiedBot +User-agent: QuillBot +User-agent: quillbot.com User-agent: SBIntuitionsBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index c9a3910..e0fcd26 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -54,6 +54,8 @@ | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly | | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | +| QuillBot | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | +| quillbot\.com | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | | SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 75ea75a95b006e68f89f8826bf84caff569a1eb8 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 4 Jun 2025 18:04:06 +0000 Subject: [PATCH 093/111] Merge pull request #143 from ai-robots-txt/panscient chore(robots.json): adds Panscient --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 2 ++ nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 6 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index ee898fd..5fefc69 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index de3bf3b..5caa249 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index be27797..fe153c8 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -47,6 +47,8 @@ omgili omgilibot Operator PanguBot +Panscient +panscient.com Perplexity-User PerplexityBot PetalBot diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index d7decae..e5d660b 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 4ed2deb..26a9d78 100644 --- a/robots.txt +++ b/robots.txt @@ -47,6 +47,8 @@ User-agent: omgili User-agent: omgilibot User-agent: Operator User-agent: PanguBot +User-agent: Panscient +User-agent: panscient.com User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index e0fcd26..69b9ec2 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -49,6 +49,8 @@ | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | Operator | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | +| Panscient | [Panscient](https://panscient.com) | [Yes](https://panscient.com/faq.htm) | Data collection and analysis using machine learning and AI. | The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address. | Compiles data on businesses and business professionals that is structured using AI and machine learning. | +| panscient\.com | [Panscient](https://panscient.com) | [Yes](https://panscient.com/faq.htm) | Data collection and analysis using machine learning and AI. | The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address. | Compiles data on businesses and business professionals that is structured using AI and machine learning. | | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | From 77393df5aa2ac7e3f2bb3e54ff919470c63e5b09 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Thu, 5 Jun 2025 00:59:28 +0000 Subject: [PATCH 094/111] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 22a1370..6df5084 100644 --- a/robots.json +++ b/robots.json @@ -496,4 +496,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 528d77bf072780a40c169cab399eb0b2139edb83 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 5 Jun 2025 09:14:23 -0700 Subject: [PATCH 095/111] chore(robots.json): adds bedrockbot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 739f579..a3f75bc 100644 --- a/robots.json +++ b/robots.json @@ -55,6 +55,13 @@ "frequency": "Unclear at this time.", "description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools." }, + "bedrockbot": { + "operator": "[Amazon](https://amazon.com)", + "respect": "[Yes](https://docs.aws.amazon.com/bedrock/latest/userguide/webcrawl-data-source-connector.html#configuration-webcrawl-connector)", + "function": "Data scraping for custom AI applications.", + "frequency": "Unclear at this time.", + "description": "Connects to and crawls URLs that have been selected for use in a user's AWS bedrock application." + }, "Brightbot 1.0": { "operator": "Browsing.ai", "respect": "Unclear at this time.", @@ -468,4 +475,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From ac7ed17e71a59a67d54279d010477abecfb15caf Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 5 Jun 2025 16:51:17 +0000 Subject: [PATCH 096/111] Merge pull request #145 from ai-robots-txt/aws-bedrockbot chore(robots.json): adds bedrockbot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 5fefc69..dbbde1e 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 5caa249..08b2fd3 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index fe153c8..cdaeecd 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -6,6 +6,7 @@ Andibot anthropic-ai Applebot Applebot-Extended +bedrockbot Brightbot 1.0 Bytespider CCBot diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index e5d660b..542ac65 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 26a9d78..a8dc655 100644 --- a/robots.txt +++ b/robots.txt @@ -6,6 +6,7 @@ User-agent: Andibot User-agent: anthropic-ai User-agent: Applebot User-agent: Applebot-Extended +User-agent: bedrockbot User-agent: Brightbot 1.0 User-agent: Bytespider User-agent: CCBot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 69b9ec2..ee324f7 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -8,6 +8,7 @@ | anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | | Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| bedrockbot | [Amazon](https://amazon.com) | [Yes](https://docs.aws.amazon.com/bedrock/latest/userguide/webcrawl-data-source-connector.html#configuration-webcrawl-connector) | Data scraping for custom AI applications. | Unclear at this time. | Connects to and crawls URLs that have been selected for use in a user's AWS bedrock application. | | Brightbot 1\.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. | | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | From e21f6ae1b6eeb0782012a4a31e6af61b6a21cb09 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 6 Jun 2025 00:59:25 +0000 Subject: [PATCH 097/111] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 8f5dadd..3b5c434 100644 --- a/robots.json +++ b/robots.json @@ -503,4 +503,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 7867c3e26c267301cc528edf89fccf5203bfb99b Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 9 Jun 2025 08:44:25 -0700 Subject: [PATCH 098/111] chore(robots.json): adds EchoboxBot (#148) --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 3b5c434..911dadc 100644 --- a/robots.json +++ b/robots.json @@ -160,6 +160,13 @@ "frequency": "Unclear at this time.", "description": "DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot" }, + "EchoboxBot": { + "operator": "[Echobox](https://echobox.com)", + "respect": "Unclear at this time.", + "function": "Data collection to support AI-powered products.", + "frequency": "Unclear at this time.", + "description": "Supports company's AI-powered social and email management products." + }, "FacebookBot": { "operator": "Meta/Facebook", "respect": "[Yes](https://developers.facebook.com/docs/sharing/bot/)", @@ -503,4 +510,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 3759a6bf146f0153bbb7ab880ff95380a4fc739e Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Mon, 9 Jun 2025 15:44:36 +0000 Subject: [PATCH 099/111] chore(robots.json): adds EchoboxBot (#148) --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index dbbde1e..74b4e5b 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 08b2fd3..9ca050b 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index cdaeecd..dc9f851 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -21,6 +21,7 @@ Cotoyogi Crawlspace Diffbot DuckAssistBot +EchoboxBot FacebookBot Factset_spyderbot FirecrawlAgent diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 542ac65..4fd7a29 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index a8dc655..8964626 100644 --- a/robots.txt +++ b/robots.txt @@ -21,6 +21,7 @@ User-agent: Cotoyogi User-agent: Crawlspace User-agent: Diffbot User-agent: DuckAssistBot +User-agent: EchoboxBot User-agent: FacebookBot User-agent: Factset_spyderbot User-agent: FirecrawlAgent diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index ee324f7..b596540 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -23,6 +23,7 @@ | Crawlspace | [Crawlspace](https://crawlspace.dev) | [Yes](https://news.ycombinator.com/item?id=42756654) | Scrapes data | Unclear at this time. | Provides crawling services for any purpose, probably including AI model training. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | +| EchoboxBot | [Echobox](https://echobox.com) | Unclear at this time. | Data collection to support AI-powered products. | Unclear at this time. | Supports company's AI-powered social and email management products. | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. | | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | From cf598b6b71e0e3c6b59a699c74218a62d43c5e16 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 10 Jun 2025 01:00:37 +0000 Subject: [PATCH 100/111] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 911dadc..19dffe8 100644 --- a/robots.json +++ b/robots.json @@ -510,4 +510,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 14d68f05ba42b04e8c33fcca5bacf7d8dee86cae Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 11 Jun 2025 13:50:53 -0700 Subject: [PATCH 101/111] chore(robots.json): adds additional SemrushBot user agents --- robots.json | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/robots.json b/robots.json index 19dffe8..0ad3731 100644 --- a/robots.json +++ b/robots.json @@ -433,6 +433,27 @@ "operator": "[Zyte](https://www.zyte.com)", "respect": "Unclear at this time." }, + "SemrushBot": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Crawls your site for ContentShake AI tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + }, + "SemrushBot-BA": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Crawls your site for ContentShake AI tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + }, + "SemrushBot-CT": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Crawls your site for ContentShake AI tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + }, "SemrushBot-OCOB": { "operator": "[Semrush](https://www.semrush.com/)", "respect": "[Yes](https://www.semrush.com/bot/)", @@ -440,6 +461,13 @@ "frequency": "Roughly once every 10 seconds.", "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." }, + "SemrushBot-SI": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Crawls your site for ContentShake AI tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + }, "SemrushBot-SWA": { "operator": "[Semrush](https://www.semrush.com/)", "respect": "[Yes](https://www.semrush.com/bot/)", From 842e2256e896e6d2904fa501df59f2ca17ea6bc0 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 12 Jun 2025 07:12:00 +0000 Subject: [PATCH 102/111] Merge pull request #150 from ai-robots-txt/semrush-bots chore(robots.json): adds additional SemrushBot user agents --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 4 ++++ nginx-block-ai-bots.conf | 2 +- robots.txt | 4 ++++ table-of-bot-metrics.md | 4 ++++ 6 files changed, 15 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 74b4e5b..a6c8fea 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 9ca050b..7ed7e03 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index dc9f851..fe6c362 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -60,7 +60,11 @@ QuillBot quillbot.com SBIntuitionsBot Scrapy +SemrushBot +SemrushBot-BA +SemrushBot-CT SemrushBot-OCOB +SemrushBot-SI SemrushBot-SWA Sidetrade indexer bot TikTokSpider diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 4fd7a29..13aac72 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 8964626..b130083 100644 --- a/robots.txt +++ b/robots.txt @@ -60,7 +60,11 @@ User-agent: QuillBot User-agent: quillbot.com User-agent: SBIntuitionsBot User-agent: Scrapy +User-agent: SemrushBot +User-agent: SemrushBot-BA +User-agent: SemrushBot-CT User-agent: SemrushBot-OCOB +User-agent: SemrushBot-SI User-agent: SemrushBot-SWA User-agent: Sidetrade indexer bot User-agent: TikTokSpider diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index b596540..a834382 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -62,7 +62,11 @@ | quillbot\.com | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | | SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | +| SemrushBot | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-BA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-CT | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-SI | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | | TikTokSpider | ByteDance | Unclear at this time. | LLM training. | Unclear at this time. | Downloads data to train LLMS, as per Bytespider. | From d760f9216f8d6295b43e00862daec913f82610ad Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 12 Jun 2025 13:08:29 -0700 Subject: [PATCH 103/111] chore(robots.json): adds MyCentralAIScraperBot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 19dffe8..94e4f79 100644 --- a/robots.json +++ b/robots.json @@ -314,6 +314,13 @@ "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.", "respect": "Yes" }, + "MyCentralAIScraperBot": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI data scraper", + "frequency": "Unclear at this time.", + "description": "Operator and data use is uncleaar at this time." + }, "NovaAct": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", @@ -510,4 +517,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 8f17718e762831498b09438835b2e1cb74f7e19d Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Fri, 13 Jun 2025 10:28:12 +0100 Subject: [PATCH 104/111] Fix typo --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 94e4f79..befdb35 100644 --- a/robots.json +++ b/robots.json @@ -319,7 +319,7 @@ "respect": "Unclear at this time.", "function": "AI data scraper", "frequency": "Unclear at this time.", - "description": "Operator and data use is uncleaar at this time." + "description": "Operator and data use is unclear at this time." }, "NovaAct": { "operator": "Unclear at this time.", From e53d81c66d353016e27e75215b22dc8557e1a82c Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 13 Jun 2025 09:28:41 +0000 Subject: [PATCH 105/111] Merge pull request #152 from ai-robots-txt/MyCentralAIScraperBot chore(robots.json): adds MyCentralAIScraperBot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index a6c8fea..a2d6a6e 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 7ed7e03..e99d69c 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index fe6c362..7d1d3a0 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -43,6 +43,7 @@ Meta-ExternalAgent meta-externalfetcher Meta-ExternalFetcher MistralAI-User/1.0 +MyCentralAIScraperBot NovaAct OAI-SearchBot omgili diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 13aac72..ef0259e 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index b130083..ab60705 100644 --- a/robots.txt +++ b/robots.txt @@ -43,6 +43,7 @@ User-agent: Meta-ExternalAgent User-agent: meta-externalfetcher User-agent: Meta-ExternalFetcher User-agent: MistralAI-User/1.0 +User-agent: MyCentralAIScraperBot User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index a834382..f8a2005 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -45,6 +45,7 @@ | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | MistralAI\-User/1\.0 | Mistral AI | Yes | Takes action based on user prompts. | Only when prompted by a user. | MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response. | +| MyCentralAIScraperBot | Unclear at this time. | Unclear at this time. | AI data scraper | Unclear at this time. | Operator and data use is unclear at this time. | | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact | | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | From b05f2fee000caf6c784135335b14c2254572754e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9rgio=20Spagnuolo?= Date: Fri, 13 Jun 2025 17:15:13 -0300 Subject: [PATCH 106/111] Update robots.json with new crawler Update with Poseidon Research Crawler as found in nytimes.com/robots.txt --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 52e196a..9bd1f85 100644 --- a/robots.json +++ b/robots.json @@ -405,6 +405,13 @@ "operator": "[phind](https://www.phind.com/)", "respect": "Unclear at this time." }, + "Poseidon Research Crawler": { + "operator": "[Poseidon Research](https://www.poseidonresearch.com)", + "description": "Lab focused on scaling the interpretability research necessary to make better AI systems possible.", + "frequency": "No explicit frequency provided.", + "function": "AI research crawler", + "respect": "Unclear at this time." + }, "QualifiedBot": { "description": "Operated by Qualified as part of their suite of AI product offerings.", "frequency": "No explicit frequency provided.", From 2b68568ac26db7a37d859fec12b0bb35e61cfedc Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sat, 14 Jun 2025 00:58:11 +0000 Subject: [PATCH 107/111] Update from Dark Visitors --- robots.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/robots.json b/robots.json index 52e196a..7c10cab 100644 --- a/robots.json +++ b/robots.json @@ -315,11 +315,11 @@ "respect": "Yes" }, "MyCentralAIScraperBot": { - "operator": "Unclear at this time.", - "respect": "Unclear at this time.", - "function": "AI data scraper", - "frequency": "Unclear at this time.", - "description": "Operator and data use is unclear at this time." + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI data scraper", + "frequency": "Unclear at this time.", + "description": "Operator and data use is unclear at this time." }, "NovaAct": { "operator": "Unclear at this time.", @@ -545,4 +545,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From eb05f2f5276ff963f786cf8fd57d8ea909e82fd9 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 14 Jun 2025 14:04:03 +0000 Subject: [PATCH 108/111] Merge pull request #153 from sergiospagnuolo/Poseidon Update robots.json with new crawler --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index a2d6a6e..27637c2 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index e99d69c..528ba08 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 7d1d3a0..c7c3054 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -56,6 +56,7 @@ Perplexity-User PerplexityBot PetalBot PhindBot +Poseidon Research Crawler QualifiedBot QuillBot quillbot.com diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index ef0259e..c1afb05 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index ab60705..0edf11f 100644 --- a/robots.txt +++ b/robots.txt @@ -56,6 +56,7 @@ User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot User-agent: PhindBot +User-agent: Poseidon Research Crawler User-agent: QualifiedBot User-agent: QuillBot User-agent: quillbot.com diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index f8a2005..78a3408 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -58,6 +58,7 @@ | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly | +| Poseidon Research Crawler | [Poseidon Research](https://www.poseidonresearch.com) | Unclear at this time. | AI research crawler | No explicit frequency provided. | Lab focused on scaling the interpretability research necessary to make better AI systems possible. | | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | | QuillBot | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | | quillbot\.com | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | From 7535893aecf2895dd09bf00de397afde4edf0f8f Mon Sep 17 00:00:00 2001 From: paulrudy <1110792+paulrudy@users.noreply.github.com> Date: Sun, 15 Jun 2025 16:39:17 -0700 Subject: [PATCH 109/111] re-add facebookexternalhit --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 5d5f692..60c431c 100644 --- a/robots.json +++ b/robots.json @@ -174,6 +174,13 @@ "frequency": "Up to 1 page per second", "description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically." }, + "facebookexternalhit": { + "operator": "Meta/Facebook", + "respect": "[No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313)", + "function": "Ostensibly only for sharing, but likely used as an AI crawler as well", + "frequency": "Unclear at this time.", + "description": "Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is \"to crawl the content of an app or website that was shared on one of Meta’s family of apps…\". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary." + }, "Factset_spyderbot": { "operator": "[Factset](https://www.factset.com/ai)", "respect": "Unclear at this time.", From 5326c202b57d87a9921810ef26fca98edce77d5f Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Mon, 16 Jun 2025 15:12:42 +0000 Subject: [PATCH 110/111] Merge pull request #154 from paulrudy/main re-add facebookexternalhit --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 27637c2..3ba960f 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 528ba08..b675b9b 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index c7c3054..9cfb5c6 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -23,6 +23,7 @@ Diffbot DuckAssistBot EchoboxBot FacebookBot +facebookexternalhit Factset_spyderbot FirecrawlAgent FriendlyCrawler diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index c1afb05..a53333e 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 0edf11f..9d69a3a 100644 --- a/robots.txt +++ b/robots.txt @@ -23,6 +23,7 @@ User-agent: Diffbot User-agent: DuckAssistBot User-agent: EchoboxBot User-agent: FacebookBot +User-agent: facebookexternalhit User-agent: Factset_spyderbot User-agent: FirecrawlAgent User-agent: FriendlyCrawler diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 78a3408..a5ab4c7 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -25,6 +25,7 @@ | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | | EchoboxBot | [Echobox](https://echobox.com) | Unclear at this time. | Data collection to support AI-powered products. | Unclear at this time. | Supports company's AI-powered social and email management products. | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| facebookexternalhit | Meta/Facebook | [No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) | Ostensibly only for sharing, but likely used as an AI crawler as well | Unclear at this time. | Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is "to crawl the content of an app or website that was shared on one of Meta’s family of apps…". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary. | | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. | | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | From 4ed17b8e4af67d347b039429eb633c96acbba72f Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 17 Jun 2025 01:00:21 +0000 Subject: [PATCH 111/111] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 60c431c..ab2a383 100644 --- a/robots.json +++ b/robots.json @@ -179,7 +179,7 @@ "respect": "[No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313)", "function": "Ostensibly only for sharing, but likely used as an AI crawler as well", "frequency": "Unclear at this time.", - "description": "Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is \"to crawl the content of an app or website that was shared on one of Meta’s family of apps…\". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary." + "description": "Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is \"to crawl the content of an app or website that was shared on one of Meta\u2019s family of apps\u2026\". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary." }, "Factset_spyderbot": { "operator": "[Factset](https://www.factset.com/ai)",