From 05b79b8a5886983c818eaad107fcf6c7de5fad3a Mon Sep 17 00:00:00 2001
From: nisbet-hubbard <87453615+nisbet-hubbard@users.noreply.github.com>
Date: Mon, 27 Jan 2025 19:41:03 +0800
Subject: [PATCH 001/111] Update robots.json

---
 robots.json | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/robots.json b/robots.json
index 4d7d582..7f3cba3 100644
--- a/robots.json
+++ b/robots.json
@@ -265,12 +265,19 @@
         "operator": "[Zyte](https://www.zyte.com)",
         "respect": "Unclear at this time."
     },
-    "SemrushBot": {
+    "SemrushBot-OCOB": {
         "operator": "[Semrush](https://www.semrush.com/)",
         "respect": "[Yes](https://www.semrush.com/bot/)",
-        "function": "Scrapes data for use in LLM article-writing tool.",
+        "function": "Crawls your site for ContentShake AI tool.",
         "frequency": "Roughly once every 10 seconds.",
-        "description": "SemrushBot is a bot which, among other functions, scrapes data for use in ContentShake AI tool reports."
+        "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
+    },
+    "SemrushBot-SWA": {
+        "operator": "[Semrush](https://www.semrush.com/)",
+        "respect": "[Yes](https://www.semrush.com/bot/)",
+        "function": "Checks URLs on your site for SWA tool.",
+        "frequency": "Roughly once every 10 seconds.",
+        "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
     },
     "Sidetrade indexer bot": {
         "description": "AI product training.",
@@ -307,4 +314,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 89d4c6e5ca03f0aedec09b9191e2aece6f2efec3 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Sat, 1 Feb 2025 10:51:01 +0000
Subject: [PATCH 002/111] Merge pull request #73 from nisbet-hubbard/patch-8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Actually block Semrush’s AI tools
---
 .htaccess               | 2 +-
 robots.txt              | 3 ++-
 table-of-bot-metrics.md | 3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index beaddc3..97482e2 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
+RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
 RewriteRule .* - [F,L]
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index fd388fd..3839e55 100644
--- a/robots.txt
+++ b/robots.txt
@@ -36,7 +36,8 @@ User-agent: PanguBot
 User-agent: PerplexityBot
 User-agent: PetalBot
 User-agent: Scrapy
-User-agent: SemrushBot
+User-agent: SemrushBot-OCOB
+User-agent: SemrushBot-SWA
 User-agent: Sidetrade indexer bot
 User-agent: Timpibot
 User-agent: VelenPublicWebCrawler
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index f44c585..b51bbae 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -38,7 +38,8 @@
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
-| SemrushBot | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Scrapes data for use in LLM article-writing tool. | Roughly once every 10 seconds. | SemrushBot is a bot which, among other functions, scrapes data for use in ContentShake AI tool reports. |
+| SemrushBot-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
+| SemrushBot-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
 | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. |
 | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. |
 | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." |

From bebffccc0ced8c420276c93f3109c2e71cd5ca0c Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Sun, 2 Feb 2025 00:52:50 +0000
Subject: [PATCH 003/111] Update from Dark Visitors

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 7f3cba3..79762a0 100644
--- a/robots.json
+++ b/robots.json
@@ -314,4 +314,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From 261a2b83b90fe89f1d842066709c019fd1dba30f Mon Sep 17 00:00:00 2001
From: always-be-testing <warptank@protonmail.com>
Date: Fri, 14 Feb 2025 12:26:19 -0500
Subject: [PATCH 004/111] update README to include list of ai bots Cloudflare
 considers verified

---
 README.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/README.md b/README.md
index 065b0b7..6758570 100644
--- a/README.md
+++ b/README.md
@@ -40,6 +40,19 @@ Alternatively, you can also subscribe to new releases with your GitHub account b
 
 If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform).
 
+
+If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that make use of  [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots) conditions, please note that the following AI web crawlers are considered verified bots by Cloudflare: 
+- Amazonbot
+- Applebot
+- CCBot
+- ChatGPT-User
+- DuckAssistBot
+- GoogleOther
+- GPTBot
+- OAI-SearchBot
+- PerplexityBot
+- PetalBot
+
 ## Additional resources
 
 - [Blocking Bots with Nginx](https://rknight.me/blog/blocking-bots-with-nginx/) by Robb Knight

From e396a2ec781095c5e2659eefb99c46ab7715a664 Mon Sep 17 00:00:00 2001
From: always-be-testing <warptank@protonmail.com>
Date: Fri, 14 Feb 2025 12:31:20 -0500
Subject: [PATCH 005/111] forgot to include heading

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6758570..e70d283 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ Alternatively, you can also subscribe to new releases with your GitHub account b
 
 If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform).
 
-
+## Cloudflare Verified Bots
 If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that make use of  [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots) conditions, please note that the following AI web crawlers are considered verified bots by Cloudflare: 
 - Amazonbot
 - Applebot

From f99339922fa9afdbb00e18bb99105e81cd3f8e88 Mon Sep 17 00:00:00 2001
From: always-be-testing <warptank@protonmail.com>
Date: Fri, 14 Feb 2025 12:36:33 -0500
Subject: [PATCH 006/111] grammar update and include syntax for verified bot
 condition

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e70d283..f471ede 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ Alternatively, you can also subscribe to new releases with your GitHub account b
 If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform).
 
 ## Cloudflare Verified Bots
-If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that make use of  [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots) conditions, please note that the following AI web crawlers are considered verified bots by Cloudflare: 
+If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that use the `cf.bot_management.verified_bot` condition based on [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots), please note that the following AI web crawlers are considered verified bots by Cloudflare:
 - Amazonbot
 - Applebot
 - CCBot

From af87b85d7f00bc285cb414280e02d2f42284a9d8 Mon Sep 17 00:00:00 2001
From: always-be-testing <warptank@protonmail.com>
Date: Fri, 14 Feb 2025 12:39:08 -0500
Subject: [PATCH 007/111] include return after heading

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index f471ede..303f009 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,7 @@ Alternatively, you can also subscribe to new releases with your GitHub account b
 If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform).
 
 ## Cloudflare Verified Bots
+
 If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that use the `cf.bot_management.verified_bot` condition based on [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots), please note that the following AI web crawlers are considered verified bots by Cloudflare:
 - Amazonbot
 - Applebot

From 5b13c2e504c843c2a95981cee1c2655d9f21c8f4 Mon Sep 17 00:00:00 2001
From: always-be-testing <warptank@protonmail.com>
Date: Sat, 15 Feb 2025 11:22:10 -0500
Subject: [PATCH 008/111] add more concise message about verified bots

Co-authored-by: Glyn Normington <work@underlap.org>
---
 README.md | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 303f009..a206c83 100644
--- a/README.md
+++ b/README.md
@@ -39,21 +39,7 @@ Alternatively, you can also subscribe to new releases with your GitHub account b
 ## Report abusive crawlers
 
 If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform).
-
-## Cloudflare Verified Bots
-
-If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that use the `cf.bot_management.verified_bot` condition based on [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots), please note that the following AI web crawlers are considered verified bots by Cloudflare:
-- Amazonbot
-- Applebot
-- CCBot
-- ChatGPT-User
-- DuckAssistBot
-- GoogleOther
-- GPTBot
-- OAI-SearchBot
-- PerplexityBot
-- PetalBot
-
+But even if you don't use Cloudflare's hard block, their list of [verified bots](https://radar.cloudflare.com/traffic/verified-bots) may come in handy.
 ## Additional resources
 
 - [Blocking Bots with Nginx](https://rknight.me/blog/blocking-bots-with-nginx/) by Robb Knight

From a9ec4ffa6fd1816ee6c1c146fa75983abc0b2edc Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Sun, 16 Feb 2025 13:36:39 -0800
Subject: [PATCH 009/111] chore: add Brightbot 1.0

---
 robots.json | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 79762a0..a634634 100644
--- a/robots.json
+++ b/robots.json
@@ -41,6 +41,13 @@
         "frequency": "Unclear at this time.",
         "description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools."
     },
+    "Brightbot 1.0": {
+        "operator": "Browsing.ai",
+        "respect": "Unclear at this time.",
+        "function": "LLM/AI training.",
+        "frequency": "Unclear at this time.",
+        "description": "Scrapes data to train LLMs and AI products focused on website customer support."
+    },
     "Bytespider": {
         "operator": "ByteDance",
         "respect": "No",
@@ -314,4 +321,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 693289bb29c42b7a526d8210d1f743ca3608690d Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Sun, 16 Feb 2025 21:37:52 +0000
Subject: [PATCH 010/111] chore: add Brightbot 1.0

---
 .htaccess               | 2 +-
 robots.txt              | 1 +
 table-of-bot-metrics.md | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.htaccess b/.htaccess
index 97482e2..512c274 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
+RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot\ 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
 RewriteRule .* - [F,L]
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 3839e55..80c40e8 100644
--- a/robots.txt
+++ b/robots.txt
@@ -4,6 +4,7 @@ User-agent: Amazonbot
 User-agent: anthropic-ai
 User-agent: Applebot
 User-agent: Applebot-Extended
+User-agent: Brightbot 1.0
 User-agent: Bytespider
 User-agent: CCBot
 User-agent: ChatGPT-User
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index b51bbae..af32bf2 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -6,6 +6,7 @@
 | anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot |
 | Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. |
+| Brightbot 1.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. |
 | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. |
 | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). |
 | ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. |

From abfd6dfcd15267ed03b5fda4cd3eac2512604ed2 Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Mon, 17 Feb 2025 00:53:32 +0000
Subject: [PATCH 011/111] Update from Dark Visitors

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index a634634..cdc7bb5 100644
--- a/robots.json
+++ b/robots.json
@@ -321,4 +321,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From c0d418cd875b432fd4558be57ad3c009326b631e Mon Sep 17 00:00:00 2001
From: Dennis Camera <dennis.camera@riiengineering.ch>
Date: Mon, 17 Feb 2025 21:00:57 +0100
Subject: [PATCH 012/111] .htaccess: Allow robots access to /robots.txt

---
 .htaccess                 | 2 +-
 code/robots.py            | 2 +-
 code/test_files/.htaccess | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index 512c274..c42f99e 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
 RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot\ 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
-RewriteRule .* - [F,L]
\ No newline at end of file
+RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/code/robots.py b/code/robots.py
index 087b00b..bb18e70 100644
--- a/code/robots.py
+++ b/code/robots.py
@@ -142,7 +142,7 @@ def json_to_htaccess(robot_json):
     robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys())
     htaccess += "|".join(robots)
     htaccess += ").*$ [NC]\n"
-    htaccess += "RewriteRule .* - [F,L]"
+    htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
     return htaccess
 
 
diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess
index a34bf55..2e78674 100644
--- a/code/test_files/.htaccess
+++ b/code/test_files/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
 RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
-RewriteRule .* - [F,L]
\ No newline at end of file
+RewriteRule !^/?robots\.txt$ - [F,L]

From a884a2afb9dbc7338b0faa24b3c10308adbc48e4 Mon Sep 17 00:00:00 2001
From: Dennis Camera <dennis.camera@riiengineering.ch>
Date: Mon, 17 Feb 2025 21:00:57 +0100
Subject: [PATCH 013/111] .htaccess: Make regex in RewriteCond safe

Improve the regular expression by removing unneeded anchors and
escaping special characters (not just space) to prevent false positives
or a misbehaving rewrite rule.
---
 .htaccess                 |  2 +-
 code/robots.py            | 19 ++++++++++---------
 code/test_files/.htaccess |  2 +-
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/.htaccess b/.htaccess
index c42f99e..2313293 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot\ 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/code/robots.py b/code/robots.py
index bb18e70..a8a674d 100644
--- a/code/robots.py
+++ b/code/robots.py
@@ -1,8 +1,9 @@
 import json
-from pathlib import Path
-
+import re
 import requests
+
 from bs4 import BeautifulSoup
+from pathlib import Path
 
 
 def load_robots_json():
@@ -99,7 +100,6 @@ def updated_robots_json(soup):
 
 
 def ingest_darkvisitors():
-
     old_robots_json = load_robots_json()
     soup = get_agent_soup()
     if soup:
@@ -132,16 +132,17 @@ def json_to_table(robots_json):
     return table
 
 
+def list_to_pcre(lst):
+    # Python re is not 100% identical to PCRE which is used by Apache, but it
+    # should probably be close enough in the real world for re.escape to work.
+    return f"({"|".join(map(re.escape, lst))})"
+
+
 def json_to_htaccess(robot_json):
     # Creates a .htaccess filter file. It uses a regular expression to filter out
     # User agents that contain any of the blocked values.
     htaccess = "RewriteEngine On\n"
-    htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*("
-
-    # Escape spaces in each User Agent to build the regular expression
-    robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys())
-    htaccess += "|".join(robots)
-    htaccess += ").*$ [NC]\n"
+    htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n"
     htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
     return htaccess
 
diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess
index 2e78674..90ddcf2 100644
--- a/code/test_files/.htaccess
+++ b/code/test_files/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]

From 0bd3fa63b832ffd8fa908675656c7007021f6654 Mon Sep 17 00:00:00 2001
From: Dennis Camera <dennis.camera@riiengineering.ch>
Date: Tue, 18 Feb 2025 10:12:04 +0100
Subject: [PATCH 014/111] table-of-bot-metrics.md: Escape robot names for
 Markdown table

Some characters which could occur in a crawler's name have a special meaning in
Markdown. They are escaped to prevent them from having unintended side effects.

The escaping is only applied to the first (Name) column of the table. The
remaining columns are expected to already be Markdown-encoded in robots.json.
---
 code/robots.py          |  8 ++++++--
 table-of-bot-metrics.md | 40 ++++++++++++++++++++--------------------
 2 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/code/robots.py b/code/robots.py
index a8a674d..62fb061 100644
--- a/code/robots.py
+++ b/code/robots.py
@@ -121,13 +121,17 @@ def json_to_txt(robots_json):
     return robots_txt
 
 
+def escape_md(s):
+    return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)
+
+
 def json_to_table(robots_json):
     """Compose a markdown table with the information in robots.json"""
     table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
-    table += "|-----|----------|-----------------------|----------|------------------|-------------|\n"
+    table += "|------|----------|-----------------------|----------|------------------|-------------|\n"
 
     for name, robot in robots_json.items():
-        table += f'| {name} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'
+        table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'
 
     return table
 
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index af32bf2..ce82047 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -1,48 +1,48 @@
 | Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |
-|-----|----------|-----------------------|----------|------------------|-------------|
+|------|----------|-----------------------|----------|------------------|-------------|
 | AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. |
-| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. |
+| Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. |
 | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. |
-| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
+| anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot |
-| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. |
-| Brightbot 1.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. |
+| Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. |
+| Brightbot 1\.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. |
 | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. |
 | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). |
-| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. |
-| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
+| ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. |
+| Claude\-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
-| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. |
-| cohere-training-data-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler |
+| cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. |
+| cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler |
 | Crawlspace | [Crawlspace](https://crawlspace.dev) | [Yes](https://news.ycombinator.com/item?id=42756654) | Scrapes data | Unclear at this time. | Provides crawling services for any purpose, probably including AI model training. |
 | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. |
 | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot |
 | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. |
 | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. |
-| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. |
+| Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. |
 | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
-| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
-| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
+| GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
+| GoogleOther\-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
 | GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. |
-| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. |
-| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. |
+| iaskspider/2\.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. |
+| ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. |
 | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. |
 | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. |
 | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. |
 | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
-| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
-| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
-| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
+| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
+| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
+| OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
 | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
-| SemrushBot-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
-| SemrushBot-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
+| SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
+| SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
 | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. |
 | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. |
 | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." |
-| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended |
+| Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended |
 | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. |

From 17b826a6d3868cf87fb52adf95f52872ac5c4437 Mon Sep 17 00:00:00 2001
From: Dennis Camera <dennis.camera@riiengineering.ch>
Date: Tue, 18 Feb 2025 10:13:27 +0100
Subject: [PATCH 015/111] Update tests and convert to stock unittest

For these simple tests Python's built-in unittest framework is more than enough.
No additional dependencies are required.

Added some more test cases with "special" characters to test the escaping code
better.
---
 code/test_files/.htaccess               |  2 +-
 code/test_files/robots.json             | 44 ++++++++++++++++-
 code/test_files/robots.txt              |  6 +++
 code/test_files/table-of-bot-metrics.md | 38 +++++++++------
 code/tests.py                           | 65 ++++++++++++++++++-------
 5 files changed, 120 insertions(+), 35 deletions(-)
 mode change 100644 => 100755 code/tests.py

diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess
index 90ddcf2..7e39092 100644
--- a/code/test_files/.htaccess
+++ b/code/test_files/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/code/test_files/robots.json b/code/test_files/robots.json
index c50d63c..b0cbfbb 100644
--- a/code/test_files/robots.json
+++ b/code/test_files/robots.json
@@ -278,5 +278,47 @@
         "function": "Scrapes data for search engine and LLMs.",
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
+    },
+    "crawler.with.dots": {
+        "operator": "Test suite",
+        "respect": "No",
+        "function": "To ensure the code works correctly.",
+        "frequency": "No information.",
+        "description": "When used in the .htaccess regular expression dots need to be escaped."
+    },
+    "star***crawler": {
+        "operator": "Test suite",
+        "respect": "No",
+        "function": "To ensure the code works correctly.",
+        "frequency": "No information.",
+        "description": "When used in the .htaccess regular expression stars need to be escaped."
+    },
+    "Is this a crawler?": {
+        "operator": "Test suite",
+        "respect": "No",
+        "function": "To ensure the code works correctly.",
+        "frequency": "No information.",
+        "description": "When used in the .htaccess regular expression spaces and question marks need to be escaped."
+    },
+    "a[mazing]{42}(robot)": {
+        "operator": "Test suite",
+        "respect": "No",
+        "function": "To ensure the code works correctly.",
+        "frequency": "No information.",
+        "description": "When used in the .htaccess regular expression parantheses, braces, etc. need to be escaped."
+    },
+    "2^32$": {
+        "operator": "Test suite",
+        "respect": "No",
+        "function": "To ensure the code works correctly.",
+        "frequency": "No information.",
+        "description": "When used in the .htaccess regular expression RE anchor characters need to be escaped."
+    },
+    "curl|sudo bash": {
+        "operator": "Test suite",
+        "respect": "No",
+        "function": "To ensure the code works correctly.",
+        "frequency": "No information.",
+        "description": "When used in the .htaccess regular expression pipes need to be escaped."
     }
-}
\ No newline at end of file
+}
diff --git a/code/test_files/robots.txt b/code/test_files/robots.txt
index 927f6f4..03c3c25 100644
--- a/code/test_files/robots.txt
+++ b/code/test_files/robots.txt
@@ -38,4 +38,10 @@ User-agent: Timpibot
 User-agent: VelenPublicWebCrawler
 User-agent: Webzio-Extended
 User-agent: YouBot
+User-agent: crawler.with.dots
+User-agent: star***crawler
+User-agent: Is this a crawler?
+User-agent: a[mazing]{42}(robot)
+User-agent: 2^32$
+User-agent: curl|sudo bash
 Disallow: /
diff --git a/code/test_files/table-of-bot-metrics.md b/code/test_files/table-of-bot-metrics.md
index 257ba99..88af6c0 100644
--- a/code/test_files/table-of-bot-metrics.md
+++ b/code/test_files/table-of-bot-metrics.md
@@ -1,35 +1,35 @@
 | Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |
-|-----|----------|-----------------------|----------|------------------|-------------|
+|------|----------|-----------------------|----------|------------------|-------------|
 | AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. |
-| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. |
+| Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. |
 | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. |
-| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
+| anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot |
-| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. |
+| Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. |
 | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. |
 | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). |
-| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. |
-| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
+| ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. |
+| Claude\-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
-| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. |
+| cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. |
 | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. |
 | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. |
 | facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. |
 | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. |
-| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. |
+| Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. |
 | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
-| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
-| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
+| GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
+| GoogleOther\-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
 | GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. |
-| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. |
-| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. |
+| iaskspider/2\.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. |
+| ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. |
 | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. |
 | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. |
 | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. |
 | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
-| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
-| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
-| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
+| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
+| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
+| OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. |
@@ -38,5 +38,11 @@
 | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. |
 | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. |
 | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." |
-| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended |
+| Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended |
 | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. |
+| crawler\.with\.dots | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression dots need to be escaped. |
+| star\*\*\*crawler | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression stars need to be escaped. |
+| Is this a crawler? | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression spaces and question marks need to be escaped. |
+| a\[mazing\]\{42\}\(robot\) | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression parantheses, braces, etc. need to be escaped. |
+| 2^32$ | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression RE anchor characters need to be escaped. |
+| curl\|sudo bash | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression pipes need to be escaped. |
diff --git a/code/tests.py b/code/tests.py
old mode 100644
new mode 100755
index 6f778c3..94cbb47
--- a/code/tests.py
+++ b/code/tests.py
@@ -1,27 +1,58 @@
-"""These tests can be run with pytest.
-This requires pytest: pip install pytest
-cd to the `code` directory and run `pytest`
-"""
+#!/usr/bin/env python3
+"""To run these tests just execute this script."""
 
 import json
-from pathlib import Path
+import unittest
 
 from robots import json_to_txt, json_to_table, json_to_htaccess
 
+class RobotsUnittestExtensions:
+    def loadJson(self, pathname):
+        with open(pathname, "rt", encoding="utf-8") as f:  # decode as UTF-8 explicitly instead of the locale default
+            return json.load(f)  # parsed JSON document read from `pathname`
 
-def test_robots_txt_creation():
-    robots_json = json.loads(Path("test_files/robots.json").read_text())
-    robots_txt = json_to_txt(robots_json)
-    assert Path("test_files/robots.txt").read_text() == robots_txt
+    def assertEqualsFile(self, f, s):
+        with open(f, "rt", encoding="utf-8") as fh:  # explicit UTF-8; `fh` keeps the path argument `f` from being rebound
+            f_contents = fh.read()
+
+        return self.assertMultiLineEqual(f_contents, s)  # compare the file's text (first) with string `s` (second)
 
 
-def test_table_of_bot_metrices_md():
-    robots_json = json.loads(Path("test_files/robots.json").read_text())
-    robots_table = json_to_table(robots_json)
-    assert Path("test_files/table-of-bot-metrics.md").read_text() == robots_table
+class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):
+    maxDiff = 8192
+
+    def setUp(self):
+        self.robots_dict = self.loadJson("test_files/robots.json")
+
+    def test_robots_txt_generation(self):
+        robots_txt = json_to_txt(self.robots_dict)
+        self.assertEqualsFile("test_files/robots.txt", robots_txt)
 
 
-def test_htaccess_creation():
-    robots_json = json.loads(Path("test_files/robots.json").read_text())
-    robots_htaccess = json_to_htaccess(robots_json)
-    assert Path("test_files/.htaccess").read_text() == robots_htaccess
+class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):
+    maxDiff = 32768
+
+    def setUp(self):
+        self.robots_dict = self.loadJson("test_files/robots.json")
+
+    def test_table_generation(self):
+        robots_table = json_to_table(self.robots_dict)
+        self.assertEqualsFile("test_files/table-of-bot-metrics.md", robots_table)
+
+
+class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
+    maxDiff = 8192
+
+    def setUp(self):
+        self.robots_dict = self.loadJson("test_files/robots.json")
+
+    def test_htaccess_generation(self):
+        robots_htaccess = json_to_htaccess(self.robots_dict)
+        self.assertEqualsFile("test_files/.htaccess", robots_htaccess)
+
+
+if __name__ == "__main__":
+    import os
+    os.chdir(os.path.dirname(os.path.abspath(__file__)))  # abspath first: dirname of a bare "tests.py" is "" and chdir("") raises
+
+    unittest.main(verbosity=2)

From c7c1e7b96fe74f90590f4d375c1bab4be53a4044 Mon Sep 17 00:00:00 2001
From: Dennis Camera <dennis.camera@riiengineering.ch>
Date: Tue, 18 Feb 2025 10:15:10 +0100
Subject: [PATCH 016/111] robots.py: Make executable

---
 code/robots.py | 2 ++
 1 file changed, 2 insertions(+)
 mode change 100644 => 100755 code/robots.py

diff --git a/code/robots.py b/code/robots.py
old mode 100644
new mode 100755
index 62fb061..6bf7920
--- a/code/robots.py
+++ b/code/robots.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 import json
 import re
 import requests

From 1d55a205e4c8447829abdd34098ef9b0fedefee1 Mon Sep 17 00:00:00 2001
From: Glyn Normington <glyn.normington@gmail.com>
Date: Tue, 18 Feb 2025 05:08:28 +0000
Subject: [PATCH 017/111] Document testing in README

Fixes: https://github.com/ai-robots-txt/ai.robots.txt/issues/81
---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index a206c83..30a85da 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,11 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
 
 A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, and `.htaccess`.
 
+You can run the tests by [installing](https://www.python.org/about/gettingstarted/) Python 3 and issuing:
+```console
+code/tests.py
+```
+
 ## Subscribe to updates
 
 You can subscribe to list updates via RSS/Atom with the releases feed:

From 8a7489633326465fd7e83fecece6740440d38eb6 Mon Sep 17 00:00:00 2001
From: Dennis Camera <dennis.camera@riiengineering.ch>
Date: Tue, 18 Feb 2025 10:23:40 +0100
Subject: [PATCH 018/111] Add workflow to run tests on pull request or push to
 main

---
 .github/workflows/run-tests.yml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 .github/workflows/run-tests.yml

diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
new file mode 100644
index 0000000..c98861f
--- /dev/null
+++ b/.github/workflows/run-tests.yml
@@ -0,0 +1,21 @@
+on:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - main
+jobs:
+  run-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+      - name: Install dependencies
+        run: |
+          pip install -U requests beautifulsoup4
+      - name: Run tests
+        run: |
+          code/tests.py

From 6ecfcdfcbfd1bd36da1982b7a4f9f95cbeb8101a Mon Sep 17 00:00:00 2001
From: deyigifts <daijiahao@deyigifts.com>
Date: Mon, 24 Mar 2025 14:16:57 +0800
Subject: [PATCH 019/111] Update perplexity bot

Update based on perplexity bot docs
---
 robots.json | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/robots.json b/robots.json
index cdc7bb5..eaac816 100644
--- a/robots.json
+++ b/robots.json
@@ -253,10 +253,17 @@
     },
     "PerplexityBot": {
         "operator": "[Perplexity](https://www.perplexity.ai/)",
-        "respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)",
+        "respect": "[Yes](https://docs.perplexity.ai/guides/bots)",
+        "function": "Search result generation.",
+        "frequency": "No information.",
+        "description": "Crawls sites to surface as results in Perplexity."
+    },
+    "Perplexity-User": {
+        "operator": "[Perplexity](https://www.perplexity.ai/)",
+        "respect": "[No](https://docs.perplexity.ai/guides/bots)",
         "function": "Used to answer queries at the request of users.",
-        "frequency": "Takes action based on user prompts.",
-        "description": "Operated by Perplexity to obtain results in response to user queries."
+        "frequency": "Only when prompted by a user.",
+        "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
     },
     "PetalBot": {
         "description": "Operated by Huawei to provide search and AI assistant services.",
@@ -321,4 +328,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From da85207314724c02d151a7bdfcdca3ef3fd056a1 Mon Sep 17 00:00:00 2001
From: Thomas Leister <thomas.leister@mailbox.org>
Date: Thu, 27 Mar 2025 12:27:09 +0100
Subject: [PATCH 020/111] Implement new function "json_to_nginx" which outputs
 an Nginx configuration snippet

---
 code/robots.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/code/robots.py b/code/robots.py
index 6bf7920..f58f2b8 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -152,6 +152,12 @@ def json_to_htaccess(robot_json):
     htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
     return htaccess
 
+def json_to_nginx(robot_json):
+    # Build an Nginx config snippet: an `if` on $http_user_agent (`~*` regex match against the
+    # pattern list_to_pcre makes from robot_json's keys) that returns 403. Include it in server{} blocks.
+    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
+    return config
+
 
 def update_file_if_changed(file_name, converter):
     """Update files if newer content is available and log the (in)actions."""
@@ -178,6 +184,10 @@ def conversions():
         file_name="./.htaccess",
         converter=json_to_htaccess,
     )
+    update_file_if_changed(
+        file_name="./nginx-block-ai-bots.conf",
+        converter=json_to_nginx,
+    )
 
 
 if __name__ == "__main__":

From 5a312c5f4d1fcd89c17f4d6cb360ad7230857402 Mon Sep 17 00:00:00 2001
From: Thomas Leister <thomas.leister@mailbox.org>
Date: Thu, 27 Mar 2025 12:28:11 +0100
Subject: [PATCH 021/111] Mention Nginx config feature in README

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 30a85da..b984672 100644
--- a/README.md
+++ b/README.md
@@ -13,16 +13,19 @@ If you'd like to add information about a crawler to the list, please make a pull
 This repository provides the following files:
 - `robots.txt`
 - `.htaccess`
+- `nginx-block-ai-bots.conf`
 
 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
 
 `.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server.
 Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist.
 
+`nginx-block-ai-bots.conf` implements an Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
+
 
 ## Contributing
 
-A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, and `.htaccess`.
+A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess` and `nginx-block-ai-bots.conf`.
 
 You can run the tests by [installing](https://www.python.org/about/gettingstarted/) Python 3 and issuing:
 ```console

From 4f3f4cd0dd0f421c2787b1336d37b8da06998882 Mon Sep 17 00:00:00 2001
From: Thomas Leister <thomas.leister@mailbox.org>
Date: Thu, 27 Mar 2025 12:28:50 +0100
Subject: [PATCH 022/111] Add assembled version of nginx-block-ai-bots.conf
 file

---
 nginx-block-ai-bots.conf | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 nginx-block-ai-bots.conf

diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
new file mode 100644
index 0000000..ce30520
--- /dev/null
+++ b/nginx-block-ai-bots.conf
@@ -0,0 +1,3 @@
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+    return 403;
+}
\ No newline at end of file

From 7c3b5a2cb21f5404cf4e2af1acf8689ba77d7b06 Mon Sep 17 00:00:00 2001
From: Thomas Leister <thomas.leister@mailbox.org>
Date: Thu, 27 Mar 2025 16:12:18 +0100
Subject: [PATCH 023/111] Add tests for Nginx config generator

---
 code/test_files/nginx-block-ai-bots.conf |  3 +++
 code/tests.py                            | 12 +++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 code/test_files/nginx-block-ai-bots.conf

diff --git a/code/test_files/nginx-block-ai-bots.conf b/code/test_files/nginx-block-ai-bots.conf
new file mode 100644
index 0000000..d1b559e
--- /dev/null
+++ b/code/test_files/nginx-block-ai-bots.conf
@@ -0,0 +1,3 @@
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") {
+    return 403;
+}
\ No newline at end of file
diff --git a/code/tests.py b/code/tests.py
index 94cbb47..61d69b4 100755
--- a/code/tests.py
+++ b/code/tests.py
@@ -4,7 +4,7 @@
 import json
 import unittest
 
-from robots import json_to_txt, json_to_table, json_to_htaccess
+from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx
 
 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -50,6 +50,16 @@ class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_htaccess = json_to_htaccess(self.robots_dict)
         self.assertEqualsFile("test_files/.htaccess", robots_htaccess)
 
+class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
+    maxDiff = 8192
+
+    def setUp(self):
+        self.robots_dict = self.loadJson("test_files/robots.json")
+
+    def test_nginx_generation(self):
+        robots_nginx = json_to_nginx(self.robots_dict)
+        self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
+
 
 if __name__ == "__main__":
     import os

From 68d1d93714bbe4931811f301c7030ca979d95b39 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Thu, 27 Mar 2025 19:29:30 +0000
Subject: [PATCH 024/111] Merge pull request #91 from deyigifts/perplexity-user

Update perplexity bots
---
 .htaccess               | 2 +-
 robots.txt              | 1 +
 table-of-bot-metrics.md | 3 ++-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.htaccess b/.htaccess
index 2313293..2f5d0e4 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/robots.txt b/robots.txt
index 80c40e8..8c79fc2 100644
--- a/robots.txt
+++ b/robots.txt
@@ -35,6 +35,7 @@ User-agent: omgili
 User-agent: omgilibot
 User-agent: PanguBot
 User-agent: PerplexityBot
+User-agent: Perplexity‑User
 User-agent: PetalBot
 User-agent: Scrapy
 User-agent: SemrushBot-OCOB
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index ce82047..0cc2264 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -36,7 +36,8 @@
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
 | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot |
-| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. |
+| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |
+| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |

From 6851413c52b91b9729bbbfd75f84af364b490bde Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Thu, 27 Mar 2025 19:49:15 +0000
Subject: [PATCH 025/111] Merge pull request #94 from
 ThomasLeister/feature/implement-nginx-configuration-snippet-export

Implement Nginx configuration snippet export
---
 nginx-block-ai-bots.conf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index ce30520..72d65ec 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file

From ec18af76242c1b62bbbfc7e1df72098b423402a6 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Thu, 27 Mar 2025 12:51:22 -0700
Subject: [PATCH 026/111] Revert "Merge pull request #91 from
 deyigifts/perplexity-user"

This reverts commit 68d1d93714bbe4931811f301c7030ca979d95b39.
---
 .htaccess               | 2 +-
 robots.txt              | 1 -
 table-of-bot-metrics.md | 3 +--
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/.htaccess b/.htaccess
index 2f5d0e4..2313293 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/robots.txt b/robots.txt
index 8c79fc2..80c40e8 100644
--- a/robots.txt
+++ b/robots.txt
@@ -35,7 +35,6 @@ User-agent: omgili
 User-agent: omgilibot
 User-agent: PanguBot
 User-agent: PerplexityBot
-User-agent: Perplexity‑User
 User-agent: PetalBot
 User-agent: Scrapy
 User-agent: SemrushBot-OCOB
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 0cc2264..ce82047 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -36,8 +36,7 @@
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
 | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot |
-| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |
-| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
+| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |

From c249de99a317b54e8891f1682dbf514e7763986e Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Fri, 28 Mar 2025 00:54:28 +0000
Subject: [PATCH 027/111] Update from Dark Visitors

---
 robots.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/robots.json b/robots.json
index eaac816..e907c8b 100644
--- a/robots.json
+++ b/robots.json
@@ -258,7 +258,7 @@
         "frequency": "No information.",
         "description": "Crawls sites to surface as results in Perplexity."
     },
-    "Perplexity‑User": {
+    "Perplexity\u2011User": {
         "operator": "[Perplexity](https://www.perplexity.ai/)",
         "respect": "[No](https://docs.perplexity.ai/guides/bots)",
         "function": "Used to answer queries at the request of users.",
@@ -328,4 +328,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From 5b8650b99b35ff2aa1aa9ae26183b312edc48d45 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Sat, 29 Mar 2025 00:54:10 +0000
Subject: [PATCH 028/111] Update from Dark Visitors

---
 .htaccess               | 2 +-
 robots.txt              | 1 +
 table-of-bot-metrics.md | 3 ++-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.htaccess b/.htaccess
index 2313293..2f5d0e4 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/robots.txt b/robots.txt
index 80c40e8..8c79fc2 100644
--- a/robots.txt
+++ b/robots.txt
@@ -35,6 +35,7 @@ User-agent: omgili
 User-agent: omgilibot
 User-agent: PanguBot
 User-agent: PerplexityBot
+User-agent: Perplexity‑User
 User-agent: PetalBot
 User-agent: Scrapy
 User-agent: SemrushBot-OCOB
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index ce82047..0cc2264 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -36,7 +36,8 @@
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
 | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot |
-| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. |
+| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |
+| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |

From ae8f74c10cec97ec758bf39345ff20717302c665 Mon Sep 17 00:00:00 2001
From: Kyle Buckingham <kyle@getsphere.ai>
Date: Tue, 1 Apr 2025 15:22:04 -0700
Subject: [PATCH 029/111] Update robots.json

---
 robots.json | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/robots.json b/robots.json
index e907c8b..f711d43 100644
--- a/robots.json
+++ b/robots.json
@@ -69,13 +69,6 @@
         "frequency": "Only when prompted by a user.",
         "description": "Used by plugins in ChatGPT to answer queries based on user input."
     },
-    "Claude-Web": {
-        "operator": "[Anthropic](https://www.anthropic.com)",
-        "respect": "Unclear at this time.",
-        "function": "Scrapes data to train Anthropic's AI products.",
-        "frequency": "No information provided.",
-        "description": "Scrapes data to train LLMs and AI products offered by Anthropic."
-    },
     "ClaudeBot": {
         "operator": "[Anthropic](https://www.anthropic.com)",
         "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)",
@@ -83,6 +76,20 @@
         "frequency": "No information provided.",
         "description": "Scrapes data to train LLMs and AI products offered by Anthropic."
     },
+    "Claude-User": {
+        "operator": "[Anthropic](https://www.anthropic.com)",
+        "respect": "Unclear at this time.",
+        "function": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent.",
+        "frequency": "No information provided.",
+        "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent."
+    },
+    "Claude-SearchBot": {
+        "operator": "[Anthropic](https://www.anthropic.com)",
+        "respect": "Unclear at this time.",
+        "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.",
+        "frequency": "No information provided.",
+        "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses."
+    },
     "cohere-ai": {
         "operator": "[Cohere](https://cohere.com)",
         "respect": "Unclear at this time.",
@@ -328,4 +335,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 8dc36aa2e2bbc9b99bde043b593cdc6c9669f401 Mon Sep 17 00:00:00 2001
From: Kyle Buckingham <kyle@getsphere.ai>
Date: Tue, 1 Apr 2025 15:23:28 -0700
Subject: [PATCH 030/111] Update robots.txt

---
 robots.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/robots.txt b/robots.txt
index 8c79fc2..e19468d 100644
--- a/robots.txt
+++ b/robots.txt
@@ -8,8 +8,9 @@ User-agent: Brightbot 1.0
 User-agent: Bytespider
 User-agent: CCBot
 User-agent: ChatGPT-User
-User-agent: Claude-Web
 User-agent: ClaudeBot
+User-agent: Claude-User
+User-agent: Claude-SearchBot
 User-agent: cohere-ai
 User-agent: cohere-training-data-crawler
 User-agent: Crawlspace

From 6b0349f37ddf69ef9ec0e09a884b351f4a0e4b43 Mon Sep 17 00:00:00 2001
From: Frederic Barthelemy <git@fbartho.com>
Date: Fri, 4 Apr 2025 15:20:30 -0700
Subject: [PATCH 031/111] Fix f-string SyntaxError on Python < 3.12 (nested double quotes)

```
python code/tests.py
Traceback (most recent call last):
  File "/Users/fbarthelemy/Code/ai.robots.txt/code/tests.py", line 7, in <module>
    from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx
  File "/Users/fbarthelemy/Code/ai.robots.txt/code/robots.py", line 144
    return f"({"|".join(map(re.escape, lst))})"
                ^
SyntaxError: f-string: expecting '}'
```
---
 code/robots.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/code/robots.py b/code/robots.py
index f58f2b8..90c0e8c 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -141,7 +141,8 @@ def json_to_table(robots_json):
 def list_to_pcre(lst):
     # Python re is not 100% identical to PCRE which is used by Apache, but it
     # should probably be close enough in the real world for re.escape to work.
-    return f"({"|".join(map(re.escape, lst))})"
+    formatted = "|".join(map(re.escape, lst))
+    return f"({formatted})"
 
 
 def json_to_htaccess(robot_json):

From 5f5a89c38c27b676c3212f6ea3895d31f315f37e Mon Sep 17 00:00:00 2001
From: Frederic Barthelemy <git@fbartho.com>
Date: Fri, 4 Apr 2025 17:34:14 -0700
Subject: [PATCH 032/111] Fix HTML-mangled hyphen in Perplexity-User

Fixes: #99
---
 .htaccess                                |  2 +-
 code/robots.py                           | 15 +++++++++++++++
 code/test_files/.htaccess                |  2 +-
 code/test_files/nginx-block-ai-bots.conf |  2 +-
 code/test_files/robots.json              |  7 +++++++
 code/test_files/robots.txt               |  1 +
 code/test_files/table-of-bot-metrics.md  |  1 +
 code/tests.py                            |  5 +++++
 nginx-block-ai-bots.conf                 |  2 +-
 robots.json                              | 14 +++++++-------
 robots.txt                               |  2 +-
 table-of-bot-metrics.md                  |  2 +-
 12 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/.htaccess b/.htaccess
index 2f5d0e4..27a7e11 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/code/robots.py b/code/robots.py
index 90c0e8c..d158b36 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -50,6 +50,7 @@ def updated_robots_json(soup):
             continue
         for agent in section.find_all("a", href=True):
             name = agent.find("div", {"class": "agent-name"}).get_text().strip()
+            name = clean_robot_name(name)
             desc = agent.find("p").get_text().strip()
 
             default_values = {
@@ -101,6 +102,20 @@ def updated_robots_json(soup):
     return sorted_robots
 
 
+def clean_robot_name(name):
+    """ Clean the robot name by removing some characters that were mangled by html software once. """
+    # This was specifically spotted in "Perplexity-User"
+    # Looks like a non-breaking hyphen introduced by the HTML rendering software
+    # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots
+    # You can see the bot is listed several times as "Perplexity‑User" with a normal hyphen, 
+    # and it's only the Row-Heading that has the special hyphen
+    # 
+    # Technically, there's no reason there wouldn't someday be a bot that 
+    # actually uses a non-breaking hyphen, but that seems unlikely,
+    # so this solution should be fine for now.
+    return re.sub(r"\u2011", "-", name)
+
+
 def ingest_darkvisitors():
     old_robots_json = load_robots_json()
     soup = get_agent_soup()
diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess
index 7e39092..f0d6783 100644
--- a/code/test_files/.htaccess
+++ b/code/test_files/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/code/test_files/nginx-block-ai-bots.conf b/code/test_files/nginx-block-ai-bots.conf
index d1b559e..c569b15 100644
--- a/code/test_files/nginx-block-ai-bots.conf
+++ b/code/test_files/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") {
     return 403;
 }
\ No newline at end of file
diff --git a/code/test_files/robots.json b/code/test_files/robots.json
index b0cbfbb..385f284 100644
--- a/code/test_files/robots.json
+++ b/code/test_files/robots.json
@@ -223,6 +223,13 @@
         "operator": "[Webz.io](https://webz.io/)",
         "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)"
     },
+    "Perplexity-User": {
+        "operator": "[Perplexity](https://www.perplexity.ai/)",
+        "respect": "[No](https://docs.perplexity.ai/guides/bots)",
+        "function": "Used to answer queries at the request of users.",
+        "frequency": "Only when prompted by a user.",
+        "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
+    },
     "PerplexityBot": {
         "operator": "[Perplexity](https://www.perplexity.ai/)",
         "respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)",
diff --git a/code/test_files/robots.txt b/code/test_files/robots.txt
index 03c3c25..ee201f8 100644
--- a/code/test_files/robots.txt
+++ b/code/test_files/robots.txt
@@ -30,6 +30,7 @@ User-agent: Meta-ExternalFetcher
 User-agent: OAI-SearchBot
 User-agent: omgili
 User-agent: omgilibot
+User-agent: Perplexity-User
 User-agent: PerplexityBot
 User-agent: PetalBot
 User-agent: Scrapy
diff --git a/code/test_files/table-of-bot-metrics.md b/code/test_files/table-of-bot-metrics.md
index 88af6c0..9b280aa 100644
--- a/code/test_files/table-of-bot-metrics.md
+++ b/code/test_files/table-of-bot-metrics.md
@@ -32,6 +32,7 @@
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
+| Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
diff --git a/code/tests.py b/code/tests.py
index 61d69b4..f58b445 100755
--- a/code/tests.py
+++ b/code/tests.py
@@ -60,6 +60,11 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
 
+class TestRobotsNameCleaning(unittest.TestCase):
+    def test_clean_name(self):
+        from robots import clean_robot_name
+
+        self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
 
 if __name__ == "__main__":
     import os
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 72d65ec..0577bd9 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.json b/robots.json
index e907c8b..8fd7572 100644
--- a/robots.json
+++ b/robots.json
@@ -251,6 +251,13 @@
         "frequency": "Unclear at this time.",
         "description": "PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot"
     },
+    "Perplexity-User": {
+        "operator": "[Perplexity](https://www.perplexity.ai/)",
+        "respect": "[No](https://docs.perplexity.ai/guides/bots)",
+        "function": "Used to answer queries at the request of users.",
+        "frequency": "Only when prompted by a user.",
+        "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
+    },
     "PerplexityBot": {
         "operator": "[Perplexity](https://www.perplexity.ai/)",
         "respect": "[Yes](https://docs.perplexity.ai/guides/bots)",
@@ -258,13 +265,6 @@
         "frequency": "No information.",
         "description": "Crawls sites to surface as results in Perplexity."
     },
-    "Perplexity\u2011User": {
-        "operator": "[Perplexity](https://www.perplexity.ai/)",
-        "respect": "[No](https://docs.perplexity.ai/guides/bots)",
-        "function": "Used to answer queries at the request of users.",
-        "frequency": "Only when prompted by a user.",
-        "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response."
-    },
     "PetalBot": {
         "description": "Operated by Huawei to provide search and AI assistant services.",
         "frequency": "No explicit frequency provided.",
diff --git a/robots.txt b/robots.txt
index 8c79fc2..c531918 100644
--- a/robots.txt
+++ b/robots.txt
@@ -34,8 +34,8 @@ User-agent: OAI-SearchBot
 User-agent: omgili
 User-agent: omgilibot
 User-agent: PanguBot
+User-agent: Perplexity-User
 User-agent: PerplexityBot
-User-agent: Perplexity‑User
 User-agent: PetalBot
 User-agent: Scrapy
 User-agent: SemrushBot-OCOB
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 0cc2264..d92df34 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -36,8 +36,8 @@
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
 | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot |
+| Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |
-| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |

From c6f308cbd0a00166f5085fa4adc98630c767e11e Mon Sep 17 00:00:00 2001
From: Frederic Barthelemy <git@fbartho.com>
Date: Sat, 5 Apr 2025 09:01:52 -0700
Subject: [PATCH 033/111] PR Feedback: log special-case, comment consistency

---
 code/robots.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/code/robots.py b/code/robots.py
index d158b36..86ea413 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -107,13 +107,16 @@ def clean_robot_name(name):
     # This was specifically spotted in "Perplexity-User"
     # Looks like a non-breaking hyphen introduced by the HTML rendering software
     # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots
-    # You can see the bot is listed several times as "Perplexity‑User" with a normal hyphen, 
+    # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen, 
     # and it's only the Row-Heading that has the special hyphen
     # 
     # Technically, there's no reason there wouldn't someday be a bot that 
     # actually uses a non-breaking hyphen, but that seems unlikely,
     # so this solution should be fine for now.
-    return re.sub(r"\u2011", "-", name)
+    result = re.sub(r"\u2011", "-", name)
+    if result != name:
+        print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.")
+    return result
 
 
 def ingest_darkvisitors():

From b65f45e408461560a32f44f05860f80655737467 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Thu, 10 Apr 2025 10:12:51 -0700
Subject: [PATCH 034/111] chore(robots.json): adds imgproxy crawler

---
 robots.json | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 8fd7572..4c9f7d7 100644
--- a/robots.json
+++ b/robots.json
@@ -195,6 +195,13 @@
         "operator": "[img2dataset](https://github.com/rom1504/img2dataset)",
         "respect": "Unclear at this time."
     },
+    "imgproxy": {
+        "frequency": "No information.",
+        "function": "Not documented or explained on operator's site.",
+        "operator": "[imgproxy](https://imgproxy.net)",
+        "respect": "Unclear at this time.",
+        "description": "AI-powered image processing."
+    },
     "ISSCyberRiskCrawler": {
         "description": "Used to train machine learning based models to quantify cyber risk.",
         "frequency": "No information.",
@@ -328,4 +335,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 4a764bba18f10167cb5f7107c8721e5dc208100f Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Thu, 10 Apr 2025 19:22:34 +0000
Subject: [PATCH 035/111] Merge pull request #102 from
 ai-robots-txt/imgproxy-bot

chore(robots.json): adds imgproxy crawler
---
 .htaccess                | 2 +-
 nginx-block-ai-bots.conf | 2 +-
 robots.txt               | 1 +
 table-of-bot-metrics.md  | 1 +
 4 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.htaccess b/.htaccess
index 27a7e11..c0e5fbb 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 0577bd9..a6bbfa2 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index c531918..de25a56 100644
--- a/robots.txt
+++ b/robots.txt
@@ -26,6 +26,7 @@ User-agent: iaskspider/2.0
 User-agent: ICC-Crawler
 User-agent: ImagesiftBot
 User-agent: img2dataset
+User-agent: imgproxy
 User-agent: ISSCyberRiskCrawler
 User-agent: Kangaroo Bot
 User-agent: Meta-ExternalAgent
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index d92df34..b3e51fe 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -28,6 +28,7 @@
 | ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. |
 | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. |
 | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. |
+| imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. |
 | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. |
 | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
 | Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |

From 305188b2e78855d4e7193f29a3e7205f96fa86f6 Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Fri, 11 Apr 2025 00:55:52 +0000
Subject: [PATCH 036/111] Update from Dark Visitors

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 4c9f7d7..eff38ac 100644
--- a/robots.json
+++ b/robots.json
@@ -335,4 +335,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From d9f882a9b21170754c4b37ff1bbc237171876684 Mon Sep 17 00:00:00 2001
From: Joshua Sheard <mail@jsheard.com>
Date: Mon, 14 Apr 2025 15:46:01 +0100
Subject: [PATCH 037/111] Include "AI Agents" from Dark Visitors

---
 code/robots.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/code/robots.py b/code/robots.py
index 86ea413..8a06b55 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -30,6 +30,7 @@ def updated_robots_json(soup):
     """Update AI scraper information with data from darkvisitors."""
     existing_content = load_robots_json()
     to_include = [
+        "AI Agents",
         "AI Assistants",
         "AI Data Scrapers",
         "AI Search Crawlers",

From a96e33098975edf1c05c8d9684b36b9fa31f7ef2 Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Tue, 15 Apr 2025 00:57:01 +0000
Subject: [PATCH 038/111] Update from Dark Visitors

---
 robots.json | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/robots.json b/robots.json
index eff38ac..8bba6b2 100644
--- a/robots.json
+++ b/robots.json
@@ -230,6 +230,13 @@
         "frequency": "Unclear at this time.",
         "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher"
     },
+    "NovaAct": {
+        "operator": "Unclear at this time.",
+        "respect": "Unclear at this time.",
+        "function": "AI Agents",
+        "frequency": "Unclear at this time.",
+        "description": "Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact"
+    },
     "OAI-SearchBot": {
         "operator": "[OpenAI](https://openai.com)",
         "respect": "[Yes](https://platform.openai.com/docs/bots)",
@@ -251,6 +258,13 @@
         "operator": "[Webz.io](https://webz.io/)",
         "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)"
     },
+    "Operator": {
+        "operator": "Unclear at this time.",
+        "respect": "Unclear at this time.",
+        "function": "AI Agents",
+        "frequency": "Unclear at this time.",
+        "description": "Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator"
+    },
     "PanguBot": {
         "operator": "the Chinese company Huawei",
         "respect": "Unclear at this time.",

From e0cdb278fbd243f554579fe5050850f124b286a8 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Wed, 16 Apr 2025 00:57:11 +0000
Subject: [PATCH 039/111] Update from Dark Visitors

---
 .htaccess                | 2 +-
 nginx-block-ai-bots.conf | 2 +-
 robots.txt               | 2 ++
 table-of-bot-metrics.md  | 2 ++
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.htaccess b/.htaccess
index c0e5fbb..d10e796 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index a6bbfa2..c37cef5 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index de25a56..1e3aa80 100644
--- a/robots.txt
+++ b/robots.txt
@@ -31,9 +31,11 @@ User-agent: ISSCyberRiskCrawler
 User-agent: Kangaroo Bot
 User-agent: Meta-ExternalAgent
 User-agent: Meta-ExternalFetcher
+User-agent: NovaAct
 User-agent: OAI-SearchBot
 User-agent: omgili
 User-agent: omgilibot
+User-agent: Operator
 User-agent: PanguBot
 User-agent: Perplexity-User
 User-agent: PerplexityBot
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index b3e51fe..4c87b41 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -33,9 +33,11 @@
 | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
 | Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
 | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
+| NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact |
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
+| Operator | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator |
 | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot |
 | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |

From 4a6f37d72718aeb44d1d8cbcccb740ace3fe82d6 Mon Sep 17 00:00:00 2001
From: Kyle Buckingham <kyle@getsphere.ai>
Date: Wed, 16 Apr 2025 16:42:58 -0700
Subject: [PATCH 040/111] Update robots.json

Co-authored-by: Glyn Normington <work@underlap.org>
---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index f711d43..ba052ae 100644
--- a/robots.json
+++ b/robots.json
@@ -78,7 +78,7 @@
     },
     "Claude-User": {
         "operator": "[Anthropic](https://www.anthropic.com)",
-        "respect": "Unclear at this time.",
+        "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)",
         "function": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent.",
         "frequency": "No information provided.",
         "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent."

From fd41de8522536a25de71f37310a05e77d71a0792 Mon Sep 17 00:00:00 2001
From: Kyle Buckingham <kyle@getsphere.ai>
Date: Wed, 16 Apr 2025 16:43:03 -0700
Subject: [PATCH 041/111] Update robots.json

Co-authored-by: Glyn Normington <work@underlap.org>
---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index ba052ae..ca6fc40 100644
--- a/robots.json
+++ b/robots.json
@@ -85,7 +85,7 @@
     },
     "Claude-SearchBot": {
         "operator": "[Anthropic](https://www.anthropic.com)",
-        "respect": "Unclear at this time.",
+        "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)",
         "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.",
         "frequency": "No information provided.",
         "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses."

From d05ede8fe164c5c3f47acecc1343e6d2ea5c294b Mon Sep 17 00:00:00 2001
From: Glyn Normington <glyn.normington@gmail.com>
Date: Fri, 18 Apr 2025 17:46:56 +0100
Subject: [PATCH 042/111] Clarify our position on sponsorship

Some firms, including those with .ai domains, have
offered to sponsor this project. So make our position
clear.
---
 FAQ.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/FAQ.md b/FAQ.md
index 967cf41..487f784 100644
--- a/FAQ.md
+++ b/FAQ.md
@@ -55,3 +55,7 @@ That depends on your stack.
 ## How can I contribute?
 
 Open a pull request. It will be reviewed and acted upon appropriately. **We really appreciate contributions** — this is a community effort.
+
+## Can my company sponsor ai.robots.txt?
+
+No, thank you. We do not accept sponsorship of any kind. We prefer to maintain our independence. Our costs are negligible as we are entirely volunteer-based and community-driven.

From b1856e6988a93bd834b228f121fa3524d11c7be7 Mon Sep 17 00:00:00 2001
From: Glyn Normington <glyn.normington@gmail.com>
Date: Fri, 18 Apr 2025 18:40:44 +0100
Subject: [PATCH 043/111] Donations

---
 FAQ.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/FAQ.md b/FAQ.md
index 487f784..7264819 100644
--- a/FAQ.md
+++ b/FAQ.md
@@ -56,6 +56,10 @@ That depends on your stack.
 
 Open a pull request. It will be reviewed and acted upon appropriately. **We really appreciate contributions** — this is a community effort.
 
+## I'd like to donate money
+
+That's kind of you, but we don't need your money. If you insist, we'd love you to make a donation to the [American Civil Liberties Union](https://www.aclu.org/), the [Disasters Emergency Committee](https://www.dec.org.uk/), or a similar organisation.
+
 ## Can my company sponsor ai.robots.txt?
 
 No, thank you. We do not accept sponsorship of any kind. We prefer to maintain our independence. Our costs are negligible as we are entirely volunteer-based and community-driven.

From 33c5ce1326367abecccad23742779783c10c36a1 Mon Sep 17 00:00:00 2001
From: Dennis Lee <github@denmail.co.uk>
Date: Mon, 21 Apr 2025 18:55:11 +0100
Subject: [PATCH 044/111] Update robots.json

Updated robots list with five new proposed AI bots:

aiHitBot
Cotoyogi
Factset_spyderbot
FirecrawlAgent
TikTokSpider
---
 robots.json | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 8bba6b2..698b31e 100644
--- a/robots.json
+++ b/robots.json
@@ -13,6 +13,13 @@
         "operator": "[Ai2](https://allenai.org/crawler)",
         "respect": "Yes"
     },
+    "aiHitBot": {
+        "operator": "[aiHit](https://www.aihitdata.com/about)",
+        "respect": "Yes",
+        "function": "A massive, artificial intelligence/machine learning, automated system.",
+        "frequency": "No information provided.",
+        "description": "Scrapes data for AI systems."
+    },
     "Amazonbot": {
         "operator": "Amazon",
         "respect": "Yes",
@@ -97,6 +104,13 @@
         "frequency": "Unclear at this time.",
         "description": "cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler"
     },
+    "Cotoyogi": {
+        "operator": "[ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/)",
+        "respect": "Yes",
+        "function": "AI LLM Scraper.",
+        "frequency": "No information provided.",
+        "description": "Scrapes data for AI training in Japanese language."
+    },
     "Crawlspace": {
         "operator": "[Crawlspace](https://crawlspace.dev)",
         "respect": "[Yes](https://news.ycombinator.com/item?id=42756654)",
@@ -125,6 +139,20 @@
         "frequency": "Up to 1 page per second",
         "description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically."
     },
+    "Factset_spyderbot": {
+        "operator": "[Factset](https://www.factset.com/ai)",
+        "respect": "Unclear at this time.",
+        "function": "AI model training.",
+        "frequency": "No information provided.",
+        "description": "Scrapes data for AI training."
+    },
+    "FirecrawlAgent": {
+        "operator": "[Firecrawl](https://www.firecrawl.dev/)",
+        "respect": "Yes",
+        "function": "AI scraper and LLM training",
+        "frequency": "No information provided.",
+        "description": "Scrapes data for AI systems and LLM training."
+    },
     "FriendlyCrawler": {
         "description": "Unclear who the operator is; but data is used for training/machine learning.",
         "frequency": "Unclear at this time.",
@@ -321,6 +349,13 @@
         "operator": "[Sidetrade](https://www.sidetrade.com)",
         "respect": "Unclear at this time."
     },
+    "TikTokSpider": {
+        "operator": "ByteDance",
+        "respect": "Unclear at this time.",
+        "function": "LLM training.",
+        "frequency": "Unclear at this time.",
+        "description": "Downloads data to train LLMS, as per Bytespider."
+    },
     "Timpibot": {
         "operator": "[Timpi](https://timpi.io)",
         "respect": "Unclear at this time.",
@@ -349,4 +384,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From bbec639c14f3e7258729718dd2c6fc0b1734a9b1 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Tue, 22 Apr 2025 14:50:26 +0000
Subject: [PATCH 045/111] Merge pull request #109 from dennislee1/patch-1

AI bots to consider adding
---
 .htaccess                | 2 +-
 nginx-block-ai-bots.conf | 2 +-
 robots.txt               | 5 +++++
 table-of-bot-metrics.md  | 5 +++++
 4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/.htaccess b/.htaccess
index d10e796..b4ab72f 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index c37cef5..090275a 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 1e3aa80..1f8eaf2 100644
--- a/robots.txt
+++ b/robots.txt
@@ -1,5 +1,6 @@
 User-agent: AI2Bot
 User-agent: Ai2Bot-Dolma
+User-agent: aiHitBot
 User-agent: Amazonbot
 User-agent: anthropic-ai
 User-agent: Applebot
@@ -12,10 +13,13 @@ User-agent: Claude-Web
 User-agent: ClaudeBot
 User-agent: cohere-ai
 User-agent: cohere-training-data-crawler
+User-agent: Cotoyogi
 User-agent: Crawlspace
 User-agent: Diffbot
 User-agent: DuckAssistBot
 User-agent: FacebookBot
+User-agent: Factset_spyderbot
+User-agent: FirecrawlAgent
 User-agent: FriendlyCrawler
 User-agent: Google-Extended
 User-agent: GoogleOther
@@ -44,6 +48,7 @@ User-agent: Scrapy
 User-agent: SemrushBot-OCOB
 User-agent: SemrushBot-SWA
 User-agent: Sidetrade indexer bot
+User-agent: TikTokSpider
 User-agent: Timpibot
 User-agent: VelenPublicWebCrawler
 User-agent: Webzio-Extended
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 4c87b41..0249766 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -2,6 +2,7 @@
 |------|----------|-----------------------|----------|------------------|-------------|
 | AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. |
 | Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. |
+| aiHitBot | [aiHit](https://www.aihitdata.com/about) | Yes | A massive, artificial intelligence/machine learning, automated system. | No information provided. | Scrapes data for AI systems. |
 | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. |
 | anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot |
@@ -14,10 +15,13 @@
 | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. |
 | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler |
+| Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. |
 | Crawlspace | [Crawlspace](https://crawlspace.dev) | [Yes](https://news.ycombinator.com/item?id=42756654) | Scrapes data | Unclear at this time. | Provides crawling services for any purpose, probably including AI model training. |
 | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. |
 | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot |
 | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. |
+| Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. |
+| FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. |
 | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. |
 | Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. |
 | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
@@ -46,6 +50,7 @@
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
 | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
 | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. |
+| TikTokSpider | ByteDance | Unclear at this time. | LLM training. | Unclear at this time. | Downloads data to train LLMS, as per Bytespider. |
 | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. |
 | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." |
 | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended |

From 8d25a424d96dbf4a3cb12d4bb51929a764aa8f89 Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Wed, 23 Apr 2025 00:56:52 +0000
Subject: [PATCH 046/111] Update from Dark Visitors

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 698b31e..df9dcda 100644
--- a/robots.json
+++ b/robots.json
@@ -384,4 +384,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From 9d846ced45cdd13c7ecc03353c6ec554b5f9015d Mon Sep 17 00:00:00 2001
From: maia <iam@maia.lgbt>
Date: Thu, 24 Apr 2025 04:08:20 +0200
Subject: [PATCH 047/111] Update robots.json

Lowercase the meta-external* keys, as the title-case names were not technically the UAs for the bots; also removed a period in the "respect" field for consistency.
---
 robots.json | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/robots.json b/robots.json
index df9dcda..f2cec91 100644
--- a/robots.json
+++ b/robots.json
@@ -244,14 +244,14 @@
         "frequency": "Unclear at this time.",
         "description": "Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot"
     },
-    "Meta-ExternalAgent": {
+    "meta-externalagent": {
         "operator": "[Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)",
-        "respect": "Yes.",
+        "respect": "Yes",
         "function": "Used to train models and improve products.",
         "frequency": "No information.",
         "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\""
     },
-    "Meta-ExternalFetcher": {
+    "meta-externalfetcher": {
         "operator": "Unclear at this time.",
         "respect": "Unclear at this time.",
         "function": "AI Assistants",
@@ -384,4 +384,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 4654e14e9c857a228289d3258835182838202503 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Thu, 24 Apr 2025 07:00:34 +0000
Subject: [PATCH 048/111] Merge pull request #112 from maiavixen/main

Fixed meta-external* being titlecase, and removed period for consistency
---
 .htaccess                | 2 +-
 nginx-block-ai-bots.conf | 2 +-
 robots.txt               | 4 ++--
 table-of-bot-metrics.md  | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.htaccess b/.htaccess
index b4ab72f..a97f98a 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 090275a..3320071 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 1f8eaf2..53291ca 100644
--- a/robots.txt
+++ b/robots.txt
@@ -33,8 +33,8 @@ User-agent: img2dataset
 User-agent: imgproxy
 User-agent: ISSCyberRiskCrawler
 User-agent: Kangaroo Bot
-User-agent: Meta-ExternalAgent
-User-agent: Meta-ExternalFetcher
+User-agent: meta-externalagent
+User-agent: meta-externalfetcher
 User-agent: NovaAct
 User-agent: OAI-SearchBot
 User-agent: omgili
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 0249766..5c093b8 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -35,8 +35,8 @@
 | imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. |
 | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. |
 | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
-| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
-| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
+| meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
+| meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
 | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact |
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |

From 934ac7b31864d0cf0b21bb1580ddea97ec8b4994 Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Fri, 25 Apr 2025 00:56:57 +0000
Subject: [PATCH 049/111] Update from Dark Visitors

---
 robots.json | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index f2cec91..e15a196 100644
--- a/robots.json
+++ b/robots.json
@@ -251,6 +251,13 @@
         "frequency": "No information.",
         "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\""
     },
+    "Meta-ExternalAgent": {
+        "operator": "Unclear at this time.",
+        "respect": "Unclear at this time.",
+        "function": "AI Data Scrapers",
+        "frequency": "Unclear at this time.",
+        "description": "Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent"
+    },
     "meta-externalfetcher": {
         "operator": "Unclear at this time.",
         "respect": "Unclear at this time.",
@@ -258,6 +265,13 @@
         "frequency": "Unclear at this time.",
         "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher"
     },
+    "Meta-ExternalFetcher": {
+        "operator": "Unclear at this time.",
+        "respect": "Unclear at this time.",
+        "function": "AI Assistants",
+        "frequency": "Unclear at this time.",
+        "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher"
+    },
     "NovaAct": {
         "operator": "Unclear at this time.",
         "respect": "Unclear at this time.",
@@ -384,4 +398,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From c6c7f1748f1e28053184539a70a6a08f5aeabc37 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Sat, 26 Apr 2025 00:55:12 +0000
Subject: [PATCH 050/111] Update from Dark Visitors

---
 .htaccess                | 2 +-
 nginx-block-ai-bots.conf | 2 +-
 robots.txt               | 2 ++
 table-of-bot-metrics.md  | 2 ++
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.htaccess b/.htaccess
index a97f98a..586adab 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 3320071..fc58d61 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 53291ca..232e119 100644
--- a/robots.txt
+++ b/robots.txt
@@ -34,7 +34,9 @@ User-agent: imgproxy
 User-agent: ISSCyberRiskCrawler
 User-agent: Kangaroo Bot
 User-agent: meta-externalagent
+User-agent: Meta-ExternalAgent
 User-agent: meta-externalfetcher
+User-agent: Meta-ExternalFetcher
 User-agent: NovaAct
 User-agent: OAI-SearchBot
 User-agent: omgili
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 5c093b8..4dd6076 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -36,7 +36,9 @@
 | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. |
 | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
 | meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
+| Meta\-ExternalAgent | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent |
 | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
+| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
 | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact |
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |

From 50e739dd738bb821018a863491b770dd8ee61155 Mon Sep 17 00:00:00 2001
From: Rik Wijnen <r.wijnen@maastrichtuniversity.nl>
Date: Mon, 28 Apr 2025 08:42:52 +0200
Subject: [PATCH 051/111] HAProxy converter added.

---
 README.md                 |  9 +++++++
 code/robots.py            |  9 +++++++
 haproxy-block-ai-bots.txt | 57 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+)
 create mode 100644 haproxy-block-ai-bots.txt

diff --git a/README.md b/README.md
index b984672..1f1eff6 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ This repository provides the following files:
 - `robots.txt`
 - `.htaccess`
 - `nginx-block-ai-bots.conf`
+- `haproxy-block-ai-bots.txt`
 
 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
 
@@ -22,6 +23,14 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
 
 `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
 
+`haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it;
+1. Add the file to the config directory of HAProxy
+2. Add the following lines in the `frontend` section;
+   ```
+   acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt
+   http-request deny if ai_robot
+   ```
+   (Note that the path of the `haproxy-block-ai-bots.txt` may be different on your environment.)
 
 ## Contributing
 
diff --git a/code/robots.py b/code/robots.py
index 8a06b55..da157c1 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -178,6 +178,11 @@ def json_to_nginx(robot_json):
     config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
     return config
 
+def json_to_haproxy(robots_json):
+    # Creates a source file for HAProxy. Follow instructions in the README to implement it.
+    txt = "\n".join(f"{k}" for k in robots_json.keys())
+    return txt
+
 
 def update_file_if_changed(file_name, converter):
     """Update files if newer content is available and log the (in)actions."""
@@ -208,6 +213,10 @@ def conversions():
         file_name="./nginx-block-ai-bots.conf",
         converter=json_to_nginx,
     )
+    update_file_if_changed(
+        file_name="./haproxy-block-ai-bots.txt",
+        converter=json_to_haproxy,
+    )
 
 
 if __name__ == "__main__":
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
new file mode 100644
index 0000000..3c326bd
--- /dev/null
+++ b/haproxy-block-ai-bots.txt
@@ -0,0 +1,57 @@
+AI2Bot
+Ai2Bot-Dolma
+aiHitBot
+Amazonbot
+anthropic-ai
+Applebot
+Applebot-Extended
+Brightbot 1.0
+Bytespider
+CCBot
+ChatGPT-User
+Claude-Web
+ClaudeBot
+cohere-ai
+cohere-training-data-crawler
+Cotoyogi
+Crawlspace
+Diffbot
+DuckAssistBot
+FacebookBot
+Factset_spyderbot
+FirecrawlAgent
+FriendlyCrawler
+Google-Extended
+GoogleOther
+GoogleOther-Image
+GoogleOther-Video
+GPTBot
+iaskspider/2.0
+ICC-Crawler
+ImagesiftBot
+img2dataset
+imgproxy
+ISSCyberRiskCrawler
+Kangaroo Bot
+meta-externalagent
+Meta-ExternalAgent
+meta-externalfetcher
+Meta-ExternalFetcher
+NovaAct
+OAI-SearchBot
+omgili
+omgilibot
+Operator
+PanguBot
+Perplexity-User
+PerplexityBot
+PetalBot
+Scrapy
+SemrushBot-OCOB
+SemrushBot-SWA
+Sidetrade indexer bot
+TikTokSpider
+Timpibot
+VelenPublicWebCrawler
+Webzio-Extended
+YouBot
\ No newline at end of file

From 66da70905f503239faeb0e49204776f508928048 Mon Sep 17 00:00:00 2001
From: Rik Wijnen <r.wijnen@maastrichtuniversity.nl>
Date: Mon, 28 Apr 2025 09:09:40 +0200
Subject: [PATCH 052/111] Fixed incorrect English sentence.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1f1eff6..ff124e3 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
    acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt
    http-request deny if ai_robot
    ```
-   (Note that the path of the `haproxy-block-ai-bots.txt` may be different on your environment.)
+   (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.)
 
 ## Contributing
 

From a4a9f2ac2b9116d104789664231af4017d3828a7 Mon Sep 17 00:00:00 2001
From: Rik Wijnen <r.wijnen@maastrichtuniversity.nl>
Date: Mon, 28 Apr 2025 09:30:26 +0200
Subject: [PATCH 053/111] Tests for HAProxy file added.

---
 code/test_files/haproxy-block-ai-bots.txt | 47 +++++++++++++++++++++++
 code/tests.py                             | 12 +++++-
 2 files changed, 58 insertions(+), 1 deletion(-)
 create mode 100644 code/test_files/haproxy-block-ai-bots.txt

diff --git a/code/test_files/haproxy-block-ai-bots.txt b/code/test_files/haproxy-block-ai-bots.txt
new file mode 100644
index 0000000..5ed6939
--- /dev/null
+++ b/code/test_files/haproxy-block-ai-bots.txt
@@ -0,0 +1,47 @@
+AI2Bot
+Ai2Bot-Dolma
+Amazonbot
+anthropic-ai
+Applebot
+Applebot-Extended
+Bytespider
+CCBot
+ChatGPT-User
+Claude-Web
+ClaudeBot
+cohere-ai
+Diffbot
+FacebookBot
+facebookexternalhit
+FriendlyCrawler
+Google-Extended
+GoogleOther
+GoogleOther-Image
+GoogleOther-Video
+GPTBot
+iaskspider/2.0
+ICC-Crawler
+ImagesiftBot
+img2dataset
+ISSCyberRiskCrawler
+Kangaroo Bot
+Meta-ExternalAgent
+Meta-ExternalFetcher
+OAI-SearchBot
+omgili
+omgilibot
+Perplexity-User
+PerplexityBot
+PetalBot
+Scrapy
+Sidetrade indexer bot
+Timpibot
+VelenPublicWebCrawler
+Webzio-Extended
+YouBot
+crawler.with.dots
+star***crawler
+Is this a crawler?
+a[mazing]{42}(robot)
+2^32$
+curl|sudo bash
\ No newline at end of file
diff --git a/code/tests.py b/code/tests.py
index f58b445..e179c44 100755
--- a/code/tests.py
+++ b/code/tests.py
@@ -4,7 +4,7 @@
 import json
 import unittest
 
-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx
+from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy
 
 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -60,6 +60,16 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
 
+class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
+    maxDiff = 8192
+
+    def setUp(self):
+        self.robots_dict = self.loadJson("test_files/robots.json")
+
+    def test_haproxy_generation(self):
+        robots_haproxy = json_to_haproxy(self.robots_dict)
+        self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)
+
 class TestRobotsNameCleaning(unittest.TestCase):
     def test_clean_name(self):
         from robots import clean_robot_name

From 1310dbae4656e212ff01e7d8530d78c76dfd5a9f Mon Sep 17 00:00:00 2001
From: Crazyroostereye <63781667+Crazyroostereye1@users.noreply.github.com>
Date: Thu, 1 May 2025 12:21:32 +0200
Subject: [PATCH 054/111] Added a Caddyfile converter (#110)

Co-authored-by: Julian Beittel <julian@beittel.net>
Co-authored-by: Glyn Normington <work@underlap.org>
---
 Caddyfile                 |  3 +++
 README.md                 |  3 +++
 code/robots.py            | 12 ++++++++++++
 code/test_files/Caddyfile |  3 +++
 code/tests.py             | 13 ++++++++++++-
 5 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 Caddyfile
 create mode 100644 code/test_files/Caddyfile

diff --git a/Caddyfile b/Caddyfile
new file mode 100644
index 0000000..1857d75
--- /dev/null
+++ b/Caddyfile
@@ -0,0 +1,3 @@
+@aibots {
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index ff124e3..8d7bfb1 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ This repository provides the following files:
 - `robots.txt`
 - `.htaccess`
 - `nginx-block-ai-bots.conf`
+- `Caddyfile`
 - `haproxy-block-ai-bots.txt`
 
 `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)).
@@ -23,6 +24,8 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
 
 `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive.
 
+`Caddyfile` includes a Header Regex matcher group you can copy or import into your Caddyfile; the rejection can then be handled as follows: `abort @aibots`
+
 `haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it;
 1. Add the file to the config directory of HAProxy
 2. Add the following lines in the `frontend` section;
diff --git a/code/robots.py b/code/robots.py
index da157c1..054c2be 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -178,12 +178,20 @@ def json_to_nginx(robot_json):
     config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
     return config
 
+
+def json_to_caddy(robot_json):
+    caddyfile = "@aibots {\n    "
+    caddyfile += f'    header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"'
+    caddyfile += "\n}"
+    return caddyfile
+
 def json_to_haproxy(robots_json):
     # Creates a source file for HAProxy. Follow instructions in the README to implement it.
     txt = "\n".join(f"{k}" for k in robots_json.keys())
     return txt
 
 
+
 def update_file_if_changed(file_name, converter):
     """Update files if newer content is available and log the (in)actions."""
     new_content = converter(load_robots_json())
@@ -213,6 +221,10 @@ def conversions():
         file_name="./nginx-block-ai-bots.conf",
         converter=json_to_nginx,
     )
+    update_file_if_changed(
+        file_name="./Caddyfile",
+        converter=json_to_caddy
+      
     update_file_if_changed(
         file_name="./haproxy-block-ai-bots.txt",
         converter=json_to_haproxy,
diff --git a/code/test_files/Caddyfile b/code/test_files/Caddyfile
new file mode 100644
index 0000000..82f365a
--- /dev/null
+++ b/code/test_files/Caddyfile
@@ -0,0 +1,3 @@
+@aibots {
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)"
+}
\ No newline at end of file
diff --git a/code/tests.py b/code/tests.py
index e179c44..434406f 100755
--- a/code/tests.py
+++ b/code/tests.py
@@ -4,7 +4,7 @@
 import json
 import unittest
 
-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy
+from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy
 
 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -76,6 +76,17 @@ class TestRobotsNameCleaning(unittest.TestCase):
 
         self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
 
+class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
+    maxDiff = 8192
+
+    def setUp(self):
+        self.robots_dict = self.loadJson("test_files/robots.json")
+
+    def test_caddyfile_generation(self):
+        robots_caddyfile = json_to_caddy(self.robots_dict)
+        self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)
+
+
 if __name__ == "__main__":
     import os
     os.chdir(os.path.dirname(__file__))

From ec995cd686a09b4af1c6a59d95e1ced122f1d5fc Mon Sep 17 00:00:00 2001
From: Glyn Normington <glyn.normington@gmail.com>
Date: Thu, 1 May 2025 11:27:40 +0100
Subject: [PATCH 055/111] Fix Python syntax error

---
 code/robots.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/code/robots.py b/code/robots.py
index 054c2be..c795649 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -223,7 +223,8 @@ def conversions():
     )
     update_file_if_changed(
         file_name="./Caddyfile",
-        converter=json_to_caddy
+        converter=json_to_caddy,
+    )
       
     update_file_if_changed(
         file_name="./haproxy-block-ai-bots.txt",

From 678380727e8685af8c5311bcfa1f55c7aa866d3b Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Thu, 1 May 2025 10:29:06 +0000
Subject: [PATCH 056/111] Merge pull request #115 from glyn/syntax

Fix Python syntax error
---
 Caddyfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Caddyfile b/Caddyfile
index 1857d75..0e10cfa 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
 }
\ No newline at end of file

From 36a52a88d8e3832091d73062ef268acb46f6e031 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Mon, 12 May 2025 20:20:18 -0700
Subject: [PATCH 057/111] Bing AI opt-out instructions

---
 README.md                     |  2 ++
 docs/additional-steps/bing.md | 36 +++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)
 create mode 100644 docs/additional-steps/bing.md

diff --git a/README.md b/README.md
index 8d7bfb1..28ef743 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,8 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
    ```
    (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.)
 
+[Bing uses the data it crawls for AI and training, you may opt out by adding a `meta` tag to the `head` of your site.](./docs/additional-steps/bing.md)
+
 ## Contributing
 
 A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess` and `nginx-block-ai-bots.conf`.
diff --git a/docs/additional-steps/bing.md b/docs/additional-steps/bing.md
new file mode 100644
index 0000000..37c60c7
--- /dev/null
+++ b/docs/additional-steps/bing.md
@@ -0,0 +1,36 @@
+# Bing (bingbot)
+
+It's not well publicised, but Bing uses the data it crawls for AI and training.
+
+However, the current thinking is that blocking a search engine of this size using `robots.txt` is quite a drastic approach: Bing is second only to Google, and blocking it could significantly impact your website's visibility in search results.
+
+Additionally, Bing powers a number of search engines such as Yahoo and AOL, and its search results are also used in DuckDuckGo, amongst others.
+
+Fortunately, Bing supports a relatively simple opt-out method, requiring an additional step.
+
+## How to opt out of AI training
+
+You must add a metatag in the `<head>` of your webpage. This also needs to be added to every page on your website.
+
+The line you need to add is:
+
+```plaintext
+<meta name="robots" content="noarchive">
+```
+
+By adding this line, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models."
+
+## Will my site be negatively affected?
+
+Simple answer, no.
+The original use of "noarchive" has been retired by all search engines. Google retired its use in 2024.
+
+The use of this metatag will not impact your site in search engines or in any other meaningful way if you add it to your page(s).
+
+It is now solely used by a handful of crawlers, such as Bingbot and Amazonbot, to signify to them not to use your data for AI/training.
+
+## Resources
+
+Bing Blog AI opt-out announcement: https://blogs.bing.com/webmaster/september-2023/Announcing-new-options-for-webmasters-to-control-usage-of-their-content-in-Bing-Chat
+
+Bing metatag information, including AI opt-out: https://www.bing.com/webmasters/help/which-robots-metatags-does-bing-support-5198d240

From b4610a725cac409b5c686d0f68383ea3f6daa818 Mon Sep 17 00:00:00 2001
From: Florent Poinsaut <florent@poinsaut.fr>
Date: Wed, 14 May 2025 14:11:56 +0200
Subject: [PATCH 058/111] Add Traefik plugin

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index 8d7bfb1..80de135 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,12 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/
    ```
    (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.)
 
+### Related
+
+- [Robots.txt Traefik plugin](https://plugins.traefik.io/plugins/681b2f3fba3486128fc34fae/robots-txt-plugin):
+middleware plugin for [Traefik](https://traefik.io/traefik/) to automatically add rules of [robots.txt](./robots.txt)
+file on-the-fly.
+
 ## Contributing
 
 A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess` and `nginx-block-ai-bots.conf`.

From 9539256cb3116b626439bf79a776ea67b7aa2edd Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Wed, 14 May 2025 16:46:32 -0700
Subject: [PATCH 059/111] chore(robots.json): adds QualifiedBot crawler

---
 robots.json | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index e15a196..99e28d3 100644
--- a/robots.json
+++ b/robots.json
@@ -335,6 +335,13 @@
         "operator": "[Huawei](https://huawei.com/)",
         "respect": "Yes"
     },
+    "QualifiedBot": {
+      "description": "Operated by Qualified as part of their suite of AI product offerings.",
+      "frequency": "No explicit frequency provided.",
+      "function": "Company offers AI agents and other related products; usage can be assumed to support said products.",
+      "operator": "[Qualified](https://www.qualified.com)",
+      "respect": "Unclear at this time.",
+    },
     "Scrapy": {
         "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"",
         "frequency": "No information.",
@@ -398,4 +405,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 0c56b96fd99bfcc736c4f64c0df9bb87a1fc6075 Mon Sep 17 00:00:00 2001
From: Joe Hoyle <joe@humanmade.co.uk>
Date: Thu, 15 May 2025 11:26:47 -0400
Subject: [PATCH 060/111] Fix JSON syntax error

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 99e28d3..f518037 100644
--- a/robots.json
+++ b/robots.json
@@ -340,7 +340,7 @@
       "frequency": "No explicit frequency provided.",
       "function": "Company offers AI agents and other related products; usage can be assumed to support said products.",
       "operator": "[Qualified](https://www.qualified.com)",
-      "respect": "Unclear at this time.",
+      "respect": "Unclear at this time."
     },
     "Scrapy": {
         "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"",

From 1c470babbefed7b470443f6dd834e721c58481d6 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Thu, 15 May 2025 16:12:30 +0000
Subject: [PATCH 061/111] Merge pull request #123 from joehoyle/patch-1

Fix JSON syntax error
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 1 +
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 1 +
 table-of-bot-metrics.md   | 1 +
 6 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index 586adab..de88e50 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 0e10cfa..43ad3bf 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index 3c326bd..9770b45 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -46,6 +46,7 @@ PanguBot
 Perplexity-User
 PerplexityBot
 PetalBot
+QualifiedBot
 Scrapy
 SemrushBot-OCOB
 SemrushBot-SWA
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index fc58d61..afbd77c 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 232e119..dea4dd5 100644
--- a/robots.txt
+++ b/robots.txt
@@ -46,6 +46,7 @@ User-agent: PanguBot
 User-agent: Perplexity-User
 User-agent: PerplexityBot
 User-agent: PetalBot
+User-agent: QualifiedBot
 User-agent: Scrapy
 User-agent: SemrushBot-OCOB
 User-agent: SemrushBot-SWA
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 4dd6076..57469fa 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -48,6 +48,7 @@
 | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
+| QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
 | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |

From 498aa50760fe3850820b46933bf87b82918e5803 Mon Sep 17 00:00:00 2001
From: Patrick Evans <holysoles97@gmail.com>
Date: Thu, 15 May 2025 11:10:06 -0500
Subject: [PATCH 062/111] lint robots.json during pull requests

---
 .github/workflows/run-tests.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index c98861f..042cc13 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -19,3 +19,10 @@ jobs:
       - name: Run tests
         run: |
           code/tests.py
+  lint-json:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+      - name: JQ Json Lint
+        run: jq . robots.json

From 16d1de70943e9d448d3d5e02e91d86e38dac80d7 Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Fri, 16 May 2025 00:59:08 +0000
Subject: [PATCH 063/111] Update from Dark Visitors

---
 robots.json | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/robots.json b/robots.json
index f518037..f2ed4c2 100644
--- a/robots.json
+++ b/robots.json
@@ -336,11 +336,11 @@
         "respect": "Yes"
     },
     "QualifiedBot": {
-      "description": "Operated by Qualified as part of their suite of AI product offerings.",
-      "frequency": "No explicit frequency provided.",
-      "function": "Company offers AI agents and other related products; usage can be assumed to support said products.",
-      "operator": "[Qualified](https://www.qualified.com)",
-      "respect": "Unclear at this time."
+        "description": "Operated by Qualified as part of their suite of AI product offerings.",
+        "frequency": "No explicit frequency provided.",
+        "function": "Company offers AI agents and other related products; usage can be assumed to support said products.",
+        "operator": "[Qualified](https://www.qualified.com)",
+        "respect": "Unclear at this time."
     },
     "Scrapy": {
         "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"",
@@ -405,4 +405,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From 5fba0b746d550b6ae4d7c9605904b6ec102d0f98 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Thu, 15 May 2025 15:40:46 -0700
Subject: [PATCH 064/111] chore(robots.json): adds MistralAI-User/1.0 crawler

---
 robots.json | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/robots.json b/robots.json
index f518037..647e664 100644
--- a/robots.json
+++ b/robots.json
@@ -272,6 +272,13 @@
         "frequency": "Unclear at this time.",
         "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher"
     },
+    "MistralAI-User/1.0": {
+      "operator": "Mistral AI",
+      "function": "Takes action based on user prompts.",
+      "frequency": "Only when prompted by a user.",
+      "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.",
+      "respect": "Yes"
+    },
     "NovaAct": {
         "operator": "Unclear at this time.",
         "respect": "Unclear at this time.",

From ca918a963f735019a0c66343bf8338a9228d94f5 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Thu, 15 May 2025 21:16:49 -0700
Subject: [PATCH 065/111] chore(robots.json): adds Google-CloudVertexBot

---
 robots.json | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index f2ed4c2..b459533 100644
--- a/robots.json
+++ b/robots.json
@@ -160,6 +160,13 @@
         "operator": "Unknown",
         "respect": "[Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler)"
     },
+    "Google-CloudVertexBot": {
+        "operator": "Google",
+        "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)",
+        "function": "Build and manage AI models for businesses employing Vertex AI",
+        "frequency": "No information.",
+        "description": "Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents."
+    },
     "Google-Extended": {
         "operator": "Google",
         "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)",
@@ -405,4 +412,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From dd1ed174b77ca2c0c4a40d6f4bce6beda4a1c296 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Fri, 16 May 2025 11:35:15 +0000
Subject: [PATCH 066/111] Merge pull request #129 from
 ai-robots-txt/google-cloudvertexbot

chore(robots.json): adds Google-CloudVertexBot
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 1 +
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 1 +
 table-of-bot-metrics.md   | 1 +
 6 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index de88e50..b2204d7 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 43ad3bf..36fd20c 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index 9770b45..7389f10 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -21,6 +21,7 @@ FacebookBot
 Factset_spyderbot
 FirecrawlAgent
 FriendlyCrawler
+Google-CloudVertexBot
 Google-Extended
 GoogleOther
 GoogleOther-Image
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index afbd77c..f05f785 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index dea4dd5..a5be10b 100644
--- a/robots.txt
+++ b/robots.txt
@@ -21,6 +21,7 @@ User-agent: FacebookBot
 User-agent: Factset_spyderbot
 User-agent: FirecrawlAgent
 User-agent: FriendlyCrawler
+User-agent: Google-CloudVertexBot
 User-agent: Google-Extended
 User-agent: GoogleOther
 User-agent: GoogleOther-Image
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 57469fa..d8542b3 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -23,6 +23,7 @@
 | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. |
 | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. |
 | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. |
+| Google\-CloudVertexBot | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Build and manage AI models for businesses employing Vertex AI | No information. | Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents. |
 | Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. |
 | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |
 | GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." |

From 7a2e6cba52e782ab32552c2335637af761afbe51 Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Sat, 17 May 2025 00:57:28 +0000
Subject: [PATCH 067/111] Update from Dark Visitors

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index b459533..1ecfcd8 100644
--- a/robots.json
+++ b/robots.json
@@ -412,4 +412,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From 9297c7dfa3122109a6f3ae3ce18026e0e6c94ebe Mon Sep 17 00:00:00 2001
From: Mihitoko <max-lotz@outlook.de>
Date: Mon, 19 May 2025 23:56:57 +0200
Subject: [PATCH 068/111] Mention X-Robots-Tag header as alternative for Bing

---
 docs/additional-steps/bing.md | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/docs/additional-steps/bing.md b/docs/additional-steps/bing.md
index 37c60c7..f9afb78 100644
--- a/docs/additional-steps/bing.md
+++ b/docs/additional-steps/bing.md
@@ -10,15 +10,19 @@ Fortunately, Bing supports a relatively simple opt-out method, requiring an addi
 
 ## How to opt-out of AI training
 
-You must add a metatag in the `<head>` of your webpage. This also needs to be added to every page on your website.
+You must add a metatag in the `<head>` of your webpage or set the [X-Robots-Tag](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/X-Robots-Tag) HTTP header in your response. This also needs to be added to every page or response on your website.
 
-The line you need to add is:
+If using the metatag, the line you need to add is:
 
 ```plaintext
 <meta name="robots" content="noarchive">
 ```
+Or include the HTTP response header:
+```plaintext
+X-Robots-Tag: noarchive
+```
 
-By adding this line, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models."
+By adding this line or header, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models."
 
 ## Will my site be negatively affected
 

From 8a8001cbece8a5607163bed6eb83d5bb35bb24e5 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Tue, 20 May 2025 13:55:25 -0700
Subject: [PATCH 069/111] chore(README): updates the opening line of our README
 to clarify the types of agents we block

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 232b3ed..307f005 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 <img src="/assets/images/noai-logo.png" width="100" />
 
-This is an open list of web crawlers associated with AI companies and the training of LLMs to block. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md).
+This list contains AI-related crawlers of all types, regardless of purpose. Users should consult [the table of bot metrics](./table-of-bot-metrics.md) to guide the implementation of their list. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md).
 
 A number of these crawlers have been sourced from [Dark Visitors](https://darkvisitors.com) and we appreciate the ongoing effort they put in to track these crawlers. 
 

From 8b151b2cdc6ef8e949fd59d26d7456bcbb60d4e7 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Wed, 21 May 2025 06:52:36 -0700
Subject: [PATCH 070/111] Update README.md

Co-authored-by: Glyn Normington <glyn.normington@gmail.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 307f005..f427af4 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 <img src="/assets/images/noai-logo.png" width="100" />
 
-This list contains AI-related crawlers of all types, regardless of purpose. Users should consult [the table of bot metrics](./table-of-bot-metrics.md) to guide the implementation of their list. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md).
+This list contains AI-related crawlers of all types, regardless of purpose. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md).
 
 A number of these crawlers have been sourced from [Dark Visitors](https://darkvisitors.com) and we appreciate the ongoing effort they put in to track these crawlers. 
 

From 1c2acd75b7def13d9dc85233bfb4aaca8bcafd12 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Wed, 21 May 2025 15:27:26 +0000
Subject: [PATCH 071/111] Merge pull request #126 from
 ai-robots-txt/mistral-bot

chore(robots.json): adds MistralAI-User/1.0 crawler
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 1 +
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 1 +
 table-of-bot-metrics.md   | 1 +
 6 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index b2204d7..cc483c7 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 36fd20c..205acbd 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index 7389f10..de5b4fb 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -38,6 +38,7 @@ meta-externalagent
 Meta-ExternalAgent
 meta-externalfetcher
 Meta-ExternalFetcher
+MistralAI-User/1.0
 NovaAct
 OAI-SearchBot
 omgili
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index f05f785..3274559 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index a5be10b..b3e16f8 100644
--- a/robots.txt
+++ b/robots.txt
@@ -38,6 +38,7 @@ User-agent: meta-externalagent
 User-agent: Meta-ExternalAgent
 User-agent: meta-externalfetcher
 User-agent: Meta-ExternalFetcher
+User-agent: MistralAI-User/1.0
 User-agent: NovaAct
 User-agent: OAI-SearchBot
 User-agent: omgili
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index d8542b3..84c69f5 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -40,6 +40,7 @@
 | Meta\-ExternalAgent | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent |
 | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
 | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
+| MistralAI\-User/1\.0 | Mistral AI | Yes | Takes action based on user prompts. | Only when prompted by a user. | MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response. |
 | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact |
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |

From b1d9a60a38c04cac81dd156ad73ddef7eb60b50b Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Wed, 21 May 2025 11:40:33 -0700
Subject: [PATCH 072/111] chore(robots.json): adds wpbot

---
 robots.json | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index bddefdd..ed7d63d 100644
--- a/robots.json
+++ b/robots.json
@@ -412,6 +412,13 @@
         "frequency": "Unclear at this time.",
         "description": "Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended"
     },
+    "wpbot": {
+        "operator": "[QuantumCloud](https://www.quantumcloud.com)",
+        "respect": "Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9)",
+        "function": "Live chat support and lead generation.",
+        "frequency": "Unclear at this time.",
+        "description": "wpbot is used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support."
+    },
     "YouBot": {
         "operator": "[You](https://about.you.com/youchat/)",
         "respect": "[Yes](https://about.you.com/youbot/)",
@@ -419,4 +426,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 7c5389f4a0c5f60745e1a7552e142bd33a587d8f Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Wed, 21 May 2025 19:00:23 +0000
Subject: [PATCH 073/111] Merge pull request #98 from kylebuckingham/main

Updating Claude Bots
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 3 ++-
 nginx-block-ai-bots.conf  | 2 +-
 table-of-bot-metrics.md   | 3 ++-
 5 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.htaccess b/.htaccess
index cc483c7..2146722 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 205acbd..879426d 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index de5b4fb..8a9ccf9 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -9,8 +9,9 @@ Brightbot 1.0
 Bytespider
 CCBot
 ChatGPT-User
-Claude-Web
 ClaudeBot
+Claude-User
+Claude-SearchBot
 cohere-ai
 cohere-training-data-crawler
 Cotoyogi
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 3274559..5f96718 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 84c69f5..e6f35a4 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -11,8 +11,9 @@
 | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. |
 | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). |
 | ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. |
-| Claude\-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
+| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. |
+| Claude\-SearchBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | No information provided. | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. |
 | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. |
 | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler |
 | Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. |

From fedb658cc08225d71a6b0f32c9c2859b7420f0ee Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Wed, 21 May 2025 21:06:05 +0000
Subject: [PATCH 074/111] Merge pull request #133 from ai-robots-txt/wpbot

chore(robots.json): adds wpbot
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 1 +
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 1 +
 table-of-bot-metrics.md   | 1 +
 6 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index 2146722..3337284 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 879426d..2001edc 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index 8a9ccf9..377710b 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -58,4 +58,5 @@ TikTokSpider
 Timpibot
 VelenPublicWebCrawler
 Webzio-Extended
+wpbot
 YouBot
\ No newline at end of file
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 5f96718..ba1f8c6 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 8690e50..92e527b 100644
--- a/robots.txt
+++ b/robots.txt
@@ -58,5 +58,6 @@ User-agent: TikTokSpider
 User-agent: Timpibot
 User-agent: VelenPublicWebCrawler
 User-agent: Webzio-Extended
+User-agent: wpbot
 User-agent: YouBot
 Disallow: /
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index e6f35a4..c795559 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -60,4 +60,5 @@
 | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. |
 | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." |
 | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended |
+| wpbot | [QuantumCloud](https://www.quantumcloud.com) | Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9) | Live chat support and lead generation. | Unclear at this time. | wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support. |
 | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. |

From 7bf7f9164d55a58e0d6080dff52eea3ed5d3584e Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Thu, 22 May 2025 00:58:45 +0000
Subject: [PATCH 075/111] Update from Dark Visitors

---
 robots.json | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/robots.json b/robots.json
index 9dce781..06187ae 100644
--- a/robots.json
+++ b/robots.json
@@ -76,12 +76,12 @@
         "frequency": "Only when prompted by a user.",
         "description": "Used by plugins in ChatGPT to answer queries based on user input."
     },
-    "ClaudeBot": {
+    "Claude-SearchBot": {
         "operator": "[Anthropic](https://www.anthropic.com)",
         "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)",
-        "function": "Scrapes data to train Anthropic's AI products.",
+        "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.",
         "frequency": "No information provided.",
-        "description": "Scrapes data to train LLMs and AI products offered by Anthropic."
+        "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses."
     },
     "Claude-User": {
         "operator": "[Anthropic](https://www.anthropic.com)",
@@ -90,12 +90,19 @@
         "frequency": "No information provided.",
         "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent."
     },
-    "Claude-SearchBot": {
+    "Claude-Web": {
+        "operator": "Anthropic",
+        "respect": "Unclear at this time.",
+        "function": "Undocumented AI Agents",
+        "frequency": "Unclear at this time.",
+        "description": "Claude-Web is an AI-related agent operated by Anthropic. It's currently unclear exactly what it's used for, since there's no official documentation. If you can provide more detail, please contact us. More info can be found at https://darkvisitors.com/agents/agents/claude-web"
+    },
+    "ClaudeBot": {
         "operator": "[Anthropic](https://www.anthropic.com)",
         "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)",
-        "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.",
+        "function": "Scrapes data to train Anthropic's AI products.",
         "frequency": "No information provided.",
-        "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses."
+        "description": "Scrapes data to train LLMs and AI products offered by Anthropic."
     },
     "cohere-ai": {
         "operator": "[Cohere](https://cohere.com)",
@@ -287,11 +294,11 @@
         "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher"
     },
     "MistralAI-User/1.0": {
-      "operator": "Mistral AI",
-      "function": "Takes action based on user prompts.",
-      "frequency": "Only when prompted by a user.",
-      "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.",
-      "respect": "Yes"
+        "operator": "Mistral AI",
+        "function": "Takes action based on user prompts.",
+        "frequency": "Only when prompted by a user.",
+        "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.",
+        "respect": "Yes"
     },
     "NovaAct": {
         "operator": "Unclear at this time.",
@@ -433,4 +440,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From 093ab81d789528bb5d89c2d2c708b8f157e3b795 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Fri, 23 May 2025 00:58:57 +0000
Subject: [PATCH 076/111] Update from Dark Visitors

---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 5 +++--
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 5 +++--
 table-of-bot-metrics.md   | 5 +++--
 6 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/.htaccess b/.htaccess
index 3337284..26c6e72 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 2001edc..7a1076c 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index 377710b..8ef373b 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -9,9 +9,10 @@ Brightbot 1.0
 Bytespider
 CCBot
 ChatGPT-User
-ClaudeBot
-Claude-User
 Claude-SearchBot
+Claude-User
+Claude-Web
+ClaudeBot
 cohere-ai
 cohere-training-data-crawler
 Cotoyogi
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index ba1f8c6..a691c55 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 92e527b..3330b20 100644
--- a/robots.txt
+++ b/robots.txt
@@ -9,9 +9,10 @@ User-agent: Brightbot 1.0
 User-agent: Bytespider
 User-agent: CCBot
 User-agent: ChatGPT-User
-User-agent: ClaudeBot
-User-agent: Claude-User
 User-agent: Claude-SearchBot
+User-agent: Claude-User
+User-agent: Claude-Web
+User-agent: ClaudeBot
 User-agent: cohere-ai
 User-agent: cohere-training-data-crawler
 User-agent: Cotoyogi
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index c795559..0e6a88e 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -11,9 +11,10 @@
 | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. |
 | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). |
 | ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. |
-| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
-| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. |
 | Claude\-SearchBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | No information provided. | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. |
+| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. |
+| Claude\-Web | Anthropic | Unclear at this time. | Undocumented AI Agents | Unclear at this time. | Claude-Web is an AI-related agent operated by Anthropic. It's currently unclear exactly what it's used for, since there's no official documentation. If you can provide more detail, please contact us. More info can be found at https://darkvisitors.com/agents/agents/claude-web |
+| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. |
 | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler |
 | Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. |

From 3e8edd083e32e01bd5cf0629f108815092d5f7ec Mon Sep 17 00:00:00 2001
From: imp <80153024+not-not-the-imp@users.noreply.github.com>
Date: Fri, 23 May 2025 13:03:49 +0100
Subject: [PATCH 077/111] Add AndiBot and PhindBot

Fixes #75
---
 robots.json | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 06187ae..8a8432b 100644
--- a/robots.json
+++ b/robots.json
@@ -20,6 +20,13 @@
         "frequency": "No information provided.",
         "description": "Scrapes data for AI systems."
     },
+    "Andibot": {
+        "operator": "[Andi](https://andisearch.com/)",
+        "respect": "Unclear at this time",
+        "function": "search engine using generative AI, AI Search Assistant",
+        "frequency": "No information provided.",
+        "description": "Scrapes website and provide genAI summary ."
+    },
     "Amazonbot": {
         "operator": "Amazon",
         "respect": "Yes",
@@ -363,6 +370,13 @@
         "operator": "[Huawei](https://huawei.com/)",
         "respect": "Yes"
     },
+    "PhindBot": {
+        "description": "Company offers AI agent that use genAI and generate extra web query on the fly",
+        "frequency": "No explicit frequency provided.",
+        "function": "AI-enhanced search engine.",
+        "operator": "[phind](https://www.phind.com/)",
+        "respect": "Unclear at this time."
+    },
     "QualifiedBot": {
         "description": "Operated by Qualified as part of their suite of AI product offerings.",
         "frequency": "No explicit frequency provided.",
@@ -440,4 +454,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From d22b9ec51ac14b0d9dfdd86a04cd566731c0e8c4 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Sat, 31 May 2025 16:00:13 -0700
Subject: [PATCH 078/111] Update robots.json

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 8a8432b..9bf9fdd 100644
--- a/robots.json
+++ b/robots.json
@@ -23,7 +23,7 @@
     "Andibot": {
         "operator": "[Andi](https://andisearch.com/)",
         "respect": "Unclear at this time",
-        "function": "search engine using generative AI, AI Search Assistant",
+        "function": "Search engine using generative AI, AI Search Assistant",
         "frequency": "No information provided.",
         "description": "Scrapes website and provide genAI summary ."
     },

From 4259b25cccbb4d0bff740261f1eb825fa86bf381 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Sat, 31 May 2025 16:01:09 -0700
Subject: [PATCH 079/111] Update robots.json

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 9bf9fdd..50c2fae 100644
--- a/robots.json
+++ b/robots.json
@@ -25,7 +25,7 @@
         "respect": "Unclear at this time",
         "function": "Search engine using generative AI, AI Search Assistant",
         "frequency": "No information provided.",
-        "description": "Scrapes website and provide genAI summary ."
+        "description": "Scrapes website and provides AI summary."
     },
     "Amazonbot": {
         "operator": "Amazon",

From 268922f8f2fa05b4eb4cc991116fce2700c09184 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Sat, 31 May 2025 16:02:05 -0700
Subject: [PATCH 080/111] Update robots.json

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 50c2fae..6f3a05d 100644
--- a/robots.json
+++ b/robots.json
@@ -371,7 +371,7 @@
         "respect": "Yes"
     },
     "PhindBot": {
-        "description": "Company offers AI agent that use genAI and generate extra web query on the fly",
+        "description": "Company offers an AI agent that uses AI and generate extra web query on the fly",
         "frequency": "No explicit frequency provided.",
         "function": "AI-enhanced search engine.",
         "operator": "[phind](https://www.phind.com/)",

From 1dd66b696963dd3f9a577aa4ca59d83c8bc41aef Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Mon, 2 Jun 2025 11:53:06 -0700
Subject: [PATCH 081/111] Revert "chore(robots.json): adds imgproxy crawler"

This reverts commit b65f45e408461560a32f44f05860f80655737467.
---
 robots.json | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/robots.json b/robots.json
index 6f3a05d..f730f50 100644
--- a/robots.json
+++ b/robots.json
@@ -251,13 +251,6 @@
         "operator": "[img2dataset](https://github.com/rom1504/img2dataset)",
         "respect": "Unclear at this time."
     },
-    "imgproxy": {
-        "frequency": "No information.",
-        "function": "Not documented or explained on operator's site.",
-        "operator": "[imgproxy](https://imgproxy.net)",
-        "respect": "Unclear at this time.",
-        "description": "AI-powered image processing."
-    },
     "ISSCyberRiskCrawler": {
         "description": "Used to train machine learning based models to quantify cyber risk.",
         "frequency": "No information.",
@@ -454,4 +447,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From 899ce01c554359ecc66aa36bc2af367069175e10 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Mon, 2 Jun 2025 14:47:31 -0700
Subject: [PATCH 082/111] chore(ai_robots_update.yml): correct workflow by
 revising git flags + adding guard

---
 .github/workflows/ai_robots_update.yml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ai_robots_update.yml b/.github/workflows/ai_robots_update.yml
index 7e11ce8..17c1cc8 100644
--- a/.github/workflows/ai_robots_update.yml
+++ b/.github/workflows/ai_robots_update.yml
@@ -20,7 +20,12 @@ jobs:
           echo "... done."
           git --no-pager diff
           git add -A
-          git diff --quiet && git diff --staged --quiet || (git commit -m "Update from Dark Visitors" && git push)
+          if ! git diff --cached --quiet; then
+            git commit -m "Update from Dark Visitors"
+            git push
+          else
+            echo "No changes to commit."
+          fi
         shell: bash
   convert:
     name: convert

From 87016d15040f50630121666c40a6048df8a4169d Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Tue, 3 Jun 2025 01:00:29 +0000
Subject: [PATCH 083/111] Update from Dark Visitors

---
 robots.json | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/robots.json b/robots.json
index f730f50..d8e8262 100644
--- a/robots.json
+++ b/robots.json
@@ -20,13 +20,6 @@
         "frequency": "No information provided.",
         "description": "Scrapes data for AI systems."
     },
-    "Andibot": {
-        "operator": "[Andi](https://andisearch.com/)",
-        "respect": "Unclear at this time",
-        "function": "Search engine using generative AI, AI Search Assistant",
-        "frequency": "No information provided.",
-        "description": "Scrapes website and provides AI summary."
-    },
     "Amazonbot": {
         "operator": "Amazon",
         "respect": "Yes",
@@ -34,6 +27,13 @@
         "frequency": "No information provided.",
         "description": "Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses."
     },
+    "Andibot": {
+        "operator": "[Andi](https://andisearch.com/)",
+        "respect": "Unclear at this time",
+        "function": "Search engine using generative AI, AI Search Assistant",
+        "frequency": "No information provided.",
+        "description": "Scrapes website and provides AI summary."
+    },
     "anthropic-ai": {
         "operator": "[Anthropic](https://www.anthropic.com)",
         "respect": "Unclear at this time.",

From d239e7e5ad0e9c72d17961c57deaf10c1feef899 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Tue, 3 Jun 2025 01:52:35 +0000
Subject: [PATCH 084/111] Merge pull request #139 from
 ai-robots-txt/workflow-fix

chore(ai_robots_update.yml): correct workflow by revising git flags + adding guard
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 3 ++-
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 3 ++-
 table-of-bot-metrics.md   | 3 ++-
 6 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/.htaccess b/.htaccess
index 26c6e72..ddb7255 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 7a1076c..60ed4d3 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index 8ef373b..6c23da0 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -2,6 +2,7 @@ AI2Bot
 Ai2Bot-Dolma
 aiHitBot
 Amazonbot
+Andibot
 anthropic-ai
 Applebot
 Applebot-Extended
@@ -33,7 +34,6 @@ iaskspider/2.0
 ICC-Crawler
 ImagesiftBot
 img2dataset
-imgproxy
 ISSCyberRiskCrawler
 Kangaroo Bot
 meta-externalagent
@@ -50,6 +50,7 @@ PanguBot
 Perplexity-User
 PerplexityBot
 PetalBot
+PhindBot
 QualifiedBot
 Scrapy
 SemrushBot-OCOB
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index a691c55..1c50815 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 3330b20..4b362e4 100644
--- a/robots.txt
+++ b/robots.txt
@@ -2,6 +2,7 @@ User-agent: AI2Bot
 User-agent: Ai2Bot-Dolma
 User-agent: aiHitBot
 User-agent: Amazonbot
+User-agent: Andibot
 User-agent: anthropic-ai
 User-agent: Applebot
 User-agent: Applebot-Extended
@@ -33,7 +34,6 @@ User-agent: iaskspider/2.0
 User-agent: ICC-Crawler
 User-agent: ImagesiftBot
 User-agent: img2dataset
-User-agent: imgproxy
 User-agent: ISSCyberRiskCrawler
 User-agent: Kangaroo Bot
 User-agent: meta-externalagent
@@ -50,6 +50,7 @@ User-agent: PanguBot
 User-agent: Perplexity-User
 User-agent: PerplexityBot
 User-agent: PetalBot
+User-agent: PhindBot
 User-agent: QualifiedBot
 User-agent: Scrapy
 User-agent: SemrushBot-OCOB
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 0e6a88e..737b217 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -4,6 +4,7 @@
 | Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. |
 | aiHitBot | [aiHit](https://www.aihitdata.com/about) | Yes | A massive, artificial intelligence/machine learning, automated system. | No information provided. | Scrapes data for AI systems. |
 | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. |
+| Andibot | [Andi](https://andisearch.com/) | Unclear at this time | Search engine using generative AI, AI Search Assistant | No information provided. | Scrapes website and provides AI summary. |
 | anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot |
 | Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. |
@@ -35,7 +36,6 @@
 | ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. |
 | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. |
 | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. |
-| imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. |
 | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. |
 | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot |
 | meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." |
@@ -52,6 +52,7 @@
 | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
+| PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly |
 | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |

From 3187fd8a3219620a4b0f3b7b7950f40e5aac4ad1 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Tue, 3 Jun 2025 12:24:16 -0700
Subject: [PATCH 085/111] chore(robots.json): adds YandexAdditional crawlers

---
 robots.json | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index d8e8262..45340fd 100644
--- a/robots.json
+++ b/robots.json
@@ -440,6 +440,20 @@
         "frequency": "Unclear at this time.",
         "description": "wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support."
     },
+    "YandexAdditional": {
+        "operator": "[Yandex](https://yandex.ru)",
+        "respect": "[Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en)",
+        "function": "Scrapes/analyzes data for the YandexGPT LLM.",
+        "frequency": "No information.",
+        "description": "Retrieves data used for YandexGPT quick answers features."
+    },
+    "YandexAdditionalBot": {
+        "operator": "[Yandex](https://yandex.ru)",
+        "respect": "[Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en)",
+        "function": "Scrapes/analyzes data for the YandexGPT LLM.",
+        "frequency": "No information.",
+        "description": "Retrieves data used for YandexGPT quick answers features."
+    },
     "YouBot": {
         "operator": "[You](https://about.you.com/youchat/)",
         "respect": "[Yes](https://about.you.com/youbot/)",
@@ -447,4 +461,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 080946c360465c5e0b0af8dd475bb0248bca77a1 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Tue, 3 Jun 2025 19:51:25 +0000
Subject: [PATCH 086/111] Merge pull request #140 from
 ai-robots-txt/yandex-bots

chore(robots.json): adds YandexAdditional crawlers
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 2 ++
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 2 ++
 table-of-bot-metrics.md   | 2 ++
 6 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index ddb7255..c381fce 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 60ed4d3..3527a7a 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index 6c23da0..a8ba9aa 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -61,4 +61,6 @@ Timpibot
 VelenPublicWebCrawler
 Webzio-Extended
 wpbot
+YandexAdditional
+YandexAdditionalBot
 YouBot
\ No newline at end of file
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 1c50815..5f7a0db 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 4b362e4..d26ccb4 100644
--- a/robots.txt
+++ b/robots.txt
@@ -61,5 +61,7 @@ User-agent: Timpibot
 User-agent: VelenPublicWebCrawler
 User-agent: Webzio-Extended
 User-agent: wpbot
+User-agent: YandexAdditional
+User-agent: YandexAdditionalBot
 User-agent: YouBot
 Disallow: /
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 737b217..3275a20 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -63,4 +63,6 @@
 | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." |
 | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended |
 | wpbot | [QuantumCloud](https://www.quantumcloud.com) | Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9) | Live chat support and lead generation. | Unclear at this time. | wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support. |
+| YandexAdditional | [Yandex](https://yandex.ru) | [Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en) | Scrapes/analyzes data for the YandexGPT LLM. | No information. | Retrieves data used for YandexGPT quick answers features. |
+| YandexAdditionalBot | [Yandex](https://yandex.ru) | [Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en) | Scrapes/analyzes data for the YandexGPT LLM. | No information. | Retrieves data used for YandexGPT quick answers features. |
 | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. |

From 8f75f4a2f5f8f8381a73e6a6faab241e0ad58a14 Mon Sep 17 00:00:00 2001
From: Ivan Chupin <ivan.chupin.1973@gmail.com>
Date: Wed, 4 Jun 2025 03:48:42 +0500
Subject: [PATCH 087/111] Add SBIntuitionsBot

---
 robots.json | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/robots.json b/robots.json
index 45340fd..cd7ec4a 100644
--- a/robots.json
+++ b/robots.json
@@ -377,6 +377,13 @@
         "operator": "[Qualified](https://www.qualified.com)",
         "respect": "Unclear at this time."
     },
+    "SBIntuitionsBot": {
+        "description": "AI development and information analysis",
+        "respect": "[Yes](https://www.sbintuitions.co.jp/en/bot/)",
+        "frequency": "No information.",
+        "function": "Uses data gathered in AI development and information analysis.",
+        "operator": "[SB Intuitions](https://www.sbintuitions.co.jp/en/)"
+    },
     "Scrapy": {
         "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"",
         "frequency": "No information.",

From 3efabc603dcae235cc04d1e2f2e9113c70e6197c Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Tue, 3 Jun 2025 23:28:48 +0000
Subject: [PATCH 088/111] Merge pull request #141 from Ivan-Chupin/patch-1

Add SBIntuitionsBot
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 1 +
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 1 +
 table-of-bot-metrics.md   | 1 +
 6 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index c381fce..48971b1 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 3527a7a..117e653 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index a8ba9aa..c2ebb47 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -52,6 +52,7 @@ PerplexityBot
 PetalBot
 PhindBot
 QualifiedBot
+SBIntuitionsBot
 Scrapy
 SemrushBot-OCOB
 SemrushBot-SWA
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 5f7a0db..edcf8a7 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index d26ccb4..1c5e989 100644
--- a/robots.txt
+++ b/robots.txt
@@ -52,6 +52,7 @@ User-agent: PerplexityBot
 User-agent: PetalBot
 User-agent: PhindBot
 User-agent: QualifiedBot
+User-agent: SBIntuitionsBot
 User-agent: Scrapy
 User-agent: SemrushBot-OCOB
 User-agent: SemrushBot-SWA
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 3275a20..c9a3910 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -54,6 +54,7 @@
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly |
 | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. |
+| SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
 | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |

From 2b5a59a303a81847234ac11ec2f180ee9795db90 Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Wed, 4 Jun 2025 01:00:07 +0000
Subject: [PATCH 089/111] Update from Dark Visitors

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index cd7ec4a..739f579 100644
--- a/robots.json
+++ b/robots.json
@@ -468,4 +468,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From 03831a7eb55ae3682e45a651588df13acc4a4a04 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Wed, 4 Jun 2025 10:46:58 -0700
Subject: [PATCH 090/111] chore(robots.json): adds Quillbot

---
 robots.json | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 739f579..bea7350 100644
--- a/robots.json
+++ b/robots.json
@@ -377,6 +377,20 @@
         "operator": "[Qualified](https://www.qualified.com)",
         "respect": "Unclear at this time."
     },
+    "QuillBot": {
+        "description": "Operated by QuillBot as part of their suite of AI product offerings.",
+        "frequency": "No explicit frequency provided.",
+        "function": "Company offers AI detection, writing tools and other services.",
+        "operator": "[Quillbot](https://quillbot.com)",
+        "respect": "Unclear at this time."
+    },
+    "quillbot.com": {
+        "description": "Operated by QuillBot as part of their suite of AI product offerings.",
+        "frequency": "No explicit frequency provided.",
+        "function": "Company offers AI detection, writing tools and other services.",
+        "operator": "[Quillbot](https://quillbot.com)",
+        "respect": "Unclear at this time."
+    },
     "SBIntuitionsBot": {
         "description": "AI development and information analysis",
         "respect": "[Yes](https://www.sbintuitions.co.jp/en/bot/)",
@@ -468,4 +482,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 4568d69b0edad5708fe49fafd2b5c0572cd2dfa4 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Wed, 4 Jun 2025 10:54:14 -0700
Subject: [PATCH 091/111] chore(robots.json): adds Panscient

---
 robots.json | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 739f579..8d1163a 100644
--- a/robots.json
+++ b/robots.json
@@ -342,6 +342,20 @@
         "frequency": "Unclear at this time.",
         "description": "PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot"
     },
+    "Panscient": {
+        "operator": "[Panscient](https://panscient.com)",
+        "respect": "[Yes](https://panscient.com/faq.htm)",
+        "function": "Data collection and analysis using machine learning and AI.",
+        "frequency": "The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address.",
+        "description": "Compiles data on businesses and business professionals that is structured using AI and machine learning."
+    },
+    "panscient.com": {
+        "operator": "[Panscient](https://panscient.com)",
+        "respect": "[Yes](https://panscient.com/faq.htm)",
+        "function": "Data collection and analysis using machine learning and AI.",
+        "frequency": "The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address.",
+        "description": "Compiles data on businesses and business professionals that is structured using AI and machine learning."
+    },
     "Perplexity-User": {
         "operator": "[Perplexity](https://www.perplexity.ai/)",
         "respect": "[No](https://docs.perplexity.ai/guides/bots)",
@@ -468,4 +482,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 9c28c63a0c4889d694f7ebdd278c17564f4b72a3 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Wed, 4 Jun 2025 17:54:57 +0000
Subject: [PATCH 092/111] Merge pull request #142 from ai-robots-txt/quillbot

chore(robots.json): adds Quillbot
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 2 ++
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 2 ++
 table-of-bot-metrics.md   | 2 ++
 6 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index 48971b1..ee898fd 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 117e653..de3bf3b 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index c2ebb47..be27797 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -52,6 +52,8 @@ PerplexityBot
 PetalBot
 PhindBot
 QualifiedBot
+QuillBot
+quillbot.com
 SBIntuitionsBot
 Scrapy
 SemrushBot-OCOB
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index edcf8a7..d7decae 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 1c5e989..4ed2deb 100644
--- a/robots.txt
+++ b/robots.txt
@@ -52,6 +52,8 @@ User-agent: PerplexityBot
 User-agent: PetalBot
 User-agent: PhindBot
 User-agent: QualifiedBot
+User-agent: QuillBot
+User-agent: quillbot.com
 User-agent: SBIntuitionsBot
 User-agent: Scrapy
 User-agent: SemrushBot-OCOB
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index c9a3910..e0fcd26 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -54,6 +54,8 @@
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly |
 | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. |
+| QuillBot | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. |
+| quillbot\.com | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. |
 | SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |

From 75ea75a95b006e68f89f8826bf84caff569a1eb8 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Wed, 4 Jun 2025 18:04:06 +0000
Subject: [PATCH 093/111] Merge pull request #143 from ai-robots-txt/panscient

chore(robots.json): adds Panscient
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 2 ++
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 2 ++
 table-of-bot-metrics.md   | 2 ++
 6 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index ee898fd..5fefc69 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index de3bf3b..5caa249 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index be27797..fe153c8 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -47,6 +47,8 @@ omgili
 omgilibot
 Operator
 PanguBot
+Panscient
+panscient.com
 Perplexity-User
 PerplexityBot
 PetalBot
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index d7decae..e5d660b 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 4ed2deb..26a9d78 100644
--- a/robots.txt
+++ b/robots.txt
@@ -47,6 +47,8 @@ User-agent: omgili
 User-agent: omgilibot
 User-agent: Operator
 User-agent: PanguBot
+User-agent: Panscient
+User-agent: panscient.com
 User-agent: Perplexity-User
 User-agent: PerplexityBot
 User-agent: PetalBot
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index e0fcd26..69b9ec2 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -49,6 +49,8 @@
 | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. |
 | Operator | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator |
 | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot |
+| Panscient | [Panscient](https://panscient.com) | [Yes](https://panscient.com/faq.htm) | Data collection and analysis using machine learning and AI. | The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address. | Compiles data on businesses and business professionals that is structured using AI and machine learning. |
+| panscient\.com | [Panscient](https://panscient.com) | [Yes](https://panscient.com/faq.htm) | Data collection and analysis using machine learning and AI. | The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address. | Compiles data on businesses and business professionals that is structured using AI and machine learning. |
 | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. |
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |

From 77393df5aa2ac7e3f2bb3e54ff919470c63e5b09 Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Thu, 5 Jun 2025 00:59:28 +0000
Subject: [PATCH 094/111] Update from Dark Visitors

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 22a1370..6df5084 100644
--- a/robots.json
+++ b/robots.json
@@ -496,4 +496,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From 528d77bf072780a40c169cab399eb0b2139edb83 Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Thu, 5 Jun 2025 09:14:23 -0700
Subject: [PATCH 095/111] chore(robots.json): adds bedrockbot

---
 robots.json | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 739f579..a3f75bc 100644
--- a/robots.json
+++ b/robots.json
@@ -55,6 +55,13 @@
         "frequency": "Unclear at this time.",
         "description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools."
     },
+    "bedrockbot": {
+        "operator": "[Amazon](https://amazon.com)",
+        "respect": "[Yes](https://docs.aws.amazon.com/bedrock/latest/userguide/webcrawl-data-source-connector.html#configuration-webcrawl-connector)",
+        "function": "Data scraping for custom AI applications.",
+        "frequency": "Unclear at this time.",
+        "description": "Connects to and crawls URLs that have been selected for use in a user's AWS bedrock application."
+    },
     "Brightbot 1.0": {
         "operator": "Browsing.ai",
         "respect": "Unclear at this time.",
@@ -468,4 +475,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From ac7ed17e71a59a67d54279d010477abecfb15caf Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Thu, 5 Jun 2025 16:51:17 +0000
Subject: [PATCH 096/111] Merge pull request #145 from
 ai-robots-txt/aws-bedrockbot

chore(robots.json): adds bedrockbot
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 1 +
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 1 +
 table-of-bot-metrics.md   | 1 +
 6 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index 5fefc69..dbbde1e 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 5caa249..08b2fd3 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index fe153c8..cdaeecd 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -6,6 +6,7 @@ Andibot
 anthropic-ai
 Applebot
 Applebot-Extended
+bedrockbot
 Brightbot 1.0
 Bytespider
 CCBot
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index e5d660b..542ac65 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 26a9d78..a8dc655 100644
--- a/robots.txt
+++ b/robots.txt
@@ -6,6 +6,7 @@ User-agent: Andibot
 User-agent: anthropic-ai
 User-agent: Applebot
 User-agent: Applebot-Extended
+User-agent: bedrockbot
 User-agent: Brightbot 1.0
 User-agent: Bytespider
 User-agent: CCBot
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 69b9ec2..ee324f7 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -8,6 +8,7 @@
 | anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot |
 | Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. |
+| bedrockbot | [Amazon](https://amazon.com) | [Yes](https://docs.aws.amazon.com/bedrock/latest/userguide/webcrawl-data-source-connector.html#configuration-webcrawl-connector) | Data scraping for custom AI applications. | Unclear at this time. | Connects to and crawls URLs that have been selected for use in a user's AWS bedrock application. |
 | Brightbot 1\.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. |
 | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. |
 | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). |

From e21f6ae1b6eeb0782012a4a31e6af61b6a21cb09 Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Fri, 6 Jun 2025 00:59:25 +0000
Subject: [PATCH 097/111] Update from Dark Visitors

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 8f5dadd..3b5c434 100644
--- a/robots.json
+++ b/robots.json
@@ -503,4 +503,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From 7867c3e26c267301cc528edf89fccf5203bfb99b Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Mon, 9 Jun 2025 08:44:25 -0700
Subject: [PATCH 098/111] chore(robots.json): adds EchoboxBot (#148)

---
 robots.json | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 3b5c434..911dadc 100644
--- a/robots.json
+++ b/robots.json
@@ -160,6 +160,13 @@
         "frequency": "Unclear at this time.",
         "description": "DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot"
     },
+    "EchoboxBot": {
+        "operator": "[Echobox](https://echobox.com)",
+        "respect": "Unclear at this time.",
+        "function": "Data collection to support AI-powered products.",
+        "frequency": "Unclear at this time.",
+        "description": "Supports company's AI-powered social and email management products."
+    },
     "FacebookBot": {
         "operator": "Meta/Facebook",
         "respect": "[Yes](https://developers.facebook.com/docs/sharing/bot/)",
@@ -503,4 +510,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 3759a6bf146f0153bbb7ab880ff95380a4fc739e Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Mon, 9 Jun 2025 15:44:36 +0000
Subject: [PATCH 099/111] chore(robots.json): adds EchoboxBot (#148)

---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 1 +
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 1 +
 table-of-bot-metrics.md   | 1 +
 6 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index dbbde1e..74b4e5b 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 08b2fd3..9ca050b 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index cdaeecd..dc9f851 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -21,6 +21,7 @@ Cotoyogi
 Crawlspace
 Diffbot
 DuckAssistBot
+EchoboxBot
 FacebookBot
 Factset_spyderbot
 FirecrawlAgent
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 542ac65..4fd7a29 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index a8dc655..8964626 100644
--- a/robots.txt
+++ b/robots.txt
@@ -21,6 +21,7 @@ User-agent: Cotoyogi
 User-agent: Crawlspace
 User-agent: Diffbot
 User-agent: DuckAssistBot
+User-agent: EchoboxBot
 User-agent: FacebookBot
 User-agent: Factset_spyderbot
 User-agent: FirecrawlAgent
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index ee324f7..b596540 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -23,6 +23,7 @@
 | Crawlspace | [Crawlspace](https://crawlspace.dev) | [Yes](https://news.ycombinator.com/item?id=42756654) | Scrapes data | Unclear at this time. | Provides crawling services for any purpose, probably including AI model training. |
 | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. |
 | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot |
+| EchoboxBot | [Echobox](https://echobox.com) | Unclear at this time. | Data collection to support AI-powered products. | Unclear at this time. | Supports company's AI-powered social and email management products. |
 | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. |
 | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. |
 | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. |

From cf598b6b71e0e3c6b59a699c74218a62d43c5e16 Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Tue, 10 Jun 2025 01:00:37 +0000
Subject: [PATCH 100/111] Update from Dark Visitors

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 911dadc..19dffe8 100644
--- a/robots.json
+++ b/robots.json
@@ -510,4 +510,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From 14d68f05ba42b04e8c33fcca5bacf7d8dee86cae Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Wed, 11 Jun 2025 13:50:53 -0700
Subject: [PATCH 101/111] chore(robots.json): adds additional SemrushBot user
 agents

---
 robots.json | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/robots.json b/robots.json
index 19dffe8..0ad3731 100644
--- a/robots.json
+++ b/robots.json
@@ -433,6 +433,27 @@
         "operator": "[Zyte](https://www.zyte.com)",
         "respect": "Unclear at this time."
     },
+    "SemrushBot": {
+        "operator": "[Semrush](https://www.semrush.com/)",
+        "respect": "[Yes](https://www.semrush.com/bot/)",
+        "function": "Crawls your site for ContentShake AI tool.",
+        "frequency": "Roughly once every 10 seconds.",
+        "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
+    },
+    "SemrushBot-BA": {
+        "operator": "[Semrush](https://www.semrush.com/)",
+        "respect": "[Yes](https://www.semrush.com/bot/)",
+        "function": "Crawls your site for ContentShake AI tool.",
+        "frequency": "Roughly once every 10 seconds.",
+        "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
+    },
+    "SemrushBot-CT": {
+        "operator": "[Semrush](https://www.semrush.com/)",
+        "respect": "[Yes](https://www.semrush.com/bot/)",
+        "function": "Crawls your site for ContentShake AI tool.",
+        "frequency": "Roughly once every 10 seconds.",
+        "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
+    },
     "SemrushBot-OCOB": {
         "operator": "[Semrush](https://www.semrush.com/)",
         "respect": "[Yes](https://www.semrush.com/bot/)",
@@ -440,6 +461,13 @@
         "frequency": "Roughly once every 10 seconds.",
         "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
     },
+    "SemrushBot-SI": {
+        "operator": "[Semrush](https://www.semrush.com/)",
+        "respect": "[Yes](https://www.semrush.com/bot/)",
+        "function": "Crawls your site for ContentShake AI tool.",
+        "frequency": "Roughly once every 10 seconds.",
+        "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
+    },
     "SemrushBot-SWA": {
         "operator": "[Semrush](https://www.semrush.com/)",
         "respect": "[Yes](https://www.semrush.com/bot/)",

From 842e2256e896e6d2904fa501df59f2ca17ea6bc0 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Thu, 12 Jun 2025 07:12:00 +0000
Subject: [PATCH 102/111] Merge pull request #150 from
 ai-robots-txt/semrush-bots

chore(robots.json): adds additional SemrushBot user agents
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 4 ++++
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 4 ++++
 table-of-bot-metrics.md   | 4 ++++
 6 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index 74b4e5b..a6c8fea 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 9ca050b..7ed7e03 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index dc9f851..fe6c362 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -60,7 +60,11 @@ QuillBot
 quillbot.com
 SBIntuitionsBot
 Scrapy
+SemrushBot
+SemrushBot-BA
+SemrushBot-CT
 SemrushBot-OCOB
+SemrushBot-SI
 SemrushBot-SWA
 Sidetrade indexer bot
 TikTokSpider
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 4fd7a29..13aac72 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 8964626..b130083 100644
--- a/robots.txt
+++ b/robots.txt
@@ -60,7 +60,11 @@ User-agent: QuillBot
 User-agent: quillbot.com
 User-agent: SBIntuitionsBot
 User-agent: Scrapy
+User-agent: SemrushBot
+User-agent: SemrushBot-BA
+User-agent: SemrushBot-CT
 User-agent: SemrushBot-OCOB
+User-agent: SemrushBot-SI
 User-agent: SemrushBot-SWA
 User-agent: Sidetrade indexer bot
 User-agent: TikTokSpider
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index b596540..a834382 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -62,7 +62,11 @@
 | quillbot\.com | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. |
 | SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis |
 | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
+| SemrushBot | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
+| SemrushBot\-BA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
+| SemrushBot\-CT | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
 | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
+| SemrushBot\-SI | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
 | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
 | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. |
 | TikTokSpider | ByteDance | Unclear at this time. | LLM training. | Unclear at this time. | Downloads data to train LLMS, as per Bytespider. |

From d760f9216f8d6295b43e00862daec913f82610ad Mon Sep 17 00:00:00 2001
From: Cory Dransfeldt <hi@coryd.dev>
Date: Thu, 12 Jun 2025 13:08:29 -0700
Subject: [PATCH 103/111] chore(robots.json): adds MyCentralAIScraperBot

---
 robots.json | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 19dffe8..94e4f79 100644
--- a/robots.json
+++ b/robots.json
@@ -314,6 +314,13 @@
         "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.",
         "respect": "Yes"
     },
+    "MyCentralAIScraperBot": {
+      "operator": "Unclear at this time.",
+      "respect": "Unclear at this time.",
+      "function": "AI data scraper",
+      "frequency": "Unclear at this time.",
+      "description": "Operator and data use is uncleaar at this time."
+    },
     "NovaAct": {
         "operator": "Unclear at this time.",
         "respect": "Unclear at this time.",
@@ -510,4 +517,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
\ No newline at end of file
+}

From 8f17718e762831498b09438835b2e1cb74f7e19d Mon Sep 17 00:00:00 2001
From: Glyn Normington <glyn.normington@gmail.com>
Date: Fri, 13 Jun 2025 10:28:12 +0100
Subject: [PATCH 104/111] Fix typo

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 94e4f79..befdb35 100644
--- a/robots.json
+++ b/robots.json
@@ -319,7 +319,7 @@
       "respect": "Unclear at this time.",
       "function": "AI data scraper",
       "frequency": "Unclear at this time.",
-      "description": "Operator and data use is uncleaar at this time."
+      "description": "Operator and data use is unclear at this time."
     },
     "NovaAct": {
         "operator": "Unclear at this time.",

From e53d81c66d353016e27e75215b22dc8557e1a82c Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Fri, 13 Jun 2025 09:28:41 +0000
Subject: [PATCH 105/111] Merge pull request #152 from
 ai-robots-txt/MyCentralAIScraperBot

chore(robots.json): adds MyCentralAIScraperBot
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 1 +
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 1 +
 table-of-bot-metrics.md   | 1 +
 6 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index a6c8fea..a2d6a6e 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 7ed7e03..e99d69c 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index fe6c362..7d1d3a0 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -43,6 +43,7 @@ Meta-ExternalAgent
 meta-externalfetcher
 Meta-ExternalFetcher
 MistralAI-User/1.0
+MyCentralAIScraperBot
 NovaAct
 OAI-SearchBot
 omgili
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index 13aac72..ef0259e 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index b130083..ab60705 100644
--- a/robots.txt
+++ b/robots.txt
@@ -43,6 +43,7 @@ User-agent: Meta-ExternalAgent
 User-agent: meta-externalfetcher
 User-agent: Meta-ExternalFetcher
 User-agent: MistralAI-User/1.0
+User-agent: MyCentralAIScraperBot
 User-agent: NovaAct
 User-agent: OAI-SearchBot
 User-agent: omgili
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index a834382..f8a2005 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -45,6 +45,7 @@
 | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
 | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher |
 | MistralAI\-User/1\.0 | Mistral AI | Yes | Takes action based on user prompts. | Only when prompted by a user. | MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response. |
+| MyCentralAIScraperBot | Unclear at this time. | Unclear at this time. | AI data scraper | Unclear at this time. | Operator and data use is unclear at this time. |
 | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact |
 | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. |
 | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. |

From b05f2fee000caf6c784135335b14c2254572754e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9rgio=20Spagnuolo?= <info@voltdata.info>
Date: Fri, 13 Jun 2025 17:15:13 -0300
Subject: [PATCH 106/111] Update robots.json with new crawler

Update with Poseidon Research Crawler as found in nytimes.com/robots.txt
---
 robots.json | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/robots.json b/robots.json
index 52e196a..9bd1f85 100644
--- a/robots.json
+++ b/robots.json
@@ -405,6 +405,13 @@
         "operator": "[phind](https://www.phind.com/)",
         "respect": "Unclear at this time."
     },
+    "Poseidon Research Crawler": {
+        "operator": "[Poseidon Research](https://www.poseidonresearch.com)",
+        "description": "Lab focused on scaling the interpretability research necessary to make better AI systems possible.",
+        "frequency": "No explicit frequency provided.",
+        "function": "AI research crawler",
+        "respect": "Unclear at this time."
+    },
     "QualifiedBot": {
         "description": "Operated by Qualified as part of their suite of AI product offerings.",
         "frequency": "No explicit frequency provided.",

From 2b68568ac26db7a37d859fec12b0bb35e61cfedc Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Sat, 14 Jun 2025 00:58:11 +0000
Subject: [PATCH 107/111] Update from Dark Visitors

---
 robots.json | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/robots.json b/robots.json
index 52e196a..7c10cab 100644
--- a/robots.json
+++ b/robots.json
@@ -315,11 +315,11 @@
         "respect": "Yes"
     },
     "MyCentralAIScraperBot": {
-      "operator": "Unclear at this time.",
-      "respect": "Unclear at this time.",
-      "function": "AI data scraper",
-      "frequency": "Unclear at this time.",
-      "description": "Operator and data use is unclear at this time."
+        "operator": "Unclear at this time.",
+        "respect": "Unclear at this time.",
+        "function": "AI data scraper",
+        "frequency": "Unclear at this time.",
+        "description": "Operator and data use is unclear at this time."
     },
     "NovaAct": {
         "operator": "Unclear at this time.",
@@ -545,4 +545,4 @@
         "frequency": "No information.",
         "description": "Retrieves data used for You.com web search engine and LLMs."
     }
-}
+}
\ No newline at end of file

From eb05f2f5276ff963f786cf8fd57d8ea909e82fd9 Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Sat, 14 Jun 2025 14:04:03 +0000
Subject: [PATCH 108/111] Merge pull request #153 from sergiospagnuolo/Poseidon

Update robots.json with new crawler
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 1 +
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 1 +
 table-of-bot-metrics.md   | 1 +
 6 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index a2d6a6e..27637c2 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index e99d69c..528ba08 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index 7d1d3a0..c7c3054 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -56,6 +56,7 @@ Perplexity-User
 PerplexityBot
 PetalBot
 PhindBot
+Poseidon Research Crawler
 QualifiedBot
 QuillBot
 quillbot.com
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index ef0259e..c1afb05 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index ab60705..0edf11f 100644
--- a/robots.txt
+++ b/robots.txt
@@ -56,6 +56,7 @@ User-agent: Perplexity-User
 User-agent: PerplexityBot
 User-agent: PetalBot
 User-agent: PhindBot
+User-agent: Poseidon Research Crawler
 User-agent: QualifiedBot
 User-agent: QuillBot
 User-agent: quillbot.com
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index f8a2005..78a3408 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -58,6 +58,7 @@
 | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. |
 | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. |
 | PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly |
+| Poseidon Research Crawler | [Poseidon Research](https://www.poseidonresearch.com) | Unclear at this time. | AI research crawler | No explicit frequency provided. | Lab focused on scaling the interpretability research necessary to make better AI systems possible. |
 | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. |
 | QuillBot | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. |
 | quillbot\.com | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. |

From 7535893aecf2895dd09bf00de397afde4edf0f8f Mon Sep 17 00:00:00 2001
From: paulrudy <1110792+paulrudy@users.noreply.github.com>
Date: Sun, 15 Jun 2025 16:39:17 -0700
Subject: [PATCH 109/111] re-add facebookexternalhit

---
 robots.json | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/robots.json b/robots.json
index 5d5f692..60c431c 100644
--- a/robots.json
+++ b/robots.json
@@ -174,6 +174,13 @@
         "frequency": "Up to 1 page per second",
         "description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically."
     },
+    "facebookexternalhit": {
+        "operator": "Meta/Facebook",
+        "respect": "[No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313)",
+        "function": "Ostensibly only for sharing, but likely used as an AI crawler as well",
+        "frequency": "Unclear at this time.",
+        "description": "Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is \"to crawl the content of an app or website that was shared on one of Meta’s family of apps…\". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary."
+    },
     "Factset_spyderbot": {
         "operator": "[Factset](https://www.factset.com/ai)",
         "respect": "Unclear at this time.",

From 5326c202b57d87a9921810ef26fca98edce77d5f Mon Sep 17 00:00:00 2001
From: "ai.robots.txt" <ai.robots.txt@users.noreply.github.com>
Date: Mon, 16 Jun 2025 15:12:42 +0000
Subject: [PATCH 110/111] Merge pull request #154 from paulrudy/main

re-add facebookexternalhit
---
 .htaccess                 | 2 +-
 Caddyfile                 | 2 +-
 haproxy-block-ai-bots.txt | 1 +
 nginx-block-ai-bots.conf  | 2 +-
 robots.txt                | 1 +
 table-of-bot-metrics.md   | 1 +
 6 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.htaccess b/.htaccess
index 27637c2..3ba960f 100644
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
diff --git a/Caddyfile b/Caddyfile
index 528ba08..b675b9b 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
 }
\ No newline at end of file
diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt
index c7c3054..9cfb5c6 100644
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -23,6 +23,7 @@ Diffbot
 DuckAssistBot
 EchoboxBot
 FacebookBot
+facebookexternalhit
 Factset_spyderbot
 FirecrawlAgent
 FriendlyCrawler
diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf
index c1afb05..a53333e 100644
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
     return 403;
 }
\ No newline at end of file
diff --git a/robots.txt b/robots.txt
index 0edf11f..9d69a3a 100644
--- a/robots.txt
+++ b/robots.txt
@@ -23,6 +23,7 @@ User-agent: Diffbot
 User-agent: DuckAssistBot
 User-agent: EchoboxBot
 User-agent: FacebookBot
+User-agent: facebookexternalhit
 User-agent: Factset_spyderbot
 User-agent: FirecrawlAgent
 User-agent: FriendlyCrawler
diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md
index 78a3408..a5ab4c7 100644
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -25,6 +25,7 @@
 | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot |
 | EchoboxBot | [Echobox](https://echobox.com) | Unclear at this time. | Data collection to support AI-powered products. | Unclear at this time. | Supports company's AI-powered social and email management products. |
 | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. |
+| facebookexternalhit | Meta/Facebook | [No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) | Ostensibly only for sharing, but likely used as an AI crawler as well | Unclear at this time. | Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is "to crawl the content of an app or website that was shared on one of Meta’s family of apps…". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary. |
 | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. |
 | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. |
 | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. |

From 4ed17b8e4af67d347b039429eb633c96acbba72f Mon Sep 17 00:00:00 2001
From: dark-visitors <dark-visitors@users.noreply.github.com>
Date: Tue, 17 Jun 2025 01:00:21 +0000
Subject: [PATCH 111/111] Update from Dark Visitors

---
 robots.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/robots.json b/robots.json
index 60c431c..ab2a383 100644
--- a/robots.json
+++ b/robots.json
@@ -179,7 +179,7 @@
         "respect": "[No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313)",
         "function": "Ostensibly only for sharing, but likely used as an AI crawler as well",
         "frequency": "Unclear at this time.",
-        "description": "Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is \"to crawl the content of an app or website that was shared on one of Meta’s family of apps…\". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary."
+        "description": "Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is \"to crawl the content of an app or website that was shared on one of Meta\u2019s family of apps\u2026\". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary."
     },
     "Factset_spyderbot": {
         "operator": "[Factset](https://www.factset.com/ai)",