Update from Dark Visitors

Merge pull request #133 from ai-robots-txt/wpbot
chore(robots.json): adds wpbot
2025-06-19 18:10:52 +00:00 · 2025-05-22 00:58:45 +00:00 · 2025-05-21 21:06:05 +00:00 · 2025-05-21 22:05:51 +01:00 · 2025-05-21 19:00:23 +00:00 · 2025-05-21 12:00:11 -07:00
7 changed files with 42 additions and 15 deletions
--- a/.htaccess
+++ b/.htaccess
@@ -1,3 +1,3 @@
 RewriteEngine On
-RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC]
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC]
 RewriteRule !^/?robots\.txt$ - [F,L]
--- a/2
+++ b/2
@@ -1,3 +1,3 @@
 @aibots {
-        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)"
+        header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)"
 }
--- a/haproxy-block-ai-bots.txt
+++ b/haproxy-block-ai-bots.txt
@@ -9,8 +9,9 @@ Brightbot 1.0
 Bytespider
 CCBot
 ChatGPT-User
-Claude-Web
 ClaudeBot
+Claude-User
+Claude-SearchBot
 cohere-ai
 cohere-training-data-crawler
 Cotoyogi
@@ -57,4 +58,5 @@ TikTokSpider
 Timpibot
 VelenPublicWebCrawler
 Webzio-Extended
+wpbot
 YouBot
--- a/nginx-block-ai-bots.conf
+++ b/nginx-block-ai-bots.conf
@@ -1,3 +1,3 @@
-if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") {
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") {
    return 403;
 }
--- a/robots.json
+++ b/robots.json
@@ -76,12 +76,26 @@
        "frequency": "Only when prompted by a user.",
        "description": "Used by plugins in ChatGPT to answer queries based on user input."
    },
-    "Claude-Web": {
+    "Claude-SearchBot": {
        "operator": "[Anthropic](https://www.anthropic.com)",
-        "respect": "Unclear at this time.",
-        "function": "Scrapes data to train Anthropic's AI products.",
+        "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)",
+        "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.",
        "frequency": "No information provided.",
-        "description": "Scrapes data to train LLMs and AI products offered by Anthropic."
+        "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses."
+    },
+    "Claude-User": {
+        "operator": "[Anthropic](https://www.anthropic.com)",
+        "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)",
+        "function": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent.",
+        "frequency": "No information provided.",
+        "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent."
+    },
+    "Claude-Web": {
+        "operator": "Anthropic",
+        "respect": "Unclear at this time.",
+        "function": "Undocumented AI Agents",
+        "frequency": "Unclear at this time.",
+        "description": "Claude-Web is an AI-related agent operated by Anthropic. It's currently unclear exactly what it's used for, since there's no official documentation. If you can provide more detail, please contact us. More info can be found at https://darkvisitors.com/agents/agents/claude-web"
    },
    "ClaudeBot": {
        "operator": "[Anthropic](https://www.anthropic.com)",
@@ -280,11 +294,11 @@
        "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher"
    },
    "MistralAI-User/1.0": {
-      "operator": "Mistral AI",
-      "function": "Takes action based on user prompts.",
-      "frequency": "Only when prompted by a user.",
-      "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.",
-      "respect": "Yes"
+        "operator": "Mistral AI",
+        "function": "Takes action based on user prompts.",
+        "frequency": "Only when prompted by a user.",
+        "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.",
+        "respect": "Yes"
    },
    "NovaAct": {
        "operator": "Unclear at this time.",
@@ -412,6 +426,13 @@
        "frequency": "Unclear at this time.",
        "description": "Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended"
    },
+    "wpbot": {
+        "operator": "[QuantumCloud](https://www.quantumcloud.com)",
+        "respect": "Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9)",
+        "function": "Live chat support and lead generation.",
+        "frequency": "Unclear at this time.",
+        "description": "wpbot is used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support."
+    },
    "YouBot": {
        "operator": "[You](https://about.you.com/youchat/)",
        "respect": "[Yes](https://about.you.com/youbot/)",
--- a/robots.txt
+++ b/robots.txt
@@ -9,8 +9,9 @@ User-agent: Brightbot 1.0
 User-agent: Bytespider
 User-agent: CCBot
 User-agent: ChatGPT-User
-User-agent: Claude-Web
 User-agent: ClaudeBot
+User-agent: Claude-User
+User-agent: Claude-SearchBot
 User-agent: cohere-ai
 User-agent: cohere-training-data-crawler
 User-agent: Cotoyogi
@@ -57,5 +58,6 @@ User-agent: TikTokSpider
 User-agent: Timpibot
 User-agent: VelenPublicWebCrawler
 User-agent: Webzio-Extended
+User-agent: wpbot
 User-agent: YouBot
 Disallow: /
--- a/table-of-bot-metrics.md
+++ b/table-of-bot-metrics.md
@@ -11,8 +11,9 @@
 | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMs, including ChatGPT competitors. |
 | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). |
 | ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. |
-| Claude\-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
 | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. |
+| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. |
+| Claude\-SearchBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | No information provided. | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. |
 | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. |
 | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler |
 | Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. |
@@ -59,4 +60,5 @@
 | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. |
 | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." |
 | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended |
+| wpbot | [QuantumCloud](https://www.quantumcloud.com) | Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9) | Live chat support and lead generation. | Unclear at this time. | wpbot is used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support. |
 | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. |
Author	SHA1	Message	Date
dark-visitors	7bf7f9164d	Update from Dark Visitors Some checks failed / run-tests (push) Waiting to run Details / lint-json (push) Waiting to run Details / ai-robots-txt (push) Has been cancelled Details	2025-05-22 00:58:45 +00:00
ai.robots.txt	fedb658cc0	Merge pull request #133 from ai-robots-txt/wpbot chore(robots.json): adds wpbot	2025-05-21 21:06:05 +00:00
Glyn Normington	851eabe059	Merge pull request #133 from ai-robots-txt/wpbot chore(robots.json): adds wpbot	2025-05-21 22:05:51 +01:00
ai.robots.txt	7c5389f4a0	Merge pull request #98 from kylebuckingham/main Updating Claude Bots	2025-05-21 19:00:23 +00:00
Cory Dransfeldt	af597586b6	Merge pull request #98 from kylebuckingham/main Updating Claude Bots	2025-05-21 12:00:11 -07:00
Cory Dransfeldt	b1d9a60a38	chore(robots.json): adds wpbot	2025-05-21 11:40:33 -07:00
Kyle Buckingham	fd41de8522	Update robots.json Co-authored-by: Glyn Normington <work@underlap.org>	2025-04-16 16:43:03 -07:00
Kyle Buckingham	4a6f37d727	Update robots.json Co-authored-by: Glyn Normington <work@underlap.org>	2025-04-16 16:42:58 -07:00
Kyle Buckingham	8dc36aa2e2	Update robots.txt	2025-04-01 15:23:28 -07:00
Kyle Buckingham	ae8f74c10c	Update robots.json	2025-04-01 15:22:04 -07:00