Merge 8dc36aa2e2 into a96e330989

Update from Dark Visitors
Merge pull request #105 from jsheard/patch-1
2025-06-19 10:00:52 +00:00 · 2025-04-15 12:41:53 +02:00 · 2025-04-15 00:57:01 +00:00 · 2025-04-14 10:08:38 -07:00 · 2025-04-14 15:46:01 +01:00 · 2025-04-01 15:23:28 -07:00
3 changed files with 32 additions and 9 deletions
--- a/code/robots.py
+++ b/code/robots.py
@@ -30,6 +30,7 @@ def updated_robots_json(soup):
    """Update AI scraper information with data from darkvisitors."""
    existing_content = load_robots_json()
    to_include = [
        "AI Agents",
        "AI Assistants",
        "AI Data Scrapers",
        "AI Search Crawlers",
--- a/robots.json
+++ b/robots.json
@@ -69,13 +69,6 @@
        "frequency": "Only when prompted by a user.",
        "description": "Used by plugins in ChatGPT to answer queries based on user input."
    },
    "Claude-Web": {
        "operator": "[Anthropic](https://www.anthropic.com)",
        "respect": "Unclear at this time.",
        "function": "Scrapes data to train Anthropic's AI products.",
        "frequency": "No information provided.",
        "description": "Scrapes data to train LLMs and AI products offered by Anthropic."
    },
    "ClaudeBot": {
        "operator": "[Anthropic](https://www.anthropic.com)",
        "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)",
@@ -83,6 +76,20 @@
        "frequency": "No information provided.",
        "description": "Scrapes data to train LLMs and AI products offered by Anthropic."
    },
    "Claude-User": {
        "operator": "[Anthropic](https://www.anthropic.com)",
        "respect": "Unclear at this time.",
        "function": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent.",
        "frequency": "No information provided.",
        "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent."
    },
    "Claude-SearchBot": {
        "operator": "[Anthropic](https://www.anthropic.com)",
        "respect": "Unclear at this time.",
        "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.",
        "frequency": "No information provided.",
        "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses."
    },
    "cohere-ai": {
        "operator": "[Cohere](https://cohere.com)",
        "respect": "Unclear at this time.",
@@ -230,6 +237,13 @@
        "frequency": "Unclear at this time.",
        "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher"
    },
    "NovaAct": {
        "operator": "Unclear at this time.",
        "respect": "Unclear at this time.",
        "function": "AI Agents",
        "frequency": "Unclear at this time.",
        "description": "Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact"
    },
    "OAI-SearchBot": {
        "operator": "[OpenAI](https://openai.com)",
        "respect": "[Yes](https://platform.openai.com/docs/bots)",
@@ -251,6 +265,13 @@
        "operator": "[Webz.io](https://webz.io/)",
        "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)"
    },
    "Operator": {
        "operator": "Unclear at this time.",
        "respect": "Unclear at this time.",
        "function": "AI Agents",
        "frequency": "Unclear at this time.",
        "description": "Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator"
    },
    "PanguBot": {
        "operator": "the Chinese company Huawei",
        "respect": "Unclear at this time.",
@@ -335,4 +356,4 @@
        "frequency": "No information.",
        "description": "Retrieves data used for You.com web search engine and LLMs."
    }
-}
+}
--- a/robots.txt
+++ b/robots.txt
@@ -8,8 +8,9 @@ User-agent: Brightbot 1.0
 User-agent: Bytespider
 User-agent: CCBot
 User-agent: ChatGPT-User
 User-agent: Claude-Web
 User-agent: ClaudeBot
 User-agent: Claude-User
 User-agent: Claude-SearchBot
 User-agent: cohere-ai
 User-agent: cohere-training-data-crawler
 User-agent: Crawlspace
Author	SHA1	Message	Date
Kyle Buckingham	ed25a8a2ad	Merge `8dc36aa2e2` into `a96e330989`	2025-04-15 12:41:53 +02:00
dark-visitors	a96e330989	Update from Dark Visitors Some checks are pending / run-tests (push) Waiting to run Details	2025-04-15 00:57:01 +00:00
Cory Dransfeldt	156e6baa09	Merge pull request #105 from jsheard/patch-1 Some checks are pending / run-tests (push) Waiting to run Details Include "AI Agents" from Dark Visitors	2025-04-14 10:08:38 -07:00
Joshua Sheard	d9f882a9b2	Include "AI Agents" from Dark Visitors	2025-04-14 15:46:01 +01:00
Kyle Buckingham	8dc36aa2e2	Update robots.txt	2025-04-01 15:23:28 -07:00
Kyle Buckingham	ae8f74c10c	Update robots.json	2025-04-01 15:22:04 -07:00