Brendan Ngwa Nforbi 2025-06-14 12:25:21 -04:00 committed by GitHub
commit 22a7c0c2db
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 105 additions and 98 deletions


@@ -3,7 +3,6 @@
 import json
 import re
 import requests
 from bs4 import BeautifulSoup
 from pathlib import Path
@@ -14,14 +13,12 @@ def load_robots_json():
 def get_agent_soup():
-    """Retrieve current known agents from darkvisitors.com"""
+    """Retrieve current known agents from darkvisitors.com."""
     session = requests.Session()
     try:
         response = session.get("https://darkvisitors.com/agents")
     except requests.exceptions.ConnectionError:
-        print(
-            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
-        )
+        print("ERROR: Could not gather the current agents from https://darkvisitors.com/agents")
         return
     return BeautifulSoup(response.text, "html.parser")
@@ -34,14 +31,6 @@ def updated_robots_json(soup):
         "AI Assistants",
         "AI Data Scrapers",
         "AI Search Crawlers",
-        # "Archivers",
-        # "Developer Helpers",
-        # "Fetchers",
-        # "Intelligence Gatherers",
-        # "Scrapers",
-        # "Search Engine Crawlers",
-        # "SEO Crawlers",
-        # "Uncategorized",
         "Undocumented AI Agents",
     ]
@@ -49,6 +38,7 @@ def updated_robots_json(soup):
         category = section.find("h2").get_text()
         if category not in to_include:
             continue
         for agent in section.find_all("a", href=True):
             name = agent.find("div", {"class": "agent-name"}).get_text().strip()
             name = clean_robot_name(name)
@@ -77,19 +67,18 @@ def updated_robots_json(soup):
                 # New field
                 if field not in existing_content[name]:
                     return value
-                # Unclear value
+                # Replace unclear value
                 if (
                     existing_content[name][field] in default_values
                     and value not in default_values
                 ):
                     return value
-                # Existing value
                 return existing_content[name][field]

             existing_content[name] = {
                 "operator": consolidate("operator", operator),
                 "respect": consolidate("respect", default_value),
-                "function": consolidate("function", f"{category}"),
+                "function": consolidate("function", category),
                 "frequency": consolidate("frequency", default_value),
                 "description": consolidate(
                     "description",
@@ -99,21 +88,11 @@ def updated_robots_json(soup):
     print(f"Total: {len(existing_content)}")
     sorted_keys = sorted(existing_content, key=lambda k: k.lower())
-    sorted_robots = {k: existing_content[k] for k in sorted_keys}
-    return sorted_robots
+    return {k: existing_content[k] for k in sorted_keys}


 def clean_robot_name(name):
-    """ Clean the robot name by removing some characters that were mangled by html software once. """
-    # This was specifically spotted in "Perplexity-User"
-    # Looks like a non-breaking hyphen introduced by the HTML rendering software
-    # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots
-    # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen,
-    # and it's only the Row-Heading that has the special hyphen
-    #
-    # Technically, there's no reason there wouldn't someday be a bot that
-    # actually uses a non-breaking hyphen, but that seems unlikely,
-    # so this solution should be fine for now.
+    """Clean the robot name by removing characters mangled by HTML rendering."""
     result = re.sub(r"\u2011", "-", name)
     if result != name:
         print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.")
@@ -125,117 +104,108 @@ def ingest_darkvisitors():
     soup = get_agent_soup()
     if soup:
         robots_json = updated_robots_json(soup)
-        print(
-            "robots.json is unchanged."
-            if robots_json == old_robots_json
-            else "robots.json got updates."
-        )
-        Path("./robots.json").write_text(
-            json.dumps(robots_json, indent=4), encoding="utf-8"
-        )
+        print("robots.json is unchanged." if robots_json == old_robots_json else "robots.json got updates.")
+        Path("./robots.json").write_text(json.dumps(robots_json, indent=4), encoding="utf-8")


+def all_user_agents(robot_json):
+    """Expand all main names and their ua-synonyms into a flat list."""
+    return [
+        ua for name, data in robot_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]

 def json_to_txt(robots_json):
     """Compose the robots.txt from the robots.json file."""
-    robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys())
-    robots_txt += "\nDisallow: /\n"
-    return robots_txt
+    lines = [f"User-agent: {ua}" for ua in all_user_agents(robots_json)]
+    lines.append("Disallow: /")
+    return "\n".join(lines)


 def escape_md(s):
+    """Escape markdown special characters in bot names."""
     return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)


 def json_to_table(robots_json):
-    """Compose a markdown table with the information in robots.json"""
+    """Compose a markdown table with the information in robots.json."""
     table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
     table += "|------|----------|-----------------------|----------|------------------|-------------|\n"
     for name, robot in robots_json.items():
-        table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'
+        table += (
+            f"| {escape_md(name)} | {robot['operator']} | {robot['respect']} | "
+            f"{robot['function']} | {robot['frequency']} | {robot['description']} |\n"
+        )
     return table

 def list_to_pcre(lst):
-    # Python re is not 100% identical to PCRE which is used by Apache, but it
-    # should probably be close enough in the real world for re.escape to work.
-    formatted = "|".join(map(re.escape, lst))
-    return f"({formatted})"
+    """Convert a list of user agents into a regex pattern."""
+    return f"({'|'.join(map(re.escape, lst))})"


 def json_to_htaccess(robot_json):
-    # Creates a .htaccess filter file. It uses a regular expression to filter out
-    # User agents that contain any of the blocked values.
-    htaccess = "RewriteEngine On\n"
-    htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n"
-    htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
-    return htaccess
+    """Generate .htaccess content to block bots via user-agent regex."""
+    return (
+        "RewriteEngine On\n"
+        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_user_agents(robot_json))} [NC]\n"
+        "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
+    )


 def json_to_nginx(robot_json):
-    # Creates an Nginx config file. This config snippet can be included in
-    # nginx server{} blocks to block AI bots.
-    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
-    return config
+    """Generate Nginx config snippet to block AI bots."""
+    return (
+        f'if ($http_user_agent ~* "{list_to_pcre(all_user_agents(robot_json))}") {{\n'
+        f'    return 403;\n'
+        f'}}'
+    )


 def json_to_caddy(robot_json):
-    caddyfile = "@aibots {\n"
-    caddyfile += f'    header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"'
-    caddyfile += "\n}"
-    return caddyfile
+    """Generate a Caddyfile snippet to block AI bots."""
+    return (
+        "@aibots {\n"
+        f'    header_regexp User-Agent "{list_to_pcre(all_user_agents(robot_json))}"\n'
+        "}"
+    )


 def json_to_haproxy(robots_json):
-    # Creates a source file for HAProxy. Follow instructions in the README to implement it.
-    txt = "\n".join(f"{k}" for k in robots_json.keys())
-    return txt
+    """Generate HAProxy configuration source."""
+    return "\n".join(all_user_agents(robots_json))

 def update_file_if_changed(file_name, converter):
-    """Update files if newer content is available and log the (in)actions."""
+    """Update output files only if the content has changed."""
     new_content = converter(load_robots_json())
     filepath = Path(file_name)
-    # "touch" will create the file if it doesn't exist yet
     filepath.touch()
     old_content = filepath.read_text(encoding="utf-8")
     if old_content == new_content:
         print(f"{file_name} is already up to date.")
     else:
-        Path(file_name).write_text(new_content, encoding="utf-8")
+        filepath.write_text(new_content, encoding="utf-8")
         print(f"{file_name} has been updated.")


 def conversions():
-    """Triggers the conversions from the json file."""
-    update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
-    update_file_if_changed(
-        file_name="./table-of-bot-metrics.md",
-        converter=json_to_table,
-    )
-    update_file_if_changed(
-        file_name="./.htaccess",
-        converter=json_to_htaccess,
-    )
-    update_file_if_changed(
-        file_name="./nginx-block-ai-bots.conf",
-        converter=json_to_nginx,
-    )
-    update_file_if_changed(
-        file_name="./Caddyfile",
-        converter=json_to_caddy,
-    )
-    update_file_if_changed(
-        file_name="./haproxy-block-ai-bots.txt",
-        converter=json_to_haproxy,
-    )
+    """Generate all output files from robots.json."""
+    update_file_if_changed("./robots.txt", json_to_txt)
+    update_file_if_changed("./table-of-bot-metrics.md", json_to_table)
+    update_file_if_changed("./.htaccess", json_to_htaccess)
+    update_file_if_changed("./nginx-block-ai-bots.conf", json_to_nginx)
+    update_file_if_changed("./Caddyfile", json_to_caddy)
+    update_file_if_changed("./haproxy-block-ai-bots.txt", json_to_haproxy)

 if __name__ == "__main__":
     import argparse
+    parser = argparse.ArgumentParser()
     parser = argparse.ArgumentParser(
         prog="ai-robots",
         description="Collects and updates information about web scrapers of AI companies.",
@@ -251,6 +221,7 @@ if __name__ == "__main__":
         action="store_true",
         help="Create the robots.txt and markdown table from robots.json",
     )

     args = parser.parse_args()

     if not (args.update or args.convert):
@@ -259,5 +230,6 @@ if __name__ == "__main__":
     if args.update:
         ingest_darkvisitors()
     if args.convert:
         conversions()
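
For context, a minimal sketch of what the ua-synonyms expansion introduced above does. The "ExampleBot" entry and its synonym are hypothetical names used only for illustration, and the snippet assumes it is run next to robots.py so the imports resolve:

# Sketch (assumption, not part of the commit): how all_user_agents() and
# json_to_txt() behave for an entry that carries "ua-synonyms".
from robots import all_user_agents, json_to_txt

sample = {
    "ExampleBot": {  # hypothetical bot name, not in robots.json
        "ua-synonyms": ["examplebot/1.0"],
        "operator": "Example Corp",
        "respect": "Unclear at this time.",
        "function": "AI Data Scrapers",
        "frequency": "Unclear at this time.",
        "description": "Hypothetical entry used to illustrate ua-synonyms.",
    }
}

print(all_user_agents(sample))  # ['ExampleBot', 'examplebot/1.0']
print(json_to_txt(sample))
# User-agent: ExampleBot
# User-agent: examplebot/1.0
# Disallow: /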


@@ -3,8 +3,18 @@
 import json
 import unittest
+import re

-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy
+from robots import (
+    json_to_txt,
+    json_to_table,
+    json_to_htaccess,
+    json_to_nginx,
+    json_to_haproxy,
+    json_to_caddy,
+    clean_robot_name
+)


 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -14,7 +24,6 @@ class RobotsUnittestExtensions:
     def assertEqualsFile(self, f, s):
         with open(f, "rt") as f:
             f_contents = f.read()
         return self.assertMultiLineEqual(f_contents, s)
@@ -50,6 +59,7 @@ class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_htaccess = json_to_htaccess(self.robots_dict)
         self.assertEqualsFile("test_files/.htaccess", robots_htaccess)


 class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
@@ -60,6 +70,7 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)


 class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
@@ -70,11 +81,6 @@ class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_haproxy = json_to_haproxy(self.robots_dict)
         self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)

-class TestRobotsNameCleaning(unittest.TestCase):
-    def test_clean_name(self):
-        from robots import clean_robot_name
-
-        self.assertEqual(clean_robot_name("Perplexity\u2011User"), "Perplexity-User")


 class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
@@ -87,8 +93,37 @@ class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
         self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)

+class TestRobotsNameCleaning(unittest.TestCase):
+    def test_clean_name(self):
+        self.assertEqual(clean_robot_name("Perplexity\u2011User"), "Perplexity-User")
+
+
+class TestUASynonymSupport(unittest.TestCase):
+    def setUp(self):
+        self.test_data = {
+            "MainBot": {
+                "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
+                "operator": "TestCorp",
+                "respect": "No",
+                "function": "Test",
+                "frequency": "Often",
+                "description": "A test bot"
+            }
+        }
+
+    def test_robots_txt_includes_synonyms(self):
+        output = json_to_txt(self.test_data)
+        for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]:
+            self.assertIn(f"User-agent: {variant}", output)
+
+    def test_htaccess_includes_synonyms(self):
+        output = json_to_htaccess(self.test_data)
+        pattern = r"(MainBot|mainbot/1\.0|Main\-Bot)"
+        self.assertRegex(output, pattern)

 if __name__ == "__main__":
     import os
     os.chdir(os.path.dirname(__file__))
     unittest.main(verbosity=2)
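
For a rough sense of the escaping the new assertRegex test expects, a small sketch that feeds the same test data through json_to_htaccess; the output shown in the comments is an assumption derived from list_to_pcre and re.escape as used in the first file, not a committed fixture:

# Sketch (assumption, not part of the commit): expected shape of the .htaccess
# snippet for the TestUASynonymSupport data above.
from robots import json_to_htaccess

test_data = {
    "MainBot": {
        "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
        "operator": "TestCorp",
        "respect": "No",
        "function": "Test",
        "frequency": "Often",
        "description": "A test bot",
    }
}

print(json_to_htaccess(test_data))
# RewriteEngine On
# RewriteCond %{HTTP_USER_AGENT} (MainBot|mainbot/1\.0|Main\-Bot) [NC]
# RewriteRule !^/?robots\.txt$ - [F,L]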