mirror of https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-06-19 10:00:52 +00:00
feat: support ua-synonyms in robots.json to reduce duplication (#144)
This commit is contained in:
parent cf598b6b71, commit f0606f0eac
2 changed files with 93 additions and 163 deletions
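The new "ua-synonyms" field lets a single robots.json entry carry alternate spellings of the same crawler, so the list no longer needs near-duplicate entries. A minimal sketch of such an entry, modeled on the test fixture added in this commit (the bot name and all field values are illustrative, not a real crawler):

# Sketch of a robots.json entry using the new "ua-synonyms" field.
# "MainBot" and its values mirror the test fixture below; they are made up.
{
    "MainBot": {
        "ua-synonyms": ["mainbot/1.0", "Main-Bot"],  # alternate UA strings for the same bot
        "operator": "TestCorp",
        "respect": "No",
        "function": "AI Bot",
        "frequency": "Daily",
        "description": "Used for testing ua-synonyms."
    }
}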
code/robots.py (192 changed lines)
@@ -1,50 +1,25 @@
 #!/usr/bin/env python3

 import json
 import re
 import requests

 from bs4 import BeautifulSoup
 from pathlib import Path


 def load_robots_json():
     """Load the robots.json contents into a dictionary."""
     return json.loads(Path("./robots.json").read_text(encoding="utf-8"))


 def get_agent_soup():
     """Retrieve current known agents from darkvisitors.com"""
     session = requests.Session()
     try:
         response = session.get("https://darkvisitors.com/agents")
     except requests.exceptions.ConnectionError:
-        print(
-            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
-        )
+        print("ERROR: Could not gather the current agents from https://darkvisitors.com/agents")
         return
     return BeautifulSoup(response.text, "html.parser")


 def updated_robots_json(soup):
     """Update AI scraper information with data from darkvisitors."""
     existing_content = load_robots_json()
-    to_include = [
-        "AI Agents",
-        "AI Assistants",
-        "AI Data Scrapers",
-        "AI Search Crawlers",
-        # "Archivers",
-        # "Developer Helpers",
-        # "Fetchers",
-        # "Intelligence Gatherers",
-        # "Scrapers",
-        # "Search Engine Crawlers",
-        # "SEO Crawlers",
-        # "Uncategorized",
-        "Undocumented AI Agents",
-    ]
+    to_include = ["AI Agents", "AI Assistants", "AI Data Scrapers", "AI Search Crawlers", "Undocumented AI Agents"]

     for section in soup.find_all("div", {"class": "agent-links-section"}):
         category = section.find("h2").get_text()
         if category not in to_include:
@@ -62,7 +37,6 @@ def updated_robots_json(soup):
         }
         default_value = "Unclear at this time."

         # Parse the operator information from the description if possible
         operator = default_value
         if "operated by " in desc:
             try:
@@ -71,19 +45,15 @@ def updated_robots_json(soup):
                 print(f"Error: {e}")

         def consolidate(field: str, value: str) -> str:
             # New entry
             if name not in existing_content:
                 return value
             # New field
             if field not in existing_content[name]:
                 return value
             # Unclear value
             if (
                 existing_content[name][field] in default_values
                 and value not in default_values
             ):
                 return value
             # Existing value
             return existing_content[name][field]

         existing_content[name] = {
@@ -99,104 +69,79 @@ def updated_robots_json(soup):

     print(f"Total: {len(existing_content)}")
     sorted_keys = sorted(existing_content, key=lambda k: k.lower())
-    sorted_robots = {k: existing_content[k] for k in sorted_keys}
-    return sorted_robots
+    return {k: existing_content[k] for k in sorted_keys}


 def clean_robot_name(name):
     """ Clean the robot name by removing some characters that were mangled by html software once. """
     # This was specifically spotted in "Perplexity-User"
     # Looks like a non-breaking hyphen introduced by the HTML rendering software
     # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots
     # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen,
     # and it's only the Row-Heading that has the special hyphen
     #
     # Technically, there's no reason there wouldn't someday be a bot that
     # actually uses a non-breaking hyphen, but that seems unlikely,
     # so this solution should be fine for now.
-    result = re.sub(r"\u2011", "-", name)
-    if result != name:
-        print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.")
-    return result
+    return re.sub(r"\u2011", "-", name)


 def ingest_darkvisitors():
-    old_robots_json = load_robots_json()
+    old = load_robots_json()
     soup = get_agent_soup()
     if soup:
         robots_json = updated_robots_json(soup)
-        print(
-            "robots.json is unchanged."
-            if robots_json == old_robots_json
-            else "robots.json got updates."
-        )
-        Path("./robots.json").write_text(
-            json.dumps(robots_json, indent=4), encoding="utf-8"
-        )
+        print("robots.json is unchanged." if robots_json == old else "robots.json got updates.")
+        Path("./robots.json").write_text(json.dumps(robots_json, indent=4), encoding="utf-8")


 def json_to_txt(robots_json):
     """Compose the robots.txt from the robots.json file."""
-    robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys())
-    robots_txt += "\nDisallow: /\n"
-    return robots_txt
+    agents = [
+        ua for name, data in robots_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]
+    txt = "\n".join(f"User-agent: {ua}" for ua in agents)
+    txt += "\nDisallow: /\n"
+    return txt


 def escape_md(s):
     return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)


 def json_to_table(robots_json):
     """Compose a markdown table with the information in robots.json"""
     table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
     table += "|------|----------|-----------------------|----------|------------------|-------------|\n"

     for name, robot in robots_json.items():
         table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'

     return table


-def list_to_pcre(lst):
-    # Python re is not 100% identical to PCRE which is used by Apache, but it
-    # should probably be close enough in the real world for re.escape to work.
-    formatted = "|".join(map(re.escape, lst))
-    return f"({formatted})"
+def list_to_pcre(uas):
+    return f"({'|'.join(map(re.escape, uas))})"


 def json_to_htaccess(robot_json):
     # Creates a .htaccess filter file. It uses a regular expression to filter out
     # User agents that contain any of the blocked values.
-    htaccess = "RewriteEngine On\n"
-    htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n"
-    htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
-    return htaccess
+    all_uas = [
+        ua for name, data in robot_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]
+    return (
+        "RewriteEngine On\n"
+        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_uas)} [NC]\n"
+        "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
+    )


 def json_to_nginx(robot_json):
     # Creates an Nginx config file. This config snippet can be included in
     # nginx server{} blocks to block AI bots.
-    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
-    return config
+    all_uas = [
+        ua for name, data in robot_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]
+    return f'if ($http_user_agent ~* "{list_to_pcre(all_uas)}") {{\n    return 403;\n}}'


 def json_to_caddy(robot_json):
-    caddyfile = "@aibots {\n"
-    caddyfile += f'    header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"'
-    caddyfile += "\n}"
-    return caddyfile
+    all_uas = [
+        ua for name, data in robot_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]
+    return (
+        "@aibots {\n"
+        f'    header_regexp User-Agent "{list_to_pcre(all_uas)}"\n'
+        "}"
+    )


 def json_to_haproxy(robots_json):
     # Creates a source file for HAProxy. Follow instructions in the README to implement it.
-    txt = "\n".join(f"{k}" for k in robots_json.keys())
-    return txt
+    return "\n".join(
+        ua for name, data in robots_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    )


 def update_file_if_changed(file_name, converter):
     """Update files if newer content is available and log the (in)actions."""
     new_content = converter(load_robots_json())
     filepath = Path(file_name)
     # "touch" will create the file if it doesn't exist yet
     filepath.touch()
     old_content = filepath.read_text(encoding="utf-8")
     if old_content == new_content:
@@ -205,58 +150,23 @@ def update_file_if_changed(file_name, converter):
     Path(file_name).write_text(new_content, encoding="utf-8")
     print(f"{file_name} has been updated.")


 def conversions():
     """Triggers the conversions from the json file."""
-    update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
-    update_file_if_changed(
-        file_name="./table-of-bot-metrics.md",
-        converter=json_to_table,
-    )
-    update_file_if_changed(
-        file_name="./.htaccess",
-        converter=json_to_htaccess,
-    )
-    update_file_if_changed(
-        file_name="./nginx-block-ai-bots.conf",
-        converter=json_to_nginx,
-    )
-    update_file_if_changed(
-        file_name="./Caddyfile",
-        converter=json_to_caddy,
-    )
-
-    update_file_if_changed(
-        file_name="./haproxy-block-ai-bots.txt",
-        converter=json_to_haproxy,
-    )
+    update_file_if_changed("robots.txt", json_to_txt)
+    update_file_if_changed("table-of-bot-metrics.md", json_to_table)
+    update_file_if_changed(".htaccess", json_to_htaccess)
+    update_file_if_changed("nginx-block-ai-bots.conf", json_to_nginx)
+    update_file_if_changed("Caddyfile", json_to_caddy)
+    update_file_if_changed("haproxy-block-ai-bots.txt", json_to_haproxy)


 if __name__ == "__main__":
     import argparse

-    parser = argparse.ArgumentParser(
-        prog="ai-robots",
-        description="Collects and updates information about web scrapers of AI companies.",
-        epilog="One of the flags must be set.\n",
-    )
-    parser.add_argument(
-        "--update",
-        action="store_true",
-        help="Update the robots.json file with data from darkvisitors.com/agents",
-    )
-    parser.add_argument(
-        "--convert",
-        action="store_true",
-        help="Create the robots.txt and markdown table from robots.json",
-    )
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--update", action="store_true", help="Update the robots.json file from darkvisitors.com")
+    parser.add_argument("--convert", action="store_true", help="Generate output files from robots.json")
     args = parser.parse_args()

     if not (args.update or args.convert):
         print("ERROR: please provide one of the possible flags.")
         parser.print_help()

     if args.update:
         ingest_darkvisitors()
     if args.convert:
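Each output generator now flattens an entry's canonical name together with its "ua-synonyms" before formatting, instead of iterating over robots_json.keys(). A small sketch of that shared idiom and the robots.txt it implies (sample data is made up; "OtherBot" is hypothetical):

robots_json = {
    "MainBot": {"ua-synonyms": ["mainbot/1.0", "Main-Bot"]},
    "OtherBot": {},  # no synonyms: contributes only its canonical name
}

# The flattening idiom used by json_to_txt, json_to_htaccess, json_to_nginx,
# json_to_caddy, and json_to_haproxy in this commit: the canonical name
# followed by its synonyms, with a safe default for entries lacking the key.
all_uas = [
    ua for name, data in robots_json.items()
    for ua in [name] + data.get("ua-synonyms", [])
]
assert all_uas == ["MainBot", "mainbot/1.0", "Main-Bot", "OtherBot"]

# json_to_txt would then render:
#   User-agent: MainBot
#   User-agent: mainbot/1.0
#   User-agent: Main-Bot
#   User-agent: OtherBot
#   Disallow: /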
code/tests.py

@@ -1,10 +1,17 @@
 #!/usr/bin/env python3
 """To run these tests just execute this script."""

 import json
+import re
 import unittest

-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy
+from robots import (
+    json_to_txt,
+    json_to_table,
+    json_to_htaccess,
+    json_to_nginx,
+    json_to_haproxy,
+    json_to_caddy,
+)


 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -14,16 +21,13 @@ class RobotsUnittestExtensions:
     def assertEqualsFile(self, f, s):
         with open(f, "rt") as f:
             f_contents = f.read()

         return self.assertMultiLineEqual(f_contents, s)


 class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192

     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")

     def test_robots_txt_generation(self):
         robots_txt = json_to_txt(self.robots_dict)
         self.assertEqualsFile("test_files/robots.txt", robots_txt)
@@ -31,10 +35,8 @@ class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):

 class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 32768

     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")

     def test_table_generation(self):
         robots_table = json_to_table(self.robots_dict)
         self.assertEqualsFile("test_files/table-of-bot-metrics.md", robots_table)
@@ -42,53 +44,71 @@ class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):

 class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192

     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")

     def test_htaccess_generation(self):
         robots_htaccess = json_to_htaccess(self.robots_dict)
         self.assertEqualsFile("test_files/.htaccess", robots_htaccess)


 class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192

     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")

     def test_nginx_generation(self):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)


 class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192

     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")

     def test_haproxy_generation(self):
         robots_haproxy = json_to_haproxy(self.robots_dict)
         self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)


-class TestRobotsNameCleaning(unittest.TestCase):
-    def test_clean_name(self):
-        from robots import clean_robot_name
-
-        self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
-
-
 class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192

     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")

     def test_caddyfile_generation(self):
         robots_caddyfile = json_to_caddy(self.robots_dict)
         self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)


+class TestRobotsNameCleaning(unittest.TestCase):
+    def test_clean_name(self):
+        from robots import clean_robot_name
+        self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
+
+
+class TestUASynonymsSupport(unittest.TestCase):
+    def setUp(self):
+        self.test_data = {
+            "MainBot": {
+                "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
+                "operator": "TestCorp",
+                "respect": "No",
+                "function": "AI Bot",
+                "frequency": "Daily",
+                "description": "Used for testing ua-synonyms."
+            }
+        }
+
+    def test_robots_txt_includes_synonyms(self):
+        output = json_to_txt(self.test_data)
+        for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]:
+            self.assertIn(f"User-agent: {variant}", output)
+
+    def test_htaccess_includes_synonyms(self):
+        output = json_to_htaccess(self.test_data)
+        for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]:
+            self.assertIn(re.escape(variant), output)


 if __name__ == "__main__":
     import os
     os.chdir(os.path.dirname(__file__))

     unittest.main(verbosity=2)
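For the .htaccess, nginx, and Caddy outputs, the expanded UA list is collapsed into one regex alternation by list_to_pcre, with re.escape neutralising metacharacters such as "." in agent names (the old comment's caveat still applies: Python's re is not strictly identical to the PCRE Apache uses, but close enough for escaping). A runnable sketch of the behaviour, reusing the fixture names from the tests above:

import re

def list_to_pcre(uas):
    # Same one-liner as the commit's list_to_pcre: escape each UA, join with "|".
    return f"({'|'.join(map(re.escape, uas))})"

pattern = list_to_pcre(["MainBot", "mainbot/1.0", "Main-Bot"])
print(pattern)  # (MainBot|mainbot/1\.0|Main\-Bot) on Python 3.7+

# Dropped into an nginx condition the same way json_to_nginx does:
print(f'if ($http_user_agent ~* "{pattern}") {{\n    return 403;\n}}')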