diff --git a/code/robots.py b/code/robots.py
index c795649..1c93f84 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -3,7 +3,6 @@
 import json
 import re
 import requests
-
 from bs4 import BeautifulSoup
 from pathlib import Path
@@ -14,14 +13,12 @@ def load_robots_json():
 
 
 def get_agent_soup():
-    """Retrieve current known agents from darkvisitors.com"""
+    """Retrieve current known agents from darkvisitors.com."""
     session = requests.Session()
     try:
         response = session.get("https://darkvisitors.com/agents")
     except requests.exceptions.ConnectionError:
-        print(
-            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
-        )
+        print("ERROR: Could not gather the current agents from https://darkvisitors.com/agents")
         return
     return BeautifulSoup(response.text, "html.parser")
@@ -34,14 +31,6 @@ def updated_robots_json(soup):
         "AI Assistants",
         "AI Data Scrapers",
         "AI Search Crawlers",
-        # "Archivers",
-        # "Developer Helpers",
-        # "Fetchers",
-        # "Intelligence Gatherers",
-        # "Scrapers",
-        # "Search Engine Crawlers",
-        # "SEO Crawlers",
-        # "Uncategorized",
         "Undocumented AI Agents",
     ]
@@ -49,6 +38,7 @@
         category = section.find("h2").get_text()
         if category not in to_include:
             continue
+
         for agent in section.find_all("a", href=True):
             name = agent.find("div", {"class": "agent-name"}).get_text().strip()
             name = clean_robot_name(name)
@@ -77,19 +67,18 @@
             # New field
             if field not in existing_content[name]:
                 return value
-            # Unclear value
+            # Replace unclear value
            if (
                 existing_content[name][field] in default_values
                 and value not in default_values
             ):
                 return value
-            # Existing value
             return existing_content[name][field]
 
         existing_content[name] = {
             "operator": consolidate("operator", operator),
             "respect": consolidate("respect", default_value),
-            "function": consolidate("function", f"{category}"),
+            "function": consolidate("function", category),
             "frequency": consolidate("frequency", default_value),
             "description": consolidate(
                 "description",
@@ -99,21 +88,11 @@
     print(f"Total: {len(existing_content)}")
     sorted_keys = sorted(existing_content, key=lambda k: k.lower())
-    sorted_robots = {k: existing_content[k] for k in sorted_keys}
-    return sorted_robots
+    return {k: existing_content[k] for k in sorted_keys}
 
 
 def clean_robot_name(name):
-    """ Clean the robot name by removing some characters that were mangled by html software once. """
-    # This was specifically spotted in "Perplexity-User"
-    # Looks like a non-breaking hyphen introduced by the HTML rendering software
-    # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots
-    # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen,
-    # and it's only the Row-Heading that has the special hyphen
-    #
-    # Technically, there's no reason there wouldn't someday be a bot that
-    # actually uses a non-breaking hyphen, but that seems unlikely,
-    # so this solution should be fine for now.
+    """Clean the robot name by removing characters mangled by HTML rendering."""
     result = re.sub(r"\u2011", "-", name)
     if result != name:
         print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.")
     return result
@@ -125,117 +104,108 @@ def ingest_darkvisitors():
     soup = get_agent_soup()
     if soup:
         robots_json = updated_robots_json(soup)
-        print(
-            "robots.json is unchanged."
-            if robots_json == old_robots_json
-            else "robots.json got updates."
-        )
-        Path("./robots.json").write_text(
-            json.dumps(robots_json, indent=4), encoding="utf-8"
-        )
+        print("robots.json is unchanged." if robots_json == old_robots_json else "robots.json got updates.")
+        Path("./robots.json").write_text(json.dumps(robots_json, indent=4), encoding="utf-8")
+
+
+def all_user_agents(robot_json):
+    """Expand all main names and their ua-synonyms into a flat list."""
+    return [
+        ua for name, data in robot_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]
 
 
 def json_to_txt(robots_json):
     """Compose the robots.txt from the robots.json file."""
-    robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys())
-    robots_txt += "\nDisallow: /\n"
-    return robots_txt
+    lines = [f"User-agent: {ua}" for ua in all_user_agents(robots_json)]
+    lines.append("Disallow: /")
+    return "\n".join(lines)
 
 
 def escape_md(s):
+    """Escape markdown special characters in bot names."""
     return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)
 
 
 def json_to_table(robots_json):
-    """Compose a markdown table with the information in robots.json"""
+    """Compose a markdown table with the information in robots.json."""
     table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
     table += "|------|----------|-----------------------|----------|------------------|-------------|\n"
 
     for name, robot in robots_json.items():
-        table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'
+        table += (
+            f"| {escape_md(name)} | {robot['operator']} | {robot['respect']} | "
+            f"{robot['function']} | {robot['frequency']} | {robot['description']} |\n"
+        )
 
     return table
 
 
 def list_to_pcre(lst):
-    # Python re is not 100% identical to PCRE which is used by Apache, but it
-    # should probably be close enough in the real world for re.escape to work.
-    formatted = "|".join(map(re.escape, lst))
-    return f"({formatted})"
+    """Convert a list of user agents into a regex pattern."""
+    return f"({'|'.join(map(re.escape, lst))})"
 
 
 def json_to_htaccess(robot_json):
-    # Creates a .htaccess filter file. It uses a regular expression to filter out
-    # User agents that contain any of the blocked values.
-    htaccess = "RewriteEngine On\n"
-    htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n"
-    htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
-    return htaccess
+    """Generate .htaccess content to block bots via user-agent regex."""
+    return (
+        "RewriteEngine On\n"
+        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_user_agents(robot_json))} [NC]\n"
+        "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
+    )
 
 
 def json_to_nginx(robot_json):
-    # Creates an Nginx config file. This config snippet can be included in
-    # nginx server{} blocks to block AI bots.
-    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
-    return config
+    """Generate Nginx config snippet to block AI bots."""
+    return (
+        f'if ($http_user_agent ~* "{list_to_pcre(all_user_agents(robot_json))}") {{\n'
+        f'    return 403;\n'
+        f'}}'
+    )
 
 
 def json_to_caddy(robot_json):
-    caddyfile = "@aibots {\n"
-    caddyfile += f'    header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"'
-    caddyfile += "\n}"
-    return caddyfile
+    """Generate a Caddyfile snippet to block AI bots."""
+    return (
+        "@aibots {\n"
+        f'    header_regexp User-Agent "{list_to_pcre(all_user_agents(robot_json))}"\n'
+        "}"
+    )
 
 
 def json_to_haproxy(robots_json):
-    # Creates a source file for HAProxy. Follow instructions in the README to implement it.
-    txt = "\n".join(f"{k}" for k in robots_json.keys())
-    return txt
-
+    """Generate HAProxy configuration source."""
+    return "\n".join(all_user_agents(robots_json))
 
 
 def update_file_if_changed(file_name, converter):
-    """Update files if newer content is available and log the (in)actions."""
+    """Update output files only if the content has changed."""
     new_content = converter(load_robots_json())
     filepath = Path(file_name)
-    # "touch" will create the file if it doesn't exist yet
     filepath.touch()
     old_content = filepath.read_text(encoding="utf-8")
+
     if old_content == new_content:
         print(f"{file_name} is already up to date.")
     else:
-        Path(file_name).write_text(new_content, encoding="utf-8")
+        filepath.write_text(new_content, encoding="utf-8")
         print(f"{file_name} has been updated.")
 
 
 def conversions():
-    """Triggers the conversions from the json file."""
-    update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
-    update_file_if_changed(
-        file_name="./table-of-bot-metrics.md",
-        converter=json_to_table,
-    )
-    update_file_if_changed(
-        file_name="./.htaccess",
-        converter=json_to_htaccess,
-    )
-    update_file_if_changed(
-        file_name="./nginx-block-ai-bots.conf",
-        converter=json_to_nginx,
-    )
-    update_file_if_changed(
-        file_name="./Caddyfile",
-        converter=json_to_caddy,
-    )
-
-    update_file_if_changed(
-        file_name="./haproxy-block-ai-bots.txt",
-        converter=json_to_haproxy,
-    )
+    """Generate all output files from robots.json."""
+    update_file_if_changed("./robots.txt", json_to_txt)
+    update_file_if_changed("./table-of-bot-metrics.md", json_to_table)
+    update_file_if_changed("./.htaccess", json_to_htaccess)
+    update_file_if_changed("./nginx-block-ai-bots.conf", json_to_nginx)
+    update_file_if_changed("./Caddyfile", json_to_caddy)
+    update_file_if_changed("./haproxy-block-ai-bots.txt", json_to_haproxy)
 
 
 if __name__ == "__main__":
     import argparse
 
-    parser = argparse.ArgumentParser()
     parser = argparse.ArgumentParser(
         prog="ai-robots",
         description="Collects and updates information about web scrapers of AI companies.",
@@ -251,6 +221,7 @@ if __name__ == "__main__":
         action="store_true",
         help="Create the robots.txt and markdown table from robots.json",
     )
+
     args = parser.parse_args()
 
     if not (args.update or args.convert):
@@ -259,5 +230,6 @@
 
     if args.update:
         ingest_darkvisitors()
+
     if args.convert:
         conversions()
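
Reviewer note (not part of the patch): the new optional "ua-synonyms" field is the heart of this change. all_user_agents() flattens each bot's canonical name together with its synonyms, so every generator above (robots.txt, .htaccess, nginx, Caddy, HAProxy) now covers the synonyms too, while entries without the field fall back to data.get("ua-synonyms", []) and behave exactly as before. A minimal sketch of the data flow, assuming the patched code/robots.py is importable and using a hypothetical "ExampleBot" entry:

    from robots import all_user_agents, json_to_txt, list_to_pcre

    # Hypothetical entry, for illustration only; not a real robots.json bot.
    robots = {
        "ExampleBot": {
            "ua-synonyms": ["examplebot/2.1", "Example-Bot"],
            "operator": "Example Corp",
            "respect": "Unclear",
            "function": "AI Data Scrapers",
            "frequency": "Unclear",
            "description": "Hypothetical crawler used to illustrate synonym expansion.",
        }
    }

    print(all_user_agents(robots))
    # ['ExampleBot', 'examplebot/2.1', 'Example-Bot']

    print(json_to_txt(robots))
    # User-agent: ExampleBot
    # User-agent: examplebot/2.1
    # User-agent: Example-Bot
    # Disallow: /

    print(list_to_pcre(all_user_agents(robots)))
    # (ExampleBot|examplebot/2\.1|Example\-Bot)

One behavioral detail worth flagging: the rewritten json_to_txt() no longer emits a trailing newline after "Disallow: /", so the first regeneration of robots.txt will show a one-character diff even for an unchanged bot list.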
self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx) + class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): maxDiff = 8192 @@ -70,11 +81,6 @@ class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): robots_haproxy = json_to_haproxy(self.robots_dict) self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy) -class TestRobotsNameCleaning(unittest.TestCase): - def test_clean_name(self): - from robots import clean_robot_name - - self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User") class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions): maxDiff = 8192 @@ -87,8 +93,37 @@ class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions): self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile) +class TestRobotsNameCleaning(unittest.TestCase): + def test_clean_name(self): + self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User") + + +class TestUASynonymSupport(unittest.TestCase): + def setUp(self): + self.test_data = { + "MainBot": { + "ua-synonyms": ["mainbot/1.0", "Main-Bot"], + "operator": "TestCorp", + "respect": "No", + "function": "Test", + "frequency": "Often", + "description": "A test bot" + } + } + + def test_robots_txt_includes_synonyms(self): + output = json_to_txt(self.test_data) + for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]: + self.assertIn(f"User-agent: {variant}", output) + + def test_htaccess_includes_synonyms(self): + output = json_to_htaccess(self.test_data) + pattern = r"(MainBot|mainbot/1\.0|Main\-Bot)" + self.assertRegex(output, pattern) + + if __name__ == "__main__": import os os.chdir(os.path.dirname(__file__)) - unittest.main(verbosity=2) +