diff --git a/code/robots.py b/code/robots.py
index 8fd0be3..1c93f84 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -1,14 +1,19 @@
 #!/usr/bin/env python3
+
 import json
 import re
 import requests
 from bs4 import BeautifulSoup
 from pathlib import Path
 
+
 def load_robots_json():
+    """Load the robots.json contents into a dictionary."""
     return json.loads(Path("./robots.json").read_text(encoding="utf-8"))
 
+
 def get_agent_soup():
+    """Retrieve current known agents from darkvisitors.com."""
     session = requests.Session()
     try:
         response = session.get("https://darkvisitors.com/agents")
@@ -17,13 +22,23 @@ def get_agent_soup():
         return
     return BeautifulSoup(response.text, "html.parser")
 
+
 def updated_robots_json(soup):
+    """Update AI scraper information with data from darkvisitors."""
     existing_content = load_robots_json()
-    to_include = ["AI Agents", "AI Assistants", "AI Data Scrapers", "AI Search Crawlers", "Undocumented AI Agents"]
+    to_include = [
+        "AI Agents",
+        "AI Assistants",
+        "AI Data Scrapers",
+        "AI Search Crawlers",
+        "Undocumented AI Agents",
+    ]
+
     for section in soup.find_all("div", {"class": "agent-links-section"}):
         category = section.find("h2").get_text()
         if category not in to_include:
             continue
+
         for agent in section.find_all("a", href=True):
             name = agent.find("div", {"class": "agent-name"}).get_text().strip()
             name = clean_robot_name(name)
@@ -37,6 +52,7 @@ def updated_robots_json(soup):
             }
             default_value = "Unclear at this time."
 
+            # Parse the operator information from the description if possible
             operator = default_value
             if "operated by " in desc:
                 try:
@@ -45,10 +61,13 @@ def updated_robots_json(soup):
                     print(f"Error: {e}")
 
             def consolidate(field: str, value: str) -> str:
+                # New entry
                 if name not in existing_content:
                     return value
+                # New field
                 if field not in existing_content[name]:
                     return value
+                # Replace unclear value
                 if (
                     existing_content[name][field] in default_values
                     and value not in default_values
@@ -59,7 +78,7 @@ def updated_robots_json(soup):
             existing_content[name] = {
                 "operator": consolidate("operator", operator),
                 "respect": consolidate("respect", default_value),
-                "function": consolidate("function", f"{category}"),
+                "function": consolidate("function", category),
                 "frequency": consolidate("frequency", default_value),
                 "description": consolidate(
                     "description",
@@ -71,103 +90,146 @@ def updated_robots_json(soup):
     sorted_keys = sorted(existing_content, key=lambda k: k.lower())
     return {k: existing_content[k] for k in sorted_keys}
 
+
 def clean_robot_name(name):
-    return re.sub(r"\u2011", "-", name)
+    """Clean the robot name by removing characters mangled by HTML rendering."""
+    result = re.sub(r"\u2011", "-", name)
+    if result != name:
+        print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.")
+    return result
+
 
 def ingest_darkvisitors():
-    old = load_robots_json()
+    old_robots_json = load_robots_json()
     soup = get_agent_soup()
     if soup:
         robots_json = updated_robots_json(soup)
-        print("robots.json is unchanged." if robots_json == old else "robots.json got updates.")
+        print("robots.json is unchanged." if robots_json == old_robots_json else "robots.json got updates.")
         Path("./robots.json").write_text(json.dumps(robots_json, indent=4), encoding="utf-8")
 
 
-def json_to_txt(robots_json):
-    agents = [
-        ua for name, data in robots_json.items()
-        for ua in [name] + data.get("ua-synonyms", [])
-    ]
-    txt = "\n".join(f"User-agent: {ua}" for ua in agents)
-    txt += "\nDisallow: /\n"
-    return txt
-
-
-def escape_md(s):
-    return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)
-
-
-def json_to_table(robots_json):
-    table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
-    table += "|------|----------|-----------------------|----------|------------------|-------------|\n"
-    for name, robot in robots_json.items():
-        table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'
-    return table
-
-
-def list_to_pcre(uas):
-    return f"({'|'.join(map(re.escape, uas))})"
-
-
-def json_to_htaccess(robot_json):
-    all_uas = [
+def all_user_agents(robot_json):
+    """Expand all main names and their ua-synonyms into a flat list."""
+    return [
         ua for name, data in robot_json.items()
         for ua in [name] + data.get("ua-synonyms", [])
     ]
+
+
+def json_to_txt(robots_json):
+    """Compose the robots.txt from the robots.json file."""
+    lines = [f"User-agent: {ua}" for ua in all_user_agents(robots_json)]
+    lines.append("Disallow: /")
+    return "\n".join(lines)
+
+
+def escape_md(s):
+    """Escape markdown special characters in bot names."""
+    return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)
+
+
+def json_to_table(robots_json):
+    """Compose a markdown table with the information in robots.json."""
+    table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
+    table += "|------|----------|-----------------------|----------|------------------|-------------|\n"
+
+    for name, robot in robots_json.items():
+        table += (
+            f"| {escape_md(name)} | {robot['operator']} | {robot['respect']} | "
+            f"{robot['function']} | {robot['frequency']} | {robot['description']} |\n"
+        )
+
+    return table
+
+
+def list_to_pcre(lst):
+    """Convert a list of user agents into a regex pattern."""
+    return f"({'|'.join(map(re.escape, lst))})"
+
+
+def json_to_htaccess(robot_json):
+    """Generate .htaccess content to block bots via user-agent regex."""
     return (
         "RewriteEngine On\n"
-        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_uas)} [NC]\n"
+        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_user_agents(robot_json))} [NC]\n"
         "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
     )
 
+
 def json_to_nginx(robot_json):
-    all_uas = [
-        ua for name, data in robot_json.items()
-        for ua in [name] + data.get("ua-synonyms", [])
-    ]
-    return f'if ($http_user_agent ~* "{list_to_pcre(all_uas)}") {{\n    return 403;\n}}'
+    """Generate Nginx config snippet to block AI bots."""
+    return (
+        f'if ($http_user_agent ~* "{list_to_pcre(all_user_agents(robot_json))}") {{\n'
+        f'    return 403;\n'
+        f'}}'
+    )
 
+
 def json_to_caddy(robot_json):
-    all_uas = [
-        ua for name, data in robot_json.items()
-        for ua in [name] + data.get("ua-synonyms", [])
-    ]
+    """Generate a Caddyfile snippet to block AI bots."""
     return (
         "@aibots {\n"
-        f"    header_regexp User-Agent \"{list_to_pcre(all_uas)}\"\n"
+        f'    header_regexp User-Agent "{list_to_pcre(all_user_agents(robot_json))}"\n'
        "}"
     )
 
+
 def json_to_haproxy(robots_json):
-    return "\n".join(
-        ua for name, data in robots_json.items()
-        for ua in [name] + data.get("ua-synonyms", [])
-    )
+    """Generate HAProxy configuration source."""
+    return "\n".join(all_user_agents(robots_json))
 
+
 def update_file_if_changed(file_name, converter):
+    """Update output files only if the content has changed."""
     new_content = converter(load_robots_json())
     filepath = Path(file_name)
     filepath.touch()
     old_content = filepath.read_text(encoding="utf-8")
+
     if old_content == new_content:
         print(f"{file_name} is already up to date.")
     else:
-        Path(file_name).write_text(new_content, encoding="utf-8")
+        filepath.write_text(new_content, encoding="utf-8")
         print(f"{file_name} has been updated.")
 
+
 def conversions():
-    update_file_if_changed("robots.txt", json_to_txt)
-    update_file_if_changed("table-of-bot-metrics.md", json_to_table)
-    update_file_if_changed(".htaccess", json_to_htaccess)
-    update_file_if_changed("nginx-block-ai-bots.conf", json_to_nginx)
-    update_file_if_changed("Caddyfile", json_to_caddy)
-    update_file_if_changed("haproxy-block-ai-bots.txt", json_to_haproxy)
+    """Generate all output files from robots.json."""
+    update_file_if_changed("./robots.txt", json_to_txt)
+    update_file_if_changed("./table-of-bot-metrics.md", json_to_table)
+    update_file_if_changed("./.htaccess", json_to_htaccess)
+    update_file_if_changed("./nginx-block-ai-bots.conf", json_to_nginx)
+    update_file_if_changed("./Caddyfile", json_to_caddy)
+    update_file_if_changed("./haproxy-block-ai-bots.txt", json_to_haproxy)
 
+
 if __name__ == "__main__":
     import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--update", action="store_true", help="Update the robots.json file from darkvisitors.com")
-    parser.add_argument("--convert", action="store_true", help="Generate output files from robots.json")
+
+    parser = argparse.ArgumentParser(
+        prog="ai-robots",
+        description="Collects and updates information about web scrapers of AI companies.",
+        epilog="One of the flags must be set.\n",
+    )
+    parser.add_argument(
+        "--update",
+        action="store_true",
+        help="Update the robots.json file with data from darkvisitors.com/agents",
+    )
+    parser.add_argument(
+        "--convert",
+        action="store_true",
+        help="Create the robots.txt and markdown table from robots.json",
+    )
+
     args = parser.parse_args()
+
     if not (args.update or args.convert):
         print("ERROR: please provide one of the possible flags.")
         parser.print_help()
+
     if args.update:
         ingest_darkvisitors()
+
     if args.convert:
         conversions()
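Reviewer note on the refactor above: every generator now funnels through the new all_user_agents() helper, so the shared name-plus-synonym expansion can be smoke-checked in isolation. A minimal sketch, run from code/ so that robots.py is importable; the "ExampleBot" entry is invented for illustration and is not part of robots.json:

from robots import all_user_agents, json_to_txt, json_to_nginx

# Hypothetical single-entry dict mirroring the robots.json schema.
sample = {
    "ExampleBot": {
        "ua-synonyms": ["examplebot/1.0"],
        "operator": "Example Corp",
        "respect": "No",
        "function": "AI Data Scrapers",
        "frequency": "Unclear at this time.",
        "description": "Illustrative entry only.",
    }
}

print(all_user_agents(sample))  # ['ExampleBot', 'examplebot/1.0']
print(json_to_txt(sample))      # two User-agent lines, then "Disallow: /"
print(json_to_nginx(sample))    # one 403 rule matching both spellings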
diff --git a/code/tests.py b/code/tests.py
index de9432e..77aaac3 100755
--- a/code/tests.py
+++ b/code/tests.py
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
+"""To run these tests just execute this script."""
+
 
 import json
-import re
 import unittest
+import re
 
 from robots import (
     json_to_txt,
@@ -10,6 +12,7 @@ from robots import (
     json_to_nginx,
     json_to_haproxy,
     json_to_caddy,
+    clean_robot_name
 )
 
 
@@ -26,8 +29,10 @@ class RobotsUnittestExtensions:
 
 class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
+
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
+
     def test_robots_txt_generation(self):
         robots_txt = json_to_txt(self.robots_dict)
         self.assertEqualsFile("test_files/robots.txt", robots_txt)
@@ -35,8 +40,10 @@ class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):
 
 class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 32768
+
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
+
     def test_table_generation(self):
         robots_table = json_to_table(self.robots_dict)
         self.assertEqualsFile("test_files/table-of-bot-metrics.md", robots_table)
@@ -44,8 +51,10 @@ class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):
 
 class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
+
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
+
     def test_htaccess_generation(self):
         robots_htaccess = json_to_htaccess(self.robots_dict)
         self.assertEqualsFile("test_files/.htaccess", robots_htaccess)
@@ -53,8 +62,10 @@ class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
 
 class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
+
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
+
     def test_nginx_generation(self):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
@@ -62,8 +73,10 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
 
 class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
+
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
+
     def test_haproxy_generation(self):
         robots_haproxy = json_to_haproxy(self.robots_dict)
         self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)
@@ -71,8 +84,10 @@ class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
 
 class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
+
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
+
     def test_caddyfile_generation(self):
         robots_caddyfile = json_to_caddy(self.robots_dict)
         self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)
@@ -80,20 +95,19 @@ class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
 
 class TestRobotsNameCleaning(unittest.TestCase):
     def test_clean_name(self):
-        from robots import clean_robot_name
         self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
 
 
-class TestUASynonymsSupport(unittest.TestCase):
+class TestUASynonymSupport(unittest.TestCase):
     def setUp(self):
         self.test_data = {
             "MainBot": {
                 "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
                 "operator": "TestCorp",
                 "respect": "No",
-                "function": "AI Bot",
-                "frequency": "Daily",
-                "description": "Used for testing ua-synonyms."
+                "function": "Test",
+                "frequency": "Often",
+                "description": "A test bot"
             }
         }
 
@@ -104,11 +118,12 @@ class TestUASynonymsSupport(unittest.TestCase):
 
     def test_htaccess_includes_synonyms(self):
         output = json_to_htaccess(self.test_data)
-        for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]:
-            self.assertIn(re.escape(variant), output)
+        pattern = r"(MainBot|mainbot/1\.0|Main\-Bot)"
+        self.assertRegex(output, pattern)
 
 
 if __name__ == "__main__":
     import os
     os.chdir(os.path.dirname(__file__))
     unittest.main(verbosity=2)
+
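Likewise, the U+2011 normalization pinned down by TestRobotsNameCleaning can be checked by hand. A minimal sketch, again assuming robots.py is importable; the \u2011 escape is the same non-breaking hyphen the test embeds literally:

from robots import clean_robot_name

# clean_robot_name() swaps U+2011 (non-breaking hyphen) for an ASCII hyphen
# and prints a note whenever it actually changes a name.
assert clean_robot_name("Perplexity\u2011User") == "Perplexity-User"
assert clean_robot_name("GPTBot") == "GPTBot"  # unmangled names pass through untouched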