diff --git a/code/robots.py b/code/robots.py
index c795649..8fd0be3 100755
--- a/code/robots.py
+++ b/code/robots.py
@@ -1,50 +1,25 @@
 #!/usr/bin/env python3
-
 import json
 import re
 import requests
-
 from bs4 import BeautifulSoup
 from pathlib import Path
-
 
 def load_robots_json():
-    """Load the robots.json contents into a dictionary."""
     return json.loads(Path("./robots.json").read_text(encoding="utf-8"))
-
 
 def get_agent_soup():
-    """Retrieve current known agents from darkvisitors.com"""
     session = requests.Session()
     try:
         response = session.get("https://darkvisitors.com/agents")
     except requests.exceptions.ConnectionError:
-        print(
-            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
-        )
+        print("ERROR: Could not gather the current agents from https://darkvisitors.com/agents")
         return
     return BeautifulSoup(response.text, "html.parser")
-
 
 def updated_robots_json(soup):
-    """Update AI scraper information with data from darkvisitors."""
     existing_content = load_robots_json()
-    to_include = [
-        "AI Agents",
-        "AI Assistants",
-        "AI Data Scrapers",
-        "AI Search Crawlers",
-        # "Archivers",
-        # "Developer Helpers",
-        # "Fetchers",
-        # "Intelligence Gatherers",
-        # "Scrapers",
-        # "Search Engine Crawlers",
-        # "SEO Crawlers",
-        # "Uncategorized",
-        "Undocumented AI Agents",
-    ]
-
+    to_include = ["AI Agents", "AI Assistants", "AI Data Scrapers", "AI Search Crawlers", "Undocumented AI Agents"]
     for section in soup.find_all("div", {"class": "agent-links-section"}):
         category = section.find("h2").get_text()
         if category not in to_include:
@@ -62,7 +37,6 @@ def updated_robots_json(soup):
         }
         default_value = "Unclear at this time."
-        # Parse the operator information from the description if possible
         operator = default_value
         if "operated by " in desc:
             try:
@@ -71,19 +45,15 @@ def updated_robots_json(soup):
                 print(f"Error: {e}")
 
         def consolidate(field: str, value: str) -> str:
-            # New entry
            if name not in existing_content:
                return value
-            # New field
            if field not in existing_content[name]:
                return value
-            # Unclear value
            if (
                existing_content[name][field] in default_values
                and value not in default_values
            ):
                return value
-            # Existing value
            return existing_content[name][field]
 
         existing_content[name] = {
@@ -99,104 +69,79 @@ def updated_robots_json(soup):
     print(f"Total: {len(existing_content)}")
     sorted_keys = sorted(existing_content, key=lambda k: k.lower())
-    sorted_robots = {k: existing_content[k] for k in sorted_keys}
-    return sorted_robots
-
+    return {k: existing_content[k] for k in sorted_keys}
 
 def clean_robot_name(name):
-    """ Clean the robot name by removing some characters that were mangled by html software once. """
-    # This was specifically spotted in "Perplexity-User"
-    # Looks like a non-breaking hyphen introduced by the HTML rendering software
-    # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots
-    # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen,
-    # and it's only the Row-Heading that has the special hyphen
-    #
-    # Technically, there's no reason there wouldn't someday be a bot that
-    # actually uses a non-breaking hyphen, but that seems unlikely,
-    # so this solution should be fine for now.
-    result = re.sub(r"\u2011", "-", name)
-    if result != name:
-        print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.")
-    return result
-
+    return re.sub(r"\u2011", "-", name)
 
 def ingest_darkvisitors():
-    old_robots_json = load_robots_json()
+    old = load_robots_json()
     soup = get_agent_soup()
     if soup:
         robots_json = updated_robots_json(soup)
-        print(
-            "robots.json is unchanged."
-            if robots_json == old_robots_json
-            else "robots.json got updates."
-        )
-        Path("./robots.json").write_text(
-            json.dumps(robots_json, indent=4), encoding="utf-8"
-        )
-
+        print("robots.json is unchanged." if robots_json == old else "robots.json got updates.")
+        Path("./robots.json").write_text(json.dumps(robots_json, indent=4), encoding="utf-8")
 
 def json_to_txt(robots_json):
-    """Compose the robots.txt from the robots.json file."""
-    robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys())
-    robots_txt += "\nDisallow: /\n"
-    return robots_txt
-
+    agents = [
+        ua for name, data in robots_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]
+    txt = "\n".join(f"User-agent: {ua}" for ua in agents)
+    txt += "\nDisallow: /\n"
+    return txt
 
 def escape_md(s):
     return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)
-
 
 def json_to_table(robots_json):
-    """Compose a markdown table with the information in robots.json"""
     table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
     table += "|------|----------|-----------------------|----------|------------------|-------------|\n"
-
     for name, robot in robots_json.items():
         table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'
-
     return table
-
-def list_to_pcre(lst):
-    # Python re is not 100% identical to PCRE which is used by Apache, but it
-    # should probably be close enough in the real world for re.escape to work.
-    formatted = "|".join(map(re.escape, lst))
-    return f"({formatted})"
-
+def list_to_pcre(uas):
+    return f"({'|'.join(map(re.escape, uas))})"
 
 def json_to_htaccess(robot_json):
-    # Creates a .htaccess filter file. It uses a regular expression to filter out
-    # User agents that contain any of the blocked values.
-    htaccess = "RewriteEngine On\n"
-    htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n"
-    htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
-    return htaccess
+    all_uas = [
+        ua for name, data in robot_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]
+    return (
+        "RewriteEngine On\n"
+        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_uas)} [NC]\n"
+        "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
+    )
 
 def json_to_nginx(robot_json):
-    # Creates an Nginx config file. This config snippet can be included in
-    # nginx server{} blocks to block AI bots.
- config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}" - return config - + all_uas = [ + ua for name, data in robot_json.items() + for ua in [name] + data.get("ua-synonyms", []) + ] + return f'if ($http_user_agent ~* "{list_to_pcre(all_uas)}") {{\n return 403;\n}}' def json_to_caddy(robot_json): - caddyfile = "@aibots {\n " - caddyfile += f' header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"' - caddyfile += "\n}" - return caddyfile + all_uas = [ + ua for name, data in robot_json.items() + for ua in [name] + data.get("ua-synonyms", []) + ] + return ( + "@aibots {\n" + f" header_regexp User-Agent \"{list_to_pcre(all_uas)}\"\n" + "}" + ) def json_to_haproxy(robots_json): - # Creates a source file for HAProxy. Follow instructions in the README to implement it. - txt = "\n".join(f"{k}" for k in robots_json.keys()) - return txt - - + return "\n".join( + ua for name, data in robots_json.items() + for ua in [name] + data.get("ua-synonyms", []) + ) def update_file_if_changed(file_name, converter): - """Update files if newer content is available and log the (in)actions.""" new_content = converter(load_robots_json()) filepath = Path(file_name) - # "touch" will create the file if it doesn't exist yet filepath.touch() old_content = filepath.read_text(encoding="utf-8") if old_content == new_content: @@ -205,58 +150,23 @@ def update_file_if_changed(file_name, converter): Path(file_name).write_text(new_content, encoding="utf-8") print(f"{file_name} has been updated.") - def conversions(): - """Triggers the conversions from the json file.""" - update_file_if_changed(file_name="./robots.txt", converter=json_to_txt) - update_file_if_changed( - file_name="./table-of-bot-metrics.md", - converter=json_to_table, - ) - update_file_if_changed( - file_name="./.htaccess", - converter=json_to_htaccess, - ) - update_file_if_changed( - file_name="./nginx-block-ai-bots.conf", - converter=json_to_nginx, - ) - update_file_if_changed( - file_name="./Caddyfile", - converter=json_to_caddy, - ) - - update_file_if_changed( - file_name="./haproxy-block-ai-bots.txt", - converter=json_to_haproxy, - ) - + update_file_if_changed("robots.txt", json_to_txt) + update_file_if_changed("table-of-bot-metrics.md", json_to_table) + update_file_if_changed(".htaccess", json_to_htaccess) + update_file_if_changed("nginx-block-ai-bots.conf", json_to_nginx) + update_file_if_changed("Caddyfile", json_to_caddy) + update_file_if_changed("haproxy-block-ai-bots.txt", json_to_haproxy) if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser() - parser = argparse.ArgumentParser( - prog="ai-robots", - description="Collects and updates information about web scrapers of AI companies.", - epilog="One of the flags must be set.\n", - ) - parser.add_argument( - "--update", - action="store_true", - help="Update the robots.json file with data from darkvisitors.com/agents", - ) - parser.add_argument( - "--convert", - action="store_true", - help="Create the robots.txt and markdown table from robots.json", - ) + parser.add_argument("--update", action="store_true", help="Update the robots.json file from darkvisitors.com") + parser.add_argument("--convert", action="store_true", help="Generate output files from robots.json") args = parser.parse_args() - if not (args.update or args.convert): print("ERROR: please provide one of the possible flags.") parser.print_help() - if args.update: ingest_darkvisitors() if args.convert: diff --git a/code/tests.py b/code/tests.py index 434406f..de9432e 100755 
--- a/code/tests.py
+++ b/code/tests.py
@@ -1,10 +1,17 @@
 #!/usr/bin/env python3
-"""To run these tests just execute this script."""
-
 import json
+import re
 import unittest
-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy
+from robots import (
+    json_to_txt,
+    json_to_table,
+    json_to_htaccess,
+    json_to_nginx,
+    json_to_haproxy,
+    json_to_caddy,
+)
+
 
 
 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -14,16 +21,13 @@ class RobotsUnittestExtensions:
     def assertEqualsFile(self, f, s):
         with open(f, "rt") as f:
             f_contents = f.read()
-
         return self.assertMultiLineEqual(f_contents, s)
 
 
 class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
-
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
-
     def test_robots_txt_generation(self):
         robots_txt = json_to_txt(self.robots_dict)
         self.assertEqualsFile("test_files/robots.txt", robots_txt)
@@ -31,10 +35,8 @@ class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):
 
 class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 32768
-
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
-
     def test_table_generation(self):
         robots_table = json_to_table(self.robots_dict)
         self.assertEqualsFile("test_files/table-of-bot-metrics.md", robots_table)
@@ -42,53 +44,71 @@ class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):
 
 class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
-
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
-
     def test_htaccess_generation(self):
         robots_htaccess = json_to_htaccess(self.robots_dict)
         self.assertEqualsFile("test_files/.htaccess", robots_htaccess)
+
 
 class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
-
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
-
     def test_nginx_generation(self):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
+
 
 class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
-
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
-
     def test_haproxy_generation(self):
         robots_haproxy = json_to_haproxy(self.robots_dict)
         self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)
 
-class TestRobotsNameCleaning(unittest.TestCase):
-    def test_clean_name(self):
-        from robots import clean_robot_name
-
-        self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
 
 class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
-
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
-
     def test_caddyfile_generation(self):
         robots_caddyfile = json_to_caddy(self.robots_dict)
         self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)
 
 
+class TestRobotsNameCleaning(unittest.TestCase):
+    def test_clean_name(self):
+        from robots import clean_robot_name
+        self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
+
+
+class TestUASynonymsSupport(unittest.TestCase):
+    def setUp(self):
+        self.test_data = {
+            "MainBot": {
+                "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
+                "operator": "TestCorp",
+                "respect": "No",
+                "function": "AI Bot",
+                "frequency": "Daily",
+                "description": "Used for testing ua-synonyms."
+            }
+        }
+
+    def test_robots_txt_includes_synonyms(self):
+        output = json_to_txt(self.test_data)
+        for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]:
+            self.assertIn(f"User-agent: {variant}", output)
+
+    def test_htaccess_includes_synonyms(self):
+        output = json_to_htaccess(self.test_data)
+        for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]:
+            self.assertIn(re.escape(variant), output)
+
+
 if __name__ == "__main__":
     import os
     os.chdir(os.path.dirname(__file__))
-
     unittest.main(verbosity=2)