feat: add ua-synonyms support with tests, scoped to json_to_* functions

brndnprog 2025-06-12 08:31:54 -07:00
parent f0606f0eac
commit 3148669363
2 changed files with 138 additions and 61 deletions
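
The new "ua-synonyms" key lets a robots.json entry carry alternative spellings of a crawler's user agent, and every json_to_* converter now expands them alongside the primary name. A minimal sketch of the data shape and the robots.txt lines it should produce (the "ExampleBot" entry is hypothetical, modeled on the MainBot test fixture added below):

    # Hypothetical robots.json entry; only the "ua-synonyms" list is new.
    robots = {
        "ExampleBot": {
            "ua-synonyms": ["examplebot/1.0", "Example-Bot"],
            "operator": "Example Corp",
            "respect": "No",
            "function": "AI Data Scraper",
            "frequency": "Unclear at this time.",
            "description": "Illustration only.",
        }
    }
    # json_to_txt(robots) should then emit one User-agent line per spelling:
    #   User-agent: ExampleBot
    #   User-agent: examplebot/1.0
    #   User-agent: Example-Bot
    #   Disallow: /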

View file

@@ -1,14 +1,19 @@
 #!/usr/bin/env python3
 import json
 import re
 import requests
 from bs4 import BeautifulSoup
 from pathlib import Path
 
 def load_robots_json():
+    """Load the robots.json contents into a dictionary."""
     return json.loads(Path("./robots.json").read_text(encoding="utf-8"))
 
 def get_agent_soup():
+    """Retrieve current known agents from darkvisitors.com."""
     session = requests.Session()
     try:
         response = session.get("https://darkvisitors.com/agents")
@@ -17,13 +22,23 @@ def get_agent_soup():
             return
     return BeautifulSoup(response.text, "html.parser")
 
 def updated_robots_json(soup):
+    """Update AI scraper information with data from darkvisitors."""
     existing_content = load_robots_json()
-    to_include = ["AI Agents", "AI Assistants", "AI Data Scrapers", "AI Search Crawlers", "Undocumented AI Agents"]
+    to_include = [
+        "AI Agents",
+        "AI Assistants",
+        "AI Data Scrapers",
+        "AI Search Crawlers",
+        "Undocumented AI Agents",
+    ]
 
     for section in soup.find_all("div", {"class": "agent-links-section"}):
         category = section.find("h2").get_text()
         if category not in to_include:
             continue
         for agent in section.find_all("a", href=True):
             name = agent.find("div", {"class": "agent-name"}).get_text().strip()
             name = clean_robot_name(name)
@@ -37,6 +52,7 @@ def updated_robots_json(soup):
             }
 
             default_value = "Unclear at this time."
+            # Parse the operator information from the description if possible
             operator = default_value
             if "operated by " in desc:
                 try:
@@ -45,10 +61,13 @@ def updated_robots_json(soup):
                     print(f"Error: {e}")
 
             def consolidate(field: str, value: str) -> str:
+                # New entry
                 if name not in existing_content:
                     return value
+                # New field
                 if field not in existing_content[name]:
                     return value
+                # Replace unclear value
                 if (
                     existing_content[name][field] in default_values
                     and value not in default_values
@@ -59,7 +78,7 @@ def updated_robots_json(soup):
             existing_content[name] = {
                 "operator": consolidate("operator", operator),
                 "respect": consolidate("respect", default_value),
-                "function": consolidate("function", f"{category}"),
+                "function": consolidate("function", category),
                 "frequency": consolidate("frequency", default_value),
                 "description": consolidate(
                     "description",
@@ -71,103 +90,146 @@ def updated_robots_json(soup):
     sorted_keys = sorted(existing_content, key=lambda k: k.lower())
     return {k: existing_content[k] for k in sorted_keys}
 
 def clean_robot_name(name):
-    return re.sub(r"\u2011", "-", name)
+    """Clean the robot name by removing characters mangled by HTML rendering."""
+    result = re.sub(r"\u2011", "-", name)
+    if result != name:
+        print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.")
+    return result
 
 def ingest_darkvisitors():
-    old = load_robots_json()
+    old_robots_json = load_robots_json()
     soup = get_agent_soup()
     if soup:
         robots_json = updated_robots_json(soup)
-        print("robots.json is unchanged." if robots_json == old else "robots.json got updates.")
+        print("robots.json is unchanged." if robots_json == old_robots_json else "robots.json got updates.")
         Path("./robots.json").write_text(json.dumps(robots_json, indent=4), encoding="utf-8")
 
-def json_to_txt(robots_json):
-    agents = [
-        ua for name, data in robots_json.items()
-        for ua in [name] + data.get("ua-synonyms", [])
-    ]
-    txt = "\n".join(f"User-agent: {ua}" for ua in agents)
-    txt += "\nDisallow: /\n"
-    return txt
-
-def escape_md(s):
-    return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)
-
-def json_to_table(robots_json):
-    table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
-    table += "|------|----------|-----------------------|----------|------------------|-------------|\n"
-    for name, robot in robots_json.items():
-        table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'
-    return table
-
-def list_to_pcre(uas):
-    return f"({'|'.join(map(re.escape, uas))})"
+def all_user_agents(robot_json):
+    """Expand all main names and their ua-synonyms into a flat list."""
+    return [
+        ua for name, data in robot_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]
+
+def json_to_txt(robots_json):
+    """Compose the robots.txt from the robots.json file."""
+    lines = [f"User-agent: {ua}" for ua in all_user_agents(robots_json)]
+    lines.append("Disallow: /")
+    return "\n".join(lines)
+
+def escape_md(s):
+    """Escape markdown special characters in bot names."""
+    return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)
+
+def json_to_table(robots_json):
+    """Compose a markdown table with the information in robots.json."""
+    table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
+    table += "|------|----------|-----------------------|----------|------------------|-------------|\n"
+    for name, robot in robots_json.items():
+        table += (
+            f"| {escape_md(name)} | {robot['operator']} | {robot['respect']} | "
+            f"{robot['function']} | {robot['frequency']} | {robot['description']} |\n"
+        )
+    return table
+
+def list_to_pcre(lst):
+    """Convert a list of user agents into a regex pattern."""
+    return f"({'|'.join(map(re.escape, lst))})"
 
 def json_to_htaccess(robot_json):
-    all_uas = [
-        ua for name, data in robot_json.items()
-        for ua in [name] + data.get("ua-synonyms", [])
-    ]
+    """Generate .htaccess content to block bots via user-agent regex."""
     return (
         "RewriteEngine On\n"
-        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_uas)} [NC]\n"
+        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_user_agents(robot_json))} [NC]\n"
         "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
     )
 
 def json_to_nginx(robot_json):
-    all_uas = [
-        ua for name, data in robot_json.items()
-        for ua in [name] + data.get("ua-synonyms", [])
-    ]
-    return f'if ($http_user_agent ~* "{list_to_pcre(all_uas)}") {{\n    return 403;\n}}'
+    """Generate Nginx config snippet to block AI bots."""
+    return (
+        f'if ($http_user_agent ~* "{list_to_pcre(all_user_agents(robot_json))}") {{\n'
+        f'    return 403;\n'
+        f'}}'
+    )
 
 def json_to_caddy(robot_json):
-    all_uas = [
-        ua for name, data in robot_json.items()
-        for ua in [name] + data.get("ua-synonyms", [])
-    ]
+    """Generate a Caddyfile snippet to block AI bots."""
     return (
         "@aibots {\n"
-        f"    header_regexp User-Agent \"{list_to_pcre(all_uas)}\"\n"
+        f'    header_regexp User-Agent "{list_to_pcre(all_user_agents(robot_json))}"\n'
         "}"
     )
 
 def json_to_haproxy(robots_json):
-    return "\n".join(
-        ua for name, data in robots_json.items()
-        for ua in [name] + data.get("ua-synonyms", [])
-    )
+    """Generate HAProxy configuration source."""
+    return "\n".join(all_user_agents(robots_json))
 
 def update_file_if_changed(file_name, converter):
+    """Update output files only if the content has changed."""
     new_content = converter(load_robots_json())
     filepath = Path(file_name)
     filepath.touch()
     old_content = filepath.read_text(encoding="utf-8")
     if old_content == new_content:
         print(f"{file_name} is already up to date.")
     else:
-        Path(file_name).write_text(new_content, encoding="utf-8")
+        filepath.write_text(new_content, encoding="utf-8")
         print(f"{file_name} has been updated.")
 
 def conversions():
-    update_file_if_changed("robots.txt", json_to_txt)
-    update_file_if_changed("table-of-bot-metrics.md", json_to_table)
-    update_file_if_changed(".htaccess", json_to_htaccess)
-    update_file_if_changed("nginx-block-ai-bots.conf", json_to_nginx)
-    update_file_if_changed("Caddyfile", json_to_caddy)
-    update_file_if_changed("haproxy-block-ai-bots.txt", json_to_haproxy)
+    """Generate all output files from robots.json."""
+    update_file_if_changed("./robots.txt", json_to_txt)
+    update_file_if_changed("./table-of-bot-metrics.md", json_to_table)
+    update_file_if_changed("./.htaccess", json_to_htaccess)
+    update_file_if_changed("./nginx-block-ai-bots.conf", json_to_nginx)
+    update_file_if_changed("./Caddyfile", json_to_caddy)
+    update_file_if_changed("./haproxy-block-ai-bots.txt", json_to_haproxy)
 
 if __name__ == "__main__":
     import argparse
 
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--update", action="store_true", help="Update the robots.json file from darkvisitors.com")
-    parser.add_argument("--convert", action="store_true", help="Generate output files from robots.json")
+    parser = argparse.ArgumentParser(
+        prog="ai-robots",
+        description="Collects and updates information about web scrapers of AI companies.",
+        epilog="One of the flags must be set.\n",
+    )
+    parser.add_argument(
+        "--update",
+        action="store_true",
+        help="Update the robots.json file with data from darkvisitors.com/agents",
+    )
+    parser.add_argument(
+        "--convert",
+        action="store_true",
+        help="Create the robots.txt and markdown table from robots.json",
+    )
     args = parser.parse_args()
     if not (args.update or args.convert):
         print("ERROR: please provide one of the possible flags.")
         parser.print_help()
     if args.update:
         ingest_darkvisitors()
     if args.convert:
         conversions()
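
All five converters now share the all_user_agents() helper instead of repeating the same list comprehension. A small self-contained sketch of the helper chain (the sample dict is hypothetical, and list_to_pcre is mirrored inline so the snippet runs without the module):

    import re

    def all_user_agents(robot_json):
        # Same flattening as the committed helper: main name first, then synonyms.
        return [
            ua for name, data in robot_json.items()
            for ua in [name] + data.get("ua-synonyms", [])
        ]

    robots = {"ExampleBot": {"ua-synonyms": ["examplebot/1.0", "Example-Bot"]}}
    uas = all_user_agents(robots)
    # -> ['ExampleBot', 'examplebot/1.0', 'Example-Bot']
    pattern = f"({'|'.join(map(re.escape, uas))})"  # mirrors list_to_pcre
    # -> '(ExampleBot|examplebot/1\\.0|Example\\-Bot)'
    # Note: re.escape() escapes '.' and '-' but leaves '/' alone, which is why
    # the regression test below expects 'mainbot/1\.0' and 'Main\-Bot'.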

View file

@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
+"""To run these tests just execute this script."""
 import json
+import re
 import unittest
-import re
 
 from robots import (
     json_to_txt,
@@ -10,6 +12,7 @@ from robots import (
     json_to_nginx,
     json_to_haproxy,
     json_to_caddy,
+    clean_robot_name
 )
@@ -26,8 +29,10 @@ class RobotsUnittestExtensions:
 class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
+
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
+
     def test_robots_txt_generation(self):
         robots_txt = json_to_txt(self.robots_dict)
         self.assertEqualsFile("test_files/robots.txt", robots_txt)
@@ -35,8 +40,10 @@ class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):
 class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 32768
+
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
+
     def test_table_generation(self):
         robots_table = json_to_table(self.robots_dict)
         self.assertEqualsFile("test_files/table-of-bot-metrics.md", robots_table)
@@ -44,8 +51,10 @@ class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):
 class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
+
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
+
     def test_htaccess_generation(self):
         robots_htaccess = json_to_htaccess(self.robots_dict)
         self.assertEqualsFile("test_files/.htaccess", robots_htaccess)
@@ -53,8 +62,10 @@ class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
 class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
+
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
+
     def test_nginx_generation(self):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
@@ -62,8 +73,10 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
 class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
+
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
+
     def test_haproxy_generation(self):
         robots_haproxy = json_to_haproxy(self.robots_dict)
         self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)
@@ -71,8 +84,10 @@ class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
 class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
+
     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")
+
     def test_caddyfile_generation(self):
         robots_caddyfile = json_to_caddy(self.robots_dict)
         self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)
@@ -80,20 +95,19 @@ class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
 class TestRobotsNameCleaning(unittest.TestCase):
     def test_clean_name(self):
-        from robots import clean_robot_name
         self.assertEqual(clean_robot_name("PerplexityUser"), "Perplexity-User")
 
-class TestUASynonymsSupport(unittest.TestCase):
+class TestUASynonymSupport(unittest.TestCase):
     def setUp(self):
         self.test_data = {
             "MainBot": {
                 "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
                 "operator": "TestCorp",
                 "respect": "No",
-                "function": "AI Bot",
-                "frequency": "Daily",
-                "description": "Used for testing ua-synonyms."
+                "function": "Test",
+                "frequency": "Often",
+                "description": "A test bot"
             }
         }
@@ -104,11 +118,12 @@ class TestUASynonymsSupport(unittest.TestCase):
     def test_htaccess_includes_synonyms(self):
         output = json_to_htaccess(self.test_data)
-        for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]:
-            self.assertIn(re.escape(variant), output)
+        pattern = r"(MainBot|mainbot/1\.0|Main\-Bot)"
+        self.assertRegex(output, pattern)
 
 if __name__ == "__main__":
     import os
     os.chdir(os.path.dirname(__file__))
     unittest.main(verbosity=2)
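
For reference, with the MainBot fixture above, the full .htaccess output the test exercises would look like the following (a sketch; whitespace and ordering follow json_to_htaccess as committed):

    # Expected json_to_htaccess(self.test_data) content for the fixture:
    expected = (
        "RewriteEngine On\n"
        "RewriteCond %{HTTP_USER_AGENT} (MainBot|mainbot/1\\.0|Main\\-Bot) [NC]\n"
        "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
    )
    # assertRegex passes because the pattern matches any one of the three
    # spellings, all of which appear on the RewriteCond line.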