feat: support ua-synonyms in robots.json to reduce duplication (#144)

brndnprog 2025-06-11 22:04:39 -07:00
parent cf598b6b71
commit f0606f0eac
2 changed files with 93 additions and 163 deletions
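The change lets a robots.json entry carry a "ua-synonyms" list of alternative user-agent spellings next to its canonical name; the robots.txt, .htaccess, nginx, Caddy and HAProxy generators then expand the name plus all synonyms, so one entry covers every spelling. A minimal sketch of the new key as consumed by the updated json_to_txt (the "ExampleBot" record and this snippet are illustrative only, not part of the real robots.json):

from robots import json_to_txt

sample = {
    "ExampleBot": {
        "ua-synonyms": ["examplebot/1.0", "Example-Bot"],
        "operator": "Example Corp",
        "respect": "Unclear at this time.",
        "function": "AI Data Scrapers",
        "frequency": "Unclear at this time.",
        "description": "Illustrative entry for the ua-synonyms feature.",
    }
}

# The canonical name and each synonym become their own User-agent line:
#   User-agent: ExampleBot
#   User-agent: examplebot/1.0
#   User-agent: Example-Bot
#   Disallow: /
print(json_to_txt(sample))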

robots.py

@@ -1,50 +1,25 @@
 #!/usr/bin/env python3
 import json
 import re
 import requests
 from bs4 import BeautifulSoup
 from pathlib import Path


 def load_robots_json():
-    """Load the robots.json contents into a dictionary."""
     return json.loads(Path("./robots.json").read_text(encoding="utf-8"))


 def get_agent_soup():
-    """Retrieve current known agents from darkvisitors.com"""
     session = requests.Session()
     try:
         response = session.get("https://darkvisitors.com/agents")
     except requests.exceptions.ConnectionError:
-        print(
-            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
-        )
+        print("ERROR: Could not gather the current agents from https://darkvisitors.com/agents")
         return
     return BeautifulSoup(response.text, "html.parser")


 def updated_robots_json(soup):
-    """Update AI scraper information with data from darkvisitors."""
     existing_content = load_robots_json()
-    to_include = [
-        "AI Agents",
-        "AI Assistants",
-        "AI Data Scrapers",
-        "AI Search Crawlers",
-        # "Archivers",
-        # "Developer Helpers",
-        # "Fetchers",
-        # "Intelligence Gatherers",
-        # "Scrapers",
-        # "Search Engine Crawlers",
-        # "SEO Crawlers",
-        # "Uncategorized",
-        "Undocumented AI Agents",
-    ]
+    to_include = ["AI Agents", "AI Assistants", "AI Data Scrapers", "AI Search Crawlers", "Undocumented AI Agents"]

     for section in soup.find_all("div", {"class": "agent-links-section"}):
         category = section.find("h2").get_text()
         if category not in to_include:
@@ -62,7 +37,6 @@ def updated_robots_json(soup):
             }
             default_value = "Unclear at this time."

-            # Parse the operator information from the description if possible
             operator = default_value
             if "operated by " in desc:
                 try:
@@ -71,19 +45,15 @@ def updated_robots_json(soup):
                     print(f"Error: {e}")

             def consolidate(field: str, value: str) -> str:
-                # New entry
                 if name not in existing_content:
                     return value
-                # New field
                 if field not in existing_content[name]:
                     return value
-                # Unclear value
                 if (
                     existing_content[name][field] in default_values
                     and value not in default_values
                 ):
                     return value
-                # Existing value
                 return existing_content[name][field]

             existing_content[name] = {
@@ -99,104 +69,79 @@ def updated_robots_json(soup):
     print(f"Total: {len(existing_content)}")
     sorted_keys = sorted(existing_content, key=lambda k: k.lower())
-    sorted_robots = {k: existing_content[k] for k in sorted_keys}
-    return sorted_robots
+    return {k: existing_content[k] for k in sorted_keys}


 def clean_robot_name(name):
-    """ Clean the robot name by removing some characters that were mangled by html software once. """
-    # This was specifically spotted in "Perplexity-User"
-    # Looks like a non-breaking hyphen introduced by the HTML rendering software
-    # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots
-    # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen,
-    # and it's only the Row-Heading that has the special hyphen
-    #
-    # Technically, there's no reason there wouldn't someday be a bot that
-    # actually uses a non-breaking hyphen, but that seems unlikely,
-    # so this solution should be fine for now.
-    result = re.sub(r"\u2011", "-", name)
-    if result != name:
-        print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.")
-    return result
+    return re.sub(r"\u2011", "-", name)


 def ingest_darkvisitors():
-    old_robots_json = load_robots_json()
+    old = load_robots_json()
     soup = get_agent_soup()
     if soup:
         robots_json = updated_robots_json(soup)
-        print(
-            "robots.json is unchanged."
-            if robots_json == old_robots_json
-            else "robots.json got updates."
-        )
-        Path("./robots.json").write_text(
-            json.dumps(robots_json, indent=4), encoding="utf-8"
-        )
+        print("robots.json is unchanged." if robots_json == old else "robots.json got updates.")
+        Path("./robots.json").write_text(json.dumps(robots_json, indent=4), encoding="utf-8")


 def json_to_txt(robots_json):
-    """Compose the robots.txt from the robots.json file."""
-    robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys())
-    robots_txt += "\nDisallow: /\n"
-    return robots_txt
+    agents = [
+        ua for name, data in robots_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]
+    txt = "\n".join(f"User-agent: {ua}" for ua in agents)
+    txt += "\nDisallow: /\n"
+    return txt


 def escape_md(s):
     return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)


 def json_to_table(robots_json):
-    """Compose a markdown table with the information in robots.json"""
     table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
     table += "|------|----------|-----------------------|----------|------------------|-------------|\n"

     for name, robot in robots_json.items():
         table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'

     return table


-def list_to_pcre(lst):
-    # Python re is not 100% identical to PCRE which is used by Apache, but it
-    # should probably be close enough in the real world for re.escape to work.
-    formatted = "|".join(map(re.escape, lst))
-    return f"({formatted})"
+def list_to_pcre(uas):
+    return f"({'|'.join(map(re.escape, uas))})"


 def json_to_htaccess(robot_json):
-    # Creates a .htaccess filter file. It uses a regular expression to filter out
-    # User agents that contain any of the blocked values.
-    htaccess = "RewriteEngine On\n"
-    htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n"
-    htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
-    return htaccess
+    all_uas = [
+        ua for name, data in robot_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]
+    return (
+        "RewriteEngine On\n"
+        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_uas)} [NC]\n"
+        "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
+    )


 def json_to_nginx(robot_json):
-    # Creates an Nginx config file. This config snippet can be included in
-    # nginx server{} blocks to block AI bots.
-    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}"
-    return config
+    all_uas = [
+        ua for name, data in robot_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]
+    return f'if ($http_user_agent ~* "{list_to_pcre(all_uas)}") {{\n return 403;\n}}'


 def json_to_caddy(robot_json):
-    caddyfile = "@aibots {\n "
-    caddyfile += f' header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"'
-    caddyfile += "\n}"
-    return caddyfile
+    all_uas = [
+        ua for name, data in robot_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]
+    return (
+        "@aibots {\n"
+        f" header_regexp User-Agent \"{list_to_pcre(all_uas)}\"\n"
+        "}"
+    )


 def json_to_haproxy(robots_json):
-    # Creates a source file for HAProxy. Follow instructions in the README to implement it.
-    txt = "\n".join(f"{k}" for k in robots_json.keys())
-    return txt
+    return "\n".join(
+        ua for name, data in robots_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    )


 def update_file_if_changed(file_name, converter):
-    """Update files if newer content is available and log the (in)actions."""
     new_content = converter(load_robots_json())
     filepath = Path(file_name)
-    # "touch" will create the file if it doesn't exist yet
     filepath.touch()
     old_content = filepath.read_text(encoding="utf-8")
     if old_content == new_content:
@@ -205,58 +150,23 @@ def update_file_if_changed(file_name, converter):
     Path(file_name).write_text(new_content, encoding="utf-8")
     print(f"{file_name} has been updated.")


 def conversions():
-    """Triggers the conversions from the json file."""
-    update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
-    update_file_if_changed(
-        file_name="./table-of-bot-metrics.md",
-        converter=json_to_table,
-    )
-    update_file_if_changed(
-        file_name="./.htaccess",
-        converter=json_to_htaccess,
-    )
-    update_file_if_changed(
-        file_name="./nginx-block-ai-bots.conf",
-        converter=json_to_nginx,
-    )
-    update_file_if_changed(
-        file_name="./Caddyfile",
-        converter=json_to_caddy,
-    )
-    update_file_if_changed(
-        file_name="./haproxy-block-ai-bots.txt",
-        converter=json_to_haproxy,
-    )
+    update_file_if_changed("robots.txt", json_to_txt)
+    update_file_if_changed("table-of-bot-metrics.md", json_to_table)
+    update_file_if_changed(".htaccess", json_to_htaccess)
+    update_file_if_changed("nginx-block-ai-bots.conf", json_to_nginx)
+    update_file_if_changed("Caddyfile", json_to_caddy)
+    update_file_if_changed("haproxy-block-ai-bots.txt", json_to_haproxy)


 if __name__ == "__main__":
     import argparse

     parser = argparse.ArgumentParser()
-    parser = argparse.ArgumentParser(
-        prog="ai-robots",
-        description="Collects and updates information about web scrapers of AI companies.",
-        epilog="One of the flags must be set.\n",
-    )
-    parser.add_argument(
-        "--update",
-        action="store_true",
-        help="Update the robots.json file with data from darkvisitors.com/agents",
-    )
-    parser.add_argument(
-        "--convert",
-        action="store_true",
-        help="Create the robots.txt and markdown table from robots.json",
-    )
+    parser.add_argument("--update", action="store_true", help="Update the robots.json file from darkvisitors.com")
+    parser.add_argument("--convert", action="store_true", help="Generate output files from robots.json")
     args = parser.parse_args()

     if not (args.update or args.convert):
         print("ERROR: please provide one of the possible flags.")
         parser.print_help()
     if args.update:
         ingest_darkvisitors()
     if args.convert:
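The synonym expansion also reaches the server configurations: the combined list of names and synonyms is passed through list_to_pcre, so every spelling becomes one alternation in the blocking regular expression used by the .htaccess, nginx and Caddy outputs. A rough sketch with hypothetical values (exact escaping depends on Python's re.escape):

from robots import list_to_pcre

pattern = list_to_pcre(["ExampleBot", "examplebot/1.0", "Example-Bot"])
print(pattern)
# Roughly: (ExampleBot|examplebot/1\.0|Example\-Bot)
# json_to_htaccess places this same pattern into its RewriteCond, e.g.
#   RewriteCond %{HTTP_USER_AGENT} (ExampleBot|examplebot/1\.0|Example\-Bot) [NC]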

tests.py

@@ -1,10 +1,17 @@
 #!/usr/bin/env python3
-"""To run these tests just execute this script."""
 import json
+import re
 import unittest

-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy
+from robots import (
+    json_to_txt,
+    json_to_table,
+    json_to_htaccess,
+    json_to_nginx,
+    json_to_haproxy,
+    json_to_caddy,
+)


 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -14,16 +21,13 @@ class RobotsUnittestExtensions:
     def assertEqualsFile(self, f, s):
         with open(f, "rt") as f:
             f_contents = f.read()
         return self.assertMultiLineEqual(f_contents, s)


 class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192

     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")

     def test_robots_txt_generation(self):
         robots_txt = json_to_txt(self.robots_dict)
         self.assertEqualsFile("test_files/robots.txt", robots_txt)
@@ -31,10 +35,8 @@ class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):

 class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 32768

     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")

     def test_table_generation(self):
         robots_table = json_to_table(self.robots_dict)
         self.assertEqualsFile("test_files/table-of-bot-metrics.md", robots_table)
@@ -42,53 +44,71 @@ class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):

 class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192

     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")

     def test_htaccess_generation(self):
         robots_htaccess = json_to_htaccess(self.robots_dict)
         self.assertEqualsFile("test_files/.htaccess", robots_htaccess)


 class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192

     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")

     def test_nginx_generation(self):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)


 class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192

     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")

     def test_haproxy_generation(self):
         robots_haproxy = json_to_haproxy(self.robots_dict)
         self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)


-class TestRobotsNameCleaning(unittest.TestCase):
-    def test_clean_name(self):
-        from robots import clean_robot_name
-
-        self.assertEqual(clean_robot_name("Perplexity\u2011User"), "Perplexity-User")
-
-
 class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192

     def setUp(self):
         self.robots_dict = self.loadJson("test_files/robots.json")

     def test_caddyfile_generation(self):
         robots_caddyfile = json_to_caddy(self.robots_dict)
         self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)


+class TestRobotsNameCleaning(unittest.TestCase):
+    def test_clean_name(self):
+        from robots import clean_robot_name
+
+        self.assertEqual(clean_robot_name("Perplexity\u2011User"), "Perplexity-User")
+
+
+class TestUASynonymsSupport(unittest.TestCase):
+    def setUp(self):
+        self.test_data = {
+            "MainBot": {
+                "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
+                "operator": "TestCorp",
+                "respect": "No",
+                "function": "AI Bot",
+                "frequency": "Daily",
+                "description": "Used for testing ua-synonyms."
+            }
+        }
+
+    def test_robots_txt_includes_synonyms(self):
+        output = json_to_txt(self.test_data)
+        for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]:
+            self.assertIn(f"User-agent: {variant}", output)
+
+    def test_htaccess_includes_synonyms(self):
+        output = json_to_htaccess(self.test_data)
+        for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]:
+            self.assertIn(re.escape(variant), output)


 if __name__ == "__main__":
     import os

     os.chdir(os.path.dirname(__file__))
     unittest.main(verbosity=2)