Brendan Ngwa Nforbi 2025-06-14 12:25:21 -04:00 committed by GitHub
commit 22a7c0c2db
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 105 additions and 98 deletions


@@ -3,7 +3,6 @@
 import json
 import re
 import requests
 from bs4 import BeautifulSoup
 from pathlib import Path
@@ -14,14 +13,12 @@ def load_robots_json():
 def get_agent_soup():
-    """Retrieve current known agents from darkvisitors.com"""
+    """Retrieve current known agents from darkvisitors.com."""
     session = requests.Session()
     try:
         response = session.get("https://darkvisitors.com/agents")
     except requests.exceptions.ConnectionError:
-        print(
-            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
-        )
+        print("ERROR: Could not gather the current agents from https://darkvisitors.com/agents")
         return
     return BeautifulSoup(response.text, "html.parser")
@@ -34,14 +31,6 @@ def updated_robots_json(soup):
         "AI Assistants",
         "AI Data Scrapers",
         "AI Search Crawlers",
-        # "Archivers",
-        # "Developer Helpers",
-        # "Fetchers",
-        # "Intelligence Gatherers",
-        # "Scrapers",
-        # "Search Engine Crawlers",
-        # "SEO Crawlers",
-        # "Uncategorized",
         "Undocumented AI Agents",
     ]
@@ -49,6 +38,7 @@ def updated_robots_json(soup):
         category = section.find("h2").get_text()
         if category not in to_include:
             continue
         for agent in section.find_all("a", href=True):
             name = agent.find("div", {"class": "agent-name"}).get_text().strip()
             name = clean_robot_name(name)
@@ -77,19 +67,18 @@ def updated_robots_json(soup):
                 # New field
                 if field not in existing_content[name]:
                     return value
-                # Unclear value
+                # Replace unclear value
                 if (
                     existing_content[name][field] in default_values
                     and value not in default_values
                 ):
                     return value
-                # Existing value
                 return existing_content[name][field]

             existing_content[name] = {
                 "operator": consolidate("operator", operator),
                 "respect": consolidate("respect", default_value),
-                "function": consolidate("function", f"{category}"),
+                "function": consolidate("function", category),
                 "frequency": consolidate("frequency", default_value),
                 "description": consolidate(
                     "description",
@@ -99,21 +88,11 @@ def updated_robots_json(soup):
     print(f"Total: {len(existing_content)}")
     sorted_keys = sorted(existing_content, key=lambda k: k.lower())
-    sorted_robots = {k: existing_content[k] for k in sorted_keys}
-    return sorted_robots
+    return {k: existing_content[k] for k in sorted_keys}


 def clean_robot_name(name):
-    """ Clean the robot name by removing some characters that were mangled by html software once. """
-    # This was specifically spotted in "Perplexity-User"
-    # Looks like a non-breaking hyphen introduced by the HTML rendering software
-    # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots
-    # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen,
-    # and it's only the Row-Heading that has the special hyphen
-    #
-    # Technically, there's no reason there wouldn't someday be a bot that
-    # actually uses a non-breaking hyphen, but that seems unlikely,
-    # so this solution should be fine for now.
+    """Clean the robot name by removing characters mangled by HTML rendering."""
     result = re.sub(r"\u2011", "-", name)
     if result != name:
         print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.")
@@ -125,117 +104,108 @@ def ingest_darkvisitors():
     soup = get_agent_soup()
     if soup:
         robots_json = updated_robots_json(soup)
-        print(
-            "robots.json is unchanged."
-            if robots_json == old_robots_json
-            else "robots.json got updates."
-        )
-        Path("./robots.json").write_text(
-            json.dumps(robots_json, indent=4), encoding="utf-8"
-        )
+        print("robots.json is unchanged." if robots_json == old_robots_json else "robots.json got updates.")
+        Path("./robots.json").write_text(json.dumps(robots_json, indent=4), encoding="utf-8")


+def all_user_agents(robot_json):
+    """Expand all main names and their ua-synonyms into a flat list."""
+    return [
+        ua for name, data in robot_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]

 def json_to_txt(robots_json):
     """Compose the robots.txt from the robots.json file."""
-    robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys())
-    robots_txt += "\nDisallow: /\n"
-    return robots_txt
+    lines = [f"User-agent: {ua}" for ua in all_user_agents(robots_json)]
+    lines.append("Disallow: /")
+    return "\n".join(lines)


 def escape_md(s):
+    """Escape markdown special characters in bot names."""
     return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)


 def json_to_table(robots_json):
-    """Compose a markdown table with the information in robots.json"""
+    """Compose a markdown table with the information in robots.json."""
     table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
     table += "|------|----------|-----------------------|----------|------------------|-------------|\n"
     for name, robot in robots_json.items():
-        table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'
+        table += (
+            f"| {escape_md(name)} | {robot['operator']} | {robot['respect']} | "
+            f"{robot['function']} | {robot['frequency']} | {robot['description']} |\n"
+        )
     return table

 def list_to_pcre(lst):
-    # Python re is not 100% identical to PCRE which is used by Apache, but it
-    # should probably be close enough in the real world for re.escape to work.
-    formatted = "|".join(map(re.escape, lst))
-    return f"({formatted})"
+    """Convert a list of user agents into a regex pattern."""
+    return f"({'|'.join(map(re.escape, lst))})"


 def json_to_htaccess(robot_json):
-    # Creates a .htaccess filter file. It uses a regular expression to filter out
-    # User agents that contain any of the blocked values.
-    htaccess = "RewriteEngine On\n"
-    htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n"
-    htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
-    return htaccess
+    """Generate .htaccess content to block bots via user-agent regex."""
+    return (
+        "RewriteEngine On\n"
+        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_user_agents(robot_json))} [NC]\n"
+        "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
+    )


 def json_to_nginx(robot_json):
-    # Creates an Nginx config file. This config snippet can be included in
-    # nginx server{} blocks to block AI bots.
-    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
-    return config
+    """Generate Nginx config snippet to block AI bots."""
+    return (
+        f'if ($http_user_agent ~* "{list_to_pcre(all_user_agents(robot_json))}") {{\n'
+        f'    return 403;\n'
+        f'}}'
+    )


 def json_to_caddy(robot_json):
-    caddyfile = "@aibots {\n"
-    caddyfile += f'    header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"'
-    caddyfile += "\n}"
-    return caddyfile
+    """Generate a Caddyfile snippet to block AI bots."""
+    return (
+        "@aibots {\n"
+        f'    header_regexp User-Agent "{list_to_pcre(all_user_agents(robot_json))}"\n'
+        "}"
+    )


 def json_to_haproxy(robots_json):
-    # Creates a source file for HAProxy. Follow instructions in the README to implement it.
-    txt = "\n".join(f"{k}" for k in robots_json.keys())
-    return txt
+    """Generate HAProxy configuration source."""
+    return "\n".join(all_user_agents(robots_json))

 def update_file_if_changed(file_name, converter):
-    """Update files if newer content is available and log the (in)actions."""
+    """Update output files only if the content has changed."""
     new_content = converter(load_robots_json())
     filepath = Path(file_name)
-    # "touch" will create the file if it doesn't exist yet
     filepath.touch()
     old_content = filepath.read_text(encoding="utf-8")
     if old_content == new_content:
         print(f"{file_name} is already up to date.")
     else:
-        Path(file_name).write_text(new_content, encoding="utf-8")
+        filepath.write_text(new_content, encoding="utf-8")
         print(f"{file_name} has been updated.")


 def conversions():
-    """Triggers the conversions from the json file."""
-    update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
-    update_file_if_changed(
-        file_name="./table-of-bot-metrics.md",
-        converter=json_to_table,
-    )
-    update_file_if_changed(
-        file_name="./.htaccess",
-        converter=json_to_htaccess,
-    )
-    update_file_if_changed(
-        file_name="./nginx-block-ai-bots.conf",
-        converter=json_to_nginx,
-    )
-    update_file_if_changed(
-        file_name="./Caddyfile",
-        converter=json_to_caddy,
-    )
-    update_file_if_changed(
-        file_name="./haproxy-block-ai-bots.txt",
-        converter=json_to_haproxy,
-    )
+    """Generate all output files from robots.json."""
+    update_file_if_changed("./robots.txt", json_to_txt)
+    update_file_if_changed("./table-of-bot-metrics.md", json_to_table)
+    update_file_if_changed("./.htaccess", json_to_htaccess)
+    update_file_if_changed("./nginx-block-ai-bots.conf", json_to_nginx)
+    update_file_if_changed("./Caddyfile", json_to_caddy)
+    update_file_if_changed("./haproxy-block-ai-bots.txt", json_to_haproxy)

 if __name__ == "__main__":
     import argparse
+    parser = argparse.ArgumentParser()
     parser = argparse.ArgumentParser(
         prog="ai-robots",
         description="Collects and updates information about web scrapers of AI companies.",
@@ -251,6 +221,7 @@ if __name__ == "__main__":
         action="store_true",
         help="Create the robots.txt and markdown table from robots.json",
     )

     args = parser.parse_args()

     if not (args.update or args.convert):
@@ -259,5 +230,6 @@ if __name__ == "__main__":
     if args.update:
         ingest_darkvisitors()
     if args.convert:
         conversions()
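
For context, a minimal sketch of what the ua-synonyms expansion introduced above does. The "ExampleBot" entry and its synonym are hypothetical names used only for illustration, and the snippet assumes it is run next to robots.py so the imports resolve:

# Sketch (assumption, not part of the commit): how all_user_agents() and
# json_to_txt() behave for an entry that carries "ua-synonyms".
from robots import all_user_agents, json_to_txt

sample = {
    "ExampleBot": {  # hypothetical bot name, not in robots.json
        "ua-synonyms": ["examplebot/1.0"],
        "operator": "Example Corp",
        "respect": "Unclear at this time.",
        "function": "AI Data Scrapers",
        "frequency": "Unclear at this time.",
        "description": "Hypothetical entry used to illustrate ua-synonyms.",
    }
}

print(all_user_agents(sample))  # ['ExampleBot', 'examplebot/1.0']
print(json_to_txt(sample))
# User-agent: ExampleBot
# User-agent: examplebot/1.0
# Disallow: /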


@@ -3,8 +3,18 @@
 import json
 import unittest
+import re

-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy
+from robots import (
+    json_to_txt,
+    json_to_table,
+    json_to_htaccess,
+    json_to_nginx,
+    json_to_haproxy,
+    json_to_caddy,
+    clean_robot_name
+)


 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -14,7 +24,6 @@ class RobotsUnittestExtensions:
     def assertEqualsFile(self, f, s):
         with open(f, "rt") as f:
             f_contents = f.read()
         return self.assertMultiLineEqual(f_contents, s)
@@ -50,6 +59,7 @@ class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_htaccess = json_to_htaccess(self.robots_dict)
         self.assertEqualsFile("test_files/.htaccess", robots_htaccess)


 class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
@@ -60,6 +70,7 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)


 class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
@@ -70,11 +81,6 @@ class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_haproxy = json_to_haproxy(self.robots_dict)
         self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)

-class TestRobotsNameCleaning(unittest.TestCase):
-    def test_clean_name(self):
-        from robots import clean_robot_name
-
-        self.assertEqual(clean_robot_name("Perplexity\u2011User"), "Perplexity-User")


 class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
@@ -87,8 +93,37 @@ class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
         self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)

+class TestRobotsNameCleaning(unittest.TestCase):
+    def test_clean_name(self):
+        self.assertEqual(clean_robot_name("Perplexity\u2011User"), "Perplexity-User")
+
+
+class TestUASynonymSupport(unittest.TestCase):
+    def setUp(self):
+        self.test_data = {
+            "MainBot": {
+                "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
+                "operator": "TestCorp",
+                "respect": "No",
+                "function": "Test",
+                "frequency": "Often",
+                "description": "A test bot"
+            }
+        }
+
+    def test_robots_txt_includes_synonyms(self):
+        output = json_to_txt(self.test_data)
+        for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]:
+            self.assertIn(f"User-agent: {variant}", output)
+
+    def test_htaccess_includes_synonyms(self):
+        output = json_to_htaccess(self.test_data)
+        pattern = r"(MainBot|mainbot/1\.0|Main\-Bot)"
+        self.assertRegex(output, pattern)

 if __name__ == "__main__":
     import os
     os.chdir(os.path.dirname(__file__))
     unittest.main(verbosity=2)
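
For a rough sense of the escaping the new assertRegex test expects, a small sketch that feeds the same test data through json_to_htaccess; the output shown in the comments is an assumption derived from list_to_pcre and re.escape as used in the first file, not a committed fixture:

# Sketch (assumption, not part of the commit): expected shape of the .htaccess
# snippet for the TestUASynonymSupport data above.
from robots import json_to_htaccess

test_data = {
    "MainBot": {
        "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
        "operator": "TestCorp",
        "respect": "No",
        "function": "Test",
        "frequency": "Often",
        "description": "A test bot",
    }
}

print(json_to_htaccess(test_data))
# RewriteEngine On
# RewriteCond %{HTTP_USER_AGENT} (MainBot|mainbot/1\.0|Main\-Bot) [NC]
# RewriteRule !^/?robots\.txt$ - [F,L]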