mirror of https://github.com/ai-robots-txt/ai.robots.txt.git (synced 2025-06-19 18:10:52 +00:00)
feat: add ua-synonyms support with tests, scoped to json_to_* functions
parent f0606f0eac
commit 3148669363
2 changed files with 138 additions and 61 deletions
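For orientation, here is a minimal sketch of what the new ua-synonyms field does. The dict reuses the MainBot fixture from the tests added in this commit, and the helper mirrors the all_user_agents() function introduced below in code/robots.py (a sketch, not additional repository code):

    # A robots.json entry may now list alternate user-agent spellings.
    sample = {
        "MainBot": {
            "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
            "operator": "TestCorp",
        }
    }

    def all_user_agents(robot_json):
        # Main name first, then any synonyms; entries without the field contribute only their name.
        return [
            ua
            for name, data in robot_json.items()
            for ua in [name] + data.get("ua-synonyms", [])
        ]

    print(all_user_agents(sample))  # ['MainBot', 'mainbot/1.0', 'Main-Bot']

Each json_to_* generator in the diff below builds on that flat list, so a synonym gets blocked wherever its main name is.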
code/robots.py (168 changed lines)
@@ -1,14 +1,19 @@
#!/usr/bin/env python3

import json
import re
import requests
from bs4 import BeautifulSoup
from pathlib import Path


def load_robots_json():
    """Load the robots.json contents into a dictionary."""
    return json.loads(Path("./robots.json").read_text(encoding="utf-8"))


def get_agent_soup():
    """Retrieve current known agents from darkvisitors.com."""
    session = requests.Session()
    try:
        response = session.get("https://darkvisitors.com/agents")
@@ -17,13 +22,23 @@ def get_agent_soup():
        return
    return BeautifulSoup(response.text, "html.parser")


def updated_robots_json(soup):
    """Update AI scraper information with data from darkvisitors."""
    existing_content = load_robots_json()
    to_include = ["AI Agents", "AI Assistants", "AI Data Scrapers", "AI Search Crawlers", "Undocumented AI Agents"]
    to_include = [
        "AI Agents",
        "AI Assistants",
        "AI Data Scrapers",
        "AI Search Crawlers",
        "Undocumented AI Agents",
    ]

    for section in soup.find_all("div", {"class": "agent-links-section"}):
        category = section.find("h2").get_text()
        if category not in to_include:
            continue

        for agent in section.find_all("a", href=True):
            name = agent.find("div", {"class": "agent-name"}).get_text().strip()
            name = clean_robot_name(name)
@@ -37,6 +52,7 @@ def updated_robots_json(soup):
            }
            default_value = "Unclear at this time."

            # Parse the operator information from the description if possible
            operator = default_value
            if "operated by " in desc:
                try:
@@ -45,10 +61,13 @@ def updated_robots_json(soup):
                    print(f"Error: {e}")

            def consolidate(field: str, value: str) -> str:
                # New entry
                if name not in existing_content:
                    return value
                # New field
                if field not in existing_content[name]:
                    return value
                # Replace unclear value
                if (
                    existing_content[name][field] in default_values
                    and value not in default_values
@@ -59,7 +78,7 @@ def updated_robots_json(soup):
            existing_content[name] = {
                "operator": consolidate("operator", operator),
                "respect": consolidate("respect", default_value),
                "function": consolidate("function", f"{category}"),
                "function": consolidate("function", category),
                "frequency": consolidate("frequency", default_value),
                "description": consolidate(
                    "description",
@@ -71,103 +90,146 @@ def updated_robots_json(soup):
    sorted_keys = sorted(existing_content, key=lambda k: k.lower())
    return {k: existing_content[k] for k in sorted_keys}


def clean_robot_name(name):
    return re.sub(r"\u2011", "-", name)
    """Clean the robot name by removing characters mangled by HTML rendering."""
    result = re.sub(r"\u2011", "-", name)
    if result != name:
        print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.")
    return result


def ingest_darkvisitors():
    old = load_robots_json()
    old_robots_json = load_robots_json()
    soup = get_agent_soup()
    if soup:
        robots_json = updated_robots_json(soup)
        print("robots.json is unchanged." if robots_json == old else "robots.json got updates.")
        print("robots.json is unchanged." if robots_json == old_robots_json else "robots.json got updates.")
        Path("./robots.json").write_text(json.dumps(robots_json, indent=4), encoding="utf-8")

def json_to_txt(robots_json):
    agents = [
        ua for name, data in robots_json.items()
        for ua in [name] + data.get("ua-synonyms", [])
    ]
    txt = "\n".join(f"User-agent: {ua}" for ua in agents)
    txt += "\nDisallow: /\n"
    return txt

def escape_md(s):
    return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)

def json_to_table(robots_json):
    table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
    table += "|------|----------|-----------------------|----------|------------------|-------------|\n"
    for name, robot in robots_json.items():
        table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'
    return table

def list_to_pcre(uas):
    return f"({'|'.join(map(re.escape, uas))})"

def json_to_htaccess(robot_json):
    all_uas = [
def all_user_agents(robot_json):
    """Expand all main names and their ua-synonyms into a flat list."""
    return [
        ua for name, data in robot_json.items()
        for ua in [name] + data.get("ua-synonyms", [])
    ]


def json_to_txt(robots_json):
    """Compose the robots.txt from the robots.json file."""
    lines = [f"User-agent: {ua}" for ua in all_user_agents(robots_json)]
    lines.append("Disallow: /")
    return "\n".join(lines)


def escape_md(s):
    """Escape markdown special characters in bot names."""
    return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)


def json_to_table(robots_json):
    """Compose a markdown table with the information in robots.json."""
    table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
    table += "|------|----------|-----------------------|----------|------------------|-------------|\n"

    for name, robot in robots_json.items():
        table += (
            f"| {escape_md(name)} | {robot['operator']} | {robot['respect']} | "
            f"{robot['function']} | {robot['frequency']} | {robot['description']} |\n"
        )

    return table


def list_to_pcre(lst):
    """Convert a list of user agents into a regex pattern."""
    return f"({'|'.join(map(re.escape, lst))})"


def json_to_htaccess(robot_json):
    """Generate .htaccess content to block bots via user-agent regex."""
    return (
        "RewriteEngine On\n"
        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_uas)} [NC]\n"
        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_user_agents(robot_json))} [NC]\n"
        "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
    )


def json_to_nginx(robot_json):
    all_uas = [
        ua for name, data in robot_json.items()
        for ua in [name] + data.get("ua-synonyms", [])
    ]
    return f'if ($http_user_agent ~* "{list_to_pcre(all_uas)}") {{\n    return 403;\n}}'
    """Generate Nginx config snippet to block AI bots."""
    return (
        f'if ($http_user_agent ~* "{list_to_pcre(all_user_agents(robot_json))}") {{\n'
        f'    return 403;\n'
        f'}}'
    )


def json_to_caddy(robot_json):
    all_uas = [
        ua for name, data in robot_json.items()
        for ua in [name] + data.get("ua-synonyms", [])
    ]
    """Generate a Caddyfile snippet to block AI bots."""
    return (
        "@aibots {\n"
        f"    header_regexp User-Agent \"{list_to_pcre(all_uas)}\"\n"
        f'    header_regexp User-Agent "{list_to_pcre(all_user_agents(robot_json))}"\n'
        "}"
    )


def json_to_haproxy(robots_json):
    return "\n".join(
        ua for name, data in robots_json.items()
        for ua in [name] + data.get("ua-synonyms", [])
    )
    """Generate HAProxy configuration source."""
    return "\n".join(all_user_agents(robots_json))


def update_file_if_changed(file_name, converter):
    """Update output files only if the content has changed."""
    new_content = converter(load_robots_json())
    filepath = Path(file_name)
    filepath.touch()
    old_content = filepath.read_text(encoding="utf-8")

    if old_content == new_content:
        print(f"{file_name} is already up to date.")
    else:
        Path(file_name).write_text(new_content, encoding="utf-8")
        filepath.write_text(new_content, encoding="utf-8")
        print(f"{file_name} has been updated.")


def conversions():
    update_file_if_changed("robots.txt", json_to_txt)
    update_file_if_changed("table-of-bot-metrics.md", json_to_table)
    update_file_if_changed(".htaccess", json_to_htaccess)
    update_file_if_changed("nginx-block-ai-bots.conf", json_to_nginx)
    update_file_if_changed("Caddyfile", json_to_caddy)
    update_file_if_changed("haproxy-block-ai-bots.txt", json_to_haproxy)
    """Generate all output files from robots.json."""
    update_file_if_changed("./robots.txt", json_to_txt)
    update_file_if_changed("./table-of-bot-metrics.md", json_to_table)
    update_file_if_changed("./.htaccess", json_to_htaccess)
    update_file_if_changed("./nginx-block-ai-bots.conf", json_to_nginx)
    update_file_if_changed("./Caddyfile", json_to_caddy)
    update_file_if_changed("./haproxy-block-ai-bots.txt", json_to_haproxy)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--update", action="store_true", help="Update the robots.json file from darkvisitors.com")
    parser.add_argument("--convert", action="store_true", help="Generate output files from robots.json")

    parser = argparse.ArgumentParser(
        prog="ai-robots",
        description="Collects and updates information about web scrapers of AI companies.",
        epilog="One of the flags must be set.\n",
    )
    parser.add_argument(
        "--update",
        action="store_true",
        help="Update the robots.json file with data from darkvisitors.com/agents",
    )
    parser.add_argument(
        "--convert",
        action="store_true",
        help="Create the robots.txt and markdown table from robots.json",
    )

    args = parser.parse_args()

    if not (args.update or args.convert):
        print("ERROR: please provide one of the possible flags.")
        parser.print_help()

    if args.update:
        ingest_darkvisitors()

    if args.convert:
        conversions()
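The second changed file, shown next, is the unit-test script; its header row was lost in this rendering, but the imports indicate it sits next to robots.py (presumably code/tests.py). Before the tests, a quick illustration of the reworked generators: assuming Python 3.7+ re.escape behaviour and running from the code/ directory, json_to_htaccess() would emit the following for the MainBot fixture (a sketch, not repository output):

    from robots import json_to_htaccess

    sample = {"MainBot": {"ua-synonyms": ["mainbot/1.0", "Main-Bot"]}}
    print(json_to_htaccess(sample))
    # RewriteEngine On
    # RewriteCond %{HTTP_USER_AGENT} (MainBot|mainbot/1\.0|Main\-Bot) [NC]
    # RewriteRule !^/?robots\.txt$ - [F,L]

That same pattern is what the new htaccess test below asserts against.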
@@ -1,7 +1,9 @@
#!/usr/bin/env python3
"""To run these tests just execute this script."""

import json
import re
import unittest
import re

from robots import (
    json_to_txt,
@@ -10,6 +12,7 @@ from robots import (
    json_to_nginx,
    json_to_haproxy,
    json_to_caddy,
    clean_robot_name
)
@@ -26,8 +29,10 @@ class RobotsUnittestExtensions:

class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):
    maxDiff = 8192

    def setUp(self):
        self.robots_dict = self.loadJson("test_files/robots.json")

    def test_robots_txt_generation(self):
        robots_txt = json_to_txt(self.robots_dict)
        self.assertEqualsFile("test_files/robots.txt", robots_txt)
@@ -35,8 +40,10 @@ class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions):

class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):
    maxDiff = 32768

    def setUp(self):
        self.robots_dict = self.loadJson("test_files/robots.json")

    def test_table_generation(self):
        robots_table = json_to_table(self.robots_dict)
        self.assertEqualsFile("test_files/table-of-bot-metrics.md", robots_table)
@@ -44,8 +51,10 @@ class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions):

class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
    maxDiff = 8192

    def setUp(self):
        self.robots_dict = self.loadJson("test_files/robots.json")

    def test_htaccess_generation(self):
        robots_htaccess = json_to_htaccess(self.robots_dict)
        self.assertEqualsFile("test_files/.htaccess", robots_htaccess)
@@ -53,8 +62,10 @@ class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):

class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
    maxDiff = 8192

    def setUp(self):
        self.robots_dict = self.loadJson("test_files/robots.json")

    def test_nginx_generation(self):
        robots_nginx = json_to_nginx(self.robots_dict)
        self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)
@@ -62,8 +73,10 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):

class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
    maxDiff = 8192

    def setUp(self):
        self.robots_dict = self.loadJson("test_files/robots.json")

    def test_haproxy_generation(self):
        robots_haproxy = json_to_haproxy(self.robots_dict)
        self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)
@@ -71,8 +84,10 @@ class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):

class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
    maxDiff = 8192

    def setUp(self):
        self.robots_dict = self.loadJson("test_files/robots.json")

    def test_caddyfile_generation(self):
        robots_caddyfile = json_to_caddy(self.robots_dict)
        self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)
@@ -80,20 +95,19 @@ class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):

class TestRobotsNameCleaning(unittest.TestCase):
    def test_clean_name(self):
        from robots import clean_robot_name
        self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")


class TestUASynonymsSupport(unittest.TestCase):
class TestUASynonymSupport(unittest.TestCase):
    def setUp(self):
        self.test_data = {
            "MainBot": {
                "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
                "operator": "TestCorp",
                "respect": "No",
                "function": "AI Bot",
                "frequency": "Daily",
                "description": "Used for testing ua-synonyms."
                "function": "Test",
                "frequency": "Often",
                "description": "A test bot"
            }
        }
@@ -104,11 +118,12 @@ class TestUASynonymsSupport(unittest.TestCase):

    def test_htaccess_includes_synonyms(self):
        output = json_to_htaccess(self.test_data)
        for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]:
            self.assertIn(re.escape(variant), output)
        pattern = r"(MainBot|mainbot/1\.0|Main\-Bot)"
        self.assertRegex(output, pattern)


if __name__ == "__main__":
    import os
    os.chdir(os.path.dirname(__file__))
    unittest.main(verbosity=2)
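The pattern asserted in the new test is exactly what list_to_pcre() builds from the fixture's expanded user-agent list; on Python 3.7+, re.escape leaves '/' alone but escapes '.' and '-' (a sketch of that construction):

    import re

    uas = ["MainBot", "mainbot/1.0", "Main-Bot"]
    pattern = f"({'|'.join(map(re.escape, uas))})"
    print(pattern)  # (MainBot|mainbot/1\.0|Main\-Bot)

The test module changes into its own directory before calling unittest.main, so the suite is meant to be executed directly (e.g. python3 tests.py from inside code/, assuming that file name; it is not shown in this rendering).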