Mirror of https://github.com/ai-robots-txt/ai.robots.txt.git (synced 2025-06-19 10:00:52 +00:00)
#!/usr/bin/env python3
import json
import re

import requests
from bs4 import BeautifulSoup
from pathlib import Path

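# This script keeps the list of AI crawlers in robots.json up to date and
# regenerates the derived blocklists. The --update flag pulls the agent
# directory from https://darkvisitors.com/agents and merges it into
# robots.json; the --convert flag rewrites robots.txt, the Markdown metrics
# table, .htaccess, and the nginx, Caddy, and HAProxy snippets from robots.json.
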
def load_robots_json():
    """Load the contents of robots.json into a dictionary."""
    return json.loads(Path("./robots.json").read_text(encoding="utf-8"))


def get_agent_soup():
    """Retrieve the current agents listing from darkvisitors.com as parsed HTML."""
    session = requests.Session()
    try:
        response = session.get("https://darkvisitors.com/agents")
    except requests.exceptions.ConnectionError:
        print(
            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
        )
        return
    return BeautifulSoup(response.text, "html.parser")


def updated_robots_json(soup):
    """Merge the agents scraped from darkvisitors.com into the existing robots.json data."""
    existing_content = load_robots_json()
    to_include = [
        "AI Agents",
        "AI Assistants",
        "AI Data Scrapers",
        "AI Search Crawlers",
        "Undocumented AI Agents",
    ]

    for section in soup.find_all("div", {"class": "agent-links-section"}):
        category = section.find("h2").get_text()
        if category not in to_include:
            continue
        for agent in section.find_all("a", href=True):
            name = agent.find("div", {"class": "agent-name"}).get_text().strip()
            name = clean_robot_name(name)
            desc = agent.find("p").get_text().strip()

            # Placeholder values that should be overwritten whenever the
            # scraped data is more specific.
            default_values = {
                "Unclear at this time.",
                "No information provided.",
                "No information.",
                "No explicit frequency provided.",
            }
            default_value = "Unclear at this time."

            # Parse the operator out of descriptions that follow the
            # "... operated by <operator>." pattern.
            operator = default_value
            if "operated by " in desc:
                try:
                    operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip()
                except Exception as e:
                    print(f"Error: {e}")

            def consolidate(field: str, value: str) -> str:
                # Keep the existing value unless the entry or field is new, or
                # the existing value is a placeholder and the scraped value is not.
                if name not in existing_content:
                    return value
                if field not in existing_content[name]:
                    return value
                if (
                    existing_content[name][field] in default_values
                    and value not in default_values
                ):
                    return value
                return existing_content[name][field]

            existing_content[name] = {
                "operator": consolidate("operator", operator),
                "respect": consolidate("respect", default_value),
                "function": consolidate("function", f"{category}"),
                "frequency": consolidate("frequency", default_value),
                "description": consolidate(
                    "description",
                    f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}",
                ),
            }

    print(f"Total: {len(existing_content)}")
    sorted_keys = sorted(existing_content, key=lambda k: k.lower())
    return {k: existing_content[k] for k in sorted_keys}


def clean_robot_name(name):
    """Normalize robot names by replacing non-breaking hyphens (U+2011) with plain hyphens."""
    return re.sub(r"\u2011", "-", name)


def ingest_darkvisitors():
    """Refresh robots.json with the agents currently listed on darkvisitors.com."""
    old = load_robots_json()
    soup = get_agent_soup()
    if soup:
        robots_json = updated_robots_json(soup)
        print(
            "robots.json is unchanged."
            if robots_json == old
            else "robots.json got updates."
        )
        Path("./robots.json").write_text(
            json.dumps(robots_json, indent=4), encoding="utf-8"
        )


def json_to_txt(robots_json):
    """Compose the robots.txt from the robots.json file."""
    agents = [
        ua
        for name, data in robots_json.items()
        for ua in [name] + data.get("ua-synonyms", [])
    ]
    txt = "\n".join(f"User-agent: {ua}" for ua in agents)
    txt += "\nDisallow: /\n"
    return txt

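# For example, given two hypothetical entries named "ExampleBot" and "OtherBot"
# (with no "ua-synonyms"), json_to_txt returns:
#
#   User-agent: ExampleBot
#   User-agent: OtherBot
#   Disallow: /
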
def escape_md(s):
    """Escape Markdown special characters so robot names render literally in the table."""
    return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)


def json_to_table(robots_json):
    """Compose a Markdown table of all robots from the robots.json file."""
    table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
    table += "|------|----------|-----------------------|----------|------------------|-------------|\n"

    for name, robot in robots_json.items():
        table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'

    return table


def list_to_pcre(uas):
    """Combine a list of user agents into a single regex-escaped PCRE alternation."""
    return f"({'|'.join(map(re.escape, uas))})"

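# For example, list_to_pcre(["ExampleBot", "OtherBot"]) (hypothetical names)
# evaluates to "(ExampleBot|OtherBot)".
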
def json_to_htaccess(robot_json):
    """Compose an .htaccess snippet that blocks the listed user agents."""
    # Deny requests from matching user agents while still letting them fetch robots.txt.
    all_uas = [
        ua
        for name, data in robot_json.items()
        for ua in [name] + data.get("ua-synonyms", [])
    ]
    return (
        "RewriteEngine On\n"
        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_uas)} [NC]\n"
        "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
    )


def json_to_nginx(robot_json):
    """Compose an nginx configuration snippet that returns 403 for the listed user agents."""
    all_uas = [
        ua
        for name, data in robot_json.items()
        for ua in [name] + data.get("ua-synonyms", [])
    ]
    return f'if ($http_user_agent ~* "{list_to_pcre(all_uas)}") {{\n    return 403;\n}}'

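# The generated snippet has this shape (with hypothetical agent names):
#
#   if ($http_user_agent ~* "(ExampleBot|OtherBot)") {
#       return 403;
#   }
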
def json_to_caddy(robot_json):
    """Compose a Caddyfile matcher group covering the listed user agents."""
    all_uas = [
        ua
        for name, data in robot_json.items()
        for ua in [name] + data.get("ua-synonyms", [])
    ]
    return (
        "@aibots {\n"
        f"    header_regexp User-Agent \"{list_to_pcre(all_uas)}\"\n"
        "}"
    )


def json_to_haproxy(robots_json):
    """Compose an HAProxy blocklist: one user-agent string per line."""
    return "\n".join(
        ua
        for name, data in robots_json.items()
        for ua in [name] + data.get("ua-synonyms", [])
    )

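# The resulting file lists one user-agent substring per line, which can be
# loaded into an HAProxy ACL pattern file (for example via
# "hdr_sub(user-agent) -i -f haproxy-block-ai-bots.txt"); the exact ACL wiring
# is left to the deployment.
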
def update_file_if_changed(file_name, converter):
    """Regenerate a file from robots.json, rewriting it only if the content changed."""
    new_content = converter(load_robots_json())
    filepath = Path(file_name)
    # touch() creates the file if it does not exist yet.
    filepath.touch()
    old_content = filepath.read_text(encoding="utf-8")
    if old_content == new_content:
        print(f"{file_name} is already up to date.")
    else:
        filepath.write_text(new_content, encoding="utf-8")
        print(f"{file_name} has been updated.")


def conversions():
    """Regenerate every derived blocklist from robots.json."""
    update_file_if_changed("robots.txt", json_to_txt)
    update_file_if_changed("table-of-bot-metrics.md", json_to_table)
    update_file_if_changed(".htaccess", json_to_htaccess)
    update_file_if_changed("nginx-block-ai-bots.conf", json_to_nginx)
    update_file_if_changed("Caddyfile", json_to_caddy)
    update_file_if_changed("haproxy-block-ai-bots.txt", json_to_haproxy)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--update",
        action="store_true",
        help="Update the robots.json file from darkvisitors.com",
    )
    parser.add_argument(
        "--convert",
        action="store_true",
        help="Generate output files from robots.json",
    )
    args = parser.parse_args()

    if not (args.update or args.convert):
        print("ERROR: please provide one of the possible flags.")
        parser.print_help()

    if args.update:
        ingest_darkvisitors()
    if args.convert:
        conversions()
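
# Typical invocations (assuming the script is saved as, e.g., dark_visitors.py):
#
#   ./dark_visitors.py --update             # refresh robots.json from darkvisitors.com
#   ./dark_visitors.py --convert            # regenerate robots.txt, .htaccess, nginx, Caddy, HAProxy files
#   ./dark_visitors.py --update --convert   # do both in one run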