mirror of https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-06-19 10:00:52 +00:00

Merge 5c170bb6e3 into eb05f2f527
Commit 22a7c0c2db
2 changed files with 105 additions and 98 deletions
code/robots.py
code/tests.py
@@ -3,7 +3,6 @@
 import json
 import re
 import requests
-
 from bs4 import BeautifulSoup
 from pathlib import Path

@@ -14,14 +13,12 @@ def load_robots_json():


 def get_agent_soup():
-    """Retrieve current known agents from darkvisitors.com"""
+    """Retrieve current known agents from darkvisitors.com."""
     session = requests.Session()
     try:
         response = session.get("https://darkvisitors.com/agents")
     except requests.exceptions.ConnectionError:
-        print(
-            "ERROR: Could not gather the current agents from https://darkvisitors.com/agents"
-        )
+        print("ERROR: Could not gather the current agents from https://darkvisitors.com/agents")
         return
     return BeautifulSoup(response.text, "html.parser")
@@ -34,14 +31,6 @@ def updated_robots_json(soup):
         "AI Assistants",
         "AI Data Scrapers",
         "AI Search Crawlers",
-        # "Archivers",
-        # "Developer Helpers",
-        # "Fetchers",
-        # "Intelligence Gatherers",
-        # "Scrapers",
-        # "Search Engine Crawlers",
-        # "SEO Crawlers",
-        # "Uncategorized",
         "Undocumented AI Agents",
     ]

@@ -49,6 +38,7 @@ def updated_robots_json(soup):
         category = section.find("h2").get_text()
         if category not in to_include:
             continue
+
         for agent in section.find_all("a", href=True):
             name = agent.find("div", {"class": "agent-name"}).get_text().strip()
             name = clean_robot_name(name)
@@ -77,19 +67,18 @@ def updated_robots_json(soup):
                 # New field
                 if field not in existing_content[name]:
                     return value
-                # Unclear value
+                # Replace unclear value
                 if (
                     existing_content[name][field] in default_values
                     and value not in default_values
                 ):
                     return value
-                # Existing value
                 return existing_content[name][field]

             existing_content[name] = {
                 "operator": consolidate("operator", operator),
                 "respect": consolidate("respect", default_value),
-                "function": consolidate("function", f"{category}"),
+                "function": consolidate("function", category),
                 "frequency": consolidate("frequency", default_value),
                 "description": consolidate(
                     "description",
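The nested consolidate() helper above keeps whatever non-placeholder value robots.json already records and only takes the freshly scraped value for new fields or placeholder entries. A minimal standalone sketch of that rule (the real helper is a closure over name and existing_content; the placeholder set and the sample entry below are assumptions for illustration):

    # Standalone sketch of the consolidate() rule; default_values and the
    # ExampleBot entry are assumed values, not the repository's actual data.
    default_values = {"Unclear at this time."}
    existing_content = {"ExampleBot": {"operator": "Unclear at this time."}}

    def consolidate(name, field, value):
        if field not in existing_content[name]:          # new field: take the scraped value
            return value
        if (existing_content[name][field] in default_values
                and value not in default_values):        # replace a placeholder with real data
            return value
        return existing_content[name][field]             # otherwise keep the existing value

    print(consolidate("ExampleBot", "operator", "ExampleCorp"))  # -> ExampleCorp
    print(consolidate("ExampleBot", "respect", "No"))            # -> No (new field)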
@@ -99,21 +88,11 @@ def updated_robots_json(soup):

     print(f"Total: {len(existing_content)}")
     sorted_keys = sorted(existing_content, key=lambda k: k.lower())
-    sorted_robots = {k: existing_content[k] for k in sorted_keys}
-    return sorted_robots
+    return {k: existing_content[k] for k in sorted_keys}


 def clean_robot_name(name):
-    """ Clean the robot name by removing some characters that were mangled by html software once. """
-    # This was specifically spotted in "Perplexity-User"
-    # Looks like a non-breaking hyphen introduced by the HTML rendering software
-    # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots
-    # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen,
-    # and it's only the Row-Heading that has the special hyphen
-    #
-    # Technically, there's no reason there wouldn't someday be a bot that
-    # actually uses a non-breaking hyphen, but that seems unlikely,
-    # so this solution should be fine for now.
+    """Clean the robot name by removing characters mangled by HTML rendering."""
     result = re.sub(r"\u2011", "-", name)
     if result != name:
         print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.")
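The behaviour being kept here is small enough to show on its own: the only normalisation is replacing the non-breaking hyphen (U+2011) that the darkvisitors HTML rendering injects into names such as "Perplexity-User". A stripped-down sketch without the logging line:

    import re

    def clean_robot_name(name):
        # Replace the non-breaking hyphen (U+2011) with a plain ASCII hyphen.
        return re.sub(r"\u2011", "-", name)

    print(clean_robot_name("Perplexity\u2011User"))  # -> Perplexity-User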
@@ -125,117 +104,108 @@ def ingest_darkvisitors():
     soup = get_agent_soup()
     if soup:
         robots_json = updated_robots_json(soup)
-        print(
-            "robots.json is unchanged."
-            if robots_json == old_robots_json
-            else "robots.json got updates."
-        )
-        Path("./robots.json").write_text(
-            json.dumps(robots_json, indent=4), encoding="utf-8"
-        )
+        print("robots.json is unchanged." if robots_json == old_robots_json else "robots.json got updates.")
+        Path("./robots.json").write_text(json.dumps(robots_json, indent=4), encoding="utf-8")
+
+
+def all_user_agents(robot_json):
+    """Expand all main names and their ua-synonyms into a flat list."""
+    return [
+        ua for name, data in robot_json.items()
+        for ua in [name] + data.get("ua-synonyms", [])
+    ]


 def json_to_txt(robots_json):
     """Compose the robots.txt from the robots.json file."""
-    robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys())
-    robots_txt += "\nDisallow: /\n"
-    return robots_txt
+    lines = [f"User-agent: {ua}" for ua in all_user_agents(robots_json)]
+    lines.append("Disallow: /")
+    return "\n".join(lines)


 def escape_md(s):
+    """Escape markdown special characters in bot names."""
     return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s)


 def json_to_table(robots_json):
-    """Compose a markdown table with the information in robots.json"""
+    """Compose a markdown table with the information in robots.json."""
     table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n"
     table += "|------|----------|-----------------------|----------|------------------|-------------|\n"

     for name, robot in robots_json.items():
-        table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n'
+        table += (
+            f"| {escape_md(name)} | {robot['operator']} | {robot['respect']} | "
+            f"{robot['function']} | {robot['frequency']} | {robot['description']} |\n"
+        )

     return table


 def list_to_pcre(lst):
-    # Python re is not 100% identical to PCRE which is used by Apache, but it
-    # should probably be close enough in the real world for re.escape to work.
-    formatted = "|".join(map(re.escape, lst))
-    return f"({formatted})"
+    """Convert a list of user agents into a regex pattern."""
+    return f"({'|'.join(map(re.escape, lst))})"


 def json_to_htaccess(robot_json):
-    # Creates a .htaccess filter file. It uses a regular expression to filter out
-    # User agents that contain any of the blocked values.
-    htaccess = "RewriteEngine On\n"
-    htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n"
-    htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
-    return htaccess
+    """Generate .htaccess content to block bots via user-agent regex."""
+    return (
+        "RewriteEngine On\n"
+        f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(all_user_agents(robot_json))} [NC]\n"
+        "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
+    )


 def json_to_nginx(robot_json):
-    # Creates an Nginx config file. This config snippet can be included in
-    # nginx server{} blocks to block AI bots.
-    config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n    return 403;\n}}"
-    return config
+    """Generate Nginx config snippet to block AI bots."""
+    return (
+        f'if ($http_user_agent ~* "{list_to_pcre(all_user_agents(robot_json))}") {{\n'
+        f'    return 403;\n'
+        f'}}'
+    )


 def json_to_caddy(robot_json):
-    caddyfile = "@aibots {\n    "
-    caddyfile += f'    header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"'
-    caddyfile += "\n}"
-    return caddyfile
+    """Generate a Caddyfile snippet to block AI bots."""
+    return (
+        "@aibots {\n"
+        f'    header_regexp User-Agent "{list_to_pcre(all_user_agents(robot_json))}"\n'
+        "}"
+    )


 def json_to_haproxy(robots_json):
-    # Creates a source file for HAProxy. Follow instructions in the README to implement it.
-    txt = "\n".join(f"{k}" for k in robots_json.keys())
-    return txt
+    """Generate HAProxy configuration source."""
+    return "\n".join(all_user_agents(robots_json))


 def update_file_if_changed(file_name, converter):
-    """Update files if newer content is available and log the (in)actions."""
+    """Update output files only if the content has changed."""
     new_content = converter(load_robots_json())
     filepath = Path(file_name)
-    # "touch" will create the file if it doesn't exist yet
     filepath.touch()
     old_content = filepath.read_text(encoding="utf-8")

     if old_content == new_content:
         print(f"{file_name} is already up to date.")
     else:
-        Path(file_name).write_text(new_content, encoding="utf-8")
+        filepath.write_text(new_content, encoding="utf-8")
         print(f"{file_name} has been updated.")


 def conversions():
-    """Triggers the conversions from the json file."""
-    update_file_if_changed(file_name="./robots.txt", converter=json_to_txt)
-    update_file_if_changed(
-        file_name="./table-of-bot-metrics.md",
-        converter=json_to_table,
-    )
-    update_file_if_changed(
-        file_name="./.htaccess",
-        converter=json_to_htaccess,
-    )
-    update_file_if_changed(
-        file_name="./nginx-block-ai-bots.conf",
-        converter=json_to_nginx,
-    )
-    update_file_if_changed(
-        file_name="./Caddyfile",
-        converter=json_to_caddy,
-    )
-
-    update_file_if_changed(
-        file_name="./haproxy-block-ai-bots.txt",
-        converter=json_to_haproxy,
-    )
+    """Generate all output files from robots.json."""
+    update_file_if_changed("./robots.txt", json_to_txt)
+    update_file_if_changed("./table-of-bot-metrics.md", json_to_table)
+    update_file_if_changed("./.htaccess", json_to_htaccess)
+    update_file_if_changed("./nginx-block-ai-bots.conf", json_to_nginx)
+    update_file_if_changed("./Caddyfile", json_to_caddy)
+    update_file_if_changed("./haproxy-block-ai-bots.txt", json_to_haproxy)


 if __name__ == "__main__":
     import argparse

-    parser = argparse.ArgumentParser()
     parser = argparse.ArgumentParser(
         prog="ai-robots",
         description="Collects and updates information about web scrapers of AI companies.",
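The practical effect of the new all_user_agents() helper is that every generator now emits one entry per main name plus one per ua-synonym, instead of one per top-level robots.json key. A self-contained sketch of that expansion, reusing the fixture shape the new tests introduce below (MainBot is illustrative only, not a real crawler):

    # Self-contained sketch of the ua-synonyms expansion; MainBot is an assumed fixture.
    import re

    robots = {
        "MainBot": {
            "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
            "operator": "TestCorp",
            "respect": "No",
            "function": "Test",
            "frequency": "Often",
            "description": "A test bot",
        }
    }

    def all_user_agents(robot_json):
        """Expand all main names and their ua-synonyms into a flat list."""
        return [
            ua for name, data in robot_json.items()
            for ua in [name] + data.get("ua-synonyms", [])
        ]

    def list_to_pcre(lst):
        """Convert a list of user agents into a regex pattern."""
        return f"({'|'.join(map(re.escape, lst))})"

    print(all_user_agents(robots))                # ['MainBot', 'mainbot/1.0', 'Main-Bot']
    print(list_to_pcre(all_user_agents(robots)))  # (MainBot|mainbot/1\.0|Main\-Bot)

The same expanded list feeds json_to_txt (one User-agent line per synonym, followed by a single Disallow: /) and the .htaccess, Nginx, Caddy, and HAProxy generators, so a synonym only has to be declared once in robots.json.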
@@ -251,6 +221,7 @@ if __name__ == "__main__":
         action="store_true",
         help="Create the robots.txt and markdown table from robots.json",
     )

     args = parser.parse_args()
@@ -259,5 +230,6 @@ if __name__ == "__main__":

     if args.update:
         ingest_darkvisitors()

     if args.convert:
         conversions()
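Taken together with the argparse block above, the command-line workflow is unchanged: --update pulls the latest agent list from darkvisitors.com into robots.json via ingest_darkvisitors(), and --convert regenerates robots.txt, table-of-bot-metrics.md, .htaccess, nginx-block-ai-bots.conf, Caddyfile, and haproxy-block-ai-bots.txt through update_file_if_changed().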
code/tests.py
@@ -3,8 +3,18 @@

 import json
 import unittest
+import re

-from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy
+from robots import (
+    json_to_txt,
+    json_to_table,
+    json_to_htaccess,
+    json_to_nginx,
+    json_to_haproxy,
+    json_to_caddy,
+    clean_robot_name
+)

 class RobotsUnittestExtensions:
     def loadJson(self, pathname):
@@ -14,7 +24,6 @@ class RobotsUnittestExtensions:
     def assertEqualsFile(self, f, s):
         with open(f, "rt") as f:
             f_contents = f.read()
-
         return self.assertMultiLineEqual(f_contents, s)

@@ -50,6 +59,7 @@ class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_htaccess = json_to_htaccess(self.robots_dict)
         self.assertEqualsFile("test_files/.htaccess", robots_htaccess)


 class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192

@@ -60,6 +70,7 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_nginx = json_to_nginx(self.robots_dict)
         self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx)


 class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192

@@ -70,11 +81,6 @@ class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions):
         robots_haproxy = json_to_haproxy(self.robots_dict)
         self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy)

-class TestRobotsNameCleaning(unittest.TestCase):
-    def test_clean_name(self):
-        from robots import clean_robot_name
-
-        self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")

 class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
     maxDiff = 8192
@@ -87,8 +93,37 @@ class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions):
         self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile)


+class TestRobotsNameCleaning(unittest.TestCase):
+    def test_clean_name(self):
+        self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User")
+
+
+class TestUASynonymSupport(unittest.TestCase):
+    def setUp(self):
+        self.test_data = {
+            "MainBot": {
+                "ua-synonyms": ["mainbot/1.0", "Main-Bot"],
+                "operator": "TestCorp",
+                "respect": "No",
+                "function": "Test",
+                "frequency": "Often",
+                "description": "A test bot"
+            }
+        }
+
+    def test_robots_txt_includes_synonyms(self):
+        output = json_to_txt(self.test_data)
+        for variant in ["MainBot", "mainbot/1.0", "Main-Bot"]:
+            self.assertIn(f"User-agent: {variant}", output)
+
+    def test_htaccess_includes_synonyms(self):
+        output = json_to_htaccess(self.test_data)
+        pattern = r"(MainBot|mainbot/1\.0|Main\-Bot)"
+        self.assertRegex(output, pattern)
+
+
 if __name__ == "__main__":
     import os
     os.chdir(os.path.dirname(__file__))

     unittest.main(verbosity=2)
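For concreteness, here is the output the two new assertions accept for the MainBot fixture, derived by hand from the changed json_to_txt() and json_to_htaccess() above (illustrative expectations, not copied from the test_files fixtures):

    # Hand-derived expected output for the MainBot fixture; not a repository fixture file.
    expected_robots_txt = (
        "User-agent: MainBot\n"
        "User-agent: mainbot/1.0\n"
        "User-agent: Main-Bot\n"
        "Disallow: /"
    )

    expected_htaccess = (
        "RewriteEngine On\n"
        "RewriteCond %{HTTP_USER_AGENT} (MainBot|mainbot/1\\.0|Main\\-Bot) [NC]\n"
        "RewriteRule !^/?robots\\.txt$ - [F,L]\n"
    )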