mirror of
https://github.com/ai-robots-txt/ai.robots.txt.git
synced 2025-06-19 10:00:52 +00:00
Compare commits
3 commits
cf598b6b71
...
842e2256e8
Author | SHA1 | Date | |
---|---|---|---|
| 842e2256e8 | | |
| 229ea20426 | | |
| 14d68f05ba | | |
7 changed files with 43 additions and 3 deletions
|
@ -1,3 +1,3 @@
|
||||||
RewriteEngine On
|
RewriteEngine On
|
||||||
RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
|
RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC]
|
||||||
RewriteRule !^/?robots\.txt$ - [F,L]
|
RewriteRule !^/?robots\.txt$ - [F,L]
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
@aibots {
|
@aibots {
|
||||||
header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
|
header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)"
|
||||||
}
|
}
|
|
@ -60,7 +60,11 @@ QuillBot
|
||||||
quillbot.com
|
quillbot.com
|
||||||
SBIntuitionsBot
|
SBIntuitionsBot
|
||||||
Scrapy
|
Scrapy
|
||||||
|
SemrushBot
|
||||||
|
SemrushBot-BA
|
||||||
|
SemrushBot-CT
|
||||||
SemrushBot-OCOB
|
SemrushBot-OCOB
|
||||||
|
SemrushBot-SI
|
||||||
SemrushBot-SWA
|
SemrushBot-SWA
|
||||||
Sidetrade indexer bot
|
Sidetrade indexer bot
|
||||||
TikTokSpider
|
TikTokSpider
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
|
if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") {
|
||||||
return 403;
|
return 403;
|
||||||
}
|
}
|
28
robots.json
28
robots.json
|
@ -433,6 +433,27 @@
|
||||||
"operator": "[Zyte](https://www.zyte.com)",
|
"operator": "[Zyte](https://www.zyte.com)",
|
||||||
"respect": "Unclear at this time."
|
"respect": "Unclear at this time."
|
||||||
},
|
},
|
||||||
|
"SemrushBot": {
|
||||||
|
"operator": "[Semrush](https://www.semrush.com/)",
|
||||||
|
"respect": "[Yes](https://www.semrush.com/bot/)",
|
||||||
|
"function": "Crawls your site for ContentShake AI tool.",
|
||||||
|
"frequency": "Roughly once every 10 seconds.",
|
||||||
|
"description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
|
||||||
|
},
|
||||||
|
"SemrushBot-BA": {
|
||||||
|
"operator": "[Semrush](https://www.semrush.com/)",
|
||||||
|
"respect": "[Yes](https://www.semrush.com/bot/)",
|
||||||
|
"function": "Crawls your site for ContentShake AI tool.",
|
||||||
|
"frequency": "Roughly once every 10 seconds.",
|
||||||
|
"description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
|
||||||
|
},
|
||||||
|
"SemrushBot-CT": {
|
||||||
|
"operator": "[Semrush](https://www.semrush.com/)",
|
||||||
|
"respect": "[Yes](https://www.semrush.com/bot/)",
|
||||||
|
"function": "Crawls your site for ContentShake AI tool.",
|
||||||
|
"frequency": "Roughly once every 10 seconds.",
|
||||||
|
"description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
|
||||||
|
},
|
||||||
"SemrushBot-OCOB": {
|
"SemrushBot-OCOB": {
|
||||||
"operator": "[Semrush](https://www.semrush.com/)",
|
"operator": "[Semrush](https://www.semrush.com/)",
|
||||||
"respect": "[Yes](https://www.semrush.com/bot/)",
|
"respect": "[Yes](https://www.semrush.com/bot/)",
|
||||||
|
@ -440,6 +461,13 @@
|
||||||
"frequency": "Roughly once every 10 seconds.",
|
"frequency": "Roughly once every 10 seconds.",
|
||||||
"description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
|
"description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
|
||||||
},
|
},
|
||||||
|
"SemrushBot-SI": {
|
||||||
|
"operator": "[Semrush](https://www.semrush.com/)",
|
||||||
|
"respect": "[Yes](https://www.semrush.com/bot/)",
|
||||||
|
"function": "Crawls your site for ContentShake AI tool.",
|
||||||
|
"frequency": "Roughly once every 10 seconds.",
|
||||||
|
"description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)."
|
||||||
|
},
|
||||||
"SemrushBot-SWA": {
|
"SemrushBot-SWA": {
|
||||||
"operator": "[Semrush](https://www.semrush.com/)",
|
"operator": "[Semrush](https://www.semrush.com/)",
|
||||||
"respect": "[Yes](https://www.semrush.com/bot/)",
|
"respect": "[Yes](https://www.semrush.com/bot/)",
|
||||||
|
|
|
@ -60,7 +60,11 @@ User-agent: QuillBot
|
||||||
User-agent: quillbot.com
|
User-agent: quillbot.com
|
||||||
User-agent: SBIntuitionsBot
|
User-agent: SBIntuitionsBot
|
||||||
User-agent: Scrapy
|
User-agent: Scrapy
|
||||||
|
User-agent: SemrushBot
|
||||||
|
User-agent: SemrushBot-BA
|
||||||
|
User-agent: SemrushBot-CT
|
||||||
User-agent: SemrushBot-OCOB
|
User-agent: SemrushBot-OCOB
|
||||||
|
User-agent: SemrushBot-SI
|
||||||
User-agent: SemrushBot-SWA
|
User-agent: SemrushBot-SWA
|
||||||
User-agent: Sidetrade indexer bot
|
User-agent: Sidetrade indexer bot
|
||||||
User-agent: TikTokSpider
|
User-agent: TikTokSpider
|
||||||
|
|
|
@ -62,7 +62,11 @@
|
||||||
| quillbot\.com | [QuillBot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. |
|
| quillbot\.com | [QuillBot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. |
|
||||||
| SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis |
|
| SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis |
|
||||||
| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
|
| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." |
|
||||||
|
| SemrushBot | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
|
||||||
|
| SemrushBot\-BA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
|
||||||
|
| SemrushBot\-CT | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
|
||||||
| SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
|
| SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
|
||||||
|
| SemrushBot\-SI | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
|
||||||
| SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
|
| SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). |
|
||||||
| Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. |
|
| Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. |
|
||||||
| TikTokSpider | ByteDance | Unclear at this time. | LLM training. | Unclear at this time. | Downloads data to train LLMs, as per Bytespider. |
|
| TikTokSpider | ByteDance | Unclear at this time. | LLM training. | Unclear at this time. | Downloads data to train LLMs, as per Bytespider. |
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue