From b2970316d8ae56515662792a62893f056566026a Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 30 Aug 2024 01:13:29 +0000 Subject: [PATCH 001/201] Removing previously generated files --- robots.txt | 37 ------------------------------------- table-of-bot-metrics.md | 38 -------------------------------------- 2 files changed, 75 deletions(-) delete mode 100644 robots.txt delete mode 100644 table-of-bot-metrics.md diff --git a/robots.txt b/robots.txt deleted file mode 100644 index 4fdca4c..0000000 --- a/robots.txt +++ /dev/null @@ -1,37 +0,0 @@ -User-agent: AI2Bot -User-agent: Ai2Bot-Dolma -User-agent: Amazonbot -User-agent: Applebot -User-agent: Applebot-Extended -User-agent: Bytespider -User-agent: CCBot -User-agent: ChatGPT-User -User-agent: Claude-Web -User-agent: ClaudeBot -User-agent: Diffbot -User-agent: FacebookBot -User-agent: FriendlyCrawler -User-agent: GPTBot -User-agent: Google-Extended -User-agent: GoogleOther -User-agent: GoogleOther-Image -User-agent: GoogleOther-Video -User-agent: ICC-Crawler -User-agent: ImagesiftBot -User-agent: Meta-ExternalAgent -User-agent: Meta-ExternalFetcher -User-agent: OAI-SearchBot -User-agent: PerplexityBot -User-agent: PetalBot -User-agent: Scrapy -User-agent: Timpibot -User-agent: VelenPublicWebCrawler -User-agent: Webzio-Extended -User-agent: YouBot -User-agent: anthropic-ai -User-agent: cohere-ai -User-agent: facebookexternalhit -User-agent: img2dataset -User-agent: omgili -User-agent: omgilibot -Disallow: / \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md deleted file mode 100644 index 1a96903..0000000 --- a/table-of-bot-metrics.md +++ /dev/null @@ -1,38 +0,0 @@ -| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | -|-----|----------|-----------------------|----------|------------------|-------------| -| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. 
| Explores 'certain domains' to find web content. | -| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | -| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information. provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | -| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | -| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | -| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | -| CCBot | [Common Crawl](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides crawl data for an open source repository that has been used to train LLMs. | Unclear at this time. | Sources data that is made openly available and is used to train AI models. | -| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. 
| No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | -| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | -| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | -| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | -| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | -| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. 
For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | -| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | -| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | -| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. 
| Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | -| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | -| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | -| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | -| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | -| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | -| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | -| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. 
| Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | -| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | -| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | -| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | -| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | -| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | -| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. 
| From 054c97ad4f6c8c8819451c1e7b77c00662f42a71 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 30 Aug 2024 01:13:29 +0000 Subject: [PATCH 002/201] Daily update from Dark Visitors --- robots.txt | 37 +++++++++++++++++++++++++++++++++++++ table-of-bot-metrics.md | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 robots.txt create mode 100644 table-of-bot-metrics.md diff --git a/robots.txt b/robots.txt new file mode 100644 index 0000000..4fdca4c --- /dev/null +++ b/robots.txt @@ -0,0 +1,37 @@ +User-agent: AI2Bot +User-agent: Ai2Bot-Dolma +User-agent: Amazonbot +User-agent: Applebot +User-agent: Applebot-Extended +User-agent: Bytespider +User-agent: CCBot +User-agent: ChatGPT-User +User-agent: Claude-Web +User-agent: ClaudeBot +User-agent: Diffbot +User-agent: FacebookBot +User-agent: FriendlyCrawler +User-agent: GPTBot +User-agent: Google-Extended +User-agent: GoogleOther +User-agent: GoogleOther-Image +User-agent: GoogleOther-Video +User-agent: ICC-Crawler +User-agent: ImagesiftBot +User-agent: Meta-ExternalAgent +User-agent: Meta-ExternalFetcher +User-agent: OAI-SearchBot +User-agent: PerplexityBot +User-agent: PetalBot +User-agent: Scrapy +User-agent: Timpibot +User-agent: VelenPublicWebCrawler +User-agent: Webzio-Extended +User-agent: YouBot +User-agent: anthropic-ai +User-agent: cohere-ai +User-agent: facebookexternalhit +User-agent: img2dataset +User-agent: omgili +User-agent: omgilibot +Disallow: / \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md new file mode 100644 index 0000000..1a96903 --- /dev/null +++ b/table-of-bot-metrics.md @@ -0,0 +1,38 @@ +| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | +|-----|----------|-----------------------|----------|------------------|-------------| +| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. 
| Explores 'certain domains' to find web content. | +| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | +| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information. provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | +| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | +| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | +| CCBot | [Common Crawl](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides crawl data for an open source repository that has been used to train LLMs. | Unclear at this time. | Sources data that is made openly available and is used to train AI models. | +| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | +| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. 
| No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | +| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | +| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | +| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | +| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. 
For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | +| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | +| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. 
| Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | +| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | +| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | +| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | +| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. 
| Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | +| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | +| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | +| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | +| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | +| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. 
| From 9a4ebb57ee480141a5181d29b2645efafeeb2ec4 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 31 Aug 2024 01:13:04 +0000 Subject: [PATCH 003/201] Removing previously generated files --- robots.txt | 37 ------------------------------------- table-of-bot-metrics.md | 38 -------------------------------------- 2 files changed, 75 deletions(-) delete mode 100644 robots.txt delete mode 100644 table-of-bot-metrics.md diff --git a/robots.txt b/robots.txt deleted file mode 100644 index 4fdca4c..0000000 --- a/robots.txt +++ /dev/null @@ -1,37 +0,0 @@ -User-agent: AI2Bot -User-agent: Ai2Bot-Dolma -User-agent: Amazonbot -User-agent: Applebot -User-agent: Applebot-Extended -User-agent: Bytespider -User-agent: CCBot -User-agent: ChatGPT-User -User-agent: Claude-Web -User-agent: ClaudeBot -User-agent: Diffbot -User-agent: FacebookBot -User-agent: FriendlyCrawler -User-agent: GPTBot -User-agent: Google-Extended -User-agent: GoogleOther -User-agent: GoogleOther-Image -User-agent: GoogleOther-Video -User-agent: ICC-Crawler -User-agent: ImagesiftBot -User-agent: Meta-ExternalAgent -User-agent: Meta-ExternalFetcher -User-agent: OAI-SearchBot -User-agent: PerplexityBot -User-agent: PetalBot -User-agent: Scrapy -User-agent: Timpibot -User-agent: VelenPublicWebCrawler -User-agent: Webzio-Extended -User-agent: YouBot -User-agent: anthropic-ai -User-agent: cohere-ai -User-agent: facebookexternalhit -User-agent: img2dataset -User-agent: omgili -User-agent: omgilibot -Disallow: / \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md deleted file mode 100644 index 1a96903..0000000 --- a/table-of-bot-metrics.md +++ /dev/null @@ -1,38 +0,0 @@ -| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | -|-----|----------|-----------------------|----------|------------------|-------------| -| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. 
| Explores 'certain domains' to find web content. | -| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | -| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information. provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | -| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | -| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | -| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | -| CCBot | [Common Crawl](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides crawl data for an open source repository that has been used to train LLMs. | Unclear at this time. | Sources data that is made openly available and is used to train AI models. | -| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. 
| No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | -| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | -| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | -| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | -| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | -| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. 
For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | -| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | -| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | -| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. 
| Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | -| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | -| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | -| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | -| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | -| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | -| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | -| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. 
| Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | -| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | -| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | -| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | -| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | -| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | -| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. 
| From 9a7f556d87fb09609f465547c4317a775346f55a Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 31 Aug 2024 01:13:04 +0000 Subject: [PATCH 004/201] Daily update from Dark Visitors --- robots.txt | 37 +++++++++++++++++++++++++++++++++++++ table-of-bot-metrics.md | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 robots.txt create mode 100644 table-of-bot-metrics.md diff --git a/robots.txt b/robots.txt new file mode 100644 index 0000000..4fdca4c --- /dev/null +++ b/robots.txt @@ -0,0 +1,37 @@ +User-agent: AI2Bot +User-agent: Ai2Bot-Dolma +User-agent: Amazonbot +User-agent: Applebot +User-agent: Applebot-Extended +User-agent: Bytespider +User-agent: CCBot +User-agent: ChatGPT-User +User-agent: Claude-Web +User-agent: ClaudeBot +User-agent: Diffbot +User-agent: FacebookBot +User-agent: FriendlyCrawler +User-agent: GPTBot +User-agent: Google-Extended +User-agent: GoogleOther +User-agent: GoogleOther-Image +User-agent: GoogleOther-Video +User-agent: ICC-Crawler +User-agent: ImagesiftBot +User-agent: Meta-ExternalAgent +User-agent: Meta-ExternalFetcher +User-agent: OAI-SearchBot +User-agent: PerplexityBot +User-agent: PetalBot +User-agent: Scrapy +User-agent: Timpibot +User-agent: VelenPublicWebCrawler +User-agent: Webzio-Extended +User-agent: YouBot +User-agent: anthropic-ai +User-agent: cohere-ai +User-agent: facebookexternalhit +User-agent: img2dataset +User-agent: omgili +User-agent: omgilibot +Disallow: / \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md new file mode 100644 index 0000000..1a96903 --- /dev/null +++ b/table-of-bot-metrics.md @@ -0,0 +1,38 @@ +| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | +|-----|----------|-----------------------|----------|------------------|-------------| +| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. 
| Explores 'certain domains' to find web content. | +| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | +| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information. provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | +| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | +| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | +| CCBot | [Common Crawl](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides crawl data for an open source repository that has been used to train LLMs. | Unclear at this time. | Sources data that is made openly available and is used to train AI models. | +| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | +| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. 
| No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | +| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | +| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | +| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | +| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. 
For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | +| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | +| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. 
| Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | +| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | +| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | +| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | +| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. 
| Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | +| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | +| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | +| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | +| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | +| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. 
| From 01589718dff5cee3e270a928521984a407a5a979 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sun, 1 Sep 2024 01:24:52 +0000 Subject: [PATCH 005/201] Removing previously generated files --- robots.txt | 37 ------------------------------------- table-of-bot-metrics.md | 38 -------------------------------------- 2 files changed, 75 deletions(-) delete mode 100644 robots.txt delete mode 100644 table-of-bot-metrics.md diff --git a/robots.txt b/robots.txt deleted file mode 100644 index 4fdca4c..0000000 --- a/robots.txt +++ /dev/null @@ -1,37 +0,0 @@ -User-agent: AI2Bot -User-agent: Ai2Bot-Dolma -User-agent: Amazonbot -User-agent: Applebot -User-agent: Applebot-Extended -User-agent: Bytespider -User-agent: CCBot -User-agent: ChatGPT-User -User-agent: Claude-Web -User-agent: ClaudeBot -User-agent: Diffbot -User-agent: FacebookBot -User-agent: FriendlyCrawler -User-agent: GPTBot -User-agent: Google-Extended -User-agent: GoogleOther -User-agent: GoogleOther-Image -User-agent: GoogleOther-Video -User-agent: ICC-Crawler -User-agent: ImagesiftBot -User-agent: Meta-ExternalAgent -User-agent: Meta-ExternalFetcher -User-agent: OAI-SearchBot -User-agent: PerplexityBot -User-agent: PetalBot -User-agent: Scrapy -User-agent: Timpibot -User-agent: VelenPublicWebCrawler -User-agent: Webzio-Extended -User-agent: YouBot -User-agent: anthropic-ai -User-agent: cohere-ai -User-agent: facebookexternalhit -User-agent: img2dataset -User-agent: omgili -User-agent: omgilibot -Disallow: / \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md deleted file mode 100644 index 1a96903..0000000 --- a/table-of-bot-metrics.md +++ /dev/null @@ -1,38 +0,0 @@ -| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | -|-----|----------|-----------------------|----------|------------------|-------------| -| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. 
| Explores 'certain domains' to find web content. | -| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | -| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information. provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | -| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | -| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | -| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | -| CCBot | [Common Crawl](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides crawl data for an open source repository that has been used to train LLMs. | Unclear at this time. | Sources data that is made openly available and is used to train AI models. | -| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. 
| No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | -| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | -| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | -| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | -| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | -| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. 
For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | -| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | -| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | -| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. 
| Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | -| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | -| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | -| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | -| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | -| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | -| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | -| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. 
| Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | -| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | -| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | -| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | -| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | -| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | -| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. 
| From 543e993b08de1b03844c4ad2312e2a36768cdace Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sun, 1 Sep 2024 01:24:53 +0000 Subject: [PATCH 006/201] Daily update from Dark Visitors --- robots.txt | 37 +++++++++++++++++++++++++++++++++++++ table-of-bot-metrics.md | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 robots.txt create mode 100644 table-of-bot-metrics.md diff --git a/robots.txt b/robots.txt new file mode 100644 index 0000000..4fdca4c --- /dev/null +++ b/robots.txt @@ -0,0 +1,37 @@ +User-agent: AI2Bot +User-agent: Ai2Bot-Dolma +User-agent: Amazonbot +User-agent: Applebot +User-agent: Applebot-Extended +User-agent: Bytespider +User-agent: CCBot +User-agent: ChatGPT-User +User-agent: Claude-Web +User-agent: ClaudeBot +User-agent: Diffbot +User-agent: FacebookBot +User-agent: FriendlyCrawler +User-agent: GPTBot +User-agent: Google-Extended +User-agent: GoogleOther +User-agent: GoogleOther-Image +User-agent: GoogleOther-Video +User-agent: ICC-Crawler +User-agent: ImagesiftBot +User-agent: Meta-ExternalAgent +User-agent: Meta-ExternalFetcher +User-agent: OAI-SearchBot +User-agent: PerplexityBot +User-agent: PetalBot +User-agent: Scrapy +User-agent: Timpibot +User-agent: VelenPublicWebCrawler +User-agent: Webzio-Extended +User-agent: YouBot +User-agent: anthropic-ai +User-agent: cohere-ai +User-agent: facebookexternalhit +User-agent: img2dataset +User-agent: omgili +User-agent: omgilibot +Disallow: / \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md new file mode 100644 index 0000000..1a96903 --- /dev/null +++ b/table-of-bot-metrics.md @@ -0,0 +1,38 @@ +| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | +|-----|----------|-----------------------|----------|------------------|-------------| +| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. 
| Explores 'certain domains' to find web content. | +| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | +| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information. provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | +| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | +| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | +| CCBot | [Common Crawl](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides crawl data for an open source repository that has been used to train LLMs. | Unclear at this time. | Sources data that is made openly available and is used to train AI models. | +| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | +| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. 
| No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | +| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | +| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | +| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | +| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. 
For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | +| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | +| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. 
| Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | +| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | +| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | +| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | +| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. 
| Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | +| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | +| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | +| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | +| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | +| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. 
| From 567bd00aec0cf1397478e8e1bb99965770721ebf Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Mon, 2 Sep 2024 01:15:07 +0000 Subject: [PATCH 007/201] Removing previously generated files --- robots.txt | 37 ------------------------------------- table-of-bot-metrics.md | 38 -------------------------------------- 2 files changed, 75 deletions(-) delete mode 100644 robots.txt delete mode 100644 table-of-bot-metrics.md diff --git a/robots.txt b/robots.txt deleted file mode 100644 index 4fdca4c..0000000 --- a/robots.txt +++ /dev/null @@ -1,37 +0,0 @@ -User-agent: AI2Bot -User-agent: Ai2Bot-Dolma -User-agent: Amazonbot -User-agent: Applebot -User-agent: Applebot-Extended -User-agent: Bytespider -User-agent: CCBot -User-agent: ChatGPT-User -User-agent: Claude-Web -User-agent: ClaudeBot -User-agent: Diffbot -User-agent: FacebookBot -User-agent: FriendlyCrawler -User-agent: GPTBot -User-agent: Google-Extended -User-agent: GoogleOther -User-agent: GoogleOther-Image -User-agent: GoogleOther-Video -User-agent: ICC-Crawler -User-agent: ImagesiftBot -User-agent: Meta-ExternalAgent -User-agent: Meta-ExternalFetcher -User-agent: OAI-SearchBot -User-agent: PerplexityBot -User-agent: PetalBot -User-agent: Scrapy -User-agent: Timpibot -User-agent: VelenPublicWebCrawler -User-agent: Webzio-Extended -User-agent: YouBot -User-agent: anthropic-ai -User-agent: cohere-ai -User-agent: facebookexternalhit -User-agent: img2dataset -User-agent: omgili -User-agent: omgilibot -Disallow: / \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md deleted file mode 100644 index 1a96903..0000000 --- a/table-of-bot-metrics.md +++ /dev/null @@ -1,38 +0,0 @@ -| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | -|-----|----------|-----------------------|----------|------------------|-------------| -| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. 
| Explores 'certain domains' to find web content. | -| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | -| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information. provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | -| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | -| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | -| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | -| CCBot | [Common Crawl](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides crawl data for an open source repository that has been used to train LLMs. | Unclear at this time. | Sources data that is made openly available and is used to train AI models. | -| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. 
| No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | -| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | -| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | -| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | -| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | -| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. 
For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | -| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | -| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | -| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. 
| Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | -| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | -| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | -| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | -| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | -| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | -| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | -| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. 
| Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | -| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | -| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | -| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | -| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | -| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | -| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. 
| From c9325c9e18d00786de7d0cec50ab82bf9e84b6f5 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Mon, 2 Sep 2024 01:15:07 +0000 Subject: [PATCH 008/201] Daily update from Dark Visitors --- robots.txt | 37 +++++++++++++++++++++++++++++++++++++ table-of-bot-metrics.md | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 robots.txt create mode 100644 table-of-bot-metrics.md diff --git a/robots.txt b/robots.txt new file mode 100644 index 0000000..4fdca4c --- /dev/null +++ b/robots.txt @@ -0,0 +1,37 @@ +User-agent: AI2Bot +User-agent: Ai2Bot-Dolma +User-agent: Amazonbot +User-agent: Applebot +User-agent: Applebot-Extended +User-agent: Bytespider +User-agent: CCBot +User-agent: ChatGPT-User +User-agent: Claude-Web +User-agent: ClaudeBot +User-agent: Diffbot +User-agent: FacebookBot +User-agent: FriendlyCrawler +User-agent: GPTBot +User-agent: Google-Extended +User-agent: GoogleOther +User-agent: GoogleOther-Image +User-agent: GoogleOther-Video +User-agent: ICC-Crawler +User-agent: ImagesiftBot +User-agent: Meta-ExternalAgent +User-agent: Meta-ExternalFetcher +User-agent: OAI-SearchBot +User-agent: PerplexityBot +User-agent: PetalBot +User-agent: Scrapy +User-agent: Timpibot +User-agent: VelenPublicWebCrawler +User-agent: Webzio-Extended +User-agent: YouBot +User-agent: anthropic-ai +User-agent: cohere-ai +User-agent: facebookexternalhit +User-agent: img2dataset +User-agent: omgili +User-agent: omgilibot +Disallow: / \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md new file mode 100644 index 0000000..1a96903 --- /dev/null +++ b/table-of-bot-metrics.md @@ -0,0 +1,38 @@ +| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | +|-----|----------|-----------------------|----------|------------------|-------------| +| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. 
| Explores 'certain domains' to find web content. | +| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | +| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information. provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | +| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | +| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | +| CCBot | [Common Crawl](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides crawl data for an open source repository that has been used to train LLMs. | Unclear at this time. | Sources data that is made openly available and is used to train AI models. | +| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | +| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. 
| No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | +| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | +| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | +| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | +| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. 
For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | +| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | +| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. 
| Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | +| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | +| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | +| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | +| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. 
| Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | +| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | +| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | +| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | +| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | +| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. 
| From cc18b8617c7c01b343e7534bf948bff8667856cf Mon Sep 17 00:00:00 2001 From: nisbet-hubbard <87453615+nisbet-hubbard@users.noreply.github.com> Date: Tue, 3 Sep 2024 07:48:48 +0800 Subject: [PATCH 009/201] Update main.yml --- .github/workflows/main.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4b127d7..bd10a45 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,14 +20,7 @@ jobs: - run: | git config --global user.name "ai.robots.txt" git config --global user.email "ai.robots.txt@users.noreply.github.com" - git rm robots.txt - git rm table-of-bot-metrics.md - git add -A - git commit -m "Removing previously generated files" - git push php -f code/action.php - git config --global user.name "ai.robots.txt" - git config --global user.email "ai.robots.txt@users.noreply.github.com" git add -A if [ -n "${{ inputs.message }}" ]; then git commit -m "${{ inputs.message }}" @@ -35,4 +28,4 @@ jobs: git commit -m "${{ github.event.head_commit.message }}" fi git push - shell: bash \ No newline at end of file + shell: bash From 7151f6c5695d704df25fa115734bacd0122f3fb2 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 3 Sep 2024 01:12:56 +0000 Subject: [PATCH 010/201] Removing previously generated files --- robots.txt | 37 ------------------------------------- table-of-bot-metrics.md | 38 -------------------------------------- 2 files changed, 75 deletions(-) delete mode 100644 robots.txt delete mode 100644 table-of-bot-metrics.md diff --git a/robots.txt b/robots.txt deleted file mode 100644 index 4fdca4c..0000000 --- a/robots.txt +++ /dev/null @@ -1,37 +0,0 @@ -User-agent: AI2Bot -User-agent: Ai2Bot-Dolma -User-agent: Amazonbot -User-agent: Applebot -User-agent: Applebot-Extended -User-agent: Bytespider -User-agent: CCBot -User-agent: ChatGPT-User -User-agent: Claude-Web -User-agent: ClaudeBot -User-agent: Diffbot -User-agent: FacebookBot -User-agent: 
FriendlyCrawler -User-agent: GPTBot -User-agent: Google-Extended -User-agent: GoogleOther -User-agent: GoogleOther-Image -User-agent: GoogleOther-Video -User-agent: ICC-Crawler -User-agent: ImagesiftBot -User-agent: Meta-ExternalAgent -User-agent: Meta-ExternalFetcher -User-agent: OAI-SearchBot -User-agent: PerplexityBot -User-agent: PetalBot -User-agent: Scrapy -User-agent: Timpibot -User-agent: VelenPublicWebCrawler -User-agent: Webzio-Extended -User-agent: YouBot -User-agent: anthropic-ai -User-agent: cohere-ai -User-agent: facebookexternalhit -User-agent: img2dataset -User-agent: omgili -User-agent: omgilibot -Disallow: / \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md deleted file mode 100644 index 1a96903..0000000 --- a/table-of-bot-metrics.md +++ /dev/null @@ -1,38 +0,0 @@ -| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | -|-----|----------|-----------------------|----------|------------------|-------------| -| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | -| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | -| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information. provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | -| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. 
More info can be found at https://darkvisitors.com/agents/agents/applebot | -| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | -| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | -| CCBot | [Common Crawl](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides crawl data for an open source repository that has been used to train LLMs. | Unclear at this time. | Sources data that is made openly available and is used to train AI models. | -| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. 
| -| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | -| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | -| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | -| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | -| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. 
For example, it may be used for one-off crawls for internal research and development." | -| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | -| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | -| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | -| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | -| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | -| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. 
| Operated by Perplexity to obtain results in response to user queries. | -| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | -| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | -| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | -| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | -| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | -| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | -| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. 
| Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | -| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | -| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | -| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | -| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. 
| From fb5c995243c74389117589ed2a2b6d68abbb9a72 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 3 Sep 2024 01:12:57 +0000 Subject: [PATCH 011/201] Daily update from Dark Visitors --- robots.txt | 37 +++++++++++++++++++++++++++++++++++++ table-of-bot-metrics.md | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 robots.txt create mode 100644 table-of-bot-metrics.md diff --git a/robots.txt b/robots.txt new file mode 100644 index 0000000..4fdca4c --- /dev/null +++ b/robots.txt @@ -0,0 +1,37 @@ +User-agent: AI2Bot +User-agent: Ai2Bot-Dolma +User-agent: Amazonbot +User-agent: Applebot +User-agent: Applebot-Extended +User-agent: Bytespider +User-agent: CCBot +User-agent: ChatGPT-User +User-agent: Claude-Web +User-agent: ClaudeBot +User-agent: Diffbot +User-agent: FacebookBot +User-agent: FriendlyCrawler +User-agent: GPTBot +User-agent: Google-Extended +User-agent: GoogleOther +User-agent: GoogleOther-Image +User-agent: GoogleOther-Video +User-agent: ICC-Crawler +User-agent: ImagesiftBot +User-agent: Meta-ExternalAgent +User-agent: Meta-ExternalFetcher +User-agent: OAI-SearchBot +User-agent: PerplexityBot +User-agent: PetalBot +User-agent: Scrapy +User-agent: Timpibot +User-agent: VelenPublicWebCrawler +User-agent: Webzio-Extended +User-agent: YouBot +User-agent: anthropic-ai +User-agent: cohere-ai +User-agent: facebookexternalhit +User-agent: img2dataset +User-agent: omgili +User-agent: omgilibot +Disallow: / \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md new file mode 100644 index 0000000..1a96903 --- /dev/null +++ b/table-of-bot-metrics.md @@ -0,0 +1,38 @@ +| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | +|-----|----------|-----------------------|----------|------------------|-------------| +| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. 
| Explores 'certain domains' to find web content. | +| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | +| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information. provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | +| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | +| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | +| CCBot | [Common Crawl](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides crawl data for an open source repository that has been used to train LLMs. | Unclear at this time. | Sources data that is made openly available and is used to train AI models. | +| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | +| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. 
| No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | +| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | +| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | +| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | +| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. 
For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | +| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | +| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. 
| Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | +| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | +| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | +| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | +| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. 
| Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | +| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | +| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | +| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | +| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | +| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. 
| From 837329440466bc3bfe145d23610e7b87787b3ec8 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Fri, 6 Sep 2024 19:05:26 -0700 Subject: [PATCH 012/201] chore: add iaskspider/2.0 --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index a1cfaa1..c31df62 100644 --- a/robots.json +++ b/robots.json @@ -125,6 +125,13 @@ "operator": "Google", "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)" }, + "iaskspider/2.0": { + "description": "Used to provide answers to user queries.", + "frequency": "Unclear at this time.", + "function": "Crawls sites to provide answers to user queries.", + "operator": "iAsk", + "respect": "No" + }, "ICC-Crawler": { "description": "Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business.", "frequency": "No information.", From 1c1b42368407484f765ddf7af2266cd9fa44fbb9 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 7 Sep 2024 02:05:43 +0000 Subject: [PATCH 013/201] chore: add iaskspider/2.0 --- robots.txt | 1 + table-of-bot-metrics.md | 1 + 2 files changed, 2 insertions(+) diff --git a/robots.txt b/robots.txt index 4fdca4c..da17a01 100644 --- a/robots.txt +++ b/robots.txt @@ -16,6 +16,7 @@ User-agent: Google-Extended User-agent: GoogleOther User-agent: GoogleOther-Image User-agent: GoogleOther-Video +User-agent: iaskspider/2.0 User-agent: ICC-Crawler User-agent: ImagesiftBot User-agent: Meta-ExternalAgent diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 1a96903..39b7959 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -18,6 +18,7 @@ | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. 
For example, it may be used for one-off crawls for internal research and development." | | GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | | ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. 
| "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | From 809851ae88fdc0fe5129484fe0e6e000d2734ede Mon Sep 17 00:00:00 2001 From: Malte Ubl Date: Sat, 7 Sep 2024 15:59:25 -0700 Subject: [PATCH 014/201] Add instructions for AI bot blocking on Vercel --- FAQ.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/FAQ.md b/FAQ.md index d60ca9a..49cbdfb 100644 --- a/FAQ.md +++ b/FAQ.md @@ -33,6 +33,8 @@ That depends on your stack. - Cloudflare - [Block AI bots, scrapers and crawlers with a single click](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) by Cloudflare - [I’m blocking AI crawlers](https://roelant.net/en/2024/im-blocking-ai-crawlers-part-2/) by Roelant +- Vercel + - [Block AI Bots Firewall Rule](https://vercel.com/templates/firewall/block-ai-bots-firewall-rule) by Vercel ## Why should we block these crawlers? From 5963cbf9f79404095221f4e8c14ce0f54bd3b627 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sun, 8 Sep 2024 01:19:31 +0000 Subject: [PATCH 015/201] Daily update from Dark Visitors --- robots.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/robots.json b/robots.json index c31df62..a53cebd 100644 --- a/robots.json +++ b/robots.json @@ -125,13 +125,6 @@ "operator": "Google", "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)" }, - "iaskspider/2.0": { - "description": "Used to provide answers to user queries.", - "frequency": "Unclear at this time.", - "function": "Crawls sites to provide answers to user queries.", - "operator": "iAsk", - "respect": "No" - }, "ICC-Crawler": { "description": "Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business.", "frequency": "No information.", @@ 
-237,6 +230,13 @@ "operator": "Meta/Facebook", "respect": "[Yes](https://developers.facebook.com/docs/sharing/bot/)" }, + "iaskspider/2.0": { + "description": "Used to provide answers to user queries.", + "frequency": "Unclear at this time.", + "function": "Crawls sites to provide answers to user queries.", + "operator": "iAsk", + "respect": "No" + }, "img2dataset": { "description": "Downloads large sets of images into datasets for LLM training or other purposes.", "frequency": "At the discretion of img2dataset users.", From 6b8d7f5890d6bed722a95297996c054c210bd3b8 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Mon, 9 Sep 2024 01:16:21 +0000 Subject: [PATCH 016/201] Daily update from Dark Visitors --- robots.txt | 2 +- table-of-bot-metrics.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/robots.txt b/robots.txt index da17a01..e097a47 100644 --- a/robots.txt +++ b/robots.txt @@ -16,7 +16,6 @@ User-agent: Google-Extended User-agent: GoogleOther User-agent: GoogleOther-Image User-agent: GoogleOther-Video -User-agent: iaskspider/2.0 User-agent: ICC-Crawler User-agent: ImagesiftBot User-agent: Meta-ExternalAgent @@ -32,6 +31,7 @@ User-agent: YouBot User-agent: anthropic-ai User-agent: cohere-ai User-agent: facebookexternalhit +User-agent: iaskspider/2.0 User-agent: img2dataset User-agent: omgili User-agent: omgilibot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 39b7959..d9441b5 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -18,7 +18,6 @@ | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. 
| No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | | ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | @@ -34,6 +33,7 @@ | anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. 
| Scrapes data to train LLMs and AI products offered by Anthropic. | | cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | +| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | From 0106d4b15a4e1d3e0fe15ff0c137b53767afe7b4 Mon Sep 17 00:00:00 2001 From: Urvish Patel <169079981+urvish-p80@users.noreply.github.com> Date: Mon, 23 Sep 2024 08:19:27 -0400 Subject: [PATCH 017/201] Add additional resource - README.md A detailed blogpost to - See the live dashboard showing the websites that are blocking AI Bots such as GPTBot, CCBot, Google-extended and ByteSpider from crawling and scraping the content on their website. Learn which AI crawlers / scrapers do what and how to block them using Robots.txt. 
--- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c2d79f5..9502b0d 100644 --- a/README.md +++ b/README.md @@ -33,3 +33,4 @@ If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your- - [Blocking Bots With 11ty And Apache](https://flamedfury.com/posts/blocking-bots-with-11ty-and-apache/) by fLaMEd fury - [Blockin' bots on Netlify](https://www.jeremiak.com/blog/block-bots-netlify-edge-functions/) by Jeremia Kimelman - [Blocking AI web crawlers](https://underlap.org/blocking-ai-web-crawlers) by Glyn Normington +- [Block AI Bots from Crawling Websites Using Robots.txt](https://originality.ai/ai-bot-blocking) by Jonathan Gillham, Originality.AI From af05890b078a28251b8cd75e6a97ebf0441d1b35 Mon Sep 17 00:00:00 2001 From: Julian Mair <13933169+cityrolr@users.noreply.github.com> Date: Mon, 23 Sep 2024 23:27:27 +0200 Subject: [PATCH 018/201] Update README.md For people who don't use or don't want to use RSS for this, I've added a little explanation of how to subscribe to releases via GitHub. --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c2d79f5..3d79036 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ https://github.com/ai-robots-txt/ai.robots.txt/releases.atom You can subscribe with [Feedly](https://feedly.com/i/subscription/feed/https://github.com/ai-robots-txt/ai.robots.txt/releases.atom), [Inoreader](https://www.inoreader.com/?add_feed=https://github.com/ai-robots-txt/ai.robots.txt/releases.atom), [The Old Reader](https://theoldreader.com/feeds/subscribe?url=https://github.com/ai-robots-txt/ai.robots.txt/releases.atom), [Feedbin](https://feedbin.me/?subscribe=https://github.com/ai-robots-txt/ai.robots.txt/releases.atom), or any other reader app. +Alternatively, you can also subscribe to new releases with your GitHub account by clicking the ⬇️ on "Watch" button at the top of this page, clicking "Custom" and selecting "Releases". 
+ ## Report abusive crawlers If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform). From a6de89e6bdcc552a13ac7bd56b78017d251e01bc Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Thu, 26 Sep 2024 21:41:28 +0000 Subject: [PATCH 019/201] feat: make CCBot entry more accurate --- robots.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/robots.json b/robots.json index a53cebd..12ed898 100644 --- a/robots.json +++ b/robots.json @@ -42,10 +42,10 @@ "respect": "No" }, "CCBot": { - "description": "Sources data that is made openly available and is used to train AI models.", - "frequency": "Unclear at this time.", - "function": "Provides crawl data for an open source repository that has been used to train LLMs.", - "operator": "[Common Crawl](https://commoncrawl.org)", + "description": "Web archive going back to 2008. 
[Cited in thousands of research papers per year](https://commoncrawl.org/research-papers).", + "frequency": "Monthly at present.", + "function": "Provides open crawl dataset, used for many purposes, including Machine Learning/AI.", + "operator": "[Common Crawl Foundation](https://commoncrawl.org)", "respect": "[Yes](https://commoncrawl.org/ccbot)" }, "ChatGPT-User": { From 44d975c799130d58380b49f5c2bbb4ba33f1ae1a Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 27 Sep 2024 00:21:49 +0000 Subject: [PATCH 020/201] Merge pull request #42 from commoncrawl/main feat: make CCBot entry more accurate --- table-of-bot-metrics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index d9441b5..213b098 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -6,7 +6,7 @@ | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | | Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | -| CCBot | [Common Crawl](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides crawl data for an open source repository that has been used to train LLMs. | Unclear at this time. 
| Sources data that is made openly available and is used to train AI models. | +| CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | | ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | | Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | From 7851cea4fd4e233cd7ed74e48c3316d114e29f3b Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 27 Sep 2024 01:18:04 +0000 Subject: [PATCH 021/201] Daily update from Dark Visitors --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 12ed898..f8c6876 100644 --- a/robots.json +++ b/robots.json @@ -139,6 +139,13 @@ "operator": "[ImageSift](https://imagesift.com)", "respect": "[Yes](https://imagesift.com/about)" }, + "Kangaroo Bot": { + "description": "Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot", + "frequency": "Unclear at this time.", + "function": "AI Data Scrapers", + "operator": "Unclear at this time.", + "respect": "Unclear at this time." 
+ }, "Meta-ExternalAgent": { "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\"", "frequency": "No information.", From 632e9d65109584c2ed12ecf2f4a898e9a276e604 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 28 Sep 2024 01:17:19 +0000 Subject: [PATCH 022/201] Daily update from Dark Visitors --- robots.txt | 1 + table-of-bot-metrics.md | 1 + 2 files changed, 2 insertions(+) diff --git a/robots.txt b/robots.txt index e097a47..c11be04 100644 --- a/robots.txt +++ b/robots.txt @@ -18,6 +18,7 @@ User-agent: GoogleOther-Image User-agent: GoogleOther-Video User-agent: ICC-Crawler User-agent: ImagesiftBot +User-agent: Kangaroo Bot User-agent: Meta-ExternalAgent User-agent: Meta-ExternalFetcher User-agent: OAI-SearchBot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 213b098..dfeb86a 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -20,6 +20,7 @@ | GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. 
| Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | +| Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | | Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. 
| From 6a988be27f37e175539920f6cdbf6aa4c89170b3 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sat, 28 Sep 2024 13:58:00 -0700 Subject: [PATCH 023/201] chore: add sidetrade bot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index f8c6876..83c91a7 100644 --- a/robots.json +++ b/robots.json @@ -184,10 +184,17 @@ "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", "frequency": "No information.", - "function": "Scrapes data a variety of uses including training AI.", + "function": "Scrapes data for a variety of uses including training AI.", "operator": "[Zyte](https://www.zyte.com)", "respect": "Unclear at this time." }, + "Sidetrade indexer bot": { + "description": "AI product training.", + "frequency": "No information.", + "function": "Extracts data for a variety of uses including training AI.", + "operator": "[Sidetrade](https://www.sidetrade.com)", + "respect": "Unclear at this time." 
+ }, "Timpibot": { "description": "Makes data available for training AI models.", "frequency": "No information.", From 6d9ce1d62aa29117e0f7badc23e0b16d0afc3573 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 28 Sep 2024 20:58:18 +0000 Subject: [PATCH 024/201] chore: add sidetrade bot --- robots.txt | 1 + table-of-bot-metrics.md | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/robots.txt b/robots.txt index c11be04..a593d88 100644 --- a/robots.txt +++ b/robots.txt @@ -25,6 +25,7 @@ User-agent: OAI-SearchBot User-agent: PerplexityBot User-agent: PetalBot User-agent: Scrapy +User-agent: Sidetrade indexer bot User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index dfeb86a..a77b4bb 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -26,7 +26,8 @@ | OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | -| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." 
| +| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | +| Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | | Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. 
More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | From 9c2394f23bc83f06fbc8de410939045e5b3ba1bc Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 30 Sep 2024 16:25:20 -0700 Subject: [PATCH 025/201] chore: add ISSCyberRiskCrawler --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 83c91a7..c446cd7 100644 --- a/robots.json +++ b/robots.json @@ -132,6 +132,13 @@ "operator": "[NICT](https://nict.go.jp)", "respect": "Yes" }, + "ISSCyberRiskCrawler": { + "description": "Used to train machine learning based models to quantify cyber risk.", + "frequency": "No information.", + "function": "Scrapes data to train machine learning models.", + "operator": "[ISS-Corporate](https://iss-cyber.com)", + "respect": "No" + }, "ImagesiftBot": { "description": "Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images.", "frequency": "No information.", From 6da804e826b2f2b3d889389e961031d44a73f043 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Mon, 30 Sep 2024 23:50:18 +0000 Subject: [PATCH 026/201] chore: add ISSCyberRiskCrawler --- robots.txt | 1 + table-of-bot-metrics.md | 1 + 2 files changed, 2 insertions(+) diff --git a/robots.txt b/robots.txt index a593d88..739e44f 100644 --- a/robots.txt +++ b/robots.txt @@ -17,6 +17,7 @@ User-agent: GoogleOther User-agent: GoogleOther-Image User-agent: GoogleOther-Video User-agent: ICC-Crawler +User-agent: ISSCyberRiskCrawler User-agent: ImagesiftBot User-agent: Kangaroo Bot User-agent: Meta-ExternalAgent diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index a77b4bb..9f2ca90 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -19,6 +19,7 @@ | GoogleOther-Image | Google | 
[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | +| ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. 
More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | From dc15afe84705b5324cd9ad6440a696dff6cc49a2 Mon Sep 17 00:00:00 2001 From: Laker Turner Date: Mon, 7 Oct 2024 17:38:01 +0100 Subject: [PATCH 027/201] Update robots.json with Claude respect link --- robots.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/robots.json b/robots.json index c446cd7..6236c72 100644 --- a/robots.json +++ b/robots.json @@ -67,7 +67,7 @@ "frequency": "No information. provided.", "function": "Scrapes data to train Anthropic's AI products.", "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "Unclear at this time." + "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)" }, "Diffbot": { "description": "Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training.", @@ -279,4 +279,4 @@ "operator": "[Webz.io](https://webz.io/)", "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)" } -} \ No newline at end of file +} From 9be286626d0a47761a1fa3524fb6407f4fa2de38 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 8 Oct 2024 02:30:17 +0000 Subject: [PATCH 028/201] Merge pull request #43 from lxjv/main Update robots.json with Claude respect link --- table-of-bot-metrics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 9f2ca90..cf14641 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -9,7 +9,7 @@ | CCBot | [Common Crawl 
Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | | ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | | Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| ClaudeBot | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. 
| | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | From b1491d269460ca57581c2df7cf14b3f3fc4749f3 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Wed, 9 Oct 2024 01:17:37 +0000 Subject: [PATCH 029/201] Daily update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 6236c72..03db17b 100644 --- a/robots.json +++ b/robots.json @@ -279,4 +279,4 @@ "operator": "[Webz.io](https://webz.io/)", "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)" } -} +} \ No newline at end of file From b229f5b9366a0b9a77a4573589ed861de16db435 Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Thu, 17 Oct 2024 12:25:54 +0100 Subject: [PATCH 030/201] Re-order the FAQ The "why" question should come first. --- FAQ.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/FAQ.md b/FAQ.md index 49cbdfb..1b3f247 100644 --- a/FAQ.md +++ b/FAQ.md @@ -1,5 +1,15 @@ # Frequently asked questions +## Why should we block these crawlers? + +They're extractive, confer no benefit to the creators of data they're ingesting and also have wide-ranging negative externalities. + +**[How Tech Giants Cut Corners to Harvest Data for A.I.](https://www.nytimes.com/2024/04/06/technology/tech-giants-harvest-data-artificial-intelligence.html?unlocked_article_code=1.ik0.Ofja.L21c1wyW-0xj&ugrp=m)** +> OpenAI, Google and Meta ignored corporate policies, altered their own rules and discussed skirting copyright law as they sought online information to train their newest artificial intelligence systems. 
+ +**[How AI copyright lawsuits could make the whole industry go extinct](https://www.theverge.com/24062159/ai-copyright-fair-use-lawsuits-new-york-times-openai-chatgpt-decoder-podcast)** +> The New York Times' lawsuit against OpenAI is part of a broader, industry-shaking copyright challenge that could define the future of AI. + ## How do we know AI companies/bots respect `robots.txt`? The short answer is that we don't. `robots.txt` is a well-established standard, but compliance is voluntary. There is no enforcement mechanism. @@ -36,16 +46,6 @@ That depends on your stack. - Vercel - [Block AI Bots Firewall Rule](https://vercel.com/templates/firewall/block-ai-bots-firewall-rule) by Vercel -## Why should we block these crawlers? - -They're extractive, confer no benefit to the creators of data they're ingesting and also have wide-ranging negative externalities. - -**[How Tech Giants Cut Corners to Harvest Data for A.I.](https://www.nytimes.com/2024/04/06/technology/tech-giants-harvest-data-artificial-intelligence.html?unlocked_article_code=1.ik0.Ofja.L21c1wyW-0xj&ugrp=m)** -> OpenAI, Google and Meta ignored corporate policies, altered their own rules and discussed skirting copyright law as they sought online information to train their newest artificial intelligence systems. - -**[How AI copyright lawsuits could make the whole industry go extinct](https://www.theverge.com/24062159/ai-copyright-fair-use-lawsuits-new-york-times-openai-chatgpt-decoder-podcast)** -> The New York Times' lawsuit against OpenAI is part of a broader, industry-shaking copyright challenge that could define the future of AI. - ## How can I contribute? Open a pull request. It will be reviewed and acted upon appropriately. **We really appreciate contributions** — this is a community effort. 
From e6bb7cae9ead3e33078c3b9632a44b3234f241ba Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Thu, 17 Oct 2024 12:27:05 +0100 Subject: [PATCH 031/201] Augment the "why" FAQ Ref: https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2419078796 --- FAQ.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/FAQ.md b/FAQ.md index 1b3f247..4d58350 100644 --- a/FAQ.md +++ b/FAQ.md @@ -10,6 +10,8 @@ They're extractive, confer no benefit to the creators of data they're ingesting **[How AI copyright lawsuits could make the whole industry go extinct](https://www.theverge.com/24062159/ai-copyright-fair-use-lawsuits-new-york-times-openai-chatgpt-decoder-podcast)** > The New York Times' lawsuit against OpenAI is part of a broader, industry-shaking copyright challenge that could define the future of AI. +Crawlers also sometimes impact the performance of crawled sites, or even take them down. + ## How do we know AI companies/bots respect `robots.txt`? The short answer is that we don't. `robots.txt` is a well-established standard, but compliance is voluntary. There is no enforcement mechanism. 
From 7bb5efd462ffe1ef80e13468a660a82e6987df81 Mon Sep 17 00:00:00 2001 From: Ivan Sagalaev Date: Thu, 17 Oct 2024 21:08:43 -0400 Subject: [PATCH 032/201] Sort the content case-insensitively before dumping to JSON --- code/dark_visitors.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/code/dark_visitors.py b/code/dark_visitors.py index 484daa1..7d29c65 100644 --- a/code/dark_visitors.py +++ b/code/dark_visitors.py @@ -70,4 +70,6 @@ for section in soup.find_all("div", {"class": "agent-links-section"}): } print(f"Total: {len(existing_content)}") -Path("./robots.json").write_text(json.dumps(existing_content, indent=4, sort_keys=True)) \ No newline at end of file +sorted_keys = sorted(existing_content, key=lambda k: k.lower()) +existing_content = {k: existing_content[k] for k in sorted_keys} +Path("./robots.json").write_text(json.dumps(existing_content, indent=4)) \ No newline at end of file From cfaade6e2f8e55b462328262a381386079238943 Mon Sep 17 00:00:00 2001 From: Fabian Egli Date: Sat, 19 Oct 2024 00:01:15 +0200 Subject: [PATCH 033/201] log the diff in the update action daily_update.yml --- .github/workflows/daily_update.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/daily_update.yml b/.github/workflows/daily_update.yml index 2ae0398..e0ce102 100644 --- a/.github/workflows/daily_update.yml +++ b/.github/workflows/daily_update.yml @@ -16,6 +16,7 @@ jobs: git config --global user.name "dark-visitors" git config --global user.email "dark-visitors@users.noreply.github.com" python code/dark_visitors.py + git --no-pager diff git add -A git diff --quiet && git diff --staged --quiet || (git commit -m "Daily update from Dark Visitors" && git push) shell: bash @@ -24,4 +25,4 @@ jobs: uses: ./.github/workflows/main.yml secrets: inherit with: - message: "Daily update from Dark Visitors" \ No newline at end of file + message: "Daily update from Dark Visitors" From a46d06d436584273b99cdaa45837560f9d46204b Mon Sep 
17 00:00:00 2001 From: Fabian Egli Date: Sat, 19 Oct 2024 00:04:15 +0200 Subject: [PATCH 034/201] log changes made by the action in main.yml --- .github/workflows/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index bd10a45..140e0fd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -21,6 +21,7 @@ jobs: git config --global user.name "ai.robots.txt" git config --global user.email "ai.robots.txt@users.noreply.github.com" php -f code/action.php + git --no-pager diff git add -A if [ -n "${{ inputs.message }}" ]; then git commit -m "${{ inputs.message }}" From b3068a8d90c6cb091b25d6125d758cd02b774bbb Mon Sep 17 00:00:00 2001 From: Fabian Egli Date: Sat, 19 Oct 2024 00:12:25 +0200 Subject: [PATCH 035/201] add some signposts --- .github/workflows/daily_update.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/daily_update.yml b/.github/workflows/daily_update.yml index e0ce102..6b6624a 100644 --- a/.github/workflows/daily_update.yml +++ b/.github/workflows/daily_update.yml @@ -15,7 +15,9 @@ jobs: pip install beautifulsoup4 requests git config --global user.name "dark-visitors" git config --global user.email "dark-visitors@users.noreply.github.com" + echo "Running update script ..." python code/dark_visitors.py + echo "... done." 
git --no-pager diff git add -A git diff --quiet && git diff --staged --quiet || (git commit -m "Daily update from Dark Visitors" && git push) From b584f613cd29e1fbb88d5e55e24dda85f506927d Mon Sep 17 00:00:00 2001 From: Fabian Egli Date: Sat, 19 Oct 2024 00:13:09 +0200 Subject: [PATCH 036/201] add some signposts to the log --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 140e0fd..3e3ddfc 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,7 +20,9 @@ jobs: - run: | git config --global user.name "ai.robots.txt" git config --global user.email "ai.robots.txt@users.noreply.github.com" + echo "Running update script ..." php -f code/action.php + echo "... done." git --no-pager diff git add -A if [ -n "${{ inputs.message }}" ]; then From 25adc6b8027e832119fd73fa679c89cd602d2e62 Mon Sep 17 00:00:00 2001 From: Fabian Egli Date: Sat, 19 Oct 2024 00:28:41 +0200 Subject: [PATCH 037/201] log git repository status --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3e3ddfc..ea8edc5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,6 +20,8 @@ jobs: - run: | git config --global user.name "ai.robots.txt" git config --global user.email "ai.robots.txt@users.noreply.github.com" + git log -1 + git status echo "Running update script ..." php -f code/action.php echo "... done." 
From faf81efb126cfb94ad59b1164b6176357bdb337c Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sat, 19 Oct 2024 01:17:15 +0000 Subject: [PATCH 038/201] Daily update from Dark Visitors --- robots.json | 274 ++++++++++++++++++++++++++-------------------------- 1 file changed, 137 insertions(+), 137 deletions(-) diff --git a/robots.json b/robots.json index 03db17b..db308d7 100644 --- a/robots.json +++ b/robots.json @@ -14,72 +14,93 @@ "respect": "Yes" }, "Amazonbot": { - "description": "Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses.", - "frequency": "No information. provided.", - "function": "Service improvement and enabling answers for Alexa users.", "operator": "Amazon", - "respect": "Yes" + "respect": "Yes", + "function": "Service improvement and enabling answers for Alexa users.", + "frequency": "No information. provided.", + "description": "Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses." + }, + "anthropic-ai": { + "operator": "[Anthropic](https://www.anthropic.com)", + "respect": "Unclear at this time.", + "function": "Scrapes data to train Anthropic's AI products.", + "frequency": "No information. provided.", + "description": "Scrapes data to train LLMs and AI products offered by Anthropic." }, "Applebot": { - "description": "Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot", - "frequency": "Unclear at this time.", - "function": "AI Search Crawlers", "operator": "Unclear at this time.", - "respect": "Unclear at this time." 
+ "respect": "Unclear at this time.", + "function": "AI Search Crawlers", + "frequency": "Unclear at this time.", + "description": "Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot" }, "Applebot-Extended": { - "description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools.", - "frequency": "Unclear at this time.", - "function": "Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others.", "operator": "[Apple](https://support.apple.com/en-us/119829#datausage)", - "respect": "Yes" + "respect": "Yes", + "function": "Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others.", + "frequency": "Unclear at this time.", + "description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools." }, "Bytespider": { - "description": "Downloads data to train LLMS, including ChatGPT competitors.", - "frequency": "Unclear at this time.", - "function": "LLM training.", "operator": "ByteDance", - "respect": "No" + "respect": "No", + "function": "LLM training.", + "frequency": "Unclear at this time.", + "description": "Downloads data to train LLMS, including ChatGPT competitors." }, "CCBot": { - "description": "Web archive going back to 2008. 
[Cited in thousands of research papers per year](https://commoncrawl.org/research-papers).", - "frequency": "Monthly at present.", - "function": "Provides open crawl dataset, used for many purposes, including Machine Learning/AI.", "operator": "[Common Crawl Foundation](https://commoncrawl.org)", - "respect": "[Yes](https://commoncrawl.org/ccbot)" + "respect": "[Yes](https://commoncrawl.org/ccbot)", + "function": "Provides open crawl dataset, used for many purposes, including Machine Learning/AI.", + "frequency": "Monthly at present.", + "description": "Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers)." }, "ChatGPT-User": { - "description": "Used by plugins in ChatGPT to answer queries based on user input.", - "frequency": "Only when prompted by a user.", - "function": "Takes action based on user prompts.", "operator": "[OpenAI](https://openai.com)", - "respect": "Yes" + "respect": "Yes", + "function": "Takes action based on user prompts.", + "frequency": "Only when prompted by a user.", + "description": "Used by plugins in ChatGPT to answer queries based on user input." }, "Claude-Web": { - "description": "Scrapes data to train LLMs and AI products offered by Anthropic.", - "frequency": "No information. provided.", - "function": "Scrapes data to train Anthropic's AI products.", "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "Unclear at this time." + "respect": "Unclear at this time.", + "function": "Scrapes data to train Anthropic's AI products.", + "frequency": "No information. provided.", + "description": "Scrapes data to train LLMs and AI products offered by Anthropic." }, "ClaudeBot": { - "description": "Scrapes data to train LLMs and AI products offered by Anthropic.", - "frequency": "No information. 
provided.", - "function": "Scrapes data to train Anthropic's AI products.", "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)" + "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", + "function": "Scrapes data to train Anthropic's AI products.", + "frequency": "No information. provided.", + "description": "Scrapes data to train LLMs and AI products offered by Anthropic." + }, + "cohere-ai": { + "operator": "[Cohere](https://cohere.com)", + "respect": "Unclear at this time.", + "function": "Retrieves data to provide responses to user-initiated prompts.", + "frequency": "Takes action based on user prompts.", + "description": "Retrieves data based on user prompts." }, "Diffbot": { - "description": "Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training.", - "frequency": "Unclear at this time.", - "function": "Aggregates structured web data for monitoring and AI model training.", "operator": "[Diffbot](https://www.diffbot.com/)", - "respect": "At the discretion of Diffbot users." + "respect": "At the discretion of Diffbot users.", + "function": "Aggregates structured web data for monitoring and AI model training.", + "frequency": "Unclear at this time.", + "description": "Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training." 
}, "FacebookBot": { - "description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically.", - "frequency": "Up to 1 page per second", + "operator": "Meta/Facebook", + "respect": "[Yes](https://developers.facebook.com/docs/sharing/bot/)", "function": "Training language models", + "frequency": "Up to 1 page per second", + "description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically." + }, + "facebookexternalhit": { + "description": "Unclear at this time.", + "frequency": "Unclear at this time.", + "function": "No information.", "operator": "Meta/Facebook", "respect": "[Yes](https://developers.facebook.com/docs/sharing/bot/)" }, @@ -90,19 +111,12 @@ "operator": "Unknown", "respect": "[Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler)" }, - "GPTBot": { - "description": "Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies.", - "frequency": "No information.", - "function": "Scrapes data to train OpenAI's products.", - "operator": "[OpenAI](https://openai.com)", - "respect": "Yes" - }, "Google-Extended": { - "description": "Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search.", - "frequency": "No information.", - "function": "LLM training.", "operator": "Google", - "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)" + "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)", + "function": "LLM training.", + "frequency": "No information.", + "description": "Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search." 
}, "GoogleOther": { "description": "\"Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development.\"", @@ -125,6 +139,20 @@ "operator": "Google", "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)" }, + "GPTBot": { + "operator": "[OpenAI](https://openai.com)", + "respect": "Yes", + "function": "Scrapes data to train OpenAI's products.", + "frequency": "No information.", + "description": "Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies." + }, + "iaskspider/2.0": { + "description": "Used to provide answers to user queries.", + "frequency": "Unclear at this time.", + "function": "Crawls sites to provide answers to user queries.", + "operator": "iAsk", + "respect": "No" + }, "ICC-Crawler": { "description": "Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business.", "frequency": "No information.", @@ -132,13 +160,6 @@ "operator": "[NICT](https://nict.go.jp)", "respect": "Yes" }, - "ISSCyberRiskCrawler": { - "description": "Used to train machine learning based models to quantify cyber risk.", - "frequency": "No information.", - "function": "Scrapes data to train machine learning models.", - "operator": "[ISS-Corporate](https://iss-cyber.com)", - "respect": "No" - }, "ImagesiftBot": { "description": "Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. 
Our web intelligence products use this index to enable search and retrieval of similar images.", "frequency": "No information.", @@ -146,40 +167,68 @@ "operator": "[ImageSift](https://imagesift.com)", "respect": "[Yes](https://imagesift.com/about)" }, - "Kangaroo Bot": { - "description": "Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot", - "frequency": "Unclear at this time.", - "function": "AI Data Scrapers", - "operator": "Unclear at this time.", + "img2dataset": { + "description": "Downloads large sets of images into datasets for LLM training or other purposes.", + "frequency": "At the discretion of img2dataset users.", + "function": "Scrapes images for use in LLMs.", + "operator": "[img2dataset](https://github.com/rom1504/img2dataset)", "respect": "Unclear at this time." }, + "ISSCyberRiskCrawler": { + "description": "Used to train machine learning based models to quantify cyber risk.", + "frequency": "No information.", + "function": "Scrapes data to train machine learning models.", + "operator": "[ISS-Corporate](https://iss-cyber.com)", + "respect": "No" + }, + "Kangaroo Bot": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. 
More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot" + }, "Meta-ExternalAgent": { - "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\"", - "frequency": "No information.", - "function": "Used to train models and improve products.", "operator": "[Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)", - "respect": "Yes." + "respect": "Yes.", + "function": "Used to train models and improve products.", + "frequency": "No information.", + "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\"" }, "Meta-ExternalFetcher": { - "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher", - "frequency": "Unclear at this time.", - "function": "AI Assistants", "operator": "Unclear at this time.", - "respect": "Unclear at this time." + "respect": "Unclear at this time.", + "function": "AI Assistants", + "frequency": "Unclear at this time.", + "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" }, "OAI-SearchBot": { - "description": "Crawls sites to surface as results in SearchGPT.", - "frequency": "No information.", - "function": "Search result generation.", "operator": "[OpenAI](https://openai.com)", - "respect": "[Yes](https://platform.openai.com/docs/bots)" + "respect": "[Yes](https://platform.openai.com/docs/bots)", + "function": "Search result generation.", + "frequency": "No information.", + "description": "Crawls sites to surface as results in SearchGPT." 
+ }, + "omgili": { + "operator": "[Webz.io](https://webz.io/)", + "respect": "[Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/)", + "function": "Data is sold.", + "frequency": "No information.", + "description": "Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training." + }, + "omgilibot": { + "description": "Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io.", + "frequency": "No information.", + "function": "Data is sold.", + "operator": "[Webz.io](https://webz.io/)", + "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)" }, "PerplexityBot": { - "description": "Operated by Perplexity to obtain results in response to user queries.", - "frequency": "Takes action based on user prompts.", - "function": "Used to answer queries at the request of users.", "operator": "[Perplexity](https://www.perplexity.ai/)", - "respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)" + "respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)", + "function": "Used to answer queries at the request of users.", + "frequency": "Takes action based on user prompts.", + "description": "Operated by Perplexity to obtain results in response to user queries." }, "PetalBot": { "description": "Operated by Huawei to provide search and AI assistant services.", @@ -203,11 +252,11 @@ "respect": "Unclear at this time." }, "Timpibot": { - "description": "Makes data available for training AI models.", - "frequency": "No information.", - "function": "Scrapes data for use in training LLMs.", "operator": "[Timpi](https://timpi.io)", - "respect": "Unclear at this time." 
+ "respect": "Unclear at this time.", + "function": "Scrapes data for use in training LLMs.", + "frequency": "No information.", + "description": "Makes data available for training AI models." }, "VelenPublicWebCrawler": { "description": "\"Our goal with this crawler is to build business datasets and machine learning models to better understand the web.\"", @@ -217,66 +266,17 @@ "respect": "[Yes](https://velen.io)" }, "Webzio-Extended": { - "description": "Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended", - "frequency": "Unclear at this time.", - "function": "AI Data Scrapers", "operator": "Unclear at this time.", - "respect": "Unclear at this time." + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended" }, "YouBot": { - "description": "Retrieves data used for You.com web search engine and LLMs.", - "frequency": "No information.", - "function": "Scrapes data for search engine and LLMs.", "operator": "[You](https://about.you.com/youchat/)", - "respect": "[Yes](https://about.you.com/youbot/)" - }, - "anthropic-ai": { - "description": "Scrapes data to train LLMs and AI products offered by Anthropic.", - "frequency": "No information. provided.", - "function": "Scrapes data to train Anthropic's AI products.", - "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "Unclear at this time." 
- }, - "cohere-ai": { - "description": "Retrieves data based on user prompts.", - "frequency": "Takes action based on user prompts.", - "function": "Retrieves data to provide responses to user-initiated prompts.", - "operator": "[Cohere](https://cohere.com)", - "respect": "Unclear at this time." - }, - "facebookexternalhit": { - "description": "Unclear at this time.", - "frequency": "Unclear at this time.", - "function": "No information.", - "operator": "Meta/Facebook", - "respect": "[Yes](https://developers.facebook.com/docs/sharing/bot/)" - }, - "iaskspider/2.0": { - "description": "Used to provide answers to user queries.", - "frequency": "Unclear at this time.", - "function": "Crawls sites to provide answers to user queries.", - "operator": "iAsk", - "respect": "No" - }, - "img2dataset": { - "description": "Downloads large sets of images into datasets for LLM training or other purposes.", - "frequency": "At the discretion of img2dataset users.", - "function": "Scrapes images for use in LLMs.", - "operator": "[img2dataset](https://github.com/rom1504/img2dataset)", - "respect": "Unclear at this time." - }, - "omgili": { - "description": "Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training.", + "respect": "[Yes](https://about.you.com/youbot/)", + "function": "Scrapes data for search engine and LLMs.", "frequency": "No information.", - "function": "Data is sold.", - "operator": "[Webz.io](https://webz.io/)", - "respect": "[Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/)" - }, - "omgilibot": { - "description": "Legacy user agent initially used for Omgili search engine. 
Unknown if still used, `omgili` agent still used by Webz.io.", - "frequency": "No information.", - "function": "Data is sold.", - "operator": "[Webz.io](https://webz.io/)", - "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)" + "description": "Retrieves data used for You.com web search engine and LLMs." } } \ No newline at end of file From bdf30be7dcce79152af6b95d4520c23600a4ca13 Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Sat, 19 Oct 2024 04:33:46 +0100 Subject: [PATCH 039/201] Dump out file contents in PHP script --- code/action.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/code/action.php b/code/action.php index 52ebbe6..f6a1d3d 100644 --- a/code/action.php +++ b/code/action.php @@ -12,6 +12,8 @@ It generates: */ $robots = json_decode(file_get_contents('robots.json'), 1); +var_dump($robots); + $robots_txt = null; $robots_table = '| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |'."\n"; @@ -24,5 +26,8 @@ foreach($robots as $robot => $details) { $robots_txt .= 'Disallow: /'; +var_dump($robots_txt); +var_dump($robots_table); + file_put_contents('robots.txt', $robots_txt); file_put_contents('table-of-bot-metrics.md', $robots_table); From a80bd18fb8f27cf234fa2f21e79a6fa99f7878dd Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 19 Oct 2024 03:34:29 +0000 Subject: [PATCH 040/201] Dump out file contents in PHP script --- robots.txt | 18 +++++++++--------- table-of-bot-metrics.md | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/robots.txt b/robots.txt index 739e44f..13681f3 100644 --- a/robots.txt +++ b/robots.txt @@ -1,6 +1,7 @@ User-agent: AI2Bot User-agent: Ai2Bot-Dolma User-agent: Amazonbot +User-agent: anthropic-ai User-agent: Applebot User-agent: Applebot-Extended User-agent: Bytespider @@ -8,21 +9,27 @@ User-agent: CCBot User-agent: ChatGPT-User User-agent: Claude-Web User-agent: ClaudeBot +User-agent: cohere-ai 
User-agent: Diffbot User-agent: FacebookBot +User-agent: facebookexternalhit User-agent: FriendlyCrawler -User-agent: GPTBot User-agent: Google-Extended User-agent: GoogleOther User-agent: GoogleOther-Image User-agent: GoogleOther-Video +User-agent: GPTBot +User-agent: iaskspider/2.0 User-agent: ICC-Crawler -User-agent: ISSCyberRiskCrawler User-agent: ImagesiftBot +User-agent: img2dataset +User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot User-agent: Meta-ExternalAgent User-agent: Meta-ExternalFetcher User-agent: OAI-SearchBot +User-agent: omgili +User-agent: omgilibot User-agent: PerplexityBot User-agent: PetalBot User-agent: Scrapy @@ -31,11 +38,4 @@ User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended User-agent: YouBot -User-agent: anthropic-ai -User-agent: cohere-ai -User-agent: facebookexternalhit -User-agent: iaskspider/2.0 -User-agent: img2dataset -User-agent: omgili -User-agent: omgilibot Disallow: / \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index cf14641..111ccbb 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -3,6 +3,7 @@ | AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | | Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information. provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | +| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. 
| | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | | Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | @@ -10,21 +11,27 @@ | ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | | Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. 
| Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | -| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | | Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. 
| "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | +| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | | ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | -| ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | +| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. 
| Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | +| ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | | Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | +| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | +| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. 
| Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | @@ -33,10 +40,3 @@ | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | | Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | -| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. 
| Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | -| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | -| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | -| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | -| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | -| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. 
| From 38a388097cd18f620da38391398608edd1d4786b Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Sat, 19 Oct 2024 04:42:27 +0100 Subject: [PATCH 041/201] Fix typo and trigger rerun of main job --- code/dark_visitors.py | 2 +- robots.json | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/code/dark_visitors.py b/code/dark_visitors.py index 7d29c65..5de65fe 100644 --- a/code/dark_visitors.py +++ b/code/dark_visitors.py @@ -34,7 +34,7 @@ for section in soup.find_all("div", {"class": "agent-links-section"}): default_values = { "Unclear at this time.", - "No information. provided.", + "No information provided.", "No information.", "No explicit frequency provided." } diff --git a/robots.json b/robots.json index db308d7..c50d63c 100644 --- a/robots.json +++ b/robots.json @@ -1,14 +1,14 @@ { "AI2Bot": { "description": "Explores 'certain domains' to find web content.", - "frequency": "No information. provided.", + "frequency": "No information provided.", "function": "Content is used to train open language models.", "operator": "[Ai2](https://allenai.org/crawler)", "respect": "Yes" }, "Ai2Bot-Dolma": { "description": "Explores 'certain domains' to find web content.", - "frequency": "No information. provided.", + "frequency": "No information provided.", "function": "Content is used to train open language models.", "operator": "[Ai2](https://allenai.org/crawler)", "respect": "Yes" @@ -17,14 +17,14 @@ "operator": "Amazon", "respect": "Yes", "function": "Service improvement and enabling answers for Alexa users.", - "frequency": "No information. provided.", + "frequency": "No information provided.", "description": "Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses." }, "anthropic-ai": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "Unclear at this time.", "function": "Scrapes data to train Anthropic's AI products.", - "frequency": "No information. 
provided.", + "frequency": "No information provided.", "description": "Scrapes data to train LLMs and AI products offered by Anthropic." }, "Applebot": { @@ -66,14 +66,14 @@ "operator": "[Anthropic](https://www.anthropic.com)", "respect": "Unclear at this time.", "function": "Scrapes data to train Anthropic's AI products.", - "frequency": "No information. provided.", + "frequency": "No information provided.", "description": "Scrapes data to train LLMs and AI products offered by Anthropic." }, "ClaudeBot": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", "function": "Scrapes data to train Anthropic's AI products.", - "frequency": "No information. provided.", + "frequency": "No information provided.", "description": "Scrapes data to train LLMs and AI products offered by Anthropic." }, "cohere-ai": { From 6a359e7fd719285b75c4bf6aa8d95403d7573c4e Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 19 Oct 2024 03:43:00 +0000 Subject: [PATCH 042/201] Fix typo and trigger rerun of main job --- table-of-bot-metrics.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 111ccbb..0e6884c 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -1,16 +1,16 @@ | Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | |-----|----------|-----------------------|----------|------------------|-------------| -| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. | -| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information. provided. | Explores 'certain domains' to find web content. 
| -| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information. provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | -| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | +| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | +| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | +| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | | Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... 
[that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | | ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information. provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. 
| | cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | From 6bb598820ec670db0c333b4950362a8844c3c0ab Mon Sep 17 00:00:00 2001 From: fabianegli Date: Fri, 18 Oct 2024 23:24:13 +0200 Subject: [PATCH 043/201] ignore venv --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 496ee2c..edef0f5 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ -.DS_Store \ No newline at end of file +.DS_Store +.venv +venv From 0c05461f84ca0d8ad9fa525821c10dbd0937db92 Mon Sep 17 00:00:00 2001 From: fabianegli Date: Sat, 19 Oct 2024 13:06:34 +0200 Subject: [PATCH 044/201] simplify repo and added some tests --- .github/workflows/daily_update.yml | 9 +- .github/workflows/main.yml | 36 --- .gitignore | 1 + code/action.php | 33 --- code/dark_visitors.py | 161 ++++++++------ code/test_files/robots.json | 282 ++++++++++++++++++++++++ code/test_files/robots.txt | 41 ++++ code/test_files/table-of-bot-metrics.md | 42 ++++ code/tests.py | 21 ++ robots.txt | 2 +- table-of-bot-metrics.md | 80 +++---- 11 files changed, 527 insertions(+), 181 deletions(-) delete mode 100644 .github/workflows/main.yml delete mode 100644 code/action.php create mode 100644 code/test_files/robots.json create mode 100644 
code/test_files/robots.txt create mode 100644 code/test_files/table-of-bot-metrics.md create mode 100644 code/tests.py diff --git a/.github/workflows/daily_update.yml b/.github/workflows/daily_update.yml index 6b6624a..11eeab3 100644 --- a/.github/workflows/daily_update.yml +++ b/.github/workflows/daily_update.yml @@ -1,5 +1,8 @@ name: Daily Update from Dark Visitors on: + push: + branches: + - "main" schedule: - cron: "0 0 * * *" @@ -22,9 +25,3 @@ jobs: git add -A git diff --quiet && git diff --staged --quiet || (git commit -m "Daily update from Dark Visitors" && git push) shell: bash - call-main: - needs: dark-visitors - uses: ./.github/workflows/main.yml - secrets: inherit - with: - message: "Daily update from Dark Visitors" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100644 index ea8edc5..0000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,36 +0,0 @@ -on: - workflow_call: - inputs: - message: - type: string - required: true - description: The message to commit - push: - paths: - - 'robots.json' - -jobs: - ai-robots-txt: - runs-on: ubuntu-latest - name: ai-robots-txt - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 2 - - run: | - git config --global user.name "ai.robots.txt" - git config --global user.email "ai.robots.txt@users.noreply.github.com" - git log -1 - git status - echo "Running update script ..." - php -f code/action.php - echo "... done." 
- git --no-pager diff - git add -A - if [ -n "${{ inputs.message }}" ]; then - git commit -m "${{ inputs.message }}" - else - git commit -m "${{ github.event.head_commit.message }}" - fi - git push - shell: bash diff --git a/.gitignore b/.gitignore index edef0f5..cbe1c29 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .DS_Store .venv venv +__pycache__ diff --git a/code/action.php b/code/action.php deleted file mode 100644 index f6a1d3d..0000000 --- a/code/action.php +++ /dev/null @@ -1,33 +0,0 @@ - $details) { - $robots_txt .= 'User-agent: '.$robot."\n"; - $robots_table .= '| '.$robot.' | '.$details['operator'].' | '.$details['respect'].' | '.$details['function'].' | '.$details['frequency'].' | '.$details['description'].' | '."\n"; -} - -$robots_txt .= 'Disallow: /'; - -var_dump($robots_txt); -var_dump($robots_table); - -file_put_contents('robots.txt', $robots_txt); -file_put_contents('table-of-bot-metrics.md', $robots_table); diff --git a/code/dark_visitors.py b/code/dark_visitors.py index 5de65fe..838ce67 100644 --- a/code/dark_visitors.py +++ b/code/dark_visitors.py @@ -4,72 +4,103 @@ from pathlib import Path import requests from bs4 import BeautifulSoup -session = requests.Session() -response = session.get("https://darkvisitors.com/agents") -soup = BeautifulSoup(response.text, "html.parser") -existing_content = json.loads(Path("./robots.json").read_text()) -to_include = [ - "AI Assistants", - "AI Data Scrapers", - "AI Search Crawlers", - # "Archivers", - # "Developer Helpers", - # "Fetchers", - # "Intelligence Gatherers", - # "Scrapers", - # "Search Engine Crawlers", - # "SEO Crawlers", - # "Uncategorized", - "Undocumented AI Agents" -] +def get_updated_robots_json(): + session = requests.Session() + response = session.get("https://darkvisitors.com/agents") + soup = BeautifulSoup(response.text, "html.parser") -for section in soup.find_all("div", {"class": "agent-links-section"}): - category = section.find("h2").get_text() - if category not in 
to_include: - continue - for agent in section.find_all("a", href=True): - name = agent.find("div", {"class": "agent-name"}).get_text().strip() - desc = agent.find("p").get_text().strip() - - default_values = { - "Unclear at this time.", - "No information provided.", - "No information.", - "No explicit frequency provided." - } - default_value = "Unclear at this time." - - # Parse the operator information from the description if possible - operator = default_value - if "operated by " in desc: - try: - operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip() - except Exception as e: - print(f"Error: {e}") - - def consolidate(field: str, value: str) -> str: - # New entry - if name not in existing_content: - return value - # New field - if field not in existing_content[name]: - return value - # Unclear value - if existing_content[name][field] in default_values and value not in default_values: - return value - # Existing value - return existing_content[name][field] + existing_content = json.loads(Path("./robots.json").read_text()) + to_include = [ + "AI Assistants", + "AI Data Scrapers", + "AI Search Crawlers", + # "Archivers", + # "Developer Helpers", + # "Fetchers", + # "Intelligence Gatherers", + # "Scrapers", + # "Search Engine Crawlers", + # "SEO Crawlers", + # "Uncategorized", + "Undocumented AI Agents", + ] - existing_content[name] = { - "operator": consolidate("operator", operator), - "respect": consolidate("respect", default_value), - "function": consolidate("function", f"{category}"), - "frequency": consolidate("frequency", default_value), - "description": consolidate("description", f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}") - } + for section in soup.find_all("div", {"class": "agent-links-section"}): + category = section.find("h2").get_text() + if category not in to_include: + continue + for agent in section.find_all("a", href=True): + name = agent.find("div", {"class": "agent-name"}).get_text().strip() + 
desc = agent.find("p").get_text().strip() -print(f"Total: {len(existing_content)}") -sorted_keys = sorted(existing_content, key=lambda k: k.lower()) -existing_content = {k: existing_content[k] for k in sorted_keys} -Path("./robots.json").write_text(json.dumps(existing_content, indent=4)) \ No newline at end of file + default_values = { + "Unclear at this time.", + "No information provided.", + "No information.", + "No explicit frequency provided.", + } + default_value = "Unclear at this time." + + # Parse the operator information from the description if possible + operator = default_value + if "operated by " in desc: + try: + operator = desc.split("operated by ", 1)[1].split(".", 1)[0].strip() + except Exception as e: + print(f"Error: {e}") + + def consolidate(field: str, value: str) -> str: + # New entry + if name not in existing_content: + return value + # New field + if field not in existing_content[name]: + return value + # Unclear value + if ( + existing_content[name][field] in default_values + and value not in default_values + ): + return value + # Existing value + return existing_content[name][field] + + existing_content[name] = { + "operator": consolidate("operator", operator), + "respect": consolidate("respect", default_value), + "function": consolidate("function", f"{category}"), + "frequency": consolidate("frequency", default_value), + "description": consolidate( + "description", + f"{desc} More info can be found at https://darkvisitors.com/agents{agent['href']}", + ), + } + + print(f"Total: {len(existing_content)}") + sorted_keys = sorted(existing_content, key=lambda k: k.lower()) + sorted_robots = {k: existing_content[k] for k in sorted_keys} + return sorted_robots + + +def json_to_txt(robots_json): + robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys()) + robots_txt += "\nDisallow: /\n" + return robots_txt + + +def json_to_table(robots_json): + table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | 
Description |\n" + table += "|-----|----------|-----------------------|----------|------------------|-------------|\n" + + for name, robot in robots_json.items(): + table += f'| {name} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n' + + return table + + +if __name__ == "__main__": + robots_json = get_updated_robots_json() + Path("./robots.json").write_text(json.dumps(robots_json, indent=4)) + Path("./robots.txt").write_text(json_to_txt(robots_json)) + Path("./table-of-bot-metrics.md").write_text(json_to_table(robots_json)) diff --git a/code/test_files/robots.json b/code/test_files/robots.json new file mode 100644 index 0000000..c50d63c --- /dev/null +++ b/code/test_files/robots.json @@ -0,0 +1,282 @@ +{ + "AI2Bot": { + "description": "Explores 'certain domains' to find web content.", + "frequency": "No information provided.", + "function": "Content is used to train open language models.", + "operator": "[Ai2](https://allenai.org/crawler)", + "respect": "Yes" + }, + "Ai2Bot-Dolma": { + "description": "Explores 'certain domains' to find web content.", + "frequency": "No information provided.", + "function": "Content is used to train open language models.", + "operator": "[Ai2](https://allenai.org/crawler)", + "respect": "Yes" + }, + "Amazonbot": { + "operator": "Amazon", + "respect": "Yes", + "function": "Service improvement and enabling answers for Alexa users.", + "frequency": "No information provided.", + "description": "Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses." + }, + "anthropic-ai": { + "operator": "[Anthropic](https://www.anthropic.com)", + "respect": "Unclear at this time.", + "function": "Scrapes data to train Anthropic's AI products.", + "frequency": "No information provided.", + "description": "Scrapes data to train LLMs and AI products offered by Anthropic." 
+ }, + "Applebot": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Search Crawlers", + "frequency": "Unclear at this time.", + "description": "Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot" + }, + "Applebot-Extended": { + "operator": "[Apple](https://support.apple.com/en-us/119829#datausage)", + "respect": "Yes", + "function": "Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others.", + "frequency": "Unclear at this time.", + "description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools." + }, + "Bytespider": { + "operator": "ByteDance", + "respect": "No", + "function": "LLM training.", + "frequency": "Unclear at this time.", + "description": "Downloads data to train LLMS, including ChatGPT competitors." + }, + "CCBot": { + "operator": "[Common Crawl Foundation](https://commoncrawl.org)", + "respect": "[Yes](https://commoncrawl.org/ccbot)", + "function": "Provides open crawl dataset, used for many purposes, including Machine Learning/AI.", + "frequency": "Monthly at present.", + "description": "Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers)." + }, + "ChatGPT-User": { + "operator": "[OpenAI](https://openai.com)", + "respect": "Yes", + "function": "Takes action based on user prompts.", + "frequency": "Only when prompted by a user.", + "description": "Used by plugins in ChatGPT to answer queries based on user input." 
+ }, + "Claude-Web": { + "operator": "[Anthropic](https://www.anthropic.com)", + "respect": "Unclear at this time.", + "function": "Scrapes data to train Anthropic's AI products.", + "frequency": "No information provided.", + "description": "Scrapes data to train LLMs and AI products offered by Anthropic." + }, + "ClaudeBot": { + "operator": "[Anthropic](https://www.anthropic.com)", + "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", + "function": "Scrapes data to train Anthropic's AI products.", + "frequency": "No information provided.", + "description": "Scrapes data to train LLMs and AI products offered by Anthropic." + }, + "cohere-ai": { + "operator": "[Cohere](https://cohere.com)", + "respect": "Unclear at this time.", + "function": "Retrieves data to provide responses to user-initiated prompts.", + "frequency": "Takes action based on user prompts.", + "description": "Retrieves data based on user prompts." + }, + "Diffbot": { + "operator": "[Diffbot](https://www.diffbot.com/)", + "respect": "At the discretion of Diffbot users.", + "function": "Aggregates structured web data for monitoring and AI model training.", + "frequency": "Unclear at this time.", + "description": "Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training." + }, + "FacebookBot": { + "operator": "Meta/Facebook", + "respect": "[Yes](https://developers.facebook.com/docs/sharing/bot/)", + "function": "Training language models", + "frequency": "Up to 1 page per second", + "description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically." 
+ }, + "facebookexternalhit": { + "description": "Unclear at this time.", + "frequency": "Unclear at this time.", + "function": "No information.", + "operator": "Meta/Facebook", + "respect": "[Yes](https://developers.facebook.com/docs/sharing/bot/)" + }, + "FriendlyCrawler": { + "description": "Unclear who the operator is; but data is used for training/machine learning.", + "frequency": "Unclear at this time.", + "function": "We are using the data from the crawler to build datasets for machine learning experiments.", + "operator": "Unknown", + "respect": "[Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler)" + }, + "Google-Extended": { + "operator": "Google", + "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)", + "function": "LLM training.", + "frequency": "No information.", + "description": "Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search." + }, + "GoogleOther": { + "description": "\"Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development.\"", + "frequency": "No information.", + "function": "Scrapes data.", + "operator": "Google", + "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)" + }, + "GoogleOther-Image": { + "description": "\"Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development.\"", + "frequency": "No information.", + "function": "Scrapes data.", + "operator": "Google", + "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)" + }, + "GoogleOther-Video": { + "description": "\"Used by various product teams for fetching publicly accessible content from sites. 
For example, it may be used for one-off crawls for internal research and development.\"", + "frequency": "No information.", + "function": "Scrapes data.", + "operator": "Google", + "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)" + }, + "GPTBot": { + "operator": "[OpenAI](https://openai.com)", + "respect": "Yes", + "function": "Scrapes data to train OpenAI's products.", + "frequency": "No information.", + "description": "Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies." + }, + "iaskspider/2.0": { + "description": "Used to provide answers to user queries.", + "frequency": "Unclear at this time.", + "function": "Crawls sites to provide answers to user queries.", + "operator": "iAsk", + "respect": "No" + }, + "ICC-Crawler": { + "description": "Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business.", + "frequency": "No information.", + "function": "Scrapes data to train and support AI technologies.", + "operator": "[NICT](https://nict.go.jp)", + "respect": "Yes" + }, + "ImagesiftBot": { + "description": "Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. 
Our web intelligence products use this index to enable search and retrieval of similar images.", + "frequency": "No information.", + "function": "ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products", + "operator": "[ImageSift](https://imagesift.com)", + "respect": "[Yes](https://imagesift.com/about)" + }, + "img2dataset": { + "description": "Downloads large sets of images into datasets for LLM training or other purposes.", + "frequency": "At the discretion of img2dataset users.", + "function": "Scrapes images for use in LLMs.", + "operator": "[img2dataset](https://github.com/rom1504/img2dataset)", + "respect": "Unclear at this time." + }, + "ISSCyberRiskCrawler": { + "description": "Used to train machine learning based models to quantify cyber risk.", + "frequency": "No information.", + "function": "Scrapes data to train machine learning models.", + "operator": "[ISS-Corporate](https://iss-cyber.com)", + "respect": "No" + }, + "Kangaroo Bot": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. 
More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot" + }, + "Meta-ExternalAgent": { + "operator": "[Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)", + "respect": "Yes.", + "function": "Used to train models and improve products.", + "frequency": "No information.", + "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\"" + }, + "Meta-ExternalFetcher": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Assistants", + "frequency": "Unclear at this time.", + "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" + }, + "OAI-SearchBot": { + "operator": "[OpenAI](https://openai.com)", + "respect": "[Yes](https://platform.openai.com/docs/bots)", + "function": "Search result generation.", + "frequency": "No information.", + "description": "Crawls sites to surface as results in SearchGPT." + }, + "omgili": { + "operator": "[Webz.io](https://webz.io/)", + "respect": "[Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/)", + "function": "Data is sold.", + "frequency": "No information.", + "description": "Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training." + }, + "omgilibot": { + "description": "Legacy user agent initially used for Omgili search engine. 
Unknown if still used, `omgili` agent still used by Webz.io.", + "frequency": "No information.", + "function": "Data is sold.", + "operator": "[Webz.io](https://webz.io/)", + "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)" + }, + "PerplexityBot": { + "operator": "[Perplexity](https://www.perplexity.ai/)", + "respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)", + "function": "Used to answer queries at the request of users.", + "frequency": "Takes action based on user prompts.", + "description": "Operated by Perplexity to obtain results in response to user queries." + }, + "PetalBot": { + "description": "Operated by Huawei to provide search and AI assistant services.", + "frequency": "No explicit frequency provided.", + "function": "Used to provide recommendations in Hauwei assistant and AI search services.", + "operator": "[Huawei](https://huawei.com/)", + "respect": "Yes" + }, + "Scrapy": { + "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", + "frequency": "No information.", + "function": "Scrapes data for a variety of uses including training AI.", + "operator": "[Zyte](https://www.zyte.com)", + "respect": "Unclear at this time." + }, + "Sidetrade indexer bot": { + "description": "AI product training.", + "frequency": "No information.", + "function": "Extracts data for a variety of uses including training AI.", + "operator": "[Sidetrade](https://www.sidetrade.com)", + "respect": "Unclear at this time." + }, + "Timpibot": { + "operator": "[Timpi](https://timpi.io)", + "respect": "Unclear at this time.", + "function": "Scrapes data for use in training LLMs.", + "frequency": "No information.", + "description": "Makes data available for training AI models." 
+ }, + "VelenPublicWebCrawler": { + "description": "\"Our goal with this crawler is to build business datasets and machine learning models to better understand the web.\"", + "frequency": "No information.", + "function": "Scrapes data for business data sets and machine learning models.", + "operator": "[Velen Crawler](https://velen.io)", + "respect": "[Yes](https://velen.io)" + }, + "Webzio-Extended": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended" + }, + "YouBot": { + "operator": "[You](https://about.you.com/youchat/)", + "respect": "[Yes](https://about.you.com/youbot/)", + "function": "Scrapes data for search engine and LLMs.", + "frequency": "No information.", + "description": "Retrieves data used for You.com web search engine and LLMs." 
+ } +} \ No newline at end of file diff --git a/code/test_files/robots.txt b/code/test_files/robots.txt new file mode 100644 index 0000000..927f6f4 --- /dev/null +++ b/code/test_files/robots.txt @@ -0,0 +1,41 @@ +User-agent: AI2Bot +User-agent: Ai2Bot-Dolma +User-agent: Amazonbot +User-agent: anthropic-ai +User-agent: Applebot +User-agent: Applebot-Extended +User-agent: Bytespider +User-agent: CCBot +User-agent: ChatGPT-User +User-agent: Claude-Web +User-agent: ClaudeBot +User-agent: cohere-ai +User-agent: Diffbot +User-agent: FacebookBot +User-agent: facebookexternalhit +User-agent: FriendlyCrawler +User-agent: Google-Extended +User-agent: GoogleOther +User-agent: GoogleOther-Image +User-agent: GoogleOther-Video +User-agent: GPTBot +User-agent: iaskspider/2.0 +User-agent: ICC-Crawler +User-agent: ImagesiftBot +User-agent: img2dataset +User-agent: ISSCyberRiskCrawler +User-agent: Kangaroo Bot +User-agent: Meta-ExternalAgent +User-agent: Meta-ExternalFetcher +User-agent: OAI-SearchBot +User-agent: omgili +User-agent: omgilibot +User-agent: PerplexityBot +User-agent: PetalBot +User-agent: Scrapy +User-agent: Sidetrade indexer bot +User-agent: Timpibot +User-agent: VelenPublicWebCrawler +User-agent: Webzio-Extended +User-agent: YouBot +Disallow: / diff --git a/code/test_files/table-of-bot-metrics.md b/code/test_files/table-of-bot-metrics.md new file mode 100644 index 0000000..257ba99 --- /dev/null +++ b/code/test_files/table-of-bot-metrics.md @@ -0,0 +1,42 @@ +| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | +|-----|----------|-----------------------|----------|------------------|-------------| +| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | +| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. 
| Explores 'certain domains' to find web content. | +| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | +| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | +| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | +| CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | +| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. 
| +| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | +| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | +| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | +| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | +| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. 
| Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | +| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | +| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | +| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. 
| +| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | +| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | +| ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | +| Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | +| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. 
More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | +| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | +| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | +| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | +| Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. 
| +| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | +| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | +| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | diff --git a/code/tests.py b/code/tests.py new file mode 100644 index 0000000..ffa7574 --- /dev/null +++ b/code/tests.py @@ -0,0 +1,21 @@ +"""These tests can be run with pytest. 
+This requires pytest: pip install pytest +cd to the `code` directory and run `pytest` +""" + +import json +from pathlib import Path + +from dark_visitors import json_to_txt, json_to_table + + +def test_robots_txt_creation(): + robots_json = json.loads(Path("test_files/robots.json").read_text()) + robots_txt = json_to_txt(robots_json) + assert Path("test_files/robots.txt").read_text() == robots_txt + + +def test_table_of_bot_metrices_md(): + robots_json = json.loads(Path("test_files/robots.json").read_text()) + robots_table = json_to_table(robots_json) + assert Path("test_files/table-of-bot-metrics.md").read_text() == robots_table diff --git a/robots.txt b/robots.txt index 13681f3..927f6f4 100644 --- a/robots.txt +++ b/robots.txt @@ -38,4 +38,4 @@ User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended User-agent: YouBot -Disallow: / \ No newline at end of file +Disallow: / diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 0e6884c..257ba99 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -1,42 +1,42 @@ | Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | |-----|----------|-----------------------|----------|------------------|-------------| -| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | -| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | -| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | -| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. 
| No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | -| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | -| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | -| CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | -| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. 
| -| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | -| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | -| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | -| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | -| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | -| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. 
| -| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | -| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | -| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | -| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. 
| Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | -| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | -| ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | -| Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | -| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | -| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | -| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. 
| -| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | -| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | -| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | -| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | -| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | -| Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | -| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. 
| -| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | -| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | -| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | +| AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | +| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | +| Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | +| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. 
More info can be found at https://darkvisitors.com/agents/agents/applebot | +| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | +| CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | +| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | +| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. 
| +| Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | +| FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | +| FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | +| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | +| GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." 
| +| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | +| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | +| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | +| ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | +| img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | +| ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. 
| +| Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | +| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | +| omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | +| omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. 
| Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | +| PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | +| Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | +| Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | +| VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | +| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. 
| From 7e2b3ab0372080ba885a6c1969d8101135f6bae8 Mon Sep 17 00:00:00 2001 From: fabianegli Date: Sat, 19 Oct 2024 19:09:34 +0200 Subject: [PATCH 045/201] rename action --- .github/workflows/{daily_update.yml => ai_robots_update.yml} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename .github/workflows/{daily_update.yml => ai_robots_update.yml} (95%) diff --git a/.github/workflows/daily_update.yml b/.github/workflows/ai_robots_update.yml similarity index 95% rename from .github/workflows/daily_update.yml rename to .github/workflows/ai_robots_update.yml index 11eeab3..ea5c760 100644 --- a/.github/workflows/daily_update.yml +++ b/.github/workflows/ai_robots_update.yml @@ -1,4 +1,4 @@ -name: Daily Update from Dark Visitors +name: Updates for AI robots files on: push: branches: From 6ab8fb2d37082f524ca7a5d724669e8175e9f94f Mon Sep 17 00:00:00 2001 From: fabianegli Date: Sat, 19 Oct 2024 19:11:01 +0200 Subject: [PATCH 046/201] no more failure when run without network --- code/dark_visitors.py | 69 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/code/dark_visitors.py b/code/dark_visitors.py index 838ce67..820c9c1 100644 --- a/code/dark_visitors.py +++ b/code/dark_visitors.py @@ -5,12 +5,27 @@ import requests from bs4 import BeautifulSoup -def get_updated_robots_json(): - session = requests.Session() - response = session.get("https://darkvisitors.com/agents") - soup = BeautifulSoup(response.text, "html.parser") +def load_robots_json(): + """Load the robots.json contents into a dictionary.""" + return json.loads(Path("./robots.json").read_text(encoding="utf-8")) - existing_content = json.loads(Path("./robots.json").read_text()) + +def get_agent_soup(): + """Retrieve current known agents from darkvisitors.com""" + session = requests.Session() + try: + response = session.get("https://darkvisitors.com/agents") + except requests.exceptions.ConnectionError: + print( + "ERROR: Could not gather the current agents 
from https://darkvisitors.com/agents" + ) + return + return BeautifulSoup(response.text, "html.parser") + + +def updated_robots_json(soup): + """Update AI scraper information with data from darkvisitors.""" + existing_content = load_robots_json() to_include = [ "AI Assistants", "AI Data Scrapers", @@ -83,13 +98,31 @@ def get_updated_robots_json(): return sorted_robots +def ingest_darkvisitors(): + + old_robots_json = load_robots_json() + soup = get_agent_soup() + if soup: + robots_json = updated_robots_json(soup) + print( + "robots.json is unchanged." + if robots_json == old_robots_json + else "robots.json got updates." + ) + Path("./robots.json").write_text( + json.dumps(robots_json, indent=4), encoding="utf-8" + ) + + def json_to_txt(robots_json): + """Compose the robots.txt from the robots.json file.""" robots_txt = "\n".join(f"User-agent: {k}" for k in robots_json.keys()) robots_txt += "\nDisallow: /\n" return robots_txt def json_to_table(robots_json): + """Compose a markdown table with the information in robots.json""" table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n" table += "|-----|----------|-----------------------|----------|------------------|-------------|\n" @@ -99,8 +132,26 @@ def json_to_table(robots_json): return table +def update_file_if_changed(file_name, converter): + """Update files if newer content is available and log the (in)actions.""" + new_content = converter(load_robots_json()) + old_content = Path(file_name).read_text(encoding="utf-8") + if old_content == new_content: + print(f"{file_name} is already up to date.") + else: + Path(file_name).write_text(new_content, encoding="utf-8") + print(f"{file_name} has been updated.") + + +def conversions(): + """Triggers the conversions from the json file.""" + update_file_if_changed(file_name="./robots.txt", converter=json_to_txt) + update_file_if_changed( + file_name="./table-of-bot-metrics.md", + converter=json_to_table, + ) + + if __name__ == 
"__main__": - robots_json = get_updated_robots_json() - Path("./robots.json").write_text(json.dumps(robots_json, indent=4)) - Path("./robots.txt").write_text(json_to_txt(robots_json)) - Path("./table-of-bot-metrics.md").write_text(json_to_table(robots_json)) + ingest_darkvisitors() + conversions() From 3ab22bc49887325dde1ce74d0b5952fcef87e2ea Mon Sep 17 00:00:00 2001 From: fabianegli Date: Sat, 19 Oct 2024 19:56:41 +0200 Subject: [PATCH 047/201] make conversions and updates separately triggerable --- .github/workflows/ai_robots_update.yml | 13 ++++++++--- code/dark_visitors.py | 30 ++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ai_robots_update.yml b/.github/workflows/ai_robots_update.yml index ea5c760..b346e10 100644 --- a/.github/workflows/ai_robots_update.yml +++ b/.github/workflows/ai_robots_update.yml @@ -18,10 +18,17 @@ jobs: pip install beautifulsoup4 requests git config --global user.name "dark-visitors" git config --global user.email "dark-visitors@users.noreply.github.com" - echo "Running update script ..." - python code/dark_visitors.py + echo "Updating robots.json with data from darkvisitor.com ..." + python code/dark_visitors.py --update echo "... done." git --no-pager diff git add -A - git diff --quiet && git diff --staged --quiet || (git commit -m "Daily update from Dark Visitors" && git push) + git diff --quiet && git diff --staged --quiet || (git commit -m "Update from Dark Visitors" && git push) + + echo "Updating robots.txt and table-of-bot-metrics.md if necessary ..." + python code/dark_visitors.py --convert + echo "... done." 
+ git --no-pager diff + git add -A + git diff --quiet && git diff --staged --quiet || (git commit -m "Updated from new robots.json" && git push) shell: bash diff --git a/code/dark_visitors.py b/code/dark_visitors.py index 820c9c1..cf44e8e 100644 --- a/code/dark_visitors.py +++ b/code/dark_visitors.py @@ -153,5 +153,31 @@ def conversions(): if __name__ == "__main__": - ingest_darkvisitors() - conversions() + import argparse + + parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser( + prog="ai-robots", + description="Collects and updates information about web scrapers of AI companies.", + epilog="One of the flags must be set.\n", + ) + parser.add_argument( + "--update", + action="store_true", + help="Update the robots.json file with data from darkvisitors.com/agents", + ) + parser.add_argument( + "--convert", + action="store_true", + help="Create the robots.txt and markdown table from robots.json", + ) + args = parser.parse_args() + + if not (args.update or args.convert): + print("ERROR: please provide one of the possible flags.") + parser.print_help() + + if args.update: + ingest_darkvisitors() + if args.convert: + conversions() From fe5f4076738888d51a7f8719f503294996050d6f Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sun, 27 Oct 2024 00:54:47 +0000 Subject: [PATCH 048/201] Update from Dark Visitors --- robots.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/robots.json b/robots.json index c50d63c..4922c84 100644 --- a/robots.json +++ b/robots.json @@ -1,10 +1,10 @@ { "AI2Bot": { - "description": "Explores 'certain domains' to find web content.", - "frequency": "No information provided.", - "function": "Content is used to train open language models.", "operator": "[Ai2](https://allenai.org/crawler)", - "respect": "Yes" + "respect": "Yes", + "function": "Content is used to train open language models.", + "frequency": "No information provided.", + "description": "Explores 'certain domains' to find web content." 
}, "Ai2Bot-Dolma": { "description": "Explores 'certain domains' to find web content.", From bc0a0ad0e97f93c152d582ad7b67543b399a3158 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 29 Oct 2024 00:52:12 +0000 Subject: [PATCH 049/201] Update from Dark Visitors --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 4922c84..dbd5ae4 100644 --- a/robots.json +++ b/robots.json @@ -90,6 +90,13 @@ "frequency": "Unclear at this time.", "description": "Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training." }, + "DuckAssistBot": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Assistants", + "frequency": "Unclear at this time.", + "description": "DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot" + }, "FacebookBot": { "operator": "Meta/Facebook", "respect": "[Yes](https://developers.facebook.com/docs/sharing/bot/)", From 9e06cf3bc9eb9cd4947eb1a887cfa07ecde117b3 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 29 Oct 2024 00:52:12 +0000 Subject: [PATCH 050/201] Updated from new robots.json --- robots.txt | 1 + table-of-bot-metrics.md | 1 + 2 files changed, 2 insertions(+) diff --git a/robots.txt b/robots.txt index 927f6f4..4b9cc6a 100644 --- a/robots.txt +++ b/robots.txt @@ -11,6 +11,7 @@ User-agent: Claude-Web User-agent: ClaudeBot User-agent: cohere-ai User-agent: Diffbot +User-agent: DuckAssistBot User-agent: FacebookBot User-agent: facebookexternalhit User-agent: FriendlyCrawler diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 257ba99..fe6baa2 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -13,6 +13,7 @@ | ClaudeBot | [Anthropic](https://www.anthropic.com) | 
[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | +| DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | | facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. 
| From 9295b6a963f0ccba30392e005419b42eed2b264e Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Sat, 9 Nov 2024 04:45:47 +0000 Subject: [PATCH 051/201] Clarify our rationale I deleted the point about excessive load on crawled sites as any other crawler could potentially be guilty of this and I wouldn't want our scope to creep to all crawlers. Ref: https://github.com/ai-robots-txt/ai.robots.txt/issues/53#issuecomment-2466042550 --- FAQ.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/FAQ.md b/FAQ.md index 4d58350..15db540 100644 --- a/FAQ.md +++ b/FAQ.md @@ -2,7 +2,7 @@ ## Why should we block these crawlers? -They're extractive, confer no benefit to the creators of data they're ingesting and also have wide-ranging negative externalities. +They're extractive, confer no benefit to the creators of data they're ingesting and also have wide-ranging negative externalities: particularly copyright abuse and environmental impact. **[How Tech Giants Cut Corners to Harvest Data for A.I.](https://www.nytimes.com/2024/04/06/technology/tech-giants-harvest-data-artificial-intelligence.html?unlocked_article_code=1.ik0.Ofja.L21c1wyW-0xj&ugrp=m)** > OpenAI, Google and Meta ignored corporate policies, altered their own rules and discussed skirting copyright law as they sought online information to train their newest artificial intelligence systems. @@ -10,7 +10,11 @@ They're extractive, confer no benefit to the creators of data they're ingesting **[How AI copyright lawsuits could make the whole industry go extinct](https://www.theverge.com/24062159/ai-copyright-fair-use-lawsuits-new-york-times-openai-chatgpt-decoder-podcast)** > The New York Times' lawsuit against OpenAI is part of a broader, industry-shaking copyright challenge that could define the future of AI. -Crawlers also sometimes impact the performance of crawled sites, or even take them down. 
+**[Reconciling the contrasting narratives on the environmental impact of large language models](https://www.nature.com/articles/s41598-024-76682-6) +> Studies have shown that the training of just one LLM can consume as much energy as five cars do across their lifetimes. The water footprint of AI is also substantial; for example, recent work has highlighted that water consumption associated with AI models involves data centers using millions of gallons of water per day for cooling. Additionally, the energy consumption and carbon emissions of AI are projected to grow quickly in the coming years [...]. + +**[Scientists Predict AI to Generate Millions of Tons of E-Waste](https://www.sciencealert.com/scientists-predict-ai-to-generate-millions-of-tons-of-e-waste) +> we could end up with between 1.2 million and 5 million metric tons of additional electronic waste by the end of this decade [the 2020's]. ## How do we know AI companies/bots respect `robots.txt`? From 2c88909be39ad7e0b113e1245fbf1d134b267e8b Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Sun, 10 Nov 2024 01:02:18 +0000 Subject: [PATCH 052/201] Fix formatting --- FAQ.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FAQ.md b/FAQ.md index 15db540..044f710 100644 --- a/FAQ.md +++ b/FAQ.md @@ -10,10 +10,10 @@ They're extractive, confer no benefit to the creators of data they're ingesting **[How AI copyright lawsuits could make the whole industry go extinct](https://www.theverge.com/24062159/ai-copyright-fair-use-lawsuits-new-york-times-openai-chatgpt-decoder-podcast)** > The New York Times' lawsuit against OpenAI is part of a broader, industry-shaking copyright challenge that could define the future of AI. 
-**[Reconciling the contrasting narratives on the environmental impact of large language models](https://www.nature.com/articles/s41598-024-76682-6) +**[Reconciling the contrasting narratives on the environmental impact of large language models](https://www.nature.com/articles/s41598-024-76682-6)** > Studies have shown that the training of just one LLM can consume as much energy as five cars do across their lifetimes. The water footprint of AI is also substantial; for example, recent work has highlighted that water consumption associated with AI models involves data centers using millions of gallons of water per day for cooling. Additionally, the energy consumption and carbon emissions of AI are projected to grow quickly in the coming years [...]. -**[Scientists Predict AI to Generate Millions of Tons of E-Waste](https://www.sciencealert.com/scientists-predict-ai-to-generate-millions-of-tons-of-e-waste) +**[Scientists Predict AI to Generate Millions of Tons of E-Waste](https://www.sciencealert.com/scientists-predict-ai-to-generate-millions-of-tons-of-e-waste)** > we could end up with between 1.2 million and 5 million metric tons of additional electronic waste by the end of this decade [the 2020's]. ## How do we know AI companies/bots respect `robots.txt`? From d50615d3947e524ca92fbaa05eaac6c5bc59121d Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Sun, 10 Nov 2024 01:06:13 +0000 Subject: [PATCH 053/201] Improve formatting This clarifies the scope of the tip is Apache httpd. --- FAQ.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FAQ.md b/FAQ.md index 044f710..967cf41 100644 --- a/FAQ.md +++ b/FAQ.md @@ -42,8 +42,8 @@ That depends on your stack. 
- Apache httpd - [Blockin' bots.](https://ethanmarcotte.com/wrote/blockin-bots/) by Ethan Marcotte - [Blocking Bots With 11ty And Apache](https://flamedfury.com/posts/blocking-bots-with-11ty-and-apache/) by fLaMEd fury -> [!TIP] -> The snippets in these articles all use `mod_rewrite`, which [should be considered a last resort](https://httpd.apache.org/docs/trunk/rewrite/avoid.html). A good alternative that's less resource-intensive is `mod_setenvif`; see [httpd docs](https://httpd.apache.org/docs/trunk/rewrite/access.html#blocking-of-robots) for an example. You should also consider [setting this up in `httpd.conf` instead of `.htaccess`](https://httpd.apache.org/docs/trunk/howto/htaccess.html#when) if it's available to you. + > [!TIP] + > The snippets in these articles all use `mod_rewrite`, which [should be considered a last resort](https://httpd.apache.org/docs/trunk/rewrite/avoid.html). A good alternative that's less resource-intensive is `mod_setenvif`; see [httpd docs](https://httpd.apache.org/docs/trunk/rewrite/access.html#blocking-of-robots) for an example. You should also consider [setting this up in `httpd.conf` instead of `.htaccess`](https://httpd.apache.org/docs/trunk/howto/htaccess.html#when) if it's available to you. - Netlify - [Blockin' bots on Netlify](https://www.jeremiak.com/blog/block-bots-netlify-edge-functions/) by Jeremia Kimelman - Cloudflare From adfd4af872d5fd91817915ac0ca539165e1e0cd2 Mon Sep 17 00:00:00 2001 From: "Y. 
Meyer-Norwood" <106889957+norwd@users.noreply.github.com> Date: Mon, 11 Nov 2024 12:58:40 +1300 Subject: [PATCH 054/201] Create upload-robots-txt-file-to-release.yml --- .../upload-robots-txt-file-to-release.yml | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/upload-robots-txt-file-to-release.yml diff --git a/.github/workflows/upload-robots-txt-file-to-release.yml b/.github/workflows/upload-robots-txt-file-to-release.yml new file mode 100644 index 0000000..df57bee --- /dev/null +++ b/.github/workflows/upload-robots-txt-file-to-release.yml @@ -0,0 +1,23 @@ +--- + +name: "Upload robots.txt file to release" +run-name: "Upload robots.txt file to release" + +on: + release: + types: + - published + +jobs: + upload-robots-txt-file-to-release: + name: "Upload robots.txt file to release" + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@v4 + + - name: "Upload" + run: gh --repo "${REPO}" release upload "${TAG}" robots.txt + env: + REPO: ${{ github.repository }} + TAG: ${{ github.event.release.tag_name }} From 94ceb3cffdc3001dccfdfbd48140cd8057116242 Mon Sep 17 00:00:00 2001 From: "Y. 
Meyer-Norwood" <106889957+norwd@users.noreply.github.com> Date: Mon, 11 Nov 2024 13:04:55 +1300 Subject: [PATCH 055/201] Add authentication for `gh` command --- .github/workflows/upload-robots-txt-file-to-release.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/upload-robots-txt-file-to-release.yml b/.github/workflows/upload-robots-txt-file-to-release.yml index df57bee..370feb6 100644 --- a/.github/workflows/upload-robots-txt-file-to-release.yml +++ b/.github/workflows/upload-robots-txt-file-to-release.yml @@ -8,6 +8,9 @@ on: types: - published +permissions: + contents: write + jobs: upload-robots-txt-file-to-release: name: "Upload robots.txt file to release" @@ -19,5 +22,6 @@ jobs: - name: "Upload" run: gh --repo "${REPO}" release upload "${TAG}" robots.txt env: + GH_TOKEN: ${{ github.token }} REPO: ${{ github.repository }} TAG: ${{ github.event.release.tag_name }} From e8f0784a0058f8a737ef150ae132ecb13051979d Mon Sep 17 00:00:00 2001 From: "Y. Meyer-Norwood" <106889957+norwd@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:26:37 +1300 Subject: [PATCH 056/201] Explicitly use release tag for checkout --- .github/workflows/upload-robots-txt-file-to-release.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/upload-robots-txt-file-to-release.yml b/.github/workflows/upload-robots-txt-file-to-release.yml index 370feb6..5bf2b29 100644 --- a/.github/workflows/upload-robots-txt-file-to-release.yml +++ b/.github/workflows/upload-robots-txt-file-to-release.yml @@ -18,6 +18,8 @@ jobs: steps: - name: "Checkout" uses: actions/checkout@v4 + with: + ref: ${{ github.event.release.tag_name }} - name: "Upload" run: gh --repo "${REPO}" release upload "${TAG}" robots.txt From 80002f5e17e5fd3ab87cd74c17f6a102e9cd634e Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Tue, 19 Nov 2024 03:33:45 +0000 Subject: [PATCH 057/201] Allow facebookexternalhit At the time of writing, this crawler does not appear to be for the purpose of AI. 
See: https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/ (accessed on 19 November 2024). Fixes https://github.com/ai-robots-txt/ai.robots.txt/issues/40 --- robots.json | 7 ------- 1 file changed, 7 deletions(-) diff --git a/robots.json b/robots.json index dbd5ae4..21fc1de 100644 --- a/robots.json +++ b/robots.json @@ -104,13 +104,6 @@ "frequency": "Up to 1 page per second", "description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically." }, - "facebookexternalhit": { - "description": "Unclear at this time.", - "frequency": "Unclear at this time.", - "function": "No information.", - "operator": "Meta/Facebook", - "respect": "[Yes](https://developers.facebook.com/docs/sharing/bot/)" - }, "FriendlyCrawler": { "description": "Unclear who the operator is; but data is used for training/machine learning.", "frequency": "Unclear at this time.", From 58985737e783aff099fc9dd06b895179d7833c34 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 19 Nov 2024 16:46:21 +0000 Subject: [PATCH 058/201] Updated from new robots.json --- robots.txt | 1 - table-of-bot-metrics.md | 1 - 2 files changed, 2 deletions(-) diff --git a/robots.txt b/robots.txt index 4b9cc6a..1865026 100644 --- a/robots.txt +++ b/robots.txt @@ -13,7 +13,6 @@ User-agent: cohere-ai User-agent: Diffbot User-agent: DuckAssistBot User-agent: FacebookBot -User-agent: facebookexternalhit User-agent: FriendlyCrawler User-agent: Google-Extended User-agent: GoogleOther diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index fe6baa2..d1eed4b 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -15,7 +15,6 @@ | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. 
| Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | -| facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | | Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." 
| From 37065f911805370426a9a33b1af5f400b24a0c16 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sun, 24 Nov 2024 00:57:05 +0000 Subject: [PATCH 059/201] Update from Dark Visitors --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 21fc1de..51a5b50 100644 --- a/robots.json +++ b/robots.json @@ -223,6 +223,13 @@ "operator": "[Webz.io](https://webz.io/)", "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)" }, + "PanguBot": { + "operator": "the Chinese company Huawei", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot" + }, "PerplexityBot": { "operator": "[Perplexity](https://www.perplexity.ai/)", "respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)", From 609ddca39295d1ec8ec47bc4c1c609135bc238d3 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sun, 24 Nov 2024 00:57:06 +0000 Subject: [PATCH 060/201] Updated from new robots.json --- robots.txt | 1 + table-of-bot-metrics.md | 1 + 2 files changed, 2 insertions(+) diff --git a/robots.txt b/robots.txt index 1865026..c41ed6d 100644 --- a/robots.txt +++ b/robots.txt @@ -30,6 +30,7 @@ User-agent: Meta-ExternalFetcher User-agent: OAI-SearchBot User-agent: omgili User-agent: omgilibot +User-agent: PanguBot User-agent: PerplexityBot User-agent: PetalBot User-agent: Scrapy diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index d1eed4b..e905d2f 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -32,6 +32,7 @@ | OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | 
Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | +| PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." 
| From bd38c3019412afdb53ee394e2dd8ccc9294b83a3 Mon Sep 17 00:00:00 2001 From: fabianegli Date: Tue, 26 Nov 2024 09:12:11 +0100 Subject: [PATCH 061/201] specify file encodings in tests --- code/tests.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/code/tests.py b/code/tests.py index ffa7574..16e4fe3 100644 --- a/code/tests.py +++ b/code/tests.py @@ -10,12 +10,12 @@ from dark_visitors import json_to_txt, json_to_table def test_robots_txt_creation(): - robots_json = json.loads(Path("test_files/robots.json").read_text()) + robots_json = json.loads(Path("test_files/robots.json").read_text(encoding="utf-8")) robots_txt = json_to_txt(robots_json) - assert Path("test_files/robots.txt").read_text() == robots_txt + assert Path("test_files/robots.txt").read_text(encoding="utf-8") == robots_txt def test_table_of_bot_metrices_md(): - robots_json = json.loads(Path("test_files/robots.json").read_text()) + robots_json = json.loads(Path("test_files/robots.json").read_text(encoding="utf-8")) robots_table = json_to_table(robots_json) - assert Path("test_files/table-of-bot-metrics.md").read_text() == robots_table + assert Path("test_files/table-of-bot-metrics.md").read_text(encoding="utf-8") == robots_table From b64284d6846da62e1ad78146c4fcb6e7ff0eb80c Mon Sep 17 00:00:00 2001 From: fabianegli Date: Tue, 26 Nov 2024 09:41:46 +0100 Subject: [PATCH 062/201] restore correct attribution logic to before PR #55 --- .github/workflows/ai_robots_update.yml | 16 ++++------- .github/workflows/main.yml | 38 ++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 10 deletions(-) create mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/ai_robots_update.yml b/.github/workflows/ai_robots_update.yml index b346e10..654b0b5 100644 --- a/.github/workflows/ai_robots_update.yml +++ b/.github/workflows/ai_robots_update.yml @@ -1,8 +1,5 @@ name: Updates for AI robots files on: - push: - branches: - - "main" schedule: - cron: "0 0 * * *" @@ -24,11 +21,10 
@@ jobs: git --no-pager diff git add -A git diff --quiet && git diff --staged --quiet || (git commit -m "Update from Dark Visitors" && git push) - - echo "Updating robots.txt and table-of-bot-metrics.md if necessary ..." - python code/dark_visitors.py --convert - echo "... done." - git --no-pager diff - git add -A - git diff --quiet && git diff --staged --quiet || (git commit -m "Updated from new robots.json" && git push) shell: bash + call-main: + needs: dark-visitors + uses: ./.github/workflows/main.yml + secrets: inherit + with: + message: "Update from Dark Visitors" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..a4c47d6 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,38 @@ +on: + workflow_call: + inputs: + message: + type: string + required: true + description: The message to commit + push: + paths: + - 'robots.json' + branches: + - "main" + +jobs: + ai-robots-txt: + runs-on: ubuntu-latest + name: ai-robots-txt + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 2 + - run: | + git config --global user.name "ai.robots.txt" + git config --global user.email "ai.robots.txt@users.noreply.github.com" + git log -1 + git status + echo "Updating robots.txt and table-of-bot-metrics.md if necessary ..." + python code/dark_visitors.py --convert + echo "... done." + git --no-pager diff + git add -A + if [ -n "${{ inputs.message }}" ]; then + git commit -m "${{ inputs.message }}" + else + git commit -m "${{ github.event.head_commit.message }}" + fi + git push + shell: bash From eb8e1a49b5fd36b57490b37831d26013223b4eb9 Mon Sep 17 00:00:00 2001 From: fabianegli Date: Fri, 29 Nov 2024 09:02:47 +0100 Subject: [PATCH 063/201] Revert "specify file encodings in tests" This reverts commit bd38c3019412afdb53ee394e2dd8ccc9294b83a3. 
--- code/tests.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/code/tests.py b/code/tests.py index 16e4fe3..ffa7574 100644 --- a/code/tests.py +++ b/code/tests.py @@ -10,12 +10,12 @@ from dark_visitors import json_to_txt, json_to_table def test_robots_txt_creation(): - robots_json = json.loads(Path("test_files/robots.json").read_text(encoding="utf-8")) + robots_json = json.loads(Path("test_files/robots.json").read_text()) robots_txt = json_to_txt(robots_json) - assert Path("test_files/robots.txt").read_text(encoding="utf-8") == robots_txt + assert Path("test_files/robots.txt").read_text() == robots_txt def test_table_of_bot_metrices_md(): - robots_json = json.loads(Path("test_files/robots.json").read_text(encoding="utf-8")) + robots_json = json.loads(Path("test_files/robots.json").read_text()) robots_table = json_to_table(robots_json) - assert Path("test_files/table-of-bot-metrics.md").read_text(encoding="utf-8") == robots_table + assert Path("test_files/table-of-bot-metrics.md").read_text() == robots_table From 2036a68c1f6d5b217439976000e8f7162e2dbb3f Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Wed, 4 Dec 2024 00:55:50 +0000 Subject: [PATCH 064/201] Update from Dark Visitors --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 51a5b50..1c00b63 100644 --- a/robots.json +++ b/robots.json @@ -83,6 +83,13 @@ "frequency": "Takes action based on user prompts.", "description": "Retrieves data based on user prompts." }, + "cohere-training-data-crawler": { + "operator": "Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. 
More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler" + }, "Diffbot": { "operator": "[Diffbot](https://www.diffbot.com/)", "respect": "At the discretion of Diffbot users.", From 3a43714908dd7df42a9ecf35c107e609bc2f9120 Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Sat, 4 Jan 2025 04:55:34 +0000 Subject: [PATCH 065/201] Rename Python code The name dark_visitors.py gives the impression that the code is entirely related to the dark visitors website, whereas the update command relates to dark visitors and the convert command is unrelated to dark visitors. --- .github/workflows/ai_robots_update.yml | 2 +- .github/workflows/main.yml | 2 +- code/{dark_visitors.py => robots.py} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename code/{dark_visitors.py => robots.py} (100%) diff --git a/.github/workflows/ai_robots_update.yml b/.github/workflows/ai_robots_update.yml index 654b0b5..59e785d 100644 --- a/.github/workflows/ai_robots_update.yml +++ b/.github/workflows/ai_robots_update.yml @@ -16,7 +16,7 @@ jobs: git config --global user.name "dark-visitors" git config --global user.email "dark-visitors@users.noreply.github.com" echo "Updating robots.json with data from darkvisitor.com ..." - python code/dark_visitors.py --update + python code/robots.py --update echo "... done." git --no-pager diff git add -A diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a4c47d6..40ac9ab 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -25,7 +25,7 @@ jobs: git log -1 git status echo "Updating robots.txt and table-of-bot-metrics.md if necessary ..." - python code/dark_visitors.py --convert + python code/robots.py --convert echo "... done." 
git --no-pager diff git add -A diff --git a/code/dark_visitors.py b/code/robots.py similarity index 100% rename from code/dark_visitors.py rename to code/robots.py From e4c12ee2f84e2cb6643f7eeb7dd6eb50c6e91df8 Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Sat, 4 Jan 2025 05:03:48 +0000 Subject: [PATCH 066/201] Rename in test code --- code/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/tests.py b/code/tests.py index ffa7574..9cf35fe 100644 --- a/code/tests.py +++ b/code/tests.py @@ -6,7 +6,7 @@ cd to the `code` directory and run `pytest` import json from pathlib import Path -from dark_visitors import json_to_txt, json_to_table +from robots import json_to_txt, json_to_table def test_robots_txt_creation(): From 996b9c678cbdd90dea414006cc14027b29118d5c Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Sat, 4 Jan 2025 05:28:41 +0000 Subject: [PATCH 067/201] Improve job name The purpose of the job is to convert the JSON file to the other files. --- .github/workflows/ai_robots_update.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ai_robots_update.yml b/.github/workflows/ai_robots_update.yml index 59e785d..7e11ce8 100644 --- a/.github/workflows/ai_robots_update.yml +++ b/.github/workflows/ai_robots_update.yml @@ -22,7 +22,8 @@ jobs: git add -A git diff --quiet && git diff --staged --quiet || (git commit -m "Update from Dark Visitors" && git push) shell: bash - call-main: + convert: + name: convert needs: dark-visitors uses: ./.github/workflows/main.yml secrets: inherit From 9e372d069625f2a2939c19fb8bfc703548a2ae42 Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Sun, 5 Jan 2025 01:45:33 +0000 Subject: [PATCH 068/201] Ensure dependency installed Ref: https://github.com/ai-robots-txt/ai.robots.txt/issues/60#issuecomment-2571437913 Ref: https://stackoverflow.com/questions/11783875/importerror-no-module-named-bs4-beautifulsoup --- .github/workflows/main.yml | 1 + 1 file changed, 1 
insertion(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a4c47d6..cb5fefc 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -20,6 +20,7 @@ jobs: with: fetch-depth: 2 - run: | + pip install beautifulsoup4 git config --global user.name "ai.robots.txt" git config --global user.email "ai.robots.txt@users.noreply.github.com" git log -1 From c01a68403687f44ef3235ee726ff70b9d6a133f4 Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Sun, 5 Jan 2025 05:03:50 +0000 Subject: [PATCH 069/201] Convert robots.json more frequently Specifically, when github workflows or code is changed as either of these can affect the conversion results. Ref: https://github.com/ai-robots-txt/ai.robots.txt/issues/60 --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index cb5fefc..4abbe2b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -8,6 +8,8 @@ on: push: paths: - 'robots.json' + - '.github/workflows/**' + - 'code/**' branches: - "main" From ca8620e28b8b3baddc34852e3cb2ece2bf89d18d Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sun, 5 Jan 2025 05:05:20 +0000 Subject: [PATCH 070/201] Merge pull request #63 from glyn/push-paths Convert robots.json more frequently --- robots.txt | 1 + table-of-bot-metrics.md | 1 + 2 files changed, 2 insertions(+) diff --git a/robots.txt b/robots.txt index c41ed6d..1ae5558 100644 --- a/robots.txt +++ b/robots.txt @@ -10,6 +10,7 @@ User-agent: ChatGPT-User User-agent: Claude-Web User-agent: ClaudeBot User-agent: cohere-ai +User-agent: cohere-training-data-crawler User-agent: Diffbot User-agent: DuckAssistBot User-agent: FacebookBot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index e905d2f..1106d0f 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -12,6 +12,7 @@ | Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. 
| Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | +| cohere-training-data-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. 
More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | From 83cd54647015829bbf241931e3d602c6081d2a1c Mon Sep 17 00:00:00 2001 From: Fabian Egli Date: Mon, 6 Jan 2025 11:39:41 +0100 Subject: [PATCH 071/201] allow Action to succeed even if no changes were made Before, the Action would fail in case there were no changes made to any files by the converter. --- .github/workflows/main.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4abbe2b..d26a5a0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -32,6 +32,13 @@ jobs: echo "... done." git --no-pager diff git add -A + if [ "$(git diff --staged)" ]; then + # To have the action run successfully, if no changes are staged, we + # manually skip the later commits because they fail with exit code 1 + # and this would then display as a failure for the Action. + echo "No staged changes to commit. Skipping commit and push." + exit 0 + fi if [ -n "${{ inputs.message }}" ]; then git commit -m "${{ inputs.message }}" else From 30ee95701162ac8f67cf6183641b2a140fcde721 Mon Sep 17 00:00:00 2001 From: Fabian Egli Date: Mon, 6 Jan 2025 12:05:42 +0100 Subject: [PATCH 072/201] bail when NO changes are staged --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d26a5a0..ac20d99 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -32,7 +32,7 @@ jobs: echo "... done." 
git --no-pager diff git add -A - if [ "$(git diff --staged)" ]; then + if [ -z "$(git diff --staged)" ]; then # To have the action run successfully, if no changes are staged, we # manually skip the later commits because they fail with exit code 1 # and this would then display as a failure for the Action. From 143f8f228588b1f66bc1435fc21457f610807d5f Mon Sep 17 00:00:00 2001 From: Jordan Atwood Date: Mon, 6 Jan 2025 12:34:38 -0800 Subject: [PATCH 073/201] Block SemrushBot --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 1c00b63..c444cb4 100644 --- a/robots.json +++ b/robots.json @@ -258,6 +258,13 @@ "operator": "[Zyte](https://www.zyte.com)", "respect": "Unclear at this time." }, + "SemrushBot": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Scrapes data for use in LLM article-writing tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "SemrushBot is a bot which, among other functions, scrapes data for use in ContentShake AI tool reports." 
+ }, "Sidetrade indexer bot": { "description": "AI product training.", "frequency": "No information.", From ec454b71d3984e58f323bb71631847dfe6b51b78 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Mon, 6 Jan 2025 20:51:56 +0000 Subject: [PATCH 074/201] Merge pull request #67 from Nightfirecat/semrushbot Block SemrushBot --- robots.txt | 1 + table-of-bot-metrics.md | 1 + 2 files changed, 2 insertions(+) diff --git a/robots.txt b/robots.txt index 1ae5558..5c32c96 100644 --- a/robots.txt +++ b/robots.txt @@ -35,6 +35,7 @@ User-agent: PanguBot User-agent: PerplexityBot User-agent: PetalBot User-agent: Scrapy +User-agent: SemrushBot User-agent: Sidetrade indexer bot User-agent: Timpibot User-agent: VelenPublicWebCrawler diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 1106d0f..31c9367 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -37,6 +37,7 @@ | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | +| SemrushBot | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Scrapes data for use in LLM article-writing tool. | Roughly once every 10 seconds. 
| SemrushBot is a bot which, among other functions, scrapes data for use in ContentShake AI tool reports. | | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | From 933aa6159da9dbe7025f6294e98a6d3e326b43a3 Mon Sep 17 00:00:00 2001 From: Massimo Gismondi Date: Tue, 7 Jan 2025 11:02:29 +0100 Subject: [PATCH 075/201] Implementing htaccess generation --- .htaccess | 3 +++ code/robots.py | 22 +++++++++++++++++++++- code/test_files/.htaccess | 3 +++ code/tests.py | 8 +++++++- 4 files changed, 34 insertions(+), 2 deletions(-) create mode 100644 .htaccess create mode 100644 code/test_files/.htaccess diff --git a/.htaccess b/.htaccess new file mode 100644 index 0000000..31ba5f7 --- /dev/null +++ b/.htaccess @@ -0,0 +1,3 @@ +RewriteEngine On +RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteRule .* - [F,L] \ No newline at 
end of file diff --git a/code/robots.py b/code/robots.py index cf44e8e..d35d74b 100644 --- a/code/robots.py +++ b/code/robots.py @@ -132,10 +132,26 @@ def json_to_table(robots_json): return table +def json_to_htaccess(robot_json): + htaccess = "RewriteEngine On\n" + htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*(" + + robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys()) + htaccess += "|".join(robots) + htaccess += ").*$ [NC]\n" + htaccess += "RewriteRule .* - [F,L]" + return htaccess + + def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" new_content = converter(load_robots_json()) - old_content = Path(file_name).read_text(encoding="utf-8") + filepath = Path(file_name) + if not filepath.exists(): + filepath.write_text(new_content, encoding="utf-8") + print(f"{file_name} has been created.") + return + old_content = filepath.read_text(encoding="utf-8") if old_content == new_content: print(f"{file_name} is already up to date.") else: @@ -150,6 +166,10 @@ def conversions(): file_name="./table-of-bot-metrics.md", converter=json_to_table, ) + update_file_if_changed( + file_name="./.htaccess", + converter=json_to_htaccess, + ) if __name__ == "__main__": diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess new file mode 100644 index 0000000..a34bf55 --- /dev/null +++ b/code/test_files/.htaccess @@ -0,0 +1,3 @@ +RewriteEngine On +RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ 
bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteRule .* - [F,L] \ No newline at end of file diff --git a/code/tests.py b/code/tests.py index 9cf35fe..6f778c3 100644 --- a/code/tests.py +++ b/code/tests.py @@ -6,7 +6,7 @@ cd to the `code` directory and run `pytest` import json from pathlib import Path -from robots import json_to_txt, json_to_table +from robots import json_to_txt, json_to_table, json_to_htaccess def test_robots_txt_creation(): @@ -19,3 +19,9 @@ def test_table_of_bot_metrices_md(): robots_json = json.loads(Path("test_files/robots.json").read_text()) robots_table = json_to_table(robots_json) assert Path("test_files/table-of-bot-metrics.md").read_text() == robots_table + + +def test_htaccess_creation(): + robots_json = json.loads(Path("test_files/robots.json").read_text()) + robots_htaccess = json_to_htaccess(robots_json) + assert Path("test_files/.htaccess").read_text() == robots_htaccess From 189e75bbfd06715a5d30972d3aa4c23974aecee0 Mon Sep 17 00:00:00 2001 From: Massimo Gismondi Date: Fri, 17 Jan 2025 21:25:23 +0100 Subject: [PATCH 076/201] Adding usage instructions --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index b3c2e7c..45c8f3a 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,19 @@ A number of these crawlers have been sourced from [Dark Visitors](https://darkvi If you'd like to add information about a crawler to the list, please make a pull request with the bot name added to `robots.txt`, `ai.txt`, and any relevant details in `table-of-bot-metrics.md` to help people understand what's crawling. +## Usage + +Many visitors will find these files from this repository most useful: +- `robots.txt` +- `.htaccess` + +The first one tells search engine and AI crawlers which parts of your website should be scanned or avoided. The webpages of your server are returned anyway, but the crawler "pledges" not to use them. 
By default, the provided `robots.txt` tells every AI crawler not to scan any page in your website. This is not bulletproof, as an evil crawler could simply ignore the `robots.txt` content. + +The second one tells your own webserver to return an error page when one of the listed AI crawlers tries to request a page from your website. A `.htaccess` file does not work on every webserver, but works correctly on most common and cheap shared hosting providers. The majority of AI crawlers set a "User Agent" string in every request they send, by which they are identifiable: this string is used to filter the request. Instead of simply hoping the crawler pledges to respect our intention, this solution actively sends back a bad webpage (an error or an empty page). Note that this solution isn't bulletproof either, as anyone can fake the sent User Agent. + +We suggest adding both files, as some crawlers may respect `robots.txt` while not having an identifiable User Agent; on the other hand, other crawlers may not respect the `robots.txt`, but they provide a identifiable User Agent by which we can filter them out. + + ## Contributing A note about contributing: updates should be added/made to `robots.json`. A GitHub action, courtesy of [Adam](https://github.com/newbold), will then generate the updated `robots.txt` and `table-of-bot-metrics.md`. From b455af66e7903e76162d43f3e8f0900084fb9539 Mon Sep 17 00:00:00 2001 From: Massimo Gismondi Date: Fri, 17 Jan 2025 21:42:08 +0100 Subject: [PATCH 077/201] Adding clarification about performance and code comment --- README.md | 3 ++- code/robots.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 45c8f3a..dd84a16 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,9 @@ The first one tells search engine and AI crawlers which parts of your website sh The second one tells your own webserver to return an error page when one of the listed AI crawlers tries to request a page from your website. 
A `.htaccess` file does not work on every webserver, but works correctly on most common and cheap shared hosting providers. The majority of AI crawlers set a "User Agent" string in every request they send, by which they are identifiable: this string is used to filter the request. Instead of simply hoping the crawler pledges to respect our intention, this solution actively sends back a bad webpage (an error or an empty page). Note that this solution isn't bulletproof either, as anyone can fake the sent User Agent. -We suggest adding both files, as some crawlers may respect `robots.txt` while not having an identifiable User Agent; on the other hand, other crawlers may not respect the `robots.txt`, but they provide a identifiable User Agent by which we can filter them out. +Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. Nevertheless, most shared hosting providers only allow `.htaccess` configuration. +We suggest adding both files, as some crawlers may respect `robots.txt` while not having an identifiable User Agent; on the other hand, other crawlers may not respect the `robots.txt`, but they provide a identifiable User Agent by which we can filter them out. ## Contributing diff --git a/code/robots.py b/code/robots.py index d35d74b..f2ddbb8 100644 --- a/code/robots.py +++ b/code/robots.py @@ -133,7 +133,9 @@ def json_to_table(robots_json): def json_to_htaccess(robot_json): - htaccess = "RewriteEngine On\n" + # Creates a .htaccess filter file. It uses a regular expression to filter out + #User agents that contain any of the blocked values. 
+ htaccess += "RewriteEngine On\n" htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*(" robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys()) From 8aee2f24bb03a8d91a2fb17c3a98628411239d40 Mon Sep 17 00:00:00 2001 From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com> Date: Sat, 18 Jan 2025 12:39:07 +0100 Subject: [PATCH 078/201] Fixed space in comment Co-authored-by: Glyn Normington --- code/robots.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/robots.py b/code/robots.py index f2ddbb8..0172330 100644 --- a/code/robots.py +++ b/code/robots.py @@ -134,7 +134,7 @@ def json_to_table(robots_json): def json_to_htaccess(robot_json): # Creates a .htaccess filter file. It uses a regular expression to filter out - #User agents that contain any of the blocked values. + # User agents that contain any of the blocked values. htaccess += "RewriteEngine On\n" htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*(" From 1cc4b59dfc4acd5666478efea658b1adf1af8aee Mon Sep 17 00:00:00 2001 From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com> Date: Sat, 18 Jan 2025 12:40:03 +0100 Subject: [PATCH 079/201] Shortened htaccess instructions Co-authored-by: Glyn Normington --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dd84a16..badd23b 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Many visitors will find these files from this repository most useful: - `robots.txt` - `.htaccess` -The first one tells search engine and AI crawlers which parts of your website should be scanned or avoided. The webpages of your server are returned anyway, but the crawler "pledges" not to use them. By default, the provided `robots.txt` tells every AI crawler not to scan any page in your website. This is not bulletproof, as an evil crawler could simply ignore the `robots.txt` content. 
+`robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). The second one tells your own webserver to return an error page when one of the listed AI crawlers tries to request a page from your website. A `.htaccess` file does not work on every webserver, but works correctly on most common and cheap shared hosting providers. The majority of AI crawlers set a "User Agent" string in every request they send, by which they are identifiable: this string is used to filter the request. Instead of simply hoping the crawler pledges to respect our intention, this solution actively sends back a bad webpage (an error or an empty page). Note that this solution isn't bulletproof either, as anyone can fake the sent User Agent. From d65128d10acfd14b714488170b3a261912cc3729 Mon Sep 17 00:00:00 2001 From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com> Date: Sat, 18 Jan 2025 12:41:09 +0100 Subject: [PATCH 080/201] Removed paragraph in favour of future FAQ.md Co-authored-by: Glyn Normington --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index badd23b..505a8dd 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,6 @@ The second one tells your own webserver to return an error page when one of the Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. Nevertheless, most shared hosting providers only allow `.htaccess` configuration. -We suggest adding both files, as some crawlers may respect `robots.txt` while not having an identifiable User Agent; on the other hand, other crawlers may not respect the `robots.txt`, but they provide a identifiable User Agent by which we can filter them out. 
## Contributing From 5aa08bc0022e8e9960e4cf52359ca2d910f795bf Mon Sep 17 00:00:00 2001 From: Joshua Sheard Date: Sun, 19 Jan 2025 22:03:50 +0000 Subject: [PATCH 081/201] Add Crawlspace --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index c444cb4..d71c80b 100644 --- a/robots.json +++ b/robots.json @@ -90,6 +90,13 @@ "frequency": "Unclear at this time.", "description": "cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler" }, + "Crawlspace": { + "operator": "[Crawlspace](https://crawlspace.dev)", + "respect": "[Yes](https://news.ycombinator.com/item?id=42756654)", + "function": "Scrapes data", + "frequency": "Unclear at this time.", + "description": "Provides crawling services for any purpose, but most likely to be used for AI model training." + }, "Diffbot": { "operator": "[Diffbot](https://www.diffbot.com/)", "respect": "At the discretion of Diffbot users.", @@ -300,4 +307,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 70fd6c0fb13cdf4f0525bf061556e8e50ca7b8d9 Mon Sep 17 00:00:00 2001 From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com> Date: Mon, 20 Jan 2025 06:25:07 +0100 Subject: [PATCH 082/201] Add mention of htaccess in readme Co-authored-by: Glyn Normington --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 505a8dd..cd8d467 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ ## Contributing -A note about contributing: updates should be added/made to `robots.json`. 
A GitHub action, courtesy of [Adam](https://github.com/newbold), will then generate the updated `robots.txt` and `table-of-bot-metrics.md`. +A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, and `.htaccess`. ## Subscribe to updates From 013b7abfa1f2126e9320ddbab90ff87af54b092c Mon Sep 17 00:00:00 2001 From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com> Date: Mon, 20 Jan 2025 06:27:02 +0100 Subject: [PATCH 083/201] Update README.md Co-authored-by: Glyn Normington --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cd8d467..1417a85 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,9 @@ Many visitors will find these files from this repository most useful: `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). -The second one tells your own webserver to return an error page when one of the listed AI crawlers tries to request a page from your website. A `.htaccess` file does not work on every webserver, but works correctly on most common and cheap shared hosting providers. The majority of AI crawlers set a "User Agent" string in every request they send, by which they are identifiable: this string is used to filter the request. Instead of simply hoping the crawler pledges to respect our intention, this solution actively sends back a bad webpage (an error or an empty page). Note that this solution isn't bulletproof either, as anyone can fake the sent User Agent. +### `.htaccess` + +`.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server. 
Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. Nevertheless, most shared hosting providers only allow `.htaccess` configuration. From 52241bdca6c9930f7b225264cd862b5f98a2d68f Mon Sep 17 00:00:00 2001 From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com> Date: Mon, 20 Jan 2025 06:27:56 +0100 Subject: [PATCH 084/201] Update README.md Co-authored-by: Glyn Normington --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1417a85..bb6558c 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Many visitors will find these files from this repository most useful: `.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server. -Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. Nevertheless, most shared hosting providers only allow `.htaccess` configuration. +Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. 
## Contributing From 33c38ee70b3a45343ddb360ae79e743e42bc8f76 Mon Sep 17 00:00:00 2001 From: Massimo Gismondi <24638827+MassiminoilTrace@users.noreply.github.com> Date: Mon, 20 Jan 2025 06:28:32 +0100 Subject: [PATCH 085/201] Update README.md Co-authored-by: Glyn Normington --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index bb6558c..648f5ed 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,12 @@ If you'd like to add information about a crawler to the list, please make a pull ## Usage -Many visitors will find these files from this repository most useful: +This repository provides the following files: - `robots.txt` - `.htaccess` +### `robots.txt` + `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). ### `.htaccess` From a9956f7825080467adbbda6e41d7dfbaee47210b Mon Sep 17 00:00:00 2001 From: Massimo Gismondi Date: Mon, 20 Jan 2025 06:50:48 +0100 Subject: [PATCH 086/201] Removed additional sections --- README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/README.md b/README.md index 648f5ed..065b0b7 100644 --- a/README.md +++ b/README.md @@ -14,14 +14,9 @@ This repository provides the following files: - `robots.txt` - `.htaccess` -### `robots.txt` - `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). -### `.htaccess` - `.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server. - Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. 
From 4f03818280e7979697250ac5d59da12290db2e9f Mon Sep 17 00:00:00 2001 From: Massimo Gismondi Date: Mon, 20 Jan 2025 06:51:06 +0100 Subject: [PATCH 087/201] Removed if condition and added a little comments --- code/robots.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/code/robots.py b/code/robots.py index 0172330..087b00b 100644 --- a/code/robots.py +++ b/code/robots.py @@ -135,9 +135,10 @@ def json_to_table(robots_json): def json_to_htaccess(robot_json): # Creates a .htaccess filter file. It uses a regular expression to filter out # User agents that contain any of the blocked values. - htaccess += "RewriteEngine On\n" + htaccess = "RewriteEngine On\n" htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*(" + # Escape spaces in each User Agent to build the regular expression robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys()) htaccess += "|".join(robots) htaccess += ").*$ [NC]\n" @@ -149,10 +150,8 @@ def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" new_content = converter(load_robots_json()) filepath = Path(file_name) - if not filepath.exists(): - filepath.write_text(new_content, encoding="utf-8") - print(f"{file_name} has been created.") - return + # "touch" will create the file if it doesn't exist yet + filepath.touch() old_content = filepath.read_text(encoding="utf-8") if old_content == new_content: print(f"{file_name} is already up to date.") From 7427d96bac08d59276292ca7a66d77365f7d26b9 Mon Sep 17 00:00:00 2001 From: Joshua Sheard Date: Mon, 20 Jan 2025 10:59:02 +0000 Subject: [PATCH 088/201] Update robots.json Co-authored-by: Glyn Normington --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index d71c80b..465a61c 100644 --- a/robots.json +++ b/robots.json @@ -95,7 +95,7 @@ "respect": "[Yes](https://news.ycombinator.com/item?id=42756654)", "function": "Scrapes data", "frequency": "Unclear 
at this time.", - "description": "Provides crawling services for any purpose, but most likely to be used for AI model training." + "description": "Provides crawling services for any purpose, probably including AI model training." }, "Diffbot": { "operator": "[Diffbot](https://www.diffbot.com/)", From 6c552a3daa591f47a81936ebc41c822dc35b9fa2 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Mon, 20 Jan 2025 17:45:42 +0000 Subject: [PATCH 089/201] Merge pull request #71 from jsheard/patch-1 Add Crawlspace --- .htaccess | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.htaccess b/.htaccess index 31ba5f7..beaddc3 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] 
RewriteRule .* - [F,L] \ No newline at end of file diff --git a/robots.txt b/robots.txt index 5c32c96..fd388fd 100644 --- a/robots.txt +++ b/robots.txt @@ -11,6 +11,7 @@ User-agent: Claude-Web User-agent: ClaudeBot User-agent: cohere-ai User-agent: cohere-training-data-crawler +User-agent: Crawlspace User-agent: Diffbot User-agent: DuckAssistBot User-agent: FacebookBot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 31c9367..f44c585 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -13,6 +13,7 @@ | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | cohere-training-data-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | +| Crawlspace | [Crawlspace](https://crawlspace.dev) | [Yes](https://news.ycombinator.com/item?id=42756654) | Scrapes data | Unclear at this time. | Provides crawling services for any purpose, probably including AI model training. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. 
| Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | From 9c060dee1c9cead8a3cb1092bdf8615cf33f3656 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 21 Jan 2025 00:49:22 +0000 Subject: [PATCH 090/201] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 465a61c..4d7d582 100644 --- a/robots.json +++ b/robots.json @@ -307,4 +307,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 05b79b8a5886983c818eaad107fcf6c7de5fad3a Mon Sep 17 00:00:00 2001 From: nisbet-hubbard <87453615+nisbet-hubbard@users.noreply.github.com> Date: Mon, 27 Jan 2025 19:41:03 +0800 Subject: [PATCH 091/201] Update robots.json --- robots.json | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/robots.json b/robots.json index 4d7d582..7f3cba3 100644 --- a/robots.json +++ b/robots.json @@ -265,12 +265,19 @@ "operator": "[Zyte](https://www.zyte.com)", "respect": "Unclear at this time." 
}, - "SemrushBot": { + "SemrushBot-OCOB": { "operator": "[Semrush](https://www.semrush.com/)", "respect": "[Yes](https://www.semrush.com/bot/)", - "function": "Scrapes data for use in LLM article-writing tool.", + "function": "Crawls your site for ContentShake AI tool.", "frequency": "Roughly once every 10 seconds.", - "description": "SemrushBot is a bot which, among other functions, scrapes data for use in ContentShake AI tool reports." + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + }, + "SemrushBot-SWA": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Checks URLs on your site for SWA tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." }, "Sidetrade indexer bot": { "description": "AI product training.", @@ -307,4 +314,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} \ No newline at end of file +} From 89d4c6e5ca03f0aedec09b9191e2aece6f2efec3 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 1 Feb 2025 10:51:01 +0000 Subject: [PATCH 092/201] Merge pull request #73 from nisbet-hubbard/patch-8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Actually block Semrush’s AI tools --- .htaccess | 2 +- robots.txt | 3 ++- table-of-bot-metrics.md | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index beaddc3..97482e2 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] RewriteRule .* - [F,L] \ No newline at end of file diff --git a/robots.txt b/robots.txt index fd388fd..3839e55 100644 
--- a/robots.txt +++ b/robots.txt @@ -36,7 +36,8 @@ User-agent: PanguBot User-agent: PerplexityBot User-agent: PetalBot User-agent: Scrapy -User-agent: SemrushBot +User-agent: SemrushBot-OCOB +User-agent: SemrushBot-SWA User-agent: Sidetrade indexer bot User-agent: Timpibot User-agent: VelenPublicWebCrawler diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index f44c585..b51bbae 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -38,7 +38,8 @@ | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | -| SemrushBot | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Scrapes data for use in LLM article-writing tool. | Roughly once every 10 seconds. | SemrushBot is a bot which, among other functions, scrapes data for use in ContentShake AI tool reports. | +| SemrushBot-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). 
| +| SemrushBot-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | From bebffccc0ced8c420276c93f3109c2e71cd5ca0c Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sun, 2 Feb 2025 00:52:50 +0000 Subject: [PATCH 093/201] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 7f3cba3..79762a0 100644 --- a/robots.json +++ b/robots.json @@ -314,4 +314,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} +} \ No newline at end of file From 261a2b83b90fe89f1d842066709c019fd1dba30f Mon Sep 17 00:00:00 2001 From: always-be-testing Date: Fri, 14 Feb 2025 12:26:19 -0500 Subject: [PATCH 094/201] update README to include list of ai bots Cloudflare considers verified --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 065b0b7..6758570 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,19 @@ Alternatively, you can also subscribe to new releases with your GitHub account b If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform). + +If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that make use of [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots) conditions, please note that the following AI web crawlers are considered verified bots by Cloudflare: +- Amazonbot +- Applebot +- CCBot +- ChatGPT-User +- DuckAssistBot +- GoogleOther +- GPTBot +- OAI-SearchBot +- PerplexityBot +- PetalBot + ## Additional resources - [Blocking Bots with Nginx](https://rknight.me/blog/blocking-bots-with-nginx/) by Robb Knight From e396a2ec781095c5e2659eefb99c46ab7715a664 Mon Sep 17 00:00:00 2001 From: always-be-testing Date: Fri, 14 Feb 2025 12:31:20 -0500 Subject: [PATCH 095/201] forgot to include heading --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6758570..e70d283 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ Alternatively, you can also subscribe to new releases with your GitHub account b If you use
[Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform). - +## Cloudflare Verified Bots If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that make use of [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots) conditions, please note that the following AI web crawlers are considered verified bots by Cloudflare: - Amazonbot - Applebot From f99339922fa9afdbb00e18bb99105e81cd3f8e88 Mon Sep 17 00:00:00 2001 From: always-be-testing Date: Fri, 14 Feb 2025 12:36:33 -0500 Subject: [PATCH 096/201] grammar update and include syntax for verified bot condition --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e70d283..f471ede 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ Alternatively, you can also subscribe to new releases with your GitHub account b If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform). 
## Cloudflare Verified Bots -If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that make use of [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots) conditions, please note that the following AI web crawlers are considered verified bots by Cloudflare: +If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that use the `cf.bot_management.verified_bot` condition based on [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots), please note that the following AI web crawlers are considered verified bots by Cloudflare: - Amazonbot - Applebot - CCBot From af87b85d7f00bc285cb414280e02d2f42284a9d8 Mon Sep 17 00:00:00 2001 From: always-be-testing Date: Fri, 14 Feb 2025 12:39:08 -0500 Subject: [PATCH 097/201] include return after heading --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index f471ede..303f009 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ Alternatively, you can also subscribe to new releases with your GitHub account b If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform). 
## Cloudflare Verified Bots + If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that use the `cf.bot_management.verified_bot` condition based on [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots), please note that the following AI web crawlers are considered verified bots by Cloudflare: - Amazonbot - Applebot From 5b13c2e504c843c2a95981cee1c2655d9f21c8f4 Mon Sep 17 00:00:00 2001 From: always-be-testing Date: Sat, 15 Feb 2025 11:22:10 -0500 Subject: [PATCH 098/201] add more concise message about verified bots Co-authored-by: Glyn Normington --- README.md | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/README.md b/README.md index 303f009..a206c83 100644 --- a/README.md +++ b/README.md @@ -39,21 +39,7 @@ Alternatively, you can also subscribe to new releases with your GitHub account b ## Report abusive crawlers If you use [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) alongside this list, you can report abusive crawlers that don't respect `robots.txt` [here](https://docs.google.com/forms/d/e/1FAIpQLScbUZ2vlNSdcsb8LyTeSF7uLzQI96s0BKGoJ6wQ6ocUFNOKEg/viewform). 
- -## Cloudflare Verified Bots - -If you are unable to make use of [Cloudflare's hard block](https://blog.cloudflare.com/declaring-your-aindependence-block-ai-bots-scrapers-and-crawlers-with-a-single-click) and/or have WAF rules that use the `cf.bot_management.verified_bot` condition based on [Cloudflare's Verified Bots](https://radar.cloudflare.com/traffic/verified-bots), please note that the following AI web crawlers are considered verified bots by Cloudflare: -- Amazonbot -- Applebot -- CCBot -- ChatGPT-User -- DuckAssistBot -- GoogleOther -- GPTBot -- OAI-SearchBot -- PerplexityBot -- PetalBot - +But even if you don't use Cloudflare's hard block, their list of [verified bots](https://radar.cloudflare.com/traffic/verified-bots) may come in handy. ## Additional resources - [Blocking Bots with Nginx](https://rknight.me/blog/blocking-bots-with-nginx/) by Robb Knight From a9ec4ffa6fd1816ee6c1c146fa75983abc0b2edc Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sun, 16 Feb 2025 13:36:39 -0800 Subject: [PATCH 099/201] chore: add Brightbot 1.0 --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 79762a0..a634634 100644 --- a/robots.json +++ b/robots.json @@ -41,6 +41,13 @@ "frequency": "Unclear at this time.", "description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools." }, + "Brightbot 1.0": { + "operator": "Browsing.ai", + "respect": "Unclear at this time.", + "function": "LLM/AI training.", + "frequency": "Unclear at this time.", + "description": "Scrapes data to train LLMs and AI products focused on website customer support." + }, "Bytespider": { "operator": "ByteDance", "respect": "No", @@ -314,4 +321,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} \ No newline at end of file +} From 693289bb29c42b7a526d8210d1f743ca3608690d Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sun, 16 Feb 2025 21:37:52 +0000 Subject: [PATCH 100/201] chore: add Brightbot 1.0 --- .htaccess | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.htaccess b/.htaccess index 97482e2..512c274 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot\ 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] RewriteRule .* - [F,L] \ No newline at end of file diff --git a/robots.txt b/robots.txt index 3839e55..80c40e8 100644 --- a/robots.txt +++ b/robots.txt @@ -4,6 +4,7 @@ User-agent: Amazonbot User-agent: anthropic-ai User-agent: Applebot 
User-agent: Applebot-Extended +User-agent: Brightbot 1.0 User-agent: Bytespider User-agent: CCBot User-agent: ChatGPT-User diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index b51bbae..af32bf2 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -6,6 +6,7 @@ | anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | | Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Brightbot 1.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. | | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). 
| | ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | From abfd6dfcd15267ed03b5fda4cd3eac2512604ed2 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Mon, 17 Feb 2025 00:53:32 +0000 Subject: [PATCH 101/201] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index a634634..cdc7bb5 100644 --- a/robots.json +++ b/robots.json @@ -321,4 +321,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From c0d418cd875b432fd4558be57ad3c009326b631e Mon Sep 17 00:00:00 2001 From: Dennis Camera Date: Mon, 17 Feb 2025 21:00:57 +0100 Subject: [PATCH 102/201] .htaccess: Allow robots access to /robots.txt --- .htaccess | 2 +- code/robots.py | 2 +- code/test_files/.htaccess | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 512c274..c42f99e 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot\ 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] -RewriteRule .* - [F,L] \ No newline at end of file +RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/code/robots.py b/code/robots.py index 087b00b..bb18e70 100644 --- 
a/code/robots.py +++ b/code/robots.py @@ -142,7 +142,7 @@ def json_to_htaccess(robot_json): robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys()) htaccess += "|".join(robots) htaccess += ").*$ [NC]\n" - htaccess += "RewriteRule .* - [F,L]" + htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n" return htaccess diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess index a34bf55..2e78674 100644 --- a/code/test_files/.htaccess +++ b/code/test_files/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] -RewriteRule .* - [F,L] \ No newline at end of file +RewriteRule !^/?robots\.txt$ - [F,L] From a884a2afb9dbc7338b0faa24b3c10308adbc48e4 Mon Sep 17 00:00:00 2001 From: Dennis Camera Date: Mon, 17 Feb 2025 21:00:57 +0100 Subject: [PATCH 103/201] .htaccess: Make regex in RewriteCond safe Improve the regular expression by removing unneeded anchors and escaping special characters (not just space) to prevent false positives or a misbehaving rewrite rule. 
--- .htaccess | 2 +- code/robots.py | 19 ++++++++++--------- code/test_files/.htaccess | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/.htaccess b/.htaccess index c42f99e..2313293 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Brightbot\ 1.0|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|cohere-training-data-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot-OCOB|SemrushBot-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/code/robots.py b/code/robots.py index bb18e70..a8a674d 100644 --- a/code/robots.py +++ b/code/robots.py @@ -1,8 +1,9 @@ import json -from pathlib import Path - +import re import requests + from bs4 import BeautifulSoup +from pathlib import Path def load_robots_json(): @@ -99,7 +100,6 @@ def updated_robots_json(soup): def ingest_darkvisitors(): 
- old_robots_json = load_robots_json() soup = get_agent_soup() if soup: @@ -132,16 +132,17 @@ def json_to_table(robots_json): return table +def list_to_pcre(lst): + # Python re is not 100% identical to PCRE which is used by Apache, but it + # should probably be close enough in the real world for re.escape to work. + return f"({"|".join(map(re.escape, lst))})" + + def json_to_htaccess(robot_json): # Creates a .htaccess filter file. It uses a regular expression to filter out # User agents that contain any of the blocked values. htaccess = "RewriteEngine On\n" - htaccess += "RewriteCond %{HTTP_USER_AGENT} ^.*(" - - # Escape spaces in each User Agent to build the regular expression - robots = map(lambda el: el.replace(" ", "\\ "), robot_json.keys()) - htaccess += "|".join(robots) - htaccess += ").*$ [NC]\n" + htaccess += f"RewriteCond %{{HTTP_USER_AGENT}} {list_to_pcre(robot_json.keys())} [NC]\n" htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n" return htaccess diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess index 2e78674..90ddcf2 100644 --- a/code/test_files/.htaccess +++ b/code/test_files/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} ^.*(AI2Bot|Ai2Bot-Dolma|Amazonbot|anthropic-ai|Applebot|Applebot-Extended|Bytespider|CCBot|ChatGPT-User|Claude-Web|ClaudeBot|cohere-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google-Extended|GoogleOther|GoogleOther-Image|GoogleOther-Video|GPTBot|iaskspider/2.0|ICC-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta-ExternalAgent|Meta-ExternalFetcher|OAI-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio-Extended|YouBot).*$ [NC] +RewriteCond %{HTTP_USER_AGENT} 
(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] From 0bd3fa63b832ffd8fa908675656c7007021f6654 Mon Sep 17 00:00:00 2001 From: Dennis Camera Date: Tue, 18 Feb 2025 10:12:04 +0100 Subject: [PATCH 104/201] table-of-bot-metrics.md: Escape robot names for Markdown table Some characters which could occur in a crawler's name have a special meaning in Markdown. They are escaped to prevent them from having unintended side effects. The escaping is only applied to the first (Name) column of the table. The rest of the columns is expected to already be Markdown encoded in robots.json. 
--- code/robots.py | 8 ++++++-- table-of-bot-metrics.md | 40 ++++++++++++++++++++-------------------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/code/robots.py b/code/robots.py index a8a674d..62fb061 100644 --- a/code/robots.py +++ b/code/robots.py @@ -121,13 +121,17 @@ def json_to_txt(robots_json): return robots_txt +def escape_md(s): + return re.sub(r"([]*\\|`(){}<>#+-.!_[])", r"\\\1", s) + + def json_to_table(robots_json): """Compose a markdown table with the information in robots.json""" table = "| Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description |\n" - table += "|-----|----------|-----------------------|----------|------------------|-------------|\n" + table += "|------|----------|-----------------------|----------|------------------|-------------|\n" for name, robot in robots_json.items(): - table += f'| {name} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n' + table += f'| {escape_md(name)} | {robot["operator"]} | {robot["respect"]} | {robot["function"]} | {robot["frequency"]} | {robot["description"]} |\n' return table diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index af32bf2..ce82047 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -1,48 +1,48 @@ | Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | -|-----|----------|-----------------------|----------|------------------|-------------| +|------|----------|-----------------------|----------|------------------|-------------| | AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | -| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. 
| +| Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | -| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | -| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | -| Brightbot 1.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. | +| Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. 
| Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Brightbot 1\.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. | | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | -| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | +| Claude\-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. 
| No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | -| cohere-training-data-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | +| cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | +| cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | | Crawlspace | [Crawlspace](https://crawlspace.dev) | [Yes](https://news.ycombinator.com/item?id=42756654) | Scrapes data | Unclear at this time. | Provides crawling services for any purpose, probably including AI model training. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. 
| Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | -| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | +| Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." 
| -| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther\-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | -| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | -| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. 
| Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | +| iaskspider/2\.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | +| ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. 
More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | -| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | -| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | -| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | +| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. 
| Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | -| SemrushBot-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). 
| -| SemrushBot-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | -| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. 
More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | From 17b826a6d3868cf87fb52adf95f52872ac5c4437 Mon Sep 17 00:00:00 2001 From: Dennis Camera Date: Tue, 18 Feb 2025 10:13:27 +0100 Subject: [PATCH 105/201] Update tests and convert to stock unittest For these simple tests Python's built-in unittest framework is more than enough. No additional dependencies are required. Added some more test cases with "special" characters to test the escaping code better. 
--- code/test_files/.htaccess | 2 +- code/test_files/robots.json | 44 ++++++++++++++++- code/test_files/robots.txt | 6 +++ code/test_files/table-of-bot-metrics.md | 38 +++++++++------ code/tests.py | 65 ++++++++++++++++++------- 5 files changed, 120 insertions(+), 35 deletions(-) mode change 100644 => 100755 code/tests.py diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess index 90ddcf2..7e39092 100644 --- a/code/test_files/.htaccess +++ b/code/test_files/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/code/test_files/robots.json b/code/test_files/robots.json index c50d63c..b0cbfbb 100644 --- a/code/test_files/robots.json +++ b/code/test_files/robots.json @@ 
-278,5 +278,47 @@ "function": "Scrapes data for search engine and LLMs.", "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." + }, + "crawler.with.dots": { + "operator": "Test suite", + "respect": "No", + "function": "To ensure the code works correctly.", + "frequency": "No information.", + "description": "When used in the .htaccess regular expression dots need to be escaped." + }, + "star***crawler": { + "operator": "Test suite", + "respect": "No", + "function": "To ensure the code works correctly.", + "frequency": "No information.", + "description": "When used in the .htaccess regular expression stars need to be escaped." + }, + "Is this a crawler?": { + "operator": "Test suite", + "respect": "No", + "function": "To ensure the code works correctly.", + "frequency": "No information.", + "description": "When used in the .htaccess regular expression spaces and question marks need to be escaped." + }, + "a[mazing]{42}(robot)": { + "operator": "Test suite", + "respect": "No", + "function": "To ensure the code works correctly.", + "frequency": "No information.", + "description": "When used in the .htaccess regular expression parantheses, braces, etc. need to be escaped." + }, + "2^32$": { + "operator": "Test suite", + "respect": "No", + "function": "To ensure the code works correctly.", + "frequency": "No information.", + "description": "When used in the .htaccess regular expression RE anchor characters need to be escaped." + }, + "curl|sudo bash": { + "operator": "Test suite", + "respect": "No", + "function": "To ensure the code works correctly.", + "frequency": "No information.", + "description": "When used in the .htaccess regular expression pipes need to be escaped." 
} -} \ No newline at end of file +} diff --git a/code/test_files/robots.txt b/code/test_files/robots.txt index 927f6f4..03c3c25 100644 --- a/code/test_files/robots.txt +++ b/code/test_files/robots.txt @@ -38,4 +38,10 @@ User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended User-agent: YouBot +User-agent: crawler.with.dots +User-agent: star***crawler +User-agent: Is this a crawler? +User-agent: a[mazing]{42}(robot) +User-agent: 2^32$ +User-agent: curl|sudo bash Disallow: / diff --git a/code/test_files/table-of-bot-metrics.md b/code/test_files/table-of-bot-metrics.md index 257ba99..88af6c0 100644 --- a/code/test_files/table-of-bot-metrics.md +++ b/code/test_files/table-of-bot-metrics.md @@ -1,35 +1,35 @@ | Name | Operator | Respects `robots.txt` | Data use | Visit regularity | Description | -|-----|----------|-----------------------|----------|------------------|-------------| +|------|----------|-----------------------|----------|------------------|-------------| | AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | -| Ai2Bot-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | +| Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | -| anthropic-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. 
| Scrapes data to train LLMs and AI products offered by Anthropic. | +| anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | -| Applebot-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). 
| -| ChatGPT-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | +| Claude\-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| cohere-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | +| cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. 
| | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | | facebookexternalhit | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | No information. | Unclear at this time. | Unclear at this time. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | -| Google-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | +| Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | -| GoogleOther-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." 
| -| GoogleOther-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | +| GoogleOther\-Video | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GPTBot | [OpenAI](https://openai.com) | Yes | Scrapes data to train OpenAI's products. | No information. | Data is used to train current and future models, removed paywalled data, PII and data that violates the company's policies. | -| iaskspider/2.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | -| ICC-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | +| iaskspider/2\.0 | iAsk | No | Crawls sites to provide answers to user queries. | Unclear at this time. | Used to provide answers to user queries. | +| ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. 
| Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | -| Meta-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | -| Meta-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. 
| Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | -| OAI-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | +| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. 
| | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | @@ -38,5 +38,11 @@ | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | -| Webzio-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. 
| +| crawler\.with\.dots | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression dots need to be escaped. | +| star\*\*\*crawler | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression stars need to be escaped. | +| Is this a crawler? | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression spaces and question marks need to be escaped. | +| a\[mazing\]\{42\}\(robot\) | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression parantheses, braces, etc. need to be escaped. | +| 2^32$ | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression RE anchor characters need to be escaped. | +| curl\|sudo bash | Test suite | No | To ensure the code works correctly. | No information. | When used in the .htaccess regular expression pipes need to be escaped. | diff --git a/code/tests.py b/code/tests.py old mode 100644 new mode 100755 index 6f778c3..94cbb47 --- a/code/tests.py +++ b/code/tests.py @@ -1,27 +1,58 @@ -"""These tests can be run with pytest. 
-This requires pytest: pip install pytest -cd to the `code` directory and run `pytest` -""" +#!/usr/bin/env python3 +"""To run these tests just execute this script.""" import json -from pathlib import Path +import unittest from robots import json_to_txt, json_to_table, json_to_htaccess +class RobotsUnittestExtensions: + def loadJson(self, pathname): + with open(pathname, "rt") as f: + return json.load(f) -def test_robots_txt_creation(): - robots_json = json.loads(Path("test_files/robots.json").read_text()) - robots_txt = json_to_txt(robots_json) - assert Path("test_files/robots.txt").read_text() == robots_txt + def assertEqualsFile(self, f, s): + with open(f, "rt") as f: + f_contents = f.read() + + return self.assertMultiLineEqual(f_contents, s) -def test_table_of_bot_metrices_md(): - robots_json = json.loads(Path("test_files/robots.json").read_text()) - robots_table = json_to_table(robots_json) - assert Path("test_files/table-of-bot-metrics.md").read_text() == robots_table +class TestRobotsTXTGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_robots_txt_generation(self): + robots_txt = json_to_txt(self.robots_dict) + self.assertEqualsFile("test_files/robots.txt", robots_txt) -def test_htaccess_creation(): - robots_json = json.loads(Path("test_files/robots.json").read_text()) - robots_htaccess = json_to_htaccess(robots_json) - assert Path("test_files/.htaccess").read_text() == robots_htaccess +class TestTableMetricsGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 32768 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_table_generation(self): + robots_table = json_to_table(self.robots_dict) + self.assertEqualsFile("test_files/table-of-bot-metrics.md", robots_table) + + +class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + 
self.robots_dict = self.loadJson("test_files/robots.json") + + def test_htaccess_generation(self): + robots_htaccess = json_to_htaccess(self.robots_dict) + self.assertEqualsFile("test_files/.htaccess", robots_htaccess) + + +if __name__ == "__main__": + import os + os.chdir(os.path.dirname(__file__)) + + unittest.main(verbosity=2) From c7c1e7b96fe74f90590f4d375c1bab4be53a4044 Mon Sep 17 00:00:00 2001 From: Dennis Camera Date: Tue, 18 Feb 2025 10:15:10 +0100 Subject: [PATCH 106/201] robots.py: Make executable --- code/robots.py | 2 ++ 1 file changed, 2 insertions(+) mode change 100644 => 100755 code/robots.py diff --git a/code/robots.py b/code/robots.py old mode 100644 new mode 100755 index 62fb061..6bf7920 --- a/code/robots.py +++ b/code/robots.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import json import re import requests From 1d55a205e4c8447829abdd34098ef9b0fedefee1 Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Tue, 18 Feb 2025 05:08:28 +0000 Subject: [PATCH 107/201] Document testing in README Fixes: https://github.com/ai-robots-txt/ai.robots.txt/issues/81 --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index a206c83..30a85da 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,11 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, and `.htaccess`. 
+You can run the tests by [installing](https://www.python.org/about/gettingstarted/) Python 3 and issuing: +```console +code/tests.py +``` + ## Subscribe to updates You can subscribe to list updates via RSS/Atom with the releases feed: From 8a7489633326465fd7e83fecece6740440d38eb6 Mon Sep 17 00:00:00 2001 From: Dennis Camera Date: Tue, 18 Feb 2025 10:23:40 +0100 Subject: [PATCH 108/201] Add workflow to run tests on pull request or push to main --- .github/workflows/run-tests.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/run-tests.yml diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml new file mode 100644 index 0000000..c98861f --- /dev/null +++ b/.github/workflows/run-tests.yml @@ -0,0 +1,21 @@ +on: + pull_request: + branches: + - main + push: + branches: + - main +jobs: + run-tests: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + with: + fetch-depth: 2 + - name: Install dependencies + run: | + pip install -U requests beautifulsoup4 + - name: Run tests + run: | + code/tests.py From 6ecfcdfcbfd1bd36da1982b7a4f9f95cbeb8101a Mon Sep 17 00:00:00 2001 From: deyigifts Date: Mon, 24 Mar 2025 14:16:57 +0800 Subject: [PATCH 109/201] Update perplexity bot Update based on perplexity bot docs --- robots.json | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/robots.json b/robots.json index cdc7bb5..eaac816 100644 --- a/robots.json +++ b/robots.json @@ -253,10 +253,17 @@ }, "PerplexityBot": { "operator": "[Perplexity](https://www.perplexity.ai/)", - "respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)", + "respect": "[Yes](https://docs.perplexity.ai/guides/bots)", + "function": "Search result generation.", + "frequency": "No information.", + "description": "Crawls sites to surface as results in Perplexity." 
+ }, + "Perplexity‑User": { + "operator": "[Perplexity](https://www.perplexity.ai/)", + "respect": "[No](https://docs.perplexity.ai/guides/bots)", "function": "Used to answer queries at the request of users.", - "frequency": "Takes action based on user prompts.", - "description": "Operated by Perplexity to obtain results in response to user queries." + "frequency": "Only when prompted by a user.", + "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response." }, "PetalBot": { "description": "Operated by Huawei to provide search and AI assistant services.", @@ -321,4 +328,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From da85207314724c02d151a7bdfcdca3ef3fd056a1 Mon Sep 17 00:00:00 2001 From: Thomas Leister Date: Thu, 27 Mar 2025 12:27:09 +0100 Subject: [PATCH 110/201] Implement new function "json_to_nginx" which outputs an Nginx configuration snippet --- code/robots.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/code/robots.py b/code/robots.py index 6bf7920..f58f2b8 100755 --- a/code/robots.py +++ b/code/robots.py @@ -152,6 +152,12 @@ def json_to_htaccess(robot_json): htaccess += "RewriteRule !^/?robots\\.txt$ - [F,L]\n" return htaccess +def json_to_nginx(robot_json): + # Creates an Nginx config file. This config snippet can be included in + # nginx server{} blocks to block AI bots. 
+ config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}" + return config + def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" @@ -178,6 +184,10 @@ def conversions(): file_name="./.htaccess", converter=json_to_htaccess, ) + update_file_if_changed( + file_name="./nginx-block-ai-bots.conf", + converter=json_to_nginx, + ) if __name__ == "__main__": From 5a312c5f4d1fcd89c17f4d6cb360ad7230857402 Mon Sep 17 00:00:00 2001 From: Thomas Leister Date: Thu, 27 Mar 2025 12:28:11 +0100 Subject: [PATCH 111/201] Mention Nginx config feature in README --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 30a85da..b984672 100644 --- a/README.md +++ b/README.md @@ -13,16 +13,19 @@ If you'd like to add information about a crawler to the list, please make a pull This repository provides the following files: - `robots.txt` - `.htaccess` +- `nginx-block-ai-bots.conf` `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). `.htaccess` may be used to configure web servers such as [Apache httpd](https://httpd.apache.org/) to return an error page when one of the listed AI crawlers sends a request to the web server. Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/current/howto/htaccess.html), more performant methods than an `.htaccess` file exist. +`nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive. + ## Contributing -A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, and `.htaccess`. +A note about contributing: updates should be added/made to `robots.json`. 
A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess` and `nginx-block-ai-bots.conf`. You can run the tests by [installing](https://www.python.org/about/gettingstarted/) Python 3 and issuing: ```console From 4f3f4cd0dd0f421c2787b1336d37b8da06998882 Mon Sep 17 00:00:00 2001 From: Thomas Leister Date: Thu, 27 Mar 2025 12:28:50 +0100 Subject: [PATCH 112/201] Add assembled version of nginx-block-ai-bots.conf file --- nginx-block-ai-bots.conf | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 nginx-block-ai-bots.conf diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf new file mode 100644 index 0000000..ce30520 --- /dev/null +++ b/nginx-block-ai-bots.conf @@ -0,0 +1,3 @@ +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { + return 403; +} \ No newline at end of file From 7c3b5a2cb21f5404cf4e2af1acf8689ba77d7b06 Mon Sep 17 00:00:00 2001 From: Thomas Leister Date: Thu, 27 Mar 2025 16:12:18 +0100 Subject: [PATCH 113/201] Add tests for Nginx config generator --- code/test_files/nginx-block-ai-bots.conf | 3 +++ code/tests.py | 12 +++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 code/test_files/nginx-block-ai-bots.conf diff --git a/code/test_files/nginx-block-ai-bots.conf b/code/test_files/nginx-block-ai-bots.conf new file mode 100644 index 0000000..d1b559e --- /dev/null +++ 
b/code/test_files/nginx-block-ai-bots.conf @@ -0,0 +1,3 @@ +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") { + return 403; +} \ No newline at end of file diff --git a/code/tests.py b/code/tests.py index 94cbb47..61d69b4 100755 --- a/code/tests.py +++ b/code/tests.py @@ -4,7 +4,7 @@ import json import unittest -from robots import json_to_txt, json_to_table, json_to_htaccess +from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx class RobotsUnittestExtensions: def loadJson(self, pathname): @@ -50,6 +50,16 @@ class TestHtaccessGeneration(unittest.TestCase, RobotsUnittestExtensions): robots_htaccess = json_to_htaccess(self.robots_dict) self.assertEqualsFile("test_files/.htaccess", robots_htaccess) +class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_nginx_generation(self): + robots_nginx = json_to_nginx(self.robots_dict) + self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx) + if __name__ == "__main__": import os From 68d1d93714bbe4931811f301c7030ca979d95b39 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 27 Mar 2025 19:29:30 +0000 Subject: [PATCH 114/201] Merge pull request #91 from deyigifts/perplexity-user Update perplexity bots --- .htaccess | 2 +- 
robots.txt | 1 + table-of-bot-metrics.md | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index 2313293..2f5d0e4 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/robots.txt b/robots.txt index 80c40e8..8c79fc2 100644 --- a/robots.txt +++ b/robots.txt @@ -35,6 +35,7 @@ User-agent: omgili User-agent: omgilibot User-agent: PanguBot User-agent: PerplexityBot +User-agent: Perplexity‑User User-agent: PetalBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 
ce82047..0cc2264 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -36,7 +36,8 @@ | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | -| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | +| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. 
| | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 6851413c52b91b9729bbbfd75f84af364b490bde Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 27 Mar 2025 19:49:15 +0000 Subject: [PATCH 115/201] Merge pull request #94 from ThomasLeister/feature/implement-nginx-configuration-snippet-export Implement Nginx configuration snippet export --- nginx-block-ai-bots.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index ce30520..72d65ec 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ 
Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file From ec18af76242c1b62bbbfc7e1df72098b423402a6 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 27 Mar 2025 12:51:22 -0700 Subject: [PATCH 116/201] Revert "Merge pull request #91 from deyigifts/perplexity-user" This reverts commit 68d1d93714bbe4931811f301c7030ca979d95b39. 
--- .htaccess | 2 +- robots.txt | 1 - table-of-bot-metrics.md | 3 +-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.htaccess b/.htaccess index 2f5d0e4..2313293 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/robots.txt b/robots.txt index 8c79fc2..80c40e8 100644 --- a/robots.txt +++ b/robots.txt @@ -35,7 +35,6 @@ User-agent: omgili User-agent: omgilibot User-agent: PanguBot User-agent: PerplexityBot -User-agent: Perplexity‑User User-agent: PetalBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md 
b/table-of-bot-metrics.md index 0cc2264..ce82047 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -36,8 +36,7 @@ | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | -| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | -| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. 
| | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From c249de99a317b54e8891f1682dbf514e7763986e Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 28 Mar 2025 00:54:28 +0000 Subject: [PATCH 117/201] Update from Dark Visitors --- robots.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/robots.json b/robots.json index eaac816..e907c8b 100644 --- a/robots.json +++ b/robots.json @@ -258,7 +258,7 @@ "frequency": "No information.", "description": "Crawls sites to surface as results in Perplexity." }, - "Perplexity‑User": { + "Perplexity\u2011User": { "operator": "[Perplexity](https://www.perplexity.ai/)", "respect": "[No](https://docs.perplexity.ai/guides/bots)", "function": "Used to answer queries at the request of users.", @@ -328,4 +328,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} +} \ No newline at end of file From 5b8650b99b35ff2aa1aa9ae26183b312edc48d45 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 29 Mar 2025 00:54:10 +0000 Subject: [PATCH 118/201] Update from Dark Visitors --- .htaccess | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index 2313293..2f5d0e4 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/robots.txt b/robots.txt index 80c40e8..8c79fc2 100644 --- a/robots.txt +++ b/robots.txt @@ -35,6 +35,7 @@ User-agent: 
omgili User-agent: omgilibot User-agent: PanguBot User-agent: PerplexityBot +User-agent: Perplexity‑User User-agent: PetalBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index ce82047..0cc2264 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -36,7 +36,8 @@ | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | -| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | +| PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. 
| +| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From ae8f74c10cec97ec758bf39345ff20717302c665 Mon Sep 17 00:00:00 2001 From: Kyle Buckingham Date: Tue, 1 Apr 2025 15:22:04 -0700 Subject: [PATCH 119/201] Update robots.json --- robots.json | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/robots.json b/robots.json index e907c8b..f711d43 100644 --- a/robots.json +++ b/robots.json @@ -69,13 +69,6 @@ "frequency": "Only when prompted by a user.", "description": "Used by plugins in ChatGPT to answer queries based on user input." 
}, - "Claude-Web": { - "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "Unclear at this time.", - "function": "Scrapes data to train Anthropic's AI products.", - "frequency": "No information provided.", - "description": "Scrapes data to train LLMs and AI products offered by Anthropic." - }, "ClaudeBot": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", @@ -83,6 +76,20 @@ "frequency": "No information provided.", "description": "Scrapes data to train LLMs and AI products offered by Anthropic." }, + "Claude-User": { + "operator": "[Anthropic](https://www.anthropic.com)", + "respect": "Unclear at this time.", + "function": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent.", + "frequency": "No information provided.", + "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent." + }, + "Claude-SearchBot": { + "operator": "[Anthropic](https://www.anthropic.com)", + "respect": "Unclear at this time.", + "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.", + "frequency": "No information provided.", + "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses." + }, "cohere-ai": { "operator": "[Cohere](https://cohere.com)", "respect": "Unclear at this time.", @@ -328,4 +335,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} \ No newline at end of file +} From 8dc36aa2e2bbc9b99bde043b593cdc6c9669f401 Mon Sep 17 00:00:00 2001 From: Kyle Buckingham Date: Tue, 1 Apr 2025 15:23:28 -0700 Subject: [PATCH 120/201] Update robots.txt --- robots.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/robots.txt b/robots.txt index 8c79fc2..e19468d 100644 --- a/robots.txt +++ b/robots.txt @@ -8,8 +8,9 @@ User-agent: Brightbot 1.0 User-agent: Bytespider User-agent: CCBot User-agent: ChatGPT-User -User-agent: Claude-Web User-agent: ClaudeBot +User-agent: Claude-User +User-agent: Claude-SearchBot User-agent: cohere-ai User-agent: cohere-training-data-crawler User-agent: Crawlspace From 6b0349f37ddf69ef9ec0e09a884b351f4a0e4b43 Mon Sep 17 00:00:00 2001 From: Frederic Barthelemy Date: Fri, 4 Apr 2025 15:20:30 -0700 Subject: [PATCH 121/201] fix python complaining about f-string syntax ``` python code/tests.py Traceback (most recent call last): File "/Users/fbarthelemy/Code/ai.robots.txt/code/tests.py", line 7, in from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx File "/Users/fbarthelemy/Code/ai.robots.txt/code/robots.py", line 144 return f"({"|".join(map(re.escape, lst))})" ^ SyntaxError: f-string: expecting '}' ``` --- code/robots.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code/robots.py b/code/robots.py index f58f2b8..90c0e8c 100755 --- a/code/robots.py +++ b/code/robots.py @@ -141,7 +141,8 @@ def json_to_table(robots_json): def list_to_pcre(lst): # Python re is not 100% identical to PCRE which is used by Apache, but it # should probably be close enough in the real world for re.escape to work. 
- return f"({"|".join(map(re.escape, lst))})" + formatted = "|".join(map(re.escape, lst)) + return f"({formatted})" def json_to_htaccess(robot_json): From 5f5a89c38c27b676c3212f6ea3895d31f315f37e Mon Sep 17 00:00:00 2001 From: Frederic Barthelemy Date: Fri, 4 Apr 2025 17:34:14 -0700 Subject: [PATCH 122/201] Fix html-mangled hyphen in Perplexity-Users Fixes: #99 --- .htaccess | 2 +- code/robots.py | 15 +++++++++++++++ code/test_files/.htaccess | 2 +- code/test_files/nginx-block-ai-bots.conf | 2 +- code/test_files/robots.json | 7 +++++++ code/test_files/robots.txt | 1 + code/test_files/table-of-bot-metrics.md | 1 + code/tests.py | 5 +++++ nginx-block-ai-bots.conf | 2 +- robots.json | 14 +++++++------- robots.txt | 2 +- table-of-bot-metrics.md | 2 +- 12 files changed, 42 insertions(+), 13 deletions(-) diff --git a/.htaccess b/.htaccess index 2f5d0e4..27a7e11 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/code/robots.py b/code/robots.py index 90c0e8c..d158b36 100755 --- a/code/robots.py +++ b/code/robots.py @@ -50,6 +50,7 @@ def updated_robots_json(soup): continue for agent in section.find_all("a", href=True): name = agent.find("div", {"class": "agent-name"}).get_text().strip() + name = clean_robot_name(name) desc = agent.find("p").get_text().strip() default_values = { @@ -101,6 +102,20 @@ def updated_robots_json(soup): return sorted_robots +def clean_robot_name(name): + """ Clean the robot name by removing some characters that were mangled by html software once. """ + # This was specifically spotted in "Perplexity-User" + # Looks like a non-breaking hyphen introduced by the HTML rendering software + # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots + # You can see the bot is listed several times as "Perplexity‑User" with a normal hyphen, + # and it's only the Row-Heading that has the special hyphen + # + # Technically, there's no reason there wouldn't someday be a bot that + # actually uses a non-breaking hyphen, but that seems unlikely, + # so this solution should be fine for now. 
+ return re.sub(r"\u2011", "-", name) + + def ingest_darkvisitors(): old_robots_json = load_robots_json() soup = get_agent_soup() diff --git a/code/test_files/.htaccess b/code/test_files/.htaccess index 7e39092..f0d6783 100644 --- a/code/test_files/.htaccess +++ b/code/test_files/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/code/test_files/nginx-block-ai-bots.conf b/code/test_files/nginx-block-ai-bots.conf index d1b559e..c569b15 100644 --- a/code/test_files/nginx-block-ai-bots.conf +++ b/code/test_files/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if 
($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)") { return 403; } \ No newline at end of file diff --git a/code/test_files/robots.json b/code/test_files/robots.json index b0cbfbb..385f284 100644 --- a/code/test_files/robots.json +++ b/code/test_files/robots.json @@ -223,6 +223,13 @@ "operator": "[Webz.io](https://webz.io/)", "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)" }, + "Perplexity-User": { + "operator": "[Perplexity](https://www.perplexity.ai/)", + "respect": "[No](https://docs.perplexity.ai/guides/bots)", + "function": "Used to answer queries at the request of users.", + "frequency": "Only when prompted by a 
user.", + "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response." + }, "PerplexityBot": { "operator": "[Perplexity](https://www.perplexity.ai/)", "respect": "[No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/)", diff --git a/code/test_files/robots.txt b/code/test_files/robots.txt index 03c3c25..ee201f8 100644 --- a/code/test_files/robots.txt +++ b/code/test_files/robots.txt @@ -30,6 +30,7 @@ User-agent: Meta-ExternalFetcher User-agent: OAI-SearchBot User-agent: omgili User-agent: omgilibot +User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot User-agent: Scrapy diff --git a/code/test_files/table-of-bot-metrics.md b/code/test_files/table-of-bot-metrics.md index 88af6c0..9b280aa 100644 --- a/code/test_files/table-of-bot-metrics.md +++ b/code/test_files/table-of-bot-metrics.md @@ -32,6 +32,7 @@ | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | +| Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. 
| Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [No](https://www.macstories.net/stories/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler/) | Used to answer queries at the request of users. | Takes action based on user prompts. | Operated by Perplexity to obtain results in response to user queries. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | diff --git a/code/tests.py b/code/tests.py index 61d69b4..f58b445 100755 --- a/code/tests.py +++ b/code/tests.py @@ -60,6 +60,11 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): robots_nginx = json_to_nginx(self.robots_dict) self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx) +class TestRobotsNameCleaning(unittest.TestCase): + def test_clean_name(self): + from robots import clean_robot_name + + self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User") if __name__ == "__main__": import os diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 72d65ec..0577bd9 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|PerplexityBot|Perplexity‑User|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.json b/robots.json index e907c8b..8fd7572 100644 --- a/robots.json +++ b/robots.json @@ -251,6 +251,13 @@ "frequency": "Unclear at this time.", "description": "PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. 
More info can be found at https://darkvisitors.com/agents/agents/pangubot" }, + "Perplexity-User": { + "operator": "[Perplexity](https://www.perplexity.ai/)", + "respect": "[No](https://docs.perplexity.ai/guides/bots)", + "function": "Used to answer queries at the request of users.", + "frequency": "Only when prompted by a user.", + "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response." + }, "PerplexityBot": { "operator": "[Perplexity](https://www.perplexity.ai/)", "respect": "[Yes](https://docs.perplexity.ai/guides/bots)", @@ -258,13 +265,6 @@ "frequency": "No information.", "description": "Crawls sites to surface as results in Perplexity." }, - "Perplexity\u2011User": { - "operator": "[Perplexity](https://www.perplexity.ai/)", - "respect": "[No](https://docs.perplexity.ai/guides/bots)", - "function": "Used to answer queries at the request of users.", - "frequency": "Only when prompted by a user.", - "description": "Visit web pages to help provide an accurate answer and include links to the page in Perplexity response." - }, "PetalBot": { "description": "Operated by Huawei to provide search and AI assistant services.", "frequency": "No explicit frequency provided.", diff --git a/robots.txt b/robots.txt index 8c79fc2..c531918 100644 --- a/robots.txt +++ b/robots.txt @@ -34,8 +34,8 @@ User-agent: OAI-SearchBot User-agent: omgili User-agent: omgilibot User-agent: PanguBot +User-agent: Perplexity-User User-agent: PerplexityBot -User-agent: Perplexity‑User User-agent: PetalBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 0cc2264..d92df34 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -36,8 +36,8 @@ | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. 
| Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | +| Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | -| Perplexity‑User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. 
| "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From c6f308cbd0a00166f5085fa4adc98630c767e11e Mon Sep 17 00:00:00 2001 From: Frederic Barthelemy Date: Sat, 5 Apr 2025 09:01:52 -0700 Subject: [PATCH 123/201] PR Feedback: log special-case, comment consistency --- code/robots.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/code/robots.py b/code/robots.py index d158b36..86ea413 100755 --- a/code/robots.py +++ b/code/robots.py @@ -107,13 +107,16 @@ def clean_robot_name(name): # This was specifically spotted in "Perplexity-User" # Looks like a non-breaking hyphen introduced by the HTML rendering software # Reading the source page for Perplexity: https://docs.perplexity.ai/guides/bots - # You can see the bot is listed several times as "Perplexity‑User" with a normal hyphen, + # You can see the bot is listed several times as "Perplexity-User" with a normal hyphen, # and it's only the Row-Heading that has the special hyphen # # Technically, there's no reason there wouldn't someday be a bot that # actually uses a non-breaking hyphen, but that seems unlikely, # so this solution should be fine for now. 
- return re.sub(r"\u2011", "-", name) + result = re.sub(r"\u2011", "-", name) + if result != name: + print(f"\tCleaned '{name}' to '{result}' - unicode/html mangled chars normalized.") + return result def ingest_darkvisitors(): From b65f45e408461560a32f44f05860f80655737467 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 10 Apr 2025 10:12:51 -0700 Subject: [PATCH 124/201] chore(robots.json): adds imgproxy crawler --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 8fd7572..4c9f7d7 100644 --- a/robots.json +++ b/robots.json @@ -195,6 +195,13 @@ "operator": "[img2dataset](https://github.com/rom1504/img2dataset)", "respect": "Unclear at this time." }, + "imgproxy": { + "frequency": "No information.", + "function": "Not documented or explained on operator's site.", + "operator": "[imgproxy](https://imgproxy.net)", + "respect": "Unclear at this time.", + "description": "AI-powered image processing." + }, "ISSCyberRiskCrawler": { "description": "Used to train machine learning based models to quantify cyber risk.", "frequency": "No information.", @@ -328,4 +335,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} \ No newline at end of file +} From 4a764bba18f10167cb5f7107c8721e5dc208100f Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 10 Apr 2025 19:22:34 +0000 Subject: [PATCH 125/201] Merge pull request #102 from ai-robots-txt/imgproxy-bot chore(robots.json): adds imgproxy crawler --- .htaccess | 2 +- nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index 27a7e11..c0e5fbb 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff 
--git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 0577bd9..a6bbfa2 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index c531918..de25a56 100644 --- a/robots.txt +++ b/robots.txt @@ -26,6 +26,7 @@ User-agent: iaskspider/2.0 User-agent: ICC-Crawler User-agent: ImagesiftBot User-agent: img2dataset +User-agent: imgproxy User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot User-agent: Meta-ExternalAgent diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index d92df34..b3e51fe 
100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -28,6 +28,7 @@ | ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | +| imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. | | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. 
| "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | From 305188b2e78855d4e7193f29a3e7205f96fa86f6 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 11 Apr 2025 00:55:52 +0000 Subject: [PATCH 126/201] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 4c9f7d7..eff38ac 100644 --- a/robots.json +++ b/robots.json @@ -335,4 +335,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From d9f882a9b21170754c4b37ff1bbc237171876684 Mon Sep 17 00:00:00 2001 From: Joshua Sheard Date: Mon, 14 Apr 2025 15:46:01 +0100 Subject: [PATCH 127/201] Include "AI Agents" from Dark Visitors --- code/robots.py | 1 + 1 file changed, 1 insertion(+) diff --git a/code/robots.py b/code/robots.py index 86ea413..8a06b55 100755 --- a/code/robots.py +++ b/code/robots.py @@ -30,6 +30,7 @@ def updated_robots_json(soup): """Update AI scraper information with data from darkvisitors.""" existing_content = load_robots_json() to_include = [ + "AI Agents", "AI Assistants", "AI Data Scrapers", "AI Search Crawlers", From a96e33098975edf1c05c8d9684b36b9fa31f7ef2 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 15 Apr 2025 00:57:01 +0000 Subject: [PATCH 128/201] Update from Dark Visitors --- robots.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/robots.json b/robots.json index eff38ac..8bba6b2 100644 --- a/robots.json +++ b/robots.json @@ -230,6 +230,13 @@ "frequency": "Unclear at this time.", "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. 
More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" }, + "NovaAct": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Agents", + "frequency": "Unclear at this time.", + "description": "Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact" + }, "OAI-SearchBot": { "operator": "[OpenAI](https://openai.com)", "respect": "[Yes](https://platform.openai.com/docs/bots)", @@ -251,6 +258,13 @@ "operator": "[Webz.io](https://webz.io/)", "respect": "[Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html)" }, + "Operator": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Agents", + "frequency": "Unclear at this time.", + "description": "Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. 
More info can be found at https://darkvisitors.com/agents/agents/operator" + }, "PanguBot": { "operator": "the Chinese company Huawei", "respect": "Unclear at this time.", From e0cdb278fbd243f554579fe5050850f124b286a8 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 16 Apr 2025 00:57:11 +0000 Subject: [PATCH 129/201] Update from Dark Visitors --- .htaccess | 2 +- nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index c0e5fbb..d10e796 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ 
bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index a6bbfa2..c37cef5 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index de25a56..1e3aa80 100644 --- a/robots.txt +++ b/robots.txt @@ -31,9 +31,11 @@ User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot User-agent: Meta-ExternalAgent User-agent: Meta-ExternalFetcher +User-agent: NovaAct User-agent: 
OAI-SearchBot User-agent: omgili User-agent: omgilibot +User-agent: Operator User-agent: PanguBot User-agent: Perplexity-User User-agent: PerplexityBot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index b3e51fe..4c87b41 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -33,9 +33,11 @@ | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact | | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. 
| No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | +| Operator | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. 
| From 4a6f37d72718aeb44d1d8cbcccb740ace3fe82d6 Mon Sep 17 00:00:00 2001 From: Kyle Buckingham Date: Wed, 16 Apr 2025 16:42:58 -0700 Subject: [PATCH 130/201] Update robots.json Co-authored-by: Glyn Normington --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index f711d43..ba052ae 100644 --- a/robots.json +++ b/robots.json @@ -78,7 +78,7 @@ }, "Claude-User": { "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "Unclear at this time.", + "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", "function": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent.", "frequency": "No information provided.", "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent." From fd41de8522536a25de71f37310a05e77d71a0792 Mon Sep 17 00:00:00 2001 From: Kyle Buckingham Date: Wed, 16 Apr 2025 16:43:03 -0700 Subject: [PATCH 131/201] Update robots.json Co-authored-by: Glyn Normington --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index ba052ae..ca6fc40 100644 --- a/robots.json +++ b/robots.json @@ -85,7 +85,7 @@ }, "Claude-SearchBot": { "operator": "[Anthropic](https://www.anthropic.com)", - "respect": "Unclear at this time.", + "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.", "frequency": "No information provided.", "description": "Claude-SearchBot navigates the web to improve search result quality for users. 
It analyzes online content specifically to enhance the relevance and accuracy of search responses." From d05ede8fe164c5c3f47acecc1343e6d2ea5c294b Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Fri, 18 Apr 2025 17:46:56 +0100 Subject: [PATCH 132/201] Clarify our position on sponsorship Some firms, including those with .ai domains, have offered to sponsor this project. So make our position clear. --- FAQ.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/FAQ.md b/FAQ.md index 967cf41..487f784 100644 --- a/FAQ.md +++ b/FAQ.md @@ -55,3 +55,7 @@ That depends on your stack. ## How can I contribute? Open a pull request. It will be reviewed and acted upon appropriately. **We really appreciate contributions** — this is a community effort. + +## Can my company sponsor ai.robots.txt? + +No, thank you. We do not accept sponsorship of any kind. We prefer to maintain our independence. Our costs are negligible as we are entirely volunteer-based and community-driven. From b1856e6988a93bd834b228f121fa3524d11c7be7 Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Fri, 18 Apr 2025 18:40:44 +0100 Subject: [PATCH 133/201] Donations --- FAQ.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/FAQ.md b/FAQ.md index 487f784..7264819 100644 --- a/FAQ.md +++ b/FAQ.md @@ -56,6 +56,10 @@ That depends on your stack. Open a pull request. It will be reviewed and acted upon appropriately. **We really appreciate contributions** — this is a community effort. +## I'd like to donate money + +That's kind of you, but we don't need your money. If you insist, we'd love you to make a donation to the [American Civil Liberties Union](https://www.aclu.org/), the [Disasters Emergency Committee](https://www.dec.org.uk/), or a similar organisation. + ## Can my company sponsor ai.robots.txt? No, thank you. We do not accept sponsorship of any kind. We prefer to maintain our independence. Our costs are negligible as we are entirely volunteer-based and community-driven. 
From 33c5ce1326367abecccad23742779783c10c36a1 Mon Sep 17 00:00:00 2001 From: Dennis Lee Date: Mon, 21 Apr 2025 18:55:11 +0100 Subject: [PATCH 134/201] Update robots.json Updated robots list with five new proposed AI bots: aiHitBot Cotoyogi Factset_spyderbot FirecrawlAgent TikTokSpider --- robots.json | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 8bba6b2..698b31e 100644 --- a/robots.json +++ b/robots.json @@ -13,6 +13,13 @@ "operator": "[Ai2](https://allenai.org/crawler)", "respect": "Yes" }, + "aiHitBot": { + "operator": "[aiHit](https://www.aihitdata.com/about)", + "respect": "Yes", + "function": "A massive, artificial intelligence/machine learning, automated system.", + "frequency": "No information provided.", + "description": "Scrapes data for AI systems." + }, "Amazonbot": { "operator": "Amazon", "respect": "Yes", @@ -97,6 +104,13 @@ "frequency": "Unclear at this time.", "description": "cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler" }, + "Cotoyogi": { + "operator": "[ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/)", + "respect": "Yes", + "function": "AI LLM Scraper.", + "frequency": "No information provided.", + "description": "Scrapes data for AI training in Japanese language." + }, "Crawlspace": { "operator": "[Crawlspace](https://crawlspace.dev)", "respect": "[Yes](https://news.ycombinator.com/item?id=42756654)", @@ -125,6 +139,20 @@ "frequency": "Up to 1 page per second", "description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically." 
}, + "Factset_spyderbot": { + "operator": "[Factset](https://www.factset.com/ai)", + "respect": "Unclear at this time.", + "function": "AI model training.", + "frequency": "No information provided.", + "description": "Scrapes data for AI training." + }, + "FirecrawlAgent": { + "operator": "[Firecrawl](https://www.firecrawl.dev/)", + "respect": "Yes", + "function": "AI scraper and LLM training", + "frequency": "No information provided.", + "description": "Scrapes data for AI systems and LLM training." + }, "FriendlyCrawler": { "description": "Unclear who the operator is; but data is used for training/machine learning.", "frequency": "Unclear at this time.", @@ -321,6 +349,13 @@ "operator": "[Sidetrade](https://www.sidetrade.com)", "respect": "Unclear at this time." }, + "TikTokSpider": { + "operator": "ByteDance", + "respect": "Unclear at this time.", + "function": "LLM training.", + "frequency": "Unclear at this time.", + "description": "Downloads data to train LLMS, as per Bytespider." + }, "Timpibot": { "operator": "[Timpi](https://timpi.io)", "respect": "Unclear at this time.", @@ -349,4 +384,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} \ No newline at end of file +} From bbec639c14f3e7258729718dd2c6fc0b1734a9b1 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 22 Apr 2025 14:50:26 +0000 Subject: [PATCH 135/201] Merge pull request #109 from dennislee1/patch-1 AI bots to consider adding --- .htaccess | 2 +- nginx-block-ai-bots.conf | 2 +- robots.txt | 5 +++++ table-of-bot-metrics.md | 5 +++++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index d10e796..b4ab72f 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ 
bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index c37cef5..090275a 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 1e3aa80..1f8eaf2 100644 --- a/robots.txt +++ b/robots.txt @@ -1,5 +1,6 @@ User-agent: AI2Bot User-agent: Ai2Bot-Dolma +User-agent: aiHitBot 
User-agent: Amazonbot User-agent: anthropic-ai User-agent: Applebot @@ -12,10 +13,13 @@ User-agent: Claude-Web User-agent: ClaudeBot User-agent: cohere-ai User-agent: cohere-training-data-crawler +User-agent: Cotoyogi User-agent: Crawlspace User-agent: Diffbot User-agent: DuckAssistBot User-agent: FacebookBot +User-agent: Factset_spyderbot +User-agent: FirecrawlAgent User-agent: FriendlyCrawler User-agent: Google-Extended User-agent: GoogleOther @@ -44,6 +48,7 @@ User-agent: Scrapy User-agent: SemrushBot-OCOB User-agent: SemrushBot-SWA User-agent: Sidetrade indexer bot +User-agent: TikTokSpider User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 4c87b41..0249766 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -2,6 +2,7 @@ |------|----------|-----------------------|----------|------------------|-------------| | AI2Bot | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | | Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | +| aiHitBot | [aiHit](https://www.aihitdata.com/about) | Yes | A massive, artificial intelligence/machine learning, automated system. | No information provided. | Scrapes data for AI systems. | | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | | anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. 
| Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | @@ -14,10 +15,13 @@ | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | +| Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. | | Crawlspace | [Crawlspace](https://crawlspace.dev) | [Yes](https://news.ycombinator.com/item?id=42756654) | Scrapes data | Unclear at this time. | Provides crawling services for any purpose, probably including AI model training. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. 
| Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. | +| FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. | | Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." 
| @@ -46,6 +50,7 @@ | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | +| TikTokSpider | ByteDance | Unclear at this time. | LLM training. | Unclear at this time. | Downloads data to train LLMS, as per Bytespider. | | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. | | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. 
More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | From 8d25a424d96dbf4a3cb12d4bb51929a764aa8f89 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Wed, 23 Apr 2025 00:56:52 +0000 Subject: [PATCH 136/201] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 698b31e..df9dcda 100644 --- a/robots.json +++ b/robots.json @@ -384,4 +384,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 9d846ced45cdd13c7ecc03353c6ec554b5f9015d Mon Sep 17 00:00:00 2001 From: maia Date: Thu, 24 Apr 2025 04:08:20 +0200 Subject: [PATCH 137/201] Update robots.json Lowercase meta-external* as that was not technically the UA for the bots, also removed a period in the "respect" for consistency --- robots.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/robots.json b/robots.json index df9dcda..f2cec91 100644 --- a/robots.json +++ b/robots.json @@ -244,14 +244,14 @@ "frequency": "Unclear at this time.", "description": "Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. 
More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot" }, - "Meta-ExternalAgent": { + "meta-externalagent": { "operator": "[Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers)", - "respect": "Yes.", + "respect": "Yes", "function": "Used to train models and improve products.", "frequency": "No information.", "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\"" }, - "Meta-ExternalFetcher": { + "meta-externalfetcher": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", "function": "AI Assistants", @@ -384,4 +384,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 4654e14e9c857a228289d3258835182838202503 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 24 Apr 2025 07:00:34 +0000 Subject: [PATCH 138/201] Merge pull request #112 from maiavixen/main Fixed meta-external* being titlecase, and removed period for consistency --- .htaccess | 2 +- nginx-block-ai-bots.conf | 2 +- robots.txt | 4 ++-- table-of-bot-metrics.md | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.htaccess b/.htaccess index b4ab72f..a97f98a 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ 
Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 090275a..3320071 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ 
bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 1f8eaf2..53291ca 100644 --- a/robots.txt +++ b/robots.txt @@ -33,8 +33,8 @@ User-agent: img2dataset User-agent: imgproxy User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot -User-agent: Meta-ExternalAgent -User-agent: Meta-ExternalFetcher +User-agent: meta-externalagent +User-agent: meta-externalfetcher User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 0249766..5c093b8 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -35,8 +35,8 @@ | imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. | | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. 
| Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | -| Meta\-ExternalAgent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes. | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | -| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact | | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. 
| No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | From 934ac7b31864d0cf0b21bb1580ddea97ec8b4994 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 25 Apr 2025 00:56:57 +0000 Subject: [PATCH 139/201] Update from Dark Visitors --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index f2cec91..e15a196 100644 --- a/robots.json +++ b/robots.json @@ -251,6 +251,13 @@ "frequency": "No information.", "description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\"" }, + "Meta-ExternalAgent": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent" + }, "meta-externalfetcher": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", @@ -258,6 +265,13 @@ "frequency": "Unclear at this time.", "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. 
More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" }, + "Meta-ExternalFetcher": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Assistants", + "frequency": "Unclear at this time.", + "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" + }, "NovaAct": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", @@ -384,4 +398,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From c6c7f1748f1e28053184539a70a6a08f5aeabc37 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 26 Apr 2025 00:55:12 +0000 Subject: [PATCH 140/201] Update from Dark Visitors --- .htaccess | 2 +- nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.htaccess b/.htaccess index a97f98a..586adab 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] 
+RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 3320071..fc58d61 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|meta\-externalfetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 53291ca..232e119 100644 --- a/robots.txt +++ b/robots.txt @@ -34,7 +34,9 @@ User-agent: imgproxy User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot User-agent: meta-externalagent +User-agent: Meta-ExternalAgent User-agent: meta-externalfetcher +User-agent: Meta-ExternalFetcher User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 5c093b8..4dd6076 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -36,7 +36,9 @@ | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. 
| "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | +| Meta\-ExternalAgent | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent | | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact | | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. 
Data also sold for research purposes or LLM training. | From 50e739dd738bb821018a863491b770dd8ee61155 Mon Sep 17 00:00:00 2001 From: Rik Wijnen Date: Mon, 28 Apr 2025 08:42:52 +0200 Subject: [PATCH 141/201] HAProxy converter added. --- README.md | 9 +++++++ code/robots.py | 9 +++++++ haproxy-block-ai-bots.txt | 57 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) create mode 100644 haproxy-block-ai-bots.txt diff --git a/README.md b/README.md index b984672..1f1eff6 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ This repository provides the following files: - `robots.txt` - `.htaccess` - `nginx-block-ai-bots.conf` +- `haproxy-block-ai-bots.txt` `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). @@ -22,6 +23,14 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive. +`haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it; +1. Add the file to the config directory of HAProxy +2. Add the following lines in the `frontend` section; + ``` + acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt + http-request deny if ai_robot + ``` + (Note that the path of the `haproxy-block-ai-bots.txt` may be different on your environment.) ## Contributing diff --git a/code/robots.py b/code/robots.py index 8a06b55..da157c1 100755 --- a/code/robots.py +++ b/code/robots.py @@ -178,6 +178,11 @@ def json_to_nginx(robot_json): config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}" return config +def json_to_haproxy(robots_json): + # Creates a source file for HAProxy. Follow instructions in the README to implement it. 
+ txt = "\n".join(f"{k}" for k in robots_json.keys()) + return txt + def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" @@ -208,6 +213,10 @@ def conversions(): file_name="./nginx-block-ai-bots.conf", converter=json_to_nginx, ) + update_file_if_changed( + file_name="./haproxy-block-ai-bots.txt", + converter=json_to_haproxy, + ) if __name__ == "__main__": diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt new file mode 100644 index 0000000..3c326bd --- /dev/null +++ b/haproxy-block-ai-bots.txt @@ -0,0 +1,57 @@ +AI2Bot +Ai2Bot-Dolma +aiHitBot +Amazonbot +anthropic-ai +Applebot +Applebot-Extended +Brightbot 1.0 +Bytespider +CCBot +ChatGPT-User +Claude-Web +ClaudeBot +cohere-ai +cohere-training-data-crawler +Cotoyogi +Crawlspace +Diffbot +DuckAssistBot +FacebookBot +Factset_spyderbot +FirecrawlAgent +FriendlyCrawler +Google-Extended +GoogleOther +GoogleOther-Image +GoogleOther-Video +GPTBot +iaskspider/2.0 +ICC-Crawler +ImagesiftBot +img2dataset +imgproxy +ISSCyberRiskCrawler +Kangaroo Bot +meta-externalagent +Meta-ExternalAgent +meta-externalfetcher +Meta-ExternalFetcher +NovaAct +OAI-SearchBot +omgili +omgilibot +Operator +PanguBot +Perplexity-User +PerplexityBot +PetalBot +Scrapy +SemrushBot-OCOB +SemrushBot-SWA +Sidetrade indexer bot +TikTokSpider +Timpibot +VelenPublicWebCrawler +Webzio-Extended +YouBot \ No newline at end of file From 66da70905f503239faeb0e49204776f508928048 Mon Sep 17 00:00:00 2001 From: Rik Wijnen Date: Mon, 28 Apr 2025 09:09:40 +0200 Subject: [PATCH 142/201] Fixed incorrect English sentence. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1f1eff6..ff124e3 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ acl ai_robot hdr_sub(user-agent) -i -f /etc/haproxy/haproxy-block-ai-bots.txt http-request deny if ai_robot ``` - (Note that the path of the `haproxy-block-ai-bots.txt` may be different on your environment.) + (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.) ## Contributing From a4a9f2ac2b9116d104789664231af4017d3828a7 Mon Sep 17 00:00:00 2001 From: Rik Wijnen Date: Mon, 28 Apr 2025 09:30:26 +0200 Subject: [PATCH 143/201] Tests for HAProxy file added. --- code/test_files/haproxy-block-ai-bots.txt | 47 +++++++++++++++++++++++ code/tests.py | 12 +++++- 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 code/test_files/haproxy-block-ai-bots.txt diff --git a/code/test_files/haproxy-block-ai-bots.txt b/code/test_files/haproxy-block-ai-bots.txt new file mode 100644 index 0000000..5ed6939 --- /dev/null +++ b/code/test_files/haproxy-block-ai-bots.txt @@ -0,0 +1,47 @@ +AI2Bot +Ai2Bot-Dolma +Amazonbot +anthropic-ai +Applebot +Applebot-Extended +Bytespider +CCBot +ChatGPT-User +Claude-Web +ClaudeBot +cohere-ai +Diffbot +FacebookBot +facebookexternalhit +FriendlyCrawler +Google-Extended +GoogleOther +GoogleOther-Image +GoogleOther-Video +GPTBot +iaskspider/2.0 +ICC-Crawler +ImagesiftBot +img2dataset +ISSCyberRiskCrawler +Kangaroo Bot +Meta-ExternalAgent +Meta-ExternalFetcher +OAI-SearchBot +omgili +omgilibot +Perplexity-User +PerplexityBot +PetalBot +Scrapy +Sidetrade indexer bot +Timpibot +VelenPublicWebCrawler +Webzio-Extended +YouBot +crawler.with.dots +star***crawler +Is this a crawler? 
+a[mazing]{42}(robot) +2^32$ +curl|sudo bash \ No newline at end of file diff --git a/code/tests.py b/code/tests.py index f58b445..e179c44 100755 --- a/code/tests.py +++ b/code/tests.py @@ -4,7 +4,7 @@ import json import unittest -from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx +from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy class RobotsUnittestExtensions: def loadJson(self, pathname): @@ -60,6 +60,16 @@ class TestNginxConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): robots_nginx = json_to_nginx(self.robots_dict) self.assertEqualsFile("test_files/nginx-block-ai-bots.conf", robots_nginx) +class TestHaproxyConfigGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_haproxy_generation(self): + robots_haproxy = json_to_haproxy(self.robots_dict) + self.assertEqualsFile("test_files/haproxy-block-ai-bots.txt", robots_haproxy) + class TestRobotsNameCleaning(unittest.TestCase): def test_clean_name(self): from robots import clean_robot_name From 1310dbae4656e212ff01e7d8530d78c76dfd5a9f Mon Sep 17 00:00:00 2001 From: Crazyroostereye <63781667+Crazyroostereye1@users.noreply.github.com> Date: Thu, 1 May 2025 12:21:32 +0200 Subject: [PATCH 144/201] Added a Caddyfile converter (#110) Co-authored-by: Julian Beittel Co-authored-by: Glyn Normington --- Caddyfile | 3 +++ README.md | 3 +++ code/robots.py | 12 ++++++++++++ code/test_files/Caddyfile | 3 +++ code/tests.py | 13 ++++++++++++- 5 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 Caddyfile create mode 100644 code/test_files/Caddyfile diff --git a/Caddyfile b/Caddyfile new file mode 100644 index 0000000..1857d75 --- /dev/null +++ b/Caddyfile @@ -0,0 +1,3 @@ +@aibots { + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" +} \ No newline at end of file diff --git a/README.md b/README.md index ff124e3..8d7bfb1 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ This repository provides the following files: - `robots.txt` - `.htaccess` - `nginx-block-ai-bots.conf` +- `Caddyfile` - `haproxy-block-ai-bots.txt` `robots.txt` implements the Robots Exclusion Protocol ([RFC 9309](https://www.rfc-editor.org/rfc/rfc9309.html)). @@ -23,6 +24,8 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ `nginx-block-ai-bots.conf` implements a Nginx configuration snippet that can be included in any virtual host `server {}` block via the `include` directive. +`Caddyfile` includes a Header Regex matcher group you can copy or import into your Caddyfile, the rejection can then be handled as followed `abort @aibots` + `haproxy-block-ai-bots.txt` may be used to configure HAProxy to block AI bots. To implement it; 1. Add the file to the config directory of HAProxy 2. 
Add the following lines in the `frontend` section; diff --git a/code/robots.py b/code/robots.py index da157c1..054c2be 100755 --- a/code/robots.py +++ b/code/robots.py @@ -178,12 +178,20 @@ def json_to_nginx(robot_json): config = f"if ($http_user_agent ~* \"{list_to_pcre(robot_json.keys())}\") {{\n return 403;\n}}" return config + +def json_to_caddy(robot_json): + caddyfile = "@aibots {\n " + caddyfile += f' header_regexp User-Agent "{list_to_pcre(robot_json.keys())}"' + caddyfile += "\n}" + return caddyfile + def json_to_haproxy(robots_json): # Creates a source file for HAProxy. Follow instructions in the README to implement it. txt = "\n".join(f"{k}" for k in robots_json.keys()) return txt + def update_file_if_changed(file_name, converter): """Update files if newer content is available and log the (in)actions.""" new_content = converter(load_robots_json()) @@ -213,6 +221,10 @@ def conversions(): file_name="./nginx-block-ai-bots.conf", converter=json_to_nginx, ) + update_file_if_changed( + file_name="./Caddyfile", + converter=json_to_caddy + update_file_if_changed( file_name="./haproxy-block-ai-bots.txt", converter=json_to_haproxy, diff --git a/code/test_files/Caddyfile b/code/test_files/Caddyfile new file mode 100644 index 0000000..82f365a --- /dev/null +++ b/code/test_files/Caddyfile @@ -0,0 +1,3 @@ +@aibots { + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|Diffbot|FacebookBot|facebookexternalhit|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|OAI\-SearchBot|omgili|omgilibot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|Sidetrade\ indexer\ bot|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot|crawler\.with\.dots|star\*\*\*crawler|Is\ this\ a\ 
crawler\?|a\[mazing\]\{42\}\(robot\)|2\^32\$|curl\|sudo\ bash)" +} \ No newline at end of file diff --git a/code/tests.py b/code/tests.py index e179c44..434406f 100755 --- a/code/tests.py +++ b/code/tests.py @@ -4,7 +4,7 @@ import json import unittest -from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy +from robots import json_to_txt, json_to_table, json_to_htaccess, json_to_nginx, json_to_haproxy, json_to_caddy class RobotsUnittestExtensions: def loadJson(self, pathname): @@ -76,6 +76,17 @@ class TestRobotsNameCleaning(unittest.TestCase): self.assertEqual(clean_robot_name("Perplexity‑User"), "Perplexity-User") +class TestCaddyfileGeneration(unittest.TestCase, RobotsUnittestExtensions): + maxDiff = 8192 + + def setUp(self): + self.robots_dict = self.loadJson("test_files/robots.json") + + def test_caddyfile_generation(self): + robots_caddyfile = json_to_caddy(self.robots_dict) + self.assertEqualsFile("test_files/Caddyfile", robots_caddyfile) + + if __name__ == "__main__": import os os.chdir(os.path.dirname(__file__)) From ec995cd686a09b4af1c6a59d95e1ced122f1d5fc Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Thu, 1 May 2025 11:27:40 +0100 Subject: [PATCH 145/201] Fix Python syntax error --- code/robots.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code/robots.py b/code/robots.py index 054c2be..c795649 100755 --- a/code/robots.py +++ b/code/robots.py @@ -223,7 +223,8 @@ def conversions(): ) update_file_if_changed( file_name="./Caddyfile", - converter=json_to_caddy + converter=json_to_caddy, + ) update_file_if_changed( file_name="./haproxy-block-ai-bots.txt", From 678380727e8685af8c5311bcfa1f55c7aa866d3b Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 1 May 2025 10:29:06 +0000 Subject: [PATCH 146/201] Merge pull request #115 from glyn/syntax Fix Python syntax error --- Caddyfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Caddyfile b/Caddyfile index 
1857d75..0e10cfa 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|Meta\-ExternalAgent|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file From 36a52a88d8e3832091d73062ef268acb46f6e031 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 12 May 2025 20:20:18 -0700 Subject: [PATCH 147/201] Bing AI opt-out instructions --- README.md | 2 ++ docs/additional-steps/bing.md | 36 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 
insertions(+) create mode 100644 docs/additional-steps/bing.md diff --git a/README.md b/README.md index 8d7bfb1..28ef743 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,8 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ ``` (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.) +[Bing uses the data it crawls for AI and training, you may opt out by adding a `meta` tag to the `head` of your site.]((./docs/additional-steps/bing.md)) + ## Contributing A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess` and `nginx-block-ai-bots.conf`. diff --git a/docs/additional-steps/bing.md b/docs/additional-steps/bing.md new file mode 100644 index 0000000..37c60c7 --- /dev/null +++ b/docs/additional-steps/bing.md @@ -0,0 +1,36 @@ +# Bing (bingbot) + +It's not well publicised, but Bing uses the data it crawls for AI and training. + +However, the current thinking is, blocking a search engine of this size using `robots.txt` seems a quite drastic approach as it is second only to Google and could significantly impact your website in search results. + +Additionally, Bing powers a number of search engines such as Yahoo and AOL, and its search results are also used in Duck Duck Go, amongst others. + +Fortunately, Bing supports a relatively simple opt-out method, requiring an additional step. + +## How to opt-out of AI training + +You must add a metatag in the `<head>` of your webpage. This also needs to be added to every page on your website. + +The line you need to add is: + +```plaintext +<meta name="robots" content="noarchive"> +``` + +By adding this line, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models." + +## Will my site be negatively affected + +Simple answer, no. +The original use of "noarchive" has been retired by all search engines. Google retired its use in 2024. 
+ +The use of this metatag will not impact your site in search engines or in any other meaningful way if you add it to your page(s). + +It is now solely used by a handful of crawlers, such as Bingbot and Amazonbot, to signify to them not to use your data for AI/training. + +## Resources + +Bing Blog AI opt-out announcement: https://blogs.bing.com/webmaster/september-2023/Announcing-new-options-for-webmasters-to-control-usage-of-their-content-in-Bing-Chat + +Bing metatag information, including AI opt-out: https://www.bing.com/webmasters/help/which-robots-metatags-does-bing-support-5198d240 From b4610a725cac409b5c686d0f68383ea3f6daa818 Mon Sep 17 00:00:00 2001 From: Florent Poinsaut Date: Wed, 14 May 2025 14:11:56 +0200 Subject: [PATCH 148/201] Add Traefik plugin --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 8d7bfb1..80de135 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,12 @@ Note that, as stated in the [httpd documentation](https://httpd.apache.org/docs/ ``` (Note that the path of the `haproxy-block-ai-bots.txt` may be different in your environment.) +### Related + +- [Robots.txt Traefik plugin](https://plugins.traefik.io/plugins/681b2f3fba3486128fc34fae/robots-txt-plugin): +middleware plugin for [Traefik](https://traefik.io/traefik/) to automatically add rules of [robots.txt](./robots.txt) +file on-the-fly. + ## Contributing A note about contributing: updates should be added/made to `robots.json`. A GitHub action will then generate the updated `robots.txt`, `table-of-bot-metrics.md`, `.htaccess` and `nginx-block-ai-bots.conf`. 
From 9539256cb3116b626439bf79a776ea67b7aa2edd Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 14 May 2025 16:46:32 -0700 Subject: [PATCH 149/201] chore(robots.json): adds QualifiedBot crawler --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index e15a196..99e28d3 100644 --- a/robots.json +++ b/robots.json @@ -335,6 +335,13 @@ "operator": "[Huawei](https://huawei.com/)", "respect": "Yes" }, + "QualifiedBot": { + "description": "Operated by Qualified as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", + "operator": "[Qualified](https://www.qualified.com)", + "respect": "Unclear at this time.", + }, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", "frequency": "No information.", @@ -398,4 +405,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 0c56b96fd99bfcc736c4f64c0df9bb87a1fc6075 Mon Sep 17 00:00:00 2001 From: Joe Hoyle Date: Thu, 15 May 2025 11:26:47 -0400 Subject: [PATCH 150/201] Fix JSON syntax error --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 99e28d3..f518037 100644 --- a/robots.json +++ b/robots.json @@ -340,7 +340,7 @@ "frequency": "No explicit frequency provided.", "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", "operator": "[Qualified](https://www.qualified.com)", - "respect": "Unclear at this time.", + "respect": "Unclear at this time." 
}, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", From 1c470babbefed7b470443f6dd834e721c58481d6 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 15 May 2025 16:12:30 +0000 Subject: [PATCH 151/201] Merge pull request #123 from joehoyle/patch-1 Fix JSON syntax error --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 586adab..de88e50 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 0e10cfa..43ad3bf 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 3c326bd..9770b45 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -46,6 +46,7 @@ PanguBot Perplexity-User PerplexityBot PetalBot +QualifiedBot Scrapy SemrushBot-OCOB SemrushBot-SWA diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index fc58d61..afbd77c 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { 
+if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 232e119..dea4dd5 100644 --- a/robots.txt +++ b/robots.txt @@ -46,6 +46,7 @@ User-agent: PanguBot User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot +User-agent: QualifiedBot User-agent: Scrapy User-agent: SemrushBot-OCOB User-agent: SemrushBot-SWA diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 4dd6076..57469fa 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -48,6 +48,7 @@ | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. 
| No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). 
| From 498aa50760fe3850820b46933bf87b82918e5803 Mon Sep 17 00:00:00 2001 From: Patrick Evans Date: Thu, 15 May 2025 11:10:06 -0500 Subject: [PATCH 152/201] lint robots.json during pull requests --- .github/workflows/run-tests.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index c98861f..042cc13 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -19,3 +19,10 @@ jobs: - name: Run tests run: | code/tests.py + lint-json: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + - name: JQ Json Lint + run: jq . robots.json From 16d1de70943e9d448d3d5e02e91d86e38dac80d7 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 16 May 2025 00:59:08 +0000 Subject: [PATCH 153/201] Update from Dark Visitors --- robots.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/robots.json b/robots.json index f518037..f2ed4c2 100644 --- a/robots.json +++ b/robots.json @@ -336,11 +336,11 @@ "respect": "Yes" }, "QualifiedBot": { - "description": "Operated by Qualified as part of their suite of AI product offerings.", - "frequency": "No explicit frequency provided.", - "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", - "operator": "[Qualified](https://www.qualified.com)", - "respect": "Unclear at this time." + "description": "Operated by Qualified as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI agents and other related products; usage can be assumed to support said products.", + "operator": "[Qualified](https://www.qualified.com)", + "respect": "Unclear at this time." 
}, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", @@ -405,4 +405,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 5fba0b746d550b6ae4d7c9605904b6ec102d0f98 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 15 May 2025 15:40:46 -0700 Subject: [PATCH 154/201] chore(robots.json): adds MistralAI-User/1.0 crawler --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index f518037..647e664 100644 --- a/robots.json +++ b/robots.json @@ -272,6 +272,13 @@ "frequency": "Unclear at this time.", "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" }, + "MistralAI-User/1.0": { + "operator": "Mistral AI", + "function": "Takes action based on user prompts.", + "frequency": "Only when prompted by a user.", + "description": "MistralAI-User is for user actions in LeChat. 
When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.", + "respect": "Yes" + }, "NovaAct": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", From ca918a963f735019a0c66343bf8338a9228d94f5 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 15 May 2025 21:16:49 -0700 Subject: [PATCH 155/201] chore(robots.json): adds Google-CloudVertexBot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index f2ed4c2..b459533 100644 --- a/robots.json +++ b/robots.json @@ -160,6 +160,13 @@ "operator": "Unknown", "respect": "[Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler)" }, + "Google-CloudVertexBot": { + "operator": "Google", + "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)", + "function": "Build and manage AI models for businesses employing Vertex AI", + "frequency": "No information.", + "description": "Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents." + }, "Google-Extended": { "operator": "Google", "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)", @@ -405,4 +412,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} \ No newline at end of file +} From dd1ed174b77ca2c0c4a40d6f4bce6beda4a1c296 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 16 May 2025 11:35:15 +0000 Subject: [PATCH 156/201] Merge pull request #129 from ai-robots-txt/google-cloudvertexbot chore(robots.json): adds Google-CloudVertexBot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index de88e50..b2204d7 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 43ad3bf..36fd20c 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 9770b45..7389f10 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -21,6 +21,7 @@ FacebookBot Factset_spyderbot FirecrawlAgent FriendlyCrawler +Google-CloudVertexBot Google-Extended GoogleOther GoogleOther-Image diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index afbd77c..f05f785 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index dea4dd5..a5be10b 100644 --- a/robots.txt +++ b/robots.txt @@ -21,6 +21,7 @@ User-agent: FacebookBot User-agent: Factset_spyderbot User-agent: FirecrawlAgent User-agent: FriendlyCrawler +User-agent: Google-CloudVertexBot User-agent: Google-Extended User-agent: GoogleOther User-agent: GoogleOther-Image diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 57469fa..d8542b3 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -23,6 +23,7 @@ | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. | | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. 
| +| Google\-CloudVertexBot | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Build and manage AI models for businesses employing Vertex AI | No information. | Google-CloudVertexBot crawls sites on the site owners' request when building Vertex AI Agents. | | Google\-Extended | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | LLM training. | No information. | Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search. | | GoogleOther | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | | GoogleOther\-Image | Google | [Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers) | Scrapes data. | No information. | "Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development." | From 7a2e6cba52e782ab32552c2335637af761afbe51 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sat, 17 May 2025 00:57:28 +0000 Subject: [PATCH 157/201] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index b459533..1ecfcd8 100644 --- a/robots.json +++ b/robots.json @@ -412,4 +412,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} +} \ No newline at end of file From 9297c7dfa3122109a6f3ae3ce18026e0e6c94ebe Mon Sep 17 00:00:00 2001 From: Mihitoko Date: Mon, 19 May 2025 23:56:57 +0200 Subject: [PATCH 158/201] Mention X-Robots-Tag header as alternative for bing --- docs/additional-steps/bing.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/additional-steps/bing.md b/docs/additional-steps/bing.md index 37c60c7..f9afb78 100644 --- a/docs/additional-steps/bing.md +++ b/docs/additional-steps/bing.md @@ -10,15 +10,19 @@ Fortunately, Bing supports a relatively simple opt-out method, requiring an addi ## How to opt-out of AI training -You must add a metatag in the `<head>` of your webpage. This also needs to be added to every page on your website. +You must add a metatag in the `<head>` of your webpage or set the [X-Robots-Tag](https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Headers/X-Robots-Tag) HTTP header in your response. This also needs to be added to every page or response on your website. -The line you need to add is: +If using the metatag, the line you need to add is: ```plaintext <meta name="robots" content="noarchive"> ``` +Or include the HTTP response header: +```plaintext +X-Robots-Tag: noarchive +``` -By adding this line, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models." +By adding this line or header, you are signifying to Bing: "Do not use the content for training Microsoft's generative AI foundation models." 
## Will my site be negatively affected From 8a8001cbece8a5607163bed6eb83d5bb35bb24e5 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Tue, 20 May 2025 13:55:25 -0700 Subject: [PATCH 159/201] chore(README): updates the opening line of our README to clarify the types of agents we block --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 232b3ed..307f005 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ -This is an open list of web crawlers associated with AI companies and the training of LLMs to block. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md). +This list contains AI-related crawlers of all types, regardless of purpose. Users should consult [the table of bot metrics](./table-of-bot-metrics.md) to guide the implementation of their list. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md). A number of these crawlers have been sourced from [Dark Visitors](https://darkvisitors.com) and we appreciate the ongoing effort they put in to track these crawlers. From 8b151b2cdc6ef8e949fd59d26d7456bcbb60d4e7 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 21 May 2025 06:52:36 -0700 Subject: [PATCH 160/201] Update README.md Co-authored-by: Glyn Normington --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 307f005..f427af4 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ -This list contains AI-related crawlers of all types, regardless of purpose. Users should consult [the table of bot metrics](./table-of-bot-metrics.md) to guide the implementation of their list. 
We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md). +This list contains AI-related crawlers of all types, regardless of purpose. We encourage you to contribute to and implement this list on your own site. See [information about the listed crawlers](./table-of-bot-metrics.md) and the [FAQ](https://github.com/ai-robots-txt/ai.robots.txt/blob/main/FAQ.md). A number of these crawlers have been sourced from [Dark Visitors](https://darkvisitors.com) and we appreciate the ongoing effort they put in to track these crawlers. From 1c2acd75b7def13d9dc85233bfb4aaca8bcafd12 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 21 May 2025 15:27:26 +0000 Subject: [PATCH 161/201] Merge pull request #126 from ai-robots-txt/mistral-bot chore(robots.json): adds MistralAI-User/1.0 crawler --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index b2204d7..cc483c7 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 36fd20c..205acbd 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 7389f10..de5b4fb 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -38,6 +38,7 @@ meta-externalagent Meta-ExternalAgent meta-externalfetcher Meta-ExternalFetcher +MistralAI-User/1.0 NovaAct OAI-SearchBot omgili diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index f05f785..3274559 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index a5be10b..b3e16f8 100644 --- a/robots.txt +++ b/robots.txt @@ -38,6 +38,7 @@ User-agent: meta-externalagent User-agent: Meta-ExternalAgent User-agent: meta-externalfetcher User-agent: Meta-ExternalFetcher +User-agent: MistralAI-User/1.0 User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili diff --git 
a/table-of-bot-metrics.md b/table-of-bot-metrics.md index d8542b3..84c69f5 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -40,6 +40,7 @@ | Meta\-ExternalAgent | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Meta-ExternalAgent is a web crawler used by Meta to download training data for its AI models and improve its products by indexing content directly. More info can be found at https://darkvisitors.com/agents/agents/meta-externalagent | | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | +| MistralAI\-User/1\.0 | Mistral AI | Yes | Takes action based on user prompts. | Only when prompted by a user. | MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response. | | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact | | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. 
| Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. | From b1d9a60a38c04cac81dd156ad73ddef7eb60b50b Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 21 May 2025 11:40:33 -0700 Subject: [PATCH 162/201] chore(robots.json): adds wpbot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index bddefdd..ed7d63d 100644 --- a/robots.json +++ b/robots.json @@ -412,6 +412,13 @@ "frequency": "Unclear at this time.", "description": "Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended" }, + "wpbot": { + "operator": "[QuantumCloud](https://www.quantumcloud.com)", + "respect": "Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9)", + "function": "Live chat support and lead generation.", + "frequency": "Unclear at this time.", + "description": "wpbot is used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support." + }, "YouBot": { "operator": "[You](https://about.you.com/youchat/)", "respect": "[Yes](https://about.you.com/youbot/)", @@ -419,4 +426,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} \ No newline at end of file +} From 7c5389f4a0c5f60745e1a7552e142bd33a587d8f Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 21 May 2025 19:00:23 +0000 Subject: [PATCH 163/201] Merge pull request #98 from kylebuckingham/main Updating Claude Bots --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 3 ++- nginx-block-ai-bots.conf | 2 +- table-of-bot-metrics.md | 3 ++- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.htaccess b/.htaccess index cc483c7..2146722 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 205acbd..879426d 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index de5b4fb..8a9ccf9 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -9,8 +9,9 @@ Brightbot 1.0 Bytespider CCBot ChatGPT-User -Claude-Web ClaudeBot +Claude-User +Claude-SearchBot cohere-ai cohere-training-data-crawler Cotoyogi diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 3274559..5f96718 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { return 403; } \ No newline at end of file diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 84c69f5..e6f35a4 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -11,8 +11,9 @@ | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | | ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| Claude\-Web | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. 
| | ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | +| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | +| Claude\-SearchBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | No information provided. | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. 
| cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | | Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. | From fedb658cc08225d71a6b0f32c9c2859b7420f0ee Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 21 May 2025 21:06:05 +0000 Subject: [PATCH 164/201] Merge pull request #133 from ai-robots-txt/wpbot chore(robots.json): adds wpbot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 2146722..3337284 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} 
(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 879426d..2001edc 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)" + header_regexp User-Agent 
"(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 8a9ccf9..377710b 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -58,4 +58,5 @@ TikTokSpider Timpibot VelenPublicWebCrawler Webzio-Extended +wpbot YouBot \ No newline at end of file diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 5f96718..ba1f8c6 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 8690e50..92e527b 100644 --- a/robots.txt +++ b/robots.txt @@ -58,5 +58,6 @@ User-agent: TikTokSpider User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended +User-agent: wpbot User-agent: YouBot Disallow: / diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index e6f35a4..c795559 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -60,4 +60,5 @@ | Timpibot | [Timpi](https://timpi.io) | Unclear at this time. | Scrapes data for use in training LLMs. | No information. | Makes data available for training AI models. 
| | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | +| wpbot | [QuantumCloud](https://www.quantumcloud.com) | Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9) | Live chat support and lead generation. | Unclear at this time. | wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support. | | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. | From 7bf7f9164d55a58e0d6080dff52eea3ed5d3584e Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Thu, 22 May 2025 00:58:45 +0000 Subject: [PATCH 165/201] Update from Dark Visitors --- robots.json | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/robots.json b/robots.json index 9dce781..06187ae 100644 --- a/robots.json +++ b/robots.json @@ -76,12 +76,12 @@ "frequency": "Only when prompted by a user.", "description": "Used by plugins in ChatGPT to answer queries based on user input." 
}, - "ClaudeBot": { + "Claude-SearchBot": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", - "function": "Scrapes data to train Anthropic's AI products.", + "function": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses.", "frequency": "No information provided.", - "description": "Scrapes data to train LLMs and AI products offered by Anthropic." + "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses." }, "Claude-User": { "operator": "[Anthropic](https://www.anthropic.com)", @@ -90,12 +90,19 @@ "frequency": "No information provided.", "description": "Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent." }, - "Claude-SearchBot": { + "Claude-Web": { + "operator": "Anthropic", + "respect": "Unclear at this time.", + "function": "Undocumented AI Agents", + "frequency": "Unclear at this time.", + "description": "Claude-Web is an AI-related agent operated by Anthropic. It's currently unclear exactly what it's used for, since there's no official documentation. If you can provide more detail, please contact us. More info can be found at https://darkvisitors.com/agents/agents/claude-web" + }, + "ClaudeBot": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "[Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler)", - "function": "Claude-SearchBot navigates the web to improve search result quality for users. 
It analyzes online content specifically to enhance the relevance and accuracy of search responses.", + "function": "Scrapes data to train Anthropic's AI products.", "frequency": "No information provided.", - "description": "Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses." + "description": "Scrapes data to train LLMs and AI products offered by Anthropic." }, "cohere-ai": { "operator": "[Cohere](https://cohere.com)", @@ -287,11 +294,11 @@ "description": "Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher" }, "MistralAI-User/1.0": { - "operator": "Mistral AI", - "function": "Takes action based on user prompts.", - "frequency": "Only when prompted by a user.", - "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.", - "respect": "Yes" + "operator": "Mistral AI", + "function": "Takes action based on user prompts.", + "frequency": "Only when prompted by a user.", + "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.", + "respect": "Yes" }, "NovaAct": { "operator": "Unclear at this time.", @@ -433,4 +440,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} +} \ No newline at end of file From 093ab81d789528bb5d89c2d2c708b8f157e3b795 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 23 May 2025 00:58:57 +0000 Subject: [PATCH 166/201] Update from Dark Visitors --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 5 +++-- nginx-block-ai-bots.conf | 2 +- robots.txt | 5 +++-- table-of-bot-metrics.md | 5 +++-- 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.htaccess b/.htaccess index 3337284..26c6e72 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 2001edc..7a1076c 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 377710b..8ef373b 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -9,9 +9,10 @@ Brightbot 1.0 Bytespider CCBot ChatGPT-User -ClaudeBot -Claude-User Claude-SearchBot +Claude-User +Claude-Web +ClaudeBot cohere-ai cohere-training-data-crawler Cotoyogi diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index ba1f8c6..a691c55 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|ClaudeBot|Claude\-User|Claude\-SearchBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 92e527b..3330b20 100644 --- a/robots.txt +++ b/robots.txt @@ -9,9 +9,10 @@ User-agent: Brightbot 1.0 User-agent: Bytespider User-agent: CCBot User-agent: ChatGPT-User -User-agent: ClaudeBot -User-agent: Claude-User User-agent: Claude-SearchBot +User-agent: Claude-User +User-agent: Claude-Web +User-agent: ClaudeBot User-agent: cohere-ai User-agent: cohere-training-data-crawler User-agent: Cotoyogi diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index c795559..0e6a88e 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -11,9 +11,10 @@ | Bytespider | ByteDance | No | LLM training. | Unclear at this time. 
| Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | | ChatGPT\-User | [OpenAI](https://openai.com) | Yes | Takes action based on user prompts. | Only when prompted by a user. | Used by plugins in ChatGPT to answer queries based on user input. | -| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | -| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | | Claude\-SearchBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-SearchBot navigates the web to improve search result quality for users. It analyzes online content specifically to enhance the relevance and accuracy of search responses. | No information provided. | Claude-SearchBot navigates the web to improve search result quality for users. 
It analyzes online content specifically to enhance the relevance and accuracy of search responses. | +| Claude\-User | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | No information provided. | Claude-User supports Claude AI users. When individuals ask questions to Claude, it may access websites using a Claude-User agent. | +| Claude\-Web | Anthropic | Unclear at this time. | Undocumented AI Agents | Unclear at this time. | Claude-Web is an AI-related agent operated by Anthropic. It's currently unclear exactly what it's used for, since there's no official documentation. If you can provide more detail, please contact us. More info can be found at https://darkvisitors.com/agents/agents/claude-web | +| ClaudeBot | [Anthropic](https://www.anthropic.com) | [Yes](https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler) | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | cohere\-ai | [Cohere](https://cohere.com) | Unclear at this time. | Retrieves data to provide responses to user-initiated prompts. | Takes action based on user prompts. | Retrieves data based on user prompts. | | cohere\-training\-data\-crawler | Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products | Unclear at this time. | AI Data Scrapers | Unclear at this time. | cohere-training-data-crawler is a web crawler operated by Cohere to download training data for its LLMs (Large Language Models) that power its enterprise AI products. 
More info can be found at https://darkvisitors.com/agents/agents/cohere-training-data-crawler | | Cotoyogi | [ROIS](https://ds.rois.ac.jp/en_center8/en_crawler/) | Yes | AI LLM Scraper. | No information provided. | Scrapes data for AI training in Japanese language. | From 3e8edd083e32e01bd5cf0629f108815092d5f7ec Mon Sep 17 00:00:00 2001 From: imp <80153024+not-not-the-imp@users.noreply.github.com> Date: Fri, 23 May 2025 13:03:49 +0100 Subject: [PATCH 167/201] Add AndiBot and PhindBot Fixes #75 --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 06187ae..8a8432b 100644 --- a/robots.json +++ b/robots.json @@ -20,6 +20,13 @@ "frequency": "No information provided.", "description": "Scrapes data for AI systems." }, + "Andibot": { + "operator": "[Andi](https://andisearch.com/)", + "respect": "Unclear at this time", + "function": "search engine using generative AI, AI Search Assistant", + "frequency": "No information provided.", + "description": "Scrapes website and provide genAI summary ." + }, "Amazonbot": { "operator": "Amazon", "respect": "Yes", @@ -363,6 +370,13 @@ "operator": "[Huawei](https://huawei.com/)", "respect": "Yes" }, + "PhindBot": { + "description": "Company offers AI agent that use genAI and generate extra web query on the fly", + "frequency": "No explicit frequency provided.", + "function": "AI-enhanced search engine.", + "operator": "[phind](https://www.phind.com/)", + "respect": "Unclear at this time." + }, "QualifiedBot": { "description": "Operated by Qualified as part of their suite of AI product offerings.", "frequency": "No explicit frequency provided.", @@ -440,4 +454,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} \ No newline at end of file +} From d22b9ec51ac14b0d9dfdd86a04cd566731c0e8c4 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sat, 31 May 2025 16:00:13 -0700 Subject: [PATCH 168/201] Update robots.json --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 8a8432b..9bf9fdd 100644 --- a/robots.json +++ b/robots.json @@ -23,7 +23,7 @@ "Andibot": { "operator": "[Andi](https://andisearch.com/)", "respect": "Unclear at this time", - "function": "search engine using generative AI, AI Search Assistant", + "function": "Search engine using generative AI, AI Search Assistant", "frequency": "No information provided.", "description": "Scrapes website and provide genAI summary ." }, From 4259b25cccbb4d0bff740261f1eb825fa86bf381 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sat, 31 May 2025 16:01:09 -0700 Subject: [PATCH 169/201] Update robots.json --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 9bf9fdd..50c2fae 100644 --- a/robots.json +++ b/robots.json @@ -25,7 +25,7 @@ "respect": "Unclear at this time", "function": "Search engine using generative AI, AI Search Assistant", "frequency": "No information provided.", - "description": "Scrapes website and provide genAI summary ." + "description": "Scrapes website and provides AI summary." 
}, "Amazonbot": { "operator": "Amazon", From 268922f8f2fa05b4eb4cc991116fce2700c09184 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Sat, 31 May 2025 16:02:05 -0700 Subject: [PATCH 170/201] Update robots.json --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 50c2fae..6f3a05d 100644 --- a/robots.json +++ b/robots.json @@ -371,7 +371,7 @@ "respect": "Yes" }, "PhindBot": { - "description": "Company offers AI agent that use genAI and generate extra web query on the fly", + "description": "Company offers an AI agent that uses AI and generate extra web query on the fly", "frequency": "No explicit frequency provided.", "function": "AI-enhanced search engine.", "operator": "[phind](https://www.phind.com/)", From 1dd66b696963dd3f9a577aa4ca59d83c8bc41aef Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 2 Jun 2025 11:53:06 -0700 Subject: [PATCH 171/201] Revert "chore(robots.json): adds imgproxy crawler" This reverts commit b65f45e408461560a32f44f05860f80655737467. --- robots.json | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/robots.json b/robots.json index 6f3a05d..f730f50 100644 --- a/robots.json +++ b/robots.json @@ -251,13 +251,6 @@ "operator": "[img2dataset](https://github.com/rom1504/img2dataset)", "respect": "Unclear at this time." }, - "imgproxy": { - "frequency": "No information.", - "function": "Not documented or explained on operator's site.", - "operator": "[imgproxy](https://imgproxy.net)", - "respect": "Unclear at this time.", - "description": "AI-powered image processing." - }, "ISSCyberRiskCrawler": { "description": "Used to train machine learning based models to quantify cyber risk.", "frequency": "No information.", @@ -454,4 +447,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} +} \ No newline at end of file From 899ce01c554359ecc66aa36bc2af367069175e10 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 2 Jun 2025 14:47:31 -0700 Subject: [PATCH 172/201] chore(ai_robots_update.yml): correct workflow by revising git flags + adding guard --- .github/workflows/ai_robots_update.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ai_robots_update.yml b/.github/workflows/ai_robots_update.yml index 7e11ce8..17c1cc8 100644 --- a/.github/workflows/ai_robots_update.yml +++ b/.github/workflows/ai_robots_update.yml @@ -20,7 +20,12 @@ jobs: echo "... done." git --no-pager diff git add -A - git diff --quiet && git diff --staged --quiet || (git commit -m "Update from Dark Visitors" && git push) + if ! git diff --cached --quiet; then + git commit -m "Update from Dark Visitors" + git push + else + echo "No changes to commit." + fi shell: bash convert: name: convert From 87016d15040f50630121666c40a6048df8a4169d Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 3 Jun 2025 01:00:29 +0000 Subject: [PATCH 173/201] Update from Dark Visitors --- robots.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/robots.json b/robots.json index f730f50..d8e8262 100644 --- a/robots.json +++ b/robots.json @@ -20,13 +20,6 @@ "frequency": "No information provided.", "description": "Scrapes data for AI systems." }, - "Andibot": { - "operator": "[Andi](https://andisearch.com/)", - "respect": "Unclear at this time", - "function": "Search engine using generative AI, AI Search Assistant", - "frequency": "No information provided.", - "description": "Scrapes website and provides AI summary." - }, "Amazonbot": { "operator": "Amazon", "respect": "Yes", @@ -34,6 +27,13 @@ "frequency": "No information provided.", "description": "Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses." 
}, + "Andibot": { + "operator": "[Andi](https://andisearch.com/)", + "respect": "Unclear at this time", + "function": "Search engine using generative AI, AI Search Assistant", + "frequency": "No information provided.", + "description": "Scrapes website and provides AI summary." + }, "anthropic-ai": { "operator": "[Anthropic](https://www.anthropic.com)", "respect": "Unclear at this time.", From d239e7e5ad0e9c72d17961c57deaf10c1feef899 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 3 Jun 2025 01:52:35 +0000 Subject: [PATCH 174/201] Merge pull request #139 from ai-robots-txt/workflow-fix chore(ai_robots_update.yml): correct workflow by revising git flags + adding guard --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 3 ++- nginx-block-ai-bots.conf | 2 +- robots.txt | 3 ++- table-of-bot-metrics.md | 3 ++- 6 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.htaccess b/.htaccess index 26c6e72..ddb7255 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} 
(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 7a1076c..60ed4d3 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" + header_regexp User-Agent 
"(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 8ef373b..6c23da0 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -2,6 +2,7 @@ AI2Bot Ai2Bot-Dolma aiHitBot Amazonbot +Andibot anthropic-ai Applebot Applebot-Extended @@ -33,7 +34,6 @@ iaskspider/2.0 ICC-Crawler ImagesiftBot img2dataset -imgproxy ISSCyberRiskCrawler Kangaroo Bot meta-externalagent @@ -50,6 +50,7 @@ PanguBot Perplexity-User PerplexityBot PetalBot +PhindBot QualifiedBot Scrapy SemrushBot-OCOB diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index a691c55..1c50815 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|imgproxy|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 3330b20..4b362e4 100644 --- a/robots.txt +++ b/robots.txt @@ -2,6 +2,7 @@ User-agent: AI2Bot User-agent: Ai2Bot-Dolma User-agent: aiHitBot User-agent: Amazonbot +User-agent: Andibot User-agent: anthropic-ai 
User-agent: Applebot User-agent: Applebot-Extended @@ -33,7 +34,6 @@ User-agent: iaskspider/2.0 User-agent: ICC-Crawler User-agent: ImagesiftBot User-agent: img2dataset -User-agent: imgproxy User-agent: ISSCyberRiskCrawler User-agent: Kangaroo Bot User-agent: meta-externalagent @@ -50,6 +50,7 @@ User-agent: PanguBot User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot +User-agent: PhindBot User-agent: QualifiedBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 0e6a88e..737b217 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -4,6 +4,7 @@ | Ai2Bot\-Dolma | [Ai2](https://allenai.org/crawler) | Yes | Content is used to train open language models. | No information provided. | Explores 'certain domains' to find web content. | | aiHitBot | [aiHit](https://www.aihitdata.com/about) | Yes | A massive, artificial intelligence/machine learning, automated system. | No information provided. | Scrapes data for AI systems. | | Amazonbot | Amazon | Yes | Service improvement and enabling answers for Alexa users. | No information provided. | Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses. | +| Andibot | [Andi](https://andisearch.com/) | Unclear at this time | Search engine using generative AI, AI Search Assistant | No information provided. | Scrapes website and provides AI summary. | | anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. 
More info can be found at https://darkvisitors.com/agents/agents/applebot | | Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | @@ -35,7 +36,6 @@ | ICC\-Crawler | [NICT](https://nict.go.jp) | Yes | Scrapes data to train and support AI technologies. | No information. | Use the collected data for artificial intelligence technologies; provide data to third parties, including commercial companies; those companies can use the data for their own business. | | ImagesiftBot | [ImageSift](https://imagesift.com) | [Yes](https://imagesift.com/about) | ImageSiftBot is a web crawler that scrapes the internet for publicly available images to support our suite of web intelligence products | No information. | Once images and text are downloaded from a webpage, ImageSift analyzes this data from the page and stores the information in an index. Our web intelligence products use this index to enable search and retrieval of similar images. | | img2dataset | [img2dataset](https://github.com/rom1504/img2dataset) | Unclear at this time. | Scrapes images for use in LLMs. | At the discretion of img2dataset users. | Downloads large sets of images into datasets for LLM training or other purposes. | -| imgproxy | [imgproxy](https://imgproxy.net) | Unclear at this time. | Not documented or explained on operator's site. | No information. | AI-powered image processing. | | ISSCyberRiskCrawler | [ISS-Corporate](https://iss-cyber.com) | No | Scrapes data to train machine learning models. | No information. | Used to train machine learning based models to quantify cyber risk. | | Kangaroo Bot | Unclear at this time. | Unclear at this time. 
| AI Data Scrapers | Unclear at this time. | Kangaroo Bot is used by the company Kangaroo LLM to download data to train AI models tailored to Australian language and culture. More info can be found at https://darkvisitors.com/agents/agents/kangaroo-bot | | meta\-externalagent | [Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers) | Yes | Used to train models and improve products. | No information. | "The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly." | @@ -52,6 +52,7 @@ | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | +| PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly | | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. 
| "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 3187fd8a3219620a4b0f3b7b7950f40e5aac4ad1 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Tue, 3 Jun 2025 12:24:16 -0700 Subject: [PATCH 175/201] chore(robots.json): adds YandexAdditional crawlers --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index d8e8262..45340fd 100644 --- a/robots.json +++ b/robots.json @@ -440,6 +440,20 @@ "frequency": "Unclear at this time.", "description": "wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support." }, + "YandexAdditional": { + "operator": "[Yandex](https://yandex.ru)", + "respect": "[Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en)", + "function": "Scrapes/analyzes data for the YandexGPT LLM.", + "frequency": "No information.", + "description": "Retrieves data used for YandexGPT quick answers features." + }, + "YandexAdditionalBot": { + "operator": "[Yandex](https://yandex.ru)", + "respect": "[Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en)", + "function": "Scrapes/analyzes data for the YandexGPT LLM.", + "frequency": "No information.", + "description": "Retrieves data used for YandexGPT quick answers features." 
+ }, "YouBot": { "operator": "[You](https://about.you.com/youchat/)", "respect": "[Yes](https://about.you.com/youbot/)", @@ -447,4 +461,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 080946c360465c5e0b0af8dd475bb0248bca77a1 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 3 Jun 2025 19:51:25 +0000 Subject: [PATCH 176/201] Merge pull request #140 from ai-robots-txt/yandex-bots chore(robots.json): adds YandexAdditional crawlers --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 2 ++ nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 6 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index ddb7255..c381fce 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 60ed4d3..3527a7a 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)" + header_regexp User-Agent 
"(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 6c23da0..a8ba9aa 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -61,4 +61,6 @@ Timpibot VelenPublicWebCrawler Webzio-Extended wpbot +YandexAdditional +YandexAdditionalBot YouBot \ No newline at end of file diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 1c50815..5f7a0db 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 4b362e4..d26ccb4 100644 --- a/robots.txt +++ b/robots.txt @@ -61,5 +61,7 @@ User-agent: Timpibot User-agent: VelenPublicWebCrawler User-agent: Webzio-Extended User-agent: wpbot +User-agent: YandexAdditional +User-agent: YandexAdditionalBot User-agent: YouBot Disallow: / diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 737b217..3275a20 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -63,4 +63,6 @@ | VelenPublicWebCrawler | [Velen Crawler](https://velen.io) | [Yes](https://velen.io) | Scrapes data for business data sets and machine learning 
models. | No information. | "Our goal with this crawler is to build business datasets and machine learning models to better understand the web." | | Webzio\-Extended | Unclear at this time. | Unclear at this time. | AI Data Scrapers | Unclear at this time. | Webzio-Extended is a web crawler used by Webz.io to maintain a repository of web crawl data that it sells to other companies, including those using it to train AI models. More info can be found at https://darkvisitors.com/agents/agents/webzio-extended | | wpbot | [QuantumCloud](https://www.quantumcloud.com) | Unclear at this time; opt out provided via [Google Form](https://forms.gle/ajBaxygz9jSR8p8G9) | Live chat support and lead generation. | Unclear at this time. | wpbot is a used to support the functionality of the AI Chatbot for WordPress plugin. It supports the use of customer models, data collection and customer support. | +| YandexAdditional | [Yandex](https://yandex.ru) | [Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en) | Scrapes/analyzes data for the YandexGPT LLM. | No information. | Retrieves data used for YandexGPT quick answers features. | +| YandexAdditionalBot | [Yandex](https://yandex.ru) | [Yes](https://yandex.ru/support/webmaster/en/search-appearance/fast.html?lang=en) | Scrapes/analyzes data for the YandexGPT LLM. | No information. | Retrieves data used for YandexGPT quick answers features. | | YouBot | [You](https://about.you.com/youchat/) | [Yes](https://about.you.com/youbot/) | Scrapes data for search engine and LLMs. | No information. | Retrieves data used for You.com web search engine and LLMs. 
| From 8f75f4a2f5f8f8381a73e6a6faab241e0ad58a14 Mon Sep 17 00:00:00 2001 From: Ivan Chupin Date: Wed, 4 Jun 2025 03:48:42 +0500 Subject: [PATCH 177/201] Add SBIntuitionsBot --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 45340fd..cd7ec4a 100644 --- a/robots.json +++ b/robots.json @@ -377,6 +377,13 @@ "operator": "[Qualified](https://www.qualified.com)", "respect": "Unclear at this time." }, + "SBIntuitionsBot": { + "description": "AI development and information analysis", + "respect": "[Yes](https://www.sbintuitions.co.jp/en/bot/)", + "frequency": "No information.", + "function": "Uses data gathered in AI development and information analysis.", + "operator": "[SB Intuitions](https://www.sbintuitions.co.jp/en/)" + }, "Scrapy": { "description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\"", "frequency": "No information.", From 3efabc603dcae235cc04d1e2f2e9113c70e6197c Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Tue, 3 Jun 2025 23:28:48 +0000 Subject: [PATCH 178/201] Merge pull request #141 from Ivan-Chupin/patch-1 Add SBIntuitionsBot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index c381fce..48971b1 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 3527a7a..117e653 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent 
"(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 
a8ba9aa..c2ebb47 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -52,6 +52,7 @@ PerplexityBot PetalBot PhindBot QualifiedBot +SBIntuitionsBot Scrapy SemrushBot-OCOB SemrushBot-SWA diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 5f7a0db..edcf8a7 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index d26ccb4..1c5e989 100644 --- a/robots.txt +++ b/robots.txt @@ -52,6 +52,7 @@ User-agent: PerplexityBot User-agent: PetalBot User-agent: PhindBot User-agent: QualifiedBot +User-agent: SBIntuitionsBot User-agent: Scrapy User-agent: SemrushBot-OCOB User-agent: SemrushBot-SWA diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 3275a20..c9a3910 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -54,6 +54,7 @@ | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly | | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | +| SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. 
| Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 2b5a59a303a81847234ac11ec2f180ee9795db90 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Wed, 4 Jun 2025 01:00:07 +0000 Subject: [PATCH 179/201] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index cd7ec4a..739f579 100644 --- a/robots.json +++ b/robots.json @@ -468,4 +468,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 03831a7eb55ae3682e45a651588df13acc4a4a04 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 4 Jun 2025 10:46:58 -0700 Subject: [PATCH 180/201] chore(robots.json): adds Quillbot --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 739f579..bea7350 100644 --- a/robots.json +++ b/robots.json @@ -377,6 +377,20 @@ "operator": "[Qualified](https://www.qualified.com)", "respect": "Unclear at this time." 
}, + "QuillBot": { + "description": "Operated by QuillBot as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI detection, writing tools and other services.", + "operator": "[Quillbot](https://quillbot.com)", + "respect": "Unclear at this time." + }, + "quillbot.com": { + "description": "Operated by QuillBot as part of their suite of AI product offerings.", + "frequency": "No explicit frequency provided.", + "function": "Company offers AI detection, writing tools and other services.", + "operator": "[Quillbot](https://quillbot.com)", + "respect": "Unclear at this time." + }, "SBIntuitionsBot": { "description": "AI development and information analysis", "respect": "[Yes](https://www.sbintuitions.co.jp/en/bot/)", @@ -468,4 +482,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 4568d69b0edad5708fe49fafd2b5c0572cd2dfa4 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 4 Jun 2025 10:54:14 -0700 Subject: [PATCH 181/201] chore(robots.json): adds Panscient --- robots.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 739f579..8d1163a 100644 --- a/robots.json +++ b/robots.json @@ -342,6 +342,20 @@ "frequency": "Unclear at this time.", "description": "PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. 
More info can be found at https://darkvisitors.com/agents/agents/pangubot" }, + "Panscient": { + "operator": "[Panscient](https://panscient.com)", + "respect": "[Yes](https://panscient.com/faq.htm)", + "function": "Data collection and analysis using machine learning and AI.", + "frequency": "The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address.", + "description": "Compiles data on businesses and business professionals that is structured using AI and machine learning." + }, + "panscient.com": { + "operator": "[Panscient](https://panscient.com)", + "respect": "[Yes](https://panscient.com/faq.htm)", + "function": "Data collection and analysis using machine learning and AI.", + "frequency": "The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address.", + "description": "Compiles data on businesses and business professionals that is structured using AI and machine learning." + }, "Perplexity-User": { "operator": "[Perplexity](https://www.perplexity.ai/)", "respect": "[No](https://docs.perplexity.ai/guides/bots)", @@ -468,4 +482,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} \ No newline at end of file +} From 9c28c63a0c4889d694f7ebdd278c17564f4b72a3 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 4 Jun 2025 17:54:57 +0000 Subject: [PATCH 182/201] Merge pull request #142 from ai-robots-txt/quillbot chore(robots.json): adds Quillbot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 2 ++ nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 6 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 48971b1..ee898fd 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 117e653..de3bf3b 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + 
header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index c2ebb47..be27797 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -52,6 +52,8 @@ PerplexityBot PetalBot PhindBot QualifiedBot +QuillBot +quillbot.com SBIntuitionsBot Scrapy SemrushBot-OCOB diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index edcf8a7..d7decae 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 1c5e989..4ed2deb 100644 --- a/robots.txt +++ b/robots.txt @@ -52,6 +52,8 @@ 
User-agent: PerplexityBot User-agent: PetalBot User-agent: PhindBot User-agent: QualifiedBot +User-agent: QuillBot +User-agent: quillbot.com User-agent: SBIntuitionsBot User-agent: Scrapy User-agent: SemrushBot-OCOB diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index c9a3910..e0fcd26 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -54,6 +54,8 @@ | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly | | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | +| QuillBot | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | +| quillbot\.com | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | | SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. 
| No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | From 75ea75a95b006e68f89f8826bf84caff569a1eb8 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Wed, 4 Jun 2025 18:04:06 +0000 Subject: [PATCH 183/201] Merge pull request #143 from ai-robots-txt/panscient chore(robots.json): adds Panscient --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 2 ++ nginx-block-ai-bots.conf | 2 +- robots.txt | 2 ++ table-of-bot-metrics.md | 2 ++ 6 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index ee898fd..5fefc69 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ 
bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index de3bf3b..5caa249 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index be27797..fe153c8 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -47,6 +47,8 @@ omgili omgilibot Operator PanguBot +Panscient +panscient.com Perplexity-User PerplexityBot PetalBot diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index d7decae..e5d660b 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* 
"(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ 
No newline at end of file diff --git a/robots.txt b/robots.txt index 4ed2deb..26a9d78 100644 --- a/robots.txt +++ b/robots.txt @@ -47,6 +47,8 @@ User-agent: omgili User-agent: omgilibot User-agent: Operator User-agent: PanguBot +User-agent: Panscient +User-agent: panscient.com User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index e0fcd26..69b9ec2 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -49,6 +49,8 @@ | omgilibot | [Webz.io](https://webz.io/) | [Yes](https://web.archive.org/web/20170704003301/http://omgili.com/Crawler.html) | Data is sold. | No information. | Legacy user agent initially used for Omgili search engine. Unknown if still used, `omgili` agent still used by Webz.io. | | Operator | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Operator is an AI agent created by OpenAI that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/operator | | PanguBot | the Chinese company Huawei | Unclear at this time. | AI Data Scrapers | Unclear at this time. | PanguBot is a web crawler operated by the Chinese company Huawei. It's used to download training data for its multimodal LLM (Large Language Model) called PanGu. More info can be found at https://darkvisitors.com/agents/agents/pangubot | +| Panscient | [Panscient](https://panscient.com) | [Yes](https://panscient.com/faq.htm) | Data collection and analysis using machine learning and AI. | The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address. | Compiles data on businesses and business professionals that is structured using AI and machine learning. 
| +| panscient\.com | [Panscient](https://panscient.com) | [Yes](https://panscient.com/faq.htm) | Data collection and analysis using machine learning and AI. | The Panscient web crawler will request a page at most once every second from the same domain name or the same IP address. | Compiles data on businesses and business professionals that is structured using AI and machine learning. | | Perplexity\-User | [Perplexity](https://www.perplexity.ai/) | [No](https://docs.perplexity.ai/guides/bots) | Used to answer queries at the request of users. | Only when prompted by a user. | Visit web pages to help provide an accurate answer and include links to the page in Perplexity response. | | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | From 77393df5aa2ac7e3f2bb3e54ff919470c63e5b09 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Thu, 5 Jun 2025 00:59:28 +0000 Subject: [PATCH 184/201] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 22a1370..6df5084 100644 --- a/robots.json +++ b/robots.json @@ -496,4 +496,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} +} \ No newline at end of file From 528d77bf072780a40c169cab399eb0b2139edb83 Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 5 Jun 2025 09:14:23 -0700 Subject: [PATCH 185/201] chore(robots.json): adds bedrockbot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 739f579..a3f75bc 100644 --- a/robots.json +++ b/robots.json @@ -55,6 +55,13 @@ "frequency": "Unclear at this time.", "description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools." }, + "bedrockbot": { + "operator": "[Amazon](https://amazon.com)", + "respect": "[Yes](https://docs.aws.amazon.com/bedrock/latest/userguide/webcrawl-data-source-connector.html#configuration-webcrawl-connector)", + "function": "Data scraping for custom AI applications.", + "frequency": "Unclear at this time.", + "description": "Connects to and crawls URLs that have been selected for use in a user's AWS bedrock application." + }, "Brightbot 1.0": { "operator": "Browsing.ai", "respect": "Unclear at this time.", @@ -468,4 +475,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} \ No newline at end of file +} From ac7ed17e71a59a67d54279d010477abecfb15caf Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 5 Jun 2025 16:51:17 +0000 Subject: [PATCH 186/201] Merge pull request #145 from ai-robots-txt/aws-bedrockbot chore(robots.json): adds bedrockbot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 5fefc69..dbbde1e 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 5caa249..08b2fd3 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ 
bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index fe153c8..cdaeecd 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -6,6 +6,7 @@ Andibot anthropic-ai Applebot Applebot-Extended +bedrockbot Brightbot 1.0 Bytespider CCBot diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index e5d660b..542ac65 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt 
index 26a9d78..a8dc655 100644 --- a/robots.txt +++ b/robots.txt @@ -6,6 +6,7 @@ User-agent: Andibot User-agent: anthropic-ai User-agent: Applebot User-agent: Applebot-Extended +User-agent: bedrockbot User-agent: Brightbot 1.0 User-agent: Bytespider User-agent: CCBot diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 69b9ec2..ee324f7 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -8,6 +8,7 @@ | anthropic\-ai | [Anthropic](https://www.anthropic.com) | Unclear at this time. | Scrapes data to train Anthropic's AI products. | No information provided. | Scrapes data to train LLMs and AI products offered by Anthropic. | | Applebot | Unclear at this time. | Unclear at this time. | AI Search Crawlers | Unclear at this time. | Applebot is a web crawler used by Apple to index search results that allow the Siri AI Assistant to answer user questions. Siri's answers normally contain references to the website. More info can be found at https://darkvisitors.com/agents/agents/applebot | | Applebot\-Extended | [Apple](https://support.apple.com/en-us/119829#datausage) | Yes | Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others. | Unclear at this time. | Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools. | +| bedrockbot | [Amazon](https://amazon.com) | [Yes](https://docs.aws.amazon.com/bedrock/latest/userguide/webcrawl-data-source-connector.html#configuration-webcrawl-connector) | Data scraping for custom AI applications. | Unclear at this time. | Connects to and crawls URLs that have been selected for use in a user's AWS bedrock application. | | Brightbot 1\.0 | Browsing.ai | Unclear at this time. | LLM/AI training. | Unclear at this time. | Scrapes data to train LLMs and AI products focused on website customer support. 
| | Bytespider | ByteDance | No | LLM training. | Unclear at this time. | Downloads data to train LLMS, including ChatGPT competitors. | | CCBot | [Common Crawl Foundation](https://commoncrawl.org) | [Yes](https://commoncrawl.org/ccbot) | Provides open crawl dataset, used for many purposes, including Machine Learning/AI. | Monthly at present. | Web archive going back to 2008. [Cited in thousands of research papers per year](https://commoncrawl.org/research-papers). | From e21f6ae1b6eeb0782012a4a31e6af61b6a21cb09 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Fri, 6 Jun 2025 00:59:25 +0000 Subject: [PATCH 187/201] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 8f5dadd..3b5c434 100644 --- a/robots.json +++ b/robots.json @@ -503,4 +503,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 7867c3e26c267301cc528edf89fccf5203bfb99b Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Mon, 9 Jun 2025 08:44:25 -0700 Subject: [PATCH 188/201] chore(robots.json): adds EchoboxBot (#148) --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 3b5c434..911dadc 100644 --- a/robots.json +++ b/robots.json @@ -160,6 +160,13 @@ "frequency": "Unclear at this time.", "description": "DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot" }, + "EchoboxBot": { + "operator": "[Echobox](https://echobox.com)", + "respect": "Unclear at this time.", + "function": "Data collection to support AI-powered products.", + "frequency": "Unclear at this time.", + "description": "Supports company's AI-powered social and email management products." 
+ }, "FacebookBot": { "operator": "Meta/Facebook", "respect": "[Yes](https://developers.facebook.com/docs/sharing/bot/)", @@ -503,4 +510,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} \ No newline at end of file +} From 3759a6bf146f0153bbb7ab880ff95380a4fc739e Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Mon, 9 Jun 2025 15:44:36 +0000 Subject: [PATCH 189/201] chore(robots.json): adds EchoboxBot (#148) --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index dbbde1e..74b4e5b 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 08b2fd3..9ca050b 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ 
bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index cdaeecd..dc9f851 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -21,6 +21,7 @@ Cotoyogi Crawlspace Diffbot DuckAssistBot +EchoboxBot FacebookBot Factset_spyderbot FirecrawlAgent diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 542ac65..4fd7a29 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt 
b/robots.txt index a8dc655..8964626 100644 --- a/robots.txt +++ b/robots.txt @@ -21,6 +21,7 @@ User-agent: Cotoyogi User-agent: Crawlspace User-agent: Diffbot User-agent: DuckAssistBot +User-agent: EchoboxBot User-agent: FacebookBot User-agent: Factset_spyderbot User-agent: FirecrawlAgent diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index ee324f7..b596540 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -23,6 +23,7 @@ | Crawlspace | [Crawlspace](https://crawlspace.dev) | [Yes](https://news.ycombinator.com/item?id=42756654) | Scrapes data | Unclear at this time. | Provides crawling services for any purpose, probably including AI model training. | | Diffbot | [Diffbot](https://www.diffbot.com/) | At the discretion of Diffbot users. | Aggregates structured web data for monitoring and AI model training. | Unclear at this time. | Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training. | | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | +| EchoboxBot | [Echobox](https://echobox.com) | Unclear at this time. | Data collection to support AI-powered products. | Unclear at this time. | Supports company's AI-powered social and email management products. | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. 
| | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | From cf598b6b71e0e3c6b59a699c74218a62d43c5e16 Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 10 Jun 2025 01:00:37 +0000 Subject: [PATCH 190/201] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 911dadc..19dffe8 100644 --- a/robots.json +++ b/robots.json @@ -510,4 +510,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From 14d68f05ba42b04e8c33fcca5bacf7d8dee86cae Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Wed, 11 Jun 2025 13:50:53 -0700 Subject: [PATCH 191/201] chore(robots.json): adds additional SemrushBot user agents --- robots.json | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/robots.json b/robots.json index 19dffe8..0ad3731 100644 --- a/robots.json +++ b/robots.json @@ -433,6 +433,27 @@ "operator": "[Zyte](https://www.zyte.com)", "respect": "Unclear at this time." }, + "SemrushBot": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Crawls your site for ContentShake AI tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." 
+ }, + "SemrushBot-BA": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Crawls your site for ContentShake AI tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + }, + "SemrushBot-CT": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Crawls your site for ContentShake AI tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + }, "SemrushBot-OCOB": { "operator": "[Semrush](https://www.semrush.com/)", "respect": "[Yes](https://www.semrush.com/bot/)", @@ -440,6 +461,13 @@ "frequency": "Roughly once every 10 seconds.", "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." }, + "SemrushBot-SI": { + "operator": "[Semrush](https://www.semrush.com/)", + "respect": "[Yes](https://www.semrush.com/bot/)", + "function": "Crawls your site for ContentShake AI tool.", + "frequency": "Roughly once every 10 seconds.", + "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." 
+ }, "SemrushBot-SWA": { "operator": "[Semrush](https://www.semrush.com/)", "respect": "[Yes](https://www.semrush.com/bot/)", From 842e2256e896e6d2904fa501df59f2ca17ea6bc0 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Thu, 12 Jun 2025 07:12:00 +0000 Subject: [PATCH 192/201] Merge pull request #150 from ai-robots-txt/semrush-bots chore(robots.json): adds additional SemrushBot user agents --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 4 ++++ nginx-block-ai-bots.conf | 2 +- robots.txt | 4 ++++ table-of-bot-metrics.md | 4 ++++ 6 files changed, 15 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 74b4e5b..a6c8fea 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 9ca050b..7ed7e03 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index dc9f851..fe6c362 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -60,7 +60,11 @@ QuillBot quillbot.com SBIntuitionsBot Scrapy +SemrushBot +SemrushBot-BA +SemrushBot-CT SemrushBot-OCOB +SemrushBot-SI SemrushBot-SWA Sidetrade indexer bot TikTokSpider diff --git a/nginx-block-ai-bots.conf 
b/nginx-block-ai-bots.conf index 4fd7a29..13aac72 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 8964626..b130083 100644 --- a/robots.txt +++ b/robots.txt @@ -60,7 +60,11 @@ User-agent: QuillBot User-agent: quillbot.com User-agent: SBIntuitionsBot User-agent: Scrapy +User-agent: SemrushBot +User-agent: SemrushBot-BA +User-agent: SemrushBot-CT User-agent: SemrushBot-OCOB +User-agent: SemrushBot-SI User-agent: SemrushBot-SWA User-agent: Sidetrade indexer bot User-agent: TikTokSpider diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index b596540..a834382 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -62,7 +62,11 @@ | quillbot\.com | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | | SBIntuitionsBot | [SB Intuitions](https://www.sbintuitions.co.jp/en/) | [Yes](https://www.sbintuitions.co.jp/en/bot/) | Uses data gathered in AI development and information analysis. | No information. | AI development and information analysis | | Scrapy | [Zyte](https://www.zyte.com) | Unclear at this time. | Scrapes data for a variety of uses including training AI. | No information. | "AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets." 
| +| SemrushBot | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-BA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-CT | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-OCOB | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | +| SemrushBot\-SI | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Crawls your site for ContentShake AI tool. | Roughly once every 10 seconds. | You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | SemrushBot\-SWA | [Semrush](https://www.semrush.com/) | [Yes](https://www.semrush.com/bot/) | Checks URLs on your site for SWA tool. | Roughly once every 10 seconds. 
| You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL). | | Sidetrade indexer bot | [Sidetrade](https://www.sidetrade.com) | Unclear at this time. | Extracts data for a variety of uses including training AI. | No information. | AI product training. | | TikTokSpider | ByteDance | Unclear at this time. | LLM training. | Unclear at this time. | Downloads data to train LLMS, as per Bytespider. | From d760f9216f8d6295b43e00862daec913f82610ad Mon Sep 17 00:00:00 2001 From: Cory Dransfeldt Date: Thu, 12 Jun 2025 13:08:29 -0700 Subject: [PATCH 193/201] chore(robots.json): adds MyCentralAIScraperBot --- robots.json | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 19dffe8..94e4f79 100644 --- a/robots.json +++ b/robots.json @@ -314,6 +314,13 @@ "description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.", "respect": "Yes" }, + "MyCentralAIScraperBot": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI data scraper", + "frequency": "Unclear at this time.", + "description": "Operator and data use is uncleaar at this time." + }, "NovaAct": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", @@ -510,4 +517,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." 
} -} \ No newline at end of file +} From 8f17718e762831498b09438835b2e1cb74f7e19d Mon Sep 17 00:00:00 2001 From: Glyn Normington Date: Fri, 13 Jun 2025 10:28:12 +0100 Subject: [PATCH 194/201] Fix typo --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 94e4f79..befdb35 100644 --- a/robots.json +++ b/robots.json @@ -319,7 +319,7 @@ "respect": "Unclear at this time.", "function": "AI data scraper", "frequency": "Unclear at this time.", - "description": "Operator and data use is uncleaar at this time." + "description": "Operator and data use is unclear at this time." }, "NovaAct": { "operator": "Unclear at this time.", From e53d81c66d353016e27e75215b22dc8557e1a82c Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Fri, 13 Jun 2025 09:28:41 +0000 Subject: [PATCH 195/201] Merge pull request #152 from ai-robots-txt/MyCentralAIScraperBot chore(robots.json): adds MyCentralAIScraperBot --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index a6c8fea..a2d6a6e 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 7ed7e03..e99d69c 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ 
bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index fe6c362..7d1d3a0 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -43,6 +43,7 @@ Meta-ExternalAgent meta-externalfetcher Meta-ExternalFetcher MistralAI-User/1.0 +MyCentralAIScraperBot NovaAct OAI-SearchBot omgili diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index 13aac72..ef0259e 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index b130083..ab60705 100644 --- a/robots.txt +++ b/robots.txt @@ -43,6 +43,7 @@ User-agent: Meta-ExternalAgent User-agent: meta-externalfetcher User-agent: Meta-ExternalFetcher User-agent: MistralAI-User/1.0 +User-agent: MyCentralAIScraperBot User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index a834382..f8a2005 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -45,6 +45,7 @@ | meta\-externalfetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | Meta\-ExternalFetcher | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. 
| Meta-ExternalFetcher is dispatched by Meta AI products in response to user prompts, when they need to fetch an individual links. More info can be found at https://darkvisitors.com/agents/agents/meta-externalfetcher | | MistralAI\-User/1\.0 | Mistral AI | Yes | Takes action based on user prompts. | Only when prompted by a user. | MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response. | +| MyCentralAIScraperBot | Unclear at this time. | Unclear at this time. | AI data scraper | Unclear at this time. | Operator and data use is unclear at this time. | | NovaAct | Unclear at this time. | Unclear at this time. | AI Agents | Unclear at this time. | Nova Act is an AI agent created by Amazon that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/novaact | | OAI\-SearchBot | [OpenAI](https://openai.com) | [Yes](https://platform.openai.com/docs/bots) | Search result generation. | No information. | Crawls sites to surface as results in SearchGPT. | | omgili | [Webz.io](https://webz.io/) | [Yes](https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/) | Data is sold. | No information. | Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training. 
| From b05f2fee000caf6c784135335b14c2254572754e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9rgio=20Spagnuolo?= Date: Fri, 13 Jun 2025 17:15:13 -0300 Subject: [PATCH 196/201] Update robots.json with new crawler Update with Poseidon Research Crawler as found in nytimes.com/robots.txt --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 52e196a..9bd1f85 100644 --- a/robots.json +++ b/robots.json @@ -405,6 +405,13 @@ "operator": "[phind](https://www.phind.com/)", "respect": "Unclear at this time." }, + "Poseidon Research Crawler": { + "operator": "[Poseidon Research](https://www.poseidonresearch.com)", + "description": "Lab focused on scaling the interpretability research necessary to make better AI systems possible.", + "frequency": "No explicit frequency provided.", + "function": "AI research crawler", + "respect": "Unclear at this time." + }, "QualifiedBot": { "description": "Operated by Qualified as part of their suite of AI product offerings.", "frequency": "No explicit frequency provided.", From 2b68568ac26db7a37d859fec12b0bb35e61cfedc Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Sat, 14 Jun 2025 00:58:11 +0000 Subject: [PATCH 197/201] Update from Dark Visitors --- robots.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/robots.json b/robots.json index 52e196a..7c10cab 100644 --- a/robots.json +++ b/robots.json @@ -315,11 +315,11 @@ "respect": "Yes" }, "MyCentralAIScraperBot": { - "operator": "Unclear at this time.", - "respect": "Unclear at this time.", - "function": "AI data scraper", - "frequency": "Unclear at this time.", - "description": "Operator and data use is unclear at this time." + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI data scraper", + "frequency": "Unclear at this time.", + "description": "Operator and data use is unclear at this time." 
}, "NovaAct": { "operator": "Unclear at this time.", @@ -545,4 +545,4 @@ "frequency": "No information.", "description": "Retrieves data used for You.com web search engine and LLMs." } -} +} \ No newline at end of file From eb05f2f5276ff963f786cf8fd57d8ea909e82fd9 Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Sat, 14 Jun 2025 14:04:03 +0000 Subject: [PATCH 198/201] Merge pull request #153 from sergiospagnuolo/Poseidon Update robots.json with new crawler --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index a2d6a6e..27637c2 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} 
(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index e99d69c..528ba08 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index 7d1d3a0..c7c3054 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -56,6 +56,7 @@ Perplexity-User PerplexityBot PetalBot PhindBot +Poseidon Research Crawler QualifiedBot QuillBot 
quillbot.com diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index ef0259e..c1afb05 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ 
Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index ab60705..0edf11f 100644 --- a/robots.txt +++ b/robots.txt @@ -56,6 +56,7 @@ User-agent: Perplexity-User User-agent: PerplexityBot User-agent: PetalBot User-agent: PhindBot +User-agent: Poseidon Research Crawler User-agent: QualifiedBot User-agent: QuillBot User-agent: quillbot.com diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index f8a2005..78a3408 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -58,6 +58,7 @@ | PerplexityBot | [Perplexity](https://www.perplexity.ai/) | [Yes](https://docs.perplexity.ai/guides/bots) | Search result generation. | No information. | Crawls sites to surface as results in Perplexity. | | PetalBot | [Huawei](https://huawei.com/) | Yes | Used to provide recommendations in Hauwei assistant and AI search services. | No explicit frequency provided. | Operated by Huawei to provide search and AI assistant services. | | PhindBot | [phind](https://www.phind.com/) | Unclear at this time. | AI-enhanced search engine. | No explicit frequency provided. | Company offers an AI agent that uses AI and generate extra web query on the fly | +| Poseidon Research Crawler | [Poseidon Research](https://www.poseidonresearch.com) | Unclear at this time. | AI research crawler | No explicit frequency provided. 
| Lab focused on scaling the interpretability research necessary to make better AI systems possible. | | QualifiedBot | [Qualified](https://www.qualified.com) | Unclear at this time. | Company offers AI agents and other related products; usage can be assumed to support said products. | No explicit frequency provided. | Operated by Qualified as part of their suite of AI product offerings. | | QuillBot | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | | quillbot\.com | [Quillbot](https://quillbot.com) | Unclear at this time. | Company offers AI detection, writing tools and other services. | No explicit frequency provided. | Operated by QuillBot as part of their suite of AI product offerings. | From 7535893aecf2895dd09bf00de397afde4edf0f8f Mon Sep 17 00:00:00 2001 From: paulrudy <1110792+paulrudy@users.noreply.github.com> Date: Sun, 15 Jun 2025 16:39:17 -0700 Subject: [PATCH 199/201] re-add facebookexternalhit --- robots.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/robots.json b/robots.json index 5d5f692..60c431c 100644 --- a/robots.json +++ b/robots.json @@ -174,6 +174,13 @@ "frequency": "Up to 1 page per second", "description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically." }, + "facebookexternalhit": { + "operator": "Meta/Facebook", + "respect": "[No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313)", + "function": "Ostensibly only for sharing, but likely used as an AI crawler as well", + "frequency": "Unclear at this time.", + "description": "Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. 
[According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is \"to crawl the content of an app or website that was shared on one of Meta’s family of apps…\". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary." + }, "Factset_spyderbot": { "operator": "[Factset](https://www.factset.com/ai)", "respect": "Unclear at this time.", From 5326c202b57d87a9921810ef26fca98edce77d5f Mon Sep 17 00:00:00 2001 From: "ai.robots.txt" Date: Mon, 16 Jun 2025 15:12:42 +0000 Subject: [PATCH 200/201] Merge pull request #154 from paulrudy/main re-add facebookexternalhit --- .htaccess | 2 +- Caddyfile | 2 +- haproxy-block-ai-bots.txt | 1 + nginx-block-ai-bots.conf | 2 +- robots.txt | 1 + table-of-bot-metrics.md | 1 + 6 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.htaccess b/.htaccess index 27637c2..3ba960f 100644 --- a/.htaccess +++ b/.htaccess @@ -1,3 +1,3 @@ RewriteEngine On -RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ 
Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] +RewriteCond %{HTTP_USER_AGENT} (AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot) [NC] RewriteRule !^/?robots\.txt$ - [F,L] diff --git a/Caddyfile b/Caddyfile index 528ba08..b675b9b 100644 --- a/Caddyfile +++ b/Caddyfile @@ -1,3 +1,3 @@ @aibots { - header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 
1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" + header_regexp User-Agent "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ 
Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)" } \ No newline at end of file diff --git a/haproxy-block-ai-bots.txt b/haproxy-block-ai-bots.txt index c7c3054..9cfb5c6 100644 --- a/haproxy-block-ai-bots.txt +++ b/haproxy-block-ai-bots.txt @@ -23,6 +23,7 @@ Diffbot DuckAssistBot EchoboxBot FacebookBot +facebookexternalhit Factset_spyderbot FirecrawlAgent FriendlyCrawler diff --git a/nginx-block-ai-bots.conf b/nginx-block-ai-bots.conf index c1afb05..a53333e 100644 --- a/nginx-block-ai-bots.conf +++ b/nginx-block-ai-bots.conf @@ -1,3 +1,3 @@ -if ($http_user_agent ~* "(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { +if ($http_user_agent ~* 
"(AI2Bot|Ai2Bot\-Dolma|aiHitBot|Amazonbot|Andibot|anthropic\-ai|Applebot|Applebot\-Extended|bedrockbot|Brightbot\ 1\.0|Bytespider|CCBot|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawlspace|Diffbot|DuckAssistBot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Google\-CloudVertexBot|Google\-Extended|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iaskspider/2\.0|ICC\-Crawler|ImagesiftBot|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|MistralAI\-User/1\.0|MyCentralAIScraperBot|NovaAct|OAI\-SearchBot|omgili|omgilibot|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot|SemrushBot\-BA|SemrushBot\-CT|SemrushBot\-OCOB|SemrushBot\-SI|SemrushBot\-SWA|Sidetrade\ indexer\ bot|TikTokSpider|Timpibot|VelenPublicWebCrawler|Webzio\-Extended|wpbot|YandexAdditional|YandexAdditionalBot|YouBot)") { return 403; } \ No newline at end of file diff --git a/robots.txt b/robots.txt index 0edf11f..9d69a3a 100644 --- a/robots.txt +++ b/robots.txt @@ -23,6 +23,7 @@ User-agent: Diffbot User-agent: DuckAssistBot User-agent: EchoboxBot User-agent: FacebookBot +User-agent: facebookexternalhit User-agent: Factset_spyderbot User-agent: FirecrawlAgent User-agent: FriendlyCrawler diff --git a/table-of-bot-metrics.md b/table-of-bot-metrics.md index 78a3408..a5ab4c7 100644 --- a/table-of-bot-metrics.md +++ b/table-of-bot-metrics.md @@ -25,6 +25,7 @@ | DuckAssistBot | Unclear at this time. | Unclear at this time. | AI Assistants | Unclear at this time. | DuckAssistBot is used by DuckDuckGo's DuckAssist feature to fetch content and generate realtime AI answers to user searches. 
More info can be found at https://darkvisitors.com/agents/agents/duckassistbot | | EchoboxBot | [Echobox](https://echobox.com) | Unclear at this time. | Data collection to support AI-powered products. | Unclear at this time. | Supports company's AI-powered social and email management products. | | FacebookBot | Meta/Facebook | [Yes](https://developers.facebook.com/docs/sharing/bot/) | Training language models | Up to 1 page per second | Officially used for training Meta "speech recognition technology," unknown if used to train Meta AI specifically. | +| facebookexternalhit | Meta/Facebook | [No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) | Ostensibly only for sharing, but likely used as an AI crawler as well | Unclear at this time. | Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is "to crawl the content of an app or website that was shared on one of Meta’s family of apps…". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary. | | Factset\_spyderbot | [Factset](https://www.factset.com/ai) | Unclear at this time. | AI model training. | No information provided. | Scrapes data for AI training. | | FirecrawlAgent | [Firecrawl](https://www.firecrawl.dev/) | Yes | AI scraper and LLM training | No information provided. | Scrapes data for AI systems and LLM training. | | FriendlyCrawler | Unknown | [Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler) | We are using the data from the crawler to build datasets for machine learning experiments. | Unclear at this time. | Unclear who the operator is; but data is used for training/machine learning. 
| From 4ed17b8e4af67d347b039429eb633c96acbba72f Mon Sep 17 00:00:00 2001 From: dark-visitors Date: Tue, 17 Jun 2025 01:00:21 +0000 Subject: [PATCH 201/201] Update from Dark Visitors --- robots.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/robots.json b/robots.json index 60c431c..ab2a383 100644 --- a/robots.json +++ b/robots.json @@ -179,7 +179,7 @@ "respect": "[No](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313)", "function": "Ostensibly only for sharing, but likely used as an AI crawler as well", "frequency": "Unclear at this time.", - "description": "Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is \"to crawl the content of an app or website that was shared on one of Meta’s family of apps…\". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary." + "description": "Note that excluding FacebookExternalHit will block incorporating OpenGraph data when sharing in social media, including rich links in Apple's Messages app. [According to Meta](https://developers.facebook.com/docs/sharing/webmasters/web-crawlers/), its purpose is \"to crawl the content of an app or website that was shared on one of Meta\u2019s family of apps\u2026\". However, see discussions [here](https://github.com/ai-robots-txt/ai.robots.txt/pull/21) and [here](https://github.com/ai-robots-txt/ai.robots.txt/issues/40#issuecomment-2524591313) for evidence to the contrary." }, "Factset_spyderbot": { "operator": "[Factset](https://www.factset.com/ai)",