- # robots.txt
- # https://owly.fans/robots.txt
- # Page is under the FOPL-ZERO license
- # https://owly.fans/license/fopl-zero
- # This page is also on git:
- # Please feel free to suggest a change to this file
- # https://github.com/DynTylluan/owly.fans/blob/main/robots.txt (main)
- # https://notabug.org/DynTylluan/owly.fans/src/main/robots.txt (mirror)
- # https://tildegit.org/cass/owly.fans/src/branch/main/robots.txt (mirror)
- # AdIdxBot (Bing/Microsoft)
- # Used by Bing Ads for «quality control» for, you guessed it, ads!
- # https://bing.com/webmasters/help/which-crawlers-does-bing-use-8c184ec0
- User-Agent: AdIdxBot
- Disallow: /
- # Amazon
- # Used for Alexa - does not cite sources.
- # https://developer.amazon.com/support/amazonbot
- User-Agent: Amazonbot
- Disallow: /
- # Anthropic
- # «AI research and products that put safety at the
- # frontier» - no thanks!
- # https://anthropic.com
- User-agent: anthropic-ai
- Disallow: /
- # Apple's bot
- # Used for Siri and Spotlight Suggestions. Does not cite its sources.
- # https://support.apple.com/en-us/HT204683
- User-Agent: Applebot
- User-Agent: AppleNewsBot
- Disallow: /
- # Claude
- # Used to help AI
- # https://claude.ai
- User-agent: Claude-Web
- Disallow: /
- # Cohere
- # Stupid bot used to help artificial intelligence.
- # https://cohere.com
- User-agent: cohere-ai
- Disallow: /
- # Common Crawl bot
- # From what I've learnt from reading online, «[t]he majority of ChatGPT's
- # training data comes from the Common Crawl crawler bot».
- # Quote: https://datadome.co/threat-research/how-chatgpt-openai-might-use-your-content-now-in-the-future
- # https://commoncrawl.org/faq
- User-agent: CCBot
- Disallow: /
- # FacebookBot
- # Used to «improve language models for our speech recognition technology»,
- # so more AI rubbish that I don't want from a company that I don't like.
- User-Agent: FacebookBot
- Disallow: /
- # Google's AdSense/StoreBot bots
- # Go away - I don't want you here.
- # https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers
- User-Agent: AdsBot-Google-Mobile
- User-Agent: Storebot-Google
- User-Agent: AdsBot-Google
- User-agent: Mediapartners-Google
- Disallow: /
- # Google's AI training bot
- # I also don't want to have my content used without giving me credit.
- # https://blog.google/technology/ai/an-update-on-web-publisher-controls
- User-agent: Google-Extended
- Disallow: /
- # GPTBot is OpenAI's web crawler
- # I do not want it to use my content as it does not cite its sources.
- # https://platform.openai.com/docs/gptbot
- User-agent: GPTBot
- User-agent: ChatGPT-User
- Disallow: /
- # ia_archiver
- # In the past, the Internet Archive (https://archive.org) used to have a
- # bot called ia_archiver that you could allow (or disallow) to index your
- # website; sometime around 2017, this crawler stopped obeying robots.txt.
- # Putting ia_archiver and setting it to «Disallow: /» will not work. If
- # you really want your website to not be seen on this archive, you will
- # need to E-Mail info@archive.org.
- # https://blog.archive.org/2017/04/17/robots-txt-meant-for-search-engines-dont-work-well-for-web-archives
- # https://web.archive.org/web/20150322111536/http://archive.org/about/exclude.php
- # https://blog.reputationx.com/block-wayback-machine
- # Kitty
- # This bot is cool - go wild, queen
- User-Agent: digikitty_x86
- Allow: /
- # Magpie Crawler
- # Used to download pages and get «indexed and analysed by our system».
- # I don't want to use your service!
- User-agent: magpie-crawler
- Disallow: /
- # Pinterest crawler
- # I don't want my website to be used to «collect rich metadata like the
- # price, description and availability of your products» or however they
- # want to word that rubbish.
- # https://help.pinterest.com/en/business/article/pinterest-crawler
- User-agent: Pinterestbot
- Disallow: /
- # This is where I copy Bytemoth
- # I am taking it in good faith that Bytemoth knows what he's doing and that
- # the following bots are for marketers - so I am blocking them.
- # As taken from the following URL: http://bytemoth.nfshost.com/robots.txt
- User-agent: adsbot
- User-agent: AhrefsBot
- User-agent: BLEXBot
- User-agent: dotbot
- User-agent: Pandalytics
- User-agent: SemrushBot
- User-agent: SemrushBot-BA
- User-agent: SemrushBot-BM
- User-agent: SemrushBot-CT
- User-agent: SemrushBot-SA
- User-agent: SemrushBot-SI
- User-agent: SemrushBot-SWA
- User-agent: serpstatbot
- User-agent: MTRobot
- User-agent: PageThing
- Disallow: /
- # Bots that you might like to block
- # There are a few bots that I am okay/don't really care all that much if
- # they index my website, but not everyone might be okay with them, so
- # if you copy this TXT on your website and want to have the bot not index
- # it, simply remove the «#» before the «User-agent» and «Disallow» part.
- # DuckDuckGo
- # The search engine website uses the following bot to index sites.
- # https://duckduckgo.com/duckduckbot
- # https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
- # https://duckduckgo.com/duckduckgo-help-pages/results/sources
- # User-agent: DuckDuckBot
- # Disallow: /
- # Other Google bots
- # Don't like Google? Unmark these.
- # https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers
- # User-agent: Googlebot
- # User-agent: Googlebot-News
- # User-agent: Googlebot-Image
- # User-agent: Googlebot-Video
- # User-agent: Storebot-Google
- # User-agent: Google-InspectionTool
- # User-agent: GoogleOther
- # User-agent: Google-Extended
- # User-agent: APIs-Google
- # User-agent: AdsBot-Google-Mobile
- # User-agent: AdsBot-Google
- # User-agent: Mediapartners-Google
- # User-agent: FeedFetcher-Google
- # User-agent: GoogleProducer
- # User-agent: GoogleProducer; (+http://goo.gl/7y4SX)
- # User-agent: google-speakr
- # Disallow: /
- # Don't allow any bots
- # If you don't want *any* good bots browsing your website, then unmark
- # this. Please note that there are some bots still that don't obey
- # robots.txt at all, like ia_archiver.
- # User-agent: *
- # Disallow: /
- # Allow all bots
- # If you do, however, want to allow all bots, you should unmark the
- # following. You can also just *not* have a robots.txt and it'll
- # work all the same.
- # User-agent: *
- # Allow: /
|