# robots.txt
# https://owly.fans/robots.txt
# Page is under the FOPL-ZERO license
# https://owly.fans/license/fopl-zero
# This page is also on git:
# Please feel free to suggest a change to this file
# https://github.com/DynTylluan/owly.fans/blob/main/robots.txt (main)
# https://notabug.org/DynTylluan/owly.fans/src/main/robots.txt (mirror)
# https://tildegit.org/cass/owly.fans/src/branch/main/robots.txt (mirror)

# AdIdxBot (Bing/Microsoft)
# Used by Bing Ads for «quality control» for, you guessed it, ads!
# https://bing.com/webmasters/help/which-crawlers-does-bing-use-8c184ec0
User-agent: AdIdxBot
Disallow: /

# Amazon
# Used for Alexa - does not cite its sources.
# https://developer.amazon.com/support/amazonbot
User-agent: Amazonbot
Disallow: /

# Anthropic
# «AI research and products that put safety at the
# frontier» - no thanks!
# https://anthropic.com
User-agent: anthropic-ai
Disallow: /

# Apple's bots
# Used for Siri and Spotlight Suggestions. They do not cite their sources.
# https://support.apple.com/en-us/HT204683
User-agent: Applebot
User-agent: AppleNewsBot
Disallow: /

# Claude
# Used to gather content for Anthropic's Claude AI.
# https://claude.ai
User-agent: Claude-Web
Disallow: /

# Cohere
# Stupid bot used to help artificial intelligence.
# https://cohere.com
User-agent: cohere-ai
Disallow: /

# Common Crawl bot
# From what I've learnt from reading online, «[t]he majority of ChatGPT's
# training data comes from the Common Crawl crawler bot».
# Quote: https://datadome.co/threat-research/how-chatgpt-openai-might-use-your-content-now-in-the-future
# https://commoncrawl.org/faq
User-agent: CCBot
Disallow: /

# FacebookBot
# Used to «improve language models for our speech recognition technology»,
# so more AI rubbish that I don't want from a company that I don't like.
User-agent: FacebookBot
Disallow: /

# Google's AdSense/StoreBot bots
# Go away - I don't want you here.
# https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers
User-agent: AdsBot-Google-Mobile
User-agent: Storebot-Google
User-agent: AdsBot-Google
User-agent: Mediapartners-Google
Disallow: /

# Google's AI training bot
# I also don't want to have my content used without giving me credit.
# https://blog.google/technology/ai/an-update-on-web-publisher-controls
User-agent: Google-Extended
Disallow: /

# GPTBot is OpenAI's web crawler
# I do not want it to use my content as it does not cite its sources.
# https://platform.openai.com/docs/gptbot
User-agent: GPTBot
User-agent: ChatGPT-User
Disallow: /

# ia_archiver
# In the past, the Internet Archive (https://archive.org) had a bot
# called ia_archiver that you could allow (or disallow) to index your
# website. Sometime around 2017, this crawler stopped obeying robots.txt,
# so adding ia_archiver here and setting it to «Disallow: /» will not
# work. If you really want your website to not be seen on this archive,
# you will need to e-mail info@archive.org.
# https://blog.archive.org/2017/04/17/robots-txt-meant-for-search-engines-dont-work-well-for-web-archives
# https://web.archive.org/web/20150322111536/http://archive.org/about/exclude.php
# https://blog.reputationx.com/block-wayback-machine

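# To show what that would look like, here is such a block, left
# commented out since it would not do anything anyway:
# User-agent: ia_archiver
# Disallow: /
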
# Kitty
# This bot is cool - go wild, queen
User-agent: digikitty_x86
Allow: /

# Magpie Crawler
# Used to download pages so that they can be «indexed and analysed by
# our system». I don't want to use your service!
User-agent: magpie-crawler
Disallow: /

# Pinterest crawler
# I don't want my website to be used to «collect rich metadata like the
# price, description and availability of your products» or however they
# want to word that rubbish.
# https://help.pinterest.com/en/business/article/pinterest-crawler
User-agent: Pinterestbot
Disallow: /

# This is where I copy Bytemoth
# I am taking it on good faith that Bytemoth knows what he's doing and
# that the following bots are for marketers - so I am blocking them.
# As taken from the following URL: http://bytemoth.nfshost.com/robots.txt
User-agent: adsbot
User-agent: AhrefsBot
User-agent: BLEXBot
User-agent: dotbot
User-agent: Pandalytics
User-agent: SemrushBot
User-agent: SemrushBot-BA
User-agent: SemrushBot-BM
User-agent: SemrushBot-CT
User-agent: SemrushBot-SA
User-agent: SemrushBot-SI
User-agent: SemrushBot-SWA
User-agent: serpstatbot
User-agent: MTRobot
User-agent: PageThing
Disallow: /

# Bots that you might like to block
# There are a few bots that I don't really mind indexing my website,
# but not everyone might be okay with them, so if you copy this file to
# your own website and want to block a bot, simply remove the «#» before
# its «User-agent» and «Disallow» lines.
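# For example, to block DuckDuckBot below, «# User-agent: DuckDuckBot»
# and «# Disallow: /» would become «User-agent: DuckDuckBot» and
# «Disallow: /».
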
# DuckDuckGo
# The search engine uses the following bot to index sites.
# https://duckduckgo.com/duckduckbot
# https://duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
# https://duckduckgo.com/duckduckgo-help-pages/results/sources
# User-agent: DuckDuckBot
# Disallow: /

# Other Google bots
# Don't like Google? Unmark these.
# https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers
# User-agent: Googlebot
# User-agent: Googlebot-News
# User-agent: Googlebot-Image
# User-agent: Googlebot-Video
# User-agent: Storebot-Google
# User-agent: Google-InspectionTool
# User-agent: GoogleOther
# User-agent: Google-Extended
# User-agent: APIs-Google
# User-agent: AdsBot-Google-Mobile
# User-agent: AdsBot-Google
# User-agent: Mediapartners-Google
# User-agent: FeedFetcher-Google
# User-agent: GoogleProducer
# User-agent: GoogleProducer; (+http://goo.gl/7y4SX)
# User-agent: google-speakr
# Disallow: /

# Don't allow any bots
# If you don't want *any* good bots browsing your website, then unmark
# this. Please note that there are still some bots, like ia_archiver,
# that don't obey robots.txt at all.
# User-agent: *
# Disallow: /

# Allow all bots
# If you do, however, want to allow all bots, you should unmark the
# following. You can also just *not* have a robots.txt and it'll
# work all the same.
# User-agent: *
# Allow: /