## Some of this is cribbed from https://www.dcc-servers.net/robots.txt
### Geographically Meh ### {{{
## No need for Chinese or Russian searches
User-agent: Baiduspider
Disallow: /
## Czech Republic
User-agent: SeznamBot
Disallow: /
## No need for Russian searches, and they fetch but ignore robots.txt
User-Agent: Yandex
Disallow: /
### End Geographically Meh ### }}}
### SEO Dung Bots ### {{{
## "The World's Experts in Search Analytics"
## is yet another SEO outfit that hammers HTTP servers without permission
## and without benefit for at least some HTTP server operators.
User-Agent: Searchmetrics
Disallow: /
## Claimed SEO; ignores robots.txt
User-Agent: lipperhey
Disallow: /
## Claimed SEO
User-Agent: dataprovider.com
Disallow: /
## SEO
## http://www.semrush.com/bot.html suggests its results are for users:
## "Well, the real question is why do you not want the bot visiting
## your page? Most bots are both harmless and quite beneficial. Bots
## like Googlebot discover sites by following links from page to page.
## This bot is crawling your page to help parse the content, so that
## the relevant information contained within your site is easily indexed
## and made more readily available to users searching for the content
## you provide."
User-Agent: SemrushBot
Disallow: /
## SEO bs
User-agent: spbot
Disallow: /
## SEO bs
## Wasn't respecting the 'dotbot' block...
User-agent: DotBot
Disallow: /
### End SEO Dung Bots ### }}}
### Poorly Implemented Crap Bots ### {{{
## Stupid bot
User-Agent: purebot
Disallow: /
## Seems to search only for non-existent pages.
## See ezooms.bot@gmail.com and wowrack.com
User-Agent: Ezooms
Disallow: /
## http://www.majestic12.co.uk/bot.php?+ follows many bogus and corrupt links
## and so generates a lot of error log noise.
## It does us no good and is a waste of our bandwidth.
User-Agent: MJ12bot
Disallow: /
## There is no need to waste bandwidth on an outfit trying to monetize our
## web pages; $50 for data scraped from the web is too much.
## It never bothers fetching robots.txt.
## See http://www.domaintools.com
User-Agent: SurveyBot
Disallow: /
User-Agent: DomainTools
Disallow: /
## Too many mangled links and implausible home page
User-Agent: sitebot
Disallow: /
## At best another broken spider that thinks all URLs are at the top level.
## At worst, a malware scanner.
## Never fetches robots.txt, contrary to http://www.warebay.com/bot.html.
## See SolomonoBot/1.02 (http://www.solomono.ru)
User-Agent: SolomonoBot
Disallow: /
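## (Illustration, not an observed request: a spider like that takes a link
## to /dir/page.html and fetches /page.html, discarding the directory.)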
## Yet another claimed search engine that generates bad links from plain text.
## It fetches and then ignores robots.txt
## 188.138.48.235 http://www.warebay.com/bot.html
User-Agent: WBSearchBot
Disallow: /
## Ignores robots.txt
User-Agent: Sosospider
Disallow: /
## Does not handle protocol-relative links. It does not fetch robots.txt.
User-Agent: 360Spider
Disallow: /
## Does not handle protocol-relative links (see the note below).
User-Agent: 80legs
Disallow: /
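## (A protocol-relative link omits the scheme, e.g. <A HREF="//example.com/x">,
## and should be resolved against the current page's scheme. Broken crawlers
## instead request a literal local path such as //example.com/x.)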
## Does not know the difference between a hyperlink <A HREF="..."></A> and
## anchors that are not links such as <A NAME="..."></A>
User-Agent: YamanaLab-Robot
Disallow: /
## Ignores rel="nofollow" in links.
## Parses ...href='asdf' onclick='... (single quotes (') instead of
## double quotes (")) as if " onclick=..." were part of the URL.
## It fetches robots.txt and then ignores it.
User-Agent: Aboundex
Disallow: /
User-Agent: Aboundexbot
Disallow: /
## Fetches robots.txt for only some domains.
## It searches for non-existent but often abused URLs such as .../contact.cgi
User-Agent: yunyun
Disallow: /
## Multiple long crawls a day... and .ru
User-Agent: MegaIndex.ru
Disallow: /
### End Poorly Implemented Crap Bots ### }}}
### Waste of Bandwidth ### {{{
## Monetizers of other people's bandwidth.
User-Agent: Exabot
Disallow: /
## Monetizers of other people's bandwidth.
User-Agent: findlinks
Disallow: /
## Monetizers of other people's bandwidth.
User-Agent: aiHitBot
Disallow: /
## Monetizer of other people's bandwidth. It ignores robots.txt.
User-Agent: AhrefsBot
Disallow: /
## Yet another monetizer of other people's bandwidth that hits selected
## pages every few seconds from about a dozen HTTP clients around the
## world without let, leave, hindrance, or notice.
## There is no apparent way to ask them to stop. One DinoPing agent at
## support@edis.at responded to a request to stop with "just use iptables"
## on 2012/08/13.
## They're blind to the irony that one of their targets is
## <A HREF="that-which-we-dont.html">http://www.rhyolite.com/anti-spam/that-which-we-dont.html</A>
User-Agent: DinoPing
Disallow: /
## Waste of bandwidth
User-Agent: masscan
Disallow: /
## Waste of bandwidth
User-Agent: escan
Disallow: /
## No apparent reason to spend bandwidth or attention on its bad URLs in logs
User-Agent: discoverybot
Disallow: /
## Unasked-for tracking. Monetizes.
User-agent: Uptimebot
Disallow: /
### End Waste of Bandwidth ### }}}
### Get Off My Lawn ### {{{
## Cutesy story is years stale and no longer excuses bad crawling
User-Agent: dotnetdotcom
Disallow: /
## Cutesy story is years stale and no longer excuses bad crawling
User-Agent: dotbot
Disallow: /
## Unprovoked, unasked-for "monitoring" and "checking"
User-Agent: panopta.com
Disallow: /
## No "biomedical, biochemical, drug, health and disease related data" here.
## 192.31.21.179 switched from www.integromedb.org/Crawler to "Java/1.6.0_20"
## and "-" after integromedb was added to robots.txt.
User-Agent: www.integromedb.org/Crawler
Disallow: /
## Ambulance chasers with a stupid spider that hits the bad-spider trap.
User-Agent: ip-web-crawler.com
Disallow: /
## Little public information
User-Agent: Findxbot
Disallow: /
## Don't know why it crawled me
User-Agent: ips-agent
Disallow: /
## Don't know why it crawled me
User-Agent: Go-http-client
Disallow: /
### End Get Off My Lawn ### }}}
### Plain Attack ### {{{
## evil
User-Agent: ZmEu
Disallow: /
## evil
User-Agent: Morfeus
Disallow: /
## evil
User-Agent: Snoopy
Disallow: /
### End Plain Attack ### }}}
User-agent: bot-pge.chlooe.com
Disallow: /
## Firewall anything that goes to the trap
User-agent: *
Allow: /
Disallow: /badbottrap
Disallow: /.well-known
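## For standard parsers the longest matching rule wins, so the Disallow
## lines above override "Allow: /"; only a client that ignores robots.txt
## ever requests /badbottrap. A minimal log-watcher sketch in Python
## (the log path, log format, and firewall command are assumptions,
## not this site's actual setup):
##
##   import re, subprocess
##   TRAP = re.compile(r'^(\S+) \S+ \S+ \[[^]]*\] "GET /badbottrap')
##   with open("/var/log/httpd/access_log") as log:   # hypothetical path
##       for line in log:
##           m = TRAP.match(line)
##           if m:   # drop all further traffic from this client
##               subprocess.run(["iptables", "-A", "INPUT",
##                               "-s", m.group(1), "-j", "DROP"])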