Added robots file
[ghost-theme-work.git] / robots.txt
#
# robots.txt, based on the one for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow: /

User-agent: Orthogaffe
Disallow: /

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

#
# Sorry, wget in its recursive mode is a frequent problem.
# Please read the man page and use it properly; there is a
# --wait option you can use to set the delay between hits,
# for instance.
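# As a rough sketch (example.org is a placeholder, not this site), a
# polite recursive fetch might look like:
#   wget --recursive --wait=2 --limit-rate=50k https://example.org/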
#
User-agent: wget
Disallow: /

#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /

#
# Doesn't follow robots.txt anyway, but...
#
User-agent: k2spider
Disallow: /

#
# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /

# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /

# Prevent TurnItIn
User-agent: TurnitinBot
Disallow: /

# Disallow AI training / harvesting bots
User-agent: CCBot
Disallow: /

User-agent: ChatGPT-User
Disallow: /

User-agent: GPTBot
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: Omgilibot
Disallow: /

User-agent: FacebookBot
Disallow: /

# Don't allow the Wayback Machine to index user pages
#User-agent: ia_archiver
#Disallow: /wiki/User
#Disallow: /wiki/Benutzer

#
# Friendly, low-speed bots are welcome to view article pages, but please
# stay away from dynamically generated pages.
#
# Inktomi's "Slurp" understands a minimum delay between hits; if your
# bot supports such a thing via 'Crawl-delay' or another instruction,
# please let us know.
#
User-agent: *
Disallow: /mediawiki/
Disallow: /trap/
Disallow: /Special
Disallow: /Special:Random
Disallow: /Special%3ARandom
Disallow: /Special:Search
Disallow: /Special%3ASearch

## *at least* 1 second please, preferably more :D
Crawl-delay: 123