Zuletzt aktiv 1 month ago

Änderung 5819fe84c311d07b96c56fec3222f7723f62cb6b

httpd.conf_spiders Originalformat
# To relieve servers

## Imagine a robots.txt file like this (Google understands this format):
#User-agent: *
#Disallow: /detailed
#Disallow: /?action=detailed
#Disallow: /*/detailed
#Crawl-delay: 20
##
10
# To enable these rules, save them to httpd.conf (Debian/Ubuntu) and include the following 2 lines in each VirtualHost directive:
# RewriteEngine On
# RewriteOptions Inherit

# The rules will then work in your virtual servers as well as in the main server, except for those virtual hosts you don't set up this way.

# If you want to enforce those policies, you can do this:

# Put the directives below in your httpd.conf file
# Turn on mod_rewrite for this (server-level) context.
RewriteEngine On

# Serve one shared robots.txt (for all virtual hosts here) from a file Apache can access.
# The dot is escaped so that only the literal path /robots.txt matches.
RewriteRule ^/robots\.txt$ /etc/apache2/robots.txt [L]
# [L] ends rule processing for this request, so robots.txt is now the only file
# that bots blocked by the rules below are still allowed to download.
25
# Block fake Googlebots: the User-Agent claims to be Googlebot, but the request
# does not originate from 66.249.64.x - 66.249.95.x. [F] => 403 Forbidden.
#
# Both the connecting address and the X-Forwarded-For header are tested, and the
# request is rejected only when NEITHER is inside that range:
#   - direct connections carry no X-Forwarded-For header, so testing the header
#     alone would reject every genuine Googlebot;
#   - behind a reverse proxy REMOTE_ADDR is the proxy, so the header decides.
# NOTE(review): X-Forwarded-For is supplied by the client and can be forged; if
# this server is never behind a proxy, drop the X-Forwarded-For condition.
RewriteCond %{REMOTE_ADDR} !^66\.249\.(6[4-9]|[78][0-9]|9[0-5])\.
RewriteCond %{HTTP:X-FORWARDED-FOR} !^66\.249\.(6[4-9]|[78][0-9]|9[0-5])\.
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC]
RewriteRule .* - [F,L]
# End if match
31
# Prefix blacklist: reject any request whose User-Agent BEGINS with one of the
# names below (case-insensitive). All conditions are OR-ed together; the last
# one carries no OR flag and therefore closes the group.
# NOTE(review): very short prefixes (da, joc, rma, lwp, ...) also match unrelated
# agents that merely start with those letters, e.g. "Dalvik/..." matches "da".
RewriteCond %{HTTP_USER_AGENT} ^(aesop_com_spiderman|alexibot|backweb|bandit|batchftp|bigfoot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(black.?hole|blackwidow|blowfish|botalot|buddy|builtbottough|bullseye) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(cheesebot|cherrypicker|chinaclaw|collector|copier|copyrightcheck) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(cosmos|crescent|curl|custo|da|diibot|disco|dittospyder|dragonfly) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(drip|easydl|ebingbong|ecatch|eirgrabber|emailcollector|emailsiphon) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(emailwolf|erocrawler|exabot|eyenetie|filehound|flashget|flunky) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(frontpage|getright|getweb|go.?zilla|go-ahead-got-it|gotit|grabnet) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(grafula|harvest|hloader|hmview|httplib|httrack|humanlinks|ilsebot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(infonavirobot|infotekies|intelliseek|interget|iria|jennybot|jetcar) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(joc|justview|jyxobot|kenjin|keyword|larbin|leechftp|lexibot|lftp|libweb) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(likse|linkscan|linkwalker|lnspiderguy|lwp|magnet|mag-net|markwatch) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(mata.?hari|memo|microsoft.?url|midown.?tool|miixpc|mirror|missigua) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(mister.?pix|moget|mozilla.?newt|nameprotect|navroad|backdoorbot|nearsite) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(net.?vampire|netants|netcraft|netmechanic|netspider|nextgensearchbot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(attach|nicerspro|nimblecrawler|npbot|octopus|offline.?explorer) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(offline.?navigator|openfind|outfoxbot|pagegrabber|papa|pavuk) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(pcbrowser|php.?version.?tracker|pockey|propowerbot|prowebwalker) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(psbot|pump|queryn|recorder|realdownload|reaper|reget|true_robot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(repomonkey|rma|internetseer|sitesnagger|siphon|slysearch|smartdownload) [NC,OR]
# "sogou" on the next line already covers "Sogou web spider", so no separate
# condition is needed for that agent.
RewriteCond %{HTTP_USER_AGENT} ^(snake|snapbot|snoopy|sogou|spacebison|spankbot|spanner|sqworm|superbot) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(superhttp|surfbot|asterias|suzuran|szukacz|takeout|teleport) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(telesoft|the.?intraformant|thenomad|tighttwatbot|titan|urldispatcher) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(turingos|turnitinbot|urly.?warning|vacuum|vci|voideye|whacker) [NC,OR]
RewriteCond %{HTTP_USER_AGENT} ^(libwww-perl|widow|wisenutbot|wwwoffle|xaldon|xenu|zeus|zyborg|anonymouse) [NC]
# Any match above: answer 403 Forbidden (the ErrorDocument is served) and stop.
RewriteRule . - [F,L]
# End if match
61
# Real search engines that do not respect robots.txt: refuse their detail
# searches with a 403 but let all their other requests pass.
# It seems to take about 2 days of 403 responses before a crawler respects the
# robots.txt file, even though that file was downloaded several times.
#
# Grouping: conditions chained with [OR] form one group, and a condition without
# [OR] closes the group and is AND-ed with what follows. The rule therefore fires
# when (Googlebot OR bingbot OR msnbot-media OR Slurp)
#  AND (detail URI OR action=detailed in the query string).
# Google
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC,OR]
# Bing
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ bingbot/2\.[01];\ \+http://www\.bing\.com/bingbot\.htm\)$ [NC,OR]
# msnbot
RewriteCond %{HTTP_USER_AGENT} ^msnbot-media/1\.[01]\ \(\+http://search\.msn\.com/msnbot\.htm\)$ [NC,OR]
# Slurp (no OR flag: this ends the user-agent group)
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Yahoo!\ Slurp;\ http://help\.yahoo\.com/help/us/ysearch/slurp\)$ [NC]
# Block all detail searches, the rest may pass: /detailed, /EN/detailed, /EN/detailed/...
# The two-letter language prefix is optional and no trailing slash is required,
# so /EN/detailed is covered as well (mirrors "Disallow: /*/detailed").
RewriteCond %{REQUEST_URI} ^/([A-Z]{2}/)?detailed [OR]
# ... or any request with the action=detailed key set
RewriteCond %{QUERY_STRING} action=detailed
# Issue 403 / serve ErrorDocument
RewriteRule .* - [F,L]
# End if match
79
# Definite blocks: crawlers that are always refused, whatever they request.
# Every condition is OR-ed with the next; the final one has no OR flag and
# closes the group.

# MJ12bot (majestic12)
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ MJ12bot/v1\.4\.2;\ http://www\.majestic12\.co\.uk/bot\.php\?\+\)$ [NC,OR]

# Baiduspider
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Baiduspider/2\.[01];\ \+http://www\.baidu\.com/search/spider\.html\)$ [NC,OR]

# OpenindexDeepSpider
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ OpenindexDeepSpider/Nutch-1\.[0-9]-dev;\ \+http://www\.openindex\.io/en/webmasters/spider\.html\)$ [NC,OR]

# User-agent strings starting with "web..." that definitely belong to bad spiders
RewriteCond %{HTTP_USER_AGENT} ^web(zip|emaile|enhancer|fetch|go.?is|auto|bandit|clip|copier|master|reaper|sauger|site.?quester|whack) [NC,OR]

# YandexBot (Russian search engine)
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ YandexBot/3\.[01];\ \+http://yandex\.com/bots\)$ [NC,OR]

# Pingdom
RewriteCond %{HTTP_USER_AGENT} ^Pingdom\.com_bot_version_1\.4_\(http://www\.pingdom\.com/\) [NC,OR]

# AhrefsBot
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ AhrefsBot/2\.[01];\ \+http://ahrefs\.com/robot/\)$ [NC,OR]

# Disabled: block a rogue Facebook application?
#RewriteCond %{HTTP_USER_AGENT} ^facebookexternalhit/1\.1\ \(\+http://www\.facebook\.com/externalhit_uatext.php\)$ [NC,OR]
#
# Vagabondo
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/4\.[01]\ \(compatible;\ \ Vagabondo/4\.0;\ webcrawler\ at\ wise-guys\ dot\ nl;\ http://webagent\.wise-guys\.nl/;\ http://www\.wise-guys\.nl/\) [NC,OR]

# Ezooms
RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Ezooms/1\.[01];\ ezooms\.bot@gmail\.com\)$ [NC,OR]

# Synthesio MonaLisa
RewriteCond %{HTTP_USER_AGENT} ^Synthesio\ Crawler\ release\ MonaLisa\ \(contact\ at\ synthesio\ dot\ fr\)$ [NC,OR]

# Keyword found ANYWHERE in the user agent (greedy regex); last condition, so no OR flag
RewriteCond %{HTTP_USER_AGENT} ^.*(craftbot|download|extract|stripper|sucker|ninja|clshttp|webspider|leacher|collector|grabber|webpictures).*$ [NC]

# Any match above: answer 403 Forbidden (the ErrorDocument is served) and stop.
RewriteRule . - [F,L]
# End if match
118