最后活跃于 1 month ago

Erreur32's Avatar Erreur32 修订了这个 Gist 6 months ago. 转到此修订

没有任何变更

Glenn Plas 修订了这个 Gist 14 years ago. 转到此修订

1 file changed, 49 insertions, 16 deletions

httpd.conf_spiders

@@ -1,13 +1,34 @@
1 - # To relieve the server
2 - RewriteEngine On
1 + # To relieve servers
3 2
4 - # a robots.txt file is the only file that will be allowed to be downloaded by those bots now, for all virtual servers.
5 - RewriteRule ^/robots.txt$ /etc/apache2/robots.txt [L]
3 + ##Imagine a robots.txt file like this (Google understands this format):
4 + #User-agent: *
5 + #Disallow: /detailed
6 + #Disallow: /?action=detailed
7 + #Disallow: /*/detailed
8 + #Crawl-delay: 20
9 + ##
6 10
7 11 # to enable these rules , save them to httpd.conf (debian/ubuntu) and include the following 2 lines in each VirtualHost directive
8 12 # RewriteEngine On
9 13 # RewriteOptions Inherit
10 14
15 + # Then this will work in your virtual servers as well as in the main server, except for those you don't set up
16 +
17 + # And if you want to enforce those policies, you can do this:
18 +
19 + # put this below in your httpd.conf file
20 + RewriteEngine On
21 +
22 + # Set a general robots.txt file (For all virtual hosts here) to a file apache can access
23 + RewriteRule ^/robots.txt$ /etc/apache2/robots.txt [L]
24 + # a robots.txt file is now the only file that will be allowed to be downloaded by any bots blocked here below
25 +
26 + # Block fake Google when it's not coming from their IP ranges (a fake Googlebot) [F] => Forbidden (403)
27 + RewriteCond %{HTTP:X-FORWARDED-FOR} !^66\.249\.(6[4-9]|[78][0-9]|9[0-5])\.
28 + RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC]
29 + RewriteRule .* - [F,L]
30 + # End if match
31 +
11 32 # IF THE UA STARTS WITH THESE
12 33 RewriteCond %{HTTP_USER_AGENT} ^(aesop_com_spiderman|alexibot|backweb|bandit|batchftp|bigfoot) [NC,OR]
13 34 RewriteCond %{HTTP_USER_AGENT} ^(black.?hole|blackwidow|blowfish|botalot|buddy|builtbottough|bullseye) [NC,OR]
@@ -33,25 +54,38 @@ RewriteCond %{HTTP_USER_AGENT} ^(superhttp|surfbot|asterias|suzuran|szukacz|take
33 54 RewriteCond %{HTTP_USER_AGENT} ^(telesoft|the.?intraformant|thenomad|tighttwatbot|titan|urldispatcher) [NC,OR]
34 55 RewriteCond %{HTTP_USER_AGENT} ^(turingos|turnitinbot|urly.?warning|vacuum|vci|voideye|whacker) [NC,OR]
35 56 RewriteCond %{HTTP_USER_AGENT} ^(libwww-perl|widow|wisenutbot|wwwoffle|xaldon|xenu|zeus|zyborg|anonymouse) [NC,OR]
36 - RewriteCond %{HTTP_USER_AGENT} ^(Sogou\ web\ spider) [NC,OR]
57 + RewriteCond %{HTTP_USER_AGENT} ^(Sogou\ web\ spider) [NC]
58 + # ISSUE 403 / SERVE ERRORDOCUMENT
59 + RewriteRule . - [F,L]
60 + # End if match
37 61
38 - # Block google
62 + # Block real engines not respecting robots.txt, but allow correct calls to pass (basically blocks all detail searches)
63 + # It seems to take about 2 days of 403 to make it respect the robots.txt file, even though this one got downloaded several times
64 + # Google
39 65 RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC,OR]
40 - # Block bing
66 + # Bing
41 67 RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ bingbot/2\.[01];\ \+http://www\.bing\.com/bingbot\.htm\)$ [NC,OR]
42 - # Block msnbot
68 + # msnbot
43 69 RewriteCond %{HTTP_USER_AGENT} ^msnbot-media/1\.[01]\ \(\+http://search\.msn\.com/msnbot\.htm\)$ [NC,OR]
70 + # Slurp
71 + RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Yahoo!\ Slurp;\ http://help\.yahoo\.com/help/us/ysearch/slurp\)$ [NC]
72 + # block all detail searches, the rest may pass (things like /detailed, /EN/detailed and ?action=detailed)
73 + RewriteCond %{REQUEST_URI} ^(/detailed|/[A-Z]{2}/detailed/) [OR]
74 + # or with the action=detailed key set
75 + RewriteCond %{QUERY_STRING} action=detailed
76 + # ISSUE 403 / SERVE ERRORDOCUMENT
77 + RewriteRule .* - [F,L]
78 + # End if match
44 79
45 - # Some idiot bot
80 + # Definite blocks
46 81 RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ MJ12bot/v1\.4\.2;\ http://www\.majestic12\.co\.uk/bot\.php\?\+\)$ [NC,OR]
47 -
48 82 # Baidus
49 83 RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Baiduspider/2\.[01];\ \+http://www\.baidu\.com/search/spider\.html\)$ [NC,OR]
50 84
51 85 # Deepspider
52 86 RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ OpenindexDeepSpider/Nutch-1\.[0-9]-dev;\ \+http://www\.openindex\.io/en/webmasters/spider\.html\)$ [NC,OR]
53 87
54 - # STARTS WITH WEB
88 + # Known user agent strings definitely belonging to bad spiders
55 89 RewriteCond %{HTTP_USER_AGENT} ^web(zip|emaile|enhancer|fetch|go.?is|auto|bandit|clip|copier|master|reaper|sauger|site.?quester|whack) [NC,OR]
56 90
57 91 # Yandex (russian google)
@@ -63,14 +97,12 @@ RewriteCond %{HTTP_USER_AGENT} ^Pingdom\.com_bot_version_1\.4_\(http://www\.ping
63 97 # AhrefsBot
64 98 RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ AhrefsBot/2\.[01];\ \+http://ahrefs\.com/robot/\)$ [NC,OR]
65 99
66 - # Some rogue facebook faker ?
100 + # Block a rogue facebook application ?
67 101 #RewriteCond %{HTTP_USER_AGENT} ^facebookexternalhit/1\.1\ \(\+http://www\.facebook\.com/externalhit_uatext.php\)$ [NC,OR]
68 -
102 + #
69 103 # Vagabondo
70 104 RewriteCond %{HTTP_USER_AGENT} ^Mozilla/4\.[01]\ \(compatible;\ \ Vagabondo/4\.0;\ webcrawler\ at\ wise-guys\ dot\ nl;\ http://webagent\.wise-guys\.nl/;\ http://www\.wise-guys\.nl/\) [NC,OR]
71 105
72 - # Slurp
73 - RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Yahoo!\ Slurp;\ http://help\.yahoo\.com/help/us/ysearch/slurp\)$ [NC,OR]
74 106 # Ezooms
75 107 RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Ezooms/1\.[01];\ ezooms\.bot@gmail\.com\)$ [NC,OR]
76 108
@@ -81,4 +113,5 @@ RewriteCond %{HTTP_USER_AGENT} ^Synthesio\ Crawler\ release\ MonaLisa\ \(contact
81 113 RewriteCond %{HTTP_USER_AGENT} ^.*(craftbot|download|extract|stripper|sucker|ninja|clshttp|webspider|leacher|collector|grabber|webpictures).*$ [NC]
82 114
83 115 # ISSUE 403 / SERVE ERRORDOCUMENT
84 - RewriteRule . - [F,L]
116 + RewriteRule . - [F,L]
117 + # End if match

Glenn Plas 修订了这个 Gist 14 years ago. 转到此修订

1 file changed, 84 insertions

httpd.conf_spiders(文件已创建)

@@ -0,0 +1,84 @@
1 + # To relieve the server
2 + RewriteEngine On
3 +
4 + # a robots.txt file is the only file that will be allowed to be downloaded by those bots now, for all virtual servers.
5 + RewriteRule ^/robots.txt$ /etc/apache2/robots.txt [L]
6 +
7 + # to enable these rules , save them to httpd.conf (debian/ubuntu) and include the following 2 lines in each VirtualHost directive
8 + # RewriteEngine On
9 + # RewriteOptions Inherit
10 +
11 + # IF THE UA STARTS WITH THESE
12 + RewriteCond %{HTTP_USER_AGENT} ^(aesop_com_spiderman|alexibot|backweb|bandit|batchftp|bigfoot) [NC,OR]
13 + RewriteCond %{HTTP_USER_AGENT} ^(black.?hole|blackwidow|blowfish|botalot|buddy|builtbottough|bullseye) [NC,OR]
14 + RewriteCond %{HTTP_USER_AGENT} ^(cheesebot|cherrypicker|chinaclaw|collector|copier|copyrightcheck) [NC,OR]
15 + RewriteCond %{HTTP_USER_AGENT} ^(cosmos|crescent|curl|custo|da|diibot|disco|dittospyder|dragonfly) [NC,OR]
16 + RewriteCond %{HTTP_USER_AGENT} ^(drip|easydl|ebingbong|ecatch|eirgrabber|emailcollector|emailsiphon) [NC,OR]
17 + RewriteCond %{HTTP_USER_AGENT} ^(emailwolf|erocrawler|exabot|eyenetie|filehound|flashget|flunky) [NC,OR]
18 + RewriteCond %{HTTP_USER_AGENT} ^(frontpage|getright|getweb|go.?zilla|go-ahead-got-it|gotit|grabnet) [NC,OR]
19 + RewriteCond %{HTTP_USER_AGENT} ^(grafula|harvest|hloader|hmview|httplib|httrack|humanlinks|ilsebot) [NC,OR]
20 + RewriteCond %{HTTP_USER_AGENT} ^(infonavirobot|infotekies|intelliseek|interget|iria|jennybot|jetcar) [NC,OR]
21 + RewriteCond %{HTTP_USER_AGENT} ^(joc|justview|jyxobot|kenjin|keyword|larbin|leechftp|lexibot|lftp|libweb) [NC,OR]
22 + RewriteCond %{HTTP_USER_AGENT} ^(likse|linkscan|linkwalker|lnspiderguy|lwp|magnet|mag-net|markwatch) [NC,OR]
23 + RewriteCond %{HTTP_USER_AGENT} ^(mata.?hari|memo|microsoft.?url|midown.?tool|miixpc|mirror|missigua) [NC,OR]
24 + RewriteCond %{HTTP_USER_AGENT} ^(mister.?pix|moget|mozilla.?newt|nameprotect|navroad|backdoorbot|nearsite) [NC,OR]
25 + RewriteCond %{HTTP_USER_AGENT} ^(net.?vampire|netants|netcraft|netmechanic|netspider|nextgensearchbot) [NC,OR]
26 + RewriteCond %{HTTP_USER_AGENT} ^(attach|nicerspro|nimblecrawler|npbot|octopus|offline.?explorer) [NC,OR]
27 + RewriteCond %{HTTP_USER_AGENT} ^(offline.?navigator|openfind|outfoxbot|pagegrabber|papa|pavuk) [NC,OR]
28 + RewriteCond %{HTTP_USER_AGENT} ^(pcbrowser|php.?version.?tracker|pockey|propowerbot|prowebwalker) [NC,OR]
29 + RewriteCond %{HTTP_USER_AGENT} ^(psbot|pump|queryn|recorder|realdownload|reaper|reget|true_robot) [NC,OR]
30 + RewriteCond %{HTTP_USER_AGENT} ^(repomonkey|rma|internetseer|sitesnagger|siphon|slysearch|smartdownload) [NC,OR]
31 + RewriteCond %{HTTP_USER_AGENT} ^(snake|snapbot|snoopy|sogou|spacebison|spankbot|spanner|sqworm|superbot) [NC,OR]
32 + RewriteCond %{HTTP_USER_AGENT} ^(superhttp|surfbot|asterias|suzuran|szukacz|takeout|teleport) [NC,OR]
33 + RewriteCond %{HTTP_USER_AGENT} ^(telesoft|the.?intraformant|thenomad|tighttwatbot|titan|urldispatcher) [NC,OR]
34 + RewriteCond %{HTTP_USER_AGENT} ^(turingos|turnitinbot|urly.?warning|vacuum|vci|voideye|whacker) [NC,OR]
35 + RewriteCond %{HTTP_USER_AGENT} ^(libwww-perl|widow|wisenutbot|wwwoffle|xaldon|xenu|zeus|zyborg|anonymouse) [NC,OR]
36 + RewriteCond %{HTTP_USER_AGENT} ^(Sogou\ web\ spider) [NC,OR]
37 +
38 + # Block google
39 + RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC,OR]
40 + # Block bing
41 + RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ bingbot/2\.[01];\ \+http://www\.bing\.com/bingbot\.htm\)$ [NC,OR]
42 + # Block msnbot
43 + RewriteCond %{HTTP_USER_AGENT} ^msnbot-media/1\.[01]\ \(\+http://search\.msn\.com/msnbot\.htm\)$ [NC,OR]
44 +
45 + # Some idiot bot
46 + RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ MJ12bot/v1\.4\.2;\ http://www\.majestic12\.co\.uk/bot\.php\?\+\)$ [NC,OR]
47 +
48 + # Baidus
49 + RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Baiduspider/2\.[01];\ \+http://www\.baidu\.com/search/spider\.html\)$ [NC,OR]
50 +
51 + # Deepspider
52 + RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ OpenindexDeepSpider/Nutch-1\.[0-9]-dev;\ \+http://www\.openindex\.io/en/webmasters/spider\.html\)$ [NC,OR]
53 +
54 + # STARTS WITH WEB
55 + RewriteCond %{HTTP_USER_AGENT} ^web(zip|emaile|enhancer|fetch|go.?is|auto|bandit|clip|copier|master|reaper|sauger|site.?quester|whack) [NC,OR]
56 +
57 + # Yandex (russian google)
58 + RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ YandexBot/3\.[01];\ \+http://yandex\.com/bots\)$ [NC,OR]
59 +
60 + # Pingdom
61 + RewriteCond %{HTTP_USER_AGENT} ^Pingdom\.com_bot_version_1\.4_\(http://www\.pingdom\.com/\) [NC,OR]
62 +
63 + # AhrefsBot
64 + RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ AhrefsBot/2\.[01];\ \+http://ahrefs\.com/robot/\)$ [NC,OR]
65 +
66 + # Some rogue facebook faker ?
67 + #RewriteCond %{HTTP_USER_AGENT} ^facebookexternalhit/1\.1\ \(\+http://www\.facebook\.com/externalhit_uatext.php\)$ [NC,OR]
68 +
69 + # Vagabondo
70 + RewriteCond %{HTTP_USER_AGENT} ^Mozilla/4\.[01]\ \(compatible;\ \ Vagabondo/4\.0;\ webcrawler\ at\ wise-guys\ dot\ nl;\ http://webagent\.wise-guys\.nl/;\ http://www\.wise-guys\.nl/\) [NC,OR]
71 +
72 + # Slurp
73 + RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Yahoo!\ Slurp;\ http://help\.yahoo\.com/help/us/ysearch/slurp\)$ [NC,OR]
74 + # Ezooms
75 + RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Ezooms/1\.[01];\ ezooms\.bot@gmail\.com\)$ [NC,OR]
76 +
77 + # Monalisa
78 + RewriteCond %{HTTP_USER_AGENT} ^Synthesio\ Crawler\ release\ MonaLisa\ \(contact\ at\ synthesio\ dot\ fr\)$ [NC,OR]
79 +
80 + # ANYWHERE IN UA -- GREEDY REGEX
81 + RewriteCond %{HTTP_USER_AGENT} ^.*(craftbot|download|extract|stripper|sucker|ninja|clshttp|webspider|leacher|collector|grabber|webpictures).*$ [NC]
82 +
83 + # ISSUE 403 / SERVE ERRORDOCUMENT
84 + RewriteRule . - [F,L]
上一页 下一页