Erreur32 ha revisionato questo gist 6 months ago. Vai alla revisione
Nessuna modifica
Glenn Plas ha revisionato questo gist 14 years ago. Vai alla revisione
1 file changed, 49 insertions, 16 deletions
httpd.conf_spiders
| @@ -1,13 +1,34 @@ | |||
| 1 | - | # To relieve the server | |
| 2 | - | RewriteEngine On | |
| 1 | + | # To relieve servers | |
| 3 | 2 | ||
| 4 | - | # a robots.txt file is the only file that will be allowed to be downloaded by those bots now, for all virtual servers. | |
| 5 | - | RewriteRule ^/robots.txt$ /etc/apache2/robots.txt [L] | |
| 3 | + | ##Imagine a robots.txt file like this (Google understands this format): | |
| 4 | + | #User-agent: * | |
| 5 | + | #Disallow: /detailed | |
| 6 | + | #Disallow: /?action=detailed | |
| 7 | + | #Disallow: /*/detailed | |
| 8 | + | #Crawl-delay: 20 | |
| 9 | + | ## | |
| 6 | 10 | ||
| 7 | 11 | # to enable these rules , save them to httpd.conf (debian/ubuntu) and include the following 2 lines in each VirtualHost directive | |
| 8 | 12 | # RewriteEngine On | |
| 9 | 13 | # RewriteOptions Inherit | |
| 10 | 14 | ||
| 15 | + | # Then this will work in your virtual servers as well as in the main server, except for those you don't set up | |
| 16 | + | ||
| 17 | + | # And if you want to enforce those policies, you can do this: | |
| 18 | + | ||
| 19 | + | # put this below in your httpd.conf file | |
| 20 | + | RewriteEngine On | |
| 21 | + | ||
| 22 | + | # Set a general robots.txt file (For all virtual hosts here) to a file apache can access | |
| 23 | + | RewriteRule ^/robots.txt$ /etc/apache2/robots.txt [L] | |
| 24 | + | # a robots.txt file is now the only file that will be allowed to be downloaded by any bots blocked here below | |
| 25 | + | ||
| 26 | + | # Block fake google when it's not coming from their IP ranges (A fake googlebot) [F] => Forbidden (403) | |
| 27 | + | RewriteCond %{HTTP:X-FORWARDED-FOR} !^66\.249\.(6[4-9]|[78][0-9]|9[0-5])\. | |
| 28 | + | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC] | |
| 29 | + | RewriteRule .* - [F,L] | |
| 30 | + | # End if match | |
| 31 | + | ||
| 11 | 32 | # IF THE UA STARTS WITH THESE | |
| 12 | 33 | RewriteCond %{HTTP_USER_AGENT} ^(aesop_com_spiderman|alexibot|backweb|bandit|batchftp|bigfoot) [NC,OR] | |
| 13 | 34 | RewriteCond %{HTTP_USER_AGENT} ^(black.?hole|blackwidow|blowfish|botalot|buddy|builtbottough|bullseye) [NC,OR] | |
| @@ -33,25 +54,38 @@ RewriteCond %{HTTP_USER_AGENT} ^(superhttp|surfbot|asterias|suzuran|szukacz|take | |||
| 33 | 54 | RewriteCond %{HTTP_USER_AGENT} ^(telesoft|the.?intraformant|thenomad|tighttwatbot|titan|urldispatcher) [NC,OR] | |
| 34 | 55 | RewriteCond %{HTTP_USER_AGENT} ^(turingos|turnitinbot|urly.?warning|vacuum|vci|voideye|whacker) [NC,OR] | |
| 35 | 56 | RewriteCond %{HTTP_USER_AGENT} ^(libwww-perl|widow|wisenutbot|wwwoffle|xaldon|xenu|zeus|zyborg|anonymouse) [NC,OR] | |
| 36 | - | RewriteCond %{HTTP_USER_AGENT} ^(Sogou\ web\ spider) [NC,OR] | |
| 57 | + | RewriteCond %{HTTP_USER_AGENT} ^(Sogou\ web\ spider) [NC] | |
| 58 | + | # ISSUE 403 / SERVE ERRORDOCUMENT | |
| 59 | + | RewriteRule . - [F,L] | |
| 60 | + | # End if match | |
| 37 | 61 | ||
| 38 | - | # Block google | |
| 62 | + | # Block real Engines , not respecting robots.txt but allowing correct calls to pass (all detail searches basically) | |
| 63 | + | # It seems to take about 2 days of 403 to make it respect the robots.txt file, even though this one got downloaded several times | |
| 64 | + | ||
| 39 | 65 | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC,OR] | |
| 40 | - | # Block bing | |
| 66 | + | # Bing | |
| 41 | 67 | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ bingbot/2\.[01];\ \+http://www\.bing\.com/bingbot\.htm\)$ [NC,OR] | |
| 42 | - | # Block msnbot | |
| 68 | + | # msnbot | |
| 43 | 69 | RewriteCond %{HTTP_USER_AGENT} ^msnbot-media/1\.[01]\ \(\+http://search\.msn\.com/msnbot\.htm\)$ [NC,OR] | |
| 70 | + | # Slurp | |
| 71 | + | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Yahoo!\ Slurp;\ http://help\.yahoo\.com/help/us/ysearch/slurp\)$ [NC] | |
| 72 | + | # block all detail searches, the rest may pass (things like /detailed, /EN/detailed and ?action=detailed) | |
| 73 | + | RewriteCond %{REQUEST_URI} ^(/detailed|/[A-Z]{2}/detailed/) [OR] | |
| 74 | + | # or with the action=detailed key set | |
| 75 | + | RewriteCond %{QUERY_STRING} action=detailed | |
| 76 | + | # ISSUE 403 / SERVE ERRORDOCUMENT | |
| 77 | + | RewriteRule .* - [F,L] | |
| 78 | + | # End if match | |
| 44 | 79 | ||
| 45 | - | # Some idiot bot | |
| 80 | + | # Definite blocks | |
| 46 | 81 | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ MJ12bot/v1\.4\.2;\ http://www\.majestic12\.co\.uk/bot\.php\?\+\)$ [NC,OR] | |
| 47 | - | ||
| 48 | 82 | # Baidus | |
| 49 | 83 | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Baiduspider/2\.[01];\ \+http://www\.baidu\.com/search/spider\.html\)$ [NC,OR] | |
| 50 | 84 | ||
| 51 | 85 | # Deepspider | |
| 52 | 86 | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ OpenindexDeepSpider/Nutch-1\.[0-9]-dev;\ \+http://www\.openindex\.io/en/webmasters/spider\.html\)$ [NC,OR] | |
| 53 | 87 | ||
| 54 | - | # STARTS WITH WEB | |
| 88 | + | # Known user agent strings definitely belonging to bad spiders | |
| 55 | 89 | RewriteCond %{HTTP_USER_AGENT} ^web(zip|emaile|enhancer|fetch|go.?is|auto|bandit|clip|copier|master|reaper|sauger|site.?quester|whack) [NC,OR] | |
| 56 | 90 | ||
| 57 | 91 | # Yandex (russian google) | |
| @@ -63,14 +97,12 @@ RewriteCond %{HTTP_USER_AGENT} ^Pingdom\.com_bot_version_1\.4_\(http://www\.ping | |||
| 63 | 97 | # AhrefsBot | |
| 64 | 98 | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ AhrefsBot/2\.[01];\ \+http://ahrefs\.com/robot/\)$ [NC,OR] | |
| 65 | 99 | ||
| 66 | - | # Some rogue facebook faker ? | |
| 100 | + | # Block a rogue facebook application ? | |
| 67 | 101 | #RewriteCond %{HTTP_USER_AGENT} ^facebookexternalhit/1\.1\ \(\+http://www\.facebook\.com/externalhit_uatext.php\)$ [NC,OR] | |
| 68 | - | ||
| 102 | + | # | |
| 69 | 103 | # Vagabondo | |
| 70 | 104 | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/4\.[01]\ \(compatible;\ \ Vagabondo/4\.0;\ webcrawler\ at\ wise-guys\ dot\ nl;\ http://webagent\.wise-guys\.nl/;\ http://www\.wise-guys\.nl/\) [NC,OR] | |
| 71 | 105 | ||
| 72 | - | # Slurp | |
| 73 | - | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Yahoo!\ Slurp;\ http://help\.yahoo\.com/help/us/ysearch/slurp\)$ [NC,OR] | |
| 74 | 106 | # Ezooms | |
| 75 | 107 | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Ezooms/1\.[01];\ ezooms\.bot@gmail\.com\)$ [NC,OR] | |
| 76 | 108 | ||
| @@ -81,4 +113,5 @@ RewriteCond %{HTTP_USER_AGENT} ^Synthesio\ Crawler\ release\ MonaLisa\ \(contact | |||
| 81 | 113 | RewriteCond %{HTTP_USER_AGENT} ^.*(craftbot|download|extract|stripper|sucker|ninja|clshttp|webspider|leacher|collector|grabber|webpictures).*$ [NC] | |
| 82 | 114 | ||
| 83 | 115 | # ISSUE 403 / SERVE ERRORDOCUMENT | |
| 84 | - | RewriteRule . - [F,L] | |
| 116 | + | RewriteRule . - [F,L] | |
| 117 | + | # End if match | |
Glenn Plas ha revisionato questo gist 14 years ago. Vai alla revisione
1 file changed, 84 insertions
httpd.conf_spiders(file creato)
| @@ -0,0 +1,84 @@ | |||
| 1 | + | # To relieve the server | |
| 2 | + | RewriteEngine On | |
| 3 | + | ||
| 4 | + | # a robots.txt file is the only file that will be allowed to be downloaded by those bots now, for all virtual servers. | |
| 5 | + | RewriteRule ^/robots.txt$ /etc/apache2/robots.txt [L] | |
| 6 | + | ||
| 7 | + | # to enable these rules , save them to httpd.conf (debian/ubuntu) and include the following 2 lines in each VirtualHost directive | |
| 8 | + | # RewriteEngine On | |
| 9 | + | # RewriteOptions Inherit | |
| 10 | + | ||
| 11 | + | # IF THE UA STARTS WITH THESE | |
| 12 | + | RewriteCond %{HTTP_USER_AGENT} ^(aesop_com_spiderman|alexibot|backweb|bandit|batchftp|bigfoot) [NC,OR] | |
| 13 | + | RewriteCond %{HTTP_USER_AGENT} ^(black.?hole|blackwidow|blowfish|botalot|buddy|builtbottough|bullseye) [NC,OR] | |
| 14 | + | RewriteCond %{HTTP_USER_AGENT} ^(cheesebot|cherrypicker|chinaclaw|collector|copier|copyrightcheck) [NC,OR] | |
| 15 | + | RewriteCond %{HTTP_USER_AGENT} ^(cosmos|crescent|curl|custo|da|diibot|disco|dittospyder|dragonfly) [NC,OR] | |
| 16 | + | RewriteCond %{HTTP_USER_AGENT} ^(drip|easydl|ebingbong|ecatch|eirgrabber|emailcollector|emailsiphon) [NC,OR] | |
| 17 | + | RewriteCond %{HTTP_USER_AGENT} ^(emailwolf|erocrawler|exabot|eyenetie|filehound|flashget|flunky) [NC,OR] | |
| 18 | + | RewriteCond %{HTTP_USER_AGENT} ^(frontpage|getright|getweb|go.?zilla|go-ahead-got-it|gotit|grabnet) [NC,OR] | |
| 19 | + | RewriteCond %{HTTP_USER_AGENT} ^(grafula|harvest|hloader|hmview|httplib|httrack|humanlinks|ilsebot) [NC,OR] | |
| 20 | + | RewriteCond %{HTTP_USER_AGENT} ^(infonavirobot|infotekies|intelliseek|interget|iria|jennybot|jetcar) [NC,OR] | |
| 21 | + | RewriteCond %{HTTP_USER_AGENT} ^(joc|justview|jyxobot|kenjin|keyword|larbin|leechftp|lexibot|lftp|libweb) [NC,OR] | |
| 22 | + | RewriteCond %{HTTP_USER_AGENT} ^(likse|linkscan|linkwalker|lnspiderguy|lwp|magnet|mag-net|markwatch) [NC,OR] | |
| 23 | + | RewriteCond %{HTTP_USER_AGENT} ^(mata.?hari|memo|microsoft.?url|midown.?tool|miixpc|mirror|missigua) [NC,OR] | |
| 24 | + | RewriteCond %{HTTP_USER_AGENT} ^(mister.?pix|moget|mozilla.?newt|nameprotect|navroad|backdoorbot|nearsite) [NC,OR] | |
| 25 | + | RewriteCond %{HTTP_USER_AGENT} ^(net.?vampire|netants|netcraft|netmechanic|netspider|nextgensearchbot) [NC,OR] | |
| 26 | + | RewriteCond %{HTTP_USER_AGENT} ^(attach|nicerspro|nimblecrawler|npbot|octopus|offline.?explorer) [NC,OR] | |
| 27 | + | RewriteCond %{HTTP_USER_AGENT} ^(offline.?navigator|openfind|outfoxbot|pagegrabber|papa|pavuk) [NC,OR] | |
| 28 | + | RewriteCond %{HTTP_USER_AGENT} ^(pcbrowser|php.?version.?tracker|pockey|propowerbot|prowebwalker) [NC,OR] | |
| 29 | + | RewriteCond %{HTTP_USER_AGENT} ^(psbot|pump|queryn|recorder|realdownload|reaper|reget|true_robot) [NC,OR] | |
| 30 | + | RewriteCond %{HTTP_USER_AGENT} ^(repomonkey|rma|internetseer|sitesnagger|siphon|slysearch|smartdownload) [NC,OR] | |
| 31 | + | RewriteCond %{HTTP_USER_AGENT} ^(snake|snapbot|snoopy|sogou|spacebison|spankbot|spanner|sqworm|superbot) [NC,OR] | |
| 32 | + | RewriteCond %{HTTP_USER_AGENT} ^(superhttp|surfbot|asterias|suzuran|szukacz|takeout|teleport) [NC,OR] | |
| 33 | + | RewriteCond %{HTTP_USER_AGENT} ^(telesoft|the.?intraformant|thenomad|tighttwatbot|titan|urldispatcher) [NC,OR] | |
| 34 | + | RewriteCond %{HTTP_USER_AGENT} ^(turingos|turnitinbot|urly.?warning|vacuum|vci|voideye|whacker) [NC,OR] | |
| 35 | + | RewriteCond %{HTTP_USER_AGENT} ^(libwww-perl|widow|wisenutbot|wwwoffle|xaldon|xenu|zeus|zyborg|anonymouse) [NC,OR] | |
| 36 | + | RewriteCond %{HTTP_USER_AGENT} ^(Sogou\ web\ spider) [NC,OR] | |
| 37 | + | ||
| 38 | + | # Block google | |
| 39 | + | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Googlebot/2\.[01];\ \+http://www\.google\.com/bot\.html\)$ [NC,OR] | |
| 40 | + | # Block bing | |
| 41 | + | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ bingbot/2\.[01];\ \+http://www\.bing\.com/bingbot\.htm\)$ [NC,OR] | |
| 42 | + | # Block msnbot | |
| 43 | + | RewriteCond %{HTTP_USER_AGENT} ^msnbot-media/1\.[01]\ \(\+http://search\.msn\.com/msnbot\.htm\)$ [NC,OR] | |
| 44 | + | ||
| 45 | + | # Some idiot bot | |
| 46 | + | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ MJ12bot/v1\.4\.2;\ http://www\.majestic12\.co\.uk/bot\.php\?\+\)$ [NC,OR] | |
| 47 | + | ||
| 48 | + | # Baidus | |
| 49 | + | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Baiduspider/2\.[01];\ \+http://www\.baidu\.com/search/spider\.html\)$ [NC,OR] | |
| 50 | + | ||
| 51 | + | # Deepspider | |
| 52 | + | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ OpenindexDeepSpider/Nutch-1\.[0-9]-dev;\ \+http://www\.openindex\.io/en/webmasters/spider\.html\)$ [NC,OR] | |
| 53 | + | ||
| 54 | + | # STARTS WITH WEB | |
| 55 | + | RewriteCond %{HTTP_USER_AGENT} ^web(zip|emaile|enhancer|fetch|go.?is|auto|bandit|clip|copier|master|reaper|sauger|site.?quester|whack) [NC,OR] | |
| 56 | + | ||
| 57 | + | # Yandex (russian google) | |
| 58 | + | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ YandexBot/3\.[01];\ \+http://yandex\.com/bots\)$ [NC,OR] | |
| 59 | + | ||
| 60 | + | # Pingdom | |
| 61 | + | RewriteCond %{HTTP_USER_AGENT} ^Pingdom\.com_bot_version_1\.4_\(http://www\.pingdom\.com/\) [NC,OR] | |
| 62 | + | ||
| 63 | + | # AhrefsBot | |
| 64 | + | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ AhrefsBot/2\.[01];\ \+http://ahrefs\.com/robot/\)$ [NC,OR] | |
| 65 | + | ||
| 66 | + | # Some rogue facebook faker ? | |
| 67 | + | #RewriteCond %{HTTP_USER_AGENT} ^facebookexternalhit/1\.1\ \(\+http://www\.facebook\.com/externalhit_uatext.php\)$ [NC,OR] | |
| 68 | + | ||
| 69 | + | # Vagabondo | |
| 70 | + | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/4\.[01]\ \(compatible;\ \ Vagabondo/4\.0;\ webcrawler\ at\ wise-guys\ dot\ nl;\ http://webagent\.wise-guys\.nl/;\ http://www\.wise-guys\.nl/\) [NC,OR] | |
| 71 | + | ||
| 72 | + | # Slurp | |
| 73 | + | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Yahoo!\ Slurp;\ http://help\.yahoo\.com/help/us/ysearch/slurp\)$ [NC,OR] | |
| 74 | + | # Ezooms | |
| 75 | + | RewriteCond %{HTTP_USER_AGENT} ^Mozilla/5\.0\ \(compatible;\ Ezooms/1\.[01];\ ezooms\.bot@gmail\.com\)$ [NC,OR] | |
| 76 | + | ||
| 77 | + | # Monalisa | |
| 78 | + | RewriteCond %{HTTP_USER_AGENT} ^Synthesio\ Crawler\ release\ MonaLisa\ \(contact\ at\ synthesio\ dot\ fr\)$ [NC,OR] | |
| 79 | + | ||
| 80 | + | # ANYWHERE IN UA -- GREEDY REGEX | |
| 81 | + | RewriteCond %{HTTP_USER_AGENT} ^.*(craftbot|download|extract|stripper|sucker|ninja|clshttp|webspider|leacher|collector|grabber|webpictures).*$ [NC] | |
| 82 | + | ||
| 83 | + | # ISSUE 403 / SERVE ERRORDOCUMENT | |
| 84 | + | RewriteRule . - [F,L] | |