One way to do this is by creating rules in .htaccess that block known robot user agents. The catch is that you need to keep a reasonably complete list of these agents:
RewriteEngine on
RewriteCond %{HTTP_USER_AGENT} facebookexternalhit [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Twitterbot [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Baiduspider [NC,OR]
RewriteCond %{HTTP_USER_AGENT} MetaURI [NC,OR]
RewriteCond %{HTTP_USER_AGENT} mediawords [NC,OR]
RewriteCond %{HTTP_USER_AGENT} FlipboardProxy [NC]
RewriteCond %{REQUEST_URI} !\/sem_crawler.htm
RewriteRule .* http://seusite.com.br/sem_crawler.htm [L]
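If you would rather deny these agents outright than redirect them to a page, mod_rewrite's F flag answers with a 403 Forbidden. A minimal sketch using the same kind of conditions:

RewriteEngine on
RewriteCond %{HTTP_USER_AGENT} facebookexternalhit [NC,OR]
RewriteCond %{HTTP_USER_AGENT} Twitterbot [NC]
RewriteRule .* - [F,L]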
Another way is by making use of PHP:
<?php
class CrawlerDetect
{
    // list of known robots
    private $agentsInvalids = array(
        'Google' => 'Google',
        'MSN' => 'msnbot',
        'Rambler' => 'Rambler',
        'Yahoo' => 'Yahoo',
        'AbachoBOT' => 'AbachoBOT',
        'accoona' => 'Accoona',
        'AcoiRobot' => 'AcoiRobot',
        'ASPSeek' => 'ASPSeek',
        'CrocCrawler' => 'CrocCrawler',
        'Dumbot' => 'Dumbot',
        'FAST-WebCrawler' => 'FAST-WebCrawler',
        'GeonaBot' => 'GeonaBot',
        'Gigabot' => 'Gigabot',
        'Lycos spider' => 'Lycos',
        'MSRBOT' => 'MSRBOT',
        'Altavista robot' => 'Scooter',
        'AltaVista robot' => 'Altavista',
        'ID-Search Bot' => 'IDBot',
        'eStyle Bot' => 'eStyle',
        'Scrubby robot' => 'Scrubby',
        // ...
    );
    // list of valid browsers
    private $agentsValids = array(
        'Mozilla' => 'Mozilla',
        'Chrome' => 'Chrome',
        'Safari' => 'Safari',
        'Opera' => 'Opera',
        // ...
    );
    // true when the user agent looks like a robot
    public $isCrawler;

    public function __construct($USER_AGENT)
    {
        $invalids = implode('|', $this->agentsInvalids);
        $valids = implode('|', $this->agentsValids);
        /* here you choose how you prefer;
           I believe testing a single list is enough */
        $this->isCrawler = preg_match("/$invalids/i", $USER_AGENT)
            || !preg_match("/$valids/i", $USER_AGENT);
    }
}

// check the user agent
$crawler = new CrawlerDetect($_SERVER['HTTP_USER_AGENT']);
// if it is a robot, refuse it
if ($crawler->isCrawler) {
    echo "invalid access!";
} else {
    echo "valid access!";
}
On this site you can find a complete list of browsers and crawlers.