Good afternoon friends.
I am developing a crawler in PHP that will scrape a list of URLs that I provide.
I'm trying to make it pull values from a dynamic URL, but I can't get it to work.
Could someone help me?
<?php
// Page title; printed inside the <title> element of the HTML head below.
$page_title = 'MiniCrawler';
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title><?php echo $page_title; ?></title>
</head>
<body>
<?php
// Error handling: show every error while the crawler is being developed.
// The ini directive is spelled "display_errors" (underscore); with a space the
// call names a directive that does not exist and changes nothing.
ini_set('display_errors', '1');
// E_ALL already contains E_STRICT on PHP >= 5.4, and the E_STRICT constant is
// deprecated on recent PHP versions, so E_ALL alone is used.
error_reporting(E_ALL);
// HTML parser used by dlPage() and the crawl loop (provides simple_html_dom).
include_once ('simple_html_dom.php');
/**
 * Normalise a piece of scraped text so it can be embedded in XML/HTML.
 *
 * Existing HTML entities are decoded first and the special characters
 * (&, <, >, double and single quotes) are then encoded again, so text that
 * was already entity-encoded is not encoded a second time.
 *
 * @param string $texto Raw text, possibly containing HTML entities.
 * @return string Text with the special characters entity-encoded (UTF-8).
 */
function limpaXml($texto){
    $decodificado = html_entity_decode($texto, ENT_QUOTES, 'UTF-8');
    $codificado = htmlspecialchars($decodificado, ENT_QUOTES, 'UTF-8');

    return $codificado;
}
/**
 * Download a page with cURL and parse it with simple_html_dom.
 *
 * A failed transfer is reported through error_log() and the returned document
 * is then loaded from an empty string, so callers always receive an object.
 *
 * @param string $href Absolute URL to fetch; it is also sent as the Referer.
 * @return simple_html_dom Document loaded from the response body (or from ''
 *                         when the download failed).
 */
function dlPage($href) {
    $str = '';

    $curl = curl_init();
    if ($curl === false) {
        error_log('dlPage: curl_init() failed for ' . $href);
    } else {
        // NOTE(review): certificate verification is disabled here, which allows
        // man-in-the-middle tampering on https URLs. Kept as in the original;
        // enable it once a CA bundle is available on the host.
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($curl, CURLOPT_HEADER, false);
        curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($curl, CURLOPT_URL, $href);
        curl_setopt($curl, CURLOPT_REFERER, $href);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
        // Do not let one unresponsive server block the whole crawl.
        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($curl, CURLOPT_TIMEOUT, 30);
        curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.125 Safari/533.4");

        // With CURLOPT_RETURNTRANSFER set, curl_exec() returns the body as a
        // string, or false on failure.
        $resposta = curl_exec($curl);
        if ($resposta === false) {
            error_log('dlPage: download failed for ' . $href . ': ' . curl_error($curl));
        } else {
            $str = $resposta;
        }
        curl_close($curl);
    }

    // Create a DOM object and load the HTML from the downloaded string.
    $dom = new simple_html_dom();
    $dom->load($str);

    return $dom;
}
/**
 * Return the cleaned plain text of the node matching $selector inside $element.
 *
 * @param object $element  simple_html_dom node to search in.
 * @param string $selector CSS selector passed to find().
 * @param int    $indice   Index passed to find() (0 = first match, -1 = last).
 * @return string Text run through limpaXml(), or '' when nothing matches.
 */
function textoDoSeletor($element, $selector, $indice = 0) {
    $no = $element->find($selector, $indice);

    return is_object($no) ? limpaXml((string) $no->plaintext) : '';
}

// Settings on top. Another listing URL kept for reference:
// http://www.dafiti.com.br/special-price/
$sitesToCheck = array(
    // "url" is the listing page, "categoria" is used as brand and output file
    // name, "selector" matches one product card on the page.
    array(
        "url" => "http://www.adidas.com.br/homem-outlet?sz=48",
        "categoria" => "adidas",
        "selector" => "div#product-grid div.product-tile div.hockeycard div.innercard"
    )
);
$savePath = "json/";
$emailContent = "";
$xml = new SimpleXMLElement('<xml/>');

// For every page to check...
foreach ($sitesToCheck as $site) {
    $url = $site["url"];
    if (empty($url)) {
        continue;
    }

    // TODO: read the total from the listing itself (the earlier attempt used the
    // "p.count" element) instead of hard-coding it.
    $totalProdutos = 432;
    $porPag = 48;
    // Name of the output file for this site.
    $fileName = strtolower($site["categoria"]);

    // Walk the listing one page at a time: start=0, 48, 96, ...
    // NOTE(review): assumes "start" is a zero-based product offset and "sz" the
    // page size — confirm against the target site.
    for ($inicio = 0; $inicio < $totalProdutos; $inicio += $porPag) {
        $novaUrl = $url . "&start=" . $inicio;
        // SimpleXMLElement::addChild() does not escape "&", so every value that
        // goes into the XML (and into the HTML preview) is cleaned first.
        $novaUrlSegura = limpaXml($novaUrl);

        $novoSite = $xml->addChild('website');
        $novoSite->addChild('webUrl', $novaUrlSegura);
        $novoSite->addChild('categoria', $site["categoria"]);
        $produtos = $novoSite->addChild('produtos');

        // Get the URL's current page content
        $html = dlPage($novaUrl);
        $encontrados = 0;

        // Find content by querying with a selector, just like a selector engine!
        foreach ($html->find($site["selector"]) as $element) {
            // Cards without a product-info wrapper carry no product data.
            if (textoDoSeletor($element, 'div.product-info-wrapper') === '') {
                continue;
            }

            $link = $element->find('div.image a', 0);
            $img = $element->find('div.image a img.show', 0);

            $produto = array();
            $produto['marca'] = $site["categoria"];
            $produto['titulo'] = textoDoSeletor($element, 'div.clearfix a span.title');
            $produto['preco_old'] = textoDoSeletor($element, 'div.clearfix div.price span.strike');
            // Index -1 asks find() for the last matching sale price.
            $produto['preco'] = textoDoSeletor($element, 'div.clearfix div.price span.salesprice', -1);
            $produto['url'] = is_object($link) ? limpaXml((string) $link->href) : '';
            $produto['imagem'] = is_object($img) ? limpaXml((string) $img->getAttribute('data-original')) : '';

            $produtoNovo = $produtos->addChild('produto');
            $produtoNovo->addChild('titulo', $produto['titulo']);
            $produtoNovo->addChild('marca', $produto['marca']);
            $produtoNovo->addChild('preco', $produto['preco']);
            $produtoNovo->addChild('preco_old', $produto['preco_old']);
            $produtoNovo->addChild('url', $produto['url']);
            $produtoNovo->addChild('imagem', $produto['imagem']);

            // HTML preview of the product; all scraped values are already escaped.
            echo '<div style="float:left; width:300px; border:1px solid #000; padding:10px;"><a href="'.$produto['url'].'"><img src="'.$produto['imagem'].'"></a><br><p>Nome do Produto: '.$produto['titulo'].'</p><p>Marca: '.$produto['marca'].'</p><p>Preço Antigo: '.$produto['preco_old'].'</p><p>Preço: '.$produto['preco'].'</p><p>URL: '.$novaUrlSegura.'</p></div>';

            $encontrados++;
        }

        $html->clear();
        unset($html);

        // An empty page means the listing ended early (or the selector no
        // longer matches); stop instead of requesting further pages.
        if ($encontrados === 0) {
            break;
        }
    }

    // Save new content once per site.
    // NOTE(review): $xml accumulates every site processed so far, so with more
    // than one entry in $sitesToCheck each file also contains the earlier sites.
    if (!is_dir($savePath) && !mkdir($savePath, 0777, true)) {
        error_log('Could not create output directory ' . $savePath);
    }
    $arquivo = $xml->asXML();
    if (file_put_contents($savePath . $fileName . '.xml', $arquivo) === false) {
        error_log('Could not write ' . $savePath . $fileName . '.xml');
    }
}
// Retrieve the DOM from a given URL
/*
$html = file_get_html('http://www.dafiti.com.br/Sandalia-Anabela-DAFITI-SHOES-Caramelo-1746512.html');
// Find all "A" tags and print their HREFs
/*foreach($html->find('a') as $e) {
echo $e->href . '<br>';
}*/
// Retrieve all images and print their SRCs
/*foreach($html->find('a.gallery-thumb[data-img-zoom]') as $e)
echo '<img src="'.$e . '"><br>';
// Find all anchors and images
// Find all images, print their text with the "<>" included
/*foreach($html->find('img') as $e)
echo $e->outertext . '<br>';
*/
// Find the DIV tag with an id of "myId"
//foreach($html->find('div#myId') as $e)
// echo $e->innertext . '<br>';
// Find all DIV tags that have a class of "detail-row"
/*foreach($html->find('div.detail-row') as $e)
echo $e->innertext . '<br>';
// Find all TD tags with "align=center"
foreach($html->find('td[align=center]') as $e)
echo $e->innertext . '<br>';
/*
$target_url = "http://www.eleshop.com.br/";
$html = new simple_html_dom();
$html->load_file($target_url);
foreach($html->find('img') as $link){
echo '<img src="'. $link->src.'"><br />';
}
*/
?>
</body>
</html>