Crawler for Woocommerce

Question

Crawler for Woocommerce

Navigation

0

Good afternoon friends.

I am developing a crawler in php that will do scrapping of some urls that I will inform.

I'm trying to make it pull values from a dynamic url, but I'm not getting it.

Someone could help me.

<?php
$page_title = "MiniCrawler";
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <title><?php print($page_title) ?></title>
</head>
<body>

<?php  
// error handling
ini_set('display errors',1);  
error_reporting(E_ALL|E_STRICT); 


include_once ('simple_html_dom.php');

function limpaXml($texto){
    return htmlspecialchars(html_entity_decode($texto, ENT_QUOTES, 'UTF-8'),ENT_QUOTES, 'UTF-8');
}


function dlPage($href) {

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($curl, CURLOPT_HEADER, false);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_URL, $href);
    curl_setopt($curl, CURLOPT_REFERER, $href);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.125 Safari/533.4");
    $str = curl_exec($curl);
    curl_close($curl);

    // Create a DOM object
    $dom = new simple_html_dom();
    // Load HTML from a string
    $dom->load($str);

    return $dom;
    }

// Settings on tophttp://www.dafiti.com.br/special-price/
$sitesToCheck = array(
                    // id is the page ID for selector
    array(

        "url" => "http://www.adidas.com.br/homem-outlet?sz=48","categoria" => "adidas" , "selector" => "div#product-grid div.product-tile div.hockeycard div.innercard"
        )


);
$savePath = "json/";
$emailContent = "";
$xml = new SimpleXMLElement('<xml/>');

// For every page to check...
foreach($sitesToCheck as $site) {

    $url = $site["url"];
    if (!empty($url)){
        //$numProdutos = dlPage($url);
        //$numTotal = $numProdutos->find("p.count");
        //$pag = 48;
        //(int)(((int)$numTotal[0]->plaintext)/96);
        $total = 432; 
        $porPag = 48;
        $pag = $total / $porPag;


        //$numProdutos->clear();
        //unset($numProdutos);
        for ($i=24;$i>$pag;){

            $valorFinal = $porPag + $i + $i;
            $novaUrl = $url."&start=".$valorFinal;
            $novoSite = $xml->addChild('website');
            $novoSite->addChild('webUrl', $novaUrl);
            $novoSite->addChild('categoria', $site["categoria"]);
            //var_dump($site);
            // Calculate the cachedPage name, set oldContent = "";
            $fileName = strtolower($site["categoria"]);
            $oldContent = "";
            // Get the URL's current page content
            $html = dlPage($novaUrl);
                if (!empty($html)){
                    $produtos = $novoSite->addChild('produtos');
                    $total=0; 
                    // Find content by querying with a selector, just like a selector engine!
                    foreach($html->find($site["selector"]) as $element) {
                        if(isset($element)){
                            $link = $element->find('div.image a', 0);
                                $img = $element->find('div.image a img.show', 0);
                            if(!empty($element->find('div.product-info-wrapper', 0)->plaintext)){
                                $produto['marca'] = $site["categoria"]; //$element->find('div.product-box-brand', 0)->plaintext; 
                                $produto['titulo'] = limpaXml($element->find('div.clearfix a span.title', 0)->plaintext);
                                $produto['preco_old'] = $element->find('div.clearfix div.price span.strike', 0)->plaintext;
                                $produto['preco'] = $element->find('div.clearfix div.price span.salesprice', -1)->plaintext;
                                //$produto['preco'] = $element->find('span.product-box-price-from', 0)->plaintext;

                                $produto['url'] = htmlspecialchars($link->href);
                                $produto['imagem'] = $img->getAttribute('data-original');
                                //$json[] = json_encode($produto);
                                $produtoNovo = $produtos->addChild('produto');
                                $produtoNovo->addChild('titulo',$produto['titulo']);
                                $produtoNovo->addChild('marca',$produto['marca']);
                                $produtoNovo->addChild('preco',$produto['preco']);
                                $produtoNovo->addChild('preco_old',$produto['preco_old']);
                                $produtoNovo->addChild('url',$produto['url']);
                                $produtoNovo->addChild('imagem',$produto['imagem']);
                                echo "$novoSite";
                                echo  '<div style="float:left; width=300px; border:1px solid #000; padding:10px;"><a href="'.$produto['url'].'"><img src="'.$produto['imagem'].'"></a><br><p>Nome do Produto: '.$produto['titulo'].'</p><p>Marca: '.$produto['marca'].'</p><p>Preço Antigo: '.$produto['preco_old'].'</p><p>Preço: '.$produto['preco'].'</p><p>URL: '.$novaUrl.'</p></div>';
                                //echo 'Nome do Produto: '.$produto['titulo'].'<br />';
                                $total++;
                            }
                        }

                    }

                }

                //Header('Content-type: text/xml');
                $arquivo = $xml->asXML();
                // Save new content
                file_put_contents($savePath.$fileName.'.xml',$arquivo);

                $html->clear();
                unset($html);
            $i = $i;


        }

    }

}



// Retrieve the DOM from a given URL


/*
$html = file_get_html('http://www.dafiti.com.br/Sandalia-Anabela-DAFITI-SHOES-Caramelo-1746512.html');

// Find all "A" tags and print their HREFs
/*foreach($html->find('a') as $e) {
    echo $e->href . '<br>';
}*/

// Retrieve all images and print their SRCs
/*foreach($html->find('a.gallery-thumb[data-img-zoom]') as $e)
    echo '<img src="'.$e . '"><br>';

// Find all anchors and images 


// Find all images, print their text with the "<>" included
/*foreach($html->find('img') as $e)
    echo $e->outertext . '<br>';
*/
// Find the DIV tag with an id of "myId"
//foreach($html->find('div#myId') as $e)
  //  echo $e->innertext . '<br>';

// Find all SPAN tags that have a class of "myClass"
/*foreach($html->find('div.detail-row') as $e)
    echo $e->innertext . '<br>';

// Find all TD tags with "align=center"
foreach($html->find('td[align=center]') as $e)
    echo $e->innertext . '<br>';



/*
$target_url = "http://www.eleshop.com.br/";
$html = new simple_html_dom();
$html->load_file($target_url);


foreach($html->find('img') as $link){
echo '<img src="'. $link->src.'"><br />';
}
*/
?>

</body>
</html>

php web-crawler web-scraping curl

asked by anonymous 09.03.2016 / 21:37

0 answers

How to force loading of JS and CSS files with each new published version? New Wordpress does not load media