Scan all pages, curl

0

I need to access a website through cURL and capture content from its pages, but the site does not show all of its content on a single page — it splits the content across several pages and shows a navigation menu at the bottom (page 1, 2, 3, 4, …). I need to walk through ALL of these pages in order to capture the content I want. How can I do it?

The HTML of this menu (it is a pagination menu, generated by WP-PageNavi) is this:

<center><div class='wp-pagenavi'>
<span class='pages'>1 de 8</span><span class='current'>1</span><a class="page larger" href="/page/2/">2</a><a class="page larger" href="/page/3/">3</a><span class='extend'>...</span><a class="nextpostslink" rel="next" href="/page/2/">></a><a class="last" href="/page/8/">»</a>
</div></center>

In this case I would need to navigate the 8 pages to get what I want, how to do?

    
asked by anonymous 17.01.2015 / 00:40

1 answer

3

You need to make a request for each page and capture the content inside the page.

Supposing the URL follows the pattern `http://site/page/N` (where N is the page number):

Then we can create a class to perform the crawling over all pages with a simple for loop (complete class below):

<?php

/**
 * A simple crawler
 * By Rodrigo Nascimento
 * 
 */
// Remove the max-execution-time limit: crawling several pages over the
// network can easily exceed the default 30 seconds.
set_time_limit(0);
// Surface every notice/warning while developing the crawler.
error_reporting(E_ALL);
/**
 * Minimal cURL-based crawler: set a URL, fetch it, dump the response body.
 * Every public method returns $this so calls can be chained fluently.
 */
class SimpleCrawler {

    private $url;              // target URL for the next request
    private $userAgent;        // User-Agent header sent with every request
    private $httpResponse;     // raw body of the last response (false on failure)
    private $chocolateCookie;  // cookie-jar file path (was assigned but never declared)

    function __construct() {
        $this->userAgent       = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0";
        $this->chocolateCookie = "chocolateCookies.txt";
    }

    /**
     * Set the target URL.
     * @param string $url
     * @return SimpleCrawler
     */
    public function setUrl($url) {
        $this->url = $url;
        return $this;
    }

    /**
     * Perform a GET request against the current URL and store the body.
     * @return SimpleCrawler
     */
    private function get(){
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
        // Read cookies from the jar AND write them back on close, so the
        // session persists across the paginated requests. (The original code
        // set CURLOPT_COOKIEFILE twice; the second option must be COOKIEJAR,
        // otherwise cookies received from the server are never saved.)
        curl_setopt($ch, CURLOPT_COOKIEFILE, $this->chocolateCookie);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $this->chocolateCookie);
        $this->httpResponse = curl_exec($ch);
        curl_close($ch); // release the handle — it was previously leaked
        return $this;
    }

    /**
     * Print the body of the last request.
     * Parse it here with regex/DOM — whatever extraction method you prefer.
     * @return SimpleCrawler
     */
    public function getPageContent() {
        echo "Page Content:\n\n",
             "{$this->httpResponse}\n\n";

        return $this;
    }

    /**
     * Visit the URL previously set via self::setUrl.
     * @return SimpleCrawler
     */
    public function navigate() {
        echo "Visiting: {$this->url}\n";
        $this->get();

        return $this;
    }
}

/*
 * Usage of our crawler object:
 *   set a URL:        $simpleCrawler->setUrl('site');
 *   visit that URL:   $simpleCrawler->navigate();
 *   read the content: $simpleCrawler->getPageContent();
 */
$simpleCrawler = new SimpleCrawler;

// From here on we can issue as many requests as we like. Since every page
// lives on the same site, a simple loop walks the whole pagination.
$pageNum = 8;

for ($pageIndex = 1; $pageIndex <= $pageNum; $pageIndex++) {
    $simpleCrawler
        ->setUrl("http://site/page/{$pageIndex}")
        ->navigate()
        ->getPageContent();
}

This should already be enough to accomplish the mission (:

    
17.03.2015 / 21:37