Get a value from inside HTML fetched with cURL

4

I want to get a value that is inside the html of a page

<a href="https://www.site.com/user.asp?ref=fvFCF9D8N4Ak">

I want to get only the value ref=fvFCF9D8N4Ak

I'm using cURL and php and thought the solution was a regex.

How else should I get this value?

    
asked by anonymous 11.12.2014 / 22:06

2 answers

3

Assuming you're using cURL to pull the HTML of an address, and then use PHP to collect certain data from the gathered HTML:

You can use the DOMDocument class to parse the HTML, find the <a> tag, and read the value of its href attribute.

Then, passing that value to parse_url() with PHP_URL_QUERY extracts its query string, which is the value you want:

Example

// The HTML you collected (e.g. the response body returned by cURL)
$html = '<html>
<head></head>
<body>
<a href="https://www.site.com/user.asp?ref=fvFCF9D8N4Ak">bubu</a>
</body>
</html>';

// Instantiate the DOMDocument
$dom = new DOMDocument;

// Real-world HTML is rarely valid. Have libxml collect its parse warnings
// internally instead of hiding every error with the @ operator, and remember
// the previous setting so it can be restored afterwards.
$previousLibxmlSetting = libxml_use_internal_errors(true);

// Load the collected HTML into the DOMDocument
$dom->loadHTML($html);

// Discard the collected parse warnings and restore the previous behaviour
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

// Walk the DOM and, for each 'a' tag found
foreach ($dom->getElementsByTagName('a') as $tag) {

    // grab the value of the 'href' attribute
    $href = $tag->getAttribute('href');

    // if it is not empty
    if (!empty($href)) {

        // store the query string in a variable; when the document has several
        // links, the variable ends up holding the value of the last one
        $queryString = parse_url($href, PHP_URL_QUERY);  // Result: ref=fvFCF9D8N4Ak
    }
}

See example working on Ideone .

If you only have the HTML present in the question, the method is exactly the same:

// Only the HTML fragment from the question
$html = '<a href="https://www.site.com/user.asp?ref=fvFCF9D8N4Ak">';

$dom = new DOMDocument;

// Collect libxml parse warnings internally rather than silencing every error
// with the @ operator; keep the previous setting so it can be restored.
$previousLibxmlSetting = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);

// Visit every 'a' element found in the parsed fragment
foreach ($dom->getElementsByTagName('a') as $tag) {

    // Value of the link's 'href' attribute
    $href = $tag->getAttribute('href');

    if (!empty($href)) {

        // Keep only the query-string part of the URL
        $queryString = parse_url($href, PHP_URL_QUERY); // Result: ref=fvFCF9D8N4Ak
    }
}

See example working on Ideone .

    
11.12.2014 / 22:38
0

Hello, friend. You can try this regex-based solution (I wrote a simple test class):

<?php

/**
 * A simple crawler
 * By Rodrigo Nascimento
 * 
 */
// Remove the script execution time limit (0 means "no limit").
// NOTE(review): presumably so a slow remote request cannot abort the crawl — confirm.
set_time_limit(0);
// Report every PHP error, warning and notice.
error_reporting(E_ALL);

/**
 * Minimal cURL-based page fetcher.
 *
 * Usage: set a target with setUrl(), perform the request with navigate(),
 * then read the raw response with getPageContent().
 */
class SimpleCrawler {

    private $url;
    private $userAgent;
    private $httpResponse;
    // Declared explicitly: assigning an undeclared (dynamic) property is
    // deprecated as of PHP 8.2.
    private $chocolateCookie;

    /**
     * Sets the default user agent and the cookie file name used by requests.
     */
    public function __construct() {
        $this->userAgent       = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/33.0";
        $this->chocolateCookie = "chocolateCookies.txt";
    }

    /**
     * Sets the target URL.
     * @param string $url
     * @return SimpleCrawler this instance, to allow method chaining
     */
    public function setUrl($url) {
        $this->url = $url;
        return $this;
    }

    /**
     * Performs a GET request against the configured URL and stores the
     * result of curl_exec() in $httpResponse.
     * @return SimpleCrawler this instance, to allow method chaining
     */
    private function get(){
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        // Return the body from curl_exec() instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
        // COOKIEFILE is where cookies are read from; COOKIEJAR is where they
        // are written to. The original set COOKIEFILE twice, so cookies
        // received from the server were never saved.
        curl_setopt($ch, CURLOPT_COOKIEFILE, $this->chocolateCookie);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $this->chocolateCookie);
        $this->httpResponse = curl_exec($ch);
        return $this;
    }

    /**
     * Returns the content of the last request.
     * @return string|bool|null the response body, false when curl_exec()
     *                          failed, or null when no request has been made
     */
    public function getPageContent() {
        return $this->httpResponse;
    }

    /**
     * Navigates to the page specified through self::setUrl.
     * @return SimpleCrawler this instance, to allow method chaining
     */
    public function navigate() {
        $this->get();

        return $this;
    }
}

/* Instantiation of our object, which relies on the following methods:
 *
 * Set a URL: $simpleCrawler->setUrl('site');
 * Navigate to the given URL: $simpleCrawler->navigate();
 * And finally access the content of the request: $simpleCrawler->getPageContent();
 *
 */

$simpleCrawler = new SimpleCrawler;
$simpleCrawler->setUrl("http://siteQualquer")
              ->navigate();

$conteudo  = $simpleCrawler->getPageContent();

// The literal "?" must be escaped: an unescaped "?" at the start of the
// pattern is a quantifier with nothing to repeat, so the pattern fails to
// compile and preg_match() never matches. The subject is cast to string
// because getPageContent() yields false when the request failed.
$urlResult = (preg_match('#\?ref=(.*?)">#', (string) $conteudo, $match)) ? $match[1]
                                                                : "Não foi possível obter a url solicitada via Regex.";

echo $urlResult . PHP_EOL;
    
17.03.2015 / 21:48