Get page source with Google Chrome extension

10

Is it possible to create a Chrome extension that takes either the source code of the page or its whole text (Ctrl + A, Ctrl + C), sends it to an external site (for data mining), and gets back the resulting content from that site (in this case, a graph with the main terms)?

Note: this is the form that I created (it's in popup.js):

    // Build an (empty) POST form aimed at the Sobek endpoint and submit it right away.
    var my_form = document.createElement('FORM');

    my_form.action = 'http://sobek.ufrgs.br/newSobekSite/new-sobek.php';
    my_form.method = 'POST';
    my_form.name = 'entrada';

    my_form.submit();
    
asked by anonymous 09.02.2015 / 21:57

1 answer

10

As you can see from SOen , you can:

manifest.json

  

Note:

     

Note: You will probably need to add the <all_urls> permission (and perhaps "*://site1.com/*", "*://*.site2.com/*") to the manifest:

{
    "name": "Get pages source",
    "version": "1.0",
    "manifest_version": 2,
    "description": "Pega o conteudo da página e envia para um servidor",
    "browser_action": {
       "default_icon": "icon.png",
       "default_popup": "popup.html"
    },
    "permissions": [
        "webRequest",
        "tabs",
        "clipboardWrite",
        "clipboardRead",
        "<all_urls>"
    ]
}

popup.js

You can use Ajax to send request.source to another server, as in the example:

/**
 * Receives the content collected by the injected script and forwards it to
 * the remote server as the raw body of a POST request.
 *
 * Only messages whose `action` is "getSource" are handled. Progress and the
 * server's answer (or the HTTP status on failure) are shown in the popup's
 * #message element.
 *
 * Registered on chrome.runtime.onMessage: chrome.extension.onMessage is the
 * deprecated name of the same event.
 *
 * @param {{action: string, source: string}} request - message from the content script
 * @param {Object} sender - sender details supplied by Chrome (unused)
 */
chrome.runtime.onMessage.addListener(function(request, sender) {
    if (request.action === "getSource") {
        var message, xhr;

        message = document.querySelector("#message");

        message.innerText = "Enviando ao servidor...";

        xhr  = new XMLHttpRequest();
        xhr.open("POST", "http://site1/webservice.php", true);
        xhr.onreadystatechange = function() {
            // readyState 4 = DONE: the response (or failure) is complete
            if(xhr.readyState === 4) {
                if (xhr.status === 200) {
                    message.innerText = "Resposta do servidor: " + xhr.responseText;
                } else {
                    message.innerText = "Err: " + xhr.status;
                }
            }
        };

        // Send the data as the RAW request body (no form encoding)
        xhr.send(request.source);
    }
});

/**
 * Runs when the popup has loaded: injects getPagesSource.js into the current
 * tab and reports an injection failure in the popup's #message element.
 */
function onWindowLoad()
{
    var message = document.querySelector('#message');

    chrome.tabs.executeScript(null, {
        file: "getPagesSource.js"
    }, function() {
        // If you try and inject into an extensions page or the webstore/NTP you'll get an error.
        // chrome.runtime.lastError replaces the deprecated chrome.extension.lastError.
        if (chrome.runtime.lastError) {
            // innerText keeps the error text from being parsed as HTML; the "\n"
            // is rendered as a line break, like the former "<br>".
            message.innerText = "Erro ao executar o script : \n" + chrome.runtime.lastError.message;
        }
    });
}

window.onload = onWindowLoad;

getPagesSource.js

To copy as if the user were copying, we use the clipboardWrite and clipboardRead permissions:

/**
 * Copies `target` to the clipboard through a selection plus
 * execCommand("copy"), pastes the clipboard into a temporary contentEditable
 * <div> and returns what ended up in that <div>.
 *
 * The clipboard content and the page's current selection are replaced as a
 * side effect.
 *
 * @param {Node} target - node handed to range.selectNode()
 * @param {boolean} rich - when strictly `true` the pasted markup (innerHTML)
 *     is returned, otherwise the pasted plain text (textContent)
 * @returns {string} the pasted content
 */
function copyFromDOM(target, rich) {
    var range, dom, source, posX, posY;

    // Remember the scroll offsets so they can be restored at the end
    posX = window.pageXOffset;
    posY = window.pageYOffset;

    // Temporary editable element that will receive the paste
    dom = document.createElement("div");
    dom.contentEditable = true;

    range = document.createRange();
    range.selectNode(target);

    // Make `target` the only selected range, then copy it
    window.getSelection().removeAllRanges();
    window.getSelection().addRange(range);
    document.execCommand("copy");

    document.body.appendChild(dom);

    // The paste below goes into the focused editable element
    dom.focus();

    document.execCommand("paste");

    source = rich === true ? dom.innerHTML : dom.textContent;

    // Clean up: clear the selection and remove the temporary element
    window.getSelection().removeAllRanges();
    document.body.removeChild(dom);

    // Restore the saved scroll position on the next tick
    // (presumably because focusing the appended <div> scrolls the page - TODO confirm)
    window.setTimeout(function() {
        window.scrollTo(posX, posY);
    }, 1);

    // Drop the references to the range and the temporary element
    range = dom = null;
    return source;
}

// Send the page's text to the extension as a "getSource" message.
// chrome.runtime.sendMessage replaces the deprecated chrome.extension.sendMessage.
chrome.runtime.sendMessage({
    action: "getSource",
    source: copyFromDOM(document.body, false) // copies text only
});
  

Note: If you want to copy with "rich-text", then use copyFromDOM(document.body, true)

     

Note: The OP had a problem using the window.getSelection().addRange function with the <div contentEditable="true"></div> element; they were using Google Chrome 38, but after updating to a more recent version the function started working normally.

Copying page source code

To copy the source code of the page, change getPagesSource.js to something like the following (based on the answer by user Rob W):

getPagesSource.js

// @author Rob W <https://stackoverflow.com/users/938089/rob-w>
// Demo: var serialized_html = DOMtoString(document);

/**
 * Serializes the direct children of `document_root` into one markup string.
 *
 * Elements contribute their outerHTML, text nodes their value, and CDATA
 * sections, comments and the doctype are re-wrapped in their markup
 * delimiters. Children of any other node type are skipped.
 *
 * @param {Node} document_root - node whose children are serialized
 * @returns {string} the concatenated markup
 */
function DOMtoString(document_root) {
    var serialized = '';
    var current;
    var doctype;

    for (current = document_root.firstChild; current; current = current.nextSibling) {
        if (current.nodeType === Node.ELEMENT_NODE) {
            serialized += current.outerHTML;
        } else if (current.nodeType === Node.TEXT_NODE) {
            serialized += current.nodeValue;
        } else if (current.nodeType === Node.CDATA_SECTION_NODE) {
            serialized += '<![CDATA[' + current.nodeValue + ']]>';
        } else if (current.nodeType === Node.COMMENT_NODE) {
            serialized += '<!--' + current.nodeValue + '-->';
        } else if (current.nodeType === Node.DOCUMENT_TYPE_NODE) {
            // (X)HTML documents are identified by public identifiers
            doctype = "<!DOCTYPE " + current.name;
            if (current.publicId) {
                doctype += ' PUBLIC "' + current.publicId + '"';
            } else if (current.systemId) {
                // SYSTEM is only written when there is no public identifier
                doctype += ' SYSTEM';
            }
            if (current.systemId) {
                doctype += ' "' + current.systemId + '"';
            }
            serialized += doctype + '>\n';
        }
    }

    return serialized;
}

// Send the serialized page markup to the extension as a "getSource" message.
// chrome.runtime.sendMessage replaces the deprecated chrome.extension.sendMessage.
chrome.runtime.sendMessage({
    action: "getSource",
    source: DOMtoString(document)
});

Server receiving the data

As I do not know the language of your server, I will provide an example in PHP. This example only writes to a file, but you can change it to use a database and to read the data from $_POST instead of php://input (this is just an example; you can send the data to the server in other ways):

webservice.php

<?php
// Stores the raw request body in meu-arquivo.txt and answers with a short
// status message: 'Ok' when at least one byte was received, otherwise a hint
// that the selection was empty.
if (false === ($input = fopen('php://input', 'r'))) {
    echo 'Erro ao ler os dados recebidos';
} else if (false === ($output = fopen('meu-arquivo.txt', 'w'))) {
    echo 'Erro abrir arquivo para gravação';
    fclose($input);
    $input = NULL;
} else {
    $hasData = false;

    while (false === feof($input)) {
        $data = fgets($input, 128);

        // fgets() returns false at EOF (including an empty body) and on read
        // errors. Without this check `false !== ''` counted as "has data",
        // and a persistent read error could keep the loop running forever.
        if ($data === false) {
            break;
        }

        if ($data !== '') {
            $hasData = true;
        }

        fwrite($output, $data);
    }

    fclose($input);
    fclose($output);

    $input = $output = NULL;

    echo $hasData ? 'Ok' : 'Área de seleção vazia, tente novamente';
}

If you are going to send via POST without RAW (the way HTML forms do), you will need to use setRequestHeader with application/x-www-form-urlencoded and window.encodeURIComponent:

        // POST the page content form-encoded, the way an HTML form would send it
        xhr = new XMLHttpRequest();
        xhr.open("POST", "http://site1/webservice.php", true);
        xhr.setRequestHeader("Content-type", "application/x-www-form-urlencoded");

        xhr.onreadystatechange = function() {
            // Nothing to report until the request reaches readyState 4 (DONE)
            if (xhr.readyState !== 4) {
                return;
            }

            message.innerText = xhr.status === 200
                ? "Resposta do servidor: " + xhr.responseText
                : "Err: " + xhr.status;
        };

        // Replace the field name with the variable your SERVER reads
        xhr.send('minha_variavel_do_servidor=' + window.encodeURIComponent(request.source));
  

Note: window.encodeURIComponent works with UTF-8, so it may be necessary to decode this data on the server if you use windows-1252 or iso-8859-1.

Showing result in a pop-up or new window

Although it is outside the scope of the question, the OP requested the use of a pop-up to display the results, and since many extensions use pop-ups to show updates or similar things, I decided to provide such an example. To do this you need window.open and window.open().document.write, called from inside the Ajax callback in the extension:

// Shows the server's answer in a new tab or pop-up once the request has
// completed; failures are reported in the extension's #message element.
xhr.onreadystatechange = function() {
    var win;

    // Only act once the request has completed (readyState 4 = DONE)
    if (xhr.readyState !== 4) {
        return;
    }

    if (xhr.status !== 200) {
        // Show the failure in the extension
        message.innerText = "Err: " + xhr.status;
        return;
    }

    // Open a new tab or pop-up
    win = window.open("", "_blank", "width=600, height=600");

    // window.open() returns null when the pop-up is blocked; report that
    // instead of throwing on win.document
    if (!win) {
        message.innerText = "Err: pop-up bloqueado";
        return;
    }

    win.document.write(xhr.responseText);
    // Close the document stream so the new window finishes loading
    win.document.close();
};

Avoiding deleting the user's clipboard

If you just want to copy "text" and avoid using the clipboard, you can use textContent; it will only copy the text and, as I said, it will not affect the clipboard, so you do not need to add the clipboard permissions to the manifest, as we will no longer manipulate the clipboard. Change the function copyFromDOM to:

/**
 * Reads the content of `target` straight from the DOM, without touching the
 * clipboard.
 *
 * @param {Element} target - element to read from
 * @param {boolean} rich - when strictly `true` the markup (innerHTML) is
 *     returned, otherwise the plain text (textContent)
 * @returns {string} the element's markup or text
 */
function copyFromDOM(target, rich) {
    if (rich === true) {
        return target.innerHTML;
    }

    return target.textContent;
}
    
09.02.2015 / 22:34