How to avoid the "Max retries exceeded" error when scraping in Python?


In Python 3 I wrote a program to scrape table rows from a public site with many pages (97,893 of them). I build a list with the rows of each column, and I added a sleep() between requests to try to keep the scraping from being interrupted, but even so it is not working.

The starting site is this: link

from bs4 import BeautifulSoup
import requests
import pandas as pd
import random
from time import sleep

def sopa(link):
    res = requests.get(link)
    soup =  BeautifulSoup(res.text, "lxml")
    table = soup.select("table")[1]
    conjunto = table.findAll("tr")
    return conjunto

planilha = []

for i in range(1,97893):
    link = "http://www.portaltransparencia.gov.br/PortalComprasDiretasFavorecido.asp?TipoPesquisa=2&Ano=2017&Pagina="
    link = link + str(i)
    print(link)
    conjunto = sopa(link)
    sleep(random.uniform(0.2, 10))
    conta = 0
    for linha in conjunto:
        if conta > 0:
            documento = linha.find("td", {"class": "firstChild"}, {"style": "white-space: nowrap;"}).text.strip()
            nome = linha.find("a").text.strip()
            valor = linha.find("td", {"class": "colunaValor"}).text.strip()
            dicionario = {"documento": documento, "nome": nome, "valor": valor}
            planilha.append(dicionario)
        conta = conta + 1

The program stopped at page 686 with these error messages:

gaierror                                  Traceback (most recent call last)
~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/connection.py in _new_conn(self)
    140             conn = connection.create_connection(
--> 141                 (self.host, self.port), self.timeout, **extra_kw)
    142 

~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/util/connection.py in create_connection(address, timeout, source_address, socket_options)
     59 
---> 60     for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
     61         af, socktype, proto, canonname, sa = res

/usr/lib/python3.6/socket.py in getaddrinfo(host, port, family, type, proto, flags)
    742     addrlist = []
--> 743     for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
    744         af, socktype, proto, canonname, sa = res

gaierror: [Errno -2] Name or service not known

During handling of the above exception, another exception occurred:

NewConnectionError                        Traceback (most recent call last)
~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    600                                                   body=body, headers=headers,
--> 601                                                   chunked=chunked)
    602 

~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    356         else:
--> 357             conn.request(method, url, **httplib_request_kw)
    358 

/usr/lib/python3.6/http/client.py in request(self, method, url, body, headers, encode_chunked)
   1238         """Send a complete request to the server."""
-> 1239         self._send_request(method, url, body, headers, encode_chunked)
   1240 

/usr/lib/python3.6/http/client.py in _send_request(self, method, url, body, headers, encode_chunked)
   1284             body = _encode(body, 'body')
-> 1285         self.endheaders(body, encode_chunked=encode_chunked)
   1286 

/usr/lib/python3.6/http/client.py in endheaders(self, message_body, encode_chunked)
   1233             raise CannotSendHeader()
-> 1234         self._send_output(message_body, encode_chunked=encode_chunked)
   1235 

/usr/lib/python3.6/http/client.py in _send_output(self, message_body, encode_chunked)
   1025         del self._buffer[:]
-> 1026         self.send(msg)
   1027 

/usr/lib/python3.6/http/client.py in send(self, data)
    963             if self.auto_open:
--> 964                 self.connect()
    965             else:

~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/connection.py in connect(self)
    165     def connect(self):
--> 166         conn = self._new_conn()
    167         self._prepare_conn(conn)

~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/connection.py in _new_conn(self)
    149             raise NewConnectionError(
--> 150                 self, "Failed to establish a new connection: %s" % e)
    151 

NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7f4b9674a780>: Failed to establish a new connection: [Errno -2] Name or service not known

During handling of the above exception, another exception occurred:

MaxRetryError                             Traceback (most recent call last)
~/Documentos/Code/knight/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    439                     retries=self.max_retries,
--> 440                     timeout=timeout
    441                 )

~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    638             retries = retries.increment(method, url, error=e, _pool=self,
--> 639                                         _stacktrace=sys.exc_info()[2])
    640             retries.sleep()

~/Documentos/Code/knight/lib/python3.6/site-packages/urllib3/util/retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    387         if new_retry.is_exhausted():
--> 388             raise MaxRetryError(_pool, url, error or ResponseError(cause))
    389 

MaxRetryError: HTTPConnectionPool(host='www.portaltransparencia.gov.br', port=80): Max retries exceeded with url: /PortalComprasDiretasFavorecido.asp?TipoPesquisa=2&Ano=2017&Pagina=686 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f4b9674a780>: Failed to establish a new connection: [Errno -2] Name or service not known',))

During handling of the above exception, another exception occurred:

ConnectionError                           Traceback (most recent call last)
<ipython-input-4-590ac6d45255> in <module>()
      3     link = link + str(i)
      4     print(link)
----> 5     conjunto = sopa(link)
      6     sleep(random.uniform(0.2, 10))
      7     conta = 0

<ipython-input-2-7aefd26bf83b> in sopa(link)
      1 def sopa(link):
----> 2     res = requests.get(link)
      3     soup =  BeautifulSoup(res.text, "lxml")
      4     table = soup.select("table")[1]
      5     conjunto = table.findAll("tr")

~/Documentos/Code/knight/lib/python3.6/site-packages/requests/api.py in get(url, params, **kwargs)
     70 
     71     kwargs.setdefault('allow_redirects', True)
---> 72     return request('get', url, params=params, **kwargs)
     73 
     74 

~/Documentos/Code/knight/lib/python3.6/site-packages/requests/api.py in request(method, url, **kwargs)
     56     # cases, and look like a memory leak in others.
     57     with sessions.Session() as session:
---> 58         return session.request(method=method, url=url, **kwargs)
     59 
     60 

~/Documentos/Code/knight/lib/python3.6/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    506         }
    507         send_kwargs.update(settings)
--> 508         resp = self.send(prep, **send_kwargs)
    509 
    510         return resp

~/Documentos/Code/knight/lib/python3.6/site-packages/requests/sessions.py in send(self, request, **kwargs)
    616 
    617         # Send the request
--> 618         r = adapter.send(request, **kwargs)
    619 
    620         # Total elapsed time of the request (approximately)

~/Documentos/Code/knight/lib/python3.6/site-packages/requests/adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    506                 raise SSLError(e, request=request)
    507 
--> 508             raise ConnectionError(e, request=request)
    509 
    510         except ClosedPoolError as e:

ConnectionError: HTTPConnectionPool(host='www.portaltransparencia.gov.br', port=80): Max retries exceeded with url: /PortalComprasDiretasFavorecido.asp?TipoPesquisa=2&Ano=2017&Pagina=686 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f4b9674a780>: Failed to establish a new connection: [Errno -2] Name or service not known',))

Do I need much longer pauses between requests to do this scraping? Or could it be a problem with the quality of my connection?

    
asked by anonymous 19.01.2018 / 12:41

1 answer


I ran the program below in this answer; it took almost 2 hours (even with 100 threads), but the data is here: link (it won't stay up for long).

I cannot really answer what the question title asks, because there can be many reasons for it, but I can help improve the code:

Since there are 97,893 pages, it is simply not viable to make all the requests in the same thread; even at 1 request per second it would take more than 27 hours.
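
Just to make that estimate concrete (plain arithmetic, using the page count from the question):

pages = 97893
seconds_per_page = 1
print(pages * seconds_per_page / 3600)  # ~27.2 hours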

It is also very important to save the result to a file / database so you do not have to run the program again and again (it is a heavy, time-consuming program); the next time you need the data, just open the file and the information is there.
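
For example, once the data has been dumped to tests.txt by the code further down, getting it back later is just this (a minimal sketch):

import json

with open('tests.txt') as f:
    planilha = json.load(f)  # same list of dicts, with no new requests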

I have written a lot of web crawlers, and when there is a volume of requests this large I always use threads, which in this case greatly reduced the execution time. Keeping the part you wrote to actually parse the HTML (BeautifulSoup), I refined a few things (the code below uses 100 threads):

PS: I did not need to use random:

from bs4 import BeautifulSoup
import requests
import pandas as pd
import threading, queue, json
from time import sleep

def sopa(link):
    res = requests.get(link)
    soup =  BeautifulSoup(res.text, "lxml")
    table = soup.select("table")[1]
    conjunto = table.findAll("tr")
    return conjunto

def p_manager(p_q): # function responsible for the prints; it consumes the p_q queue defined below
    while True:
        msg = p_q.get()
        print(msg)
        p_q.task_done()


def handle_reqs(work):
    total_w = len(work)
    while work:
        i = work.pop(0) # popping like this frees memory as we go
        link = "http://www.portaltransparencia.gov.br/PortalComprasDiretasFavorecido.asp?TipoPesquisa=2&Ano=2017&Pagina={}".format(i)
        p_q.put('[+] {}/{} - getting: {}'.format(len(work), total_w, link))
        conjunto = sopa(link)
        conta = 0
        for linha in conjunto:
            if conta > 0:
                documento = linha.find("td", {"class": "firstChild"}, {"style": "white-space: nowrap;"}).text.strip()
                nome = linha.find("a").text.strip()
                valor = linha.find("td", {"class": "colunaValor"}).text.strip()
                dicionario = {"documento": documento, "nome": nome, "valor": valor}
                planilha.append(dicionario)
            conta = conta + 1
    if(threading.active_count() <= 3): # if only 3 threads are left (this one, the main thread and the print daemon), the scraping is done
        data_q.put(True) # finished; send the signal to unblock the main thread and write the file

p_q = queue.Queue() # responsible for the prints; we do not want to overload the worker threads with prints (system calls)
t = threading.Thread(target=p_manager, args=(p_q,))
t.daemon = True # daemon: the program can exit even if this thread still has pending work
t.start() # start it

data_q = queue.Queue() # responsible for signalling the end of the scraping
planilha = []
num_threads = 100 # we will use 100 threads
works = [list(range(1, 97893))[i::num_threads] for i in range(num_threads)] # split the pages among the threads
for w in works: # hand each thread its share of the work
    threading.Thread(target=handle_reqs, args=(w,)).start() # and start it

data_q.get() # block until the signal arrives, then continue the program

with open('tests.txt', 'w') as f:
    json.dump(planilha, f, sort_keys=True, indent=4)
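
On the error from the title itself: whatever the root cause, it often helps to let requests retry failed connections with a backoff instead of giving up on the first attempt. This is only a rough sketch of that idea (the Retry parameters and the 30-second timeout are assumptions, not values from the original code); sopa() would then call session.get instead of requests.get:

from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount("http://", HTTPAdapter(max_retries=retries))

def sopa(link):
    res = session.get(link, timeout=30)  # a timeout keeps a dead connection from hanging forever
    soup = BeautifulSoup(res.text, "lxml")
    table = soup.select("table")[1]
    conjunto = table.findAll("tr")
    return conjunto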
    
19.01.2018 / 14:22