I can't scrape a website properly with Python

I was writing a script that checks the date of each comic/GIF post and, if that date matches the current day, downloads the comic/GIF (in the code I hard-coded 14 only because the site does not update on weekends and I needed some way to test it). However, two problems occur: the code does not download all of the strips/GIFs (I never get more than about 5 downloads), and sometimes it downloads an image even when its date is earlier than the current date.

from bs4 import BeautifulSoup
import requests
import datetime
import os

os.chdir(r'C:\Users\Rafael\Desktop\Scraping\leninja_imgs')  # raw string so the backslashes are not treated as escapes

def get_img():
    r = requests.get("https://leninja.com.br/page/2/")
    soup = BeautifulSoup(r.text, 'lxml')
    daysPost = soup.select(".day-post")
    imgLinks = [i.get("src") for i in soup.select(".le-inner-content img")]
    #actualday = datetime.datetime.now().day
    actualday = 14
    n = 0

    for day in daysPost:
        if int(day.getText()) == actualday:
            req = requests.get(imgLinks[n])
            img = open(os.path.basename(imgLinks[n]), "wb")

            for chunk in req.iter_content(100000):
                img.write(chunk)    

        else:
            print("Não foi possível baixar a imagem!")
            return False
        n += 1
    return True

get_img()
    
asked by anonymous 15.09.2018 / 23:39

1 answer

The image sources come back as a list, so you can simply loop over it and download every one:

# coding=utf-8

from bs4 import BeautifulSoup
import requests
import os

os.chdir('./')  # change to the folder where the images should be saved

def get_img():
    r = requests.get("https://leninja.com.br/page/2/")
    soup = BeautifulSoup(r.text, 'lxml')
    # Collect the src of every image inside the post content
    imgLinks = [i.get("src") for i in soup.select(".le-inner-content img")]

    for link in imgLinks:
        try:
            req = requests.get(link)
        except requests.exceptions.RequestException as e:
            print(e)
            print("Não foi possível baixar a imagem!")  # "Could not download the image!"
            return False

        # Save each image under its original file name and close the file when done
        with open(os.path.basename(link), "wb") as img:
            for chunk in req.iter_content(100000):
                img.write(chunk)
    return True

get_img()
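
The code above downloads every image on the page, but the question also wants to download an image only when its post date matches the current day. A minimal sketch of that, assuming each .day-post element lines up positionally with one image link (the same assumption the original code makes; get_todays_imgs is just an illustrative name), is to pair the two lists with zip and skip non-matching days instead of returning early:

# Sketch: download only the images whose post day matches today.
# Assumes each ".day-post" element corresponds positionally to one image link.
from bs4 import BeautifulSoup
import requests
import datetime
import os

def get_todays_imgs():
    r = requests.get("https://leninja.com.br/page/2/")
    soup = BeautifulSoup(r.text, 'lxml')
    days = [d.getText().strip() for d in soup.select(".day-post")]
    links = [i.get("src") for i in soup.select(".le-inner-content img")]
    today = datetime.datetime.now().day

    for day, link in zip(days, links):
        # Skip posts from other days instead of returning early,
        # which is what stopped the original loop after the first mismatch.
        if int(day) != today:
            continue
        try:
            req = requests.get(link)
        except requests.exceptions.RequestException as e:
            print(e)
            continue
        with open(os.path.basename(link), "wb") as img:
            for chunk in req.iter_content(100000):
                img.write(chunk)

get_todays_imgs()

Using continue instead of return False is what lets the loop keep going past posts from other days; the early return is why the original version stopped after the first date that did not match.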
    
21.12.2018 / 11:05