Converting Web Scraper from Python 2.7 to 3.5

Good afternoon everyone!

Here is my situation: I found a script written in Python 2.7, but I have version 3.6 installed. Since I am new to this area, I wanted to convert the script by hand. Here is the code:

### Modified for naming pics by tag
import requests
import urllib.request  # Python 2's urllib2/urllib.urlopen moved to urllib.request in Python 3
import shutil
import re
import os
import os.path
import time
from bs4 import BeautifulSoup

def getnopages(noofpages):
    # Scans the module-level list 'alla' of anchor tags (built below) and
    # returns the highest page number found in any href
    for link in alla:
        href = str(link.get('href'))
        if href.find('page') >= 0:
            currpages = int(re.findall(r'\d+', href)[-1])
            if currpages > noofpages:
                noofpages = currpages
    return noofpages

def getlink(url):
    # Python 2's urllib2.urlopen(url) becomes urllib.request.urlopen(url)
    soup = BeautifulSoup(urllib.request.urlopen(url), "html.parser")
    alla = soup.findAll("img")
    for link in alla:
        href = str(link.get('src'))
        if href.startswith('http://images'):
            # Split the thumbnail URL into (directory, thumb name, file name)
            # and rebuild the URL of the full-size image from those parts
            parts = re.search(r'(.*\/\d*\/)(thumb.*?(\d+\.\w+))', href).groups()
            href = parts[0] + parts[-1]
            item_name = "item_" + parts[-1].split(".")[0]
            atmp = soup.find("div", {'id': item_name}).findNext('span', {'class': 'thumb-info-big'})
            if atmp is None:
                atmp = soup.find("div", {'id': item_name}).findNext('span', {'class': 'thumb-info-small'})
            if atmp is None:
                atmp = soup.find("div", {'id': item_name}).findNext('span', {'class': 'thumb-info-medium'})
            pic_ex = "." + parts[-1].split(".")[1]
            pic_name = atmp.contents[-2].string
            print(href)
            downloadable.append([href, pic_name, pic_ex])


def downloadfiles(downloadable):
    no = 0
    DEFAULT_DIRECTORY = os.getcwd() + "/pics"
    os.chdir(DEFAULT_DIRECTORY)
    for item in downloadable:
        pic_url, pic_name, pic_ex = item
        print(item)
        print(str(no) + " from " + str(len(downloadable)) + " pictures")
        response = requests.get(pic_url, stream=True)
        outputDirectory = pic_name
        if not os.path.exists(outputDirectory):
            os.makedirs(outputDirectory)
        os.chdir(DEFAULT_DIRECTORY + "/" + outputDirectory)
        # Stream the image body straight to disk; the timestamp keeps file names unique
        with open(pic_name + str(time.time()) + pic_ex, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
        os.chdir(DEFAULT_DIRECTORY)
        no += 1
        del response


#url = 'http://wall.alphacoders.com/search.php?search=avril&page=1'
#url = 'http://wall.alphacoders.com/by_collection.php?id=565&page=1'
url = 'http://wall.alphacoders.com/by_category.php?id=7&name=Celebrity+Wallpapers&page=1'
baseurl = url[0:url.find('page=') + 5]
print(baseurl)
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
alla = soup.find_all("a")
noofpages = -1
noofpages = getnopages(noofpages)
print(noofpages)
downloadable = []

for each in range(1, noofpages):
    getlink(baseurl + str(each))
    print(len(downloadable))
    print(str(each) + " from " + str(noofpages) + " pages")

downloadfiles(downloadable)
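
The one substitution I managed to figure out on my own is the urlopen call: from what I can tell, Python 2's urllib2.urlopen moved to urllib.request.urlopen in Python 3 (the "html.parser" argument is my guess at the right way to silence BeautifulSoup's parser warning):

import urllib.request
from bs4 import BeautifulSoup

# Python 2 original: soup = BeautifulSoup(urllib2.urlopen(url))
soup = BeautifulSoup(urllib.request.urlopen(url), "html.parser")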

Does anyone know where I can find a reference that lists the command equivalences between Python 2 and Python 3?
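
So far I have pieced together the following mapping for the constructs this script uses, though I am not sure it is complete or entirely correct:

# Python 2                        ->  Python 3 (as far as I can tell)
# print "text"                    ->  print("text")
# import urllib2                  ->  import urllib.request
# urllib2.urlopen(url)            ->  urllib.request.urlopen(url)
# urllib.urlretrieve(url, fn)     ->  urllib.request.urlretrieve(url, fn)
# xrange(n)                       ->  range(n)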

If you know how this conversion works, please teach me!

Thanks, guys! :)
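
P.S.: I also came across an automatic converter called 2to3 that ships with Python. Would running it be enough for a script like this (using scraper.py as a stand-in for the file name)?

# 2to3 is installed alongside CPython itself; from a terminal:
#   2to3 -w scraper.py    # rewrites the file in place, keeping a .bak backup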

Update 1:

I've reviewed the code and corrected most of the syntax errors, but I'm still having problems with the last few lines:

for each in range(1,noofpages):
    getlink (baseurl+str(each))
    print(len(downloadable))
    print (str(each) + " from " + (str(noofpages) + " pages"))

It reported a syntax error at the getlink call, so I made the following change:

for each in range(1,noofpages):
    print(getlink(baseurl+str(each)))
    print(len(downloadable)
    print(str(each) + "from" + (str(noofpages) + "pages"))

It still reports a syntax error.
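
If I am reading it right, the unclosed parenthesis on the print(len(downloadable) line would explain this, since Python only complains on the line after an unbalanced parenthesis. My current guess at the correct Python 3 form of the loop is below; is this right?

for each in range(1, noofpages):
    getlink(baseurl + str(each))
    print(len(downloadable))
    print(str(each) + " from " + str(noofpages) + " pages")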

asked by anonymous 28.03.2017 / 21:18

0 answers