I need to download content from a website. I wrote some code in Python 3.5. When I run it for just a single page the code works fine, but when I put it in a loop or a function it gives an error.
The code, written as functions, is as follows:
from bs4 import BeautifulSoup
from selenium import webdriver
import html2text
# Open the URL `strg` in a new Firefox session and return the page source
# converted by html2text.
def getPEP(strg):
driver = webdriver.Firefox()
# NOTE(review): this assigns to driver.page_source. The interactive session
# quoted further down shows the same statement failing with
# "AttributeError: can't set attribute"; inside a function that exception
# stops execution before the return statement is reached.
driver.page_source = driver.get(strg)
html = driver.page_source
driver.close()
text=html2text.html2text(html)
return(text);
# Read the lines of 'PEP.txt', call getPEP on a line, cut a section out of the
# resulting text using the `start`/`end` markers and write it to '<index>.txt'.
def salva():
peps = open('PEP.txt', 'r')
# Materialise every line so they can still be indexed after the file is closed.
lines = tuple(peps)
peps.close()
# range(1) yields only index 0, so only the first line is used.
for i in range(1):
# Remove the newline so only the text of the line remains.
strg=lines[i].replace('\n','')
print(strg + '\n')
# NOTE(review): the result of str(strg) is discarded; this line has no effect.
str(strg)
# NOTE(review): the return value of getPEP is not stored, and `text` (used
# below) is never assigned inside this function -- unless a global `text`
# exists, the split below raises NameError.
getPEP(strg)
start = '# '
end = ', \n\n[ ![Join us on'
# Take the piece that follows the first `start` marker, then keep only what
# precedes the first `end` marker inside that piece.
cleaned=(text.split(start))[1].split(end)[0]
file = open(str(i)+'.txt', 'w')
# Remove the '**' sequences (presumably Markdown bold markers emitted by
# html2text -- confirm) before writing.
file.write(cleaned.replace(' ** ','').replace('**',''))
file.close()
print('arquivo ' + str(i) + 'gravado com sucesso' )
return;
# Script entry point: run the download.
salva()
When I enter the commands one by one at the interactive prompt, like the following:
>>> strg='http://www.mtsamples.com/site/pages/sample.asp?type=3-Allergy%20/%20Immunology&sample=386-Allergic%20Rhinitis, Allergic Rhinitis'
>>> driver = webdriver.Firefox()
>>> driver.page_source = driver.get(strg)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: can't set attribute
>>> html = driver.page_source
>>> driver.close()
>>> text=html2text.html2text(html)
Even though I get this error, I can still retrieve the text from the site. Now, when I put it in a function:
>>> def getPEP(strg):
... driver = webdriver.Firefox()
... driver.page_source = driver.get(strg)
... html = driver.page_source
... driver.close()
... text=html2text.html2text(html)
... return(text);
...
>>> text=getPEP()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
TypeError: getPEP() missing 1 required positional argument: 'strg'
Then the program stops at the error and does not return the text I need! Can anyone help me?
The function getPEP() worked, thank you! Now my code looks like this:
from bs4 import BeautifulSoup
from selenium import webdriver
import html2text
# driver.page_source = driver.get())#
def getPEP(strg):
    """Return the content of the page at URL *strg* as text.

    The page is loaded in a fresh Firefox session and its HTML source is
    converted with ``html2text.html2text``.

    Args:
        strg: URL to load. It must be a well-formed URL; the browser driver
            rejects anything else (for example a bare file name).

    Returns:
        The page source converted by ``html2text``.

    Raises:
        selenium.common.exceptions.WebDriverException: If the browser cannot
            load *strg*.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(strg)
        html = driver.page_source
    finally:
        # End the browser session even when loading fails, so that a bad URL
        # does not leave a Firefox instance running for every call.
        driver.quit()
    return html2text.html2text(html)
def salva(arqv):
    """Download every URL listed in *arqv* and save the cleaned text.

    Each non-empty line of *arqv* is treated as one URL. The page text is
    obtained through ``getPEP``, the section after the first ``'# '`` marker
    and before the ``', \\n\\n[ ![Join us on'`` marker is extracted, the
    ``**`` sequences are removed, and the result is written to ``<i>.txt``,
    where ``i`` is the zero-based line number of the URL in *arqv*.

    Args:
        arqv: Path of a text file containing one URL per line.

    Raises:
        IndexError: If a downloaded page does not contain the ``'# '`` start
            marker.
    """
    start = '# '
    end = ', \n\n[ ![Join us on'

    # Read the whole list up front; the context manager closes the file even
    # if reading fails.
    with open(arqv, 'r') as peps:
        lines = peps.readlines()

    for i, line in enumerate(lines):
        strg = line.strip()
        if not strg:
            # A blank line (e.g. a trailing newline at the end of the list)
            # is not a URL; passing it to the browser would only fail.
            continue

        text = getPEP(strg)
        cleaned = text.split(start)[1].split(end)[0]

        # Explicit encoding: page text may contain non-ASCII characters that
        # the platform default encoding cannot represent.
        with open(str(i) + '.txt', 'w', encoding='utf-8') as out:
            out.write(cleaned.replace(' ** ', '').replace('**', ''))

        print('arquivo ' + str(i) + ' gravado com sucesso')
# NOTE(review): this passes the name of the URL-list file straight to getPEP,
# which hands it to driver.get as if it were a URL -- hence the
# "Target URL PEP.txt is not well-formed" error quoted below.
# salva('PEP.txt') looks like the intended call -- confirm.
getPEP('PEP.txt')
The getPEP(strg) function is working fine, thank you!
When I call the function salva(arqv),
which is supposed to read the URLs I want to download, collect the text I want to save through the getPEP(strg)
function, and write it to a file, it gives the following error:
Traceback (most recent call last):
File "crawlerPEP.py", line 35, in <module>
getPEP('PEP.txt')
File "crawlerPEP.py", line 10, in getPEP
driver.get(strg)
File "/home/angelica/Documents/PyEnv3/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 248, in get
self.execute(Command.GET, {'url': url})
File "/home/angelica/Documents/PyEnv3/lib/python3.5/site-packages/selenium/webdriver/remote/webdriver.py", line 236, in execute
self.error_handler.check_response(response)
File "/home/angelica/Documents/PyEnv3/lib/python3.5/site-packages/selenium/webdriver/remote/errorhandler.py", line 192, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: Target URL PEP.txt is not well-formed.
Stacktrace:
at FirefoxDriver.prototype.get (file:///tmp/tmpiag201lm/extensions/[email protected]/components/driver-component.js:10636)
at DelayedCommand.prototype.executeInternal_/h (file:///tmp/tmpiag201lm/extensions/[email protected]/components/command-processor.js:12661)
at DelayedCommand.prototype.executeInternal_ (file:///tmp/tmpiag201lm/extensions/[email protected]/components/command-processor.js:12666)
at DelayedCommand.prototype.execute/< (file:///tmp/tmpiag201lm/extensions/[email protected]/components/command-processor.js:12608)