I'm developing a python spider that should collect some information and download the PDF.
The code seems to be correct as it downloads to the directory of the first PDF and then returns the .json. However, it happens that after the first reading it begins to accuse the error highlighted in the title of this thread.
I did not find any documentation that validated this error or ways to adjust, because it does not tell you where the problem is, if you can help, I'm grateful. Here is the error code and print:
def parse(self, response):
normas_regulamentadoras = response.xpath('//*[@class="item-page"]')
for normas in normas_regulamentadoras:
titulo = normas.xpath('.//*[contains(@href,".pdf")]/text()').extract_first()
dataPublicacao = normas.xpath('.//*[contains(text(),"Publicado: ")]/text()').extract_first()
objeto = normas.xpath('.//*[@style="text-align: justify;"]/text()').extract()
url_pdf = normas.xpath('.//*[contains(@href,".pdf")]/@href').extract_first()
req = Request(url=url_pdf, callback=self.parsePdf)
req.meta['Titulo'] = titulo
req.meta['Data'] = dataPublicacao
req.meta['Objeto'] = objeto
yield req
def parsePdf(self, response):
item = Items()
item['author'] = "MTE"
item['title'] = response.meta['Titulo']
item['description'] = response.meta['Objeto']
item['url'] = response.url
item['guid'] = hashlib.sha1(response.url).hexdigest()
item['crawledDate'] = response.meta['Data']
item['extends'] = {'Abrangencia': 'Federal'}
item['source'] = "MTE"
item['media'] = [{'title': 'linkarquivo', 'link': self.link_arquivos + (item['guid'] + '.pdf')}]
if '.pdf' in response.url:
with open(os.path.join(self.diretorio_arquivos, (item['guid'] + ".pdf")), "wb") as code:
code.write(response.body)
try:
subprocess.call([self.pdf2text, "-layout", "-nopgbrk",
os.path.join(self.diretorio_arquivos, (item['guid'] + ".pdf")),
os.path.join(self.diretorio_temporario, (item['guid'] + ".txt"))])
except Exception as e:
print 'erro no pdt2text: ' + e.message
txt = codecs.open(os.path.join(self.diretorio_temporario, (item['guid'] + ".txt")), encoding='latin-1',
mode="rb")
texto = txt.read()
txt.close()
item['content'] = texto
print(response)
yield item