Syntax Error: AcroForm field object is wrong type in Python

0

I'm developing a Python spider that should collect some information and download the associated PDF files.

The code seems to be correct, since it downloads the first PDF to the directory and then returns the .json. However, after the first PDF has been read, it starts reporting the error shown in the title of this question.

I could not find any documentation that explains this error or how to fix it, and the message does not say where the problem is. I would be grateful for any help. Here is the code and a printout of the error:

def parse(self, response):
    """Yield one download request per listing entry that links to a PDF.

    For every element with class "item-page", the title, the publication
    date text and the justified description paragraphs are collected and
    handed to ``parsePdf`` through the request's ``meta`` dict under the
    keys 'Titulo', 'Data' and 'Objeto'.

    Entries without a PDF link are skipped: building a Request from a
    missing URL would raise and abort the whole generator, dropping every
    entry that follows.
    """
    for normas in response.xpath('//*[@class="item-page"]'):
        url_pdf = normas.xpath('.//*[contains(@href,".pdf")]/@href').extract_first()
        if not url_pdf:
            # extract_first() yields None when nothing matched.
            continue

        meta = {
            'Titulo': normas.xpath('.//*[contains(@href,".pdf")]/text()').extract_first(),
            'Data': normas.xpath('.//*[contains(text(),"Publicado: ")]/text()').extract_first(),
            'Objeto': normas.xpath('.//*[@style="text-align: justify;"]/text()').extract(),
        }
        # urljoin leaves absolute URLs unchanged and resolves relative hrefs
        # against the page URL, which Request would otherwise reject.
        yield Request(url=response.urljoin(url_pdf), callback=self.parsePdf, meta=meta)

def parsePdf(self, response):
    """Build the item for a downloaded document and attach its text.

    Metadata is taken from ``response.meta`` ('Titulo', 'Objeto', 'Data').
    When the URL contains '.pdf', the body is saved as ``<guid>.pdf`` under
    ``self.diretorio_arquivos``, converted by the external program
    configured in ``self.pdf2text`` into ``<guid>.txt`` under
    ``self.diretorio_temporario``, and the text (decoded as latin-1) is
    stored in ``item['content']``.

    If the converter cannot be started, exits with a non-zero status, or
    leaves no readable text file, the problem is printed and the item is
    still yielded, just without 'content'.

    NOTE(review): messages such as "Syntax Error: ..." are presumably
    written to stderr by the converter itself rather than raised by
    Python -- confirm against the converter's documentation.
    """
    # hashlib needs bytes. Only text URLs are encoded, so byte strings
    # (the Python 2 case) pass through untouched and hash as before.
    url_bytes = response.url
    if not isinstance(url_bytes, bytes):
        url_bytes = url_bytes.encode('utf-8')
    guid = hashlib.sha1(url_bytes).hexdigest()

    item = Items()
    item['author'] = "MTE"
    item['title'] = response.meta['Titulo']
    item['description'] = response.meta['Objeto']
    item['url'] = response.url
    item['guid'] = guid
    item['crawledDate'] = response.meta['Data']
    item['extends'] = {'Abrangencia': 'Federal'}
    item['source'] = "MTE"
    item['media'] = [{'title': 'linkarquivo', 'link': self.link_arquivos + (guid + '.pdf')}]

    if '.pdf' in response.url:
        pdf_path = os.path.join(self.diretorio_arquivos, guid + ".pdf")
        txt_path = os.path.join(self.diretorio_temporario, guid + ".txt")

        with open(pdf_path, "wb") as pdf_file:
            pdf_file.write(response.body)

        try:
            status = subprocess.call([self.pdf2text, "-layout", "-nopgbrk", pdf_path, txt_path])
        except OSError as e:
            # Raised when the converter executable cannot be launched.
            print('erro no pdt2text: ' + str(e))
        else:
            if status != 0:
                print('erro no pdt2text: exit status %d' % status)

        try:
            # The context manager closes the handle even if read() fails.
            with codecs.open(txt_path, encoding='latin-1', mode="rb") as txt:
                item['content'] = txt.read()
        except IOError as e:
            # No text file means the conversion failed; keep the item.
            print('erro ao ler texto do pdf: ' + str(e))

        print(response)
    yield item

    
asked by anonymous 09.05.2018 / 14:17

0 answers