Syntax Error: AcroForm field object is wrong type in Python

Question

Syntax Error: AcroForm field object is wrong type in Python

Navigation

0

I'm developing a Python spider that should collect some information and download the linked PDF files.

The code seems to be correct: it downloads the first PDF to the directory and then returns the .json. However, after that first PDF has been read, the error shown in the title of this question starts to appear.

I could not find any documentation that explains this error or how to fix it, and the message does not say where the problem is. If you can help, I would be grateful. Here is the code and a screenshot of the error:

def parse(self, response):
    """Yield one PDF download request per "item-page" block of the listing.

    For every element whose class is ``item-page`` the title, publication
    date and description are scraped and attached to the request's ``meta``
    (keys ``Titulo``, ``Data`` and ``Objeto``) so that ``parsePdf`` can copy
    them into the item.

    Blocks without a ``.pdf`` link are skipped: ``extract_first()`` returns
    ``None`` for them and building a request from ``None`` would raise and
    abort the processing of the remaining blocks.
    """
    for normas in response.xpath('//*[@class="item-page"]'):
        url_pdf = normas.xpath('.//*[contains(@href,".pdf")]/@href').extract_first()
        if not url_pdf:
            # Nothing to download for this block.
            continue

        # hrefs may be relative to the page; a request needs an absolute URL.
        req = Request(url=response.urljoin(url_pdf), callback=self.parsePdf)
        req.meta['Titulo'] = normas.xpath(
            './/*[contains(@href,".pdf")]/text()').extract_first()
        req.meta['Data'] = normas.xpath(
            './/*[contains(text(),"Publicado: ")]/text()').extract_first()
        req.meta['Objeto'] = normas.xpath(
            './/*[@style="text-align: justify;"]/text()').extract()
        yield req

def parsePdf(self, response):
    """Build the item for a downloaded PDF and attach its extracted text.

    The metadata scraped by ``parse`` is read from ``response.meta``. When the
    URL contains ``.pdf`` the body is saved as ``<guid>.pdf`` in
    ``self.diretorio_arquivos`` and converted to ``<guid>.txt`` in
    ``self.diretorio_temporario`` by the external ``self.pdf2text`` program;
    the text is stored in ``item['content']``.

    If the converter cannot be started or exits with a non-zero status, the
    problem is printed and ``item['content']`` is left empty, so the item is
    still yielded instead of the callback failing on a missing text file.

    NOTE(review): messages such as "Syntax Error: AcroForm field object is
    wrong type" are written by the converter itself to stderr; they are not
    Python exceptions. Presumably ``self.pdf2text`` is pdftotext, whose ``-q``
    flag silences them -- confirm before adding it.
    """
    item = Items()
    item['author'] = "MTE"
    item['title'] = response.meta['Titulo']
    item['description'] = response.meta['Objeto']
    item['url'] = response.url
    # sha1() only accepts bytes; encode text URLs, pass byte strings through.
    url_bytes = response.url
    if not isinstance(url_bytes, bytes):
        url_bytes = url_bytes.encode('utf-8')
    item['guid'] = hashlib.sha1(url_bytes).hexdigest()
    item['crawledDate'] = response.meta['Data']
    item['extends'] = {'Abrangencia': 'Federal'}
    item['source'] = "MTE"
    item['media'] = [{'title': 'linkarquivo', 'link': self.link_arquivos + (item['guid'] + '.pdf')}]
    if '.pdf' in response.url:
        pdf_path = os.path.join(self.diretorio_arquivos, item['guid'] + ".pdf")
        txt_path = os.path.join(self.diretorio_temporario, item['guid'] + ".txt")
        with open(pdf_path, "wb") as pdf_file:
            pdf_file.write(response.body)

        texto = u''
        try:
            status = subprocess.call([self.pdf2text, "-layout", "-nopgbrk",
                                      pdf_path, txt_path])
        except OSError as e:
            # Raised when the converter executable cannot be launched.
            print('erro no pdt2text: ' + str(e))
        else:
            if status == 0:
                # "with" closes the file even if decoding fails.
                with codecs.open(txt_path, encoding='latin-1', mode="rb") as txt:
                    texto = txt.read()
            else:
                # call() reports failures through the exit status, not by
                # raising, so it has to be checked explicitly.
                print('erro no pdt2text: exit status ' + str(status))
        item['content'] = texto
        print(response)
    yield item

python web-scraping scrapy

asked by anonymous 09.05.2018 / 14:17

0 answers

Ajax returning error msg No Access-Control-Allow-Origin header is present on the requested resource. - MEAN STACK