Fix Encoding problem when exporting to csv from a scrapy file

0

How can I fix the encoding problem when saving the file to csv? this problem is only happening when saved in csv.

fromscrapyimport*fromprojeto_iruan.itemsimport*importcsvclassimprensaNacional(scrapy.Spider):name='imprensaNacional'start_urls=['http://www.imprensanacional.gov.br/leiturajornal?data=11-09-2018&secao=dou3']imprensaNacional="imprensaNacional.csv"
    custom_settings = {
        'FEED_FORMAT': csv
    }

    def __init__(self):
        # empty outputfile
        open(self.imprensaNacional, "w").close()

    def parse(self, response):
        url_base = 'http://www.imprensanacional.gov.br/'
        script = response.xpath('//*[@class="span8 hierarchy-wrapper"]//*[contains(text(),"AVISO DE LICITA")]')
        for urls in script:
            links = urls.xpath('.//@href').extract_first().encode('utf-8')
            link_completo = url_base + links
            yield Request(url=link_completo, callback=self.parseAviso)

    def parseAviso(self, response):
        with open(self.imprensaNacional, "a") as f:
            writer = csv.writer(f, delimiter=";")
            conteudo = response.xpath('//*[@class="journal-content-article"]')
            for info in conteudo:
                titulo = info.xpath(u'.//*[@class="identifica"]/text()[contains(.,"N\xba")]').extract_first().encode('utf-8')
                pregao = info.xpath(u'.//*[@class="identifica"]/text()[contains(.,"N\xba")]').extract_first().split()[3].encode('utf-8')
                uasg = info.xpath(u'.//*[@class="identifica"]/text()[contains(.,"N\xba")]').extract_first().split()[6].encode('utf-8')
                tipo = info.xpath('.//*[@class="identifica"]/text()[contains(.,"AVISO")]').extract_first().encode('utf-8')
                pregoeiro = info.xpath('.//*[@class="assina"]/text()').extract_first().encode('utf-8')
                descricao = info.xpath('.//*[@class="dou-paragraph"]/text()').extract_first().encode('utf-8')
                dou = info.xpath('.//*[@class="dou-paragraph"]/text()[contains(.,"(")]').extract_first().encode('utf-8')
                orgao = info.xpath('.//*[@class="orgao-dou-data"]/text()').extract_first().encode('utf-8')
                data_publicacao = info.xpath('.//*[@class="publicado-dou-data"]/text()').extract_first().encode('utf-8')
                edicao_dou = info.xpath('.//*[@class="edicao-dou-data"]/text()').extract_first().encode('utf-8')
                secao = info.xpath('.//*[@class="secao-dou"]/text()').extract_first().encode('utf-8')
                pagina = info.xpath('.//*[@class="secao-dou-data"]/text()').extract_first().encode('utf-8')
                writer.writerow([titulo,tipo,pregao,uasg,dou,data_publicacao,edicao_dou,secao,pagina,orgao,pregoeiro,response.url,descricao])
                yield {'Titulo': titulo, 'Tipo': tipo, 'Pregao': pregao, 'UASG': uasg,
                       'DOU': dou, 'DataPublicacao': data_publicacao, 'Edicao': edicao_dou,
                       'Secao': secao, 'Pagina': pagina, 'Orgao': orgao, 'Pregoeiro': pregao,
                       'Url': response.url, 'Descricao': descricao}
    
asked by anonymous 18.09.2018 / 03:27

1 answer

1

You are encoding your strings in utf-8, but it looks like you are using a spreadsheet program to read them that is not recognizing this encoding.

If you are using excel, when importing to excel, use the import dialog (instead of simply opening the file) so you can select the utf-8 encoding option. Another alternative is to try to use the cp1252 encoding which is usually the standard encoding of excel in Portuguese. Change all .encode('utf-8') to .encode('cp1252') and see if it works out.

As for double lines, you are opening the file in text mode, and since the csv module of python 2 expects a file in binary format, you end up having duplicate line breaks. Change the line

with open(self.imprensaNacional, "a") as f:

by

with open(self.imprensaNacional, "ab") as f:

to use binary mode.

    
18.09.2018 / 05:32