How can I fix the encoding problem when saving the file to csv? this problem is only happening when saved in csv.
fromscrapyimport*fromprojeto_iruan.itemsimport*importcsvclassimprensaNacional(scrapy.Spider):name='imprensaNacional'start_urls=['http://www.imprensanacional.gov.br/leiturajornal?data=11-09-2018&secao=dou3']imprensaNacional="imprensaNacional.csv"
custom_settings = {
'FEED_FORMAT': csv
}
def __init__(self):
# empty outputfile
open(self.imprensaNacional, "w").close()
def parse(self, response):
url_base = 'http://www.imprensanacional.gov.br/'
script = response.xpath('//*[@class="span8 hierarchy-wrapper"]//*[contains(text(),"AVISO DE LICITA")]')
for urls in script:
links = urls.xpath('.//@href').extract_first().encode('utf-8')
link_completo = url_base + links
yield Request(url=link_completo, callback=self.parseAviso)
def parseAviso(self, response):
with open(self.imprensaNacional, "a") as f:
writer = csv.writer(f, delimiter=";")
conteudo = response.xpath('//*[@class="journal-content-article"]')
for info in conteudo:
titulo = info.xpath(u'.//*[@class="identifica"]/text()[contains(.,"N\xba")]').extract_first().encode('utf-8')
pregao = info.xpath(u'.//*[@class="identifica"]/text()[contains(.,"N\xba")]').extract_first().split()[3].encode('utf-8')
uasg = info.xpath(u'.//*[@class="identifica"]/text()[contains(.,"N\xba")]').extract_first().split()[6].encode('utf-8')
tipo = info.xpath('.//*[@class="identifica"]/text()[contains(.,"AVISO")]').extract_first().encode('utf-8')
pregoeiro = info.xpath('.//*[@class="assina"]/text()').extract_first().encode('utf-8')
descricao = info.xpath('.//*[@class="dou-paragraph"]/text()').extract_first().encode('utf-8')
dou = info.xpath('.//*[@class="dou-paragraph"]/text()[contains(.,"(")]').extract_first().encode('utf-8')
orgao = info.xpath('.//*[@class="orgao-dou-data"]/text()').extract_first().encode('utf-8')
data_publicacao = info.xpath('.//*[@class="publicado-dou-data"]/text()').extract_first().encode('utf-8')
edicao_dou = info.xpath('.//*[@class="edicao-dou-data"]/text()').extract_first().encode('utf-8')
secao = info.xpath('.//*[@class="secao-dou"]/text()').extract_first().encode('utf-8')
pagina = info.xpath('.//*[@class="secao-dou-data"]/text()').extract_first().encode('utf-8')
writer.writerow([titulo,tipo,pregao,uasg,dou,data_publicacao,edicao_dou,secao,pagina,orgao,pregoeiro,response.url,descricao])
yield {'Titulo': titulo, 'Tipo': tipo, 'Pregao': pregao, 'UASG': uasg,
'DOU': dou, 'DataPublicacao': data_publicacao, 'Edicao': edicao_dou,
'Secao': secao, 'Pagina': pagina, 'Orgao': orgao, 'Pregoeiro': pregao,
'Url': response.url, 'Descricao': descricao}