.csv
file generated by python to .xlsx
?
I have two problems:
-
The first is that I could not figure out how to do this conversion.
-
The second is that even if I pass the delimiter option, as in
crawl <nome> -o <nome>.csv -s CSV_DELIMITER=";"
on the Scrapy command line, the generated file is not split into columns when I open it directly in Excel; the values stay separated by commas.
My code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import *
from Aranhas.items import ImprensaNacional
import csv
from Aranhas.settings import *
from Aranhas.pipelines import *


def _first_text(selector, query):
    """Return the stripped first match of ``query`` or an empty string.

    ``extract_first()`` returns ``None`` when nothing matches; calling
    ``.strip()`` on that would abort the whole callback, so a missing node
    is turned into an empty field instead.
    """
    return (selector.xpath(query).extract_first() or u'').strip()


def _token_at(tokens, index):
    """Return ``tokens[index]`` or an empty string when the list is too short."""
    return tokens[index] if index < len(tokens) else u''


class imprensa_Nacional(scrapy.Spider):
    """Spider that collects bidding notices from the Imprensa Nacional site.

    ``parse`` follows every link on the start page whose text contains
    "AVISO DE LICITA"; ``parseAviso`` turns each notice page into
    ``ImprensaNacional`` items that are exported as CSV.
    """

    name = 'imprensa_Nacional'
    start_urls = [
        'http://www.imprensanacional.gov.br/leiturajornal?data=11-09-2018&secao=dou3',
    ]
    custom_settings = {
        'FEED_EXPORT_FIELDS': [
            'Titulo',
            'Tipo',
            'Pregao',
            'UASG',
            'DOU',
            'DataPublicacao',
            'EdicaoDou',
            'SecaoDou',
            'Pagina',
            'Orgao',
            'Pregoeiro',
            'Url',
            'Descricao',
        ],
        'FEED_FORMAT': 'csv',
        # Let the exporter write the feed as Latin-1 instead of encoding
        # every field by hand: items keep text values, and the exporter no
        # longer receives pre-encoded bytes.
        'FEED_EXPORT_ENCODING': 'iso-8859-1',
    }

    def parse(self, response):
        """Yield one request per bidding-notice link found on the listing page.

        Links are resolved against the response URL, so relative,
        root-relative and absolute hrefs all work. Nodes without an href are
        skipped rather than requested as ".../None".
        """
        notices = response.xpath(
            '//*[@class="span8 hierarchy-wrapper"]'
            '//*[contains(text(),"AVISO DE LICITA")]'
        )
        for notice in notices:
            href = notice.xpath('.//@href').extract_first()
            if not href:
                continue
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parseAviso,
            )

    def parseAviso(self, response):
        """Yield one ``ImprensaNacional`` item per article block on the page.

        Every field is the stripped first text node of its XPath; a field
        whose node is absent is exported as an empty string.
        """
        for info in response.xpath('//*[@class="journal-content-article"]'):
            # The header line containing "N\xba" carries both the bidding
            # number and the UASG code; evaluate its XPath only once.
            # NOTE(review): positions 3 and 6 come from the original code and
            # presumably match "<kind> <kind> N. <number> - UASG <code>" --
            # confirm against real pages.
            header_tokens = _first_text(
                info,
                u'.//*[@class="identifica"]/text()[contains(.,"N\xba")]',
            ).split()

            item = ImprensaNacional()
            item.set_all()
            item['Titulo'] = _first_text(
                info, './/*[@class="identifica"]/text()')
            item['Tipo'] = _first_text(
                info, './/*[@class="identifica"]/text()[contains(.,"AVISO")]')
            item['Pregao'] = _token_at(header_tokens, 3)
            item['UASG'] = _token_at(header_tokens, 6)
            item['DOU'] = _first_text(
                info, './/*[@class="dou-paragraph"]/text()[contains(.,"(")]')
            item['DataPublicacao'] = _first_text(
                info, './/*[@class="publicado-dou-data"]/text()')
            item['EdicaoDou'] = _first_text(
                info, './/*[@class="edicao-dou-data"]/text()')
            item['SecaoDou'] = _first_text(
                info, './/*[@class="secao-dou"]/text()')
            item['Pagina'] = _first_text(
                info, './/*[@class="secao-dou-data"]/text()')
            item['Orgao'] = _first_text(
                info, './/*[@class="orgao-dou-data"]/text()')
            item['Pregoeiro'] = _first_text(
                info, './/*[@class="assina"]/text()')
            item['Url'] = response.url
            item['Descricao'] = _first_text(
                info, './/*[@class="dou-paragraph"]/text()')
            yield item