Hello,
I'm trying to extract data, but strangely the return is coming by 2 in 2, ie if I make 6 different requests it returns 6 results but only 3 of them are destined.
In the example below the response even though a different request (ncm_in and ncm_fn distinct) returned the same result
{'fob': 7432439.0,
'id_pais': '040',
'mounth': '1',
'ncm_fn': '160249',
'ncm_in': '160241',
'peso': 6002622.0,
'year': '2017'},
{'fob': 7432439.0,
'id_pais': '040',
'mounth': '1',
'ncm_fn': '021019',
'ncm_in': '021011',
'peso': 6002622.0,
'year': '2017'}
Am I missing something?
from scrapy.selector import Selector
from scrapy import Spider, Request, FormRequest
from scrapy.loader import ItemLoader
from GlobalDeskScrap.items import AliceWebItem
from GlobalDeskScrap.settings import NCMS, COUNTRIES
import json
from pprint import pprint
class AlicewebSpider(Spider):
name = 'aliceweb'
allowed_domains = ['aliceweb.mdic.gov.br']
start_urls = ['http://aliceweb.mdic.gov.br//usuario/login']
url_key = 'http://aliceweb.mdic.gov.br//consulta-ncm/consultar'
url_data = 'http://aliceweb.mdic.gov.br//consulta-ncm/grid-totalizacao'
resquest_key = '//input[contains(@id,"refazerConsultaDados")]/@value'
grid_table = '//table[@id="consulta_simples"]/tbody/tr'
grid_rows = {
'mounth': 'td[1]//text()',
'fob': 'td[2]//text()',
'peso': 'td[3]//text()',
}
def parse(self, response):
yield FormRequest.from_response(
response,
formdata={
'tx_usuario': 'your_login',
'tx_senha': 'your_pass',
},
callback=self.get_request_key,
)
def get_request_key(self, response):
param = {
"tipoConsulta": "EXP_NCM",
"originalType": "exportacaoNcm",
"municipioPorUf": "",
"valorMunUF": "",
"tipoNcm": "TX_CD_SH6",
"valorNcmInicial": "",
"valorNcmFinal": "",
"valorNcmCesta": "",
"pais": "",
"uf": "",
"porto": "",
"via": "",
"primeiroDetalhamento": "",
"periodo-periodoMesInicio[]": ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"],
"periodo-periodoAnoInicio[]": ["2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017"],
"periodo-periodoMesFinal[]": ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"],
"periodo-periodoAnoFinal[]": ["2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017", "2017"],
"tipoClassificacao": "",
"periodoClassificacao": "p1",
"valorInicialClassificacao": "",
"valorFinalClassificacao": ""}
for c, _ in COUNTRIES.items():
for _, ncm in NCMS.items():
param['pais'] = c
param['valorNcmInicial'] = ncm['ncm_in']
param['valorNcmFinal'] = ncm['ncm_fn']
yield FormRequest(
url=self.url_key,
formdata=param,
callback=self.extract_request_key,
meta={'id_pais': param['pais'],
'ncm_in': param['valorNcmInicial'],
'ncm_fn': param['valorNcmFinal'],
'year': param['periodo-periodoAnoInicio[]'][0]}
)
def extract_request_key(self, response):
params = get_resquest_parameter(
response.xpath(self.resquest_key).extract_first())
yield FormRequest(
url=self.url_data,
formdata=params,
callback=self.get_data,
meta={'id_pais': response.meta['id_pais'],
'ncm_in': response.meta['ncm_in'],
'ncm_fn': response.meta['ncm_fn'],
'year': response.meta['year']}
)
def get_data(self, response):
selector = Selector(response)
for row in selector.xpath(self.grid_table):
loader = ItemLoader(item=AliceWebItem(), selector=row)
for field, xpath in self.grid_rows.items():
loader.add_xpath(field, xpath)
loader.add_value('id_pais', response.meta['id_pais'])
loader.add_value('ncm_in', response.meta['ncm_in'])
loader.add_value('ncm_fn', response.meta['ncm_fn'])
loader.add_value('year', response.meta['year'])
yield loader.load_item()
def get_resquest_parameter(key):
params = {}
params["refazerConsultaDados"] = key
params["page"] = '1'
return params