Scrapy: multiple requests returning the same information

0

Hello,

I'm trying to extract data, but strangely the results come back in identical pairs, i.e. if I make 6 different requests I get 6 results, but only 3 of them are distinct.

In the example below, even though the two requests were different (distinct ncm_in and ncm_fn), the responses contained the same result:

{'fob': 7432439.0,
 'id_pais': '040',
 'mounth': '1',
 'ncm_fn': '160249',
 'ncm_in': '160241',
 'peso': 6002622.0,
 'year': '2017'},
{'fob': 7432439.0,
 'id_pais': '040',
 'mounth': '1',
 'ncm_fn': '021019',
 'ncm_in': '021011',
 'peso': 6002622.0,
 'year': '2017'}

Am I missing something?

from scrapy.selector import Selector
from scrapy import Spider, Request, FormRequest
from scrapy.loader import ItemLoader

from GlobalDeskScrap.items import AliceWebItem
from GlobalDeskScrap.settings import NCMS, COUNTRIES

import json
from pprint import pprint


class AlicewebSpider(Spider):
    """Scrape per-month totals from the AliceWeb NCM query pages.

    Request flow:

    1. ``parse`` submits the login form found on the start page.
    2. ``get_request_key`` posts one query per (country, NCM range) pair.
    3. ``extract_request_key`` reads the query key from the result page and
       requests the totals grid with it.
    4. ``get_data`` turns each row of the grid into an ``AliceWebItem``.

    NOTE(review): steps 2-3 run concurrently for every country/NCM pair on a
    single cookie session. If the site keeps the "current query" in the
    server-side session (not verifiable from this file), overlapping queries
    could return each other's grid. Confirm by running with
    ``CONCURRENT_REQUESTS = 1`` and comparing the results.
    """

    name = 'aliceweb'
    allowed_domains = ['aliceweb.mdic.gov.br']
    start_urls = ['http://aliceweb.mdic.gov.br//usuario/login']
    url_key = 'http://aliceweb.mdic.gov.br//consulta-ncm/consultar'
    url_data = 'http://aliceweb.mdic.gov.br//consulta-ncm/grid-totalizacao'

    # XPath of the hidden input whose value identifies a submitted query.
    resquest_key = '//input[contains(@id,"refazerConsultaDados")]/@value'
    # XPath of the rows of the totals table.
    grid_table = '//table[@id="consulta_simples"]/tbody/tr'
    # Item field -> XPath relative to a single table row.
    grid_rows = {
        'mounth': 'td[1]//text()',
        'fob': 'td[2]//text()',
        'peso': 'td[3]//text()',
    }

    # Query context carried through ``Request.meta`` from the query request
    # down to the emitted items (in the order the values are added).
    _META_KEYS = ('id_pais', 'ncm_in', 'ncm_fn', 'year')

    def parse(self, response):
        """Submit the login form of the start page.

        Yields:
            A ``FormRequest`` built from the page's form, with the
            credentials filled in, whose response goes to
            ``get_request_key``.
        """
        yield FormRequest.from_response(
            response,
            formdata={
                'tx_usuario': 'your_login',
                'tx_senha': 'your_pass',
            },
            callback=self.get_request_key,
        )

    def get_request_key(self, response):
        """Post one NCM query for every country / NCM range combination.

        The response itself is not inspected; it only marks that the login
        request has completed.

        Yields:
            One ``FormRequest`` to ``url_key`` per combination of a key of
            ``COUNTRIES`` and a value of ``NCMS``. The query context is
            stored in ``meta`` under the ``_META_KEYS`` names.
        """
        months = ["01", "02", "03", "04", "05", "06",
                  "07", "08", "09", "10", "11", "12"]
        year = "2017"
        years = [year] * len(months)

        # Fields that are identical for every query. This template is never
        # modified; each request gets its own copy below.
        base_form = {
            "tipoConsulta": "EXP_NCM",
            "originalType": "exportacaoNcm",
            "municipioPorUf": "",
            "valorMunUF": "",
            "tipoNcm": "TX_CD_SH6",
            "valorNcmInicial": "",
            "valorNcmFinal": "",
            "valorNcmCesta": "",
            "pais": "",
            "uf": "",
            "porto": "",
            "via": "",
            "primeiroDetalhamento": "",
            "periodo-periodoMesInicio[]": months,
            "periodo-periodoAnoInicio[]": years,
            "periodo-periodoMesFinal[]": months,
            "periodo-periodoAnoFinal[]": years,
            "tipoClassificacao": "",
            "periodoClassificacao": "p1",
            "valorInicialClassificacao": "",
            "valorFinalClassificacao": "",
        }

        for country in COUNTRIES:
            for ncm in NCMS.values():
                # Fresh dict per request: the overridden keys already exist
                # in the template, so the field order is unchanged.
                form = dict(
                    base_form,
                    pais=country,
                    valorNcmInicial=ncm['ncm_in'],
                    valorNcmFinal=ncm['ncm_fn'],
                )

                yield FormRequest(
                    url=self.url_key,
                    formdata=form,
                    callback=self.extract_request_key,
                    meta={
                        'id_pais': country,
                        'ncm_in': ncm['ncm_in'],
                        'ncm_fn': ncm['ncm_fn'],
                        'year': year,
                    },
                )

    def extract_request_key(self, response):
        """Request the totals grid for the query whose result page this is.

        Yields:
            A ``FormRequest`` to ``url_data`` carrying the query key and the
            same query context in ``meta``. Nothing is yielded when the page
            contains no query key.
        """
        context = {key: response.meta[key] for key in self._META_KEYS}

        key = response.xpath(self.resquest_key).extract_first()
        if key is None:
            # Without this guard the missing value reaches FormRequest and
            # fails there with an error that does not name the query.
            self.logger.error(
                'No query key found in %s for %r; skipping this query',
                response.url, context)
            return

        yield FormRequest(
            url=self.url_data,
            formdata=get_resquest_parameter(key),
            callback=self.get_data,
            meta=context,
        )

    def get_data(self, response):
        """Build one item per row of the totals table.

        Yields:
            An ``AliceWebItem`` per row matched by ``grid_table``, populated
            from the ``grid_rows`` XPaths plus the query context stored in
            ``response.meta``.
        """
        for row in response.xpath(self.grid_table):
            loader = ItemLoader(item=AliceWebItem(), selector=row)

            for field, xpath in self.grid_rows.items():
                loader.add_xpath(field, xpath)

            for key in self._META_KEYS:
                loader.add_value(key, response.meta[key])

            yield loader.load_item()


def get_resquest_parameter(key):
    """Return the form fields for fetching page 1 of a query's totals grid.

    Args:
        key: Value to send as the ``refazerConsultaDados`` field.

    Returns:
        A new dict with ``refazerConsultaDados`` set to *key* and ``page``
        set to ``'1'``.
    """
    return {
        "refazerConsultaDados": key,
        "page": '1',
    }
    
asked by anonymous 31.01.2018 / 22:57

0 answers