How do I extract data into Scrapy item fields (items.py)?

0

I want to extract all the "Municipalities" from the `<select>` tag on this start page. link

And then extract information such as "county name", "mayor", etc. from the page of each county in the list.

Using the Scrapy shell, line by line, I can extract all the information. The problem comes when it is time to make the spider work.

#Spider.py
# -*- coding: utf-8 -*-
import scrapy
import urlparse
from scrapy.http import FormRequest
from scrapy.loader import ItemLoader
from municipios.items import Municipio
import time


class GetmunSpider(scrapy.Spider):
    """Crawl the ANMP municipality pages and emit one ``Municipio`` per page.

    Flow: every start URL is sent to ``municipio_attr``, which reads the
    ``<select>`` options on that page and requests each municipality page;
    ``parse`` then extracts the item fields from each of those pages.
    """

    name = 'getMun'
    allowed_domains = ['anmp.pt']
    start_urls = ['https://www.anmp.pt/anmp/pro/mun1/mun101w3.php?cod=M2200']

    # Template the <option> values are substituted into to build page URLs.
    municipio_url = 'https://www.anmp.pt/anmp/pro/mun1/{0}'

    def start_requests(self):
        """Route the start URLs to ``municipio_attr`` instead of ``parse``.

        Without this override Scrapy sends start URLs to ``parse`` and
        ``municipio_attr`` is never called.
        """
        for url in self.start_urls:
            # dont_filter keeps the start URL out of the duplicate filter, so
            # the same URL can still be requested again from the option list.
            yield scrapy.Request(url, callback=self.municipio_attr,
                                 dont_filter=True)

    def municipio_attr(self, response):
        """Yield one request per ``<option>`` value found on the list page.

        :param response: response for the page holding the ``<select>`` list.
        :returns: generator of ``scrapy.Request`` objects handled by ``parse``.
        """
        for value in response.xpath('//select/option/@value').extract():
            if not value:
                # Skip options that carry no target (e.g. an empty value).
                continue
            yield scrapy.Request(self.municipio_url.format(value),
                                 callback=self.parse)

    def parse(self, response):
        """Extract the ``Municipio`` fields from a single municipality page.

        :param response: response for one municipality page.
        :returns: the populated ``Municipio`` item.
        """
        # Pull the raw text nodes first: ItemLoader.add_xpath() returns None,
        # so indexing and splitting must happen on the extracted lists.
        f3_texts = response.xpath('//div[@class="f3"]/text()').extract()
        contacto = response.xpath('//div[@class="sel2"]/text()').extract()

        # municipality data
        # NOTE: unless output processors are configured, ItemLoader stores
        # every field as a list of the collected values.
        municipio = ItemLoader(item=Municipio(), response=response)
        municipio.add_xpath('nome', '//div[@class="sel3"]/text()')

        pres_camara = self._text_at(f3_texts, 3)
        if pres_camara is not None:
            municipio.add_value('pres_camara', pres_camara.split(",")[0])

        pres_assembleia = self._text_at(f3_texts, 4)
        if pres_assembleia is not None:
            municipio.add_value('pres_assembleia',
                                pres_assembleia.split(",")[0])

        # Drops the last 42 characters of the joined contact text --
        # presumably the phone/fax tail; TODO confirm against the live page.
        municipio.add_value('endereco', " ".join(contacto)[:-42])

        telefone = self._text_at(contacto, 2)
        if telefone is not None:
            municipio.add_value('telefone', telefone)

        fax = self._text_at(contacto, 3)
        if fax is not None:
            municipio.add_value('fax', fax)

        # load_item() belongs to the loader instance, not the Municipio class.
        return municipio.load_item()

    @staticmethod
    def _text_at(texts, index):
        """Return ``texts[index]``, or ``None`` when the page has fewer nodes."""
        return texts[index] if len(texts) > index else None

The `contacto` variable was only created to extract the phone and fax, which sit in the same HTML tag.

#Item.py
import scrapy
from scrapy.item import Item, Field

class Municipio(Item):
    """Container for the data scraped from one municipality page."""

    # Identification and office holders.
    nome = Field()
    pres_camara = Field()
    pres_assembleia = Field()

    # Contact details.
    endereco = Field()
    telefone = Field()
    fax = Field()

Although Scrapy does not show any errors at the end, I can only pass the table with the various fields, which still appear in the wrong order.

I have searched the Scrapy documentation and various websites and tutorials, but without success.

Can someone point me in the right direction? Thank you! Carlos

    
asked by anonymous 29.07.2018 / 20:36

0 answers