I want to remove all "Municipalities" from the tag starting at this page. link
And then remove information such as: "county name", "mayor", etc. of the pages of each county in the list.
Using the shell with the line to line extract all the information. Problem is it's time for the spider to work.
#Spider.py
# -*- coding: utf-8 -*-
import scrapy
import urlparse
from scrapy.http import FormRequest
from scrapy.loader import ItemLoader
from municipios.items import Municipio
import time
class GetmunSpider(scrapy.Spider):
name = 'getMun'
allowed_domains = ['anmp.pt']
start_urls = ['https://www.anmp.pt/anmp/pro/mun1/mun101w3.php?cod=M2200']
def municipio_attr(self, response):
municipios_url = response.xpath('//select/option/@value').extract()
for municipio in municipios_url:
full_url = ['https://www.anmp.pt/anmp/pro/mun1/{0}'.format(i) for i in municipios_url]
yield FormRequest.from_response(str(full_url), callback=self.parse)
def parse(self, response):
#dados do municipio
municipio = ItemLoader(item = Municipio(), response = response)
municipio.add_xpath('nome', '//div[@class="sel3"]/text()'.extract())
municipio.add_xpath('pres_camara', '//div[@class="f3"]/text()')[3].extract().split(",")[0]
municipio.add_xpath('pres_assembleia', '//div[@class="f3"]/text()')[4].extract().split(",")[0]
municipio.add_xpath('contacto', '//div[@class="sel2"]/text()').extract()
municipio.add_value('endereco', " ".join(contacto)[:-42])
municipio.add_value('telefone', contacto[2])
municipio.add_value('fax', contacto[3])
return Municipio.load_item()
the contact field was only created to remove the phone and fax from the same html tag
#Item.py
import scrapy
from scrapy.item import Item, Field
class Municipio(scrapy.Item):
nome = scrapy.Field()
pres_camara = scrapy.Field()
pres_assembleia = scrapy.Field()
endereco = scrapy.Field()
telefone = scrapy.Field()
fax = scrapy.Field()
Although Scrapy does not show any errors at the end, I can only pass the table with the various fields, which still appear in the wrong order.
I have searched the Scrapy documentation and various websites and tutorials but failed.
Can someone point me in the right direction? Thank you! Carlos