But I'm facing a problem. And I ended up getting confused, I decided to return the code in a functional point.
# -*- coding: utf-8 -*-
# coding: utf-8
import scrapy
from mbu2.items import Mbu2Item2
import urlparse
from scrapy.http import Request
class Spider2Spider(scrapy.Spider):
name = "spider2"
# allowed_domains = [""]
start_urls = (
# 'file:///C:/scrapy/mbu/mbu2/video.html',
'file:///C:/scrapy/mbu/mbu2/list.htm',
)
def parse(self, response):
# filename = response.url.split("/")[-1] + '.html'
# with open(filename, 'wb') as f:
# f.write(response.body)
# item = Mbu2Item()
# return item
posts = response.xpath('/html/body/div/div[2]/div/div[1]/div[2]/ul/li')
posts.pop(0)
for post in posts:
print(post)
item = Mbu2Item2()
item['currentitemlist'] = response.url
item['currentitemlink'] = urlparse.urljoin(response.url,post.xpath('div/div/h2/a/@href').extract()[0].strip())
item['posttitle'] = post.xpath('div/div/h2/a/text()').extract()[0].strip()
# print(item['posttitle'])
item['posturl'] = urlparse.urljoin(response.url,post.xpath('div/div/h2/a/@href').extract()[0].strip())
item['postautor'] = post.xpath('div/div/div/div[1]/a/text()').extract()[0].strip()
# print(item['postautorurl'])
item['postautorurl'] = urlparse.urljoin(response.url,post.xpath('div/div/div/div[1]/a/@href').extract()[0].strip())
item['postcat'] = post.xpath('div/div/div/div[2]/span/a/text()').extract()[0].strip()
# print(item['postcaturl'])
item['postcaturl'] = urlparse.urljoin(response.url,post.xpath('div/div/div/div[2]/span/a/@href').extract()[0].strip())
# print(item['posttitle'], item['posturl'], item['postautor'], item['postautorurl'])[0].strip()
# request = Request(item['posturl'],
# callback=self.parse_page2)
# request.meta['item'] = item
return Request(item['posturl'], meta={'item': item},
callback=self.parse_item)
# return item
def parse_item(self, response):
item = response.meta['item']
item['currentitemlink2'] = response.url
# item['desc'] = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div/div[1]/p/text()').extract()[0].strip()
item['videosrcembed'] = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/article/iframe/@src').extract()[0].strip()
item['textcontent'] = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/article/div[1]').extract()[0].strip()
item['relatedcatlinks'] = response.xpath('/html/body/div[1]/div/div/div[1]/div[1]/div[2]/article/div[2]').extract()[0].strip()
# filename = response.url.split("/")[-1] + '.html'
# with open(filename, 'wb') as f:
# f.write(response.body)
yield item
Primary Issue
When I roll the spider. It records only 1 item.
I made the modifications in the logic, so he registered 25 items, but did not complete with the second Request.
(I need to add new requests to every read list, add-> start_page-> append (new_url)
But I am not sure how to identify when it closes the cycle of an Item () and when it is parsing a listing.
Can you help me?