import scrapy


class mingyan(scrapy.Spider):
    name = "paomian"
    start_urls = [
        'http://www.pmtown.com/archives/category/早报'
    ]

    def parse(self, response):
        # Each <li> in the article list holds one morning-report entry.
        for v in response.css('ul.article-list li'):
            lianjie = v.css('a::attr(href)').extract_first()
            title = v.css('a::attr(title)').extract_first()[5:]
            detail = v.css('p::text').extract_first()
            image = v.css('div.item-img>a>img::attr(src)').extract_first()
            img = image if image is not None else 'null'
            yield {
                'title': title,
                'introduction': detail,
                'detailUrl': lianjie,
                'imageUrl': img,
            }

        # Follow the "next page" link in the pagination bar, if there is one.
        next_page = response.css(
            '#wrap div.main.container div.content div.sec-panel.archive-list '
            'div.pagination.clearfix a.next::attr(href)'
        ).extract_first()
        print('--------> %s' % next_page)
        if next_page is not None:
            nexthref = response.urljoin(next_page)
            yield scrapy.Request(nexthref, callback=self.parse, dont_filter=True)
# scrapy crawl 'paomian' -o paomian.json
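To sanity-check the result, here is a minimal sketch (assuming the crawl was run with the -o paomian.json option shown above and the file sits in the current directory) that loads the exported feed and prints the fields yielded by parse():

import json

# Load the feed exported by: scrapy crawl paomian -o paomian.json
# (the file name/path is an assumption; adjust it to wherever you ran the crawl)
with open('paomian.json', encoding='utf-8') as f:
    items = json.load(f)

print('scraped %d items' % len(items))
for item in items[:3]:
    # each item carries the keys yielded in parse(): title, introduction, detailUrl, imageUrl
    print(item['title'], '->', item['detailUrl'])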
The above scrapes the latest daily morning-report articles from 泡面小镇 (Paomian Town, pmtown.com), purely as practice. If this infringes on any rights, please contact me and I will take the article down.