Ordinary pagination
# coding: utf-8
import scrapy
from freebuf2.items import Freebuf2Item
import time
from scrapy.crawler import CrawlerProcess

class freebuf2Spider(scrapy.Spider):
    name = 'freebuf2'
    allowed_domains = []
    start_urls = ["http://www.freebuf.com/"]

    def parse(self, response):
        # Follow every article link on the current list page.
        for link in response.xpath("//div[contains(@class, 'news_inner news-list')]/div/a/@href").extract():
            # If yield is hard to follow here, read up on generators first; I think of it
            # as a coroutine (execution that can be suspended and resumed), which helps.
            yield scrapy.Request(link, callback=self.parse_next)
        # Find the link to the next page, i.e. turn the page.
        next_url = response.xpath("//div[@class='news-more']/a/@href").extract()
        if next_url:
            yield scrapy.Request(next_url[0], callback=self.parse)

    def parse_next(self, response):
        # Extract the fields of a single article page.
        item = Freebuf2Item()
        item['title'] = response.xpath("//h2/text()").extract()
        item['url'] = response.url
        item['date'] = response.xpath("//div[@class='property']/span[@class='time']/text()").extract()
        item['tags'] = response.xpath("//span[@class='tags']/a/text()").extract()
        yield item
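The spider imports Freebuf2Item and CrawlerProcess but shows neither. As a rough sketch (not part of the original post), the items.py below defines only the four fields assigned in parse_next, and the small run script shows one way the otherwise unused CrawlerProcess import could be used; the module path in the import is an assumption.

# items.py -- a minimal sketch; field names follow the assignments in parse_next.
import scrapy

class Freebuf2Item(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    date = scrapy.Field()
    tags = scrapy.Field()

# run.py -- hypothetical script: run the spider without the "scrapy crawl" command.
from scrapy.crawler import CrawlerProcess
from freebuf2.spiders.freebuf2 import freebuf2Spider  # assumed module path

if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(freebuf2Spider)
    process.start()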
Method 1: Define a list of page URLs and crawl them one by one
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from scrapy.selector import Selector
from GZProject.items import *

class GZSpider(scrapy.Spider):
    name = "gznw"  # Guizhou agricultural-products spider
    allowed_domains = ["gznw.gov.cn"]
    start_urls = [
        "http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx?areaid=22572&page=1",
        "http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx?areaid=22572&page=2",
        "http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx?areaid=22572&page=3"
    ]

    def parse(self, response):
        print('----------------Start------------------')
        print(response.url)
        # Each table row with class "odd gradeX" holds one price record.
        for sel in response.xpath('//tr[@class="odd gradeX"]'):
            item = GzprojectItem()
            num1 = sel.xpath('td[1]/text()').extract()[0]
            num2 = sel.xpath('td[2]/text()').extract()[0]
            num3 = sel.xpath('td[3]/text()').extract()[0]
            num4 = sel.xpath('td[4]/text()').extract()[0]
            num5 = sel.xpath('td[5]/text()').extract()[0]
            print(num1, num2, num3, num4, num5)
            print('\n')
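Hard-coding one URL per page works for three pages but scales poorly. As a small variation on Method 1 (an assumption, not from the original post), the start_urls list can be generated from the page parameter instead:

# Build the page URLs programmatically instead of listing them by hand.
base_url = "http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx?areaid=22572&page={}"
start_urls = [base_url.format(page) for page in range(1, 4)]  # pages 1-3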
Method 2: Build the URL of each page and yield requests for them
import scrapy
from scrapy import Request
from scrapy.selector import Selector
from GZProject.items import *

class GZSpider(scrapy.Spider):
    name = "gznw"  # Guizhou agricultural-products spider
    allowed_domains = ["gznw.gov.cn"]
    start_urls = [
        "http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx?areaid=22572&page=1"
    ]

    def parse(self, response):
        print('----------------Start------------------')
        print(response.url)
        for sel in response.xpath('//tr[@class="odd gradeX"]'):
            item = GzprojectItem()
            num1 = sel.xpath('td[1]/text()').extract()[0]
            num2 = sel.xpath('td[2]/text()').extract()[0]
            num3 = sel.xpath('td[3]/text()').extract()[0]
            num4 = sel.xpath('td[4]/text()').extract()[0]
            num5 = sel.xpath('td[5]/text()').extract()[0]
            print(num1, num2, num3, num4, num5)
            item['num1'] = num1
            item['num2'] = num2
            item['num3'] = num3
            item['num4'] = num4
            item['num5'] = num5
            yield item
            print('\n')

        # Crawl the remaining pages by rewriting the page parameter.
        # Scrapy's default duplicate filter drops requests for URLs that were
        # already scheduled, so yielding pages 2-10 from every response still
        # fetches each page only once.
        i = 2
        while i <= 10:
            next_url = "http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx?areaid=22572&page=" + str(i)
            i = i + 1
            yield Request(next_url)
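Both GZSpider variants import GzprojectItem from GZProject.items without showing it. A minimal sketch, assuming the item only needs the five fields assigned above:

# GZProject/items.py -- a minimal sketch; only the five columns read from the table.
import scrapy

class GzprojectItem(scrapy.Item):
    num1 = scrapy.Field()
    num2 = scrapy.Field()
    num3 = scrapy.Field()
    num4 = scrapy.Field()
    num5 = scrapy.Field()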
Author: Eastmount · Source: CSDN · Original post: https://blog.csdn.net/eastmount/article/details/79307675
Method 3: Follow the "next page" link to crawl the remaining content
If a "next page" link exists, follow it and keep crawling; if the "next page" link is empty, stop. The core code is as follows:
next_url = response.xpath('//a[@class="page-link next"]/@href').extract()
# extract() returns a list, so test for a non-empty list rather than None.
if next_url:
    next_url = 'http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx' + next_url[0]
    yield Request(next_url, callback=self.parse)
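Put together, the snippet sits at the end of parse, so each response first yields its rows and then the request for the following page; the crawl stops by itself once no "next page" link is found. A sketch under that assumption, reusing the row extraction from Method 2:

def parse(self, response):
    # Yield one item per table row, as in Method 2.
    for sel in response.xpath('//tr[@class="odd gradeX"]'):
        item = GzprojectItem()
        item['num1'] = sel.xpath('td[1]/text()').extract()[0]
        item['num2'] = sel.xpath('td[2]/text()').extract()[0]
        item['num3'] = sel.xpath('td[3]/text()').extract()[0]
        item['num4'] = sel.xpath('td[4]/text()').extract()[0]
        item['num5'] = sel.xpath('td[5]/text()').extract()[0]
        yield item
    # Follow the "next page" link if present; when it is missing, no further
    # requests are yielded and the spider finishes.
    next_url = response.xpath('//a[@class="page-link next"]/@href').extract()
    if next_url:
        next_url = 'http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx' + next_url[0]
        yield Request(next_url, callback=self.parse)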
To cap the number of pages crawled, add a counter:
i = 0
next_url = response.xpath('//a[@class="page-link next"]/@href').extract()
# Note: as written, i is a local variable and is reset on every call to parse;
# see the sketch below for a counter that persists across pages.
if next_url and i < 20:
    i = i + 1
    next_url = 'http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx' + next_url[0]
    yield Request(next_url, callback=self.parse)
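Because the local counter above starts from 0 on every call to parse, the i < 20 check never actually limits the crawl. One way to make the cap work (a sketch, not from the original post) is to keep the counter on the spider itself:

import scrapy
from scrapy import Request
from GZProject.items import *

class GZSpider(scrapy.Spider):
    name = "gznw"
    allowed_domains = ["gznw.gov.cn"]
    start_urls = ["http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx?areaid=22572&page=1"]
    page_count = 0   # spider-level counter, persists across calls to parse
    max_pages = 20   # follow at most 20 "next page" links

    def parse(self, response):
        # ... extract and yield GzprojectItem rows exactly as in Method 2 ...
        next_url = response.xpath('//a[@class="page-link next"]/@href').extract()
        if next_url and self.page_count < self.max_pages:
            self.page_count += 1
            yield Request('http://www.gznw.gov.cn/priceInfo/getPriceInfoByAreaId.jx' + next_url[0],
                          callback=self.parse)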