提醒:
当目标站点有反爬措施或者需要爬取的数据量非常大的时候,在爬虫正式启动之前一定要考虑三个问题:
链接去重,数据去重,链接失败重试;
在这三个问题上思考所花的时间是值得的,否则:
1.你将花费更多的时间;2.做大量重复工作;3.影响心情,扰乱思路;
数据方面
目前数据存在空值(包括单个字段空值和所有字段为空的情况),对于此种情况需要提前设计好解决方案;
化整为零,拆解复杂问题,这是第一步,否则问题就会变得更加的复杂;
总结通用的一句话:在项目开始之前要进行风险分析(列出所有风险),制定风险应对策略;
目标
上海市咖啡店(已经有点评id)
url:http://www.dianping.com/shop/114840556/review_all(替换id即可)
全部评论需要登录才能爬:
多搞几个账号,拿到每个账号登陆后的cookies,我搞了四个;
请求间隔10s
部分代码源码参考:希望各位能够积极留言,一块交流:
# -*- coding: utf-8 -*-
import scrapy
import csv
from UA_Pool import uas
import random
import copy
import redis
import re
# Redis client; the spider below records finished page URLs in the
# 'success_urls' set through this connection.
redis_cli = redis.Redis('127.0.0.1', 6379)

# Load the shop rows, skipping blank lines.  newline='' is what the csv
# module asks for so that quoted fields containing line breaks parse
# correctly.
with open('/Users/admin/Documents/scrapy_xiaolong/dianping/dianping/spiders/result.csv', newline='') as f:
    ids = [row for row in csv.reader(f) if row]

# Keep a single row per shop id (first column).  When an id occurs more than
# once the last row wins, and rows stay in order of first appearance.
# A throwaway dict is used instead of binding the builtin name `all`.
ids = list({row[0]: row for row in ids}.values())
def _parse_cookie_header(raw):
    """Turn a raw ``Cookie`` header string into a ``{name: value}`` dict.

    Each ``name=value`` pair is split on the FIRST ``=`` only, so values that
    themselves contain ``=`` (for example the ``__utmz`` analytics cookie) are
    kept intact instead of being truncated.  A pair without ``=`` maps to an
    empty string.  When a name occurs twice the later pair wins.
    """
    jar = {}
    for pair in raw.split(';'):
        pair = pair.strip()
        if not pair:
            continue
        key, _, value = pair.partition('=')
        jar[key] = value
    return jar


# NOTE(review): these are logged-in session cookies hard-coded in source;
# consider loading them from a file or environment outside version control.
cookieshi="cy=1; cityid=1; cye=shanghai; _lxsdk_cuid=167deeb62fbc8-002f1109f2b52a-19306952-1fa400-167deeb62fbc8; _lxsdk=167deeb62fbc8-002f1109f2b52a-19306952-1fa400-167deeb62fbc8; _hc.v=7c771fdd-ecac-f6ff-8191-6e0dbed4a9f3.1545633228; dper=377b8f10bf00f4a331feb1750b4e1f14ae436d9c82866693265469f757246474343819f46d9f7dc34e03849b4db6e4a36624116da5eac50e21010496bd24a6eaa8f40dd286f37cbec4505d1c70d4a8f1444193fa3ef8fe7157b66ed70cc66170; ll=7fd06e815b796be3df069dec7836c3df; ua=dpuser_3971866024; ctu=3fe0ea23f9b2673c148e477b16eef3eeabdb67c299a392c9a164a846b5e769fd; uamo=17621989923; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; s_ViewType=10; cy=1; cye=shanghai; _lxsdk_s=%7C%7C7"
cookiesshi = _parse_cookie_header(cookieshi)

cookiejin="_lxsdk_cuid=165fa0056a9c8-06e424b7714a34-661f1574-100200-165fa0056a9c8; _hc.v=a7e3cbb4-5247-0589-16f8-8a4b9637da38.1537497651; s_ViewType=10; aburl=1; Hm_lvt_dbeeb675516927da776beeb1d9802bd4=1537866327; _adwp=169583271.7700698289.1537866327.1537866327.1537866327.1; _adwr=169583271%230; Hm_lvt_4c4fc10949f0d691f3a2cc4ca5065397=1537866780; _tr.u=Gj26laslJ2hdZRBC; cy=1; cye=shanghai; switchcityflashtoast=1; ua=dpuser_8026154301; ctu=9571195f40a000c3a654b3fa96b938b9f1ff818af7b7e3633f5cf0e16363d335; _lxsdk=F2653370CB9E11E89892833220E2DE53106CA6D44F234F3E97AB79B3601C087D; cityid=1; _dp.ac.v=3fcce72d-d3d6-4b49-860b-cf5165c2aa5c; _lx_utm=utm_source%3Dso.com%26utm_medium%3Dorganic; lgtoken=0f129f20e-74a7-4e86-995b-16e1f7377dd9; dper=2ca0df92e039695466884d9f6d60c7679b26f909372ff25179eab86dc8ecd7929e5d9525b646c74e88918deefa8dee085d0cb5a1cd613d3329183bd47ab735b88d3656d48413002c49440e31fe71dde8f7409b454fc449cac25ae4f83dd93812; ll=7fd06e815b796be3df069dec7836c3df; uamo=18851302258; _lxsdk_s=167cf876fbe-5b9-cb8-3a3%7C%7C23"
cookiesjin = _parse_cookie_header(cookiejin)

cookieshun="_hc.v=47a32e3d-101c-71b0-70ea-791074fa1b8e.1531191007; _lxsdk_cuid=1648218986f54-0615d1cfb87604-5b193413-100200-16482189870c8; _lxsdk=1648218986f54-0615d1cfb87604-5b193413-100200-16482189870c8; s_ViewType=10; ua=%E5%8D%81%E4%B9%9D%E5%85%AB%E4%B8%83_6164; ctu=7d85f600da7b2dc921db2b4ef3eddfeebbf8b3790b6cffc3c1d319e4a0f581dd; _tr.u=VO3eVeVK1EppB3yF; switchcityflashtoast=1; aburl=1; __mta=142401037.1531729308753.1531729476027.1532049564994.3; _adwp=169583271.7265246201.1537866115.1537866115.1537866115.1; citypinyin=shanghai; cityname=5LiK5rW3; Hm_lvt_dbeeb675516927da776beeb1d9802bd4=1537931947,1537947118,1539235531,1539748669; __utma=1.336198293.1540894350.1540894350.1540894350.1; __utmz=1.1540894350.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); cityid=16; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; cy=1; cye=shanghai; default_ab=citylist%3AA%3A1%7Cshop%3AA%3A1%7Cindex%3AA%3A1%7CshopList%3AA%3A1%7Cshopreviewlist%3AA%3A1%7Cmyinfo%3AA%3A1%7Cmap%3AA%3A1%7Csinglereview%3AA%3A1; lgtoken=082d86530-e1e9-45c7-a552-6d5e1aca0ef8; dper=bf91d57c8ecda03d7f489ab57d37c600983b547b5915461e53c34246aef07fd30a3b9a08e25997b5ad2fd4366f64dcc6395aa1cb7931fad2a19fada4987b0182aaaebb595b4afe416940419a2e6d20b8ecdb39992f41e7d57927d651ec6750f1; ll=7fd06e815b796be3df069dec7836c3df; _lxsdk_s=167cf8a3b17-610-58b-878%7C%7C25"
cookiesshun = _parse_cookie_header(cookieshun)

cookie2219="cy=1; cye=shanghai; _lxsdk_cuid=167cf8da760c8-0d1b77c77dd3d7-19306952-1fa400-167cf8da760c8; _lxsdk=167cf8da760c8-0d1b77c77dd3d7-19306952-1fa400-167cf8da760c8; _hc.v=d8cf6ed6-8891-479a-ec9f-6952774246af.1545375427; dper=1868d50956bbbc095ad7e44198e78d9e5984b949b5a8dcd0053feb012fcb5fa165b603184ec85c4d2aba9d252dc7a3ccbf583a50cf281bc463239bef567e8674803b48f2231253991bddb366ec453508b7333bbbb2365d1f27a4d26ff4af9629; ll=7fd06e815b796be3df069dec7836c3df; ua=dpuser_0416641045; ctu=7006a409aad353585deee07304addc66784ad9f3af959b2695d8a3e8fd6de845; uamo=13174792219; _lxsdk_s=167cf8da762-b59-df3-8ad%7C%7C588"
cookies2219 = _parse_cookie_header(cookie2219)

# Pool the spider draws a cookie jar from for every request.  Only the first
# account is enabled; to rotate over all four accounts use
# [cookiesshi, cookiesshun, cookiesjin, cookies2219] instead.
allcookies=[cookiesshi]
# URL template of a shop's first "all reviews" page; format with the shop id.
oriurl = 'http://www.dianping.com/shop/{}/review_all'

# English gist of the note below: failure cases to handle are an empty review
# page, an empty star value on a review, and a 302 redirect; a 10s delay
# between requests proved fairly stable.
"""异常处理:评论页为空,评论的star为空值,302跳转 10s延时较为稳定"""

# Rows already saved by earlier runs; the spider compares new rows against
# this list before writing.  The name shadows the builtin all(), but the
# spider class reads it under this name, so it is kept.
with open('/Users/admin/Documents/scrapy_xiaolong/dianping/dianping/fridayresult.csv') as f:
    all = list(csv.reader(f))
# Review pages still to be fetched, keyed by shop id.  Each shop's pending
# pages form a contiguous range, so the lists are generated from
# (shop id, first page, last page) instead of being spelled out.
urls = {
    shop_id: [
        'http://www.dianping.com/shop/{}/review_all/p{}'.format(shop_id, page)
        for page in range(first_page, last_page + 1)
    ]
    for shop_id, first_page, last_page in (
        ('2849392', 2, 3),
        ('98321252', 2, 13),
    )
}
class V1Spider(scrapy.Spider):
    """Fetch the pending review pages listed in ``urls`` and append each
    review to ``fridayresult.csv``.

    Relies on module-level state defined earlier in this file: ``ids`` (shop
    rows), ``urls`` (pending page URLs per shop id), ``all`` (rows already
    saved), ``allcookies`` (cookie jars), ``uas`` (user-agent strings) and
    ``redis_cli`` (the 'success_urls' set of finished pages).
    """

    name = 'v2'
    # Number of review rows written during this run.
    num = 0

    def start_requests(self):
        """Yield one request per pending page that is not yet marked done."""
        for shop_row in ids:
            shop_id = shop_row[0]
            for url in urls.get(shop_id, ()):
                # Pages finished by an earlier run are recorded in redis.
                if redis_cli.sismember('success_urls', url):
                    continue
                yield self._build_request(url, shop_row, self._referer_for(url))

    @staticmethod
    def _referer_for(url):
        """Return the URL of the page preceding *url* (page 2 -> base page)."""
        page = int(re.findall(r'/review_all/p(\d+)', url)[0])
        if page == 2:
            return url.replace('/p2', '')
        return url.replace('/p{}'.format(page), '/p{}'.format(page - 1))

    def _build_request(self, url, data, referer):
        """Build a request for *url* with a random user agent and cookie jar.

        The target URL, referer and shop row travel in ``meta`` so a failed
        page can be re-requested.  Redirects are not followed and a 302 is
        delivered to the callback instead of being handled by Scrapy.
        """
        headers = {'User-Agent': random.choice(uas), 'Referer': referer}
        cookies = random.choice(allcookies)
        meta = {'data': data, 'url': url, 'Referer': referer,
                'dont_redirect': True, 'handle_httpstatus_list': [302]}
        return scrapy.Request(url, headers=headers, cookies=cookies,
                              dont_filter=True, callback=self.nextpage, meta=meta)

    @staticmethod
    def _parse_review(review, data):
        """Return the shop row extended with reviewer name, star and time.

        Raises IndexError when the reviewer name or the time is missing; a
        missing star rating falls back to '0'.
        """
        name = review.xpath('./div[@class="dper-info"]/a/text()').extract()[0].strip()
        try:
            rank_class = review.xpath('./div[@class="review-rank"]/span[1]/@class').extract()[0]
            star = re.findall(r"(\d+)", rank_class)[0]
        except IndexError:
            # Kept as a string: rows loaded from the CSV hold only strings,
            # so an int 0 would never compare equal during de-duplication.
            star = '0'
        thetime = review.xpath('.//span[@class="time"]/text()').extract()[0].strip()
        return list(data) + [name, star, thetime]

    def _store_row(self, row):
        """Append *row* to the CSV unless an identical row is already known."""
        if row in all:
            return
        with open('fridayresult.csv', 'a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(row)
        # Remember the row so a retry of a partly processed page does not
        # write it a second time.
        all.append(row)
        print('write successfully {}'.format(row))
        self.num += 1
        print('第{}条数据采集成功总量1100条'.format(self.num))

    @staticmethod
    def _dump_failed_page(response):
        """Best-effort debug dump of the failed response to xx.html."""
        try:
            # Explicit encoding: the page text is Chinese and must not depend
            # on the platform's default codec.
            with open('xx.html', 'w', encoding='utf-8') as f:
                f.write(response.text)
        except Exception as dump_error:
            # The dump is only a debugging aid; it must never prevent the
            # retry request from being scheduled.
            print(dump_error)

    def nextpage(self, response):
        """Save every review on the page; re-request the page on any failure.

        A page with no review blocks (for example a 302 or a captcha page)
        counts as a failure.  The page URL is added to 'success_urls' only
        after all of its reviews were processed.
        """
        data = response.meta['data']
        try:
            reviews = response.xpath('//div[@class="main-review"]')
            if not reviews:
                raise ValueError('no review blocks found in {}'.format(response.url))
            for review in reviews:
                self._store_row(self._parse_review(review, data))
        except Exception as e:
            print('请求失败{}'.format(data))
            print(e)
            self._dump_failed_page(response)
            # NOTE(review): retries are unbounded; a page that can never be
            # parsed will be re-requested forever.
            yield self._build_request(response.meta['url'], data, response.meta['Referer'])
        else:
            # Use the originally requested URL, which is the key that
            # start_requests checks.
            redis_cli.sadd('success_urls', response.meta['url'])
反爬方面:我测试了3s、10s、60s三种请求间隔,观察发现都难以避免输入验证码的烦恼。这说明账号只要在线达到一定的时长就会被要求输入验证码;另外,账号和ip的访问频率受限制是已知的;目前测试请求间隔在3秒以上是没有问题的;
网友评论