Scraping Dianping reviews with Python (a Scrapy crawler)

Author: sexy_cyber | Published 2018-12-24 20:37

    A few reminders:

    When the target site has anti-scraping measures, or the volume of data to crawl is very large, think through three issues before formally launching the crawler:

    URL de-duplication, data de-duplication, and retrying failed URLs (a minimal Redis sketch follows below).

    The time spent thinking these three issues through is well worth it; otherwise:

    1. you will spend far more time overall; 2. you will do a great deal of duplicate work; 3. it will affect your mood and disrupt your train of thought.
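
    As a concrete illustration, here is a minimal sketch of URL de-duplication and failure-retry bookkeeping backed by a Redis set; it uses the same 'success_urls' key as the spider further down, and assumes a Redis server running locally:

    import redis

    redis_cli = redis.Redis('127.0.0.1', 6379)

    def should_crawl(url):
        # URL de-duplication: only request pages not yet marked as successful
        return not redis_cli.sismember('success_urls', url)

    def mark_done(url):
        # call this only after the page was parsed and its rows were written,
        # so a failed request stays eligible for a retry on the next run
        redis_cli.sadd('success_urls', url)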

    On the data side

    The data currently contains null values (both single empty fields and rows where every field is empty); you need to design a way of handling this in advance.
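
    For instance, rows could be normalized before writing; the function names and the 'N/A' sentinel below are only illustrative assumptions, not part of the original script:

    def is_blank(field):
        return field is None or str(field).strip() == ''

    def normalize_row(row):
        # drop rows where every field is empty; fill single empty fields with a sentinel
        if all(is_blank(field) for field in row):
            return None
        return ['N/A' if is_blank(field) else field for field in row]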

    Break the problem into pieces and decompose the complexity; that is the first step, otherwise the problem only becomes harder.

    One general takeaway: before the project starts, do a risk analysis (list every risk) and work out a response strategy for each one.

    Target

    Coffee shops in Shanghai (the Dianping shop IDs have already been collected)
    url: http://www.dianping.com/shop/114840556/review_all (just substitute the shop ID)
    Crawling the full review list requires being logged in:

    Prepare several accounts and capture each account's cookies after logging in; I used four.
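
    A raw Cookie header copied from the browser's developer tools can be turned into the dict that scrapy.Request's cookies argument expects with a small helper like the sketch below (the script further down does the same parsing inline for each account):

    def cookie_header_to_dict(header):
        # split each pair on the first '=' only, so values containing '=' survive intact
        return dict(pair.split('=', 1) for pair in header.split('; '))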

    Request interval: 10 s
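
    In Scrapy this interval is normally configured through the standard download-delay settings rather than inside the spider; a minimal sketch for settings.py, assuming these values live in the project settings (the spider below does not set them itself):

    DOWNLOAD_DELAY = 10                  # seconds between requests to the same site
    RANDOMIZE_DOWNLOAD_DELAY = True      # Scrapy jitters the delay between 0.5x and 1.5x
    CONCURRENT_REQUESTS_PER_DOMAIN = 1   # keep requests strictly sequential per domain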

    Partial source code for reference; I hope everyone will actively leave comments so we can discuss it together:

    # -*- coding: utf-8 -*-
    import scrapy
    import csv
    from UA_Pool import uas
    import random
    import copy
    import redis
    import re
    
    
    redis_cli = redis.Redis('127.0.0.1', 6379)

    # shop rows scraped earlier; de-duplicated by shop id (the first column)
    ids = []
    with open('/Users/admin/Documents/scrapy_xiaolong/dianping/dianping/spiders/result.csv') as f:
        readers = csv.reader(f)
        for line in readers:
            if line:
                ids.append(line)
    unique_shops = {}
    for i in ids:
        unique_shops[i[0]] = i
    ids = list(unique_shops.values())
    
    
    
    cookieshi="cy=1; cityid=1; cye=shanghai; _lxsdk_cuid=167deeb62fbc8-002f1109f2b52a-19306952-1fa400-167deeb62fbc8; _lxsdk=167deeb62fbc8-002f1109f2b52a-19306952-1fa400-167deeb62fbc8; _hc.v=7c771fdd-ecac-f6ff-8191-6e0dbed4a9f3.1545633228; dper=377b8f10bf00f4a331feb1750b4e1f14ae436d9c82866693265469f757246474343819f46d9f7dc34e03849b4db6e4a36624116da5eac50e21010496bd24a6eaa8f40dd286f37cbec4505d1c70d4a8f1444193fa3ef8fe7157b66ed70cc66170; ll=7fd06e815b796be3df069dec7836c3df; ua=dpuser_3971866024; ctu=3fe0ea23f9b2673c148e477b16eef3eeabdb67c299a392c9a164a846b5e769fd; uamo=17621989923; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; s_ViewType=10; cy=1; cye=shanghai; _lxsdk_s=%7C%7C7"
    
    cookiesshi = {}
    for c in cookieshi.split('; '):
        key, value = c.split('=', 1)   # split on the first '=' only, in case a value itself contains '='
        cookiesshi[key] = value
    
    cookiejin="_lxsdk_cuid=165fa0056a9c8-06e424b7714a34-661f1574-100200-165fa0056a9c8; _hc.v=a7e3cbb4-5247-0589-16f8-8a4b9637da38.1537497651; s_ViewType=10; aburl=1; Hm_lvt_dbeeb675516927da776beeb1d9802bd4=1537866327; _adwp=169583271.7700698289.1537866327.1537866327.1537866327.1; _adwr=169583271%230; Hm_lvt_4c4fc10949f0d691f3a2cc4ca5065397=1537866780; _tr.u=Gj26laslJ2hdZRBC; cy=1; cye=shanghai; switchcityflashtoast=1; ua=dpuser_8026154301; ctu=9571195f40a000c3a654b3fa96b938b9f1ff818af7b7e3633f5cf0e16363d335; _lxsdk=F2653370CB9E11E89892833220E2DE53106CA6D44F234F3E97AB79B3601C087D; cityid=1; _dp.ac.v=3fcce72d-d3d6-4b49-860b-cf5165c2aa5c; _lx_utm=utm_source%3Dso.com%26utm_medium%3Dorganic; lgtoken=0f129f20e-74a7-4e86-995b-16e1f7377dd9; dper=2ca0df92e039695466884d9f6d60c7679b26f909372ff25179eab86dc8ecd7929e5d9525b646c74e88918deefa8dee085d0cb5a1cd613d3329183bd47ab735b88d3656d48413002c49440e31fe71dde8f7409b454fc449cac25ae4f83dd93812; ll=7fd06e815b796be3df069dec7836c3df; uamo=18851302258; _lxsdk_s=167cf876fbe-5b9-cb8-3a3%7C%7C23"
    cookiesjin = {}
    for c in cookiejin.split('; '):
        key, value = c.split('=', 1)
        cookiesjin[key] = value
    
    
    cookieshun="_hc.v=47a32e3d-101c-71b0-70ea-791074fa1b8e.1531191007; _lxsdk_cuid=1648218986f54-0615d1cfb87604-5b193413-100200-16482189870c8; _lxsdk=1648218986f54-0615d1cfb87604-5b193413-100200-16482189870c8; s_ViewType=10; ua=%E5%8D%81%E4%B9%9D%E5%85%AB%E4%B8%83_6164; ctu=7d85f600da7b2dc921db2b4ef3eddfeebbf8b3790b6cffc3c1d319e4a0f581dd; _tr.u=VO3eVeVK1EppB3yF; switchcityflashtoast=1; aburl=1; __mta=142401037.1531729308753.1531729476027.1532049564994.3; _adwp=169583271.7265246201.1537866115.1537866115.1537866115.1; citypinyin=shanghai; cityname=5LiK5rW3; Hm_lvt_dbeeb675516927da776beeb1d9802bd4=1537931947,1537947118,1539235531,1539748669; __utma=1.336198293.1540894350.1540894350.1540894350.1; __utmz=1.1540894350.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); cityid=16; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; cy=1; cye=shanghai; default_ab=citylist%3AA%3A1%7Cshop%3AA%3A1%7Cindex%3AA%3A1%7CshopList%3AA%3A1%7Cshopreviewlist%3AA%3A1%7Cmyinfo%3AA%3A1%7Cmap%3AA%3A1%7Csinglereview%3AA%3A1; lgtoken=082d86530-e1e9-45c7-a552-6d5e1aca0ef8; dper=bf91d57c8ecda03d7f489ab57d37c600983b547b5915461e53c34246aef07fd30a3b9a08e25997b5ad2fd4366f64dcc6395aa1cb7931fad2a19fada4987b0182aaaebb595b4afe416940419a2e6d20b8ecdb39992f41e7d57927d651ec6750f1; ll=7fd06e815b796be3df069dec7836c3df; _lxsdk_s=167cf8a3b17-610-58b-878%7C%7C25"
    cookiesshun = {}
    for c in cookieshun.split('; '):
        key, value = c.split('=', 1)
        cookiesshun[key] = value
    
    cookie2219="cy=1; cye=shanghai; _lxsdk_cuid=167cf8da760c8-0d1b77c77dd3d7-19306952-1fa400-167cf8da760c8; _lxsdk=167cf8da760c8-0d1b77c77dd3d7-19306952-1fa400-167cf8da760c8; _hc.v=d8cf6ed6-8891-479a-ec9f-6952774246af.1545375427; dper=1868d50956bbbc095ad7e44198e78d9e5984b949b5a8dcd0053feb012fcb5fa165b603184ec85c4d2aba9d252dc7a3ccbf583a50cf281bc463239bef567e8674803b48f2231253991bddb366ec453508b7333bbbb2365d1f27a4d26ff4af9629; ll=7fd06e815b796be3df069dec7836c3df; ua=dpuser_0416641045; ctu=7006a409aad353585deee07304addc66784ad9f3af959b2695d8a3e8fd6de845; uamo=13174792219; _lxsdk_s=167cf8da762-b59-df3-8ad%7C%7C588"
    
    
    cookies2219 = {}
    for c in cookie2219.split('; '):
        key, value = c.split('=', 1)
        cookies2219[key] = value
    
    # allcookies = [cookiesshi, cookiesshun, cookiesjin, cookies2219]   # rotate across all four accounts
    allcookies = [cookiesshi]
    
    oriurl = 'http://www.dianping.com/shop/{}/review_all'


    """Exception handling to cover: empty review pages, empty star values, 302 redirects; a 10 s delay proved fairly stable."""

    # rows already written to fridayresult.csv on previous runs, used for data de-duplication
    written_rows = []
    with open('/Users/admin/Documents/scrapy_xiaolong/dianping/dianping/fridayresult.csv') as f:
        readers = csv.reader(f)
        for one in readers:
            written_rows.append(one)
    
    urls={'2849392': ['http://www.dianping.com/shop/2849392/review_all/p2', 'http://www.dianping.com/shop/2849392/review_all/p3'],'98321252': ['http://www.dianping.com/shop/98321252/review_all/p2', 'http://www.dianping.com/shop/98321252/review_all/p3', 'http://www.dianping.com/shop/98321252/review_all/p4', 'http://www.dianping.com/shop/98321252/review_all/p5', 'http://www.dianping.com/shop/98321252/review_all/p6', 'http://www.dianping.com/shop/98321252/review_all/p7', 'http://www.dianping.com/shop/98321252/review_all/p8', 'http://www.dianping.com/shop/98321252/review_all/p9', 'http://www.dianping.com/shop/98321252/review_all/p10', 'http://www.dianping.com/shop/98321252/review_all/p11', 'http://www.dianping.com/shop/98321252/review_all/p12', 'http://www.dianping.com/shop/98321252/review_all/p13']}
    
    class V1Spider(scrapy.Spider):
        name = 'v2'
        num = 0   # counter of successfully collected pages
        # the 10 s request interval is assumed to be set via DOWNLOAD_DELAY in settings.py

        def start_requests(self):
            for id in ids:
                i = id[0]   # shop id is the first column of the row
                if i in urls:
                    value = urls[i]
                    for url in value:
                        pages = int(re.findall(r'/review_all/p(\d+)', url)[0])
                        # use the previous review page as a plausible Referer
                        if pages == 2:
                            referer = url.replace('/p2', '')
                        else:
                            referer = url.replace('/p{}'.format(pages), '/p{}'.format(pages - 1))

                        headers = {'User-Agent': random.choice(uas), 'Referer': referer}
                        cookies = random.choice(allcookies)
                        # URL de-duplication: skip pages already recorded as successful
                        if not redis_cli.sismember('success_urls', url):
                            meta = {'data': id, 'url': url, 'Referer': referer,
                                    'dont_redirect': True, 'handle_httpstatus_list': [302]}
                            yield scrapy.Request(url, headers=headers, cookies=cookies, dont_filter=True,
                                                 callback=self.nextpage, meta=meta)
    
    
        def nextpage(self, response):
            data = response.meta['data']
            try:
                patterns = response.xpath('//div[@class="main-review"]')
                if not patterns:
                    # no review blocks on the page (probably blocked or redirected):
                    # raise so the except branch below re-queues the request
                    raise ValueError('empty review page')
                for pattern in patterns:
                    name = pattern.xpath('./div[@class="dper-info"]/a/text()').extract()[0].strip()
                    try:
                        star = pattern.xpath('./div[@class="review-rank"]/span[1]/@class').extract()[0]
                        star = re.findall(r"(\d+)", star)[0]
                    except Exception:
                        star = 0   # empty star value
                    thetime = pattern.xpath('.//span[@class="time"]/text()').extract()[0].strip()
                    newdata = copy.deepcopy(data)
                    newdata = newdata + [name, star, thetime]

                    # data de-duplication: only write rows that are not in the csv already
                    if newdata not in written_rows:
                        with open('fridayresult.csv', 'a', encoding='utf-8', newline='') as f:
                            writer = csv.writer(f)
                            writer.writerow(newdata)
                            print('write successfully {}'.format(newdata))
                redis_cli.sadd('success_urls', response.url)
                self.num += 1
                print('collected {} so far (total to crawl: 1100)'.format(self.num))

            except Exception as e:
                # failed request: dump the page for inspection, then re-queue it with a
                # fresh User-Agent / cookie combination
                print('request failed {}'.format(data))
                print(e)
                referer = response.meta['Referer']
                with open('xx.html', 'w') as f:
                    f.write(response.text)
                cookies = random.choice(allcookies)
                headers = {'User-Agent': random.choice(uas), 'Referer': referer}
                meta = {'data': data, 'url': response.meta['url'], 'Referer': headers['Referer'],
                        'dont_redirect': True, 'handle_httpstatus_list': [302]}
                yield scrapy.Request(response.meta['url'], headers=headers, cookies=cookies,
                                     callback=self.nextpage, meta=meta, dont_filter=True)
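
    Assuming the snippet above sits in an ordinary Scrapy project (with the delay settings mentioned earlier in settings.py), it is started from the project root with the usual command "scrapy crawl v2", matching the spider's name attribute.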
    
    
    
    
    

    On anti-scraping: I tested delays of 3 s, 10 s, and 60 s, and none of them escapes the hassle of entering captchas. This suggests that once an account has been online for a certain length of time it will be asked for a captcha regardless; it is also already known that both account-level and IP-level request rates are limited. So far, delays of 3 s or more have caused no problems.
