美文网首页
攻克大众点评商铺详情页反爬

攻克大众点评商铺详情页反爬

作者: sexy_cyber | 来源:发表于2018-10-29 09:53 被阅读436次

    说明:

    代理已经被修改,勿直接复制粘贴使用;
    代码使用pyspider框架写的,不懂的留言哦;
    反爬相对于上一个帖子的变动是,多了cookie验证;如果cookie失效,那么再去请求Map接口,保存cookie,用此cookie去请求详情页;
    欢迎留言交流,反反爬路上我们携手共进;

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2018-10-23 17:47:08
    # Project: dianping_shop
    
    
    from pyspider.libs.base_handler import *
    import re
    import json
    from redis import Redis
    from lxml import etree
    from fake_useragent import UserAgent
    import requests
    # Redis connection holding the queue of shop ids to scrape (a set, consumed via spop).
    redis_cli=Redis(host='172.22.50.176',port=6379)
    # Mobile "app share" detail page for a shop, formatted with the shop id.
    or_url='https://m.dianping.com/appshare/shop/{}'
    # Mobile map endpoint for a shop: source of coordinates/address and of fresh cookies.
    # NOTE(review): this name shadows the builtin `map`; renaming would require touching
    # every use site, so it is only flagged here.
    map = 'https://m.dianping.com/shop/{}/map'
    # Extracts the embedded "pageInitData" JSON object from the map page's HTML.
    pattern = re.compile(r'"pageInitData":({.+?})')
    # Random User-Agent generator used for every outgoing request.
    ua = UserAgent()
    
    
    
    
    
    class Handler(BaseHandler):
        """pyspider handler that scrapes Dianping (大众点评) shop detail pages.

        Flow:
          1. ``on_start`` pops shop ids from a Redis set and crawls the mobile
             share page for each.
          2. ``index_page`` parses comment count / price / shop type; on a ban
             (non-200 or empty body) it switches proxy IP, fetches fresh
             cookies from the /map endpoint, and retries once.
          3. ``detail_page`` extracts coordinates/address from the /map page
             and emits the merged record via ``send_message``.
        """
        crawl_config = {
        }
        def __init__(self):
            # Seed cookies for the first round of detail-page requests; replaced
            # at runtime with cookies fetched from the /map endpoint whenever a
            # request is banned (see index_page).
            self.cookies ={'m_flash2': '1', 'pvhistory': '6L+U5ZuePjo8L2Vycm9yL2Vycm9yX3BhZ2U+OjwxNTM5ODUyMTE1OTk1XV9b'}


        # Crawl shop detail pages for ids pulled from Redis.
        @every(minutes=100)
        def on_start(self):
            for i in range(30000):
                # num counts ban-retries for this shop id (see index_page).
                num=0
                # NOTE(review): shadows builtin `id`. Also, under Python 3
                # redis returns bytes here (and None once the set is empty,
                # which would make .strip() raise) — confirm the redis-py /
                # Python version this was run on.
                id = redis_cli.spop('dianping_shop:shopid_set')
                id = id.strip()
                url = or_url.format(id)
                headers=self.url_start()
                self.crawl(url,headers=headers,validate_cert=False,proxy='http-xxxx80',cookies=self.cookies,save={'id':id,'url':url,'num':num},callback=self.index_page)


        # Parse price, tags, comment count etc. from the detail page.
        @config(age=10 * 24 * 60 * 60)
        @catch_status_code_error
        def index_page(self, response):
            id = response.save['id']
            num = response.save['num']
            num+=1
            # Banned (non-200 or empty body): switch proxy IP and refresh cookies.
            if response.status_code != 200 or not response.text:
                # Give up after one retry for this shop id.
                if num > 1:
                    if response.text:
                        print(str(response.status_code)+'*'*9)
                    else:
                        print(str(response.status_code)+'响应体为空'+'*'*9)
                    return
                print(response.text)
                proxy = 'http-proxy-sg1.dobel.cn:9180'
                proxies = {
                    'http': 'http://' + proxy,
                }
                header = {
                    "Proxy-Authorization": "Basic U0hNQ0hUVxxxx4NFlHTTJv",
                    'User-Agent':ua.random
                }
                url = response.save['url']
                # Append a throwaway query param — presumably to bypass
                # pyspider's URL-based task dedup on the retry; TODO confirm.
                newurl = url+'?id={}'.format(num)
                # Ask the proxy provider to rotate the outgoing IP.
                # NOTE(review): rebinds `response`, discarding the original
                # (already-consumed) callback response.
                response = requests.get('http://ip.dobel.cn/switch-ip', proxies=proxies, verify=False, headers=header)
                if response.status_code == 200:
                    print('Successfully')
                # Fetch fresh cookies from the map endpoint.
                # NOTE(review): this request does NOT go through the proxy
                # (the proxied variant is commented out below) — confirm
                # whether that is intentional.
                # cookies = requests.get('http://m.dianping.com/shop/{}/map'.format(id),proxies=proxies, headers=header).cookies.get_dict()
                cookies = requests.get('http://m.dianping.com/shop/{}/map'.format(id), headers=header).cookies.get_dict()
                # Cache the fresh cookies for subsequent on_start crawls too.
                self.cookies=cookies
                self.crawl(newurl,headers=header,validate_cert=False,cookies=cookies,proxy='http-proxy-sxxxxxn:9180',save={'id':id,'url':url,'num':num},callback=self.index_page)

            else:
                # The page structure varies; fall through several parse paths.
                try:
                    tree = etree.HTML(response.text)
                except:
                    tree = response.etree
                print('xx')
                # Comment count
                try:
                    comments=tree.xpath('//span[@class="itemNum-val"]/text()')[0]
                    comments = comments+'条'
                except:
                    try:
                        comments=tree.xpath('//div[@class="overview"]/span[2]/text()')[0]
                        print(comments)
                    except Exception as e:
                        print(e)
                        # NOTE(review): falls back to int 0 while the success
                        # paths yield strings — downstream consumers must
                        # handle both types.
                        comments = 0
                # Price (average spend per person)
                try:
                    price=tree.xpath('//span[@class="itemNum"]/following::span[1]/text()')[0]
                    try:
                        # Only keep the value if it is numeric.
                        int(price)
                        price = '¥'+price+'/人'
                    except:
                        price=''
                except:
                    try:
                        price=tree.xpath('//div[@class="overview"]/span[3]/text()')[0]
                    except:
                        price=''

                # Shop type/category. NOTE(review): shadows builtin `type`.
                try:
                    try:
                        type=tree.xpath('//span[@class="subType"]/text()')[0]
                    except:
                        try:
                            type = tree.xpath('//div[@class="tags"][2]/text()')[0]
                        except:
                            try:
                                type = tree.xpath('//div[@class="tags"][1]/text()')[0].strip()
                                # Deliberate ZeroDivisionError: treat a blank
                                # tag as "not found" and fall to the next path.
                                if not type:
                                    1/0
                            except:
                                type = tree.xpath('//span[@class="tag"][2]/text()')[0]
                except:
                    type=''
                    # NOTE(review): '娶' is likely a typo for '取' in this log
                    # message ("could not get type data") — left untouched as
                    # it is a runtime string.
                    print('娶不到类型数据')
                url = map.format(id)

                headers = self.url_start()
                # Fixed cookie string required by the /map endpoint; origin
                # unclear — presumably captured from a real session. TODO confirm.
                cookie = 's_ViewType=10; _hc.v=f096a7a9-db05-418a-3964-c3e4956f8b98.1528948190; _lxsdk=163baac8398c8-067e24fb434afb-3f616c4d-13c680-163baac8399c8; _lxsdk_cuid=163baac8398c8-067e24fb434afb-3f616c4d-13c680-163baac8399c8'
                headers['Cookie']=cookie

                # Parsed fields ride along in `save` and are merged with the
                # map data in detail_page.
                data={'type':type,'comments':comments,'price':price}
                self.crawl(url, headers=headers,validate_cert=False,proxy='http-xxx.cn:9180', save=data,callback=self.detail_page)

        # Extract latitude/longitude and address from the /map endpoint.
        @config(priority=2)
        @catch_status_code_error
        def detail_page(self, response):
            da=response.save
            content = response.text
            try:
                # Pull the embedded "pageInitData" JSON blob out of the HTML.
                r = re.findall(pattern, content)[0]
                r = json.loads(r)
                shopLat=r['shopLat']
                shopLng=r['shopLng']
                shopId=r['shopId']
                shopName=r['shopName']
                address=r['address']
                da['shopLat']=shopLat
                da['shopLng']=shopLng
                da['shopId']=shopId
                da['shopName']=shopName
                da['address']=address
                # Hand the finished record to on_message (pyspider result flow).
                self.send_message(self.project_name, da, url=shopId)
            except Exception as e:
                if response.text:
                    print(str(response.status_code) +'map异常'+'*'*9)
                else:
                    print(str(response.status_code) + 'map异常,响应体为空' + '*'*9)
                print(e)
                print(content)

        def on_message(self, project_name, msg):
            # Pass the assembled record through as this task's result.
            return msg

        def url_start(self):
            """Build request headers with a random User-Agent and the
            (redacted) proxy authorization token."""
            useragent=ua.random
            header = {
                "Proxy-Authorization": "Basic UxxxxlHTTJv",
                'User-Agent':useragent
            }
            return header
    

    相关文章

      网友评论

          本文标题:攻克大众点评商铺详情页反爬

          本文链接:https://www.haomeiwen.com/subject/llgvtqtx.html