Cracking Dianping's Shop Detail Page Anti-Scraping

Author: sexy_cyber | Published 2018-10-29 09:53 · 436 reads

Notes:

The proxy credentials have been redacted, so don't copy-paste the code and run it as-is.
The code is written with the pyspider framework; leave a comment if anything is unclear.
Compared with the previous post, the anti-scraping change is an added cookie check: when the cookie goes stale, request the map endpoint again, save the fresh cookie, and use it to fetch the detail page (sketched below).
Comments are welcome; let's push forward together on the anti-anti-scraping road.
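To make that flow concrete before the full pyspider script, here is a minimal requests-only sketch of the cookie refresh described above. The two endpoints are the same ones used in the script; the shop id is hypothetical and proxy handling is omitted:

import requests
from fake_useragent import UserAgent

ua = UserAgent()
shop_id = '98806266'  # hypothetical shop id

headers = {'User-Agent': ua.random}

# Step 1: hit the map endpoint; the response carries fresh cookies.
map_resp = requests.get('https://m.dianping.com/shop/{}/map'.format(shop_id),
                        headers=headers)
cookies = map_resp.cookies.get_dict()

# Step 2: reuse those cookies on the shop detail page.
detail_resp = requests.get('https://m.dianping.com/appshare/shop/{}'.format(shop_id),
                           headers=headers, cookies=cookies)
print(detail_resp.status_code, len(detail_resp.text))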

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-10-23 17:47:08
# Project: dianping_shop


from pyspider.libs.base_handler import *
import re
import json
from redis import Redis
from lxml import etree
from fake_useragent import UserAgent
import requests
redis_cli = Redis(host='172.22.50.176', port=6379)  # holds the set of shop ids to crawl
or_url = 'https://m.dianping.com/appshare/shop/{}'  # shop detail page
map_url = 'https://m.dianping.com/shop/{}/map'      # map endpoint: sets cookies and embeds geo data
pattern = re.compile(r'"pageInitData":({.+?})')     # JSON blob embedded in the map page
ua = UserAgent()


class Handler(BaseHandler):
    crawl_config = {
    }

    def __init__(self):
        # seed cookies; replaced at runtime once the map endpoint hands out fresh ones
        self.cookies = {'m_flash2': '1', 'pvhistory': '6L+U5ZuePjo8L2Vycm9yL2Vycm9yX3BhZ2U+OjwxNTM5ODUyMTE1OTk1XV9b'}


    # Crawl shop details by shop id
    @every(minutes=100)
    def on_start(self):
        for i in range(30000):
            num = 0
            id = redis_cli.spop('dianping_shop:shopid_set')
            if id is None:  # the id set is exhausted
                break
            if isinstance(id, bytes):  # redis-py returns bytes under Python 3
                id = id.decode('utf-8')
            id = id.strip()
            url = or_url.format(id)
            headers = self.url_start()
            self.crawl(url, headers=headers, validate_cert=False, proxy='http-xxxx80',
                       cookies=self.cookies, save={'id': id, 'url': url, 'num': num},
                       callback=self.index_page)


    # Extract price, tags, and other details from the detail page
    @config(age=10 * 24 * 60 * 60)
    @catch_status_code_error
    def index_page(self, response):
        id = response.save['id']
        num = response.save['num']
        num += 1
        # If we got banned (non-200 or empty body), switch IP and refresh cookies
        if response.status_code != 200 or not response.text:
            if num > 1:
                # already retried once; give up on this shop
                if response.text:
                    print(str(response.status_code) + '*' * 9)
                else:
                    print(str(response.status_code) + ' empty response body ' + '*' * 9)
                return
            print(response.text)
            proxy = 'http-proxy-sg1.dobel.cn:9180'
            proxies = {
                'http': 'http://' + proxy,
            }
            header = {
                "Proxy-Authorization": "Basic U0hNQ0hUVxxxx4NFlHTTJv",
                'User-Agent': ua.random
            }
            url = response.save['url']
            # append a dummy query string so pyspider does not dedupe the retry
            newurl = url + '?id={}'.format(num)
            switch_resp = requests.get('http://ip.dobel.cn/switch-ip', proxies=proxies, verify=False, headers=header)
            if switch_resp.status_code == 200:
                print('IP switched successfully')
            # fetch fresh cookies from the map endpoint
            # cookies = requests.get('http://m.dianping.com/shop/{}/map'.format(id), proxies=proxies, headers=header).cookies.get_dict()
            cookies = requests.get('http://m.dianping.com/shop/{}/map'.format(id), headers=header).cookies.get_dict()
            self.cookies = cookies
            self.crawl(newurl, headers=header, validate_cert=False, cookies=cookies, proxy='http-proxy-sxxxxxn:9180',
                       save={'id': id, 'url': url, 'num': num}, callback=self.index_page)

        else:
            # the page structure varies, so catch exceptions and fall through
            # to alternative parsing strategies
            try:
                tree = etree.HTML(response.text)
            except:
                tree = response.etree
            # review count
            try:
                comments = tree.xpath('//span[@class="itemNum-val"]/text()')[0]
                comments = comments + '条'
            except:
                try:
                    comments = tree.xpath('//div[@class="overview"]/span[2]/text()')[0]
                    print(comments)
                except Exception as e:
                    print(e)
                    comments = 0
            # price per person
            try:
                price = tree.xpath('//span[@class="itemNum"]/following::span[1]/text()')[0]
                try:
                    int(price)
                    price = '¥' + price + '/人'
                except:
                    price = ''
            except:
                try:
                    price = tree.xpath('//div[@class="overview"]/span[3]/text()')[0]
                except:
                    price = ''

            # shop category, with several layout fallbacks
            try:
                try:
                    shop_type = tree.xpath('//span[@class="subType"]/text()')[0]
                except:
                    try:
                        shop_type = tree.xpath('//div[@class="tags"][2]/text()')[0]
                    except:
                        try:
                            shop_type = tree.xpath('//div[@class="tags"][1]/text()')[0].strip()
                            if not shop_type:
                                raise ValueError('empty category')  # fall through to the next selector
                        except:
                            shop_type = tree.xpath('//span[@class="tag"][2]/text()')[0]
            except:
                shop_type = ''
                print('failed to extract the category')
            url = map_url.format(id)

            headers = self.url_start()
            cookie = 's_ViewType=10; _hc.v=f096a7a9-db05-418a-3964-c3e4956f8b98.1528948190; _lxsdk=163baac8398c8-067e24fb434afb-3f616c4d-13c680-163baac8399c8; _lxsdk_cuid=163baac8398c8-067e24fb434afb-3f616c4d-13c680-163baac8399c8'
            headers['Cookie'] = cookie

            data = {'type': shop_type, 'comments': comments, 'price': price}
            self.crawl(url, headers=headers, validate_cert=False, proxy='http-xxx.cn:9180', save=data, callback=self.detail_page)

    # Get latitude/longitude and the address from the map endpoint
    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        da = response.save
        content = response.text
        try:
            # the map page embeds a "pageInitData" JSON blob with the geo data
            r = re.findall(pattern, content)[0]
            r = json.loads(r)
            da['shopLat'] = r['shopLat']
            da['shopLng'] = r['shopLng']
            da['shopId'] = r['shopId']
            da['shopName'] = r['shopName']
            da['address'] = r['address']
            self.send_message(self.project_name, da, url=r['shopId'])
        except Exception as e:
            if response.text:
                print(str(response.status_code) + ' map error ' + '*' * 9)
            else:
                print(str(response.status_code) + ' map error, empty response body ' + '*' * 9)
            print(e)
            print(content)

    def on_message(self, project_name, msg):
        return msg

    # Build request headers with a random User-Agent and the proxy credentials
    def url_start(self):
        useragent = ua.random
        header = {
            "Proxy-Authorization": "Basic UxxxxlHTTJv",
            'User-Agent': useragent
        }
        return header
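Two standalone sketches that may make the parsing code above easier to follow. The first flattens the nested try/except XPath fallbacks from index_page into a small helper (first_xpath is a name I made up, not part of the original script); the second runs the pageInitData regex from detail_page against fabricated input:

import json
import re
from lxml import etree

def first_xpath(tree, exprs, default=''):
    """Return the first non-empty result among several XPath expressions."""
    for expr in exprs:
        result = tree.xpath(expr)
        if result and result[0].strip():
            return result[0].strip()
    return default

tree = etree.HTML('<div><span class="subType">Hotpot</span></div>')  # toy markup
print(first_xpath(tree, [
    '//span[@class="subType"]/text()',
    '//div[@class="tags"][2]/text()',
    '//div[@class="tags"][1]/text()',
    '//span[@class="tag"][2]/text()',
]))  # -> Hotpot

# The pageInitData regex from detail_page, run against fabricated input.
pattern = re.compile(r'"pageInitData":({.+?})')
content = '{"pageInitData":{"shopLat":31.2,"shopLng":121.5,"shopId":1,"shopName":"demo","address":"x"}}'
info = json.loads(pattern.findall(content)[0])
print(info['shopName'], info['shopLat'], info['shopLng'])  # -> demo 31.2 121.5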
