Notes:
The proxy settings have been redacted; do not copy-paste this code and run it as-is.
The code is written with the pyspider framework; leave a comment if anything is unclear.
Compared with the previous post, the anti-scraping change is an added cookie check: if the cookie expires, request the Map endpoint again, save the cookie it returns, and use that cookie to request the detail page (see the sketch after these notes).
Comments and discussion are welcome; let's make progress together on the anti-anti-scraping road.
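In isolation, that cookie-refresh flow looks roughly like this (a minimal sketch using plain requests; the function names are mine, and the headers argument stands in for whatever proxy auth and User-Agent you use):

import requests

def refresh_cookies(shop_id, headers):
    # The lightweight /map endpoint hands back a fresh cookie jar
    # that the detail page will then accept.
    resp = requests.get('http://m.dianping.com/shop/{}/map'.format(shop_id),
                        headers=headers)
    return resp.cookies.get_dict()

def fetch_detail(shop_id, headers):
    # Refresh first, then hit the detail page with the new cookies.
    cookies = refresh_cookies(shop_id, headers)
    return requests.get('https://m.dianping.com/appshare/shop/{}'.format(shop_id),
                        headers=headers, cookies=cookies)

Below is the full pyspider script.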
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-10-23 17:47:08
# Project: dianping_shop
from pyspider.libs.base_handler import *
import re
import json
from redis import Redis
from lxml import etree
from fake_useragent import UserAgent
import requests
# Redis set holding the shop ids to crawl
redis_cli = Redis(host='172.22.50.176', port=6379)
or_url = 'https://m.dianping.com/appshare/shop/{}'  # shop detail page
map_url = 'https://m.dianping.com/shop/{}/map'      # map endpoint, also used to refresh cookies
pattern = re.compile(r'"pageInitData":({.+?})')     # JSON blob embedded in the map page
ua = UserAgent()
class Handler(BaseHandler):
    crawl_config = {
    }

    def __init__(self):
        self.cookies = {'m_flash2': '1', 'pvhistory': '6L+U5ZuePjo8L2Vycm9yL2Vycm9yX3BhZ2U+OjwxNTM5ODUyMTE1OTk1XV9b'}

    # Crawl shop details by shop id
    @every(minutes=100)
    def on_start(self):
        for i in range(30000):
            num = 0
            id = redis_cli.spop('dianping_shop:shopid_set')
            if id is None:
                break  # the id set has been drained
            id = id.decode().strip()  # redis returns bytes
            url = or_url.format(id)
            headers = self.url_start()
            self.crawl(url, headers=headers, validate_cert=False, proxy='http-xxxx80',
                       cookies=self.cookies, save={'id': id, 'url': url, 'num': num},
                       callback=self.index_page)
    # Parse price, tags and other fields from the detail page
    @config(age=10 * 24 * 60 * 60)
    @catch_status_code_error
    def index_page(self, response):
        id = response.save['id']
        num = response.save['num']
        num += 1
        # If we got banned, switch ip and refresh the cookies
        if response.status_code != 200 or not response.text:
            if num > 1:
                # Already retried once; give up on this shop
                if response.text:
                    print(str(response.status_code) + '*' * 9)
                else:
                    print(str(response.status_code) + ' empty response body ' + '*' * 9)
                return
            print(response.text)
            proxy = 'http-proxy-sg1.dobel.cn:9180'
            proxies = {
                'http': 'http://' + proxy,
            }
            header = {
                "Proxy-Authorization": "Basic U0hNQ0hUVxxxx4NFlHTTJv",
                'User-Agent': ua.random
            }
            url = response.save['url']
            # Append a dummy query string so pyspider does not drop the retry as a duplicate task
            newurl = url + '?id={}'.format(num)
            resp = requests.get('http://ip.dobel.cn/switch-ip', proxies=proxies, verify=False, headers=header)
            if resp.status_code == 200:
                print('Successfully switched ip')
            # Fetch fresh cookies from the map endpoint
            # cookies = requests.get('http://m.dianping.com/shop/{}/map'.format(id), proxies=proxies, headers=header).cookies.get_dict()
            cookies = requests.get('http://m.dianping.com/shop/{}/map'.format(id), headers=header).cookies.get_dict()
            self.cookies = cookies
            self.crawl(newurl, headers=header, validate_cert=False, cookies=cookies,
                       proxy='http-proxy-sxxxxxn:9180', save={'id': id, 'url': url, 'num': num},
                       callback=self.index_page)
        else:
            # The page structure varies; catch exceptions and fall through several parsers
            try:
                tree = etree.HTML(response.text)
            except:
                tree = response.etree
            # Comment count
            try:
                comments = tree.xpath('//span[@class="itemNum-val"]/text()')[0]
                comments = comments + '条'
            except:
                try:
                    comments = tree.xpath('//div[@class="overview"]/span[2]/text()')[0]
                    print(comments)
                except Exception as e:
                    print(e)
                    comments = 0
            # Price
            try:
                price = tree.xpath('//span[@class="itemNum"]/following::span[1]/text()')[0]
                try:
                    int(price)
                    price = '¥' + price + '/人'
                except:
                    price = ''
            except:
                try:
                    price = tree.xpath('//div[@class="overview"]/span[3]/text()')[0]
                except:
                    price = ''
            # Category
            try:
                try:
                    type = tree.xpath('//span[@class="subType"]/text()')[0]
                except:
                    try:
                        type = tree.xpath('//div[@class="tags"][2]/text()')[0]
                    except:
                        try:
                            type = tree.xpath('//div[@class="tags"][1]/text()')[0].strip()
                            if not type:
                                raise ValueError('empty category')  # force the next fallback
                        except:
                            type = tree.xpath('//span[@class="tag"][2]/text()')[0]
            except:
                type = ''
                print('failed to extract the category')
            url = map_url.format(id)
            headers = self.url_start()
            cookie = 's_ViewType=10; _hc.v=f096a7a9-db05-418a-3964-c3e4956f8b98.1528948190; _lxsdk=163baac8398c8-067e24fb434afb-3f616c4d-13c680-163baac8399c8; _lxsdk_cuid=163baac8398c8-067e24fb434afb-3f616c4d-13c680-163baac8399c8'
            headers['Cookie'] = cookie
            data = {'type': type, 'comments': comments, 'price': price}
            self.crawl(url, headers=headers, validate_cert=False, proxy='http-xxx.cn:9180',
                       save=data, callback=self.detail_page)
    # Get latitude/longitude and address from the map endpoint
    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        da = response.save
        content = response.text
        try:
            r = re.findall(pattern, content)[0]
            r = json.loads(r)
            da['shopLat'] = r['shopLat']
            da['shopLng'] = r['shopLng']
            da['shopId'] = r['shopId']
            da['shopName'] = r['shopName']
            da['address'] = r['address']
            self.send_message(self.project_name, da, url=r['shopId'])
        except Exception as e:
            if response.text:
                print(str(response.status_code) + ' map error ' + '*' * 9)
            else:
                print(str(response.status_code) + ' map error, empty response body ' + '*' * 9)
            print(e)
            print(content)

    def on_message(self, project_name, msg):
        return msg
    # Build request headers with a random User-Agent
    def url_start(self):
        header = {
            "Proxy-Authorization": "Basic UxxxxlHTTJv",
            'User-Agent': ua.random
        }
        return header
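For reference, detail_page pulls the coordinates out of a "pageInitData" JSON blob embedded in the map page's HTML. A minimal sketch of that extraction on a made-up snippet (the field values below are invented for illustration):

import re
import json

pattern = re.compile(r'"pageInitData":({.+?})')
sample = '... "pageInitData":{"shopId":123,"shopName":"demo","shopLat":31.2,"shopLng":121.5,"address":"somewhere"} ...'

# The non-greedy group stops at the first closing brace, which is enough
# because the embedded object has no nested braces.
data = json.loads(pattern.findall(sample)[0])
print(data['shopLat'], data['shopLng'], data['address'])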