# -*- coding: utf-8 -*-
import scrapy
from UA_Pool import uas
import random
import json
import time
import copy
from ..items import IamapItem7
from ..items import IamapItem30
import hashlib
import redis
import logging
logger = logging.getLogger(__name__)
# 该接口反爬措施为Referer,ip封锁加验证码,高频率用一个cookies请求会封账号
# 账号封锁:二级分类需要验证,cookies不会过期,但是需要极验验证;
#账号封锁:在极验验证成功的基础上,会无限跳验证码,死循环;
# 速度每日2.8万条3s delay
# 2019 1 15日高德位智可能存在的反爬,单日请求总次数限制,请求频率限制,目前观察3s限制仍然反复的跳验证码;
# Target cities: amap adcode -> Chinese city name
# (Beijing, Shanghai, Guangzhou, Shenzhen, Hangzhou, Wuhan, Chengdu).
citys={'110000': '北京', '310000': '上海', '440100': '广州', '440300': '深圳', '330100': '杭州', '420100': '武汉', '510100': '成都'}
# Local Redis instance that holds the logged-in amap cookie strings
# (keys 'cookie' and 'cookie2'..'cookie4', see get_cookies below).
redis_cli = redis.Redis('127.0.0.1',6379)
def get_cookies():
    """Load the stored amap login cookies from Redis and parse them into dicts.

    Reads the Redis keys ``cookie`` (account 1) and ``cookie2``..``cookie4``
    (accounts 2-4); each value is a raw ``name=value; name=value`` header
    string. Returns a list of cookie dicts suitable for the ``cookies=``
    argument of ``scrapy.Request``.
    """
    cookies_li = []
    for i in range(1, 5):
        key = 'cookie' if i == 1 else 'cookie{}'.format(i)
        cookie = redis_cli.get(key)
        if type(cookie) is bytes:
            # redis-py returns bytes under py3; decode for str handling.
            cookie = cookie.decode('utf-8')
        cookies = {}
        for pair in cookie.split('; '):
            # Split on the FIRST '=' only: cookie values may themselves
            # contain '=' (e.g. base64 padding); plain split('=')[1] would
            # silently truncate such values.
            name, _, value = pair.partition('=')
            cookies[name] = value
        cookies_li.append(cookies)
    return cookies_li
# Parse the cookies once at module import; each request picks one at random.
cookies_li = get_cookies()
# Category tree: first-level name -> {second-level name: amap category id}.
# Note '医院' (hospital) has an empty sub-dict, so no requests are generated
# for it by start_requests.
categroys={'景区': {'游乐场': '080501', '风景名胜': '3310', '公园广场': '3320', '博物馆': '3420', '科技馆': '3430'}, '活动场所': {'综合体育场': '080101', '音乐厅': '080602', '剧场': '080603', '美术馆': '140400', '会展中心': '3450'}, '医院': {}, '交通枢纽': {'机场': '3810', '港口码头': '3850'}, '购物场所': {'商场': '060100', '购物中心': '060101', '普通商场': '060102', '商业街': '3910'}}
class V1Spider(scrapy.Spider):
    """Scrape AOI (area-of-interest) foot-traffic data from i.amap.com.

    Pipeline:
      start_requests -> indexparse   : weekly top-20 ranking per city/category
      indexparse     -> secondparse30: detail page (gender / age distribution)
      secondparse30  -> detail30     : 30-day daily counts  -> IamapItem30
      indexparse     -> detail7      : 7-day hourly counts  -> IamapItem7

    Every empty or unparseable response (usually a captcha / block page) is
    re-queued with a fresh random UA and cookie, so blocked requests retry
    indefinitely (dont_filter=True everywhere).
    """
    name = 'v1'

    def start_requests(self):
        """Yield one ranking request per (city, second-level category) pair."""
        # Example ranking URL:
        # https://i.amap.com/service/aoi-ranking?category=060101&type=hot&datetype=week&dateno=20181216&top=100&city=440300
        # dateno is "one week ago"; it does not depend on the loop variables,
        # so compute it once instead of once per category.
        now = time.strftime("%Y%m%d", time.localtime(time.time() - 7 * 24 * 60 * 60))
        for city in citys:
            cityname = citys[city]
            for onename in categroys:
                for twoname in categroys[onename]:
                    caid = categroys[onename][twoname]
                    url = "https://i.amap.com/service/aoi-ranking?category={}&type=hot&datetype=week&dateno={}&top=20&city={}".format(caid, now, city)
                    # Must NOT send a Referer on this endpoint, otherwise the
                    # API returns empty data; no particular UA is required.
                    headers = {"User-Agent": random.choice(uas)}
                    info = {'city': cityname, 'citycode': city, 'firstca': onename, 'secondca': twoname}
                    yield scrapy.Request(url=url, headers=headers,
                                         cookies=random.choice(cookies_li),
                                         callback=self.indexparse,
                                         meta={'data': info, 'caid': caid},
                                         dont_filter=True)

    def indexparse(self, response):
        """Parse one ranking response and fan out per-AOI detail requests.

        For every AOI in the ranking, issues (a) a detail-page request handled
        by secondparse30 and (b) a 7-day hourly index request handled by
        detail7. On empty data or JSON failure the same URL is re-queued.
        """
        olddata = response.meta['data']
        citycode = olddata['citycode']
        # Detail page example: https://i.amap.com/detail/B0FFF5DC6V?adcode=440300
        try:
            data = json.loads(response.text)
            if data['data']:
                for i in data['data']:
                    info = copy.deepcopy(olddata)
                    aoiID = i['aoiId']
                    name = i['name']
                    info['areaname'] = name
                    info['aoiID'] = aoiID
                    url = 'https://i.amap.com/detail/{}?adcode={}'.format(aoiID, citycode)
                    # Unlike the ranking endpoint, detail requests carry a Referer.
                    headers = {"User-Agent": random.choice(uas),
                               'Referer': 'https://i.amap.com/detail/{}?adcode={}'.format(aoiID, citycode)}
                    data30 = copy.deepcopy(info)
                    yield scrapy.Request(url=url, headers=headers,
                                         cookies=random.choice(cookies_li),
                                         meta={'data': data30},
                                         callback=self.secondparse30, dont_filter=True)
                    # 7-day hourly index, window ending yesterday.
                    now = time.strftime("%Y%m%d", time.localtime(time.time() - 1 * 24 * 60 * 60))
                    url = 'https://i.amap.com/service/aoi-index?aoiids={}&end={}&offset=7&byhour=1&refresh=0'.format(aoiID, now)
                    data7 = copy.deepcopy(info)
                    yield scrapy.Request(url, headers=headers,
                                         cookies=random.choice(cookies_li),
                                         meta={'data': data7},
                                         callback=self.detail7, dont_filter=True)
            else:
                print('json解析出来的data为空{}'.format(response.text))
                # Fixed log message: this branch means the JSON parsed but
                # carried no data; it previously logged "json could not be
                # parsed", which belongs to the except branch below.
                logger.info('json解析出来的data为空{}'.format(response.text))
                caid = response.meta['caid']
                headers = {"User-Agent": random.choice(uas)}
                yield scrapy.Request(url=response.url, headers=headers,
                                     cookies=random.choice(cookies_li),
                                     callback=self.indexparse,
                                     meta={'data': olddata, 'caid': caid},
                                     dont_filter=True)
        except Exception:
            # Was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit and make the spider unstoppable.
            print('json无法解析响应体{}'.format(response.text))
            caid = response.meta['caid']
            city = olddata['citycode']
            headers = {"User-Agent": random.choice(uas),
                       'Referer': 'https://i.amap.com/top?category={}&adcode={}'.format(caid, city)}
            yield scrapy.Request(url=response.url, headers=headers,
                                 cookies=random.choice(cookies_li),
                                 callback=self.indexparse,
                                 meta={'data': olddata, 'caid': caid},
                                 dont_filter=True)

    def secondparse30(self, response):
        """Scrape gender/age distribution from the detail page, then request
        the 30-day daily index.

        Any scrape failure (block page, layout change) re-queues the same
        detail page with a fresh UA and cookie.
        """
        data = response.meta['data']
        try:
            # Counts sit in the <span> immediately BEFORE the "男"/"女" label.
            mannum = response.xpath('//span[contains(./text(),"男")]/preceding::span[1]/text()').extract()[0]
            womannum = response.xpath('//span[contains(./text(),"女")]/preceding::span[1]/text()').extract()[0]
            data['man'] = mannum
            data['woman'] = womannum
            # One div per age bucket; bucket label becomes the dict key
            # ('0-19', '20-29', ... — renamed to item fields in detail30).
            patterns = response.xpath('//div[@class="age-view"]/div')
            for pattern in patterns:
                age = pattern.xpath('./span/text()').extract()[0]
                num = pattern.xpath('./div/span/text()').extract()[0]
                data[age] = num
            thred = response.xpath('//span[contains(./text(),"近3天区域指数")]/following::span[1]/text()').extract()[0]
            sevend = response.xpath('//span[contains(./text(),"近7天区域指数")]/following::span[1]/text()').extract()[0]
            data['threday'] = thred
            data['sevenday'] = sevend
            aoiids = data['aoiID']
            # 30-day daily index, window ending yesterday.
            now = time.strftime("%Y%m%d", time.localtime(time.time() - 1 * 24 * 60 * 60))
            url = 'https://i.amap.com/service/aoi-index?aoiids={}&end={}&offset=30&byhour=0&refresh=0'.format(aoiids, now)
            headers = {'User-Agent': random.choice(uas),
                       'Referer': 'https://i.amap.com/detail/{}?adcode={}'.format(aoiids, data['citycode'])}
            yield scrapy.Request(url=url, headers=headers,
                                 cookies=random.choice(cookies_li),
                                 meta={'data': data},
                                 callback=self.detail30, dont_filter=True)
        except Exception:
            # Was a bare `except:`; narrowed so shutdown signals propagate.
            headers = {"User-Agent": random.choice(uas),
                       'Referer': 'https://i.amap.com/detail/{}?adcode={}'.format(data['aoiID'], data['citycode'])}
            yield scrapy.Request(url=response.url, headers=headers,
                                 cookies=random.choice(cookies_li),
                                 meta={'data': data},
                                 callback=self.secondparse30, dont_filter=True)

    def detail30(self, response):
        """Emit one IamapItem30 per day from the 30-day daily index response."""
        data = response.meta['data']
        aoiid = data['aoiID']
        try:
            # json.loads moved INSIDE the try: a captcha/block page used to
            # raise here uncaught and kill the callback instead of retrying.
            info = json.loads(response.text)
            info = info['data'][aoiid]
            for i in info:
                newdata = copy.deepcopy(data)
                # citycode/aoiID were only needed to build URLs; not stored.
                newdata.pop('citycode')
                newdata.pop('aoiID')
                thedate = i[0]   # e.g. '20190115'
                num = i[-1]
                newdata['date'] = thedate[:4] + '-' + thedate[4:6] + '-' + thedate[6:8]
                newdata['num'] = num
                # Rename the age buckets scraped in secondparse30 to the
                # IamapItem30 field names.
                newdata['zero'] = newdata.pop('0-19')
                newdata['twenty'] = newdata.pop('20-29')
                newdata['thirty'] = newdata.pop('30-39')
                newdata['fourty'] = newdata.pop('40-49')
                newdata['fifty'] = newdata.pop('50-59')
                newdata['sixty'] = newdata.pop('>60')
                item = IamapItem30()
                for da in newdata:
                    item[da] = newdata[da]
                # Renamed local from `id` to avoid shadowing the builtin.
                md5_id = self.getmd5(item)
                item['id'] = md5_id
                yield item
            print('该数据30日每日数据保存成功{}'.format(data))
        except Exception:
            # Was a bare `except:`; retry the same index URL.
            headers = {'User-Agent': random.choice(uas),
                       'Referer': 'https://i.amap.com/detail/{}?adcode={}'.format(aoiid, data['citycode'])}
            yield scrapy.Request(url=response.url, headers=headers,
                                 cookies=random.choice(cookies_li),
                                 meta={'data': data},
                                 callback=self.detail30, dont_filter=True)
            print(response.text)

    def detail7(self, response):
        """Emit one IamapItem7 per hour from the 7-day hourly index response."""
        data = response.meta['data']
        aoiid = data['aoiID']
        try:
            # json.loads moved INSIDE the try (see detail30) so block pages
            # are retried instead of raising out of the callback.
            info = json.loads(response.text)
            info = info['data'][aoiid]
            for i in info:
                newdata = copy.deepcopy(data)
                newdata.pop('citycode')
                newdata.pop('aoiID')
                thedate = i[0]   # e.g. '2019011509' — date followed by hour
                num = i[-1]
                newdata['date'] = thedate[:4] + '-' + thedate[4:6] + '-' + thedate[6:8] + ' ' + thedate[8:] + ':00'
                newdata['num'] = num
                item = IamapItem7()
                for da in newdata:
                    item[da] = newdata[da]
                md5_id = self.getmd5(item)
                item['id'] = md5_id
                yield item
            print('该数据7日每小时数据数据保存成功{}'.format(data))
        except Exception as e:
            headers = {"User-Agent": random.choice(uas),
                       'Referer': 'https://i.amap.com/detail/{}?adcode={}'.format(aoiid, data['citycode'])}
            yield scrapy.Request(url=response.url, headers=headers,
                                 cookies=random.choice(cookies_li),
                                 meta={'data': data},
                                 callback=self.detail7, dont_filter=True)
            print(response.text, e)

    def getmd5(self, item):
        """Return the MD5 hex digest of str(item), used as a dedup id.

        Explicit utf-8 encoding keeps this working on both py2 and py3.
        """
        md = hashlib.md5()
        md.update(str(item).encode('utf-8'))
        result = md.hexdigest()
        return result
# 网友评论  (stray page text accidentally pasted into the source — commented out so the module parses)