高德位智 Data Collection: a Python Scraper

Author: sexy_cyber | Posted 2019-01-15 16:19
    # -*- coding: utf-8 -*-
    import scrapy
    from UA_Pool import uas  # uas: a project-local list of User-Agent strings
    import random
    import json
    import time
    import copy
    from ..items import IamapItem7
    from ..items import IamapItem30
    import hashlib
    import redis
    import logging
    logger = logging.getLogger(__name__)
    
    # Anti-scraping on this endpoint: Referer checks, plus IP blocks backed by a
    # captcha; hammering it with a single set of cookies gets the account banned.
    # Account lockout: second-level categories require verification; the cookies
    # never expire, but a Geetest (极验) challenge must be passed.
    # Account lockout: even after passing Geetest, the site can keep throwing new
    # captchas in an endless loop.
    # Throughput: about 28,000 records per day at a 3 s delay.
    
    # Anti-scraping measures likely in place as of 2019-01-15: a cap on total
    # daily requests plus a rate limit; captchas still recur even at a 3 s delay.
    
    
    
    # adcode -> city name (the keys are AMap administrative-area codes)
    citys = {'110000': '北京', '310000': '上海', '440100': '广州', '440300': '深圳', '330100': '杭州', '420100': '武汉', '510100': '成都'}
    # local Redis instance holding the logged-in cookie strings
    redis_cli = redis.Redis('127.0.0.1', 6379)
    
    def get_cookies():
        # Load up to four logged-in cookie strings from Redis (keys: cookie,
        # cookie2, cookie3, cookie4) and parse each 'k1=v1; k2=v2' string into
        # a dict that scrapy.Request accepts.
        cookies_li = []
        for i in range(1, 5):
            key = 'cookie' if i == 1 else 'cookie{}'.format(i)
            cookie = redis_cli.get(key)
            if cookie is None:
                continue  # skip slots that have not been seeded yet
            if type(cookie) is bytes:
                cookie = cookie.decode('utf-8')
            cookies = {}
            for c in cookie.split('; '):
                k, _, v = c.partition('=')  # cookie values may themselves contain '='
                cookies[k] = v
            cookies_li.append(cookies)
        return cookies_li
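    # Hedged example of how the Redis keys above might be seeded (the post never
    # shows this step; the key names follow the reads above, and the cookie
    # string is purely illustrative):
    # redis_cli.set('cookie', 'passport_login=...; sessionid=...')
    # redis_cli.set('cookie2', '...')  # likewise cookie3 and cookie4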
    
    cookies_li = get_cookies()
    
    
    
    # first-level category -> {second-level category: AMap category id}
    categroys={'景区': {'游乐场': '080501', '风景名胜': '3310', '公园广场': '3320', '博物馆': '3420', '科技馆': '3430'}, '活动场所': {'综合体育场': '080101', '音乐厅': '080602', '剧场': '080603', '美术馆': '140400', '会展中心': '3450'}, '医院': {}, '交通枢纽': {'机场': '3810', '港口码头': '3850'}, '购物场所': {'商场': '060100', '购物中心': '060101', '普通商场': '060102', '商业街': '3910'}}
    
    
    
    class V1Spider(scrapy.Spider):
        name = 'v1'
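        # Hedged sketch of per-spider throttling to match the 3 s delay noted in
        # the header comments; these values are assumptions, since the post does
        # not include its settings.py:
        custom_settings = {
            'DOWNLOAD_DELAY': 3,        # roughly one request every 3 s
            'CONCURRENT_REQUESTS': 1,   # serialize requests to reduce captcha hits
        }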
    
        def start_requests(self):
            # sample request and headers, for reference:
            # url = "https://i.amap.com/service/aoi-ranking?category=060101&type=hot&datetype=week&dateno=20181216&top=100&city=440300"
            # headers = {"User-Agent": random.choice(uas), 'Referer': 'https://i.amap.com/top?category=060101&adcode=440300'}
            for city in citys:
                cityname = citys[city]
                for onename in categroys:
                    for twoname in categroys[onename]:
                        caid = categroys[onename][twoname]
                        # weekly ranking, keyed to the date one week back
                        now = time.strftime("%Y%m%d", time.localtime(time.time() - 7 * 24 * 60 * 60))
                        url = "https://i.amap.com/service/aoi-ranking?category={}&type=hot&datetype=week&dateno={}&top=20&city={}".format(caid, now, city)
                        headers = {"User-Agent": random.choice(uas)}  # do NOT send a Referer here, or the endpoint returns empty data; no particular UA is needed either
                        info = {'city': cityname, 'citycode': city, 'firstca': onename, 'secondca': twoname}
                        yield scrapy.Request(url=url, headers=headers, cookies=random.choice(cookies_li), callback=self.indexparse, meta={'data': info, 'caid': caid}, dont_filter=True)
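        # Expected aoi-ranking response shape, inferred from indexparse() below;
        # only the fields the code actually reads are certain, the rest of the
        # payload is an assumption:
        # {"data": [{"aoiId": "B0FFF5DC6V", "name": "...", ...}, ...]}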
    
        def indexparse(self,response):
            olddata=response.meta['data']
            citycode=olddata['citycode']
            # url='https://i.amap.com/detail/B0FFF5DC6V?adcode=440300'
            try:
                data=json.loads(response.text)
                if data['data']:
                    for i in data['data']:
                        info=copy.deepcopy(olddata)
                        aoiID=i['aoiId']
                        name=i['name']
                        info['areaname']=name
                        info['aoiID']=aoiID
                        # detail page: gender/age split plus the 3-day and 7-day indices
                        url = 'https://i.amap.com/detail/{}?adcode={}'.format(aoiID, citycode)
                        headers = {"User-Agent": random.choice(uas), 'Referer': 'https://i.amap.com/detail/{}?adcode={}'.format(aoiID, citycode)}
                        data30 = copy.deepcopy(info)
                        yield scrapy.Request(url=url, headers=headers, cookies=random.choice(cookies_li), meta={'data': data30}, callback=self.secondparse30, dont_filter=True)
                        # hourly index for the last 7 days, ending yesterday
                        now = time.strftime("%Y%m%d", time.localtime(time.time() - 1 * 24 * 60 * 60))
                        url = 'https://i.amap.com/service/aoi-index?aoiids={}&end={}&offset=7&byhour=1&refresh=0'.format(aoiID, now)
                        data7 = copy.deepcopy(info)
                        yield scrapy.Request(url, headers=headers, cookies=random.choice(cookies_li), meta={'data': data7}, callback=self.detail7, dont_filter=True)
                else:
                    print('parsed JSON but the data field is empty: {}'.format(response.text))
                    logger.info('empty data field in response: {}'.format(response.text))
                    caid = response.meta['caid']
                    headers = {"User-Agent": random.choice(uas)}
                    # retry the same ranking URL with a fresh UA and cookie set
                    yield scrapy.Request(url=response.url, headers=headers, cookies=random.choice(cookies_li), callback=self.indexparse,
                                         meta={'data': olddata, 'caid': caid}, dont_filter=True)
            except Exception:
                print('response body is not valid JSON (likely a captcha page): {}'.format(response.text))
                caid = response.meta['caid']
                city = olddata['citycode']
                headers = {"User-Agent": random.choice(uas),
                           'Referer': 'https://i.amap.com/top?category={}&adcode={}'.format(caid, city)}
                yield scrapy.Request(url=response.url, headers=headers, cookies=random.choice(cookies_li), callback=self.indexparse,
                                     meta={'data': olddata, 'caid': caid}, dont_filter=True)
    
    
        def secondparse30(self, response):
            # parse the AOI detail page: gender split, age buckets, 3/7-day indices
            data = response.meta['data']
            try:
                mannum = response.xpath('//span[contains(./text(),"男")]/preceding::span[1]/text()').extract()[0]
                womannum = response.xpath('//span[contains(./text(),"女")]/preceding::span[1]/text()').extract()[0]
                data['man'] = mannum
                data['woman'] = womannum
                # age distribution: one <div> per bucket (0-19, 20-29, ..., >60)
                patterns = response.xpath('//div[@class="age-view"]/div')
                for pattern in patterns:
                    age = pattern.xpath('./span/text()').extract()[0]
                    num = pattern.xpath('./div/span/text()').extract()[0]
                    data[age] = num
                # "近3天区域指数" / "近7天区域指数" = 3-day / 7-day area indices
                thred = response.xpath('//span[contains(./text(),"近3天区域指数")]/following::span[1]/text()').extract()[0]
                sevend = response.xpath('//span[contains(./text(),"近7天区域指数")]/following::span[1]/text()').extract()[0]
                data['threday'] = thred
                data['sevenday'] = sevend
                # follow up with the daily index for the last 30 days
                aoiids = data['aoiID']
                now = time.strftime("%Y%m%d", time.localtime(time.time() - 1 * 24 * 60 * 60))
                url = 'https://i.amap.com/service/aoi-index?aoiids={}&end={}&offset=30&byhour=0&refresh=0'.format(aoiids, now)
                headers = {'User-Agent': random.choice(uas),
                           'Referer': 'https://i.amap.com/detail/{}?adcode={}'.format(aoiids, data['citycode'])}
                yield scrapy.Request(url=url, headers=headers, cookies=random.choice(cookies_li), meta={'data': data}, callback=self.detail30, dont_filter=True)
            except Exception:
                # parsing failed (probably a captcha page); retry with fresh UA/cookies
                headers = {"User-Agent": random.choice(uas),
                           'Referer': 'https://i.amap.com/detail/{}?adcode={}'.format(data['aoiID'], data['citycode'])}
                yield scrapy.Request(url=response.url, headers=headers, cookies=random.choice(cookies_li), meta={'data': data}, callback=self.secondparse30, dont_filter=True)
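        # Expected aoi-index response shape, inferred from detail30()/detail7()
        # below; each row starts with a date string and ends with the index
        # value, the middle columns are an assumption:
        # {"data": {"<aoiId>": [["20190114", ..., 12345], ...]}}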
    
    
        def detail30(self, response):
            # one row per day for the last 30 days: [yyyymmdd, ..., index]
            data = response.meta['data']
            aoiid = data['aoiID']
            try:
                info = json.loads(response.text)  # parse inside try: a captcha page is not JSON
                info = info['data'][aoiid]
                for i in info:
                    newdata = copy.deepcopy(data)
                    newdata.pop('citycode')
                    newdata.pop('aoiID')
                    thedate = i[0]
                    num = i[-1]
                    newdata['date'] = thedate[:4] + '-' + thedate[4:6] + '-' + thedate[6:8]
                    newdata['num'] = num
                    # rename the age buckets scraped in secondparse30 to the item's field names
                    newdata['zero'] = newdata.pop('0-19')
                    newdata['twenty'] = newdata.pop('20-29')
                    newdata['thirty'] = newdata.pop('30-39')
                    newdata['fourty'] = newdata.pop('40-49')
                    newdata['fifty'] = newdata.pop('50-59')
                    newdata['sixty'] = newdata.pop('>60')
                    item = IamapItem30()
                    for da in newdata:
                        item[da] = newdata[da]
                    item['id'] = self.getmd5(item)
                    yield item
                print('30-day daily data saved for {}'.format(data))
            except Exception:
                # likely a captcha page; retry with a fresh UA and cookie set
                headers = {'User-Agent': random.choice(uas),
                           'Referer': 'https://i.amap.com/detail/{}?adcode={}'.format(aoiid, data['citycode'])}
                yield scrapy.Request(url=response.url, headers=headers, cookies=random.choice(cookies_li), meta={'data': data}, callback=self.detail30, dont_filter=True)
                print(response.text)
    
        def detail7(self, response):
            # one row per hour for the last 7 days: [yyyymmddhh, ..., index]
            data = response.meta['data']
            aoiid = data['aoiID']
            try:
                info = json.loads(response.text)  # parse inside try: a captcha page is not JSON
                info = info['data'][aoiid]
                for i in info:
                    newdata = copy.deepcopy(data)
                    newdata.pop('citycode')
                    newdata.pop('aoiID')
                    thedate = i[0]
                    num = i[-1]
                    newdata['date'] = thedate[:4] + '-' + thedate[4:6] + '-' + thedate[6:8] + ' ' + thedate[8:] + ':00'
                    newdata['num'] = num
                    item = IamapItem7()
                    for da in newdata:
                        item[da] = newdata[da]
                    item['id'] = self.getmd5(item)
                    yield item
                print('7-day hourly data saved for {}'.format(data))
            except Exception as e:
                # likely a captcha page; retry with a fresh UA and cookie set
                headers = {"User-Agent": random.choice(uas), 'Referer': 'https://i.amap.com/detail/{}?adcode={}'.format(aoiid, data['citycode'])}
                yield scrapy.Request(url=response.url, headers=headers, cookies=random.choice(cookies_li), meta={'data': data}, callback=self.detail7, dont_filter=True)
                print(response.text, e)
    
    
        def getmd5(self, item):
            # stable md5 over the item's repr, used as a dedup id
            md = hashlib.md5()
            # works on both py2 and py3
            md.update(str(item).encode('utf-8'))
            return md.hexdigest()
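Every item carries an md5 id, so de-duplication presumably happens downstream. A minimal pipeline sketch, assuming the same local Redis instance; the class name, Redis set key, and settings path are hypothetical, since the post does not show its pipelines:

    # pipelines.py -- hypothetical dedup pipeline keyed on item['id']
    import redis
    from scrapy.exceptions import DropItem

    class DedupPipeline(object):
        def __init__(self):
            self.redis_cli = redis.Redis('127.0.0.1', 6379)

        def process_item(self, item, spider):
            # SADD returns 0 when the id is already in the set, i.e. a duplicate
            if self.redis_cli.sadd('iamap:seen_ids', item['id']) == 0:
                raise DropItem('duplicate item: {}'.format(item['id']))
            return item

    # enable it in settings.py (module path is an assumption):
    # ITEM_PIPELINES = {'iamap.pipelines.DedupPipeline': 300}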
