美文网首页
破解携程评论接口callback,eleven加密,获取全部评论

破解携程评论接口callback,eleven加密,获取全部评论

作者: sexy_cyber | 来源:发表于2018-10-29 09:59 被阅读151次

说明:

此代码是scrapy框架写的,是代码的主体部分;
callback解密代码(js代码)https://www.jianshu.com/p/c98bdcdb790f
起初并未加代理,当爬取量(去重后)达到34万时,IP被封;此后更换IP继续爬取,但效果大不如前,单个IP爬不了多少数据就会再次被封锁,说明爬虫已被对方识别。
当爬取达到500个以上酒店的时候就会报错,报错内容是PyV8内存溢出,目前没有有效的解决办法,请高手们指教;

# -*- coding: utf-8 -*-
import csv
import json
import math
import re
import time

import PyV8
import redis
import scrapy
# Shared Redis connection: stores scraped rows in the 'ctrip_comments' set
# and pops processed ids from the 'baseids' set.
redis_cli = redis.Redis(host='127.0.0.1',port=6379)
# total=0

# Local JS file that generates the `callback` (cas) token for the oceanball request.
JS_PATH = '/Users/admin/Documents/scrapyceshi/ctripcomments/ctripcomments/spiders/callback.js'
# Ctrip AJAX endpoint returning one page of hotel comments.
api="http://hotels.ctrip.com/Domestic/tool/AjaxHotelCommentList.aspx"
# Input file: one Ctrip hotel id per line.
ids_path='/Users/admin/Documents/scrapyceshi/ctripcomments/ctripcomments/spiders/newids.txt'
class Version1Spider(scrapy.Spider):
    """Crawl all hotel comments from Ctrip's AJAX comment endpoint.

    Per hotel id the flow is:
      1. start_requests : build a `cas` callback token from the local JS and
         request Ctrip's obfuscated "oceanball" script.
      2. ocean_parse    : run the returned JS under PyV8 to derive the
         `eleven` anti-crawl token, then call the comment-list API.
      3. comments_parse : scrape one page of comments, persist to CSV/Redis,
         and (from page 1) schedule the remaining pages (15 comments/page).
    """
    name = 'version1'
    total = 0  # running count of comments scraped across all hotels

    def start_requests(self):
        """Read hotel ids from `ids_path` and yield one oceanball request
        (page 1) per hotel."""
        base_params = {"MasterHotelID": "",
                       "hotel": "",
                       "currentPage": "1",
                       "viewVersion": "c",
                       "eleven": "",
                       }
        with open(ids_path) as f:
            hotel_ids = f.read().strip().split('\n')
        for num, hotel_id in enumerate(hotel_ids, start=1):
            hotel_id = hotel_id.strip()
            print('**************************************第{}个id{}开始爬取评论**********************************'.format(num, hotel_id))
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
                'Referer': 'http://hotels.ctrip.com/hotel/{}.html'.format(hotel_id)
            }
            # get_oceanball can fail transiently (PyV8 eval) — retry until it works.
            while True:
                try:
                    oceanball, cas = self.get_oceanball()
                    break
                except Exception:
                    print('等待')
                    time.sleep(2)
            # Fix: give every request its OWN params dict. The original put one
            # shared mutable dict into every request's meta and later mutated
            # it in ocean_parse, so concurrent requests clobbered each other.
            yield scrapy.Request(url=oceanball, headers=headers,
                                 meta={'cas': cas, 'headers': headers,
                                       'id': hotel_id, 'params': dict(base_params)},
                                 callback=self.ocean_parse, dont_filter=True)

    def get_oceanball(self):
        """Generate the `cas` callback token and build the oceanball URL.

        Returns:
            tuple (oceanball_url, cas): URL of the obfuscated JS to fetch,
            and the token embedded in it.
        """
        url_template = 'http://hotels.ctrip.com/domestic/cas/oceanball?callback=%s&_=%s'
        # Fix: the original opened the JS file without ever closing it,
        # leaking one file handle per call.
        with open(JS_PATH) as f:
            callback_js = f.read()
        with PyV8.JSContext() as ctxt:
            ctxt.eval('var callback = %s' % callback_js)
            ctxt.eval('cas = callback(15)')
            ctxt.eval('var current_time = (new Date).getTime()')
            js_vars = ctxt.locals
            cas = js_vars.cas
            current_time = js_vars.current_time
        return (url_template % (cas, int(current_time)), cas)

    def ocean_parse(self, response):
        """Execute the obfuscated oceanball JS to obtain the `eleven` token,
        then request the comment-list API for the page in meta['params']."""
        params = response.meta['params']
        print('*' * 8, params['currentPage'])

        raw_ocean = response.body.decode('utf-8')
        cas = response.meta['cas']
        hotel_id = response.meta['id']
        headers = response.meta['headers']
        # Swap the outer eval for JSON.stringify so evaluating the script
        # RETURNS the deobfuscated source instead of executing it.
        raw_ocean = raw_ocean.replace('eval', 'JSON.stringify')
        while True:
            try:
                # Fix: the original called ctxt.__enter__() and never left the
                # context, leaking one PyV8 context per page — the likely cause
                # of the reported "PyV8 memory overflow after ~500 hotels".
                with PyV8.JSContext() as ctxt:
                    stringified = ctxt.eval(raw_ocean)
                # JSON.stringify yields a JSON string literal; decode it with
                # json.loads. Fix: the original used Python eval() on
                # remote-controlled text, which is unsafe. Working on a local
                # copy also keeps `raw_ocean` pristine across retries (the
                # original rebound it, so a retry re-evaluated mangled input).
                deobfuscated = json.loads(stringified)
                deobfuscated = deobfuscated.replace(cas, 'eleven=' + cas)
                with PyV8.JSContext() as ctxt:
                    # Minimal fake browser environment the script probes.
                    ctxt.eval(
                        'var hotel_id = "%s"; var site = {}; site.getUserAgent = function(){}; var Image = function(){}; var window = {}; window.document = {body:{innerHTML:"1"}, documentElement:{attributes:{webdriver:"1"}}, createElement:function(x){return {innerHTML:"1"}}}; var document = window.document;window.navigator = {"appCodeName":"Mozilla", "appName":"Netscape", "language":"zh-CN", "platform":"Win"}; window.navigator.userAgent = site.getUserAgent(); var navigator = window.navigator; window.location = {}; window.location.href = "http://hotels.ctrip.com/hotel/"+hotel_id+".html"; var location = window.location;' % hotel_id)
                    ctxt.eval('var navigator = {userAgent:{indexOf: function(x){return "1"}}, geolocation:"1"}')
                    # The script calls <cas>(fn); make that invoke fn and
                    # capture its result via the eleven= assignment above.
                    ctxt.eval('var %s = function(x){return x()}' % cas)
                    ctxt.eval(deobfuscated)
                    eleven = ctxt.locals.eleven
                break
            except Exception:
                time.sleep(2)
                print('等待')

        page_params = dict(params)  # per-request copy; never mutate shared meta
        page_params['MasterHotelID'] = hotel_id
        page_params['hotel'] = hotel_id
        page_params['eleven'] = eleven
        yield scrapy.FormRequest(url=api, method='GET',
                                 meta={'params': page_params, 'hotel_id': hotel_id, 'headers': headers},
                                 formdata=page_params, headers=headers,
                                 dont_filter=True, callback=self.comments_parse)

    @staticmethod
    def _first(selector, query):
        """Return the first text match of `query` under `selector`, or ''
        when the field is absent (later pages often miss some fields)."""
        matches = selector.xpath(query).extract()
        return matches[0] if matches else ''

    def comments_parse(self, response):
        """Parse one page of comments: write rows to CSV + Redis, then
        schedule the remaining pages from page 1. An empty body means the
        anti-crawl blocked us, so the page is retried with a fresh token."""
        params = response.meta['params']
        print('*' * 18, params['currentPage'])

        hotel_id = response.meta['hotel_id']
        headers = response.meta['headers']
        comment_blocks = response.xpath('//div[@class="comment_block J_asyncCmt"]')
        with open('result3.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for block in comment_blocks:
                # Fix: queries must be RELATIVE ('.//'). The original used
                # absolute '//' paths for room/date/type, so every row on a
                # page got the first comment's values.
                name = self._first(block, './/p[@class="name"]/span/text()')
                room_type = self._first(block, './/a[@class="room J_baseroom_link"]/text()')
                date = self._first(block, './/span[@class="date"]/text()')
                guest_type = self._first(block, './/span[@class="type"]/text()')
                info = [hotel_id, name, room_type, date, guest_type]
                writer.writerow(info)
                # Fix: redis-py cannot encode a Python list as a set member
                # (DataError); store its string form instead.
                redis_cli.sadd('ctrip_comments', str(info))
                print(info)
                redis_cli.spop('baseids')
                self.total += 1
                print(self.total)
        try:
            # Total comment count, e.g. "全部(1234)".
            comment_total = response.xpath('//span[@id="All_Comment"]/text()').extract()[0]
            comment_total = int(re.findall(r'全部\((\d+)\)', comment_total)[0])
            print('{}评论总量{}'.format(hotel_id, comment_total))
        except (IndexError, ValueError):
            # Anti-crawl kicked in: an empty body means we were blocked.
            # Retry this page with a fresh oceanball/eleven token.
            if not response.body.decode('utf-8'):
                oceanball, cas = self.get_oceanball()
                yield scrapy.Request(url=oceanball, headers=headers,
                                     meta={'cas': cas, 'headers': headers, 'id': hotel_id, 'params': params},
                                     callback=self.ocean_parse, dont_filter=True)
                print('{}第{}页重试'.format(hotel_id, params['currentPage']))

        else:
            if int(params['currentPage']) == 1:
                # Only page 1 fans out the rest: 15 comments per page.
                if comment_total > 15:
                    pages = math.ceil(comment_total / 15)
                    for page in range(2, pages + 1):
                        page_params = dict(params)
                        page_params['currentPage'] = str(page)
                        print('{}第{}页开始爬'.format(hotel_id, page))
                        oceanball, cas = self.get_oceanball()
                        print(page_params)
                        yield scrapy.Request(url=oceanball, headers=headers,
                                             meta={'cas': cas, 'headers': headers, 'id': hotel_id, 'params': page_params},
                                             callback=self.ocean_parse, dont_filter=True)
                    print('{}第{}页爬完'.format(hotel_id, pages))
            else:
                print('{}正在循环翻页'.format(hotel_id))

相关文章

网友评论

      本文标题:破解携程评论接口callback,eleven加密,获取全部评论

      本文链接:https://www.haomeiwen.com/subject/udnvtqtx.html