# 说明:
# 此代码是用 scrapy 框架写的,是代码的主体部分。
# callback 解密代码(js代码): https://www.jianshu.com/p/c98bdcdb790f
# 起初并未加代理,当爬取量(去重后)达到34万的时候 ip 被封;此后换 ip 继续爬,
# 但一个 ip 爬不了多少就会再次被封锁,说明爬虫特征已被对方识别。
# 当爬取达到500个以上酒店的时候就会报错,报错内容是 PyV8 内存溢出,
# 目前没有有效的解决办法,请高手们指教。
# -*- coding: utf-8 -*-
import scrapy
import PyV8
import csv
import re
import math
import time
import redis
# Local Redis instance: the 'ctrip_comments' set collects scraped rows and
# the 'baseids' set tracks hotel ids still pending (popped as pages finish).
redis_cli = redis.Redis(host='127.0.0.1',port=6379)
# total=0
# Path to the JS file whose callback(15) call produces the 'cas' token for
# Ctrip's oceanball endpoint (see get_oceanball below).
JS_PATH = '/Users/admin/Documents/scrapyceshi/ctripcomments/ctripcomments/spiders/callback.js'
# Ctrip AJAX endpoint returning one page of hotel comments.
api="http://hotels.ctrip.com/Domestic/tool/AjaxHotelCommentList.aspx"
# Newline-separated file of hotel ids to crawl.
ids_path='/Users/admin/Documents/scrapyceshi/ctripcomments/ctripcomments/spiders/newids.txt'
class Version1Spider(scrapy.Spider):
    """Crawl Ctrip hotel comments.

    Flow per hotel id:
      1. ``start_requests`` builds headers/params and fetches the
         'oceanball' anti-crawl JS for the id.
      2. ``ocean_parse`` deobfuscates that JS with PyV8 to obtain the
         'eleven' token, then requests the comment-list API.
      3. ``comments_parse`` writes one page of comments to CSV and Redis;
         on the first page it schedules every remaining page.
    """
    name = 'version1'
    # Running count of comment pages successfully parsed.
    total = 0

    def start_requests(self):
        """Yield one oceanball request (to obtain the callback token) per hotel id."""
        params = {"MasterHotelID": "",
                  "hotel": "",
                  "currentPage": "1",
                  "viewVersion": "c",
                  "eleven": "",
                  }
        with open(ids_path) as f:
            content = f.read().strip().split('\n')
        num = 0
        for id in content:
            num += 1
            print('**************************************第{}个id{}开始爬取评论**********************************'.format(num, id))
            id = id.strip()
            headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
                'Referer': 'http://hotels.ctrip.com/hotel/{}.html'.format(id)
            }
            # Best-effort retry until PyV8 token generation succeeds.
            while True:
                try:
                    oceanball, cas = self.get_oceanball()
                    break
                except Exception:  # narrowed from bare except
                    print('等待')
                    time.sleep(2)
            yield scrapy.Request(url=oceanball, headers=headers,
                                 meta={'cas': cas, 'headers': headers, 'id': id, 'params': params},
                                 callback=self.ocean_parse, dont_filter=True)

    def get_oceanball(self):
        """Return ``(oceanball_url, cas)``.

        Runs callback.js in a V8 context: ``callback(15)`` yields the 'cas'
        callback name Ctrip's oceanball endpoint expects, and the current
        epoch-millis timestamp is appended as the ``_`` cache-buster.
        """
        oceanball = 'http://hotels.ctrip.com/domestic/cas/oceanball?callback=%s&_=%s'
        # BUG FIX: close the JS file after reading (the original leaked the handle).
        with open(JS_PATH) as f:
            callback_js = f.read()
        with PyV8.JSContext() as ctxt:
            ctxt.eval('var callback = %s' % callback_js)
            ctxt.eval('cas = callback(15)')
            ctxt.eval('var current_time = (new Date).getTime()')
            vars = ctxt.locals
            cas = vars.cas
            current_time = vars.current_time
        oceanball = oceanball % (cas, int(current_time))
        return (oceanball, cas)

    def ocean_parse(self, response):
        """Deobfuscate the oceanball JS to extract the 'eleven' anti-crawl
        token, then request the comment page recorded in meta['params']."""
        params = response.meta['params']
        print('*' * 8, params['currentPage'])
        ocean = response.body.decode('utf-8')
        cas = response.meta['cas']
        hotel_id = response.meta['id']
        headers = response.meta['headers']
        # Make the packed script return its payload instead of executing it.
        ocean = ocean.replace('eval', 'JSON.stringify')
        while True:
            try:
                # BUG FIX: the original called ctxt.__enter__() and never
                # exited, leaking one V8 context per attempt — the likely
                # cause of the PyV8 out-of-memory crash after ~500 hotels.
                with PyV8.JSContext() as ctxt:
                    unpacked = ctxt.eval(ocean)
                # BUG FIX: bind the intermediate result to a fresh local
                # instead of rebinding `ocean`, so a failed attempt retries
                # from the pristine script rather than a half-transformed one.
                # NOTE(review): eval() on text derived from a remote response
                # is dangerous; kept because the payload is a quoted JS string.
                unpacked = eval(unpacked)
                unpacked = unpacked.replace(cas, 'eleven=' + cas)
                with PyV8.JSContext() as ctxt:
                    # Minimal browser-environment shim the token script probes.
                    ctxt.eval(
                        'var hotel_id = "%s"; var site = {}; site.getUserAgent = function(){}; var Image = function(){}; var window = {}; window.document = {body:{innerHTML:"1"}, documentElement:{attributes:{webdriver:"1"}}, createElement:function(x){return {innerHTML:"1"}}}; var document = window.document;window.navigator = {"appCodeName":"Mozilla", "appName":"Netscape", "language":"zh-CN", "platform":"Win"}; window.navigator.userAgent = site.getUserAgent(); var navigator = window.navigator; window.location = {}; window.location.href = "http://hotels.ctrip.com/hotel/"+hotel_id+".html"; var location = window.location;' % hotel_id)
                    ctxt.eval('var navigator = {userAgent:{indexOf: function(x){return "1"}}, geolocation:"1"}')
                    ctxt.eval('var %s = function(x){return x()}' % cas)
                    ctxt.eval(unpacked)
                    vars = ctxt.locals
                    eleven = vars.eleven
                break
            except Exception:  # narrowed from bare except
                time.sleep(2)
                print('等待')
        params['MasterHotelID'] = hotel_id
        params['hotel'] = hotel_id
        params['eleven'] = eleven
        yield scrapy.FormRequest(url=api, method='GET',
                                 meta={'params': params, 'hotel_id': hotel_id, 'headers': headers},
                                 formdata=params, headers=headers,
                                 dont_filter=True, callback=self.comments_parse)

    def comments_parse(self, response):
        """Write one page of comments to CSV/Redis; on page 1, fan out the rest."""
        params = response.meta['params']
        print('*' * 18, params['currentPage'])
        hotel_id = response.meta['hotel_id']
        headers = response.meta['headers']
        patterns = response.xpath('//div[@class="comment_block J_asyncCmt"]')
        with open('result3.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for pattern in patterns:
                # Fields may be missing on later pages; default each to ''.
                # BUG FIX: room_type/date/guest_type XPaths lacked the
                # leading '.', so they searched the whole document and every
                # row repeated the page's first value; they are now relative
                # to the current comment block (matching the `name` query).
                try:
                    name = pattern.xpath('.//p[@class="name"]/span/text()').extract()[0]
                except IndexError:
                    name = ''
                try:
                    room_type = pattern.xpath('.//a[@class="room J_baseroom_link"]/text()').extract()[0]
                except IndexError:
                    room_type = ''
                try:
                    date = pattern.xpath('.//span[@class="date"]/text()').extract()[0]
                except IndexError:
                    date = ''
                try:
                    guest_type = pattern.xpath('.//span[@class="type"]/text()').extract()[0]
                except IndexError:
                    guest_type = ''
                info = [hotel_id, name, room_type, date, guest_type]
                writer.writerow(info)
                redis_cli.sadd('ctrip_comments', info)
                print(info)
        redis_cli.spop('baseids')
        self.total += 1
        print(self.total)
        try:
            # Total comment count, rendered as e.g. '全部(123)'.
            comments = response.xpath('//span[@id="All_Comment"]/text()').extract()[0]
            comments = re.findall(r'全部\((\d+)\)', comments)[0]
            comments = int(comments)
            print('{}评论总量{}'.format(hotel_id, comments))
        except (IndexError, ValueError):  # narrowed from bare except
            # Anti-crawl handling: an empty body means we were blocked —
            # retry this page through a fresh oceanball/eleven handshake.
            if not response.body.decode('utf-8'):
                oceanball, cas = self.get_oceanball()
                yield scrapy.Request(url=oceanball, headers=headers,
                                     meta={'cas': cas, 'headers': headers, 'id': hotel_id, 'params': params},
                                     callback=self.ocean_parse, dont_filter=True)
                print('{}第{}页重试'.format(hotel_id, params['currentPage']))
        else:
            currentPage = params['currentPage']
            if int(currentPage) == 1:
                # 15 comments per page; schedule pages 2..N from page 1 only.
                if comments > 15:
                    pages = math.ceil(comments / 15)
                    for page in range(2, pages + 1):
                        # Copy params so each scheduled request carries its own page.
                        b = dict(params)
                        print('{}第{}页开始爬'.format(hotel_id, page))
                        oceanball, cas = self.get_oceanball()
                        b['currentPage'] = str(page)
                        print(b)
                        yield scrapy.Request(url=oceanball, headers=headers,
                                             meta={'cas': cas, 'headers': headers, 'id': hotel_id, 'params': b},
                                             callback=self.ocean_parse, dont_filter=True)
                    print('{}第{}页爬完'.format(hotel_id, pages))
            else:
                print('{}正在循环翻页'.format(hotel_id))
# 网友评论 (原文页面残留文本)