# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import re
import time
import json
import random
# Target video page. The last assignment wins; earlier candidates are kept
# commented out for reference instead of as dead reassignments.
# url = 'https://www.iqiyi.com/v_1z5beeqw150.html'
# url = 'https://www.iqiyi.com/v_jht5qv2y7k.html'
# url = 'https://www.iqiyi.com/v_tbj74ule08.html'
url = 'https://www.iqiyi.com/v_19rw306zp8.html'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/94.0.4606.81 Safari/537.36'
cookie = 'sn-assist={"show":false,"audio":false,"speed":"middle","zomm":1,"cursor":false,"pointer":false,"bigtext":false,"overead":false,"bgcolor":false}; isScale=false; _snvd=1634609426115AdhC1LXO4xC; tradeMA=137; cityId=9051; districtId=10346; hm_guid=fc453f86-178f-4108-963d-c6de8332e851; totalProdQty=0; SN_CITY=190_755_1000051_9051_01_10346_1_1_99_7550199; _df_ud=462966b2-f8b5-4d70-8607-852014a9a6e3; _snmc=1; _snsr=baidu|brand||title|suning:brand; authId=sin5aINSpE0u6Rl0Pc9aBk7S5MrM8WCoPa; secureToken=56DA93357439DB6E8EDFB00E413167DB; ssotbrd=TGTVyLKJDzszqNUH6W1V0CpV0pEm0IAfcVz3FSoW1eW; streetCode=7550199; _snzwt=THFA3x17c9b4393975Lmu34b3; _snms=16346924830053819; route=866d5a3e6e0a7894cd5f6be4e51b1c51; _snadtp=1; _snadid=HT_40097935_100000006_12203835633_20211020; smhst=12203835633|0071201482a12290322739|0071201482a12122946310|0000000000a12289097817|0071201482; _snma=1|163460942575786614|1634609425757|1634692548010|1634692575751|10|3; _snmp=163469257503650408; _snmb=163469246909449769|1634692575778|1634692575757|8; token=64466f3f-496d-4aad-ada1-dd1bcbc95d80'
headers = {'User-Agent': user_agent, 'Cookie': cookie}
# Fetch the video page and parse it.
wb_data = requests.get(url, headers=headers)
soup = BeautifulSoup(wb_data.text, 'html.parser')
print(soup.text)
# Extract the video's unique id from inline JS of the form: param['tvid'] = "1234567890"
# (the original literal had broken quote nesting — a SyntaxError — fixed here
#  with a raw string and escaped brackets/quotes)
tvid_raw = re.findall(r'param\[\'tvid\'\]\s=\s"(\d+)"', soup.text)
if tvid_raw:
    tvid = tvid_raw[0]
    print(tvid)
else:
    print('视频唯一ID未获取!')
    exit(1)
taskName = ''  # task name (unused placeholder)
platform_name = '爱奇艺'  # channel / platform name
# Page id is the last path segment without its ".html" suffix,
# e.g. 'v_19rw306zp8' from 'https://www.iqiyi.com/v_19rw306zp8.html'.
item_id = url.rsplit('/', 1)[-1].split('.')[0]
print('item_id', item_id)
title = soup.select('.title-txt')[0].get_text()  # content title
print('title', title)
article_url = url  # content link
# Body text: strip surrounding whitespace and embedded newlines.
content = soup.find('span', class_="content-paragraph").get_text().strip().replace('\n', '').replace('\r', '')
print('content', content)
# Creator account name is embedded in a JSON fragment like '123","name":"xxx","id"'.
media_name_raw = re.findall(r'\d+","name":"(\w+)","id"', soup.text)
if not media_name_raw:
    media_name = '创作者账户名获取失败!!!'
else:
    media_name = media_name_raw[0]  # creator account name
print('media_name', media_name)
read_count = ''  # view / play count — not available on the page
comments_count = soup.select('.func-name')[0].get_text()  # total comment count
print('comments_count', comments_count)
datetime = ''  # upload time — not available yet
def variable_time_stamp():
    """
    Return the millisecond timestamp string used in request URLs.

    The original implementation concatenated the first three digits of
    ``str(time.time())``'s fractional part, which yields a short string
    whenever the float repr carries fewer than three fractional digits.
    Computing ``int(time.time() * 1000)`` always produces a proper
    13-digit millisecond timestamp.

    :return: 13-digit millisecond timestamp as a string
    """
    return str(int(time.time() * 1000))
# Like-count endpoint. NOTE: the original text contained '×tamp' — the result
# of '&times' being HTML-entity-decoded out of '&timestamp' — restored here.
like_count_url_base = 'https://mpaas.iqiyi.com/common/sns_like/?agentType=1012&businessType=14&entityId={tvid}&f_userId=0&m_device_id=7262d3198b5f48d8fe6d5567c0e70f8b&proxyUri=count&sign=9d4b6ed1b07ce24c5c524773ec55c442&timestamp={time_stamp}&userId=0&callback=jsonp_{time_stamp2}_5532'
time_stamp = variable_time_stamp()
like_count_url = like_count_url_base.format(tvid=tvid, time_stamp=time_stamp, time_stamp2=str(int(time_stamp) + 2))
print(like_count_url)
# The dynamically built URL is overridden below with a captured working URL
# (its 'sign' matches its hard-coded timestamp); the dynamic one is printed
# above only for debugging.
like_count_url = 'https://mpaas.iqiyi.com/common/sns_like/?agentType=1012&businessType=14&entityId=7281760754194600&f_userId=0&m_device_id=7262d3198b5f48d8fe6d5567c0e70f8b&proxyUri=count&sign=9d4b6ed1b07ce24c5c524773ec55c442&timestamp=1635749920673&userId=0&callback=jsonp_1635749920675_5532'
like_count_data = requests.get(like_count_url, headers=headers)
like_count_soup = BeautifulSoup(like_count_data.text, 'html.parser')
print(like_count_soup)  # TODO analyse the like-count response format
# Response looks like ..."code":"A00000","content":"123"... — grab the count.
like_count_raw = re.findall(r'"code":\S+,"content":"(\d*)"', like_count_soup.text)
if like_count_raw:
    like_count = like_count_raw[0]  # like count (async endpoint)
else:
    like_count = '点赞数未获取!'
print(like_count)
def comment_time_handle(para_time):
    """
    Convert a numeric epoch time from the page into a readable local time.

    :param para_time: Unix timestamp in seconds (int or numeric string)
    :return: local time formatted as ``%Y-%m-%d %H:%M:%S``
    """
    tupTime = time.localtime(int(para_time))  # seconds-resolution timestamp
    return time.strftime("%Y-%m-%d %H:%M:%S", tupTime)  # comment time
def special_treatment(comments_str):
    """
    Scrub a known-bad span from the raw comment payload.

    A specific commenter's content embeds stray double quotes that break
    ``json.loads``; collapse everything from the stray '""' up to and
    including the '荷 o O"' marker into a clean quoted marker.

    :param comments_str: raw (possibly malformed) comment JSON text
    :return: cleaned text safe to pass to ``json.loads``
    """
    pattern = r'""[\s\S]*荷 o O"'
    comments_str_treat = re.sub(pattern, '"荷 o O"', comments_str)
    return comments_str_treat
def get_comments_json_data(comments_url):
    """
    Fetch one JSONP comment page and return its payload as a dict.

    The endpoint wraps JSON in ``try{ jsonp_<ts>_<n>( ... ) }catch(e){};`` —
    strip the wrapper by measuring the prefix/suffix, then parse. If parsing
    fails on known-malformed content, scrub it with ``special_treatment``
    and retry.

    :param comments_url: fully formatted get_comments.action URL
    :return: parsed JSON payload (dict)
    """
    time.sleep(random.random() + 1)  # polite 1–2 s delay between requests
    comments_data = requests.get(comments_url, headers=headers)
    comments_soup = BeautifulSoup(comments_data.text, 'html.parser')
    text = str(comments_soup.text)
    # NOTE: the original pattern ended with an unescaped '(' which raises
    # re.error at runtime; it is escaped here.
    prefix = re.findall(r'^try{\s+jsonp_\d+_\d+\(', text)
    start_index = len(prefix[0])
    end_index = len(text) - len(') }catch(e){};')
    comments_json_data_raw = text[start_index:end_index]
    try:
        json_data = json.loads(comments_json_data_raw)
    except json.JSONDecodeError:
        # Known bad payloads contain stray quotes — scrub and reparse.
        # (The original bare except could also hit NameError on
        # comments_json_data_raw when the request itself failed.)
        print('出现异常数据!特殊处理')
        comments_json_data_raw = special_treatment(comments_json_data_raw)
        json_data = json.loads(comments_json_data_raw)
    return json_data
# First page: mixed hot+time comments; subsequent pages paginate by last_id.
first_comments_url_base = 'https://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=null&business_type=17&channel_id=10&content_id={tvid}&hot_size=10&last_id=&page=&page_size=10&types=hot,time&callback=jsonp_{time_stamp}_56720'
first_comments_url = first_comments_url_base.format(tvid=tvid, time_stamp=variable_time_stamp())
comments_json_data = get_comments_json_data(first_comments_url)
comments_list = comments_json_data.get('data').get('comments')
print(comments_list)
comments_num = 0  # number of comments crawled so far
last_id = ''  # initialized so the pagination URL never hits a NameError
while 1:
    if comments_list and comments_num < int(comments_count):
        for item in comments_list:
            item_id = item.get('id')  # last commenter's id, used for paging
            user_id = item.get('userInfo').get('uid')  # top-level commenter id
            user_name = item.get('userInfo').get('uname')  # top-level commenter name
            comment_like_count = item.get('likes')  # top-level comment likes
            text = item.get('content')  # top-level comment body
            comment_time = comment_time_handle(item.get('addTime'))  # top-level comment time
            comments_num += 1
            print('comments_num:', comments_num)
            print(item_id, user_id, user_name, comment_like_count, comment_time, text)
            fold_comments_list = item.get('replies')  # second-level comments (replies)
            if fold_comments_list:
                for sub_item in fold_comments_list:
                    sub_user_name = sub_item.get('userInfo').get('uname')  # reply author name
                    sub_comment_like_count = sub_item.get('likes')  # reply likes
                    sub_text = sub_item.get('content')  # reply body
                    sub_comment_time = comment_time_handle(sub_item.get('addTime'))  # reply time
                    comments_num += 1
                    print('comments_num:', comments_num)
                    print(sub_user_name, sub_comment_like_count, sub_comment_time, sub_text)
            last_id = item_id
    elif comments_list is None and comments_num < int(comments_count):
        # NOTE(review): this branch retries the same page indefinitely if the
        # endpoint keeps returning no data — consider a retry cap.
        print('comments_list', comments_list)
        print('出现异常数据!')
    else:
        print('comments_list', comments_list)
        print('已无评论数据,退出循环!')
        break
    comments_url_base = 'https://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&authcookie=null&business_type=17&channel_id=10&content_id={tvid}&hot_size=0&last_id={last_id}&page=&page_size=20&types=time&callback=jsonp_{time_stamp}_99118'
    comments_url = comments_url_base.format(tvid=tvid, last_id=last_id, time_stamp=variable_time_stamp())
    # print(comments_url)
    comments_json_data = get_comments_json_data(comments_url)
    comments_list = comments_json_data.get('data').get('comments')
    # print(comments_list)
# Crawl-record creation timestamp, formatted as '%Y-%m-%d %H:%M:%S'.
create_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# 网友评论  (was a bare, unquoted line at module level — a SyntaxError — kept as a comment)