# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
from urllib.parse import quote, unquote
import re
import time
# Page to scrape. The script used to assign several sample cover pages in a
# row, and only the last assignment ever took effect; that one stays live and
# the others are listed here for reference:
#   https://v.qq.com/x/cover/mzc00200pfr3hmt.html
#   https://v.qq.com/x/cover/mzc00200zsrmfna/c3217q6uk3c.html
url = 'https://v.qq.com/x/cover/mzc00200qhoftwk.html'

# Parenthesised so that both literals are joined into one User-Agent value.
# Written as two separate statements, the second literal was a no-op and the
# header was sent without its Chrome/Safari suffix.
user_agent = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/94.0.4606.81 Safari/537.36'
)
headers = {'User-Agent': user_agent}

# requests waits indefinitely unless a timeout is given; bound the wait so the
# script cannot hang on an unresponsive server.
wb_data = requests.get(url, headers=headers, timeout=10)
# Stop on a 4xx/5xx response instead of trying to parse an error page.
wb_data.raise_for_status()
soup = BeautifulSoup(wb_data.text, 'html.parser')

# Debug output: the "search-video" containers, then the whole parsed document.
print(soup.select('div[class="search-video"]'))
print(soup)
taskName = ''  # task name (left blank)
platform_name = '腾讯视频'  # channel name
article_url = url  # content link

# Page ID: the last path segment of the URL, cut at its first dot.
item_id = url.rpartition('/')[2].partition('.')[0]

# Content title: text of the first matching <h1>, with surrounding whitespace
# trimmed. Raises IndexError when the page has no such element.
_title_nodes = soup.select('h1[class="video_title _video_title"]')
title = _title_nodes[0].get_text().strip()
# Description text. The summary paragraph may be absent from the page, in
# which case the placeholder '无' is stored instead.
#
# An earlier regex pass over soup.text (looking for a "desc" field) was
# removed: its result was overwritten by the selector below before being used.
content_raw = soup.select('p[class="summary _video_summary"]')
if content_raw:
    content = content_raw[0].get_text()
else:
    content = '无'
# Creator account name: text of the first "user_name" span.
media_name = soup.select('span[class="user_name"]')[0].get_text()
keyword = ''  # search keyword (left blank)
# View (play) count: text of the first <em class="num">.
read_count = soup.select('em[class="num"]')[0].get_text()

# Total comment count, taken from the leading digits of the `title` attribute
# on the first "txp_btn_text" element.
comments_count_raw = soup.select('div[class="txp_btn_text"]')
comments_title = comments_count_raw[0].attrs.get('title') if comments_count_raw else None
if comments_title:
    # The previous pattern r'(\d*)\S+' forced \S+ to consume at least one
    # character, so a digits-only title such as "1234" yielded "123".
    # Matching only the leading digits keeps every digit, and still yields ''
    # when the text does not start with a number.
    comments_count = re.match(r'\d*', comments_title.strip()).group()
else:
    # Element missing, or present without a usable title attribute (the latter
    # used to raise TypeError/IndexError).
    comments_count = '总评论数为空'
# share_count = ''  # share count: not available
like_count_raw = ''  # like count: not available

# Upload time: the first 11 characters of the first "date _date" span, or an
# error placeholder when the span is missing.
datetime_raw = soup.select('span[class="date _date"]')
datetime = (
    str(datetime_raw[0].get_text())[:11]
    if datetime_raw
    else '内容上传时间获取异常!!!'
)
id_str = ''
user_id = ''  # commenter ID

# Commenter account name.
# NOTE(review): this reads the same "user_name" span as media_name (the
# creator) — confirm whether that is intended.
_user_name_nodes = soup.select('span[class="user_name"]')
user_name = _user_name_nodes[0].get_text()

# Per-comment fields are left blank; the original notes mark them "async"
# (presumably loaded by a separate request — confirm).
text = ''  # comment text
comment_like_count = ''  # comment like count
comment_time = ''  # comment time

# Collection timestamp in local time.
create_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

# Print the collected fields, one per line.
_report_fields = (
    item_id,
    title,
    content,
    media_name,
    read_count,
    comments_count,
    datetime,
    create_time,
)
print(*_report_fields, sep='\n')
# 网友评论 ("viewer comments") — stray trailing text, not part of the script