爬取开课吧直播视频
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@文件 :kai_class.py
@说明 :
@时间 :2022/01/12 15:43:59
@作者 :wbb
@版本 :1.0
'''
from base64 import encode
from cProfile import run
from urllib import response
import requests
import asyncio
import aiohttp
import aiofiles
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad
# aes_key_url = ""
def main():
    """Download one Kaikeba live-course video and decrypt its segments.

    Flow: fetch an access token, query the media detail API, ask the user
    which quality level to download, save the m3u8 playlist as
    ``<title>.text``, download every ts segment into ``video/``, fetch the
    AES key referenced by the playlist and decrypt the segments into
    ``video/temp_<segment>``.
    """
    import os  # local import: only needed to create the output directory

    url = "https://weblearn.kaikeba.com/get/bsy_video/access_token"
    access_token = requestUrl(url).json()["data"]["access_token"]

    url = "https://api-vod.baoshiyun.com/vod/v1/platform/media/detail?mediaId=media-860639249465344&accessToken=" + access_token
    content = requestUrl(url).json()
    title = content["data"]["title"]
    video_groups = content["data"]["mediaMetaInfo"]["videoGroup"]

    # Keep asking until the answer is a valid index into the quality list,
    # instead of crashing on non-numeric or out-of-range input.
    while True:
        choice = input("请输入你要下载类型: \n 0: 超清 \n 1: 高清 \n 2: 标清 \n").strip()
        if choice.isdecimal() and int(choice) < len(video_groups):
            break
        print(f"无效的选项: {choice}")

    playUrl = video_groups[int(choice)]["playURL"]
    print(playUrl)
    # Everything before "lud" is the base URL; the segment downloader
    # re-appends "lud/" plus the segment name.
    ts_up_str = playUrl.split("lud")[0]

    # Segments are written to video/<name>; make sure the directory exists.
    os.makedirs("video", exist_ok=True)

    # Download the m3u8 playlist, then the ts segments it lists.
    download_m3u8_file(playUrl, title)
    print("下载m3u8文件完成。。。")
    title = title + ".text"
    asyncio.run(read_m3u8_file(title, ts_up_str))

    # Fetch the AES key referenced by the playlist and decrypt all segments.
    aes_key = get_key(title).encode('utf-8')
    print(aes_key)
    asyncio.run(aio_dec(title, aes_key))
def download_m3u8_file(url, name):
    """Download the m3u8 playlist at *url* and save it as ``<name>.text``.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            an error page is never saved as if it were a playlist.
    """
    # A timeout keeps a stalled connection from hanging the script forever.
    resp = requests.get(url, timeout=20)
    resp.raise_for_status()
    with open(name + ".text", mode="wb") as f:
        f.write(resp.content)
async def read_m3u8_file(title, up_url):
    """Read the playlist file *title* and download every segment it lists.

    Each non-comment line is treated as a segment name; its URL is built as
    ``up_url + "lud/" + name`` and fetched concurrently via ``download_ts``.

    Args:
        title: Path of the saved m3u8 playlist.
        up_url: Base URL that precedes the ``lud/`` path component.
    """
    tasks = []
    async with aiohttp.ClientSession() as session:
        async with aiofiles.open(title, mode="rb") as f:
            async for raw in f:
                line = raw.decode().strip()
                # Skip blank lines as well as playlist tags/comments; a blank
                # line would otherwise become a request for the bare prefix.
                if not line or line.startswith("#"):
                    continue
                ts_url = up_url + "lud/" + line
                tasks.append(asyncio.create_task(download_ts(ts_url, line, session)))
        # asyncio.wait() raises ValueError on an empty collection.
        # The wait stays inside the session block so the session is still
        # open while the downloads run.
        if tasks:
            await asyncio.wait(tasks)
async def download_ts(url, path, session):
    """Download one ts segment from *url* into ``video/<path>``.

    Failures are reported on stdout and swallowed so that one bad segment
    does not abort the remaining downloads. No retry is attempted.

    Args:
        url: Full URL of the segment.
        path: Segment file name, used under the ``video/`` directory.
        session: An open ``aiohttp.ClientSession``.
    """
    try:
        timeout = aiohttp.ClientTimeout(total=20)
        async with session.get(url, timeout=timeout) as resp:
            # Treat HTTP error statuses as failures instead of saving the
            # error body as segment data.
            resp.raise_for_status()
            ts_content = await resp.content.read()
        # Open the output file only after the body was read successfully,
        # so a failed download does not leave an empty file behind.
        async with aiofiles.open(f"video/{path}", mode="wb") as f:
            await f.write(ts_content)
        print(f"{path}下载完毕。。。")
    except Exception as e:
        print(e)
        print(f"{path}=====下载失败。。。")
# Fetch the AES key referenced by the playlist
def get_key(title):
    """Return the text of the AES key referenced by the playlist *title*.

    The first ``#EXT-X-KEY`` line is located, its ``URI`` attribute is
    extracted and the key is fetched with ``requestUrl``.

    Args:
        title: Path of the saved m3u8 playlist.

    Returns:
        The body of the key response as text.

    Raises:
        ValueError: if the playlist contains no ``#EXT-X-KEY`` line with a
            URI.
    """
    import re  # local import: only this function needs it

    aes_key_url = ""
    with open(title, mode="rb") as f:
        for raw in f:
            line = raw.decode().strip()
            if not line.startswith("#EXT-X-KEY"):
                continue
            quoted = re.search(r'URI="([^"]*)"', line)
            if quoted:
                # Take only the quoted value so attributes that follow the
                # URI (e.g. IV=...) are not glued onto the URL.
                aes_key_url = quoted.group(1)
            elif "URI=" in line:
                # Unquoted fallback: the value ends at the next comma.
                aes_key_url = line.split("URI=")[-1].split(",")[0].strip().replace('"', '')
            break
    if not aes_key_url:
        raise ValueError(f"no #EXT-X-KEY URI found in {title}")
    return requestUrl(aes_key_url).text
async def aio_dec(title, key):
    """Decrypt every segment listed in the playlist *title* concurrently.

    Args:
        title: Path of the saved m3u8 playlist.
        key: AES key bytes handed to ``dec_ts`` for each segment.
    """
    print("解密")
    tasks = []
    async with aiofiles.open(title, mode="rb") as f:
        async for raw in f:
            line = raw.decode().strip()
            # Skip blank lines as well as playlist tags/comments.
            if not line or line.startswith("#"):
                continue
            # Schedule the decryption of this segment.
            tasks.append(asyncio.create_task(dec_ts(line, key)))
    # asyncio.wait() raises ValueError on an empty collection.
    if tasks:
        await asyncio.wait(tasks)
async def dec_ts(line, key):
    """Decrypt ``video/<line>`` with AES-CBC into ``video/temp_<line>``.

    Failures are reported on stdout and swallowed so that one bad segment
    does not abort the other decryption tasks.

    Args:
        line: Segment file name as listed in the playlist.
        key: AES key bytes.
    """
    from Crypto.Util.Padding import unpad  # local import: only used here

    try:
        # NOTE(review): the IV is sixteen ASCII '0' characters, not zero
        # bytes — confirm this matches what the stream actually uses.
        aes = AES.new(key=key, IV=b'0000000000000000', mode=AES.MODE_CBC)
        async with aiofiles.open(f"video/{line}", mode="rb") as f1:
            encrypted = await f1.read()
        # Decrypt the ciphertext as-is. Padding it first (as the previous
        # code did) appends an extra block that decrypts to garbage.
        plain = aes.decrypt(encrypted)
        try:
            # Strip the PKCS#7 padding left at the end of the plaintext.
            plain = unpad(plain, 16)
        except ValueError:
            # Not PKCS#7-padded: keep the decrypted bytes unchanged.
            pass
        async with aiofiles.open(f"video/temp_{line}", mode="wb") as f2:
            await f2.write(plain)
    except Exception as e:
        print(e)
        print(f"{line}====解密失败!")
def getVideoList():
    """Fetch, print and return the course-info JSON for the hard-coded course.

    The request goes through ``requestUrl`` so the headers and cookie are
    built in one place instead of being duplicated here.

    Returns:
        The decoded JSON response.
    """
    url = "https://weblearn.kaikeba.com/student/courseinfo?course_id=239671&__timestamp=1642152764689"
    response = requestUrl(url)
    response.encoding = 'utf-8'
    content = response.json()
    print(content)
    return content
def requestUrl(url):
    """GET *url* with the browser-like headers and session cookie.

    The cookie can be overridden without editing the source by setting the
    ``KAIKEBA_COOKIE`` environment variable; otherwise the built-in value
    below is used.

    Args:
        url: The URL to request.

    Returns:
        The ``requests`` response object (20 second timeout).
    """
    import os  # local import: only needed for the cookie override

    # NOTE(review): this is a captured browser session cookie committed to
    # source; prefer supplying a fresh one through KAIKEBA_COOKIE.
    cookieStr = 'kd_user_id=bad4423b-d2bb-4837-bff1-cfd4841e6055; gr_user_id=cf5aaeaa-89f7-4c45-aa39-4057c8526e05; tblBackUrl=; ssoToken=425f02820a2f641e90af6866315ce504; figui=5Ls2YsxtMb5121A9; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2290121525%22%2C%22first_id%22%3A%2217e4cec70a9bc3-03b144c21fe2c8a-1d326253-1024000-17e4cec70aab24%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E5%BC%80%E8%AF%BE%E5%90%A7%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Fother.php%22%7D%2C%22%24device_id%22%3A%2217e4cec70a9bc3-03b144c21fe2c8a-1d326253-1024000-17e4cec70aab24%22%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTdlNGNlZWI3ZDA2ZDMtMGRlMTUzNWRlNmRiNDQ4LTFkMzI2MjUzLTEwMjQwMDAtMTdlNGNlZWI3ZDE4NDkiLCIkaWRlbnRpdHlfbG9naW5faWQiOiI5MDEyMTUyNSJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%2290121525%22%7D%7D; Hm_lvt_156e88c022bf41570bf96e74d090ced7=1641968082,1642149010; kd_5d6526d7-3c9f-460b-b6cf-ba75397ce1ac_log_id=GaabmfGpxDyoQQ3kjNX%3Ac8574e13-d7ac-4f8b-b9df-0c15cda32e27%3A69b176e7-49fc-45df-93f7-202dc43759ad; access-edu_online=2fdd49b38d3ae068de03a9525683f1fd; Hm_lpvt_156e88c022bf41570bf96e74d090ced7=1642149070; 99f53b614ce96c83_gr_session_id=15a60ffd-83f2-4cdc-9b50-3fd1d1dff030; 99f53b614ce96c83_gr_session_id_15a60ffd-83f2-4cdc-9b50-3fd1d1dff030=true; kd_5d6526d7-3c9f-460b-b6cf-ba75397ce1ac_view_log_id=TxunV83iEVfk5pULc6F; kkb_edu_session=eyJpdiI6ImZrSGVvejRaZ2dIcjNLWldqRUVWNHc9PSIsInZhbHVlIjoidWxhQVNxQWZuR0JSN2l1WllVa001dDBSWGxjYTRYbnRpZXZ0ejBTRlBEcndhV2Q4bGpqOHp5KzdhS2xJSVJwRCIsIm1hYyI6IjE0N2Y4YmUxMWY2OTI4YzVjYWYyMTAxODdhZjVlMGFlYjZlYmE5ZDc1OTkyYWU4MzA3MjI2YzQ1M2Y2ZjNjMDQifQ%3D%3D; kd_5d6526d7-3c9f-460b-b6cf-ba75397ce1ac_kuickDeal_pageIndex=1; kd_5d6526d7-3c9f-460b-b6cf-ba75397ce1ac_kuickDeal_leaveTime=1642152755343'
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36",
        "cookie": os.environ.get("KAIKEBA_COOKIE", cookieStr),
        "referer": 'https://learn.kaikeba.com/'
    }
    return requests.get(url, headers=header, timeout=20)
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
爬取百度vip文档
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@文件 :bd_document1.py
@说明 :
@时间 :2022/01/11 15:30:00
@作者 :wbb
@版本 :1.0
'''
# from asyncio import tasks
# from selenium import webdriver
# from bs4 import BeautifulSoup
import requests
import img2pdf
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import aiohttp
import asyncio
imgList = []
def main():
    """Entry point: run the Baidu Wenku scraping flow."""
    getData()
def getData():
    """Log in to Baidu Wenku with Chrome and save the document pages as a PDF.

    Flow: open the document page, click the thumbnail element that brings up
    the login dialog, switch to username/password login, submit the
    credentials, complete the e-mail verification step when it appears
    (the code is typed in by the user), then collect the page thumbnail
    images and hand them to ``savePDF``.

    The credentials can be supplied through the ``BAIDU_USERNAME`` and
    ``BAIDU_PASSWORD`` environment variables; the built-in values are used
    as a fallback.
    """
    import os  # local import: only needed for the credential override

    # NOTE(review): the fallback credentials are hard-coded in source;
    # prefer the environment variables.
    username = os.environ.get("BAIDU_USERNAME", '温柔vs先生')
    password = os.environ.get("BAIDU_PASSWORD", 'wty963.')

    opt = Options()
    # Headless mode is currently disabled; uncomment to hide the browser.
    # opt.add_argument("--headless")
    # opt.add_argument("--disable-gpu")
    # The flag must use ASCII hyphens; the previous em-dash spelling was
    # not a valid Chrome switch.
    opt.add_argument('--disable-blink-features=AutomationControlled')
    url = "https://wenku.baidu.com/view/616109e6edf9aef8941ea76e58fafab068dc4467?fr=shopSearch-pc"
    driver = Chrome(options=opt)
    try:
        driver.get(url)
        time.sleep(3)
        driver.find_element(By.XPATH, '//*[@id="reader-thumb"]/div[4]').click()
        time.sleep(2)
        print("点击成功!")

        # Wait until the login dialog's "username login" button is visible
        # before interacting with it.
        ele_id = "TANGRAM__PSP_11__footerULoginBtn"
        param = (By.ID, ele_id)
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located(param))
        driver.find_element(By.ID, ele_id).click()
        time.sleep(2)

        driver.find_element(By.XPATH, '//*[@id="TANGRAM__PSP_11__userName"]').send_keys(username)
        driver.find_element(By.XPATH, '//*[@id="TANGRAM__PSP_11__password"]').send_keys(password)
        driver.find_element(By.ID, "TANGRAM__PSP_11__submit").click()
        print("进行登录")
        time.sleep(5)

        # When the verification dialog shows up, pick e-mail verification,
        # request a code and submit what the user types in.
        isHasArrow = is_element_exist(driver, 'TANGRAM__43__select_show_arrow')
        if isHasArrow:
            driver.find_element(By.ID, 'TANGRAM__43__select_show_arrow').click()
            time.sleep(2)
            driver.find_element(By.XPATH, '//*[@id="TANGRAM__43__select_email"]').click()
            time.sleep(2)
            driver.find_element(By.XPATH, '//*[@id="TANGRAM__43__button_send_email"]').click()
            time.sleep(2)
            code = input("请输入验证码:")
            print(code)
            driver.find_element(By.XPATH, '//*[@id="TANGRAM__43__input_vcode"]').send_keys(code)
            driver.find_element(By.XPATH, '//*[@id="TANGRAM__43__button_submit"]').click()
        else:
            time.sleep(3)

        img_list = driver.find_elements(By.XPATH, '//*[@id="reader-thumb"]/div/img')
        savePDF(img_list)
        print('下载完成')
    finally:
        # Always close the browser, even when a step above fails.
        driver.quit()
def savePDF(img_list):
    """Download the images in *img_list* and write them to ``gaigai.pdf``.

    The images are fetched asynchronously by ``saveImageWithAsync``, which
    stores the raw bytes in the module-level ``imgList``.

    Args:
        img_list: Image elements exposing ``get_attribute("src")``.

    Raises:
        RuntimeError: if no image could be downloaded.
    """
    # Start from an empty buffer so a second call does not re-emit the
    # pages collected by an earlier one.
    imgList.clear()
    asyncio.run(saveImageWithAsync(img_list))
    if not imgList:
        raise RuntimeError("no page images were downloaded; PDF not written")
    # Build the PDF before opening the output file so a failed download or
    # conversion does not leave an empty gaigai.pdf behind.
    pdf_bytes = img2pdf.convert(imgList)
    with open("gaigai" + ".pdf", "wb") as f:
        f.write(pdf_bytes)
# Download the page images asynchronously
async def saveImageWithAsync(img_list):
    """Download every image in *img_list* concurrently, keeping page order.

    Each element's ``src`` attribute is fetched through ``download_pic``.
    Because downloads finish in arbitrary order, the entries that
    ``download_pic`` appended to the module-level ``imgList`` are replaced
    afterwards by the same images arranged in page order. Failed downloads
    are reported on stdout and skipped.

    Args:
        img_list: Image elements exposing ``get_attribute("src")``.
    """
    first_new = len(imgList)  # where this call's images start in imgList
    async with aiohttp.ClientSession() as session:
        tasks = [
            asyncio.create_task(
                download_pic(item.get_attribute("src"), 'pageNo-' + str(page_no), session)
            )
            for page_no, item in enumerate(img_list, start=1)
        ]
        # gather() keeps the results in task order and, unlike
        # asyncio.wait(), accepts an empty task list.
        results = await asyncio.gather(*tasks, return_exceptions=True)

    ordered = []
    for page_no, result in enumerate(results, start=1):
        if isinstance(result, BaseException):
            print(f'图片pageNo-{page_no}下载失败: {result}')
        else:
            ordered.append(result)
    # Replace the completion-ordered entries with the page-ordered ones.
    imgList[first_new:] = ordered


async def download_pic(imgUrl, path, session):
    """Download one image, append its bytes to ``imgList`` and return them.

    Args:
        imgUrl: URL of the image.
        path: Label used in the progress messages.
        session: An open ``aiohttp.ClientSession``.

    Returns:
        The raw image bytes.
    """
    # '%' formatting: the previous '+' printed a literal "%s" before the label.
    print('开始下载图片%s' % path)
    async with session.get(imgUrl) as res:
        content = await res.content.read()
    print(f'图片{path}下载完成')
    imgList.append(content)
    return content
def saveImage(imgUrl, path):
    """Fetch the image at *imgUrl* and return its raw bytes.

    Args:
        imgUrl: URL of the image.
        path: Unused; kept so existing callers keep working. The previous
            code opened ``<path>.png`` for writing but never wrote to it,
            which only left a zero-byte file behind.

    Returns:
        The response body as bytes.
    """
    return requestUrl(imgUrl).content
def requestUrl(url):
    """GET *url* with a browser-like User-Agent and return the response.

    The response encoding is forced to UTF-8.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36"}
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=header, timeout=20)
    response.encoding = 'utf-8'
    return response
def is_element_exist(driver, idStr):
    """Return True only when exactly one element with id *idStr* exists.

    Args:
        driver: A WebDriver exposing ``find_elements(by, value)``.
        idStr: The element id to look for.

    Returns:
        True for exactly one match; False for zero or for several matches.
    """
    # find_elements(By.ID, ...) matches how the rest of the script locates
    # elements; the find_elements_by_* helpers are gone in Selenium 4.3+.
    count = len(driver.find_elements(By.ID, idStr))
    if count == 0:
        print("元素未找到:%s"%idStr)
        return False
    if count == 1:
        print("找到元素:%s"%idStr)
        return True
    print("找到%s个元素:%s"%(count,idStr))
    return False
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Python 百度智能云文字识别 实现手写文字识别
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@文件 :img2Text.py
@说明 :
@时间 :2022/02/11 11:13:59
@作者 :wbb
@版本 :1.0
'''
from aip import AipOcr
# import requests
# import base64
""" 你的 APPID AK SK """
# Credentials passed to the AipOcr client below (app id, API key, secret key).
# NOTE(review): these secrets are hard-coded in source — consider loading
# them from the environment or a config file, and confirm whether the
# committed values need to be rotated.
APP_ID = '25427015'
API_KEY = 'pQCtdYOhsZeC1bM6LdPaRU7e'
SECRET_KEY = 'K7Os2HS4b5tsQdWCtjSlBLGS7hxE58oy'
# Module-level OCR client built once at import time; used by image2Text.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
def get_file_content(filePath):
    """Return the full contents of the file at *filePath* as bytes."""
    with open(filePath, 'rb') as fp:
        return fp.read()
def image2Text(file):
    """Run handwriting OCR on the image *file* and return the recognised text.

    Args:
        file: Path of the image to recognise.

    Returns:
        The recognised words joined into one string, or ``None`` when the
        API response contains no ``words_result`` entry.
    """
    image = get_file_content(file)
    result = client.handwriting(image)
    if 'words_result' not in result:
        # Surface the raw response instead of failing silently, so the
        # reason is visible to the user.
        print(f"识别失败: {result}")
        return None
    return ''.join(item['words'] for item in result['words_result'])
# def getToken():
# # client_id 为官网获取的AK, client_secret 为官网获取的SK
# host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=【官网获取的AK】&client_secret=【官网获取的SK】'
# response = requests.get(host)
# if response:
# print(response.json())
# def requestForText(file):
# '''
# 手写文字识别
# '''
# request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/handwriting"
# # 二进制方式打开图片文件
# f = open(file, 'rb')
# img = base64.b64encode(f.read())
# params = {"image":img}
# access_token = '[调用鉴权接口获取的token]'
# request_url = request_url + "?access_token=" + access_token
# headers = {'content-type': 'application/x-www-form-urlencoded'}
# response = requests.post(request_url, data=params, headers=headers)
# if response:
# print (response.json())
if __name__ == "__main__":
    import sys

    # Take the image path from the command line when given; otherwise fall
    # back to the original sample path.
    image_path = sys.argv[1] if len(sys.argv) > 1 else "/Users/wbb/Desktop/WechatIMG19812.jpeg"
    text = image2Text(image_path)
    print(text)
网友评论