import requests
import re
from json import loads
import os
from tqdm import tqdm
class Baidu(object):
    """Downloader for Baidu Wenku (wkrtcs) documents.

    Workflow: ``get_info`` fetches the document's HTML page and scrapes the
    signing parameters embedded in it; ``parse`` walks the page-range table
    and pulls each page's JSON payload via ``get_pages`` (or ``get_first``
    for single-page documents); the extracted plain text is written to
    ``<title>/<title>.txt``.  ``down_img`` optionally saves page images.
    """

    def __init__(self):
        # Mobile UA — the wkrtcs endpoints serve the mobile page layout,
        # which is what the scraping regexes below expect.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Mobile Safari/537.36'
        }
        self.rtcs_flag = '3'
        self.rtcs_ver = '3'
        self.base_url = 'http://wkrtcs.bdimg.com/rtcs/webapp'
        self.base_img = 'https://wkrtcs.bdimg.com/rtcs/image'
        self.flag = False  # whether to download images (see run / down_img)
        self.cout = 1      # running image index passed to down_img

    def get_info(self, url):
        """Fetch the document page and scrape the signing parameters.

        Sets ``bucketNum``, ``sign``, ``rsign``, ``md5sum``, ``page_list``,
        ``page_count``, ``firstpageurl``, ``name`` and ``path`` on the
        instance, and creates the output directory named after the document
        title.

        Fixes vs. the original: the page is downloaded exactly once (the
        original re-issued the HTTP request just to retry decoding), the
        decode retry catches only ``UnicodeDecodeError``, and the regexes
        are raw strings (``'\\d'`` in a plain string is an invalid escape).
        """
        raw = requests.get(url, headers=self.headers).content
        try:
            r = raw.decode()
        except UnicodeDecodeError:
            print('编码错误,切换编码!')
            r = raw.decode('gbk')
        self.bucketNum = re.findall(r'"bucketNum":(\d+),', r)[0]
        self.sign = re.findall(r'&sign=(.*?)&', r)[0]
        self.rsign = re.findall(r'"rsign":"(.*?)",', r)[0]
        self.md5sum = re.findall(r'&md5sum=(.*?)&', r)[0]
        # JSON array describing each page's request "range" token.
        self.page_list = re.findall(r'"rtcs_range_info":(.*),"rtcs_flow"', r)[0]
        self.page_count = re.findall(r'"rtcs_page_count":(.*?),', r)[0]
        # Un-escape '&amp;' left over from the HTML attribute value.
        self.firstpageurl = re.findall(r'data-firstpageurl="(.*?)"', r)[0].replace('amp;', '')
        print(self.firstpageurl)
        try:
            self.name = re.findall(r'<title>(.*?)</title>', r)[0].strip()
        except IndexError:
            # No <title> found — fall back to a generic directory name.
            self.name = '百度文库'
        if not os.path.exists(self.name):
            os.mkdir(self.name)
        self.path = self.name + '/'

    # Parse paging parameters
    def parse(self):
        """Dispatch page downloads based on the scraped range table.

        Multi-page documents iterate the range entries and call
        ``get_pages``; single-page documents use ``get_first``.
        """
        print('页数:', self.page_count)
        page_dics = loads(self.page_list)
        print('page_dicts:', page_dics)
        if int(self.page_count) >= 2:
            # self.get_first()
            pn = 1
            for r in page_dics:  # progress over the range entries
                a = r.get('range')
                pn = r.get('page')
                try:
                    self.get_pages(pn, a)
                except Exception:
                    # Best-effort: skip pages whose payload fails to parse.
                    print('解析错误')
                if pn > int(self.page_count):
                    break
        else:
            self.get_first()

    # Fetch one page and append its text to the output file
    def get_pages(self, pn, ranges):
        """Download page *pn* (using its *ranges* token) and write its text.

        Page 1 truncates/creates the output file ('w'); later pages append.
        The response body is JSONP-like — the wrapper is stripped with the
        ``[5:-1]`` slice before ``loads``.
        """
        dic = {
            'bucketNum': self.bucketNum,
            'pn': pn,
            'rn': 1,
            'md5sum': self.md5sum,
            'sign': self.sign,
            'rtcs_flag': self.rtcs_flag,
            'rtcs_ver': self.rtcs_ver,
            'range': ranges,
            'rsign': self.rsign
        }
        print('pn:', pn, 'a:', ranges)
        strhtml = requests.get(self.base_url, params=dic, headers=self.headers)
        page = strhtml.text[5:-1]  # strip the JSONP wrapper
        b = loads(page)
        a = ''
        for i in b['document.xml']:
            print(i)
            for m in i['c']:
                a += '\n'
                for n in m['c']:
                    try:
                        # Only plain-string leaves carry visible text;
                        # other node shapes are skipped.
                        if isinstance(n['c'], str):
                            a += n['c']
                    except Exception:
                        pass
        mode = 'w' if pn == 1 else 'a'
        with open(self.path + self.name + '.txt', mode, encoding='utf-8') as f:
            f.write(a)

    # Parse the first (and only) page
    def get_first(self):
        """Download a single-page document via ``firstpageurl``.

        The wrapper prefix on this endpoint is longer, hence ``[32:-1]``.
        Overwrites the output file.
        """
        print(self.firstpageurl)
        first_page = requests.get(url=self.firstpageurl, headers=self.headers).text[32:-1]
        b = loads(first_page)
        a = ''
        for i in tqdm(b['document.xml']):
            for m in i['c']:
                a += '\n'
                for n in m['c']:
                    try:
                        if isinstance(n['c'], str):
                            a += n['c']
                    except Exception:
                        pass
        with open(self.path + self.name + '.txt', 'w', encoding='utf-8') as f:
            f.write(a)
        print('第一页解析完成!!!')

    # Download one page image
    def down_img(self, cout, num):
        """Save image *cout* of the document as ``<path>/<num>.jpg``.

        The server names assets ``image<cout>.png`` or ``image<cout>_1.png``;
        try both, and clear ``self.flag`` (stopping the caller's loop) when
        neither exists.
        """
        if self._fetch_image(cout, num):
            return
        couts = str(cout) + '_1'
        print(couts)
        if not self._fetch_image(couts, num):
            self.flag = False

    def _fetch_image(self, image_id, num):
        """Request one image asset; write it on HTTP 200. Returns success."""
        params = {
            'md5sum': self.md5sum,
            'sign': self.sign,
            'rtcs_ver': '3',
            'bucketNum': self.bucketNum,
            'ipr': '{"c":"word/media/image%s.png"}' % image_id
        }
        resp = requests.get(url=self.base_img, params=params)
        if resp.status_code != 200:
            return False
        with open(self.path + str(num) + '.jpg', 'wb+') as f:
            f.write(resp.content)
        print(self.name + '下载完成!')
        return True

    def run(self, url):
        """Entry point: scrape metadata for *url*, then download all pages."""
        num = 0  # image counter for the (currently disabled) image loop
        self.get_info(url)
        self.parse()
        # NOTE: image download is disabled (self.flag starts False).
        # print('页面写入完成!!!' + '-' * 20 + '下载图片>>>>>>')
        # while self.flag:
        #     num += 1
        #     self.down_img(self.cout, num)
        #     self.cout += 1
if __name__ == '__main__':
    # target = input('请输入网址:')
    target = 'https://wk.baidu.com/view/c5596afeccbff121dc3683df?pcf=2&pcf=2'
    downloader = Baidu()
    downloader.run(target)
# 网友评论 (stray scraped page-footer text — commented out so the file parses)