Scraping Baidu Wenku Text

Author: 李静数据分析 | Published 2019-11-04 11:41
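
The script below pulls the text of a Baidu Wenku document through the mobile rtcs endpoints (wkrtcs.bdimg.com/rtcs/webapp for page text, wkrtcs.bdimg.com/rtcs/image for images). It first scrapes the request parameters (bucketNum, sign, rsign, md5sum, the per-page range info and the first-page URL) out of the document page, then walks each page's payload and writes the extracted text to a .txt file named after the document title; an image-download routine exists but is left commented out in run(). Dependencies: requests and tqdm.
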
    import requests
    import re
    from json import loads
    import os
    from tqdm import tqdm
    
    
    class Baidu(object):
        def __init__(self):
    
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Mobile Safari/537.36'
            }
            self.rtcs_flag = '3'
            self.rtcs_ver = '3'
            self.base_url = 'http://wkrtcs.bdimg.com/rtcs/webapp'
            self.base_img = 'https://wkrtcs.bdimg.com/rtcs/image'
            self.flag = False  # whether to download images
            self.cout = 1
    
        def get_info(self, url):
            try:
                r = requests.get(url, headers=self.headers).content.decode()
            except Exception:
                print('Decode failed, retrying with GBK!')
                r = requests.get(url, headers=self.headers).content.decode('gbk')
            self.bucketNum = re.findall(r'"bucketNum":(\d+),', r)[0]
            self.sign = re.findall(r'&sign=(.*?)&', r)[0]
            self.rsign = re.findall(r'"rsign":"(.*?)",', r)[0]
            self.md5sum = re.findall(r'&md5sum=(.*?)&', r)[0]
            self.page_list = re.findall(r'"rtcs_range_info":(.*),"rtcs_flow"', r)[0]
            self.page_count = re.findall(r'"rtcs_page_count":(.*?),', r)[0]
            self.firstpageurl = re.findall(r'data-firstpageurl="(.*?)"', r)[0].replace('amp;', '')  # turn &amp; back into &
            print(self.firstpageurl)
            try:
                self.name = re.findall('<title>(.*?)</title>', r)[0].strip()
            except Exception:
                self.name = '百度文库'
            if not os.path.exists(self.name):
                os.mkdir(self.name)
            self.path = self.name + '/'
    
        # Parse the paging info and fetch every page
        def parse(self):
            print('Page count:', self.page_count)
            page_dics = loads(self.page_list)
            print('page_dics:', page_dics)
            if int(self.page_count) >= 2:
                # self.get_first()
                pn = 1
                for r in tqdm(page_dics):  # progress bar over the per-page range info
                    a = r.get('range')
                    pn = r.get('page')
                    try:
                        self.get_pages(pn, a)
                    except Exception:
                        print('Parse error on page', pn)
                    if pn > int(self.page_count):
                        break
            else:
                self.get_first()
    
    
        # Fetch one page by its range parameter and append its text
        def get_pages(self, pn, ranges):
            dic = {
                'bucketNum': self.bucketNum,
                'pn': pn,
                'rn': 1,
                'md5sum': self.md5sum,
                'sign': self.sign,
                'rtcs_flag': self.rtcs_flag,
                'rtcs_ver': self.rtcs_ver,
                'range': ranges,
                'rsign': self.rsign
            }
            print('pn:', pn, 'range:', ranges)
            strhtml = requests.get(self.base_url, params=dic, headers=self.headers)
            # print(strhtml.apparent_encoding)
            # print(strhtml.encoding)
            # strip the non-JSON wrapper (first 5 and last character) before parsing
            page = strhtml.text[5:-1]
            # print('page:', page)
            b = loads(page)
            # b = page
            # print('b:', b)
            a = ''
            # Walk the nested 'c' entries and keep the string leaves (the text runs)
            for i in b['document.xml']:
                # print(i)
                for m in i['c']:
                    a += '\n'
                    for n in m['c']:
                        # print(n)
                        try:
                            if isinstance(n['c'], str):
                                a += n['c']
                                # print(a)
                        except Exception:
                            pass
            # Overwrite on the first page, append afterwards
            mode = 'w' if pn == 1 else 'a'
            with open(self.path + self.name + '.txt', mode, encoding='utf-8') as f:
                f.write(a)
    
        # Parse the first page (used for single-page documents)
        def get_first(self):
            print(self.firstpageurl)
            # The first-page response has a longer non-JSON prefix; strip it before parsing
            first_page = requests.get(url=self.firstpageurl, headers=self.headers).text[32:-1]
            b = loads(first_page)
            # print(b)
            a = ''
            for i in tqdm(b['document.xml']):
                for m in i['c']:
                    a += '\n'
                    for n in m['c']:
                        try:
                            if isinstance(n['c'], str):
                                a += n['c']
                        except Exception:
                            pass
            with open(self.path + self.name + '.txt', 'w', encoding='utf-8') as f:
                f.write(a)
            print('First page parsed!')
    
        # Download one embedded image, retrying with the "_1" filename suffix on failure
        def down_img(self, cout, num):
            data = {
                'md5sum': self.md5sum,
                'sign': self.sign,
                'rtcs_ver': '3',
                'bucketNum': self.bucketNum,
                'ipr': '{"c":"word/media/image%s.png"}' % cout
            }
    
            data = requests.get(url=self.base_img, params=data)
            if data.status_code == 200:
                with open(self.path + str(num) + '.jpg', 'wb+') as f:
                    f.write(data.content)
                print(self.name + ' downloaded!')
            else:
                couts = str(cout) + '_1'
                print(couts)
                data = {
                    'md5sum': self.md5sum,
                    'sign': self.sign,
                    'rtcs_ver': '3',
                    'bucketNum': self.bucketNum,
                    'ipr': '{"c":"word/media/image%s.png"}' % couts
                }
                data = requests.get(url=self.base_img, params=data)
                if data.status_code == 200:
                    with open(self.path + str(num) + '.jpg', 'wb+') as f:
                        f.write(data.content)
                    print(self.name + ' downloaded!')
                else:
    
                    self.flag = False
    
        def run(self, url):
            num = 0
            self.get_info(url)
            self.parse()
            # print('Pages written!' + '-' * 20 + 'Downloading images >>>>>>')
            # while self.flag:
            #     num += 1
            #     self.down_img(self.cout, num)
            #     self.cout += 1
    
    
    if __name__ == '__main__':
        # url = input('Enter a Wenku URL: ')
        url='https://wk.baidu.com/view/c5596afeccbff121dc3683df?pcf=2&pcf=2'
        b = Baidu()
        b.run(url)
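
As a minimal sketch of the part that get_pages() and get_first() both repeat: once the wrapper is stripped and the payload parsed, the text is just the string leaves of the nested 'c' entries under 'document.xml'. The helper name extract_text and the sample payload below are illustrative only; the structure is inferred from the loops in the script above, not from any official documentation.

    from json import loads

    def extract_text(payload: dict) -> str:
        """Collect the string leaves under payload['document.xml'].

        Assumed shape (mirroring what the script iterates over): a list of blocks,
        each with a 'c' list of paragraphs, each paragraph with a 'c' list of runs
        whose 'c' value is the actual text whenever it is a string.
        """
        text = ''
        for block in payload.get('document.xml', []):
            for paragraph in block.get('c', []):
                text += '\n'
                for run in paragraph.get('c', []):
                    value = run.get('c') if isinstance(run, dict) else None
                    if isinstance(value, str):
                        text += value
        return text

    # Illustrative payload shaped like the stripped rtcs/webapp response
    sample = '{"document.xml": [{"c": [{"c": [{"c": "hello "}, {"c": "world"}]}]}]}'
    print(extract_text(loads(sample)))  # prints a blank line, then "hello world"

Folding the duplicated loops in get_pages() and get_first() into a single helper like this would also keep the two code paths from drifting apart.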
    
