A Baidu Pan Crawler in Python 3

Author: 今夕何夕_walker | Published 2017-02-15 22:11

    To get more practice with object-oriented programming, I wrote this Baidu Pan (百度网盘) crawler in an object-oriented style, using Python 3. The crawling part is complete. I ran it on my own machine for almost an hour and throughput looks decent. To deploy it for real use you still need to add database persistence and uk deduplication yourself; since my old pan crawler (which collects through proxies) still works, I have not bothered to write the persistence, deduplication, and resource-filtering parts. When the old crawler stops working, I will come back and finish this one.
    Update: added a flowchart.

    (Flowchart: Paste_Image.png)
    Improvements
    • Simple uk deduplication: after a uk is taken off the queue, insert it into the database right away; if it already exists, skip collecting it (see the sketch after this list).
    • Persisting resources: just replace the print call with a database insert (also covered in the sketch below).
    • Run several instances bound to different local IPs, or to different proxy IPs. I have used socket bind with local IPs for other crawlers before, but proxies feel like the better option; a multi-IP machine can also put nginx in front as a proxy server, which is more generic (see the proxy sketch below).
    • Right now the program collects automatically as soon as it fetches a user's fans_list. Once the database is in place, surplus uks could be stored there (uk crawling is much faster than resource crawling) and the crawler seeded directly from the database's list of uncollected uks.
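    A minimal sketch of the deduplication and persistence ideas above, using sqlite3 from the standard library; the table names and columns are my own assumptions, not part of the original code (the listing hints at pymysql, which would work the same way):

    import sqlite3

    # check_same_thread=False lets the worker threads share this connection;
    # guard writes with a Lock if contention becomes a problem
    conn = sqlite3.connect('baiduyun.db', check_same_thread=False)
    conn.execute('CREATE TABLE IF NOT EXISTS uk_seen (uk TEXT PRIMARY KEY)')
    conn.execute('CREATE TABLE IF NOT EXISTS bd_record (uk TEXT, shareid TEXT, title TEXT)')

    def uk_is_new(uk):
        'Insert the uk; a PRIMARY KEY violation means it was already collected.'
        try:
            conn.execute('INSERT INTO uk_seen (uk) VALUES (?)', (uk,))
            conn.commit()
            return True
        except sqlite3.IntegrityError:
            return False

    def save_record(f):
        'Replace print(f.title, f.uk, f.shareid) with an insert like this.'
        conn.execute('INSERT INTO bd_record (uk, shareid, title) VALUES (?, ?, ?)',
                     (f.uk, f.shareid, f.title))
        conn.commit()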
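    For the multi-proxy idea, requests accepts a proxies mapping per call. A sketch, where the proxy addresses are placeholders (for example, nginx instances bound to different local IPs):

    import random
    import requests

    # placeholder proxy endpoints; fill in real proxy or nginx addresses
    PROXIES = [
        {'http': 'http://127.0.0.1:8081', 'https': 'http://127.0.0.1:8081'},
        {'http': 'http://127.0.0.1:8082', 'https': 'http://127.0.0.1:8082'},
    ]

    def get_with_proxy(url, headers=None, timeout=15):
        'Pick a proxy at random per request, as Request.content_from_url could.'
        return requests.get(url, headers=headers,
                            proxies=random.choice(PROXIES), timeout=timeout)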

    At the moment the collected data is just printed to the console:

    (Screenshot: sample console output, Paste_Image.png)
    # -*- coding: utf-8 -*-
    # Copyright (c) 2017 - walker <cail1844@gmail.com>
    
    # Defines the user (UKItem) and resource (BDRecord) types
    # Defines the FollowList and FansList classes
    # The URL download step is split out into its own class, with data validation
    # The main thread is split out and handles scheduling
    # logging writes a log file to the current directory
    # Minimal data: keep only the uk, shareid and file name needed to build a wap share link
    import requests
    from queue import Queue
    import time
    import json
    import re
    import random
    #import pymysql
    import logging
    from threading import Thread
    
    logging.basicConfig(handlers=[logging.FileHandler('baiduyunwap.log','w','utf-8')],level=logging.WARNING)
    
    class UKItem:
        'define UK class'
        def __init__(self,uk=None,fans=None,follow=None,pubshare=None):
            self.uk = uk
            self.fans = fans
            self.follow = follow
            self.pubshare = pubshare
    
        def existed(self):
            'placeholder for the MySQL dedup check (not implemented yet)'
            self.history = False
    
        def put_uk(self,item,uk_queue):
            if 'fans_uk' in item.keys():
                self.uk = item['fans_uk']
            else:
                self.uk = item['follow_uk']
            self.fans = item['fans_count']
            self.follow = item['follow_count']
            self.pubshare = item['pubshare_count']
            uk_queue.put(self)
    
    class BDRecord:
        'define Baidu Yun Record'
        def __init__(self,title=None,uk=None,shareid=None):
            self.title = title
            self.uk = uk
            self.shareid = shareid
    
        def put_file(self,item,file_queue):
            self.title = item['title']
            self.uk = item['uk']
            if item['feed_type'] == 'share':
                self.shareid = item['shareid']
                file_queue.put(self)
    
    class FollowList:
        'get uk list from user follow list'
        def __init__(self, uk, uk_queue):
            self.uk = uk
            self.baseurl = "https://pan.baidu.com/pcloud/friend/getfollowlist?query_uk=%s&limit=24&start=%s&bdstoken=&channel=chunlei&clienttype=0&web=1"
            self.uk_queue = uk_queue
    
        def get_uk(self,start):
            url = self.baseurl % (self.uk,start)
            dic = Request(url).content_from_url()
            if dic and 'follow_list' in dic.keys():
                for item in dic['follow_list']:
                    u = UKItem()
                    u.put_uk(item,self.uk_queue)
            if dic and start == 0:
                return dic.get('total_count', 0)
            return 0
    
        def page_uk(self,start=0):
            count = self.get_uk(start)
            while count > start + 24:
                time.sleep(2.1)
                start += 24
                self.get_uk(start)
    
    class FansList:
        'get uk from user fanslist'
        def __init__(self, uk, uk_queue):
            self.uk = uk
            self.baseurl = "https://pan.baidu.com/pcloud/friend/getfanslist?query_uk=%s&limit=24&start=%s&channel=chunlei&clienttype=0&web=1"
            self.uk_queue = uk_queue
    
        def get_uk(self,start):
            url = self.baseurl % (self.uk,start)
            dic = Request(url).content_from_url()
            if dic and 'fans_list' in dic.keys():
                for item in dic['fans_list']:
                    u = UKItem()
                    u.put_uk(item,self.uk_queue)
            if dic and start == 0:
                return dic.get('total_count', 0)
            return 0
    
    
        def page_uk(self,start=0):
            count = self.get_uk(start)
            while count > start + 24:
                time.sleep(2.1)
                start += 24
                self.get_uk(start)
    
    class FileList:
        'get file from user pubshare list, from wap page'
        def __init__(self, uk, file_queue):
            self.uk = uk
            self.baseurl = "http://pan.baidu.com/wap/share/home?uk=%s&start=%s&adapt=pc&fr=ftw"
            self.file_queue = file_queue
    
        def get_file(self,start):
            url = self.baseurl % (self.uk, start)
            dic = Request(url).content_from_url()
            if dic and 'feedata' in dic.keys():
                for item in dic['feedata']['records']:
                    f = BDRecord()
                    f.put_file(item,self.file_queue)
                if start == 0:
                    return dic['feedata']['total_count']
            return 0
    
        def page_file(self,start=0):
            count = self.get_file(start)
            while count > start + 20:
                start += 20
                time.sleep(2.1)
                self.get_file(start)
    
    class Request:
        'download the html and json'
        def __init__(self,url):
            self.url = url
        uk_list_reg = re.compile(r'friend')
        file_reg = re.compile(r'wap')
        file_content_reg = re.compile(r'window\.yunData = (.*?)\;')
        file_fast_reg = re.compile(r'<title>页面不存在</title>')
        uaList = [
            'Mozilla/5.0 (Linux; Android 4.0.3; M031 Build/IML74K) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19',
            'Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
            'Mozilla/5.0 (Linux; U; Android 4.1.1; zh-CN; M040 Build/JRO03H) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 UCBrowser/9.4.1.362 U3/0.8.0 Mobile Safari/533.1'
        ]
        # note: a User-Agent is picked once per process, not per request
        ua = random.choice(uaList)
        ref = 'https://yun.baidu.com/share/home?uk=4726647#category/type=0'
        headers = {'User-Agent': ua,'Referer':ref}
    
        def content_from_url(self):
            try:
                r = requests.get(self.url,headers=self.headers,timeout=15)
                r.encoding = 'utf8'
            except Exception as e:
                logging.warning('[Request.content_from_url1] requests error {%s}' % e)
                return False
            try:
                if Request.uk_list_reg.search(self.url):
                    data = self.verify(r.json())
                elif Request.file_reg.search(self.url):
                    html = self.verify(r.text)
                    if not html:    # bad page, already logged inside verify
                        return False
                    j = Request.file_content_reg.search(html).group(1)
                    data = json.loads(j)
                else:
                    data = r.text
            except Exception as e:
                logging.warning('[Request.content_from_url2] data error {%s} [%s] \n %s' %(e,self.url,r.text))
                data = False
            return data
    
        def verify(self,data):
            if Request.uk_list_reg.search(self.url) and ('error_msg' in data.keys() or data.get('errno', -1) != 0):
                print('sleep 120',self.url,time.localtime())
                logging.warning('[Request.verify] bad uk list data, sleeping 120s')
                logging.warning(self.url)
                logging.warning(data)
                time.sleep(120)
                data = False
            elif Request.file_reg.search(self.url) and Request.file_fast_reg.search(data):
                print('sleep 36',self.url,time.localtime())
                logging.warning('[Request.verify] bad file page, sleeping 36s')
                logging.warning(self.url)
                logging.warning(data)
                time.sleep(36)
                data = False
            return data
    
    class Main(Thread):
        'multiple thread main func'
        uk_queue = Queue()
        fans_follow = Queue()
        pubshare_queue = Queue()
        file_queue = Queue()
    
        def __init__(self):
            Thread.__init__(self)
            print("start Main Thread")
    
        def run(self):
            print("start run func")
            Thread(target=self.parse_uk).start()
            Thread(target=self.parse_fans_follow).start()
            Thread(target=self.parse_file).start()
            Thread(target=self.show_info).start()
    
        def parse_uk(self):
            while True:
                uk = Main.uk_queue.get()
                if uk.fans and uk.fans >= 25:
                    Main.fans_follow.put({'uk':uk.uk,'type':'fans'})
                if uk.follow and uk.follow >= 10:
                    Main.fans_follow.put({'uk':uk.uk,'type':'follow'})
                if uk.pubshare and uk.pubshare >= 10:
                    Main.pubshare_queue.put(uk.uk)
    
        def parse_fans_follow(self):
            while True:
                user = Main.fans_follow.get()
                if user['type'] == 'fans':
                    FansList(user['uk'],Main.uk_queue).page_uk()
                else:
                    FollowList(user['uk'],Main.uk_queue).page_uk()
    
        def parse_file(self):
            while True:
                uk = Main.pubshare_queue.get()
                FileList(uk,Main.file_queue).page_file()
    
        def show_info(self):
            while True:
                info = 'uk:%s  fans and follow:%s  pubshare:%s  file:%s' % (Main.uk_queue.qsize(),Main.fans_follow.qsize(),Main.pubshare_queue.qsize(),Main.file_queue.qsize())
                print('show info:', info)
                logging.warning(info)
                time.sleep(30)
    
    if __name__ == '__main__':
        t = Main()
        # seed the uk queue from one user's follow list before starting the workers
        FollowList('675131233',t.uk_queue).page_uk()
        t.start()
        t.join()    # run() only spawns the worker threads, so this returns quickly
        while True:
            f = t.file_queue.get()
            print(f.title,f.uk,f.shareid)    # replace with a database insert for real use
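    Each printed record carries everything needed to rebuild a share link, which is why only uk, shareid and title are kept. A helper along these lines could assemble it; note the URL pattern is my assumption based on pan.baidu.com wap links of that era, not something stated in the post:

    def wap_share_url(uk, shareid):
        'Assemble a wap share link from a collected record (URL pattern assumed).'
        return 'https://pan.baidu.com/wap/link?uk=%s&shareid=%s&third=0' % (uk, shareid)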
    
