Batch Downloading Sina Weibo Photo Albums with Python


By 平仄_pingze | Published 2016-07-16 18:10

    I recently wrote a Python program that batch-downloads Sina Weibo photo albums: pick a user, run it, and every photo in their album is saved locally. It keeps a download history, so you can stop midway and resume later.
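    The download history is nothing fancy: after every successful save, the script pickles a small dict next to the images, so a restart simply skips URLs that are already in the list. Roughly, the log looks like this (a sketch; the keys match the code below, the values are made-up examples):

    # log.pkl: the resume log kept alongside the images
    logdic = {
        'last_num': 42,   # number of the last image saved (example value)
        'imgurl_list': [  # full-size URLs already downloaded
            'http://ww1.sinaimg.cn/large/example.jpg',  # illustrative URL
        ],
    }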

    Browsing albums on Sina Weibo requires being logged in. I tried simulating the login, got stuck for a long time on the login mechanism and the CAPTCHA, and in the end just reused a browser cookie directly, which turns out to be less work anyway.

    The script targets Python 2.7 and uses only one third-party library, requests, whose API is much more pleasant to work with than the standard library's.

    pip install requests
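
    Putting the two together, the whole "login" story reduces to attaching the browser's Cookie header to each request. A minimal sketch (the cookie value is a placeholder; paste your real string):

    import requests

    cookies = 'SINAGLOBAL=...'  # the raw Cookie header copied from a logged-in browser
    headers = {'Cookie': cookies}
    response = requests.get('http://www.weibo.com/p/1035051191258123/photos', headers=headers)
    print(response.status_code)  # 200 if Weibo accepts the cookie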
    

    To use it:
    1. Open the user's Weibo page, press F12 (or view the page source), find their page_id, and fill it into uid in the script (a regex sketch for this step follows the list).
    2. Grab your cookies with F12 or any HTTP-sniffing tool and fill them into cookies.
    3. Set dirpath to the local directory where the photos should be saved.
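
    If you would rather not dig through the page source by hand, a regex can usually pull page_id out for you. This is only a sketch and assumes the profile page embeds the id in an inline $CONFIG script, which is how weibo.com pages were built at the time:

    # -*- coding: utf-8 -*-
    import re
    import requests

    cookies = '...'  # your cookie string, as in step 2
    profile_url = 'http://weibo.com/someuser'  # placeholder profile address
    html = requests.get(profile_url, headers={'Cookie': cookies}).text
    match = re.search(r"\$CONFIG\['page_id'\]='(\d+)'", html)
    if match:
        print(match.group(1))  # this number goes into uid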

    The full code follows.

    # coding: utf-8
    # Author: 平仄_pingze (简书)

    '''
    Purpose:
        Download all photos from a Sina Weibo user's album to a local directory.

    Usage:
        1. Fill in the storage directory
        2. Fill in the target user's page_id
        3. Fill in your cookies
        4. Run
    '''

    # ---|| Initial parameters, fill in before running ||---
    dirpath = ''  # storage directory
    uid = 0       # the user's page_id
    cookies = ''  # cookies
    
    
    import os
    import re
    import time
    import socket
    import pickle
    import urllib
    import traceback

    import requests
    
    
    def list_find(alist, ele):
        'list.index() that returns -1 instead of raising'
        try:
            return alist.index(ele)
        except ValueError:
            return -1
    
    def get_response(url, headers=None, params=None):
        'Fetch a URL robustly: short timeout, many retries'
        max_try_times = 20  # maximum number of attempts
        wait_time = 0.75    # timeout per attempt (seconds)
        sleep_time = 0.25   # delay after a failed attempt
        for times in range(1, max_try_times + 1):
            try:
                response = requests.get(url, timeout=wait_time, headers=headers, params=params)
                break
            except requests.RequestException:
                if times < max_try_times:
                    time.sleep(sleep_time)
                    continue
                else:
                    print('[%s][ERROR] The last try failed, exiting ...' % time.asctime()[11:19])
                    traceback.print_exc()
                    exit()
        return response
    
    def retrieve(imgurl, imgpath):
        'Download one image robustly (skip it after repeated failures)'
        max_try_times = 5  # maximum number of attempts
        wait_time = 15     # timeout per attempt (seconds)
        sleep_time = 3     # delay after a failed attempt
        socket.setdefaulttimeout(wait_time)  # urllib.urlretrieve has no timeout argument
        for times in range(1, max_try_times + 1):
            try:
                urllib.urlretrieve(imgurl, imgpath)
                break
            except Exception:
                if times < max_try_times:
                    time.sleep(sleep_time)
                    continue
                else:
                    print('[%s][ERROR] The last try failed, skipping this image ...' % time.asctime()[11:19])
                    break
    
    def secp(string, pattern1, pattern2=''):
        'Replace every occurrence of pattern1 in string with pattern2'
        return string.replace(pattern1, pattern2)
    
    def url_deal(url):
        'Clean an image URL: strip JSON escapes, switch thumbnail to full size'
        urld = secp(url, '\\')                  # the URLs come out of escaped JSON
        urld = secp(urld, 'thumb300', 'large')  # thumb300 -> large (full-size image)
        return urld
    
    def get_imgurl(html):
        'Parse the html and collect the image URLs'
        imgurl_list = []
        extlist = ['jpg', 'gif', 'png']
        for ext in extlist:
            pattern = r'class=\\\"photo_pict\\\" src=\\\"(http:\S+thumb300\S+\.' + ext + ')'
            for url in re.findall(pattern, html, re.S):
                imgurl_list.append(url_deal(url))
        return imgurl_list
    
    def save_img(imgurl, savepath, imgname):
        'Save one image into the local directory'
        imgext = imgurl[-4:]  # extension including the dot, e.g. '.jpg'
        imgname = imgname + imgext
        retrieve(imgurl, savepath + os.sep + imgname)
    
    def save_log(dic, path):
        'Store a dict at the target path in pickle format'
        try:
            with open(path, 'wb') as out_file:
                pickle.dump(dic, out_file)
            return path
        except Exception:
            traceback.print_exc()
            return None
    
    def load_log(path):
        'Load the pickle file at the given path back into a dict'
        try:
            with open(path, 'rb') as in_file:
                return pickle.load(in_file)
        except Exception:
            traceback.print_exc()
            return None
    
    def main():
        url = 'http://www.weibo.com/p/' + str(uid) + '/photos'
        headers = {
            'Cookie': cookies
        }
        # Fetch the album page
        response = get_response(url, headers=headers)
        print('[%s][INFO] Pro starting at %s ...' % (time.asctime()[11:19], response.url))
        html = response.text
        # Sanity-check the html; abort if the cookies or the page_id are wrong
        if len(re.findall('thumb300', html, re.S)) < 1 and len(re.findall('oid', html, re.S)) < 1 and len(re.findall(u'的微博', html, re.S)) < 1:
            print('[%s][ERROR] Invalid cookies or page_id, please check !' % (time.asctime()[11:19]))
            exit()
        # Parse the document for the user name and the first batch of image URLs
        uname = re.findall(u'content="(.+?),', html, re.S)[0]  # user name from the meta tag
        imgurl_list = get_imgurl(html)
        # Walk through the dynamically loaded album pages
        while True:
            # Grab the since_id that addresses the next lazy-loaded page
            result = re.findall(r'since_id=(\S+)">', html, re.S)
            if len(result) > 0:
                since_id = result[0][:-1]  # drop the trailing escape character
            else:
                break
            payload = {
                'since_id': since_id,
                'page_id': uid,
                'ajax_call': 1
            }
            url = 'http://weibo.com/p/aj/album/loading'
            response = get_response(url, params=payload, headers=headers)
            html = response.text
            print('[%s][INFO] Got new page of %s !' % (time.asctime()[11:19], response.url))
            # Parse this page for more image URLs
            imgurl_list = imgurl_list + get_imgurl(html)
        savepath = dirpath + os.sep + uname
        if not os.path.isdir(savepath):
            os.mkdir(savepath)
        imgurl_list.reverse()  # oldest first, so numbering stays stable across runs
        total_num = len(imgurl_list)
        # Check whether a log file (download history) already exists
        logpath = savepath + os.sep + 'log.pkl'
        if os.path.isfile(logpath):
            print('[%s][INFO] Found log.pkl, loading...' % (time.asctime()[11:19]))
            logdic = load_log(logpath)
            log_last_num = logdic.get('last_num')
            log_imgurl_list = logdic.get('imgurl_list')
            index = log_last_num + 1
        else:
            print('[%s][INFO] Not found log.pkl, creating a new one ...' % (time.asctime()[11:19]))
            log_imgurl_list = []
            index = 1
        # Download the images, updating the log after each one
        num = 1
        for imgurl in imgurl_list:
            if list_find(log_imgurl_list, imgurl) < 0:  # not downloaded yet
                imgname = '{:0>5}'.format(index)  # zero-padded name: 00001, 00002, ...
                save_img(imgurl, savepath, imgname)
                index = index + 1
                last_num = index - 1
                log_imgurl_list.append(imgurl)
                logdic = {
                    'last_num': last_num,
                    'imgurl_list': log_imgurl_list
                }
                print('[%s][INFO] Writing log ... (%d/%d) !' % (time.asctime()[11:19], num, total_num))
                save_log(logdic, logpath)
                print('[%s][INFO] Successfully saved image as %s (%d/%d) !' % (time.asctime()[11:19], imgname, num, total_num))
            else:
                print('[%s][INFO] Skipping this image (%d/%d) !' % (time.asctime()[11:19], num, total_num))
            num = num + 1
    
    if __name__ == '__main__':
        main()
    
    

    For example, my initial parameters were:

    dirpath = 'images'  # an images folder next to the script
    uid = 1035051191258123  # Han Han (韩寒)
    cookies = 'SINAGLOBAL=221.237.83.131_146556……'  # very long, not shown in full
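
    Run the script with Python 2.7 (any file name works). It creates a folder named after the user inside dirpath, fills it with zero-padded image files, and keeps the resume log alongside:

    images/
        韩寒/
            00001.jpg
            00002.jpg
            ...
            log.pkl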
    

    That's the general approach. If you have ideas or suggestions, I'd love to hear them.
