
Python Learning Notes 3

Author: 夏秋之萌 | Published 2018-01-03 15:00

    Fetch the information of the accounts a Weibo user follows and download the original-size photos from their posts.

    import re, os, urllib, time, requests
    from bs4 import BeautifulSoup
    from urllib import request  # makes urllib.request.urlretrieve available below

    url = "https://weibo.cn/xxxxxxxxx/follow?page="  # follow list of the target account, paginated
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
    cookies = {"cookie": "Your cookie"}  # paste your logged-in weibo.cn cookie here
    dir = "C:\\Users\\xxxxx\\Desktop\\Python\\Weibo"  # root directory for the downloaded data
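
    To confirm the cookie is still valid before running the rest (a small addition of mine, assuming the placeholders above are filled in with real values), fetch the first page of the follow list and eyeball the response:

    # Sanity check: a login form in the output usually means the cookie has expired.
    test = requests.get(url + "1", cookies=cookies, headers=headers)
    print(test.status_code)   # expect 200
    print(test.text[:200])    # peek at the start of the HTML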
    

    When a post contains a single photo

    def single(html,imgdir):
        reg_ori = r'src="(.+?)wap180/.+?\.jpg"' # beginning of the image URL (everything before the size segment)
        reg_end = r'src=".+?/wap180/(.+?\.jpg)"' # end of the image URL (the file name)
        reg_orire = re.compile(reg_ori)
        reg_endre = re.compile(reg_end)
        url_ori = reg_orire.findall(html)[0]
        url_end = reg_endre.findall(html)[0]
        url = url_ori + "large" + '/' + url_end # rebuild the URL: replacing wap180 with large gives the original-size photo
        print(url)
        x = url_end.split('.')[0]
        curdir = imgdir + '\\'
        urllib.request.urlretrieve(url, '{}{}.jpg'.format(curdir, x)) # save the photo, named after the tail of the URL
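
    For clarity, here is a tiny standalone illustration (using a made-up image tag, not one taken from Weibo) of the rewrite single() performs: the wap180 size segment in the thumbnail URL is swapped for large to get the full-resolution image.

    # Illustration only: a fabricated thumbnail tag.
    sample = '<img src="https://wx1.sinaimg.cn/wap180/abcd1234.jpg"/>'
    head = re.compile(r'src="(.+?)wap180/.+?\.jpg"').findall(sample)[0]   # 'https://wx1.sinaimg.cn/'
    tail = re.compile(r'src=".+?/wap180/(.+?\.jpg)"').findall(sample)[0]  # 'abcd1234.jpg'
    print(head + "large" + '/' + tail)  # https://wx1.sinaimg.cn/large/abcd1234.jpg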
    

    When a post contains multiple photos (a photo set)

    def group(html,imgdir):
        reg = r'<(a href=".+?">.+?)</a>'
        regre = re.compile(reg)
        lists = regre.findall(html) # collect all links in the post
        for i in lists:
            if u'组图' in i: # only the "组图" (photo set) link leads to the photos
                ureg = r'a href="(https.+?)">'
                uregre = re.compile(ureg)
                gro_url = uregre.findall(i)[0]
                gro_html = requests.get(gro_url,cookies=cookies,headers=headers).text # open the photo-set page
                img = r'img src="(http.+?\.jpg)"' # grab every photo URL on the page
                imgre = re.compile(img)
                imgurl = imgre.findall(gro_html)
                for u in imgurl:
                    s = r'(.+?)thumb180/.+?\.jpg'
                    e = r'.+?/thumb180/(.+?\.jpg)'
                    ss = re.compile(s).findall(u)[0]
                    ee = re.compile(e).findall(u)[0]
                    uu = ss + "large" + '/' + ee # replacing thumb180 with large gives the original-size photo URL
                    print(uu)
                    curdir = imgdir + '\\'
                    x = ee.split('.')[0] # name the file after the tail of the photo URL
                    urllib.request.urlretrieve(uu, '{}{}.jpg'.format(curdir, x))
                    time.sleep(2)
            time.sleep(2)
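
    single() and group() perform the same rewrite with different size segments (wap180 vs. thumb180), so as a design note the step could be pulled into one small helper; the sketch below is my own refactor, not part of the original script.

    def to_original(img_url, size_seg):
        # Swap the size segment (e.g. "wap180" or "thumb180") for "large"
        # to turn a thumbnail URL into the original-resolution URL.
        return img_url.replace('/' + size_seg + '/', '/large/')

    # e.g. to_original("https://wx1.sinaimg.cn/thumb180/abcd1234.jpg", "thumb180")
    #      -> "https://wx1.sinaimg.cn/large/abcd1234.jpg"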
    

    Get the total number of pages of a profile's posts

    def Page(url):
        response = requests.get(url, cookies=cookies, headers=headers)
        reg = r'value="(\d+)"' # the first value="N" attribute on the page holds the total page count
        page = int(re.compile(reg).findall(response.text)[0])
        return page
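
    If a profile has only a single page of posts, the page-jump box may be missing and findall() would come back empty, making the [0] above raise an IndexError; a slightly more defensive variant (my own assumption about such pages, not something from the original post) could fall back to one page:

    def page_count(url):
        # Like Page(), but falls back to 1 when no value="N" attribute is found.
        response = requests.get(url, cookies=cookies, headers=headers)
        matches = re.findall(r'value="(\d+)"', response.text)
        return int(matches[0]) if matches else 1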
    

    Walk through every page of a profile's posts

    def GetFollow(url,dir):
        ori_url = url + str(1)
        num = Page(ori_url) # total number of pages for this profile
        for i in range(1,num+1):
            print("page " + str(i))
            curr_url = url + str(i)
            try:
                response = requests.get(curr_url, cookies=cookies, headers=headers)
                while response.status_code != 200: # retry until the page loads
                    response = requests.get(curr_url, cookies=cookies, headers=headers)
                soup = BeautifulSoup(response.text,'lxml')
                weibo = soup.find_all('div',class_ = "c") # each post sits in a <div class="c">
                for w in weibo:
                    content = str(w)
                    if u'原图' in content: # the post offers an "original photo" link
                        if u'组图' in content: # a "photo set" link means multiple photos
                            print("multiple photos")
                            group(content,dir)
                            time.sleep(2)
                        else:
                            print("single photo")
                            single(content,dir)
                            time.sleep(2)
            except Exception:
                time.sleep(2)
                continue
            time.sleep(2)
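
    GetFollow() is driven per followed account by GetAll() below, but it can also be run on its own to scrape a single account into one folder; a usage sketch (with the placeholder id from above and a hypothetical folder name):

    # Scrape one account's photos (placeholders; the folder is created if it does not exist).
    one_dir = os.path.join(dir, "someone")
    os.makedirs(one_dir, exist_ok=True)
    GetFollow("https://weibo.cn/xxxxxxxxx?page=", one_dir)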
    

    Get every followed account, grab its follower count and profile URL, create a directory named after the Weibo ID, and write the basic info to a txt file

    def GetAll(url):
        for p in range(1,56): # the follow list of this account spans 55 pages (hard-coded)
            curr_url = url + str(p)
            html = requests.get(curr_url,cookies=cookies,headers=headers).text
            #print(html)
            soup = BeautifulSoup(html,'lxml')
            follow = soup.find_all("td")
            cells = []
            for i in range(1,20,2): # every other <td> cell holds an account's text info (10 accounts per page)
                cells.append(follow[i])
            for i in cells:
                con = str(i)
                #print(con)
                reg_url = r'td valign="top"><a href="(https://weibo.cn.+?)">.+?</a>' # profile URL
                reg_name = r'td valign="top"><a href="https://weibo.cn.+?">(.+?)</a>' # Weibo ID (screen name)
                reg_fans = r'<br/>(.+?)<br/>' # follower count
                urlre = re.compile(reg_url)
                namere = re.compile(reg_name)
                fansre = re.compile(reg_fans)
                fourl = urlre.findall(con)[0]
                foname = namere.findall(con)[0]
                fofans = fansre.findall(con)[0]
                print(fourl,foname,fofans)
                people_dir = dir + '\\' + foname # one folder per followed account, named after the ID
                if not os.path.isdir(people_dir):
                    os.mkdir(people_dir)
                os.chdir(people_dir)
                file = people_dir + '\\' + foname + ".txt"
                with open(file,'at',encoding='utf-8') as ff: # append the basic info to <ID>.txt
                    out = foname + ' ' + fofans + ' ' + fourl + '\n'
                    ff.write(out)
                full_url = fourl + "?page="
                GetFollow(full_url, people_dir) # download this account's photos into its folder
                os.chdir(dir)
                time.sleep(5)
    
    GetAll(url)
