Python Learning Notes 4

By 夏秋之萌 | Published 2018-01-05 16:22
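
This entry is a small scraper for weibo.cn, the mobile Weibo site: given a user's profile URL, it writes the user's basic and detailed profile info to a text file, then walks every page of the timeline and downloads the original-size version of every posted image. Every request carries a cookie, which weibo.cn generally requires for a logged-in view of these pages.
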
    import os
    import re
    import time
    import requests
    from urllib import request
    from bs4 import BeautifulSoup
    
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}
    # Placeholder: paste a logged-in weibo.cn cookie string here (copy it
    # from your browser's request headers for weibo.cn)
    cookies = {"cookie": "_T_WM"}
    class Weibo(object):
        def __init__(self,url):
            self.url = url
            # Root directory for all downloads; adjust to your machine
            self.dir = "C:\\Users\\Desktop\\Python\\Weibo"
        # Works on any URL: returns 1 when the URL has no path segment
        # (e.g. "https://weibo.cn/"), 0 otherwise. Unused below.
        def getType(self):
            user = self.url.split('/')[3]
            return 1 if user == '' else 0
        # Works on the first-page URL: the last value="N" attribute on a
        # weibo.cn timeline page comes from the page-jump box and holds the
        # total page count; default to 1 when there is no pagination.
        def getPage(self):
            html = requests.get(self.url, cookies=cookies, headers=headers).text
            pages = re.findall(r'value="(\d+)"', html)
            return int(pages[-1]) if pages else 1
        # Works on any URL: fetch the page, return (raw HTML, parsed soup)
        def getUrl(self):
            html = requests.get(self.url, cookies=cookies, headers=headers).text
            return html, BeautifulSoup(html, 'lxml')
        # Works on the first-page URL: scrape the user ID, follower count
        # and screen name, then create a per-user download directory.
        def getBasicInfo(self):
            html = requests.get(self.url, cookies=cookies, headers=headers).text
            ID = re.findall(r'<a href="/(\d+)/info"', html)[0]
            fans = re.findall(r'<a href=".+?/fans">粉丝\[(\d+)\]</a>', html)[0]
            name = re.findall(r'<title>(.+?)的微博</title>', html)[0]
            people_dir = os.path.join(self.dir, name)
            os.makedirs(people_dir, exist_ok=True)
            info_url = "https://weibo.cn/" + ID + "/info"
            return name, fans, info_url, people_dir
        # Works on the /<uid>/info URL: pull school, gender, region,
        # birthday and bio; each field falls back to "Missing" when absent.
        def getDetailInfo(self):
            html = requests.get(self.url, cookies=cookies, headers=headers).text
            fields = [
                # School
                ("学校", r'<div class="tip">学习经历</div><div class="c">(.+?)<br/>'),
                # Gender
                ("性别", r'<br/>性别:(.+?)<br/>'),
                # Region
                ("地区", r'<br/>地区:(.+?)<br/>'),
                # Birthday
                ("生日", r'<br/>生日:(.+?)<br/>'),
                # Bio
                ("简介", r'<br/>简介:(.+?)<br/>'),
            ]
            infos = []
            for label, pattern in fields:
                match = re.findall(pattern, html)
                value = match[0] if match else "Missing"
                infos.append(label + ':' + value + '\n')
            # (info_school, info_xb, info_dq, info_sr, info_jjie)
            return tuple(infos)
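
The two download helpers below hinge on one trick visible in their regexes: weibo.cn serves thumbnails under a "wap180/" (single image) or "thumb180/" (group post) path segment, and swapping that segment for "large/" points at the original-size file. A minimal sketch of the rewrite, assuming a sinaimg-style URL (the sample value is hypothetical):

    thumb = "https://wx1.sinaimg.cn/wap180/abc123.jpg"  # hypothetical thumbnail URL
    prefix, fname = thumb.split("wap180/")
    full = prefix + "large/" + fname                    # -> .../large/abc123.jpg
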
    
    # Download the single image in a post: the thumbnail URL contains a
    # "wap180/" path segment; swapping it for "large/" gives the original.
    def one(html, save_dir):
        prefix = re.findall(r'src="(.+?)wap180/.+?"/></a>', html)[0]
        fname = re.findall(r'src=".+?/wap180/(.+?)"/></a>', html)[0]
        url = prefix + "large/" + fname
        print(url)
        # fname already carries its extension, so no extra ".jpg" is appended
        request.urlretrieve(url, os.path.join(save_dir, fname))
    
    # Download every image in a multi-image ("组图") post: follow the group
    # link, collect the thumbnails next to the full-size ("原图") links, and
    # rewrite each URL from "thumb180/" to "large/".
    def group(html, save_dir):
        links = re.findall(r'<(a href=".+?">.+?)</a>', html)
        for link in links:
            if u'组图' in link:
                gro_url = re.findall(r'a href="(https.+?)">', link)[0]
                print(gro_url)
                gro_html, gro_soup = Weibo(gro_url).getUrl()
                img_urls = re.findall(r'img src="(http.+?)".+?原图', gro_html)
                for u in img_urls:
                    prefix = re.findall(r'^(.+?)thumb180/', u)[0]
                    fname = re.findall(r'/thumb180/(.+?)$', u)[0]
                    full_url = prefix + "large/" + fname
                    print(full_url)
                    request.urlretrieve(full_url, os.path.join(save_dir, fname))
                    time.sleep(1)
            time.sleep(1)
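
request.urlretrieve sends no custom headers and has no timeout, so the image host can reject or stall these downloads. A hedged alternative sketch using requests (save_image is a helper name introduced here, not part of the original script):

    def save_image(img_url, path):
        # Reuse the same headers/cookies as the page requests and write the
        # bytes ourselves; the 10-second timeout is an arbitrary choice.
        resp = requests.get(img_url, headers=headers, cookies=cookies, timeout=10)
        resp.raise_for_status()
        with open(path, 'wb') as f:
            f.write(resp.content)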
    
    # Scrape one user's profile: page count plus basic and detailed info,
    # written to <name>.txt inside the per-user download directory.
    def getInfo(url):
        basic = Weibo(url)
        page = basic.getPage()
        name, fans, info_url, people_dir = basic.getBasicInfo()
        xx, xb, dq, sr, jjie = Weibo(info_url).getDetailInfo()
        file = os.path.join(people_dir, name + ".txt")
        with open(file, 'w', encoding='utf-8') as fo:
            fo.write("昵称:" + name + '\n')
            fo.write(xb)
            fo.write(sr)
            fo.write("粉丝:" + fans + '\n')
            fo.write(xx)
            fo.write(dq)
            fo.write(jjie)
            fo.write("目录:" + people_dir + '\n')
        print(name + ": Info write done!")
        return page, people_dir
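
A quick way to exercise getInfo on its own, mirroring how getWeibo calls it below; the profile URL is a hypothetical placeholder:

    pages, save_dir = getInfo("https://weibo.cn/u/1234567890?page=")
    print(pages, save_dir)  # total page count and the per-user directory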
    
    # Print the timestamp of the most recent non-pinned post: each post's
    # time lives in a <span class="ct"> element.
    def getLastWeiboTime(url):
        time_html, time_soup = Weibo(url).getUrl()
        wb_list = time_soup.find_all('div', class_="c")
        time_list = []
        for wb in wb_list:
            weibo = str(wb)
            if u'置顶' not in weibo and u'赞' in weibo:
                real_time = re.findall(r'<span class="ct">(.+?)<', weibo)[0]
                time_list.append(real_time)
        if time_list:
            print(time_list[0])
    
    # Walk every page of the profile; for each post that links to an
    # original-size image ("原图") and is not a repost, download its images.
    def getWeibo(ori):
        url = ori + "?page="
        pages, save_dir = getInfo(url)
        for p in range(1, int(pages) + 1):
            cur_url = url + str(p)
            print("Page " + str(p))
            try:
                page_html, page_soup = Weibo(cur_url).getUrl()
                wbs = page_soup.find_all('div', class_="c")
                for w in wbs:
                    con = str(w)
                    if u'原图' in con and u'转发了' not in con and u'转发理由' not in con:
                        if u'组图' in con:
                            print("Group post")
                            group(con, save_dir)
                        else:
                            print("Single image")
                            one(con, save_dir)
                        time.sleep(1)
            except Exception:
                time.sleep(1)
                continue
        print("Img downloads done!")
    
    oris = [""]
    for ori in oris:
        getWeibo(ori)
        getLastWeiboTime(ori)
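
To run the script, fill in a real cookie string and at least one profile URL; both values in this sketch are hypothetical placeholders:

    cookies = {"cookie": "_T_WM=...; SUB=..."}  # hypothetical: paste your own weibo.cn cookie
    oris = ["https://weibo.cn/u/1234567890"]    # hypothetical profile URL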
