A Campus Sina Weibo Topic-Detection Crawler Tool

Author: 这样你就找不到我了 | Published 2020-02-19 21:37
    from selenium import webdriver
    import xlrd
    from xlutils.copy import copy
    from lxml import etree
    import time

    #  Two browser instances: `driver` holds the logged-in weibo.cn session,
    #  while `driver_uid` is used separately for nickname -> uid lookups on s.weibo.com
    driver = webdriver.Chrome()
    driver_uid = webdriver.Chrome()
    
    #  Simulated login
    print('Logging in to the Sina Weibo mobile site...')
    #  Open the mobile login page in Chrome
    login_url = 'https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=https%3A%2F%2Fm.weibo.cn%2F'
    driver.get(login_url)
    time.sleep(2)
    #  Locate the username field, clear it, then type in the account name
    username = driver.find_element_by_id("loginName")
    time.sleep(2)
    username.clear()
    username.send_keys('135******49')
    #  Locate the password field and type in the password
    password = driver.find_element_by_id('loginPassword')
    time.sleep(1)
    password.send_keys('1980897959')
    #  Click the login button
    driver.find_element_by_id("loginAction").click()
    #  This 15-second pause matters: after clicking login, Weibo may show a
    #  nine-grid pattern CAPTCHA. Solving it programmatically is tricky (Cui
    #  Qingcai's Python book covers one approach), so it is solved by hand here.
    time.sleep(15)
    print("Logged in!")
    
    
    def get_response(url, xpath_way):
        time.sleep(3)
        driver.get(url)
        texts = driver.page_source
        #  Build the element tree
        texts = etree.HTML(texts)
        #  Query it with the given XPath
        text = texts.xpath(xpath_way)
        return text
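    #  Example, with an illustrative uid:
    #      get_response("https://weibo.cn/u/1234567890?page=1", "//div//span[@class='ct']/text()")
    #  returns the list of nodes/strings matched by the XPath on that page.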
    
    #  1. Open the spreadsheet and read the nicknames


    def read_excel(file_way):
        workbook = xlrd.open_workbook(file_way)
        #  Select the sheet to operate on, by index or by name
        worksheet1 = workbook.sheet_by_index(0)
        #  Read an entire column as a list (index 3 = the fourth column)
        cols = worksheet1.col_values(3)
        return cols
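    #  The fourth column holds the nicknames; its first two cells are apparently
    #  header rows, hence the nicknames[2:] slice in the main loop below.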
    
    
    def write_excel_xls_append(file, new_file, value):
        workbook = xlrd.open_workbook(file)  # open the source workbook
        new_workbook = copy(workbook)  # convert the xlrd object into an xlwt one
        new_worksheet = new_workbook.get_sheet(0)  # first sheet of the copy
        for j in range(1, len(value)):
            #  Write value[j] into row j, column 7 (row 0, the header, is skipped)
            new_worksheet.write(j, 7, value[j])
        new_workbook.save(new_file)  # save the result under the new file name
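    #  Note: xlrd 2.x dropped support for .xlsx files, so reading the .xlsx
    #  input used below likely requires an older release such as xlrd 1.2.0.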
    
    
    #  2. Look up a user's uid from a nickname


    def get_uid(nickname):
        nickname = nickname.strip()
        time.sleep(2)
        name_xpath = "//div[@class='info']//a[@class = 'name']"
        user_url = "https://s.weibo.com/user?q=" + str(nickname) + "&Refer=weibo_user"
        user_xpath = "//div[@class='info']//a[@href='javascript:void(0);']/@uid"
        driver_uid.get(user_url)
        texts = driver_uid.page_source
        texts = etree.HTML(texts)
        names_ = texts.xpath(name_xpath)
        for id_count in range(len(names_)):
            name_ = names_[id_count].xpath('string(.)').strip()
            #  Treat names whose lengths differ by at most one character as a match
            if abs(len(name_) - len(nickname)) < 2:
                uid = texts.xpath(user_xpath)
                if len(uid):
                    return uid[id_count]
                else:
                    return 0
        return 0
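    #  Example (illustrative nickname): get_uid("张三") returns the uid string
    #  scraped from the search results, or 0 when no plausible match is found.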
    
    #  4. Check whether a post contains one of the topic keywords


    def if_keywords(text):
        keywords = ['一周一乐[超话]','#一周之始,始于周一#','#一周之始,始于周一#', '#一周一乐#', '#一师有你#', '#随手拍一师#', '#一师印象#', '#一师青年#', '#气节一师#','#晚安,一师#', '#晚安,一师#', '#今日话题#', "#一师诗词苑#", "#早安,一师#", "#早安,一师#"]
        for keyword in keywords:
            if keyword in text:
                #  Only count posts that also mention the school's Youth League account
                if "湖南第一师范学院团委" in text:
                    return 1
                else:
                    return 0
        return 0
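    #  Example: a post containing "#一周一乐#" counts only if it also mentions
    #  "湖南第一师范学院团委", e.g.
    #      if_keywords("#一周一乐# @湖南第一师范学院团委 打卡")  ->  1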
    
    #  5. Expand posts truncated with a "全文" (full text) link


    def get_quanwen(now_url, j):
        time.sleep(4)
        qw_url_xpath = "//body//div[@class='c'][" + str(j) + "]//div[1]//a[contains(text(),'全文')]/@href"
        qw_url = get_response(now_url, qw_url_xpath)[0]
        qw_url = "https://weibo.cn" + str(qw_url)
        qw_url_xpath = "//body/div[@class='c']//div[1]"
        quanwen_span = get_response(qw_url, qw_url_xpath)
        quanwen = quanwen_span[0].xpath('string(.)').strip()
        return quanwen
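    #  Note: the main loop below expands "全文" links inline (see its qw_count
    #  logic) rather than calling get_quanwen, which can only expand the first
    #  truncated post on a page.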
    
    
    #  6. Check whether the uid has ever posted


    def if_fabu(uid):
        URL = "https://weibo.cn/u/" + str(uid) + "?page=1"
        respond = get_response(URL, "//div//span[@class='ct']/text()")
        if respond:
            return 1
        else:
            return 0
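    #  Example, with an illustrative uid: if_fabu("1234567890") returns 1 when
    #  the first page of the profile shows any post timestamps, else 0.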
    
    
    #  7. Write the scores into the Excel sheet (defined but never called below;
    #  write_excel_xls_append is used instead)
    def down_point(counts, line):
        workbook = xlrd.open_workbook(r'XX月XX学院微博话题互动加分表(1).xls')
        #  Work on a copy of the workbook
        workbooknew = copy(workbook)
        worksheet = workbooknew.get_sheet(0)
        for count in counts:
            worksheet.write(line, 5, count)
            line = line + 1
        workbooknew.save('XX月XX学院微博话题互动加分表(1).xls')
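    #  Example (illustrative): down_point(["0.5", "1.5"], 3) would write the two
    #  scores into column index 5 (the sixth column), rows 3 and 4.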
    
    
    line = 0  # starting row index for writing results
    flag = 1
    counts = []
    if __name__ == '__main__':
        nicknames = read_excel("体育学院12月微博话题互动加分表.xls")
        l = 1
        for nickname in nicknames[2:]:
            print("="*100)
            print(l)
            l = l+1
            count = 0
            uid = get_uid(nickname)
            print(nickname)
            print(uid)
            if uid == 0:
                print("User not found")
                counts.append(0)
                continue
            if if_fabu(uid) == 0:
                print("User has never posted")
                counts.append(0)
                continue
            #  Crawl the user's posts page by page, with their publish times
            i = 1
            time_end = 2  # sentinel; later holds the previous page's last timestamp
            while i:
                qw_count = 0
                url_page = "https://weibo.cn/u/" + str(uid) + "?page=" + str(i)
                times_fabu_xpath = "//body/div[@class ='c']/div//span[@class = 'ct']/text()"
                texts_weibo_xpath = "//body/div[@class='c']"
    
                time.sleep(2)
                driver.get(url_page)
                texts_source = driver.page_source
                texts_source = etree.HTML(texts_source)
                times_fabu = texts_source.xpath(times_fabu_xpath)
                texts_weibo = texts_source.xpath(texts_weibo_xpath)
    
                # print("第"+str(i)+"页")
                i = i + 1
                overtime = 0
                #  处理当前页的微博
                if len(times_fabu) == 0:
                    i = 0;
                    break;
                for j in range(len(times_fabu)):
                    #  Raw timestamps look like "12月10日 12:00 来自..."; keep the date part
                    time_fabu = times_fabu[j].split("来自")[0]

                    #  If this page's last timestamp matches the previous page's,
                    #  weibo.cn is repeating its final page: stop crawling this user
                    if time_end == times_fabu[len(times_fabu) - 1]:
                        i = 0
                        break
                    #  Require two too-old posts before giving up, so a pinned old
                    #  post at the top cannot end the crawl prematurely
                    if time_fabu < "11月15日":
                        overtime = overtime + 1
                        if overtime >= 2:
                            flag = 0
                            i = 0
                            break
    
                    if i > 25:
                        flag = 0
                        i = 0
                        break

                    #  Note: these are plain string comparisons on dates such as
                    #  "11月15日"; they only hold within this Nov-Dec window
                    if time_fabu > "11月15日" and time_fabu < "12月16日":
                        texts_weibo[j] = texts_weibo[j].xpath('string(.)').strip()
                        #  Expand truncated posts: get_quanwen above only handles the
                        #  first "全文" link on a page, so collect every "全文" link
                        #  here and index into them with qw_count
                        if "全文" in texts_weibo[j]:
                            qw_url_xpath = "//body//div[@class='c']//div[1]//a[contains(text(),'全文')]"
                            qw_url = texts_source.xpath(qw_url_xpath)
                            qw_url = qw_url[qw_count].attrib
                            qw_url = qw_url['href']
                            qw_count = qw_count + 1
                            qw_url = "https://weibo.cn" + str(qw_url)
                            qw_url_xpath = "//body/div[@class='c']//div[1]"
                            quanwen_span = get_response(qw_url, qw_url_xpath)
                            quanwen = quanwen_span[0].xpath('string(.)').strip()
                            texts_weibo[j] = quanwen
                        if if_keywords(texts_weibo[j]):
                            count = count + 0.5
                            if count >= 2.5:
                                break
                if flag == 0:
                    flag = 1
                    i = 0
                    break
                #  Remember this page's last timestamp for the repeated-page check
                time_end = times_fabu[len(times_fabu) - 1]
            if count > 2.5:
                count = 2.5
            print("Score: " + str(count))
            counts.append(str(count))
    
    print(counts)
    file = "12月体育学院微博话题互动加分表.xlsx"
    new_file = "12月体育学院微博话题互动加分表.xls"
    write_excel_xls_append(file, new_file, counts)
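
One fragile spot worth noting: the script compares dates such as "11月15日" as raw strings. That happens to work for this November-to-December window, but lexicographic order breaks across months of different widths (for example, "9月1日" sorts after "11月15日"). A safer approach is to parse the timestamp into a real date first. A minimal sketch, assuming the "X月X日 HH:MM 来自..." format seen above (the helper name and the year handling are illustrative):

    import re
    from datetime import date, datetime

    def parse_fabu_date(raw):
        #  Turn "11月15日 18:02 来自..." into date(year, 11, 15); relative forms
        #  such as "今天 12:00" or "5分钟前" fall through to today's date
        text = raw.split("来自")[0].strip()
        m = re.match(r"(\d{1,2})月(\d{1,2})日", text)
        if m:
            #  Assumes the post is from the current year
            return date(datetime.now().year, int(m.group(1)), int(m.group(2)))
        return date.today()

    #  The string comparisons in the main loop would then become, e.g.:
    #      if parse_fabu_date(times_fabu[j]) < date(2019, 11, 15): ...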
    
    
