校园新浪微博话题检测爬虫工具

作者: 这样你就找不到我了 | 来源:发表于2020-02-19 21:37 被阅读0次

校园新浪微博话题检测爬虫工具
新浪微博爬虫
如何采集新浪微博数据？
仿新浪微博Android客户端
新浪微博爬虫代码
python大牛一个小时爬取新浪微博千万数据是这样做到的
猴子都能学会的20行代码登录微博
python-新浪爬虫之模拟登录
FMDB 离线缓存
微博爬虫开源项目汇总大全（长期更新、欢迎补充）

from selenium import webdriver
import xlrd
import xlwt
from xlutils.copy import copy
import requests
from lxml import html
import time
import re
# Alias for lxml's etree module, reached through the `lxml.html` import above.
etree = html.etree
# Two separate Chrome sessions are started as soon as the module is loaded:
# `driver` performs the login below and loads the weibo.cn pages, while
# `driver_uid` is used by get_uid() for the s.weibo.com user search.
driver = webdriver.Chrome()
driver_uid = webdriver.Chrome()

#  Simulated login
print(u'登陆新浪微博手机端...')
#  NOTE(review): the original comment here said "open the Firefox browser",
#  but the sessions created above are Chrome.
#  URL of the mobile login page
login_url = 'https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=https%3A%2F%2Fm.weibo.cn%2F'
driver.get(login_url)
time.sleep(2)
#  Locate the username field, clear whatever is in it, then type the account
username = driver.find_element_by_id("loginName")
time.sleep(2)
username.clear()
username.send_keys('135******49')
#  Locate the password field and type the password
#  NOTE(review): the credentials are hard-coded in the script; consider
#  reading them from the environment or a prompt instead.
password = driver.find_element_by_id('loginPassword')
time.sleep(1)
password.send_keys('1980897959')
#  Click the login button
driver.find_element_by_id("loginAction").click()
#  The 15-second pause is important: after the login click Weibo shows a
#  nine-grid pattern captcha. Solving it programmatically is troublesome
#  (the original author points to Cui Qingcai's Python book for an approach),
#  so it is completed by hand while the script waits here.
time.sleep(15)
#  NOTE(review): this message is printed unconditionally; nothing checks that
#  the login actually succeeded.
print("登陆成功！")


def get_response(url, xpath_way):
    """Open *url* in the shared browser and evaluate an XPath on the page.

    The function pauses three seconds before navigating, then parses the
    browser's current page source with ``etree.HTML``.

    Args:
        url: Address to load with the module-level ``driver``.
        xpath_way: XPath expression to evaluate against the parsed page.

    Returns:
        The result of evaluating ``xpath_way`` on the parsed document.
    """
    # Throttle before every navigation.
    time.sleep(3)
    driver.get(url)
    # Build the element tree from the rendered source, then query it.
    document = etree.HTML(driver.page_source)
    return document.xpath(xpath_way)

#  1，打开表格获取昵称


def read_excel(file_way):
    """Return all values of column index 3 from the workbook's first sheet.

    Args:
        file_way: Path of the workbook to open with ``xlrd``.

    Returns:
        The list produced by ``col_values(3)`` on the sheet at index 0; the
        calling script treats these values as user nicknames.
    """
    # The sheet is selected by position (index 0), not by name.
    first_sheet = xlrd.open_workbook(file_way).sheet_by_index(0)
    # Column index 3 is the fourth column of the sheet.
    return first_sheet.col_values(3)


def write_excel_xls_append(file, value):
    """Copy workbook *file*, write *value* into column index 7, and save it.

    The workbook is opened with ``xlrd``, converted to a writable copy, and
    the entries of ``value`` from index 1 onward are written into the first
    sheet at row ``j``, column 7. The copy is then saved to the module-level
    ``new_file`` path, which must be defined before this function is called.

    Args:
        file: Path of the existing workbook to read.
        value: Sequence of cell values; ``value[j]`` goes to row ``j``.

    Returns:
        None.
    """
    workbook = xlrd.open_workbook(file)
    # Convert the read-only xlrd workbook into a writable xlwt workbook.
    new_workbook = copy(workbook)
    new_worksheet = new_workbook.get_sheet(0)
    # NOTE(review): the loop starts at 1, so value[0] is never written, and
    # rows are addressed by the list index itself rather than being appended
    # after the rows already in the sheet (an earlier comment claimed the
    # latter). Confirm the intended row layout before changing this.
    for j in range(1, len(value)):
        new_worksheet.write(j, 7, value[j])
    # NOTE(review): `new_file` is a module-level global, not a parameter.
    new_workbook.save(new_file)


#  2，根据昵称获取uid


def get_uid(nickname):
    """Look up a Weibo uid by searching s.weibo.com for *nickname*.

    The user-search result page is loaded in the module-level ``driver_uid``
    browser. The first listed name whose length differs from the nickname's
    by at most one character is taken as the match, and the ``uid`` attribute
    at the same position in the result list is returned.

    Args:
        nickname: Nickname to search for. Non-string values (for example a
            numeric spreadsheet cell) are converted with ``str`` first.

    Returns:
        The matching ``uid`` attribute value, or ``0`` when the nickname is
        blank, no listed name matches, or no uid exists at the matching
        position.
    """
    from urllib.parse import quote

    nickname = str(nickname).strip()
    if not nickname:
        # Nothing to search for; skip the page load entirely.
        return 0
    time.sleep(2)
    name_xpath = "//div[@class='info']//a[@class = 'name']"
    uid_xpath = "//div[@class='info']//a[@href='javascript:void(0);']/@uid"
    # Percent-encode the nickname so characters such as '&' or '#' cannot
    # break the query string.
    user_url = "https://s.weibo.com/user?q=" + quote(nickname) + "&Refer=weibo_user"
    driver_uid.get(user_url)
    page = etree.HTML(driver_uid.page_source)
    # The uid list does not depend on the candidate, so fetch it once.
    uids = page.xpath(uid_xpath)
    for position, name_node in enumerate(page.xpath(name_xpath)):
        shown_name = name_node.xpath('string(.)').strip()
        # NOTE(review): only the lengths are compared, not the characters, so
        # any listed name of similar length is accepted as a match.
        if abs(len(shown_name) - len(nickname)) < 2:
            # Guard against the uid list being shorter than the name list.
            if position < len(uids):
                return uids[position]
            return 0
    return 0

#  4，检查是否包含关键字[]


def if_keywords(text):
    """Report whether a post counts as topic participation.

    A post counts when it contains at least one of the tracked topic tags
    and also contains the string "湖南第一师范学院团委".

    Args:
        text: Plain text of a post.

    Returns:
        ``1`` when both conditions hold, otherwise ``0``.
    """
    # Without the required account name no tag can make the post count.
    if "湖南第一师范学院团委" not in text:
        return 0
    topic_tags = (
        '一周一乐[超话]',
        '#一周之始，始于周一#',
        '#一周之始,始于周一#',
        '#一周一乐#',
        '#一师有你#',
        '#随手拍一师#',
        '#一师印象#',
        '#一师青年#',
        '#气节一师#',
        '#晚安,一师#',
        '#晚安，一师#',
        '#今日话题#',
        "#一师诗词苑#",
        "#早安，一师#",
        "#早安,一师#",
    )
    return 1 if any(tag in text for tag in topic_tags) else 0

#  5，处理包含“全文”


def get_quanwen(now_url, j):
    """Fetch the full text behind the "全文" link of the j-th post block.

    Args:
        now_url: URL of the listing page that contains the post.
        j: Position used in the XPath predicate selecting the
            ``div[@class='c']`` block on that page.

    Returns:
        The stripped string value of the first matching node on the
        full-text page.
    """
    time.sleep(4)
    # Find the href of the "全文" link inside the selected post block.
    link_xpath = (
        "//body//div[@class='c'][" + str(j)
        + "]//div[1]//a[contains(text(),'全文')]/@href"
    )
    relative_link = get_response(now_url, link_xpath)[0]
    # The href is site-relative, so prefix the host before loading it.
    full_page_url = "https://weibo.cn" + str(relative_link)
    body_nodes = get_response(full_page_url, "//body/div[@class='c']//div[1]")
    return body_nodes[0].xpath('string(.)').strip()


#  6，判断uid是否发送过微博


def if_fabu(uid):
    """Report whether the user's first profile page shows any post timestamp.

    Args:
        uid: User id placed into the ``https://weibo.cn/u/<uid>`` URL.

    Returns:
        ``1`` when at least one ``span[@class='ct']`` text node is found on
        page 1, otherwise ``0``.
    """
    first_page = "https://weibo.cn/u/" + str(uid) + "?page=1"
    timestamps = get_response(first_page, "//div//span[@class='ct']/text()")
    return 1 if timestamps else 0


#   7，将count写入excel
def down_pint(counts, line):
    """Write each score into column index 5 of the score sheet, then save.

    The workbook is opened, copied into a writable form, filled in, and
    saved back to the same file name.

    Args:
        counts: Iterable of values to write, one per row.
        line: Row index for the first value; later values go to the
            following rows.

    Returns:
        None.
    """
    book_path = r'XX月XX学院微博话题互动加分表(1).xls'
    # Work on a writable copy of the workbook that was read.
    writable_book = copy(xlrd.open_workbook(book_path))
    sheet = writable_book.get_sheet(0)
    for offset, score in enumerate(counts):
        sheet.write(line + offset, 5, score)
    writable_book.save(book_path)


# Driver script: for every nickname in the input sheet, resolve the uid, page
# through the user's weibo.cn timeline, and award 0.5 points per qualifying
# post (capped at 2.5). The collected scores are written out at the end.
line = 0 # position used when writing to the sheet (original note says "column count"); not read in the visible script
flag = 1  # set to 0 inside the post loop to request leaving the page loop
counts = []  # one score entry per processed nickname
if __name__ == '__main__':
    nicknames = read_excel("体育学院12月微博话题互动加分表.xls")
    l = 1  # running counter, printed for progress only
    # The first two column values are skipped.
    for nickname in nicknames[2:]:
        print("="*100)
        print(l)
        l = l+1
        count = 0
        uid = get_uid(nickname)
        print(nickname)
        print(uid)
        if uid == 0:
            print("用户不存在")
            counts.append(0)
            continue
        if if_fabu(uid) == 0:
            print("用户未发送过微博")
            counts.append(0)
            continue
        #  Fetch page i of the user's posts together with their timestamps
        i = 1
        # NOTE(review): time_end is only reassigned on a line that sits after
        # a `break` (see below), so it keeps this initial value.
        time_end = 2
        # i doubles as page number and loop sentinel: setting it to 0 ends the loop.
        while i:
            qw_count = 0  # index of the next "全文" link to follow on this page
            url_page = "https://weibo.cn/u/" + str(uid) + "?page=" + str(i)
            times_fabu_xpath = "//body/div[@class ='c']/div//span[@class = 'ct']/text()"
            # texts_weibo_xpath = "//body/div[@class='c']//div[1]"
            texts_weibo_xpath = "//body/div[@class='c']"

            time.sleep(2)
            driver.get(url_page)
            texts_source = driver.page_source
            texts_source = etree.HTML(texts_source)
            times_fabu = texts_source.xpath(times_fabu_xpath)
            texts_weibo = texts_source.xpath(texts_weibo_xpath)

            # print("第"+str(i)+"页")
            i = i + 1
            overtime = 0  # posts on this page that look older than the window
            #  Process the posts on the current page
            # No timestamps on the page: stop paging for this user.
            if len(times_fabu) == 0:
                i = 0;
                break;
            # NOTE(review): times_fabu and texts_weibo come from different
            # XPaths but are indexed with the same j — presumably they line up
            # one-to-one; verify against the page markup.
            for j in range(len(times_fabu)):
                # Keep only the part of the timestamp text before "来自".
                time_fabu = times_fabu[j].split("来自")[0]
                #print(time_end,time_fabu)

                # NOTE(review): intended as an end-of-pages check, but since
                # time_end stays 2 this comparison with a string is never true.
                if time_end == times_fabu[len(times_fabu)-1]:
                    # print("reached the end of the pages!")
                    i = 0
                    break
                #  Only quit after the cutoff is exceeded twice (a pinned post
                #  could otherwise trigger a wrong date decision)
                # NOTE(review): this is a plain string comparison, not a date
                # comparison — confirm it orders the site's timestamps as intended.
                if time_fabu < "11月15日":
                    # print("no posts for too long!")
                    overtime = overtime+1
                    if overtime >= 2:
                        flag = 0
                        i = 0;
                        break;

                # i was already incremented, so this triggers once page 25 has been fetched.
                if i > 25:
                    flag = 0
                    i = 0;
                    break;

                # Only posts inside the scoring window are examined (string comparison again).
                if time_fabu > "11月15日" and time_fabu < "12月16日":
                    texts_weibo[j] = texts_weibo[j].xpath('string(.)').strip()
                    # print(texts_weibo[j])
                    #  Known bug (from the original author): if a page contains
                    #  several "全文" links, this approach only handles the
                    #  first one on that page.
                    if "全文" in texts_weibo[j]:
                        # qw_url_xpath = "//body//div[@class='c'][" + str(qw_count) + "]//div[1]//a[contains(text(),'全文')]//@href"
                        qw_url_xpath = "//body//div[@class='c']//div[1]//a[contains(text(),'全文')]"
                        qw_url = texts_source.xpath(qw_url_xpath)
                        # Pick the qw_count-th "全文" link found on the whole page.
                        qw_url = qw_url[qw_count].attrib
                        qw_url = qw_url['href']
                        qw_count = qw_count + 1
                        qw_url = "https://weibo.cn" + str(qw_url)
                        qw_url_xpath = "//body/div[@class='c']//div[1]"
                        quanwen_span = get_response(qw_url, qw_url_xpath)
                        quanwen = quanwen_span[0].xpath('string(.)').strip()
                        # Replace the truncated text with the full text.
                        texts_weibo[j] = quanwen
                        # texts_weibo[j] = get_quanwen("https://weibo.cn/u/" + str(uid) + "?page=" + str(i-1), j+1)
                    # print(texts_weibo[j])
                    if if_keywords(texts_weibo[j]):
                        count = count + 0.5
                        # NOTE(review): this break leaves only the post loop;
                        # the page loop goes on to fetch the next page.
                        if count >= 2.5:
                            break
            # A break requested via flag also ends the page loop; flag is reset for the next user.
            if flag == 0:
                flag = 1
                i = 0
                break
                # NOTE(review): unreachable — this assignment follows the break above.
                time_end = time_fabu
        # Cap the score at 2.5.
        if count > 2.5:
            count = 2.5
        print("得分:" + str(count))
        # NOTE(review): scores are appended as strings here but as the int 0
        # in the early-exit branches above.
        counts.append(str(count))

# NOTE(review): the statements below sit outside the __main__ guard, so they
# also run when the module is imported (with an empty counts list).
print(counts)
file = "12月体育学院微博话题互动加分表.xlsx"
new_file = "12月体育学院微博话题互动加分表.xls"  # read as a global by write_excel_xls_append
write_excel_xls_append(file, counts)

校园新浪微博话题检测爬虫工具
新浪微博爬虫
layout: post title: 新浪微博爬虫 categories: Spider description: 微...
如何采集新浪微博数据？
本文主要介绍神箭手“新浪微博采集爬虫”（以下简称“微博爬虫”）的使用教程以及注意事项。新浪微博中有大量高价值的软...
仿新浪微博Android客户端
仿新浪微博客户端 github地址新浪微博开放平台API微博SDK 微博正则处理(@、表情、话题##、url正则处...
新浪微博爬虫代码
整理一下文章《[数据分析] 简书在微博上的分享情况》的代码。微博爬虫的难点：登录问题之前我用了两种方式：1...
python大牛一个小时爬取新浪微博千万数据是这样做到的
爬虫功能：此项目和QQ空间爬虫类似，主要爬取新浪微博用户的个人信息、微博信息、粉丝和关注（详细见此）。代码获取...
猴子都能学会的20行代码登录微博
猴子都能学会的20行代码登录微博如何登录新浪微博是令许多数据新手（包括我）头疼的大问题。由于新浪的反爬虫策略，网...
python-新浪爬虫之模拟登录
好了，现在讲述针对Ajax异步请求内容的爬虫实例，以新浪微博为例。首先，新浪微博与前面讲述的两个网站不同的是，需登...
FMDB 离线缓存
以新浪微博加载微博的工具类为例： @implementation HWStatusTool static FMDa...
微博爬虫开源项目汇总大全（长期更新、欢迎补充）
SinaSpider- 基于scrapy和redis的分布式微博爬虫。SinaSpider主要爬取新浪微博的个人信...