from selenium import webdriver
import xlrd
import xlwt
from xlutils.copy import copy
import requests
from lxml import html
import time
import re
etree = html.etree

# Two separate Chrome sessions are started at import time:
#   driver     - logs in below and is used for the weibo.cn timeline pages
#   driver_uid - used by get_uid() for the s.weibo.com user search
driver = webdriver.Chrome()
driver_uid = webdriver.Chrome()

# Simulated login to the mobile Weibo site.
print(u'登陆新浪微博手机端...')
login_url = 'https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=https%3A%2F%2Fm.weibo.cn%2F'
driver.get(login_url)
time.sleep(2)

# NOTE(review): the account name and password are hard-coded below; consider
# loading them from the environment or a config file kept out of version
# control.
# find_element("id", ...) is used instead of find_element_by_id(...), which
# was removed in Selenium 4.3; the generic form works on Selenium 3 and 4.
username = driver.find_element("id", "loginName")
time.sleep(2)
username.clear()
username.send_keys('135******49')

password = driver.find_element("id", 'loginPassword')
time.sleep(1)
password.send_keys('1980897959')

# Submit the login form.
driver.find_element("id", "loginAction").click()

# The 15 second pause is deliberate: after the click Weibo may show a
# pattern captcha, which is solved by hand in the browser window.
time.sleep(15)
print("登陆成功!")
def get_response(url, xpath_way):
    """Open *url* in the shared ``driver`` and evaluate *xpath_way* on it.

    Waits 3 seconds before navigating, parses the rendered page source
    with lxml and returns whatever the XPath expression yields.
    """
    time.sleep(3)
    driver.get(url)
    tree = etree.HTML(driver.page_source)
    return tree.xpath(xpath_way)
# Step 1: open the spreadsheet and fetch the nickname column.
def read_excel(file_way):
    """Return all values of column index 3 from the first sheet of *file_way*.

    The caller treats the returned list as the nickname column.
    """
    first_sheet = xlrd.open_workbook(file_way).sheet_by_index(0)
    return first_sheet.col_values(3)
def write_excel_xls_append(file, value, new_file=None):
    """Copy workbook *file*, write *value* into column 7 and save the copy.

    Args:
        file: Path of the workbook to read (opened with xlrd).
        value: Sequence of cell values. ``value[j]`` is written to row ``j``
            of the first sheet for ``j >= 1``.
        new_file: Path the modified copy is saved to. When omitted, the
            module-level ``new_file`` variable is used, which is how the
            script's ``__main__`` section supplies it.

    Raises:
        ValueError: If no destination is given and no module-level
            ``new_file`` exists.
    """
    if new_file is None:
        # Legacy call path: the script assigns a global ``new_file`` before
        # calling this function with two arguments.
        new_file = globals().get("new_file")
        if new_file is None:
            raise ValueError(
                "no output path: pass new_file=... or define a module-level new_file"
            )

    workbook = xlrd.open_workbook(file)
    new_workbook = copy(workbook)  # xlrd book -> writable xlwt book
    new_worksheet = new_workbook.get_sheet(0)

    # NOTE(review): the loop starts at 1, so value[0] is never written and
    # rows are addressed absolutely (not appended after existing rows).
    # Kept as in the original; confirm this matches the sheet layout.
    for row in range(1, len(value)):
        new_worksheet.write(row, 7, value[row])

    new_workbook.save(new_file)
# Step 2: look up the uid for a nickname.
def get_uid(nickname):
    """Search s.weibo.com for *nickname* and return the matching uid.

    The search page is loaded in ``driver_uid``. The first result whose
    displayed name differs in length from *nickname* by at most one
    character is taken as the match.

    Args:
        nickname: Nickname cell value; converted to ``str`` and stripped.

    Returns:
        The ``uid`` attribute of the matched result, or ``0`` when the
        nickname is blank, nothing matches, or the matched result has no
        uid attribute.
    """
    from urllib.parse import quote

    # Spreadsheet cells may hold non-string values; normalise before use.
    nickname = str(nickname).strip()
    if not nickname:
        # A blank query would otherwise "match" any one-character name.
        return 0

    time.sleep(2)
    name_xpath = "//div[@class='info']//a[@class = 'name']"
    uid_xpath = "//div[@class='info']//a[@href='javascript:void(0);']/@uid"
    # Percent-encode the nickname so characters such as '&' or '#' cannot
    # break the query string.
    user_url = "https://s.weibo.com/user?q=" + quote(nickname) + "&Refer=weibo_user"

    driver_uid.get(user_url)
    tree = etree.HTML(driver_uid.page_source)
    name_nodes = tree.xpath(name_xpath)
    uids = tree.xpath(uid_xpath)

    for position, node in enumerate(name_nodes):
        shown_name = node.xpath('string(.)').strip()
        # NOTE(review): names are compared by length only (difference of at
        # most one character), not by content. Kept from the original.
        if abs(len(shown_name) - len(nickname)) < 2:
            # Guard against fewer uid attributes than name links.
            if position < len(uids):
                return uids[position]
            return 0
    return 0
# Step 4: check whether a post contains one of the tracked keywords.
def if_keywords(text):
    """Return 1 if *text* has a tracked topic tag and names the organiser.

    A post counts only when it contains at least one of the topic tags
    below and also the string "湖南第一师范学院团委".

    Full-width commas (U+FF0C) in *text* are normalised to ASCII commas
    before matching, so both spellings of the comma-bearing tags are
    recognised.

    Args:
        text: Plain text of one post.

    Returns:
        1 when the post qualifies, otherwise 0.
    """
    keywords = (
        '一周一乐[超话]',
        '#一周之始,始于周一#',
        '#一周一乐#',
        '#一师有你#',
        '#随手拍一师#',
        '#一师印象#',
        '#一师青年#',
        '#气节一师#',
        '#晚安,一师#',
        '#今日话题#',
        '#一师诗词苑#',
        '#早安,一师#',
    )
    required_mention = "湖南第一师范学院团委"

    normalized = text.replace("\uff0c", ",")
    if required_mention not in normalized:
        return 0
    return 1 if any(keyword in normalized for keyword in keywords) else 0
# Step 5: expand a post that is truncated behind a "全文" (full text) link.
def get_quanwen(now_url, j):
    """Return the full text of the *j*-th post on the page at *now_url*.

    Finds the "全文" link inside the j-th ``div.c`` block, follows it on
    weibo.cn and returns the stripped text of the first matching ``div``
    on the detail page.
    """
    time.sleep(4)
    link_xpath = (
        "//body//div[@class='c'][{}]//div[1]//a[contains(text(),'全文')]/@href"
    ).format(j)
    relative_href = get_response(now_url, link_xpath)[0]
    detail_url = "https://weibo.cn" + str(relative_href)
    body_nodes = get_response(detail_url, "//body/div[@class='c']//div[1]")
    return body_nodes[0].xpath('string(.)').strip()
# Step 6: check whether the uid has ever posted.
def if_fabu(uid):
    """Return 1 if page 1 of the user's timeline shows any timestamp, else 0."""
    first_page = "https://weibo.cn/u/" + str(uid) + "?page=1"
    timestamps = get_response(first_page, "//div//span[@class='ct']/text()")
    return 1 if timestamps else 0
# Step 7: write the scores into the Excel sheet.
def down_pint(counts, line):
    """Write *counts* into column 5 of the score workbook, starting at row *line*.

    The workbook is read, copied into a writable book, filled one row per
    entry of *counts*, and saved back under the same file name.
    """
    path = 'XX月XX学院微博话题互动加分表(1).xls'
    writable_book = copy(xlrd.open_workbook(path))
    sheet = writable_book.get_sheet(0)
    for row, score in enumerate(counts, line):
        sheet.write(row, 5, score)
    writable_book.save(path)
# Module-level state shared with the driver loop below.
# NOTE(review): the original indentation of this section was lost; the nesting
# below is reconstructed from the control flow and should be confirmed.
line = 0 # original note: "column count written to the sheet"; not read by the loop below
flag = 1  # set to 0 inside the per-post loop to request leaving the page loop
counts = []  # one score per processed nickname, in spreadsheet order
if __name__ == '__main__':
    nicknames = read_excel("体育学院12月微博话题互动加分表.xls")
    l = 1  # running counter, printed for progress only
    # The first two cells of the column are skipped.
    for nickname in nicknames[2:]:
        print("="*100)
        print(l)
        l = l+1
        count = 0  # score of the current user, +0.5 per qualifying post
        uid = get_uid(nickname)
        print(nickname)
        print(uid)
        if uid == 0:
            # get_uid returns 0 when no matching account was found.
            print("用户不存在")
            counts.append(0)
            continue
        if if_fabu(uid) == 0:
            # No timestamp on the first timeline page: nothing to score.
            print("用户未发送过微博")
            counts.append(0)
            continue
        # Walk the user's timeline page by page; i is the next page number
        # and doubles as the loop condition (0 stops the loop).
        i = 1
        time_end = 2  # int sentinel: never equal to a timestamp string on the first page
        while i:
            qw_count = 0  # index of the next "全文" link to follow on this page
            url_page = "https://weibo.cn/u/" + str(uid) + "?page=" + str(i)
            times_fabu_xpath = "//body/div[@class ='c']/div//span[@class = 'ct']/text()"
            texts_weibo_xpath = "//body/div[@class='c']"
            time.sleep(2)
            driver.get(url_page)
            texts_source = driver.page_source
            texts_source = etree.HTML(texts_source)
            times_fabu = texts_source.xpath(times_fabu_xpath)
            texts_weibo = texts_source.xpath(texts_weibo_xpath)
            i = i + 1
            overtime = 0  # number of too-old posts seen on this page
            # An empty page ends the crawl for this user.
            if len(times_fabu) == 0:
                i = 0;
                break;
            for j in range(len(times_fabu)):
                # Keep only the part before "来自" (the client/source suffix).
                time_fabu = times_fabu[j].split("来自")[0]
                # Stop when this page ends on the same timestamp as the
                # previous one (original note: "end of pages").
                # NOTE(review): time_end holds the split prefix while the
                # right-hand side is the raw text, so this may rarely be
                # equal - confirm the intended comparison.
                if time_end == times_fabu[len(times_fabu)-1]:
                    i = 0
                    break
                # Only leave after a second too-old post, so a pinned old
                # post does not end the scan early.
                # NOTE(review): dates are compared as strings; timestamps not
                # in "MM月DD日" form will not order chronologically - verify.
                if time_fabu < "11月15日":
                    overtime = overtime+1
                    if overtime >= 2:
                        flag = 0
                        i = 0;
                        break;
                # Hard cap on how many pages are scanned per user.
                if i > 25:
                    flag = 0
                    i = 0;
                    break;
                if time_fabu > "11月15日" and time_fabu < "12月16日":
                    # Replace the element with its plain text.
                    texts_weibo[j] = texts_weibo[j].xpath('string(.)').strip()
                    # Original author's note: known bug - with a "全文" link
                    # on the page, only the first one on that page is handled.
                    if "全文" in texts_weibo[j]:
                        # Follow the qw_count-th "全文" link of the page and
                        # use the detail page's text instead.
                        qw_url_xpath = "//body//div[@class='c']//div[1]//a[contains(text(),'全文')]"
                        qw_url = texts_source.xpath(qw_url_xpath)
                        qw_url = qw_url[qw_count].attrib
                        qw_url = qw_url['href']
                        qw_count = qw_count + 1
                        qw_url = "https://weibo.cn" + str(qw_url)
                        qw_url_xpath = "//body/div[@class='c']//div[1]"
                        quanwen_span = get_response(qw_url, qw_url_xpath)
                        quanwen = quanwen_span[0].xpath('string(.)').strip()
                        texts_weibo[j] = quanwen
                    if if_keywords(texts_weibo[j]):
                        count = count + 0.5
                        # NOTE(review): nesting reconstructed - as placed here
                        # this break only leaves the per-post loop, and the
                        # page loop continues; confirm against the original.
                        if count >= 2.5:
                            break
            # The per-post loop asked to stop: reset the flag for the next
            # user and leave the page loop.
            if flag == 0:
                flag = 1
                i = 0
                break
            # Remember the last timestamp handled on this page.
            time_end = time_fabu
        # Scores are capped at 2.5.
        if count > 2.5:
            count = 2.5
        print("得分:" + str(count))
        counts.append(str(count))
    print(counts)
    file = "12月体育学院微博话题互动加分表.xlsx"
    # write_excel_xls_append picks up this module-level new_file as its save path.
    new_file = "12月体育学院微博话题互动加分表.xls"
    write_excel_xls_append(file, counts)
# 网友评论  (scraped page footer, "reader comments" - not part of the script)