import requests,time
from lxml import etree
from pymongo import MongoClient
def requestGet(url, timeout=10):
    """Download *url* and return it parsed as an lxml HTML tree.

    The request is sent with the module-level ``headers`` dict (which
    carries the browser User-Agent set up by the script entry point).

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single stalled connection would block the crawl
            forever.

    Returns:
        The value of ``etree.HTML`` applied to the response text; callers
        run XPath queries on it.

    Raises:
        requests.exceptions.RequestException: on connection problems or
            when the timeout expires. Callers in this file catch
            ``Exception`` and report it.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    return etree.HTML(r.text)
def get_fens_info(url, fens_count, per_page=9, max_pages=100):
    """Crawl an author's follower list and store each follower in MongoDB.

    Every follower entry found is written to the module-level ``posts``
    collection as ``{'name': ..., 'text': ...}``.

    Args:
        url: URL of the follower list; ``?page=N`` is appended per page.
        fens_count: Total follower count (int or numeric string). It is
            used to work out how many list pages exist.
        per_page: Number of followers shown on one list page.
        max_pages: Highest page number that is requested. According to
            the original author's note the site only serves data for the
            first 100 follower pages.

    Returns:
        None. Any exception (bad count, network error, parse error) is
        caught and reported on stdout instead of being propagated.
    """
    try:
        total = int(fens_count)
        # Exclusive upper bound for range(): ceil(total / per_page) pages,
        # capped at max_pages, plus one because range() excludes its end.
        fens_page = min(-(-total // per_page), max_pages) + 1
        print(fens_page)
        for page in range(1, fens_page):
            print(page)
            select = requestGet(url + '?page={}'.format(page))
            infos = select.xpath('//div[@id="list-container"]//div[@class="info"]')
            if not infos:
                # An empty page means the list is exhausted; later pages
                # would only cost more requests.
                break
            for info in infos:
                names = info.xpath('a/text()')
                if not names:
                    # Skip a malformed entry rather than aborting the
                    # whole author because of one missing name.
                    continue
                name = names[0]
                text = info.xpath('div/text()')
                print(name, text)
                # Store the follower in the collection.
                posts.insert_one({
                    'name': name,
                    'text': text,
                })
    except Exception as e:
        print('get_fens_info函数解析错误 错误为:', e)
# Collect the name and profile URL of every Jianshu recommended author.
def get_recommend_author_name():
    """Walk the recommended-authors listing page by page.

    Pages are requested from ``base_url`` followed by the page number,
    starting at 1, until a page yields no author entries. For each entry
    the author's name is printed and ``get_recommend_author_info`` is
    called with the name and the profile URL (``jianshu`` + the link's
    href).

    Any exception raised along the way is caught and reported on stdout,
    which also ends the walk.
    """
    try:
        page_index = 1
        while True:
            listing = requestGet(base_url + str(page_index))
            authors = listing.xpath('//div[@class="wrap"][position()>0]')
            print(page_index)
            if not authors:
                # No entries on this page: the listing is exhausted.
                break
            page_index += 1
            for author in authors:
                name = author.xpath('a/h4/text()')[0]
                print(name)
                profile_url = jianshu + author.xpath('a/@href')[0]
                get_recommend_author_info(name, profile_url)
    except Exception as e:
        print("get_recommend_author_name函数解析错误 错误为 ", e)
# Read an author's following / follower / article / word / like counts.
def get_recommend_author_info(name, url):
    """Scrape the counters on an author's profile, then crawl the followers.

    The profile page at *url* is fetched and its ``meta-block`` divs are
    read in order: following count, follower count (plus the link to the
    follower list), article count, word count and like count. The five
    counts are printed and ``get_fens_info`` is called with the follower
    list URL and the follower count.

    Args:
        name: Author name supplied by the caller; not used in the body.
        url: Absolute URL of the author's profile page.

    Returns:
        True when the page was processed without an exception; None when
        an exception was caught (it is reported on stdout).
    """
    try:
        meta_blocks = requestGet(url).xpath('//div[@class="meta-block"][position()>0]')

        following = meta_blocks[0].xpath('a/p/text()')[0]

        follower_block = meta_blocks[1]
        followers = follower_block.xpath('a/p/text()')[0]
        followers_url = jianshu + follower_block.xpath('a/@href')[0]

        articles = meta_blocks[2].xpath('a/p/text()')[0]
        # The last two blocks are plain text, not links.
        words = meta_blocks[3].xpath('p/text()')[0]
        likes = meta_blocks[4].xpath('p/text()')[0]

        print(following, followers, articles, words, likes)
        get_fens_info(followers_url, followers)
    except Exception as e:
        print('get_recommend_author_info函数解析错误 错误为:',e)
    else:
        return True
# Script entry point: sets up the globals the crawler functions read
# (jianshu, headers, base_url, posts), runs the crawl and prints the
# elapsed time.
if __name__ == "__main__":
    jianshu = 'http://www.jianshu.com'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    headers = {
        'User-Agent': user_agent
    }
    base_url = 'http://www.jianshu.com/recommendations/users?page='
    start = time.time()
    # Connect once to the local MongoDB server. The original created a
    # default client and immediately replaced it with this one, leaving
    # the first client unclosed.
    # A MongoDB URI works as well: MongoClient('mongodb://localhost:27017')
    client = MongoClient('localhost', 27017)
    try:
        # Use the fens_db database.
        db = client.fens_db
        # "posts" is the collection the follower records are written to.
        posts = db.posts
        get_recommend_author_name()
    finally:
        # Release the connection even if the crawl is interrupted.
        client.close()
    end = time.time()
    print("总耗时 %0.3f" % (end - start))
# [Web-page residue] screenshot file name: 屏幕快照 2017-08-15 上午9.00.12.png
# [Web-page residue] section heading: 网友评论 (reader comments)