# Source notebook: https://github.com/dingtom/python/blob/master/%E7%88%AC%E5%8F%96%E7%9F%A5%E4%B9%8E%E9%97%AE%E9%A2%98.ipynb
# Zhihu API v4 notes: https://www.jianshu.com/p/86139ab70b86
# limit: page size, 20 at most
# offset: start position, zero-based
# sort_by: {default, created} -- default ranking or sort by creation time
# include: extra fields to return, including is_normal, is_sticky, collapsed_by, suggest_edit,
#   comment_count, collapsed_counts, reviewing_comments_count, can_comment, content,
#   editable_content, voteup_count, reshipment_settings, comment_permission, mark_infos,
#   created_time, updated_time, relationship.is_author, voting, is_thanked, is_nothelp,
#   upvoted_followees; author.is_blocking, is_blocked, is_followed, voteup_count,
#   message_thread_token, badge[?(type=best_answerer)].topics

import json
import random
import time

import pandas as pd
import requests
from fake_useragent import UserAgent
url = 'https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset={}&limit=20&sort_by=default&platform=desktop'
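# A minimal sketch (not in the original notebook): fetch a single page with a
# throwaway User-Agent and inspect the paging metadata that the crawl loop below
# relies on. The question id 275359100 is the one used later in this notebook.
demo = requests.get(url.format(275359100, 0),
                    headers={'User-Agent': 'Mozilla/5.0'}, timeout=10).json()
print(demo['paging']['is_end'])  # True once the last page has been reached
print(len(demo['data']))         # at most `limit` (20) answers per page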
ua = UserAgent(verify_ssl=False)
headers = {
    'User-Agent': ua.random,
    # Static fallback if fake_useragent is unavailable:
    # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
}
def parse_page(url, headers):  # , proxy):
    resp = requests.get(url, headers=headers, timeout=10)  # , proxies=proxy)
    bs = json.loads(resp.text)
    # print(bs.keys())  # ['ad_info', 'data', 'paging']
    # Each item of bs['data'] has keys:
    # ['id', 'type', 'answer_type', 'question', 'author', 'url', 'is_collapsed',
    #  'created_time', 'updated_time', 'extras', 'is_copyable', 'is_normal', 'voteup_count',
    #  'comment_count', 'is_sticky', 'admin_closed_comment', 'comment_permission', 'can_comment',
    #  'reshipment_settings', 'content', 'editable_content', 'excerpt', 'collapsed_by', 'collapse_reason',
    #  'annotation_action', 'mark_infos', 'relevant_info', 'suggest_edit', 'is_labeled', 'reward_info',
    #  'relationship', 'ad_answer']
    result = pd.DataFrame()
    for i in bs['data']:
        headline = i['author']['headline']              # signature line
        gender = i['author']['gender']                  # gender
        user_type = i['author']['user_type']            # user type
        user_id = i['author']['id']
        user_token = i['author']['url_token']
        follower_count = i['author']['follower_count']  # number of followers
        name = i['author']['name']                      # user nickname
        vote_up = i['voteup_count']                     # upvote count
        updated_time = i['updated_time']                # last-updated time
        title = i['question']['title']                  # question title
        created_time = i['created_time']                # creation time
        comment_count = i['comment_count']              # comment count
        can_comment = i['can_comment']['status']        # whether commenting is allowed
        content = i['content']                          # answer body (HTML, still needs cleaning)
        cache = pd.DataFrame({  # 'user_id': [user_id],
            'name': [name],
            'gender': [gender],
            # 'token': [user_token],
            # 'user_type': [user_type],
            'headline': [headline],
            'follower_count': [follower_count],
            'created_time': [created_time],
            'updated_time': [updated_time],
            'comment_count': [comment_count],
            'voteup_count': [vote_up],
            # 'can_comment': [can_comment],
            'content': [content],
            # 'question': [title],
        })
        result = pd.concat([result, cache])
    return result, bs['paging']['is_end']
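# The `content` field collected above is raw answer HTML. A minimal cleaning
# sketch using BeautifulSoup (an assumption; the original notebook does not show
# its cleaning step):
from bs4 import BeautifulSoup

def clean_content(raw_html):
    # Strip tags and collapse the answer HTML down to plain text.
    return BeautifulSoup(raw_html, 'html.parser').get_text(separator='\n').strip()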
# # Alternative proxy IPs; each proxy's scheme must match the scheme of the request URL.
# proxy_list = [
# {"https": '112.115.57.20:3128'},
# {'https': '121.41.171.223:3128'},
# {"https": '223.242.224.226:9999'},
# # {"https": '183.166.70.169:9999'},
# {"https": '115.221.242.206:9999'},
# #{"https": '163.204.245.77:9999'},
# {"https": '113.195.18.226:9999'},
# {"https": '183.166.97.125:9999'},
# {"https": '182.34.32.91:9999'},
# {"https": '182.34.33.167:9999'},
# {"https": '182.34.34.116:9999'},
# {"https": '113.195.19.47:9999'},
# {"https": '110.243.24.126:9999'},
# {"https": '58.220.95.40:10174'},
# {"https": '113.124.94.252:9999'},
# {"https": '182.34.37.228:9999'},
# {"https": '163.204.240.55:9999'},
# {"https": '183.166.111.38:9999'},
# {"https": '183.166.133.225:9999'},
# {"https": '117.69.151.7:9999'},
# {"https": '182.149.82.27:9999'},
# {"https": '183.166.139.207:9999'},
# ]
# # Pick a proxy IP at random.
# proxy = random.choice(proxy_list)
# print(proxy)
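# Free proxies like those above go stale quickly. A liveness-check sketch (an
# assumption, not part of the original notebook); https://httpbin.org/ip is just
# a convenient echo endpoint, any stable URL works.
def proxy_alive(proxy, timeout=5):
    try:
        requests.get('https://httpbin.org/ip', proxies=proxy, timeout=timeout)
        return True
    except requests.RequestException:
        return False

# usable_proxies = [p for p in proxy_list if proxy_alive(p)]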
def spider(q_id, headers):  # , proxy):
    results = pd.DataFrame()
    offset = 0
    while True:
        try:
            time.sleep(1)
            # The URL template hard-codes limit=20, so only q_id and offset are filled in.
            result, is_end = parse_page(url.format(q_id, offset), headers)  # , proxy)
            results = pd.concat([results, result])
            print('parsed offset:', offset)
            if is_end:
                # The last page still carries answers, so concatenate before breaking.
                break
            offset += 20
            time.sleep(random.random())  # small random delay between pages
        except Exception as e:
            print('Error:', e)
            break  # stop instead of retrying the same offset forever
    return results

question_id = 275359100
final_result = spider(question_id, headers)  # , proxy)
final_result.to_csv('data.csv', index=False, encoding='utf-8-sig')
# final_result.to_excel('out.xlsx', sheet_name='sheetname', index=False)