爬取知乎问题

作者: dingtom | 来源:发表于2020-07-08 18:40 被阅读0次

爬取知乎问题
爬取知乎奇葩问题
Python爬虫新手教程：知乎文章图片爬取器!
Python爬虫新手教程：知乎文章图片爬取器
Python爬虫新手教程：知乎文章图片爬取器
Python爬虫入门教程第二十二讲：知乎文章图片爬取器之二
Python爬虫入门教程：知乎文章图片爬取器
爬取知乎专栏
Python 实战项目
2017.07.20

完整代码（Jupyter Notebook）：https://github.com/dingtom/python/blob/master/%E7%88%AC%E5%8F%96%E7%9F%A5%E4%B9%8E%E9%97%AE%E9%A2%98.ipynb

# Dependencies used throughout this script. They were never imported in the
# original listing, so every use of these names raised NameError.
import json
import random
import time

import pandas as pd
import requests
from fake_useragent import UserAgent

# Notes on the Zhihu API v4: https://www.jianshu.com/p/86139ab70b86
#
# Query parameters of the answers endpoint:
#   limit:   page size, at most 20
#   offset:  start position, counted from zero
#   sort_by: {default, created} - default ordering or ordering by time
#   include: extra fields to return, including is_normal, is_sticky,
#            collapsed_by, suggest_edit, comment_count, collapsed_counts,
#            reviewing_comments_count, can_comment, content, editable_content,
#            voteup_count, reshipment_settings, comment_permission, mark_infos,
#            created_time, updated_time, relationship.is_author, voting,
#            is_thanked, is_nothelp, upvoted_followees; author.is_blocking,
#            is_blocked, is_followed, voteup_count, message_thread_token,
#            badge[?(type=best_answerer)].topics.
#
# URL template. The two `{}` placeholders are, in order, the question id and
# the offset; the page size is fixed at limit=20 inside the template.
# Example for question 275359100:
#   https://www.zhihu.com/api/v4/questions/275359100/answers?include=...&offset=0&limit=20&sort_by=default&platform=desktop
url = 'https://www.zhihu.com/api/v4/questions/{}/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset={}&limit=20&sort_by=default&platform=desktop'

# Source of randomised User-Agent strings.
# NOTE(review): `verify_ssl` is presumably only accepted by older
# fake_useragent releases - confirm against the installed version.
ua = UserAgent(verify_ssl=False)

# Request headers shared by every API call; `ua.random` is evaluated once here,
# so the same User-Agent string is reused for the whole run.
headers = {'User-Agent': ua.random
           # Static alternative:
           # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
          }
def parse_page(url, headers, timeout=10):
    """Fetch one page of answers from the Zhihu v4 API and tabulate it.

    Args:
        url: Fully formatted answers-endpoint URL (question id and offset
            already filled in).
        headers: HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``(frame, is_end)`` tuple. ``frame`` is a ``pandas.DataFrame`` with
        one row per answer (indexed 0..n-1) and the columns 用户名, 性别, 签名,
        被关注人数, 创建时间, 更新时间, 评论数, 点赞数, 内容. ``is_end`` is the
        ``paging.is_end`` flag of the response.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the payload lacks ``data``/``paging`` or an answer lacks
            one of the fields read below.
    """
    # Proxy support (``proxies=proxy``) was disabled by the original author.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail with the real HTTP error instead of a later, misleading KeyError.
    response.raise_for_status()
    payload = json.loads(response.text)

    # Per the original author's notes, the payload has the top-level keys
    # 'ad_info', 'data' and 'paging', and each answer in 'data' also carries
    # fields not exported here (e.g. author id / url_token / user_type,
    # question title, can_comment). Only the fields that end up in the table
    # are read, so a missing optional field cannot abort the page.
    columns = ['用户名', '性别', '签名', '被关注人数', '创建时间',
               '更新时间', '评论数', '点赞数', '内容']
    rows = []
    for answer in payload['data']:
        author = answer['author']
        rows.append({
            '用户名': author['name'],                # display name
            '性别': author['gender'],                # gender code
            '签名': author['headline'],              # profile headline
            '被关注人数': author['follower_count'],  # follower count
            '创建时间': answer['created_time'],      # creation time
            '更新时间': answer['updated_time'],      # last update time
            '评论数': answer['comment_count'],       # number of comments
            '点赞数': answer['voteup_count'],        # number of upvotes
            '内容': answer['content'],               # body, still needs cleaning
        })

    # Build the frame once instead of concatenating one-row frames in a loop;
    # passing `columns` keeps the schema stable even for an empty page.
    result = pd.DataFrame(rows, columns=columns)
    return result, payload['paging']['is_end']
# # 不同的代理IP,代理ip的类型必须和请求url的协议头保持一致
# proxy_list = [
#     {"https": '112.115.57.20:3128'},        
#     {'https': '121.41.171.223:3128'},
#     {"https": '223.242.224.226:9999'},
#    # {"https": '183.166.70.169:9999'},
#     {"https": '115.221.242.206:9999'},  
#     #{"https": '163.204.245.77:9999'},
#     {"https": '113.195.18.226:9999'},
#     {"https": '183.166.97.125:9999'},
#     {"https": '182.34.32.91:9999'},
#     {"https": '182.34.33.167:9999'},
#     {"https": '182.34.34.116:9999'},
#     {"https": '113.195.19.47:9999'},
#     {"https": '110.243.24.126:9999'},
#     {"https": '58.220.95.40:10174'},
#     {"https": '113.124.94.252:9999'},
#     {"https": '182.34.37.228:9999'},
#     {"https": '163.204.240.55:9999'},
#     {"https": '183.166.111.38:9999'},
#     {"https": '183.166.133.225:9999'},
#     {"https": '117.69.151.7:9999'},
#     {"https": '182.149.82.27:9999'},
#     {"https": '183.166.139.207:9999'},  
# ]

# # 随机获取代理IP
# proxy = random.choice(proxy_list)
# print(proxy)
def spider(q_id, headers, max_retries=3):
    """Download every answer of a Zhihu question, page by page.

    Pages are requested through ``parse_page`` with the offset advancing by 20
    (the page size fixed in the module-level ``url`` template) until the API
    reports the last page.

    Args:
        q_id: Numeric id of the question to crawl.
        headers: HTTP headers forwarded to ``parse_page``.
        max_retries: How many times a failing page is retried before the crawl
            stops. Previously a persistent error retried the same offset
            forever.

    Returns:
        A ``pandas.DataFrame`` holding the rows of all pages fetched so far,
        with a fresh 0..n-1 index. If a page keeps failing, the rows collected
        before it are still returned (best effort); an empty frame is returned
        when nothing could be fetched.
    """
    page_size = 20
    pages = []
    offset = 0
    failures = 0
    while True:
        time.sleep(1)  # fixed pause before every request to limit the rate
        try:
            # The template currently has two placeholders, so str.format
            # ignores the extra page_size argument; it is passed so a template
            # with a `limit={}` placeholder works too.
            page, is_end = parse_page(url.format(q_id, offset, page_size), headers)
        except Exception as e:
            failures += 1
            print('Error', e)
            if failures > max_retries:
                print('giving up at offset:', offset)
                break
            continue

        failures = 0
        # Keep the page BEFORE checking is_end: the last page still carries
        # answers, which the original loop discarded.
        if not page.empty:
            pages.append(page)
        print('i had parsed:', offset)
        if is_end:
            break
        offset += page_size
        time.sleep(random.random())  # extra random jitter between pages

    if not pages:
        return pd.DataFrame()
    return pd.concat(pages, ignore_index=True)


# Script entry: crawl one question and dump the table to CSV. This runs after
# the definition above; the original called spider() before defining it.
question_id = 275359100
final_result = spider(question_id, headers)

final_result.to_csv("data.csv")
# Excel alternative left by the author:
# pdata.to_excel('out.xlsx',sheet_name="sheetname",index=False)