1.拿到第一个用户名、链接以及include参数(user_url)
# Seed account for the crawl: its url_token is substituted into the URL
# templates by start_requests.
start_user = 'gui-mu-zhi'

# Profile endpoint template; {user} and {include} are filled in via str.format.
user_url = (
    'https://www.zhihu.com/api/v4/members/{user}'
    '?include={include}'
)

# Value substituted for {include} on profile requests, assembled from the
# individual field names so each one is easy to see, add, or remove.
user_query = ','.join((
    'allow_message',
    'is_followed',
    'is_following',
    'is_org',
    'is_blocking',
    'employments',
    'answer_count',
    'follower_count',
    'articles_count',
    'gender',
    'badge[?(type=best_answerer)].topics',
))
2.拿到粉丝链接以及include参数
# Followers-list endpoint template; {user}, {include}, {offset} and {limit}
# are filled in via str.format.
# Fix: the template previously read "offset={offset}t", which produced
# "offset=0t" in every generated first-page URL. The stray "t" is removed so
# the offset is sent as a plain integer.
follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'

# Value substituted for {include} on followers-list requests.
follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
3.修改初始请求,返回用户和粉丝的请求,通过format方法将参数传递进去,分别回调解析用户函数和解析粉丝函数
def start_requests(self):
    """Yield the two seed requests for the crawl.

    The first request fetches the profile of ``self.start_user`` and is
    handled by ``parse_user``; the second fetches the first page of that
    user's followers (offset 0, limit 20) and is handled by ``parse_follow``.
    """
    profile_link = self.user_url.format(
        user=self.start_user,
        include=self.user_query,
    )
    yield Request(profile_link, self.parse_user)

    followers_link = self.follows_url.format(
        user=self.start_user,
        include=self.follows_query,
        offset=0,
        limit=20,
    )
    yield Request(followers_link, self.parse_follow)
4.解析用户,获取内容
def parse_user(self, response):
    """Turn a profile response into an item and queue that user's followers.

    The response body is decoded as JSON. Every field declared on
    ``ZhihuItem`` that is also a key of the decoded payload is copied into a
    new item, which is yielded once. A request for the first page of the
    user's followers (offset 0, limit 20) is then yielded, with
    ``parse_follow`` as its callback.
    """
    payload = json.loads(response.text)

    item = ZhihuItem()
    for field_name in item.fields:
        # Only copy fields the payload actually contains.
        if field_name not in payload.keys():
            continue
        item[field_name] = payload.get(field_name)
    yield item

    # Continue the crawl through this user's own followers, identified by
    # the url_token found in the profile payload.
    followers_link = self.follows_url.format(
        user=payload.get('url_token'),
        include=self.follows_query,
        offset=0,
        limit=20,
    )
    yield Request(followers_link, self.parse_follow)
5.解析粉丝列表
def parse_follow(self, response):
    """Handle one page of a followers list.

    The response body is decoded as JSON. For each entry under ``'data'`` a
    profile request is yielded (callback ``parse_user``). If the payload has
    a ``'paging'`` entry whose ``'is_end'`` equals ``False``, a request for
    the ``'next'`` link is yielded, with this method as its callback.
    """
    payload = json.loads(response.text)

    if 'data' in payload.keys():
        for follower in payload.get('data'):
            profile_link = self.user_url.format(
                user=follower.get('url_token'),
                include=self.user_query,
            )
            yield Request(profile_link, callback=self.parse_user)

    # No paging information: nothing more to request.
    if 'paging' not in payload.keys():
        return

    paging = payload.get('paging')
    # Kept as an equality test on purpose: a missing 'is_end' gives None,
    # which does not equal False, so no further page is requested.
    if paging.get('is_end') == False:  # noqa: E712
        next_page = paging.get('next')
        yield Request(
            next_page,
            self.parse_follow,
        )
注:使用的是scrapy框架
网友评论