# -*- coding: utf-8 -*-
import json
import os

import scrapy
from scrapy import Request, FormRequest
from scrapy.selector import Selector
from scrapy.shell import inspect_response

from tutorial.items import ZhihuItem


class ZhihuSpider(scrapy.Spider):
    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    # Directory of this file; cookies.txt is expected to sit next to the spider.
    pwd = os.path.split(os.path.realpath(__file__))[0]
    cookiesfilename = os.path.join(pwd, "cookies.txt")
    start_urls = [
        "https://www.zhihu.com"
    ]
    def start_requests(self):
        # Log in by replaying cookies exported from a browser session instead
        # of submitting the login form.
        cookies = self.load_cookies_from_mozilla(self.cookiesfilename)
        self.logger.info(cookies)
        for url in self.start_urls:
            yield Request(
                url,
                cookies=cookies,
                callback=self.after_login)
    def after_login(self, response):
        # A link matching /people/* on the homepage means the cookies logged us in.
        if not response.xpath('//a[re:test(@href, "/people/")]'):
            self.logger.info("Login failed")
            inspect_response(response, self)
            return None
        self.logger.info("Login succeeded")
        # inspect_response(response, self)
        _xsrf = response.xpath('//*[@name="_xsrf"]/@value').extract_first()
        headers = response.request.headers
        headers["X-Xsrftoken"] = _xsrf
        # Page through the followers of topic 19552832 via its AJAX endpoint.
        return FormRequest(
            url="https://www.zhihu.com/topic/19552832/followers",
            headers=headers,
            formdata={"offset": "0"},
            meta={"offset": 0, "headers": headers},
            callback=self.get_followers)
    def get_followers(self, response):
        # The endpoint returns JSON whose "msg" field is a pair: msg[0] is the
        # number of followers in this batch, msg[-1] an HTML fragment listing them.
        msg = json.loads(response.text)["msg"]
        offset = response.meta["offset"] + 20
        if msg[0] == 0:
            return
        sel = Selector(text=msg[-1])
        for i in sel.xpath('//a[@class="zg-link author-link"]'):
            name = i.xpath('text()').extract_first()
            href = i.xpath('@href').extract_first()
            yield Request(
                url="https://www.zhihu.com" + href,
                meta={"name": name, "href": href},
                callback=self.get_about
            )
        # Request the next batch; "start" is the member id of the last entry on this page.
        mi_ids = sel.xpath('//*[@class="zm-person-item"]/@id').extract()
        yield FormRequest(
            url="https://www.zhihu.com/topic/19552832/followers",
            headers=response.meta["headers"],
            formdata={"offset": str(offset), "start": mi_ids[-1].split('-')[-1]},
            meta={"offset": offset, "headers": response.meta["headers"]},
            callback=self.get_followers)
    def get_about(self, response):
        # Extract profile fields into a ZhihuItem (defined in tutorial/items.py;
        # a sketch of the item is given after the spider).
        sel = Selector(text=response.text)
        item = ZhihuItem()
        item["name"] = response.meta["name"]
        item["href"] = response.meta["href"]
        item["location"] = sel.xpath('//*[@class="location item"]/@title').extract_first()
        item["business"] = sel.xpath('//*[@class="business item"]/@title').extract_first()
        if sel.xpath('//*[@class="icon icon-profile-male"]'):
            item["gender"] = "male"
        elif sel.xpath('//*[@class="icon icon-profile-female"]'):
            item["gender"] = "female"
        else:
            item["gender"] = "unknown"
        item["employment"] = sel.xpath('//*[@class="employment item"]/@title').extract_first()
        item["position"] = sel.xpath('//*[@class="position item"]/@title').extract_first()
        item["education"] = sel.xpath('//*[@class="education item"]/@title').extract_first()
        item["major"] = sel.xpath('//*[@class="education-extra item"]/@title').extract_first()
        yield item
    def load_cookies_from_mozilla(self, filename):
        """Read cookies from a Netscape/Mozilla-format cookies.txt file.

        Each non-comment line holds tab-separated fields whose last two
        entries are the cookie name and value (example line below the spider).
        """
        cookies = []
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip() or line.startswith("#"):
                    continue  # skip blank lines and the "# Netscape HTTP Cookie File" header
                cookies_list = line.split()
                cookies.append(dict(
                    name=cookies_list[-2],
                    value=cookies_list[-1],
                ))
        return cookies
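
The cookies.txt file is assumed to be in the Netscape/Mozilla export format, where each line carries seven tab-separated fields: domain, include-subdomains flag, path, secure flag, expiry timestamp, name, and value. The small sketch below (with a made-up cookie value) shows how load_cookies_from_mozilla() picks out the last two fields; only the structure matters, not the example data.

# Hypothetical cookies.txt line in Netscape format (values are placeholders):
sample_line = ".zhihu.com\tTRUE\t/\tFALSE\t1500000000\tz_c0\tEXAMPLEVALUE"
fields = sample_line.split()
print(dict(name=fields[-2], value=fields[-1]))
# {'name': 'z_c0', 'value': 'EXAMPLEVALUE'}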
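
The spider imports ZhihuItem from tutorial.items, which is not shown in the listing. A minimal item definition consistent with the fields populated in get_about might look like the sketch below; the field names are taken from the spider, and anything else about the real tutorial/items.py is an assumption.

# tutorial/items.py -- minimal sketch matching the fields used above
import scrapy

class ZhihuItem(scrapy.Item):
    name = scrapy.Field()
    href = scrapy.Field()
    location = scrapy.Field()
    business = scrapy.Field()
    gender = scrapy.Field()
    employment = scrapy.Field()
    position = scrapy.Field()
    education = scrapy.Field()
    major = scrapy.Field()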