美文网首页
Scrapy to zhihu

Scrapy to zhihu

作者: 李二狗2000 | 来源:发表于2016-09-17 16:14 被阅读0次
    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy import Request, FormRequest
    from scrapy.selector import Selector
    import json
    import os
    from tutorial.items import ZhihuItem
    
    
    class ZhihuSpider(scrapy.Spider):
        """Crawl the follower profiles of a Zhihu topic.

        Logs in by replaying cookies exported in Netscape/Mozilla
        cookies.txt format (stored next to this module), then pages
        through the followers of topic 19552832 and scrapes each
        follower's profile ("about") page into a ZhihuItem.
        """

        name = "zhihu"
        allowed_domains = ["zhihu.com"]
        # Directory of this spider module; cookies.txt is expected beside it.
        pwd = os.path.split(os.path.realpath(__file__))[0]
        cookiesfilename = pwd + os.path.sep + "cookies.txt"
        start_urls = [
            "https://www.zhihu.com"
        ]

        def start_requests(self):
            """Open the home page with the saved login cookies attached."""
            cookies = self.load_cookies_from_mozilla(self.cookiesfilename)
            self.logger.info(cookies)
            for url in self.start_urls:
                yield Request(
                    url,
                    cookies=cookies,
                    callback=self.after_login)

        def after_login(self, response):
            """Check that the cookies produced a logged-in session, then
            request the first page of topic followers.

            A "/people/..." link on the home page only appears for a
            logged-in user.
            """
            if not response.xpath('//a[re:test(@href, "/people/")]'):
                self.logger.info("登录失败")
                # BUG FIX: ``import scrapy`` does not import the
                # ``scrapy.shell`` submodule, so the original
                # ``scrapy.shell.inspect_response(...)`` raised
                # AttributeError; import the function explicitly.
                from scrapy.shell import inspect_response
                inspect_response(response, self)
                return None
            self.logger.info("登录成功")
            # The anti-CSRF token must be echoed back in the X-Xsrftoken
            # header of every subsequent POST.
            # BUG FIX: '///*' is not valid XPath; use '//*'.
            _xsrf = response.xpath('//*[@name="_xsrf"]/@value').extract_first()
            headers = response.request.headers
            headers["X-Xsrftoken"] = _xsrf
            return FormRequest(
                url="https://www.zhihu.com/topic/19552832/followers",
                headers=headers,
                formdata={"offset": "0"},
                meta={"offset": 0, "headers": headers},
                callback=self.get_followers)

        def get_followers(self, response):
            """Parse one JSON page of topic followers.

            Yields one Request per follower profile plus a FormRequest
            for the next page of followers.
            """
            msg = json.loads(response.body_as_unicode())['msg']
            offset = response.meta["offset"] + 20
            # msg[0] == 0 signals that there are no more followers.
            if msg[0] == 0:
                return
            sel = Selector(text=msg[-1])
            for link in sel.xpath('//a[@class="zg-link author-link"]'):
                name = link.xpath('text()').extract_first()
                href = link.xpath("@href").extract_first()
                yield Request(
                    url="https://www.zhihu.com" + href,
                    meta={"name": name, "href": href},
                    callback=self.get_about
                )
            # BUG FIX: '///*' -> '//*' (invalid XPath in the original).
            mi_ids = sel.xpath('//*[@class="zm-person-item"]/@id').extract()
            # ROBUSTNESS: without item ids we cannot build the "start"
            # pagination token; stop instead of raising IndexError.
            if not mi_ids:
                return
            yield FormRequest(
                url="https://www.zhihu.com/topic/19552832/followers",
                headers=response.meta["headers"],
                formdata={"offset": str(offset), "start": mi_ids[-1].split('-')[-1]},
                meta={"offset": offset, "headers": response.meta["headers"]},
                callback=self.get_followers)

        def get_about(self, response):
            """Scrape one follower's profile page into a ZhihuItem."""
            sel = Selector(text=response.body_as_unicode())
            item = ZhihuItem()
            item["name"] = response.meta["name"]
            item["href"] = response.meta["href"]
            # BUG FIX throughout this method: '///*' -> '//*' (invalid XPath).
            item["location"] = sel.xpath('//*[@class="location item"]/@title').extract_first()
            item["business"] = sel.xpath('//*[@class="business item"]/@title').extract_first()
            # Gender is encoded by an icon CSS class; absence of both
            # icons means it was not disclosed.
            if sel.xpath('//*[@class="icon icon-profile-male"]'):
                item["gender"] = "male"
            elif sel.xpath('//*[@class="icon icon-profile-female"]'):
                item["gender"] = "female"
            else:
                item["gender"] = "unknown"
            item["employment"] = sel.xpath('//*[@class="employment item"]/@title').extract_first()
            item["position"] = sel.xpath('//*[@class="position item"]/@title').extract_first()
            item["education"] = sel.xpath('//*[@class="education item"]/@title').extract_first()
            item["major"] = sel.xpath('//*[@class="education-extra item"]/@title').extract_first()

            yield item

        def load_cookies_from_mozilla(self, filename):
            """Parse a Netscape/Mozilla cookies.txt file.

            Returns a list of ``{"name": ..., "value": ...}`` dicts
            suitable for ``Request(cookies=...)``.

            BUG FIX: the original parsed every line, so blank lines
            raised IndexError on ``[-2]`` and the '#' comment lines that
            the Netscape format mandates produced garbage cookies; both
            are now skipped.
            """
            cookies = []
            with open(filename, "r", encoding="utf-8") as f:
                for raw in f:
                    line = raw.strip()
                    # Skip blanks and comments. '#HttpOnly_' is a real
                    # cookie-line prefix some exporters emit, so keep it.
                    if not line or (line.startswith("#")
                                    and not line.startswith("#HttpOnly_")):
                        continue
                    fields = line.split()
                    if len(fields) < 2:
                        continue
                    # Netscape format: name and value are the last two
                    # tab-separated columns.
                    cookies.append(dict(
                        name=fields[-2],
                        value=fields[-1],
                    ))
            return cookies
    

    相关文章

      网友评论

          本文标题:Scrapy to zhihu

          本文链接:https://www.haomeiwen.com/subject/qzqwettx.html