美文网首页Python爬虫作业
Python爬虫作业 | 爬取简书用户思路分析及代码

Python爬虫作业 | 爬取简书用户思路分析及代码

作者: JaeGwen | 来源:发表于2017-05-11 23:56 被阅读902次
    简书大学堂
    最近几个月除了工作需要打开的网页之外点开最多的就是简书

    再加上最近对爬虫的学习也一直在进行中,于是在爬完拉勾职位信息之后,便将目光锁定在简书(jianshu)上了,当然这也是实战作业的一部分

    初步的思路

    今天在用Scrapy写代码的时候,对网页的结构也有了大致的分析,再加上之前罗罗攀的思路,我初步的想法是以专题作为入口:

    • 热门专题
    image.png image.png
    • 专题管理员 (一般粉丝、文章、字数、收获喜欢、这几项数据都非常漂亮)
    image.png image.png

    以上红框里的数据项就是我需要爬取的字段

    但是以上的思路存在一点问题:

    有些简书用户并不是热门专题的管理员,但是其人气和粉丝量也很高,这个思路可能无法将这些用户爬取下来

    进阶的思路

    • 热门专题
    • 专题关注的人
    • 专题关注的人的动态
    • **推荐作者粉丝信息**
    image.png image.png image.png

    优点:

    数据大而全,基本包含了99%的用户(个人猜测,不严谨)

    缺点:

    因为许多用户不止关注一个专题,而且其中包含了大量的新注册用户(数据很多为空),并且也有大量重复数据需要去重

    代码部分:

    jianshu.py 还在调试阶段,待更新...

    # -*- coding: utf-8 -*-
    
    import sys
    import json
    import requests
    import scrapy
    import re
    from lxml import etree
    from scrapy.http import Request
    
    # Python 2 only: reload(sys) brings back sys.setdefaultencoding (removed
    # from the module at interpreter start-up) so that implicit str/unicode
    # conversions use UTF-8 instead of ASCII.  Python 3 has no reload() builtin
    # and already defaults to UTF-8, so the hack is skipped there instead of
    # failing with NameError at import time.
    if sys.version_info[0] < 3:
        reload(sys)
        sys.setdefaultencoding('utf-8')
    # Make modules that live in the parent directory importable.
    sys.path.append('..')
    
    class jianshu(scrapy.Spider):
        """Scrapy spider that collects statistics about jianshu.com users.

        The class attributes below are shared by every method: the spider
        name, the listing categories to walk, the listing URL template and
        the cookies/headers sent with each HTTP request.
        """
        name = 'jianshu'
        # Values substituted for the ``order_by`` query parameter of ``base_url``.
        topic_category = ['recommend', 'hot', 'city']
        # Paged collection listing; formatted with ``(page, order_by)``.
        base_url = 'http://www.jianshu.com/recommendations/collections?page=%s&order_by=%s'
        # SECURITY NOTE(review): these are session credentials of a real account
        # committed to source.  They should be loaded from configuration or the
        # environment instead, and the leaked session should be revoked.
        cookies = {
            'UM_distinctid': '15b89d53a930-02ab95f11ccae2-51462d15-1aeaa0-15b89d53a9489b',
            'CNZZDATA1258679142': '1544557204-1492664886-%7C1493280769',
            '_session_id': 'Q3RteU9BeTA3UVh1bHp1d24ydmZJaGdkRDZJblE3SWg3dTlNR2J1WmJ5NS9HNlpOZVg4ZUk0TnNObE5wYXc3SjhYcU5WR0NKZ3RhcE9veFVDU2RNWkpqNE44MWxuVmtoR1ZDVXBFQ29Kc1kzZmd4SVNZakJyWVN4c1RFQXZNTFhmUUtxemVDVWlVU1l3VW92NFpTeEE2Q0ppUVN0QVFEMUpLZjFHdHViR21zZko2b1lFTW9DR08yNDh5Z0pvd0VJRzc4aFBqRnZYbGt6QXlmSzMxdU1QTVFwUVcxdUViaElqZzh2Y1RwcENtSWxWbW5PMUVGZ2UrZ2xVcm1NTlpMK2x2UTdOWlZjUVNPK1dCTERpMnd6U3ZxbXlROENML2VseTRHUTBqbFE1ZUlqN1FqazJJK0tsV1htdEt1bnl5MkhCbHNJTmh1ejFLTW9pYVcrVmx0bit1blNXV1VCQ3JNbHAvK1Z5T1ZvUk5IMVMzR1dUNHBlWFZBamcwYjQxSzBjZVRvMGRZSDRmV0xtTGZHekF1M3V6dGcwMHhpQ24zdmVKelV5eDRFSWZ4QT0tLW1uSXNLakp6SW54SUo0QU16a2dFSkE9PQ%3D%3D--0849c37208f8c573960d857029c7d6a15145c419',
            'remember_user_token': 'W1szNDgxMjU3XSwiJDJhJDEwJDlSS3VLcFFWMlZzNFJuOFFNS1JQR3UiLCIxNDk0MjEzNDQ3LjYwODEwNzgiXQ%3D%3D--9241542a4e44d55acaf8736a1d57dd0e96ad4e7a',
            '_ga': 'GA1.2.2016948485.1492666105',
            '_gid': 'GA1.2.382495.1494550475',
            'Hm_lpvt_0c0e9d9b1e7d617b3e6842e85b9fb068': '1494550475',
            'Hm_lvt_0c0e9d9b1e7d617b3e6842e85b9fb068': '1494213432,1494213612,1494321303,1494387194',
        }
        # NOTE(review): the 'Cookie' header below repeats the cookies above but
        # carries a different ``_session_id`` value; confirm which one is meant
        # to be sent and drop the other.
        headers = {
            'Accept-Encoding': 'gzip, deflate, sdch',
            # Was 'Accept - Language': 'zh - CN, zh;q = 0.8' -- the stray spaces
            # made both the header name and its value invalid.
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'close',
            'Cookie': 'UM_distinctid=15b89d53a930-02ab95f11ccae2-51462d15-1aeaa0-15b89d53a9489b; CNZZDATA1258679142=1544557204-1492664886-%7C1493280769; remember_user_token=W1szNDgxMjU3XSwiJDJhJDEwJDlSS3VLcFFWMlZzNFJuOFFNS1JQR3UiLCIxNDk0MjEzNDQ3LjYwODEwNzgiXQ%3D%3D--9241542a4e44d55acaf8736a1d57dd0e96ad4e7a; _ga=GA1.2.2016948485.1492666105; _gid=GA1.2.824702661.1494486429; _gat=1; Hm_lvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1494213432,1494213612,1494321303,1494387194; Hm_lpvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1494486429; _session_id=czl6dzVOeXdYaEplRVdndGxWWHQzdVBGTll6TVg5ZXFDTTI5cmN2RUsvS2Y2d3l6YlkrazZkZWdVcmZDSjFuM2tpMHpFVHRTcnRUVnAyeXhRSnU5UEdhaGMrNGgyMTRkeEJYOE9ydmZ4N1prN1NyekFibkQ5K0VrT3paUWE1bnlOdzJrRHRrM0Z2N3d3d3hCcFRhTWdWU0lLVGpWWjNRdjArZkx1V2J0bGJHRjZ1RVBvV25TYnBQZmhiYzNzOXE3VWNBc25YSS93WUdsTEJFSHVIck4wbVI5aWJrUXFaMkJYdW41WktJUDl6OVNqZ2k0NWpGL2dhSWx0S2FpNzhHcFZvNGdQY012QlducWgxNVhoUEN0dUpCeUI4bEd3OXhiMEE2WEplRmtaYlR6VTdlZXFsaFFZMU56M2xXcWwwbmlZeWhVb0dXKzhxdEtJaFZKaUxoZVpUZEZPSnBGWmF3anFJaFZpTU9Icm4wcllqUFhWSzFpYWF4bTZmSEZ1QXdwRWs3SHNEYmNZelA4VG5zK0wvR0MwZDdodlhZakZ6OWRVbUFmaE5JMTIwOD0tLXVyVEVSeVdOLy9Cak9nVG0zV0hueVE9PQ%3D%3D--ea401e8c501e7b749d593e1627dbaa88ab4befc2',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
            'Host': 'www.jianshu.com',
            'X-Requested-With': 'XMLHttpRequest',
        }
    
        def get_total_page(self):
            """Probe every listing category to find out how many pages it has.

            For each entry of ``self.topic_category`` the listing pages are
            requested in ascending order, starting at page 1, until a page shows
            no topic title.  The page before that one is recorded as the
            category's page count.  At most 99 pages are probed per category; if
            all of them have content, 99 is recorded.

            Returns:
                list of dict: one ``{'total_page': int, 'category': str}`` entry
                per category, in ``self.topic_category`` order.
            """
            max_probe_page = 100  # pages 1 .. max_probe_page - 1 are probed
            title_xpath = '//*[@id="list-container"]/div[1]/div/h4/a/text()'
            total_page_list = []
            for order in self.topic_category:
                # Until an empty page is seen, assume every probed page has content.
                last_page = max_probe_page - 1
                for page in range(1, max_probe_page):
                    url = self.base_url % (page, order)
                    html = requests.get(url, headers=self.headers).content
                    try:
                        has_topics = bool(etree.HTML(html).xpath(title_xpath))
                    except (AttributeError, etree.LxmlError):
                        # Nothing parsable came back (no root element or a
                        # parser error): treat it like a page without topics.
                        has_topics = False
                    if not has_topics:
                        last_page = page - 1
                        break
                # Appending inside the category loop records every category;
                # previously only the last one reached the result list.
                total_page_list.append({'total_page': last_page, 'category': order})
            return total_page_list
    
        def get_topic_info(self):
            """Collect name, URL and id of every topic in the listing pages.

            Walks every page reported by ``get_total_page`` and reads the child
            nodes of the ``list-container`` element.  Nodes that lack a title,
            a link or a cover image of the expected form are skipped instead of
            aborting the whole crawl.

            Returns:
                list of dict: entries with the keys ``'topic_name'``,
                ``'topic_url'`` and ``'img_num'``.  ``img_num`` is the sixth
                ``/``-separated segment of the cover image URL;
                ``get_topic_admin_info`` uses it as the collection id.
            """
            topic_info_list = []
            for page_info in self.get_total_page():
                # Tolerate an entry for which no page count could be determined.
                if 'category' not in page_info or 'total_page' not in page_info:
                    continue
                category = page_info['category']
                total_page = int(page_info['total_page'])
                for page in range(1, total_page + 1):
                    url = self.base_url % (page, category)
                    html = requests.get(url, headers=self.headers, cookies=self.cookies).content
                    selector = etree.HTML(html)
                    if selector is None:
                        continue
                    containers = selector.xpath('//*[@id="list-container"]')
                    if not containers:
                        continue
                    for node in containers[0]:
                        names = node.xpath('./div/h4/a/text()')
                        hrefs = node.xpath('./div/h4/a/@href')
                        img_srcs = node.xpath('./div/a/img/@src')
                        if not (names and hrefs and img_srcs):
                            continue
                        src_parts = img_srcs[0].split("/")
                        if len(src_parts) < 6:
                            continue
                        topic_info_list.append({
                            'topic_name': names[0],
                            # Include the scheme so the value is a usable URL,
                            # like every other URL built in this spider.
                            'topic_url': "http://www.jianshu.com" + hrefs[0],
                            'img_num': src_parts[5],
                        })
            return topic_info_list
    
        def get_topic_admin_info(self):
            """Collect nickname and slug of the editors of every topic.

            For each topic returned by ``get_topic_info`` the
            ``editors_and_subscribers`` endpoint supplies the first page of
            editors together with ``editors_total_pages``; the remaining pages
            are read from the paged ``editors`` endpoint.  A failure on one of
            those follow-up pages is logged and that page is skipped.

            Returns:
                list of dict: ``{'nickname': ..., 'slug': ...}`` per editor.
            """
            def to_records(editors):
                # Keep only the two fields the rest of the spider uses.
                return [{'nickname': editor['nickname'], 'slug': editor['slug']}
                        for editor in editors]

            topic_admin_info_list = []
            for topic in self.get_topic_info():
                img_num = str(topic['img_num'])
                first_url = "http://www.jianshu.com/collections/%s/editors_and_subscribers" % img_num
                first_response = requests.get(first_url, headers=self.headers, cookies=self.cookies)
                # Response.text is already decoded text; calling .decode() on it
                # is redundant on Python 2 and an AttributeError on Python 3.
                first_payload = json.loads(first_response.text)
                editors_total_pages = int(first_payload['editors_total_pages'])
                for page in range(1, editors_total_pages + 1):
                    if page == 1:
                        topic_admin_info_list.extend(to_records(first_payload['editors']))
                        continue
                    page_url = "http://www.jianshu.com/collections/{}/editors?page={}".format(img_num, page)
                    try:
                        response = requests.get(page_url, headers=self.headers, cookies=self.cookies)
                        page_records = to_records(json.loads(response.text)['editors'])
                    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
                        # Best effort: one bad page must not abort the crawl,
                        # but it should not disappear silently either.
                        self.logger.warning("Skipping editors page %s: %s", page_url, exc)
                        continue
                    topic_admin_info_list.extend(page_records)
            return topic_admin_info_list
    
        def get_followers_following_list(self):
            """Collect the profile links of the followers of every topic editor.

            For each editor from ``get_topic_admin_info`` the followers page is
            fetched once to read the counters on it; the second ``<p>N</p>``
            counter is taken as the follower count.  The paged followers list is
            then walked at 9 followers per page.

            Returns:
                list: ``href`` values of the follower links, duplicates included.
            """
            followers_list = []
            followers_base_url = "http://www.jianshu.com/users/%s/followers"
            for admin in self.get_topic_admin_info():
                url = followers_base_url % admin['slug']
                # Work on a copy: self.headers is a class attribute shared by
                # every request, so adding per-request fields to it would leak.
                headers = dict(self.headers)
                headers['Referer'] = url
                headers['DNT'] = '1'
                # .text yields decoded text, which the str patterns below need
                # (matching a str pattern against bytes fails on Python 3).
                page_html = requests.get(url, headers=headers, cookies=self.cookies).text
                counters = re.findall(r'<p>(\d+)</p>', page_html)
                if len(counters) < 2:
                    # Unexpected layout (error page, deleted user, ...): skip.
                    continue
                total_followers = int(counters[1])
                # Ceiling division: no float on Python 3 and no extra empty
                # page when the count is an exact multiple of 9.
                total_page = (total_followers + 8) // 9
                for page in range(1, total_page + 1):
                    followers_url = url + "?page=" + str(page)
                    html = requests.get(followers_url, headers=headers, cookies=self.cookies).text
                    followers_list.extend(re.findall(r'class="name" href="(.+?)">.*</a>', html))
            return followers_list
    
        def get_recommand_editor(self):
            """Collect the slugs of the recommended authors (pages 1 to 9).

            Returns:
                list: the part after ``/users/`` of every matched author link.
            """
            # Copy the shared class-level headers before customising them so the
            # changes (notably 'Connection: keep-alive') do not leak into the
            # requests made by other methods.
            headers = dict(self.headers)
            headers["Referer"] = "http://www.jianshu.com/recommendations/users?utm_source=desktop&utm_medium=index-users"
            headers["Accept"] = "text/html, */*; q=0.01"
            headers["Connection"] = "keep-alive"
            # The attribute was misspelled "herf", which can never match real
            # markup.  NOTE(review): class="_blank" looks like it may have been
            # meant as target="_blank" -- confirm against the live page.
            link_pattern = r'class="_blank" href="/users/(.+?)">.*</a>'
            recommand_editors_list = []
            for page in range(1, 10):
                url = "http://www.jianshu.com/recommendations/users?page={}".format(page)
                # .text yields decoded text so the str pattern also works on
                # Python 3, where .content is bytes.
                html = requests.get(url, headers=headers, cookies=self.cookies).text
                recommand_editors_list.extend(re.findall(link_pattern, html))
            return recommand_editors_list
    
        def get_recommand_editor_followers(self):
            """Collect the profile links of the followers of recommended authors.

            For each slug from ``get_recommand_editor`` the author's profile
            page is fetched to read the counters on it; the second ``<p>N</p>``
            counter is taken as the follower count.  The paged followers list is
            then walked at 9 followers per page.

            Returns:
                list: ``href`` values of the follower links, duplicates included.
            """
            get_recommand_editor_followers_list = []
            for editor in self.get_recommand_editor():
                url = "http://www.jianshu.com/u/" + str(editor)
                followers_base_url = "http://www.jianshu.com/users/{}/followers".format(str(editor))
                # Work on a copy: self.headers is a class attribute shared by
                # every request, so adding per-request fields to it would leak.
                headers = dict(self.headers)
                headers['Referer'] = url
                headers['DNT'] = '1'
                # .text yields decoded text, which the str patterns below need
                # (matching a str pattern against bytes fails on Python 3).
                profile_html = requests.get(url, headers=headers, cookies=self.cookies).text
                counters = re.findall(r'<p>(\d+)</p>', profile_html)
                if len(counters) < 2:
                    # Unexpected layout (error page, deleted user, ...): skip.
                    continue
                total_followers = int(counters[1])
                # Ceiling division: no float on Python 3 and no extra empty
                # page when the count is an exact multiple of 9.
                total_page = (total_followers + 8) // 9
                for page in range(1, total_page + 1):
                    followers_url = followers_base_url + "?page=" + str(page)
                    html = requests.get(followers_url, headers=headers, cookies=self.cookies).text
                    get_recommand_editor_followers_list.extend(
                        re.findall(r'class="name" href="(.+?)">.*</a>', html))
            return get_recommand_editor_followers_list
    
        def is_redirect(self, url):
            """Return True when *url* answers with an HTTP redirect status.

            The request is sent with ``allow_redirects=False`` so the status of
            the first response is inspected rather than that of the final page.

            The previous implementation compared the integer status code with
            the string ``"200"``, which is never equal, so it returned False
            for every URL.  Non-redirect responses still yield False.
            """
            status_code = requests.get(url, allow_redirects=False).status_code
            return status_code in (301, 302, 303, 307, 308)
    
        def start_requests(self):
            """Yield one timeline request per distinct collected user.

            The follower links of topic editors and of recommended authors are
            merged and de-duplicated.  Each link is expected to look like
            ``/<prefix>/<slug>``; the slug (third ``/``-separated part) selects
            the user's timeline page, which is handled by ``parse``.  Empty or
            malformed links are skipped.
            """
            followers_lists = self.get_followers_following_list()
            editor_followers_list = self.get_recommand_editor_followers()
            # Editor followers + recommended-author followers, without duplicates.
            profile_paths = set(followers_lists + editor_followers_list)
            for path in profile_paths:
                if not path:
                    continue
                segments = path.split("/")
                if len(segments) < 3 or not segments[2]:
                    continue
                # Strip the leading slash so the Referer does not contain "//".
                profile_url = "http://www.jianshu.com/" + path.lstrip("/")
                timeline_url = "http://www.jianshu.com/users/%s/timeline" % segments[2]
                # Copy so the shared class-level headers are not modified.
                headers = dict(self.headers)
                headers['Referer'] = profile_url
                headers['Upgrade-Insecure-Requests'] = '1'
                headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
                yield Request(timeline_url, headers=headers, callback=self.parse)
    
    
        def parse(self, response):
            """Extract a user's name and the five counters from a timeline page.

            Yields:
                dict: one item with the user name and the first five
                ``<p>N</p>`` counters of the page, in page order, all as text.
                Nothing is yielded (a warning is logged) when the page does not
                contain a name link and at least five counters.
            """
            # response.text is decoded text; matching a str pattern against
            # the raw bytes in response.body fails on Python 3.
            data = response.text
            names = re.findall(r'class="name" href=".*">(.+?)</a>', data)
            counters = re.findall(r'<p>(\d+)</p>', data)
            if not names or len(counters) < 5:
                self.logger.warning("Unexpected page layout, skipping %s", response.url)
                return
            yield {
                'topic_admin_name': names[0],
                # NOTE(review): "gz" presumably abbreviates the "following"
                # counter -- confirm against the page.
                'topic_admin_gz': counters[0],
                'topic_admin_fans': counters[1],
                'topic_admin_essay_num': counters[2],
                'topic_admin_word_num': counters[3],
                'topic_admin_like': counters[4],
            }
    

    相关文章

      网友评论

        本文标题:Python爬虫作业 | 爬取简书用户思路分析及代码

        本文链接:https://www.haomeiwen.com/subject/yfvatxtx.html