python crawler: Zhihu user data

Author: Tim_Chen | Published 2016-09-27 12:17

    First, write a few helper functions and save them in a file named getZhihuInfo.py:

    import requests
    from bs4 import BeautifulSoup
    import json
    
    headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Cache-Control':'max-age=0',
        'Cookie':'',   # paste your own Zhihu cookie string here
        'Host': 'www.zhihu.com',
        'Referer': 'https://www.zhihu.com/people',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
    }
    headers_post = {
        'Accept':'*/*',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.8',
        'Connection':'keep-alive',
        # Content-Length is computed automatically by requests; no need to set it by hand
        'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer':'https://www.zhihu.com/people',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
        'Cookie':'',   # paste your own Zhihu cookie string here
        'Host':'www.zhihu.com',
        'Origin':'https://www.zhihu.com',
        'X-Requested-With':'XMLHttpRequest',
        'X-Xsrftoken':'82f9b2f5e3166156c04eeb491ac6f21e'   # replace with the _xsrf value from your own cookie
    }
    
    # Return the text of the first matched element, or 'Unknown' if the selection is empty
    def setValue(soupS):
        if soupS:
            return soupS[0].get_text()
        else:
            return 'Unknown'
    
    # Fetch the basic profile information for one user URL; returns a list
    def getBasicInfo(peopleUrl):
        wb_data = requests.get(peopleUrl,headers = headers)
        soup = BeautifulSoup(wb_data.text,'lxml')
    
        name = soup.select('div.title-section > span')[0].get_text()
    
        alocation = soup.select('span.location.item')
        abusiness = soup.select('span.business.item')
        agender = soup.select('span.item.gender > i')
        aemployment = soup.select('span.employment.item')
        aposition = soup.select('span.position.item')
        aeducation = soup.select('span.education.item')
        aeducation_extra = soup.select('span.education-extra.item')
    
        location = setValue(alocation)
        business = setValue(abusiness)
        # the gender icon's second CSS class is e.g. 'icon-profile-male';
        # slicing off the 13-character 'icon-profile-' prefix leaves the gender
        if agender:
            gender = agender[0].get('class')[1][13:]
        else:
            gender = 'Unknown'
        employment = setValue(aemployment)
        position = setValue(aposition)
        education = setValue(aeducation)
        education_extra = setValue(aeducation_extra)
    
        agree = soup.select('span.zm-profile-header-user-agree > strong')[0].get_text()
        thanks = soup.select('span.zm-profile-header-user-thanks > strong')[0].get_text()
        action5 = soup.select('span.num')
        asks = action5[0].get_text()
        answers = action5[1].get_text()
        posts = action5[2].get_text()
        if len(action5) > 3:
            collections = action5[3].get_text()
            logs = action5[4].get_text()
        else:
            collections = 'Null'
            logs = 'Null'
    
        followees = soup.select('a.item > strong')[0].get_text()
        followers = soup.select('a.item > strong')[1].get_text()
    
        # followed columns (zl) and followed topics (ht); the link text ends with a
        # three-character Chinese label that gets sliced off
        focus2 = soup.select('div.zm-profile-side-section-title > a > strong')
        if len(focus2) == 2:
            zl = focus2[0].get_text()[:-3]
            ht = focus2[1].get_text()[:-3]
        else:
            ht = focus2[0].get_text()[:-3]
            zl = '0'
        basicInfoSet = [name,location,business,gender,employment,position,education,
                        education_extra,agree,thanks,asks,answers,posts,collections,
                        logs,followees,followers,zl,ht]
        return basicInfoSet
    
    # Fetch the profile URLs of the users this user follows (those on the first page of /followees); returns a list
    def getFolloweesUrl(OneUrl):
        url = OneUrl + '/followees'
        wb_data = requests.get(url,headers = headers)
        soup = BeautifulSoup(wb_data.text,'lxml')
        alist = soup.select('a.zg-link.author-link')
        followeeUrlSet = []
        for i in alist:
            followeeUrlSet.append(i.get('href'))
        # returns an empty list if no followee links were found on the page
        return followeeUrlSet
    
    # Fetch a user's 20 most recent activities before the given Unix timestamp; returns a dict of {timestamp: activity type}
    def postActivitiesByDate(Purl,byDate):
        url = Purl + '/activities'
        data = {
            'start': byDate
        }
        wb_data = requests.post(url,headers = headers_post,data = data)
        #print(wb_data)
    
        # the JSON response's 'msg' field carries the rendered HTML of the activity feed
        soup = BeautifulSoup(wb_data.json()['msg'][1], 'lxml')
        activities = soup.select('div.zm-profile-section-item.zm-item.clearfix')
        actdata = {}
        for i in activities:
            actdata[i.get('data-time')] = i.get('data-type-detail')
        return actdata
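
    With the cookie (and matching _xsrf token) filled in, the helpers can be smoke-tested
    interactively. A minimal sketch, where the profile URL is a placeholder rather than a
    real account:

    from getZhihuInfo import getBasicInfo, getFolloweesUrl, postActivitiesByDate

    # hypothetical profile URL; substitute any real https://www.zhihu.com/people/<id> page
    url = 'https://www.zhihu.com/people/some-user-id'

    info = getBasicInfo(url)            # [name, location, business, gender, ...]
    print(info[0], info[8], info[9])    # name, upvotes (agree), thanks

    followees = getFolloweesUrl(url)    # profile URLs found on the followees page
    print(len(followees))

    acts = postActivitiesByDate(url, 1473379200)   # {timestamp: activity type}
    print(len(acts))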
    

    Next, collect a batch of Zhihu profile URLs by walking each user's followees, and store them in MongoDB:

    from getZhihuInfo import getFolloweesUrl
    import pymongo
    
    client = pymongo.MongoClient('localhost',27017)
    zhiHu = client['zhiHu']
    zhiHuId = zhiHu['zhiHuId']
    
    # Seed URL list: put one real profile URL here to start the crawl from
    urlSet = ['']
    
    # One-time initialization: uncomment for the first run to reset the collection
    # and insert the seed document
    # zhiHuId.delete_many({})
    # fd1 = {
    #     'id':0,
    #     'followees':urlSet
    # }
    # zhiHuId.insert_one(fd1)
    
    begin = 0   # start from id 0
    end = 1000
    dbId = 0    # largest id currently stored in the collection
    for k in range(begin,end):
        doc = zhiHuId.find_one({'id': k})
        if doc is None:     # no more stored batches to expand
            break
        for i in doc['followees']:
            followees = getFolloweesUrl(i)
            dbId += 1
            fd = {
                'id':dbId,
                'followees':followees
            }
            zhiHuId.insert_one(fd)
            print(dbId)
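
    Each document in zhiHuId ends up shaped like {'id': 42, 'followees': [profile URLs]}.
    A minimal sketch, assuming the same local MongoDB instance, for checking how far the
    crawl has progressed:

    import pymongo

    client = pymongo.MongoClient('localhost', 27017)
    zhiHuId = client['zhiHu']['zhiHuId']

    # number of followee batches collected so far
    print(zhiHuId.count_documents({}))

    # peek at the most recently inserted batch
    latest = zhiHuId.find_one(sort=[('id', pymongo.DESCENDING)])
    if latest:
        print(latest['id'], len(latest['followees']))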
    

    Then pull each URL back out of the database, fetch its profile data, and store the result in a second collection:

    from getZhihuInfo import setValue,getBasicInfo,postActivitiesByDate
    
    import pymongo
    import time
    
    
    client = pymongo.MongoClient('localhost',27017)
    zhiHu = client['zhiHu']
    zhiHuId = zhiHu['zhiHuId']
    zhiHuDetail = zhiHu['zhiHuDetail']
    
    # Unix timestamp (2016-09-09 00:00 UTC) passed as the activities 'start' parameter
    byDate160909 = 1473379200
    
    begin = 0   # start from id 0
    end = 1000
    count = 0
    for k in range(begin,end):
        doc = zhiHuId.find_one({'id': k})
        x = doc['followees'] if doc else None
        if x:
            for i in x:
                y = getBasicInfo(i)
                z = postActivitiesByDate(i,byDate160909)
    
                oneData = {
                    'name':y[0],'location':y[1],'business':y[2],
                    'gender':y[3], 'employment':y[4], 'position':y[5], 'education':y[6],
                    'education_extra':y[7], 'agree':y[8],'thanks':y[9], 'asks':y[10],
                    'answers':y[11], 'posts':y[12], 'collections':y[13], 'logs':y[14], 'followees':y[15],
                    'followers':y[16], 'zl':y[17], 'ht':y[18],
                    'activities':z
                }
                zhiHuDetail.insert_one(oneData)
                count += 1
                print(k,'----',count)
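
    Once zhiHuDetail has accumulated some documents, the data can be explored directly in
    MongoDB. A minimal sketch, using the field names stored above, that prints the gender
    distribution and the ten most common locations:

    import pymongo

    client = pymongo.MongoClient('localhost', 27017)
    zhiHuDetail = client['zhiHu']['zhiHuDetail']

    # distribution of the 'gender' field ('male' / 'female' / 'Unknown')
    for row in zhiHuDetail.aggregate([
        {'$group': {'_id': '$gender', 'n': {'$sum': 1}}},
        {'$sort': {'n': -1}},
    ]):
        print(row['_id'], row['n'])

    # ten most frequent values of the 'location' field
    for row in zhiHuDetail.aggregate([
        {'$group': {'_id': '$location', 'n': {'$sum': 1}}},
        {'$sort': {'n': -1}},
        {'$limit': 10},
    ]):
        print(row['_id'], row['n'])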
    
