美文网首页
python爬取微博内容-简版

python爬取微博内容-简版

作者: 阪本先生_ | 来源:发表于2020-04-10 10:25 被阅读0次

    首先地址是m.weibo.cn,不是网页版的,这是手机版的


    1.jpg 2.jpg 3.jpg 4.jpg

    '''

    import requests
    from bs4 import BeautifulSoup
    import json
    from pyquery import PyQuery as pq
    from pymongo import MongoClient
    
    # HTTP headers mimicking a browser AJAX call against the *mobile* Weibo
    # site (m.weibo.cn); 'X-Requested-With: XMLHttpRequest' makes the
    # container API answer with JSON instead of an HTML page.
    headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/5088862652',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400',
    'X-Requested-With': 'XMLHttpRequest'
        }
    # Exploratory one-off request kept for reference: it fetched one page
    # directly and pulled out since_id, the pagination cursor reused below.
    # params = {
    #         'uid': '5088862652',
    #         'luicode': '10000011',
    #         'lfid': '231093_-_selffollowed',
    #         'type': 'uid',
    #         'value': '5088862652',
    #         'containerid': '1076035088862652',
    #         'since_id': '4485032922577236',}
    # base_url = 'https://m.weibo.cn/api/container/getIndex?'
    # response = requests.get(url=base_url,headers=headers,params=params).json()
    # since_id = response['data']['cardlistInfo']['since_id']
    
    # MongoDB connection (default localhost:27017): database "weibo",
    # collection "weibo" — written to by save_to_mongo below.
    client = MongoClient()
    db = client['weibo']
    collection = db['weibo']
    
    
    
    def get_page(since_id=0):
        """Fetch one page of the user's weibo timeline as parsed JSON.

        Bug fix: the original signature was ``get_page(page)`` but the
        parameter was never used — the body read the *global* ``since_id``
        instead. The parameter is now the cursor itself; callers already
        pass it positionally, so the call sites are unchanged.

        Args:
            since_id: pagination cursor from the previous response
                (``data.cardlistInfo.since_id``); 0 requests the first page.

        Returns:
            dict: the decoded JSON body returned by the container API.
        """
        params = {
            'uid': '5088862652',
            'luicode': '10000011',
            'lfid': '231093_-_selffollowed',
            'type': 'uid',
            'value': '5088862652',
            'containerid': '1076035088862652',
        }
        # Only send the cursor after the first page; with no since_id the
        # API starts from the top of the timeline.
        if since_id != 0:
            params['since_id'] = since_id
        base_url = 'https://m.weibo.cn/api/container/getIndex?'
        response = requests.get(url=base_url, headers=headers, params=params).json()
        return response
    
    def parse_page(payload):
        """Yield one summary dict per weibo post in an API response.

        Fixes: the parameter was named ``json``, shadowing the stdlib
        module imported at the top of the file; and cards without an
        ``mblog`` entry (ads, card groups) raised KeyError — they are now
        skipped. Callers pass the argument positionally, so the rename is
        backward-compatible.

        Args:
            payload: decoded JSON dict as returned by ``get_page``.

        Yields:
            dict with keys ``text`` (plain text — HTML such as ``<br/>``
            stripped via pyquery), ``id``, ``attitudes`` (like count) and
            ``comments`` (comment count).
        """
        cards = payload['data']['cards']  # list of timeline cards
        for card in cards:
            mblog = card.get('mblog')
            if not mblog:
                # Not an actual post (e.g. an ad card) — skip instead of
                # letting a KeyError escape the generator.
                continue
            weibo = {}
            weibo['text'] = pq(mblog['text']).text()
            weibo['id'] = mblog['id']
            weibo['attitudes'] = mblog['attitudes_count']
            weibo['comments'] = mblog['comments_count']
            yield weibo
    
    def save_to_mongo(result):
        """Insert one weibo dict into the module-level Mongo collection.

        Uses ``insert_one`` — ``Collection.insert`` has been deprecated
        since pymongo 3 and was removed entirely in pymongo 4.

        Args:
            result: a single weibo dict produced by ``parse_page``.
        """
        # InsertOneResult is always truthy; a failed insert raises instead.
        if collection.insert_one(result):
            print("Mongo写入")
    
    if __name__ == '__main__':
        # Crawl four pages, threading the pagination cursor through each
        # request. Renamed the response variable from ``json`` (which
        # shadowed the imported stdlib module) to ``data``.
        since_id = 0  # 0 = start from the first page
        for page in range(1, 5):
            data = get_page(since_id)
            # Each response carries the cursor for the NEXT page; feed it
            # back in on the following iteration.
            since_id = data['data']['cardlistInfo']['since_id']
            print("======")

            results = parse_page(data)
            for result in results:
                try:
                    print(result)
                    save_to_mongo(result)
                except Exception:
                    # A single post may fail to print (console encoding) or
                    # to insert; skip it and keep crawling. Narrowed from a
                    # bare ``except:`` so Ctrl-C still stops the script.
                    print('=' * 10 + "此内容无法显示" + "=" * 10)
    
    
    
    

    相关文章

      网友评论

          本文标题:python爬取微博内容-简版

          本文链接:https://www.haomeiwen.com/subject/xnplmhtx.html