WeChat Official Account Crawler

Author: 繁著 | Published 2017-10-19 18:26

    On 2017-06-06 the WeChat team released an update:

    "The ability to insert, in the body of a rich-media message, links to articles already published by your own account or by other official accounts is now open to all official accounts."

    With this interface, we can therefore crawl the article links of any specified official account.

    Reference: 静觅

    Prerequisites: a WeChat subscription account (订阅号) and Selenium installed (a quick setup check follows below).
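
    Before running the scripts it helps to confirm that Selenium can actually drive Chrome. The following is a minimal sanity check, assuming chromedriver has already been downloaded; the path below is a placeholder that you should replace with your own.

    # -*- coding:utf-8 -*-
    # check_env.py - optional sanity check for the Selenium + chromedriver setup
    from selenium import webdriver

    # Replace with the location of your chromedriver binary
    driver = webdriver.Chrome('/path/to/chromedriver')
    driver.get('https://mp.weixin.qq.com/')
    print(driver.title)  # should print the title of the official-account platform page
    driver.quit()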

    Crawling steps:

    1. get_cookie.py: log in with Selenium and save the cookies. After the password is entered you need to tick the "Remember me" option, then scan the QR code with WeChat to make sure the login goes through.
    # -*- coding:utf-8 -*-
    from selenium import webdriver
    import time
    import json

    post = {}

    # Use a desktop user agent so the login page renders normally
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36")
    driver = webdriver.Chrome('/Users/weiweiwang/code/financialNews/chromedriver', chrome_options=chrome_options)
    driver.get('https://mp.weixin.qq.com/')
    time.sleep(2)

    # Fill in the account name and password
    driver.find_element_by_xpath("./*//input[@name='account']").clear()
    driver.find_element_by_xpath("./*//input[@name='account']").send_keys('your account')
    driver.find_element_by_xpath("./*//input[@name='password']").clear()
    driver.find_element_by_xpath("./*//input[@name='password']").send_keys('your password')
    # After the password has been filled in, remember to tick "Remember me"
    time.sleep(5)
    driver.find_element_by_xpath("./*//a[@class='btn_login']").click()
    # Scan the QR code with your phone!
    time.sleep(15)

    # Reload the home page and dump the login cookies to cookie.txt
    driver.get('https://mp.weixin.qq.com/')
    cookie_items = driver.get_cookies()
    for cookie_item in cookie_items:
        post[cookie_item['name']] = cookie_item['value']
    with open('cookie.txt', 'w') as f:
        json.dump(post, f)
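
    Before moving on to the second script you can check that the saved cookies still give a logged-in session: when the login is valid, requesting the home page redirects to a URL that carries a token parameter. The snippet below is a minimal check using the same pattern as get_url.py; the file name check_cookie.py is just an illustrative choice, not part of the original article.

    # -*- coding:utf-8 -*-
    # check_cookie.py - optional: verify that cookie.txt still works
    import io
    import json
    import re
    import requests

    with io.open('cookie.txt', 'r') as f:
        cookies = json.loads(f.read())

    # A valid session is redirected to a URL containing token=...
    response = requests.get('https://mp.weixin.qq.com', cookies=cookies)
    match = re.search(r'token=(\d+)', response.url)
    print('token:', match.group(1) if match else 'not logged in - rerun get_cookie.py')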
    
    2. get_url.py: fetch the links of all historical articles of the official accounts you want.
    # -*- coding:utf-8 -*-
    import requests
    import io
    import json
    import re
    import random
    import time

    # WeChat IDs of the official accounts to crawl
    gzlist = ['yq_Butler']

    url = 'https://mp.weixin.qq.com'
    header = {
        "HOST": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
    }

    # Load the cookies saved by get_cookie.py and extract the session token from the redirected URL
    with io.open('cookie.txt', 'r') as f:
        cookie = f.read()
    cookies = json.loads(cookie)
    response = requests.get(url=url, cookies=cookies)
    token = re.findall(r'token=(\d+)', str(response.url))[0]

    for query in gzlist:
        # Search for the official account to obtain its fakeid
        query_id = {
            'action': 'search_biz',
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'random': random.random(),
            'query': query,
            'begin': '0',
            'count': '5',
        }
        search_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?'
        search_response = requests.get(search_url, cookies=cookies, headers=header, params=query_id)
        lists = search_response.json().get('list')[0]
        fakeid = lists.get('fakeid')

        # The first appmsg request is only used to read app_msg_cnt, the total number of articles
        query_id_data = {
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1',
            'random': random.random(),
            'action': 'list_ex',
            'begin': '0',
            'count': '5',
            'query': '',
            'fakeid': fakeid,
            'type': '9'
        }
        appmsg_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg?'
        appmsg_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
        max_num = appmsg_response.json().get('app_msg_cnt')
        num = int(int(max_num) / 5)

        # Page through the article list, 5 items per request
        begin = 0
        while num + 1 > 0:
            query_id_data = {
                'token': token,
                'lang': 'zh_CN',
                'f': 'json',
                'ajax': '1',
                'random': random.random(),
                'action': 'list_ex',
                'begin': '{}'.format(str(begin)),
                'count': '5',
                'query': '',
                'fakeid': fakeid,
                'type': '9'
            }
            print('Paging ###################', begin)
            query_fakeid_response = requests.get(appmsg_url, cookies=cookies, headers=header, params=query_id_data)
            fakeid_list = query_fakeid_response.json().get('app_msg_list')
            for item in fakeid_list:
                print(item.get('link'))
            num -= 1
            begin += 5
            time.sleep(2)
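
    The script above only prints each article link. To keep the results for later processing, one small variant appends every article to a JSON-lines file instead of printing it. This is a sketch; the 'title' field is assumed to be present in the app_msg_list items alongside 'link'.

    # -*- coding:utf-8 -*-
    # Variant of the inner loop of get_url.py: persist articles instead of printing them
    import io
    import json

    def save_articles(fakeid_list, path='articles.jsonl'):
        # Append one JSON object per article to a .jsonl file
        with io.open(path, 'a', encoding='utf-8') as f:
            for item in fakeid_list:
                record = {
                    'title': item.get('title'),  # assumed field name
                    'link': item.get('link'),
                }
                f.write(json.dumps(record, ensure_ascii=False) + u'\n')

    # Inside the while loop, replace the print loop with:
    #     save_articles(fakeid_list)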
    

    The full code is available on GitHub.
