公众号
- 灏泽异谈
文章列表链接
使用charles分析公众号请求
-
如图
image.png
找有用信息
- 完整URL请求地址
- 完整的请求头(headers)信息,Headers 里面包括了 Cookie、User-Agent、Host 等信息。
- 因为 requests.get 方法里面的 headers 参数必须是字典对象,所以,先要写个函数把刚刚拷贝的字符串转换成字典对象。
def headers_to_dict(headers):
    """Convert a raw HTTP-header string into a dict usable by requests.

    Example input (one ``Name: value`` pair per line)::

        Host: mp.weixin.qq.com
        Connection: keep-alive
        Cache-Control: max-age=

    becomes::

        {"Host": "mp.weixin.qq.com",
         "Connection": "keep-alive",
         "Cache-Control": "max-age="}

    :param headers: str, header block copied from a capture tool such as Charles
    :return: dict mapping header names to values
    """
    d_headers = dict()
    # splitlines() also copes with \r\n endings from a Windows clipboard.
    for line in headers.splitlines():
        line = line.strip()
        # Skip blank lines and lines without a separator (e.g. a stray
        # status line pasted along with the headers) instead of crashing.
        if not line or ":" not in line:
            continue
        k, v = line.split(":", 1)
        # Strip the key as well: triple-quoted strings defined inside a
        # function carry the source indentation on every line, which would
        # otherwise produce keys like "    accept".
        d_headers[k.strip()] = v.strip()
    return d_headers
完整源代码
import requests
# -*- coding: utf-8 -*-
__author__ = "zoranlee"
def headers_to_dict(headers):
    """Convert a raw HTTP-header string into a dict usable by requests.

    Example input (one ``Name: value`` pair per line)::

        Host: mp.weixin.qq.com
        Connection: keep-alive
        Cache-Control: max-age=

    becomes::

        {"Host": "mp.weixin.qq.com",
         "Connection": "keep-alive",
         "Cache-Control": "max-age="}

    :param headers: str, header block copied from a capture tool such as Charles
    :return: dict mapping header names to values
    """
    d_headers = dict()
    # splitlines() also copes with \r\n endings from a Windows clipboard.
    for line in headers.splitlines():
        line = line.strip()
        # Skip blank lines and lines without a separator (e.g. a stray
        # status line pasted along with the headers) instead of crashing.
        if not line or ":" not in line:
            continue
        k, v = line.split(":", 1)
        # Strip the key as well: triple-quoted strings defined inside a
        # function carry the source indentation on every line, which would
        # otherwise produce keys like "    accept".
        d_headers[k.strip()] = v.strip()
    return d_headers
# Extract the article data embedded in the downloaded page.
def extract_data(html_content):
    """Extract the history-article list embedded in the page source.

    The homepage inlines its payload as ``data={...};`` inside a script
    block; we capture the JSON object, unescape HTML entities and parse it.

    :param html_content: str, page source of the account's homepage
    :return: list of article dicts; empty list when the payload cannot be
        located or parsed (instead of implicitly returning None)
    """
    import re
    import html
    import json

    # Capture from the opening brace up to and including the first newline.
    # NOTE(review): assumes the inlined payload ends with ";\n" — the two
    # trailing characters are sliced off below before json.loads.
    pattern = re.compile(pattern=r"data=({.*?\n)", flags=re.S)
    match = pattern.search(html_content)
    if not match:
        # Payload not found: page layout changed or the request was blocked.
        return []
    data = html.unescape(match.group(1))
    data = data[:-2]  # drop the trailing ';\n'
    try:
        data = json.loads(data)
    except json.JSONDecodeError:
        return []
    # Guard against a payload that lacks the expected key.
    articles = data.get("appmsg_list") or []
    for item in articles:
        print(item)
    return articles
def crawl():
    """Fetch the official-account homepage and extract its article list.

    Side effects: saves the raw page to ``weixin_history.html`` in the
    current working directory and prints the extracted articles.
    """
    url = "https://mp.weixin.qq.com/mp/homepage?__biz=MzI4OTUyODgwMQ==&hid=1&sn=2324eec706f1b6ceb8f8b2a1e35671ee&scene=18&devicetype=iOS13.6.1&version=17001127&lang=zh_CN&nettype=WIFI&ascene=7&session_us=gh_e340a4c9f6df&fontScale=100&pass_ticket=Gd7oyTKM6dlbkNgUH3qtICelGKGOz2qQ8S56kql%2FhvSnK5zySYhIP2UsniJPiTox&wx_header=1"
    # Headers captured with Charles; headers_to_dict turns the raw text
    # into the dict that requests.get expects.
    headers = """
accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
x-wechat-uin:MTI0MzQ2NQ%3D%3D
user-agent:Mozilla/5.0 (iPhone; CPU iPhone OS 13_6_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/7.0.17(0x17001127) NetType/WIFI Language/zh_CN
accept-language:zh-cn
accept-encoding:gzip, deflate, br
"""
    headers = headers_to_dict(headers)
    # NOTE(review): verify=False disables TLS certificate verification.
    # It is needed when sniffing through Charles' proxy certificate but is
    # insecure for normal use — confirm before shipping.  The timeout keeps
    # the call from hanging forever on a dead connection.
    response = requests.get(url, headers=headers, verify=False, timeout=30)
    # Persist the raw page so it can be inspected in a browser afterwards.
    with open("weixin_history.html", "w", encoding="utf-8") as f:
        f.write(response.text)
    # Extract and show the article list.
    articles = extract_data(response.text)
    print(articles)


if __name__ == '__main__':
    crawl()
网友评论