# ########## GET request ##########
# http://top.hengyan.com/dianji/default.aspx?p=1
# http://top.hengyan.com/dianji/default.aspx?p=2
from urllib import request
# Fetch one ranking page with a GET request, print it, and save it to disk.
url = 'http://top.hengyan.com/dianji/default.aspx?p=1'

# Build the request headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}

# Parameters accepted by request.urlopen():
#   url      - the target URL
#   data     - defaults to None, which means a GET request; a non-None value
#              makes it a POST request
#   timeout  - timeout for the request
#   cafile=None, capath=None, cadefault=False - certificate-related parameters
#   context=None - the original note describes this as the way to skip
#                  certificate verification
#
# urlopen() called with a bare URL string cannot carry request headers, e.g.
#   request.urlopen(url=url, timeout=10)
# so the URL is wrapped in a Request object that holds the headers instead.
# (The header-less call used to be executed as well; its response was
# discarded immediately and never closed, so it is kept only as an example.)
req = request.Request(url=url, headers=headers)

# The with-block closes the HTTP response once the body has been read.
with request.urlopen(req, timeout=10) as response:
    # Response status code.
    code = response.status
    # URL of the current request.
    url = response.url
    print(code, url)
    b_content = response.read()

# bytes -> str: decode
# str -> bytes: encode
print(b_content)
html = b_content.decode('utf-8')
print(html)

# File operations. Modes accepted by open():
#   w, w+, wb, wb+, a, a+, ab, ab+, r, rb
# The encoding is given explicitly so the text decoded as UTF-8 above is
# written back as UTF-8 instead of the platform default encoding.
with open('hengyan.html', 'w', encoding='utf-8') as file:
    file.write(html)
###############post请求###########
from urllib import parse
import json
def get_ssjy_data(page=1, *, opener=None):
    """Print the search results of the jiayuan.com search endpoint, page by page.

    Starting at ``page``, the search form is POSTed to the endpoint, the
    ``##jiayser##`` markers are stripped from the response text, the remainder
    is parsed as JSON, and the ``age``, ``nickname`` and ``sex`` of every entry
    in ``userInfo`` are printed. Pages are fetched one after another until the
    ``pageTotal`` value reported by the response is reached.

    Pagination is a loop rather than one recursive call per page, so a large
    ``pageTotal`` cannot exhaust the interpreter's recursion limit.

    Args:
        page: Number of the first page to fetch.
        opener: Optional callable used instead of ``urllib.request.urlopen``.
            It is called as ``opener(req, timeout=10)`` and must return a
            response usable as a context manager that exposes ``status`` and
            ``read()``.

    Returns:
        None. All results are written to standard output.

    Raises:
        Whatever the opener raises for network/HTTP failures, plus
        ``json.JSONDecodeError`` / ``KeyError`` / ``ValueError`` if the
        response does not have the expected layout.
    """
    # Jiayuan (世纪佳缘) search endpoint.
    url = 'http://search.jiayuan.com/v2/search_v2.php'

    # Build the request headers.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }

    open_url = request.urlopen if opener is None else opener

    while True:
        # POST form fields; 'p' is the page number.
        form_data = {
            'sex': 'f', 'key': '', 'stc': '1:11,2:20.28,23:1',
            'sn': 'default', 'sv': '1', 'p': str(page),
            'f': 'search', 'listStyle': 'bigPhoto',
            'pri_uid': '0', 'jsversion': 'v5',
        }
        form_data = parse.urlencode(form_data).encode('utf-8')
        print(form_data)
        # e.g. b'sex=f&key=&stc=1%3A11%2C2%3A20.28%2C23%3A1&sn=default&sv=1&p=1&f=search&listStyle=bigPhoto&pri_uid=0&jsversion=v5'

        # Build the request object; passing data makes this a POST request.
        req = request.Request(url=url, data=form_data, headers=headers)

        # The with-block closes the HTTP response after the body is read.
        with open_url(req, timeout=10) as response:
            if response.status != 200:
                # Any other status is ignored silently, as before.
                return
            content = response.read().decode('utf-8')

        # Strip the '##jiayser##' markers so that only the JSON text remains.
        content = content.replace('##jiayser##//', '').replace('##jiayser##', '')

        # json.load():  read JSON text from a file object into Python data
        # json.loads(): convert a JSON string into Python data
        # json.dump():  serialize Python data as JSON into a file object
        # json.dumps(): serialize Python data into a JSON string
        data = json.loads(content)
        print(type(data))

        for user in data['userInfo']:
            age = user['age']
            name = user['nickname']
            gender = user['sex']
            print(age, name, gender)

        total_page = int(data['pageTotal'])
        print(str(page) + '页数据提取完毕')

        if page >= total_page:
            # All pages have been extracted.
            print('数据提取完毕')
            return

        # Move on to the next page.
        page += 1
# 网友评论 ("Reader comments" heading left over from the source web page; not part of the script.)