POST 请求的大致逻辑
- 构建表单数据(并伪装请求头)
- 通过请求对象 req 传递表单
- 从 response 获得结果
from urllib import request
from urllib.parse import urlencode
from fake_useragent import UserAgent
# Send a POST request with urllib.
web = UserAgent()
webr = web.random  # a randomly chosen browser User-Agent string
# The target URL is a test endpoint.
req_url = 'https://httpbin.org/post'
formdata = {
    'name': '崔',
    'age': 12,
    'gender': '男',
    'class': '5081',
}
# The form data goes through two steps:
# 1. urlencode() converts the dict into a URL-encoded string;
# 2. encode() converts that string into bytes, the type urlopen() expects.
data_tranfrom = urlencode(formdata).encode()
print(data_tranfrom)
# Request() needs `headers` to be a mapping of header name -> value; passing
# the bare User-Agent string fails because Request iterates headers.items().
req = request.Request(url=req_url, headers={'User-Agent': webr})
# The context manager closes the connection once the body has been read.
# NOTE(review): timeout raised from 1 s to 10 s; one second is easily
# exceeded by an HTTPS round-trip to a public service.
with request.urlopen(req, data=data_tranfrom, timeout=10) as response:
    # HTTP status code, e.g. 200, 404, 500
    print(response.status)
    p = response.read().decode()
print(p)
-
作业
- 有道翻译取数据
from urllib import request
from urllib.parse import urlencode
from fake_useragent import UserAgent
import json
#分析网页
#http://fanyi.youdao.com/
def youdao():
    """Translate text typed by the user via Youdao's legacy translate endpoint.

    Prompts for the text, POSTs it as a URL-encoded form, prints the HTTP
    status, the raw response body and the types of the raw/parsed payload,
    and finally prints the first translated segment from the JSON reply.
    """
    browser = UserAgent()
    # Legacy endpoint: per the original note it does not require the `sign`
    # parameter, which is encrypted in the newer API.
    req_url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null'
    inf = input('请输入信息')
    # Form fields as observed in the browser's request.
    form_data = {
        'i': inf,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'doctype': 'json',
        'version': '2.1',
        'keyfrom': 'fanyi.web',
        'action': 'FY_BY_CLICKBUTTION',
        'typeResult': 'false',
    }
    # URL-encode the form first, then turn it into a byte stream.
    req_data = urlencode(form_data).encode('utf-8')
    # Disguise the request as a browser. The header name must be spelled
    # 'User-Agent' (hyphen); with an underscore urllib still sends its own
    # default "Python-urllib" User-Agent alongside the unknown header.
    req_header = {
        'User-Agent': browser.random,
    }
    # Build the request object; supplying `data` makes it a POST.
    req = request.Request(url=req_url, data=req_data, headers=req_header)
    # Fetch the response and make sure the connection is closed afterwards.
    with request.urlopen(req) as response:
        # Status code
        rsp_num = response.status
        print(rsp_num)
        # Body
        rsp_inf = response.read().decode('utf-8').strip()
    print(rsp_inf)
    print(type(rsp_inf))
    # Parse the JSON text into Python objects.
    result_j = json.loads(rsp_inf)
    print(type(result_j))
    print('翻译结果结果是')
    print(result_j['translateResult'][0][0]['tgt'])
# Run the demo only when this snippet is executed as a script.
if __name__ == '__main__':
    youdao()
作业
- 豆瓣登录
from urllib.parse import urlencode
from urllib import request
import json
def douban():
    """POST the Douban login form and save the returned HTML to page.html.

    Prints the HTTP status code of the response. The response body is decoded
    as UTF-8 and written to ``page.html`` in the current working directory.
    """
    req_url = 'https://accounts.douban.com/login'
    # NOTE(review): the account and password are hard-coded here; consider
    # reading them from the environment or a config file instead.
    form_data = {
        'source': 'movie',
        'redir': 'https://movie.douban.com/',
        'form_email': '2332256766@qq.com',
        'form_password': '密码',
        'login': '登录',
    }
    req_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36',
    }
    req_data = urlencode(form_data).encode('utf-8')
    req = request.Request(url=req_url, data=req_data, headers=req_header)
    # Close the connection as soon as the body has been read.
    with request.urlopen(req) as response:
        print(response.status)
        # Read the response body.
        html_content = response.read().decode('utf-8')
    # Write with an explicit encoding: the platform default may be unable to
    # represent every character that was decoded from the UTF-8 body.
    with open('page.html', 'w', encoding='utf-8') as file:
        file.write(html_content)
# Run the demo only when this snippet is executed as a script.
if __name__ == '__main__':
    douban()
作业
- 拉勾网取数据
from urllib import request
from urllib.parse import urlencode
from fake_useragent import UserAgent
import json
#https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false
#通过url确定为post请求
# first:false
# pn:2
# kd:Java
# first:false
# pn:3
# kd:Java
# first:false
# pn:4
# kd:Java
def lagoSpider():
    """Query Lagou's position search for pages 1-30 and hand each page to parse_data.

    Prompts for a search keyword, POSTs one form per page number, prints the
    status code and the raw JSON text of each successful response, and passes
    that text to ``parse_data``.
    """
    browers = UserAgent()
    keyword = input('请输入查询内容')
    # Endpoint URL
    req_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    for pg in range(1, 31):
        # Form data of the POST request; `pn` is the page number.
        form_data = {
            'first': 'false',
            'pn': pg,
            'kd': keyword,
        }
        # Convert to bytes.
        req_data = urlencode(form_data).encode('utf-8')
        # Headers: a fresh random User-Agent per request plus the Referer
        # that the original notes list as an anti-scraping requirement.
        req_header = {
            'User-Agent': browers.random,
            'Referer': 'https://www.lagou.com/jobs/list_Java?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput='
        }
        req = request.Request(url=req_url, data=req_data, headers=req_header)
        # Close each response explicitly; 30 requests would otherwise leave
        # their connections to be reclaimed by the garbage collector.
        with request.urlopen(req) as response:
            print(response.status)
            if response.status != 200:
                continue
            print('请求成功')
            js_file = response.read().decode('utf-8')
        print(js_file)
        parse_data(js_file)
def parse_data(text):
    """Extract position records from a JSON reply and append them to jsontext.json.

    ``text`` is a JSON string whose ``content.positionResult.result`` entry is
    a list of position objects. For every position, the fields
    ``positionName``, ``firstType`` and ``companyFullName`` are copied into a
    small dict that is appended to ``jsontext.json`` as one JSON document per
    line. Also prints the type of the parsed payload and the record count.

    Raises ``KeyError`` if an expected key is missing and
    ``json.JSONDecodeError`` if ``text`` is not valid JSON.
    """
    # The response is a JSON string; convert it into Python data first.
    result = json.loads(text)
    print(type(result))
    positionResult = result['content']['positionResult']['result']
    print(len(positionResult))
    # Mode 'a' appends to the end of the file, so repeated calls accumulate
    # records. The encoding is explicit because ensure_ascii=False emits
    # non-ASCII characters verbatim. The file is opened once per call rather
    # than once per record.
    with open('jsontext.json', 'a', encoding='utf-8') as file:
        for i in positionResult:
            # The output key spelling 'postionName' is kept as-is so existing
            # consumers of the file keep working.
            postion_dict = {
                'postionName': i['positionName'],
                'publishName': i['firstType'],
                'companyName': i['companyFullName'],
            }
            json_srt = json.dumps(postion_dict, ensure_ascii=False)
            # Terminate each record with a real newline (the original wrote
            # the letter 'n', gluing all records together on one line).
            file.write(json_srt + '\n')
# Run the spider only when this snippet is executed as a script.
if __name__ == '__main__':
    lagoSpider()
-
注意
反爬
'Referer':'https://www.lagou.com/jobs/list_Java?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput='
cookie
网友评论