Basic usage of parse
- urlencode(): converts key/value pairs from a dict into a string of the form "key=value" (URL-encodes the parameters so a web server can accept them)
- parse.unquote(): converts a URL-encoded string back into the original string
- parse.quote(): URL-encodes a plain string (not a dict)
from urllib import request, parse
wd = {
    "kw": "你好"
}
aa = "你好"
a = parse.quote(aa)        # string -> URL-encoded string
b = parse.unquote(a)       # URL-encoded string -> original string
c = parse.urlencode(wd)    # dict -> "key=value" URL-encoded string
d = parse.unquote(c)       # decode it back again
print(a)
print(b)
print(c)
print(type(d))
print(d)
"""
%E4%BD%A0%E5%A5%BD
你好
kw=%E4%BD%A0%E5%A5%BD
<class 'str'>
"""
When submitting data with an HTTP request, the parameters generally need to be URL-encoded first, and then either appended to the URL itself or passed as the data argument of a Request object.
GET method
A GET request is normally used to fetch data from the server. For example, searching Baidu for 美女: https://www.baidu.com/s?wd=美女
The Chinese keyword ends up percent-encoded as hexadecimal bytes:
https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3
In the request we can see that after http://www.baidu.com/s? there is a long string: it is simply the URL-encoded form of the keyword 美女, so we can try sending the request with the default GET method.
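To reproduce that encoded URL ourselves, parse.quote() is enough. A minimal sketch (the User-Agent value here is just a placeholder):
from urllib import parse, request

keyword = "美女"
url = "https://www.baidu.com/s?wd=" + parse.quote(keyword)
print(url)   # https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3
# the encoded URL can then be fetched as usual; Baidu usually expects
# a browser-like User-Agent header before returning the real result page
req = request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
# response = request.urlopen(req)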
Small examples
Baidu search
"""
https://www.baidu.com/s?wd=%E9%A9%AC%E4%BA%91&pn=10
"""
from urllib import parse,request
from fake_useragent import UserAgent
def searchSpider(kw, start_page, end_page):
    for page in range(start_page, end_page + 1):
        # convert the dict of parameters into a URL-encoded string
        params = {
            "wd": kw,
            "pn": (page - 1) * 10
        }
        result = parse.urlencode(params)
        print(result)
        full_url = "https://www.baidu.com/s?" + result
        print(full_url)
        html = load_page(full_url)
        filename = kw + "_page_" + str(page) + ".html"
        save_page_html(html, filename)
def load_page(url):
    req_header = {
        "User-Agent": ua.random
    }
    print(req_header)
    req = request.Request(url, headers=req_header)
    response = request.urlopen(req)
    if response.status == 200:
        print("request succeeded")
        return response.read().decode()
def save_page_html(html, filename):
    with open("test/" + filename, "w", encoding="utf-8") as file:
        file.write(html)
if __name__ == "__main__":
    ua = UserAgent()
    kw = input("Enter a keyword: ")
    # start and end page numbers
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    searchSpider(kw, start_page, end_page)
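Note that save_page_html() writes into a test/ directory and will raise FileNotFoundError if it does not exist. One way to make sure it is there (a small addition, not part of the original script):
import os
os.makedirs("test", exist_ok=True)   # create the output directory if missing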
Baidu Tieba
# download the images from Baidu Tieba posts to the local disk
# first, work out the pattern of the paginated list URLs:
'''
https://tieba.baidu.com/f?ie=utf-8&kw=%E7%BE%8E%E5%A5%B3&fr=search
https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=50
https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=100
'''
import re
from fake_useragent import UserAgent
from urllib import parse,request
def tiebaSpider(name, start_page, end_page):
    for page in range(start_page, end_page + 1):
        params = {
            "kw": name,
            "ie": "utf-8",
            "pn": (page - 1) * 50
        }
        a = parse.urlencode(params)
        full_url = "https://tieba.baidu.com/f?" + a
        html = load_data(full_url)
        # extract the detail-page URLs from the list-page source
        tiezi_urlinfo = parse_page_detail_url(html)
        for note in tiezi_urlinfo:
            detail_url = "https://tieba.baidu.com" + note[0]
            title = note[1]
            # print("fetching details of post: " + title)
            html = load_data(detail_url)
            images = parse_detail_imageurl(html)
            download_image(images)
def load_data(url):
    req_header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    req = request.Request(url, headers=req_header)
    response = request.urlopen(req)
    if response.status == 200:
        return response.read().decode()
# use a regex to extract the detail URL of each post from the list-page HTML
def parse_page_detail_url(html):
    pattern = re.compile('<div.*?class="threadlist_title pull_left j_th_tit ">' +
                         '.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?</div>', re.S)
    result = re.findall(pattern, html)
    return result
def parse_detail_imageurl(html):
    # the images posted inside a thread carry the class BDE_Image
    pattern = re.compile('<img.*?class="BDE_Image".*?src="(.*?)".*?>', re.S)
    result = re.findall(pattern, html)
    print("image links", result)
    return result
def download_image(images):
    for i in images:
        req_header = {
            "User-Agent": ua.random
        }
        req = request.Request(i, headers=req_header)
        response = request.urlopen(req)
        if response.status == 200:
            filename = response.url[-20:]
            with open("img/" + filename, "wb") as file:
                file.write(response.read())
            print(filename, "downloaded")
if __name__ == "__main__":
    ua = UserAgent()
    name = input("Enter a keyword: ")
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    tiebaSpider(name, start_page, end_page)
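As with the previous example, download_image() saves into an img/ directory that must already exist; one way to create it before running (a small addition, not part of the original script):
import os
os.makedirs("img", exist_ok=True)   # create the image directory if missing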
POST method
As mentioned above, the Request object has a data parameter, which is what POST uses: the data we want to send is passed in as data. Note that data cannot be a plain dict: the key/value pairs first have to be URL-encoded with urlencode() and then converted to bytes with encode().
Sending a POST request
# test endpoint: https://httpbin.org/post
from urllib import parse, request
url = 'https://httpbin.org/post'
# form data
form_dict = {
    'name': '红红火火',
    'age': 18,
    'gender': '男'
}
# first use urlencode() to turn the parameters into a URL-encoded string,
# then use encode() to turn that string into bytes
formdata = parse.urlencode(form_dict).encode('utf-8')
# without custom request headers
response = request.urlopen(url, data=formdata)
print(response.status)
print(response.read().decode('utf-8'))
# with request headers
req_header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
req = request.Request(url, headers=req_header, data=formdata)
response = request.urlopen(req)
print(response.status)
print(response.read().decode('utf-8'))
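httpbin.org/post simply echoes the request back as JSON, so one way to confirm the form arrived is to parse the response body: the submitted fields should come back under the 'form' key. A minimal sketch (it re-sends the request, because the response above has already been read):
import json
resp = request.urlopen(request.Request(url, headers=req_header, data=formdata))
echoed = json.loads(resp.read().decode('utf-8'))
print(echoed['form'])   # should echo the submitted name/age/gender values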
Youdao translation
from urllib import parse,request
import json
# target URL of the POST request (an older endpoint, used here because it does not
# require the sign parameter, which is encrypted in the current version)
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null"
formdata = {
'i': '我的祖国',
'from': 'AUTO',
'to': 'AUTO',
'smartresult': 'dict',
'client': 'fanyideskweb',
'doctype': 'json',
'version': '2.1',
'keyfrom': 'fanyi.web',
'action': 'FY_BY_CLICKBUTTION',
'typoResult': 'false',
}
formdata = parse.urlencode(formdata).encode('utf-8')
req_header = {
'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
req = request.Request(url,headers=req_header,data=formdata)
response = request.urlopen(req)
print(response.status)
# print(response.read().decode('utf-8'))
json_str = response.read().decode('utf-8')
print(json_str)
# json.loads(): convert a JSON string into Python data types;
#   JSON objects -> dict, JSON arrays -> list
# json.load():  same conversion, but reads from a file object
# json.dumps(): convert Python data into a JSON string
# json.dump():  same conversion, but writes to a file object
# (see the short sketch after this example)
data = json.loads(json_str)
print(type(data))
result = data['translateResult'][0][0]['tgt']
print(result)
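To make the four json functions above concrete, here is a minimal, self-contained sketch (the dict and the demo.json filename are made up for illustration):
import json
info = {"name": "test", "tags": ["a", "b"]}
s = json.dumps(info, ensure_ascii=False)   # Python object -> JSON string
print(json.loads(s))                       # JSON string -> Python object
# load()/dump() do the same conversions but read from / write to a file object
with open("demo.json", "w", encoding="utf-8") as f:
    json.dump(info, f, ensure_ascii=False)
with open("demo.json", "r", encoding="utf-8") as f:
    print(json.load(f))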
Lagou
# target url:
# https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false
# form data to submit with the POST request:
# first: true
# pn: 1 (page number)
# kd: c++ (keyword)
from urllib import request,parse
import json,pymysql,time
#pip3 install pymysql
def lagouspider(url, formdata):
    # send the request and get back the response body
    response_data = load_page_data(url, formdata)
    # the response is a JSON string, convert it to Python data
    data = json.loads(response_data)
    print(data)
    if data['success']:
        print('request succeeded')
        # grab the list of job postings
        positionJobs = data['content']['positionResult']['result']
        for jobinfo in positionJobs:
            jobdata = {}
            # company id and position title
            jobdata["companyId"] = jobinfo["companyId"]
            jobdata['positionName'] = jobinfo['positionName']
            # publish time of the position
            jobdata['publishTime'] = jobinfo['formatCreateTime']
            # company name
            jobdata['companyName'] = jobinfo['companyShortName']
            # salary
            # jobdata['salary'] = jobinfo['salary']
            # # work experience
            # jobdata['workYear'] = jobinfo['workYear']
            # # education
            # jobdata['education'] = jobinfo['education']
            # # industry
            # jobdata['industry'] = jobinfo['industryField']
            # # financing stage
            # jobdata['stage'] = jobinfo['financeStage']
            # # company size
            # jobdata['companySize'] = jobinfo['companySize']
            # # benefits
            # jobdata['fuli'] = ','.join(jobinfo['companyLabelList'])
            # # position advantage
            # jobdata['positionAdvantage'] = jobinfo['positionAdvantage']
            save_data_to_db(jobdata)
            # print(jobdata)
        # decide whether another request is needed
        # current page number
        cur_page = int(data['content']['pageNo'])
        # number of results per page
        page_size = int(data['content']['pageSize'])
        # total number of positions
        totalcount = int(data['content']['positionResult']['totalCount'])
        if cur_page * page_size < totalcount:
            # next page number
            next_page = cur_page + 1
            print('requesting page ' + str(next_page))
            formdata['pn'] = next_page
            time.sleep(1)
            lagouspider(url, formdata)
    else:
        print('request failed, wait a moment and retry')
        time.sleep(10)
        print('re-requesting page ' + str(formdata['pn']))
        lagouspider(url, formdata)
def load_page_data(url, formdata):
    """
    Send the request (downloader)
    :param url:
    :param formdata:
    :return:
    """
    # convert the form data into URL-encoded bytes that the web server can parse
    form_data = parse.urlencode(formdata).encode('utf-8')
    # set the request headers
    req_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_c%2B%2B?labelWords=&fromSearch=true&suginput=',
    }
    # build a Request object
    req = request.Request(url, headers=req_headers, data=form_data)
    # send the request
    response = request.urlopen(req)
    if response.status == 200:
        return response.read().decode('utf-8')
def save_data_to_db(jobdata):
    """
    Store one record
    :param jobdata: a dict holding the job information
    :return:
    """
    sql = """
    INSERT INTO list_lagou(%s)
    VALUES (%s)
    """ % (
        ','.join(jobdata.keys()),
        ','.join(["%s"] * len(jobdata))
    )
    try:
        cursor.execute(sql, list(jobdata.values()))
        mysql_client.commit()
    except Exception as err:
        print(err)
        mysql_client.rollback()
if __name__ == '__main__':
    # database connection
    """
    host=None, user=None, password="",
    database=None, port=0,
    charset=''
    """
    mysql_client = pymysql.Connect(
        host='127.0.0.1', user='root', password='123456',
        database='test01', port=3306, charset='utf8',
    )
    # cursor used to execute SQL statements
    cursor = mysql_client.cursor()
    # target url
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    # parameters to submit
    formdata = {
        'first': 'true',
        'pn': 1,
        'kd': 'c++',
    }
    lagouspider(url, formdata)
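The script assumes a list_lagou table already exists in the test01 database, with columns matching the keys collected in jobdata. A minimal sketch of how such a table might be created, run once beforehand (column names and types are assumptions based on the fields used above):
import pymysql

client = pymysql.Connect(host='127.0.0.1', user='root', password='123456',
                         database='test01', port=3306, charset='utf8')
cur = client.cursor()
# columns mirror the keys put into jobdata above; the types are guesses
cur.execute("""
CREATE TABLE IF NOT EXISTS list_lagou (
    id INT PRIMARY KEY AUTO_INCREMENT,
    companyId VARCHAR(20),
    positionName VARCHAR(100),
    publishTime VARCHAR(50),
    companyName VARCHAR(100)
)
""")
client.commit()
client.close()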
- With GET, the request is sent as a plain link: all parameters are part of the URL, and on the server side (classic ASP, for example) they are read with Request.QueryString. This is an insecure choice if the parameters include a password, but you can see directly what you submitted.
- With POST, the parameters do not show up in the URL; the server reads the submitted data with Request.Form. Note that if an HTML form does not specify a method attribute, it defaults to GET, and the form data is then appended to the URL, separated from it by a ?.
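A quick way to see the difference is httpbin.org, which echoes requests back: with GET the parameters end up in the URL (and come back under args), with POST they travel in the request body (and come back under form). A minimal sketch:
from urllib import parse, request
import json

params = parse.urlencode({"kw": "test"})
# GET: parameters are appended to the URL after '?'
get_resp = request.urlopen("https://httpbin.org/get?" + params)
print(json.loads(get_resp.read().decode())["args"])    # {'kw': 'test'}
# POST: the same parameters are sent in the request body as bytes
post_resp = request.urlopen("https://httpbin.org/post", data=params.encode("utf-8"))
print(json.loads(post_resp.read().decode())["form"])   # {'kw': 'test'}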