Python Crawlers: GET and POST Methods

Author: 暴走的金坤酸奶味 | Published 2018-12-23 18:08

Basic usage of parse

  • urlencode(): converts key:value pairs in a dict into a string of the form "key=value" (URL-encodes the dict so that a web server can accept it)
  • parse.unquote(): converts a URL-encoded string back into the original string
  • parse.quote(): converts a string (not a dict) into URL encoding
from urllib import request, parse

wd = {
    "kw": "你好"
}
aa = "你好"
a = parse.quote(aa)        # URL-encode a string
b = parse.unquote(a)       # decode it back to the original string
c = parse.urlencode(wd)    # URL-encode a dict into "key=value" pairs
d = parse.unquote(c)       # decode the encoded query string
print(a)
print(b)
print(c)
print(type(d))
print(d)

"""
%E4%BD%A0%E5%A5%BD
你好
kw=%E4%BD%A0%E5%A5%BD
<class 'str'>
"""

When an HTTP request submits data, the data generally needs to be encoded into URL-encoded form first, and then either appended as part of the URL or passed as the data parameter of a Request object.
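As a minimal sketch of those two cases (the query dict here is just a placeholder, and https://httpbin.org/post is the public test endpoint also used later in this article):

from urllib import parse, request

params = parse.urlencode({"kw": "你好"})      # "kw=%E4%BD%A0%E5%A5%BD"

# GET: the encoded string becomes part of the URL
full_url = "https://www.baidu.com/s?" + params

# POST: the encoded string is converted to bytes and passed as the data parameter
req = request.Request("https://httpbin.org/post", data=params.encode("utf-8"))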

The GET method

A GET request is generally used to fetch data from a server. For example, searching Baidu for 美女: https://www.baidu.com/s?wd=美女
The Chinese characters end up percent-encoded as hexadecimal data:
https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3
In the request we can see that after http://www.baidu.com/s? there is a long string, which is exactly the keyword 美女 we searched for, so we can try sending the request with the default GET method.

A small example
Baidu search


"""
https://www.baidu.com/s?wd=%E9%A9%AC%E4%BA%91&pn=10
"""
from urllib import parse, request
from fake_useragent import UserAgent

def searchSpider(kw, start_page, end_page):
    for page in range(start_page, end_page + 1):
        # URL-encode the dict of query parameters
        params = {
            "wd": kw,
            "pn": (page - 1) * 10
        }
        result = parse.urlencode(params)
        print(result)
        full_url = "https://www.baidu.com/s?" + result
        print(full_url)
        html = load_page(full_url)
        filename = "page_" + str(page) + "_" + kw + ".html"
        save_page_html(html, filename)

def load_page(url):
    # use a random User-Agent so the request looks like a browser
    req_header = {
        "User-Agent": ua.random
    }
    print(req_header)
    req = request.Request(url, headers=req_header)
    response = request.urlopen(req)
    if response.status == 200:
        print("Request succeeded")
        return response.read().decode()

def save_page_html(html, filename):
    with open("test/" + filename, "w", encoding="utf-8") as file:
        file.write(html)

if __name__ == "__main__":
    ua = UserAgent()
    kw = input("Enter a keyword: ")
    # first and last page to fetch
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    searchSpider(kw, start_page, end_page)

Baidu Tieba

# Download the images from Baidu Tieba posts to the local disk
# Pattern of the paginated list URLs for a given tieba:
'''
https://tieba.baidu.com/f?ie=utf-8&kw=%E7%BE%8E%E5%A5%B3&fr=search
https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=50
https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=100
'''
import re
from fake_useragent import UserAgent
from urllib import parse, request

def tiebaSpider(name, start_page, end_page):
    for page in range(start_page, end_page + 1):
        params = {
            "kw": name,
            "ie": "utf-8",
            "pn": (page - 1) * 50
        }
        query = parse.urlencode(params)
        full_url = "https://tieba.baidu.com/f?" + query
        html = load_data(full_url)
        # extract the detail-page URLs from the list page source
        tiezi_urlinfo = parse_page_detail_url(html)
        for note in tiezi_urlinfo:
            detail_url = "https://tieba.baidu.com" + note[0]
            title = note[1]
            # print("Fetching post details for " + title)
            html = load_data(detail_url)
            images = parse_detail_imageurl(html)
            download_image(images)

def load_data(url):
    req_header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    req = request.Request(url, headers=req_header)
    response = request.urlopen(req)
    if response.status == 200:
        return response.read().decode()

# Use a regex to extract each post's detail URL from a list page's HTML source
def parse_page_detail_url(html):
    pattern = re.compile('<div.*?class="threadlist_title pull_left j_th_tit ">' +
                         '.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?</div>', re.S)
    result = re.findall(pattern, html)
    return result

# Extract the image URLs from a post's detail page
def parse_detail_imageurl(html):
    pattern = re.compile('<img.*?class="BDE_Image".*?src="(.*?)".*?>', re.S)
    result = re.findall(pattern, html)
    print("Image links:", result)
    return result

def download_image(images):
    for i in images:
        req_header = {
            "User-Agent": ua.random
        }
        req = request.Request(i, headers=req_header)
        response = request.urlopen(req)
        if response.status == 200:
            filename = response.url[-20:]
            with open("img/" + filename, "wb") as file:
                file.write(response.read())
                print(filename, "downloaded")

if __name__ == "__main__":
    ua = UserAgent()
    name = input("Enter a keyword: ")
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    tiebaSpider(name, start_page, end_page)

The POST method

As mentioned above, the Request object has a data parameter, and that is exactly what POST uses: the data we want to send is passed in through data. It starts out as a dictionary of matching key/value pairs, which must be URL-encoded and converted to bytes before being submitted.
Sending a POST request

# Test endpoint: https://httpbin.org/post
from urllib import parse, request

url = 'https://httpbin.org/post'

# form data
form_dict = {
    'name': '红红火火',
    'age': 18,
    'gender': '男'
}

# First use urlencode to turn the parameters into a URL-encoded string,
# then use encode() to turn that string into bytes
formdata = parse.urlencode(form_dict).encode('utf-8')

# Without request headers
response = request.urlopen(url, data=formdata)
print(response.status)
print(response.read().decode('utf-8'))

# With request headers
req_header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
req = request.Request(url, headers=req_header, data=formdata)
response = request.urlopen(req)
print(response.status)
print(response.read().decode('utf-8'))

Youdao Translate

from urllib import parse, request
import json

# Target URL for the POST request (this is the older endpoint, kept because it
# doesn't require the sign parameter, which is encrypted in the new version)
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null"

formdata = {
    'i': '我的祖国',
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION',
    'typoResult': 'false',
}

formdata = parse.urlencode(formdata).encode('utf-8')

req_header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}

req = request.Request(url, headers=req_header, data=formdata)

response = request.urlopen(req)

print(response.status)
json_str = response.read().decode('utf-8')
print(json_str)

# json.loads(): converts a JSON string into Python data types;
#               JSON objects become dicts, JSON arrays become lists
# json.load():  same, but reads from a file object
# json.dumps(): converts Python data into a JSON string
# json.dump():  same, but writes to a file object
# (a short sketch of all four follows after this example)
data = json.loads(json_str)
print(type(data))

result = data['translateResult'][0][0]['tgt']
print(result)
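For reference, a minimal sketch of those four json helpers (the job.json filename and the sample dict are placeholders, not taken from the Youdao example):

import json

job = {"positionName": "爬虫工程师", "salary": "15k-25k"}

json_str = json.dumps(job, ensure_ascii=False)   # dict -> JSON string
data = json.loads(json_str)                      # JSON string -> dict

with open("job.json", "w", encoding="utf-8") as f:
    json.dump(job, f, ensure_ascii=False)        # dict -> JSON file

with open("job.json", "r", encoding="utf-8") as f:
    data = json.load(f)                          # JSON file -> dict

print(data["positionName"])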

Lagou


# Target URL:
# https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false

# Form data submitted with the POST request:
# first: true
# pn: 1  (page number)
# kd: c++  (keyword)
from urllib import request, parse
import json, pymysql, time
# pip3 install pymysql

def lagouspider(url, formdata):
    # send the request and get the response body
    response_data = load_page_data(url, formdata)
    # the response is a JSON string; convert it to Python data
    data = json.loads(response_data)
    print(data)

    if data['success']:
        print('Request succeeded')
        # extract the job postings
        postionJobs = data['content']['positionResult']['result']
        for jobinfo in postionJobs:
            jobdata = {}
            # company id and position title
            jobdata["companyId"] = jobinfo["companyId"]
            jobdata['positionName'] = jobinfo['positionName']
            # publish time of the position
            jobdata['publishTime'] = jobinfo['formatCreateTime']
            # company name
            jobdata['companyName'] = jobinfo['companyShortName']
            # salary
            # jobdata['salary'] = jobinfo['salary']
            # # work experience
            # jobdata['workYear'] = jobinfo['workYear']
            # # education
            # jobdata['education'] = jobinfo['education']
            # # industry
            # jobdata['industry'] = jobinfo['industryField']
            # # financing stage
            # jobdata['stage'] = jobinfo['financeStage']
            # # company size
            # jobdata['companySize'] = jobinfo['companySize']
            # # benefits
            # jobdata['fuli'] = ','.join(jobinfo['companyLabelList'])
            # # position advantages
            # jobdata['positionAdvantage'] = jobinfo['positionAdvantage']

            save_data_to_db(jobdata)
            # print(jobdata)

        # decide whether another request is needed
        # current page number
        cur_page = int(data['content']['pageNo'])
        # results per page
        page_size = int(data['content']['pageSize'])
        # total number of positions
        totalcount = int(data['content']['positionResult']['totalCount'])
        if cur_page * page_size < totalcount:
            # next page number
            next_page = cur_page + 1
            print('Requesting page ' + str(next_page))
            formdata['pn'] = next_page
            time.sleep(1)
            lagouspider(url, formdata)
    else:
        print('Request failed, waiting a moment before retrying')
        time.sleep(10)
        print('Retrying page ' + str(formdata['pn']))
        lagouspider(url, formdata)

def load_page_data(url, formdata):
    """
    Send the request (downloader)
    :param url: target URL
    :param formdata: form data to submit
    :return: response body as a string
    """
    # convert the form data into the URL-encoded bytes a web server expects
    form_data = parse.urlencode(formdata).encode('utf-8')
    # set the request headers
    req_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_c%2B%2B?labelWords=&fromSearch=true&suginput=',
    }
    # build a Request object
    req = request.Request(url, headers=req_headers, data=form_data)
    # send the request
    response = request.urlopen(req)
    if response.status == 200:
        return response.read().decode('utf-8')

def save_data_to_db(jobdata):
    """
    Store the data
    :param jobdata: a dict holding one job posting
    :return:
    """
    # Build the INSERT statement dynamically: column names come from the dict
    # keys, with one %s placeholder per value, filled in by cursor.execute.
    # (A sketch of the assumed list_lagou schema follows after this script.)
    sql = """
    INSERT INTO list_lagou(%s)
    VALUES (%s)
    """ % (
        ','.join(jobdata.keys()),
        ','.join(["%s"] * len(jobdata))
    )
    try:
        cursor.execute(sql, list(jobdata.values()))
        mysql_client.commit()
    except Exception as err:
        print(err)
        mysql_client.rollback()

if __name__ == '__main__':

    # database connection
    """
    host=None, user=None, password="",
    database=None, port=0,
    charset=''
    """
    mysql_client = pymysql.connect(
        host='127.0.0.1', user='root', password='123456',
        database='test01', port=3306, charset='utf8',
    )
    # create a cursor (used to execute SQL statements)
    cursor = mysql_client.cursor()

    # target URL
    url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

    # parameters to submit
    formdata = {
        'first': 'true',
        'pn': 1,
        'kd': 'c++',
    }
    lagouspider(url, formdata)
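The script assumes a list_lagou table already exists in the test01 database. Below is a minimal sketch of what that table might look like, created through the same pymysql connection; the column names match the fields saved above, but the types and lengths are assumptions, not from the original article.

# Hypothetical schema for the list_lagou table used by save_data_to_db()
create_sql = """
CREATE TABLE IF NOT EXISTS list_lagou (
    id INT PRIMARY KEY AUTO_INCREMENT,
    companyId VARCHAR(20),
    positionName VARCHAR(100),
    publishTime VARCHAR(50),
    companyName VARCHAR(100)
) DEFAULT CHARSET=utf8
"""
cursor.execute(create_sql)
mysql_client.commit()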
  • With GET, the resource is accessed directly via a link that contains all of the parameters; the server side reads them with Request.QueryString. If the parameters include a password this is an insecure choice, but you can see at a glance exactly what you submitted.

  • With POST, the parameters are not shown in the URL; the server side reads the submitted data with Request.Form. Note that if an HTML form does not specify a method attribute, it defaults to a GET request, and the form data is appended to the URL, separated from it by a ? (see the sketch below).
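A small sketch against the httpbin.org test endpoint used earlier makes the difference visible: GET parameters are echoed back under "args" because they travelled in the URL, while POSTed form data comes back under "form" because it travelled in the request body.

import json
from urllib import parse, request

params = parse.urlencode({"user": "tom", "password": "123456"})

# GET: the parameters are part of the URL itself
get_resp = request.urlopen("https://httpbin.org/get?" + params)
print(json.loads(get_resp.read().decode())["args"])     # {'password': '123456', 'user': 'tom'}

# POST: the parameters travel in the request body, not the URL
post_resp = request.urlopen("https://httpbin.org/post", data=params.encode("utf-8"))
print(json.loads(post_resp.read().decode())["form"])    # {'password': '123456', 'user': 'tom'}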
