Constructing headers

Author: 奕剑听雨 | Published 2018-08-13 16:10

    In interface (API) testing and similar work, access to many pages requires authorization. The fields that carry this authorization usually live in the request header, in the form of a cookie, a token, and so on. The usual pattern is therefore to first send a request to an initial URL ("url1") to obtain the credential, and then use that credential to construct the header for the subsequent interface requests. Below is an example of header construction applied to a web crawler.
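    A typical flow looks like the minimal sketch below: send one request to obtain the credential, then attach it to the header of every later request. The endpoint paths, the parameter names and the 'token' field here are hypothetical placeholders, not part of the site used later in this article.

    import requests

    BASE = 'http://example.com'  # hypothetical service under test

    # step 1: request the URL that issues the credential (cookie / token)
    login_resp = requests.post(BASE + '/api/login',
                               data={'username': 'tester', 'password': '123456'})
    token = login_resp.json().get('token')               # assumed response field
    cookie = login_resp.headers.get('Set-Cookie', '')    # cookie issued by the server

    # step 2: construct the header from the credential and reuse it in later requests
    headers = {
        'Cookie': cookie,            # credential carried as a cookie
        'token': token or '',        # credential carried as a token field (name assumed)
        'User-Agent': 'Mozilla/5.0',
    }
    resp = requests.get(BASE + '/api/protected/data', headers=headers)
    print(resp.status_code)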

    # coding=utf-8

    http://699pic.com/download/getDownloadUrl ---- the interface that returns the download address; it receives the picture id, e.g. pid=500472407.

    The response contains the picture link: URL=http://down.699pic.com/photo/50047/2407.jpg?_upt=63305cd11514965673&_upd=500472407.jpg

    _upd is the picture id, split into two parts and filled into the path; _upt is generated in real time and only stays valid for a limited period.

    How do we obtain _upd?

    http://699pic.com/sousuo-61847-0-1-0-0-0.html is page 1 of the search results; the keyword is converted into the number 61847, which can be found in the page source.

    http://699pic.com/sousuo-61847-0-2-0-0-0.html is page 2.

    http://699pic.com/sousuo-61847-0-3-0-0-0.html is page 3.

    To crawl multiple pages, only this page number needs to change, and the total number of pages can also be found in the HTML.

    _upd itself appears in the HTML of the page elements; it only needs to be extracted and concatenated into the request.

    The problem therefore reduces to how the keyword is mapped to that five-digit number.
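    To make the URL construction concrete, here is a minimal sketch of the two pieces described above. The split of the picture id into "50047/2407" is inferred from the single sample link shown earlier, and 61847 is just the example keyword code, so both are assumptions used for illustration only.

    # build the search-result URL for page n of a given keyword code (e.g. 61847)
    def search_page_url(keyword_code, page):
        return 'http://699pic.com/sousuo-%s-0-%d-0-0-0.html' % (keyword_code, page)

    # split the picture id the way the sample download link does: 500472407 -> 50047/2407.jpg
    def picture_path(pic_id):
        return '%s/%s.jpg' % (pic_id[:5], pic_id[5:])

    print(search_page_url('61847', 2))  # http://699pic.com/sousuo-61847-0-2-0-0-0.html
    print(picture_path('500472407'))    # 50047/2407.jpg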

    import requests
    import time
    import multiprocessing  # download pictures in parallel with a process pool
    from bs4 import BeautifulSoup  # parse the HTML as a tree for easy searching and splitting
    import sys
    import io
    from urllib import request  # used to send the request that carries the login cookie
    import json
    import os
    import random

    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')  # change the default encoding of standard output to utf-8

    class SpiderForPicture(object):
        author = 'Blokks'

        def __init__(self, keyword):
            self.keyword = keyword

        def saving_folder_making(self):
            # create (once) the folder that the downloaded pictures are saved into
            folder_path = 'F:\\test_auto\\spider\\pictures\\' + self.keyword
            if not os.path.exists(folder_path):
                os.mkdir(folder_path)
                print('创建名字为%s的目录存放图片' % self.keyword)
            return folder_path
    
        def get_page_count(self):
            # request the keyword landing page and parse the pager links to obtain
            # the numeric code of the keyword and the total number of result pages
            try:
                keyword = self.keyword
                url = 'http://699pic.com/tupian/' + keyword + '.html'
                html = requests.get(url)
                content = html.content.decode('utf-8')
                re_1 = BeautifulSoup(content, "lxml")
                re_2 = re_1.find_all(name='div', attrs={'class': 'pager-linkPage'})
                re_3 = re_2[0].find_all(name='a')
                list_ = []
                list_result = []
                result_dict = {}
                for item in re_3:
                    ls = item.get('href').split('-')
                    list_.append(ls)
                    list_result.append(int(ls[3]))
                page_count = str(max(list_result))
                key_number = str(list_[0][1])
                result_dict[key_number] = page_count
                return result_dict  # {numeric keyword code: page count}
            except Exception:
                print('搜索关键字未找到图片...')
                sys.exit(1)
    
        def get_pic_id(self):
            # walk every search-result page and collect the picture ids (data-id)
            pic_id_list = []
            kw_dict = self.get_page_count()
            list_ = []
            for i in kw_dict:
                list_.append(i)
                list_.append(kw_dict[i])
            page_count = list_[1]
            print('根据关键字%s一共搜索到图片%s张' % (self.keyword, page_count))
            key_number = list_[0]
            for num in range(1, int(page_count) + 1):
                url = 'http://699pic.com/sousuo-' + key_number + '-0-' + str(num) + '-0-0-0.html'
                html = requests.get(url)
                content = html.content.decode('utf-8')
                re_1 = BeautifulSoup(content, "lxml")
                re_2 = re_1.find_all(name='div', attrs={'class': 'list'})
                for item in re_2:
                    pic_id_list.append(item.get('data-id'))
            # keep only valid 9-digit ids; removing items from the list while
            # iterating over it (as in the original) would skip elements
            pic_id_list = [i for i in pic_id_list if i and len(str(i)) >= 9]
            return pic_id_list
    
        def get_download_url(self):
            # call the getDownloadUrl interface for every picture id, carrying the
            # logged-in cookie in the request header, and collect the download URLs
            pic_id_list = self.get_pic_id()
            url_pool = []
            for pic_id in pic_id_list:
                url = 'http://699pic.com/download/getDownloadUrl?pid=' + pic_id
                cookie_str = r'2017endalert=1; uniqid=5a4c7bd11a363; bargain_popup=1; uv_cookie=c610bdc8d6965b2e7abec5d93' \
                         r'd07ad59; is_click_activity=1; from_data=YTo1OntzOjQ6Imhvc3QiO3M6MTA6IjY5OXBpYy5jb20iO3M6Mzoi' \
                         r'c2VtIjtiOjA7czoxMDoic291cmNlZnJvbSI7aTowO3M6NDoid29yZCI7TjtzOjM6ImtpZCI7aTowO30%3D; isVip=0; ' \
                         r'isPay=0; is_qy_vip=1; is_join_2017_end_18454014=0; isSearch=0; s_token=03e987b8c9b7912d89e77b' \
                         r'b7fd9b62e8; PHPSESSID=kt1v9k8sid51kg0ej6e127cvkvgmpc7q; Qs_lvt_135734=1513923395%2C1513923542' \
                         r'%2C1514961873%2C1515026629%2C1515031146; mediav=%7B%22eid%22%3A%22278616%22%2C%22ep%22%3A' \
                         r'%22%22%2C%22vid%22%3A%22%5EySs)9Ku%25D%3A*qX%24(Pe%3FD%22%2C%22ctn%22%3A%22%22%7D; ' \
                         r'Hm_lvt_1154154465e0978ab181e2fd9a9b9057=1515026630,1515026702,1515031028,1515031147; ' \
                         r'Hm_lvt_ddcd8445645e86f06e172516cac60b6a=1515026629,1515026702,1515031028,1515031147; ' \
                         r'recentlysearch=YTo0OntpOjA7YToyOntzOjI6Imt3IjtzOjc6ImRpYW5uYW8iO3M6NjoicGlueWluIjtzOjY6IjMx' \
                         r'MTExMCI7fWk6MTthOjI6e3M6Mjoia3ciO3M6Njoi55S16ISRIjtzOjY6InBpbnlpbiI7czo3OiJkaWFubmFvIjt9aTo' \
                         r'yO2E6Mjp7czoyOiJrdyI7czoxMjoi5pm66IO95a625bGFIjtzOjY6InBpbnlpbiI7czoxMjoiemhpbmVuZ2ppYWp1Ij' \
                         r't9aTozO2E6Mjp7czoyOiJrdyI7czo2OiLlpKfmtbciO3M6NjoicGlueWluIjtzOjU6ImRhaGFpIjt9fQ%3D%3D; ' \
                         r'search_Kw=%22diannao%22; is_join_2017_end_533435=0; Qs_pv_135734=144824772440290620%2C38906' \
                         r'64247893633500%2C3737559667568741000%2C2243149228815513300%2C1985644855545767200; ' \
                         r'Hm_lpvt_1154154465e0978ab181e2fd9a9b9057=1515034556; Hm_lpvt_ddcd8445645e86f06e172516cac60' \
                         r'b6a=1515034556; redirect=http%3A%2F%2F699pic.com%2Ftupian-500472175.html; session_data=YTo1' \
                         r'OntzOjM6InVpZCI7czo2OiI1MzM0MzUiO3M6NToidG9rZW4iO3M6MzI6ImZkZDIyZWY5NDJlMjY3NjViYTdhMGE2NmY' \
                         r'4NzVmMTE3IjtzOjM6InV1dCI7czozMjoiMWM0Y2E4ZDZmMDRhYTdhYmJiNTNkNTkwZmI4MGJiMWMiO3M6NDoiZGF0YS' \
                         r'I7YToxOntzOjg6InVzZXJuYW1lIjtzOjEyOiLku5nlpbPlprnlprkiO31zOjY6ImV4dGltZSI7aToxNTE1NjM5MzgzO' \
                         r'30%3D; uid=533435; username=%E4%BB%99%E5%A5%B3%E5%A6%B9%E5%A6%B9; head_pic=http%3A%2F%2' \
                         r'Fq.qlogo.cn%2Fqqapp%2F101268598%2FD2C2DF0668D1C9B957ADD345B9B7A420%2F40; login_user=1'
                req = request.Request(url)
                req.add_header('Cookie', cookie_str)
                req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36')
                resp = request.urlopen(req)
                result_ = resp.read().decode('utf-8')
                result_dict = json.loads(result_)
                if 'url' not in result_dict:
                    print('cookies失败o(╥﹏╥)o')
                    sys.exit(1)
                download_url = result_dict['url']
                url_pool.append(download_url)
            return url_pool
    
        def download_picture(self, url):
            # download a single picture and save it under a random file name
            file_name = self.keyword + str(random.randint(100000, 999999)) + '.jpg'
            folder_path = self.saving_folder_making()
            file_path = folder_path + '\\' + file_name
            resp = requests.get(url)
            content = resp.content
            with open(file_path, 'wb') as f:
                f.write(content)
    

    def main():
        start_time = time.time()
        keyword = input('请输入需要搜索的关键字(拼音):')
        spider = SpiderForPicture(keyword)
        url_pool = spider.get_download_url()
        middle_time = time.time()
        time_cost = middle_time - start_time  # time spent resolving the download URLs
        print('下载地址解析完毕---用时%s---现在开始下载....' % time_cost)
        p = multiprocessing.Pool(processes=4)  # download with 4 worker processes
        p.map(spider.download_picture, url_pool)
        p.close()
        p.join()
        end_time = time.time()
        time_used = end_time - start_time
        print('全部下载完毕,用时%s' % time_used)


    if __name__ == '__main__':
        main()

    As you can see, the cookie in the code above is extremely long. In this example add_header is used to build the request header (Cookie plus User-Agent) that the subsequent requests need.
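    The same header can also be attached with the requests library, which the rest of the script already uses, by putting the cookie and User-Agent into a headers dict or onto a requests.Session so that every request reuses them. A minimal sketch, with a placeholder cookie value standing in for the long string above:

    import requests

    session = requests.Session()
    session.headers.update({
        'Cookie': 'uid=...; s_token=...; PHPSESSID=...',  # placeholder: paste the real logged-in cookie string here
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    })
    resp = session.get('http://699pic.com/download/getDownloadUrl?pid=500472407')
    print(resp.json())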
