Web Scraping---requests


Author: riverstation | Published 2018-07-17 17:25

    requests

    What is it?

    • A third-party library that, like urllib, simulates a browser sending HTTP requests. requests wraps the low-level plumbing (it is in fact built on urllib3), so the interface it exposes is far more user-friendly.

    Documentation

    http://docs.python-requests.org/zh_CN/latest/index.html
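
    To make "more user-friendly" concrete, here is a minimal sketch (the URL is only an example) of the same GET request written with the standard library's urllib and with requests:

    # the same GET request: urllib.request vs. requests
    import urllib.request
    import requests

    url = 'http://www.baidu.com/'

    # urllib: build a Request object, call urlopen, decode the bytes yourself
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urllib.request.urlopen(req).read().decode('utf-8')

    # requests: a single call; decoding is handled for you
    html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text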

    Installation

    pip install requests

    GET and GET with parameters

    Query parameters are passed as params=data,
    where data is a dict of parameters:
    r = requests.get(url=url, params=data)
    The response object r:

    r.text         body as a decoded string
    r.content      body as raw bytes
    r.headers      response headers
    r.url          the request URL
    r.status_code  HTTP status code

    
    import requests
    
    '''
    url = 'http://www.baidu.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    
    r = requests.get(url=url, headers=headers)
    '''
    
    # r is the response object
    # page body as a string
    # print(r.text)
    # page body as bytes
    # print(r.content)
    # the URL of the request
    # print(r.url)
    # the response headers
    # print(r.headers)
    # the status code
    # print(r.status_code)
    
    url = 'https://www.baidu.com/s?'
    data = {
        'ie': 'utf8',
        'wd': '周杰伦'
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    
    r = requests.get(url=url, params=data, headers=headers)
    
    # save the result page (the 'tupian' directory must already exist)
    with open(r'tupian\zhou.html', 'wb') as fp:
        fp.write(r.content)
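
    A practical note when saving pages this way: r.text uses the encoding requests guessed from the response headers, which is often wrong for Chinese pages, while r.content is the raw bytes. A minimal sketch (reusing the variables above) of checking the status code and correcting the encoding before touching r.text:

    r = requests.get(url=url, params=data, headers=headers)

    # bail out early on a bad response
    if r.status_code != 200:
        raise RuntimeError('request failed with status %d' % r.status_code)

    # r.encoding comes from the headers; r.apparent_encoding is detected
    # from the body itself and is usually more reliable for Chinese pages
    r.encoding = r.apparent_encoding
    print(r.text[:200])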
    

    POST

    Bing Translator

    Form fields are passed as data=data, where data is a dict:
    r = requests.post(url=url, data=data)

    import requests
    
    post_url = 'https://cn.bing.com/ttranslationlookup?&IG=5C360E60322D4FA4865EEBCF710B93B6&IID=translator.5036.2'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    
    # the form fields the translation endpoint expects
    fromdata = {
        'from': 'zh-CHS',
        'to': 'en',
        'text': '皇上',
    }
    
    r = requests.post(url=post_url, data=fromdata, headers=headers)
    
    print(r.text)
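
    If the endpoint returns JSON, as translation APIs like this one typically do, the body can be parsed directly with r.json() instead of printing the raw text. A small sketch (the exact fields in the result depend on the endpoint, so treat the structure as an assumption):

    r = requests.post(url=post_url, data=fromdata, headers=headers)

    # r.json() parses the body as JSON; it raises ValueError
    # if the body is not valid JSON
    try:
        result = r.json()
        print(result)
    except ValueError:
        print('response was not JSON:', r.text[:200])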
    

    Sessions

    Log in to Renren---obtain and keep the cookies

    s = requests.Session()
    s.post()
    s.get()

    import requests
    
    # to use the session technique, first create a session;
    # everything below is sent through s (s.post / s.get),
    # so cookies set by the login persist automatically
    s = requests.Session()
    
    post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018621432232'
    # form data captured from the browser's login request
    # (the password value is already hashed, not plain text)
    data = {
        'email':'17701256561',
        'icode':'',
        'origURL':'http://www.renren.com/home',
        'domain':'renren.com',
        'key_id':'1',
        'captcha_type':'web_login',
        'password':'bd20fe8cf1541a10558676a6eeccb4a1a786cfc09823ddd69d5bbaafc7060292',
        'rkey':'227f4ceb2f44827f9de8296ca1ef1c3f',
        'f':'https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DaovDobnt13PO-vgvw1r-eSnSe_QNvNGtexiQFzyME-a%26wd%3D%26eqid%3Db5d58b1e000297f4000000025b4d88e3',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    
    r = s.post(url=post_url, headers=headers, data=data)
    
    # print(r.text)
    
    # fetch a logged-in page with the same session
    url = 'http://www.renren.com/960481378/profile'
    
    r = s.get(url, headers=headers)
    
    with open('renren.html', 'wb') as fp:
        fp.write(r.content)
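
    The reason this works is that the cookies set by the login response live on the Session object and are attached to every later request automatically. A minimal sketch for inspecting, and optionally saving, those cookies:

    import json
    import requests.utils

    # the session keeps a cookie jar; every s.get/s.post sends it along
    print(s.cookies)

    # turn the jar into a plain dict, e.g. to persist it as JSON
    cookie_dict = requests.utils.dict_from_cookiejar(s.cookies)
    with open('cookies.json', 'w') as fp:
        json.dump(cookie_dict, fp)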
    

    Scraping bus routes

    import requests
    from lxml import etree
    import re
    import json
    import time
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    
    def parse_first_page(url):
        r = requests.get(url=url, headers=headers)
        # build an lxml tree from the page
        tree = etree.HTML(r.text)
        # collect the links behind the number and letter route indexes
        number_href_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href')
        char_href_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href')
        return number_href_list + char_href_list
    
    def parse_second_page(url, all_href, fp):
        # strip the trailing / so the relative hrefs join cleanly
        url = url.rstrip('/')
        for href in all_href:
            href = url + href
            r = requests.get(href, headers=headers)
            tree = etree.HTML(r.text)
            # extract every bus route link and name on this index page
            bus_href_list = tree.xpath('//div[@id="con_site_1"]/a/@href')
            bus_name_list = tree.xpath('//div[@id="con_site_1"]/a/text()')
            # request each route URL in turn and parse its detail page
            parse_third_page(url, bus_href_list, bus_name_list, fp)
    
    def parse_third_page(url, bus_href_list, bus_name_list, fp):
        # zip keeps each href paired with its name, even when names repeat
        for bus_href, title in zip(bus_href_list, bus_name_list):
            print('Scraping %s......' % title)
            # build the full URL
            bus_href = url + bus_href
            # request this route's detail page
            r = requests.get(url=bus_href, headers=headers)
            # the function below parses the route's details
            parse_content(r.text, fp)
            print('Finished scraping %s' % title)
            time.sleep(1)
    
    def parse_content(content, fp):
        tree = etree.HTML(content)
        # route name
        name = tree.xpath('//div[@class="bus_i_t1"]/h1/text()')[0]
        # operating hours
        runtime = tree.xpath('//div[@class="bus_i_content"]/p[1]/text()')[0]
        # fare information
        price = tree.xpath('//div[@class="bus_i_content"]/p[2]/text()')[0]
        # operating company (not present on every page)
        try:
            company = tree.xpath('//div[@class="bus_i_content"]/p[3]/a/text()')[0]
        except Exception:
            company = ''
        # last-updated time
        gxsj = tree.xpath('//div[@class="bus_i_content"]/p[last()]/text()')[0]
        # route length; the regex pulls out the numeric part, e.g. 12.5
        try:
            length = tree.xpath('//div[@class="bus_label "]/p/text()')[0]
            pattern = re.compile(r'\d+\.\d+')
            length = pattern.search(length).group()
        except Exception:
            length = ''
    
        total_list = tree.xpath('//span[@class="bus_line_no"]/text()')
        # number of stops in the up direction, pulled out with a regex
        pattern = re.compile(r'\d+')
        up_total = pattern.search(total_list[0]).group()
        # number of stops in the down direction (loop routes have none)
        try:
            down_total = pattern.search(total_list[1]).group()
        except Exception:
            down_total = ''
    
        # stop names in the up direction
        up_site_name = tree.xpath('//div[@class="bus_line_site "][1]//a/text()')
        # stop names in the down direction
        try:
            down_site_name = tree.xpath('//div[@class="bus_line_site "][2]//a/text()')
        except Exception:
            down_site_name = []
    
        # collect the route details in a dict and write one JSON object per line
        item = {
            'route_name': name,
            'operating_hours': runtime,
            'fare': price,
            'company': company,
            'updated': gxsj,
            'route_length': length,
            'up_stop_count': up_total,
            'up_stops': up_site_name,
            'down_stop_count': down_total,
            'down_stops': down_site_name,
        }
        fp.write(json.dumps(item, ensure_ascii=False) + '\n')
    
    
    def main():
        # open the output file
        fp = open('beijing_bus_routes.txt', 'w', encoding='utf8')
        url = 'http://beijing.8684.cn/'
        # collect all the number/letter index links
        all_href = parse_first_page(url)
        # visit each index link and parse the second-level pages
        parse_second_page(url, all_href, fp)
        fp.close()
    
    if __name__ == '__main__':
        main()
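
    Since parse_content writes one JSON object per line, the output file is easy to load back for later analysis. A minimal sketch (using the output filename from main above):

    import json

    # read the scraped routes back: one JSON object per line
    with open('beijing_bus_routes.txt', encoding='utf8') as fp:
        routes = [json.loads(line) for line in fp if line.strip()]

    print('scraped %d routes' % len(routes))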
    
    

    Login---captcha

    Ways to handle a captcha:

    (1) Download the captcha image locally and have the user type it in
    (2) Recognize it with software; the accuracy is low
    (3) Use a captcha-solving platform; the recognition rate is high
    
    import requests
    
    # create a session, so the captcha request and the login share cookies
    s = requests.Session()
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    
    # load the login page first, inside the session
    get_url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
    r = s.get(get_url, headers=headers)
    
    # then request the image src to download the captcha locally
    image_src = 'https://so.gushiwen.org/RandCode.ashx'
    r = s.get(image_src, headers=headers)
    with open('code.png', 'wb') as fp:
        fp.write(r.content)
    
    
    code = input('Enter the captcha: ')
    
    post_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'
    # __VIEWSTATE and __VIEWSTATEGENERATOR are hidden ASP.NET form
    # fields copied from the login page's HTML
    data = {
        '__VIEWSTATE': 'BvBAwAIKh29BShbC/yKMDsjiElxi+d4wdH3pR2dacgsifqK0rmUzL4Mc9YzHGDc6P6rqB4wMZ39uRj2MpaaSjQtarGnIo6qf1djLGa75XLo/S4b65Uhv2TETKt0=',
        '__VIEWSTATEGENERATOR':'C93BE1AE',
        'from': 'http://so.gushiwen.org/user/collect.aspx',
        'email': '1090509990@qq.com',
        'pwd': '123456',
        'code': code,
        'denglu': '登录',  # the submit button's literal value, which the form expects
    }
    
    r = s.post(post_url, headers=headers, data=data)
    
    print(r.text)
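
    Whether the login succeeded can be checked by using the same session to fetch the page the form redirects to. A minimal sketch (collect.aspx is the member page named in the form's from parameter; what a logged-in page looks like is site-specific):

    # reuse the logged-in session to fetch a members-only page
    check_url = 'https://so.gushiwen.org/user/collect.aspx'
    r = s.get(check_url, headers=headers)

    # save it for inspection; if the login failed, this is the login form again
    with open('collect.html', 'wb') as fp:
        fp.write(r.content)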
    
    
