4. Scraping web page data with requests + XPath

Author: 思绪太重_飘不动 | Published 2019-07-06 16:37

    1. requests is a third-party HTTP module; it is more powerful and more convenient to use than the standard library's urllib.request.

    Install requests: pip install requests
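
    A minimal sketch of basic usage (httpbin.org is used here only as an example URL):

    import requests

    response = requests.get("https://httpbin.org/get", params={"q": "test"}, timeout=10)
    print(response.status_code)   # 200 on success
    print(response.text)          # response body as a string

    With a random User-Agent, a proxy pool, and cookies, a GET request looks like this: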

    import requests
    from fake_useragent import UserAgent
    import random
        
    # Create a UserAgent object to generate random User-Agent strings
    us = UserAgent()
    headers = {
            "User-Agent": us.random 
        }
        
    # Build a proxy pool
    proxy_list = [
        # {"protocol": "proxy IP:port"}, for example:
        {'http': "42.59.86.21:1133"},
        {'https': "42.59.86.21:1133"},
        # ...
    ]
        
    # Pick one proxy at random
    proxy = random.choice(proxy_list)
    url = "<the URL to request>"
        
    # cookies obtained previously (e.g., copied from the browser)
    cookies = {
        "": "",
        "": "",
        # ...
    }
    # GET request with requests
    # a proxy can be set directly via the proxies parameter
    # cookies can also be passed in directly
    response = requests.get(url, headers=headers, proxies=proxy, cookies=cookies)
    # The response object supports the following operations
    print(response.__dict__)          # all attributes of the response object
    print(response.text)              # body as a string
    print(response.content)           # body as bytes
    print(response.content.decode())  # bytes decoded to a string, UTF-8 by default
    print(response.json())            # built-in JSON parsing
    print(response.status_code)       # HTTP status code
    print(response.cookies)           # response cookies
    print(response.url)               # final request URL
    print(response.headers)           # response headers
    --------------------------------------------------------    
        # Cookies can also be read from the response object
        # get the CookieJar
        res_cookies = response.cookies
        # convert the CookieJar into a dict
        print(requests.utils.dict_from_cookiejar(res_cookies))
     ---------------------------------------------------------   
        # A session can also be used to save and resend cookies automatically
        import requests
    
        # Log in to Biquge (biquge5200.cc)
        url = "https://www.biquge5200.cc/u/login.htm"
        data = {
            # username: niejeff, password: 123456 (sent as its MD5 hash)
            "name": "niejeff",
            "password": "E10ADC3949BA59ABBE56E057F20F883E",
            "autoLogin": "1",
            "autologin": "1"
        }
    
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
        }
    
        # Use a session: cookies are saved and submitted automatically
        session = requests.session()
    
        # log in
        response = session.post(url, data=data, headers=headers)
        print(response.text)
        -------------------------------------------------------
        # SSL verification in requests
        import requests
    
        # To verify a host's SSL certificate, use the verify parameter (verification is on by default, so it can be omitted)
        response = requests.get("https://www.baidu.com/", verify=True)
    
        # To skip verification, set verify=False
        response = requests.get("https://www.baidu.com/", verify=False)
    
        print(response.text)
    
        # If the error below occurs, skip SSL certificate verification:
        # SSLError: ("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",)
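
        # Note: with verify=False, urllib3 emits an InsecureRequestWarning for every request.
        # A common way to silence it (a sketch, not part of the original post):
        import urllib3
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)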
    
    
        # basic authentication
        # if the server responds with 'Requires authentication', pass the auth parameter
        auth=('test', '123456')
        response = requests.get('https://api.github.com/user', auth=auth)
        print(response.text)
    
    2. XPath is used to parse, search, and filter web page data.

    Installing XPath support (lxml): pip install lxml
    Common XPath syntax (a short sketch follows this list):

    /  : selects a child node; at the start of a path it selects from the root node.
    // : selects matching nodes anywhere in the document, regardless of their position.
    .  : selects the current node.
    .. : selects the parent of the current node.
    @  : selects an attribute.
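
    A minimal sketch of these selectors with lxml (the HTML snippet and variable names below are made up for illustration):

    from lxml import etree

    html = etree.HTML('<div id="box"><a class="link" href="/a">first</a><a href="/b">second</a></div>')
    print(html.xpath('//a/text()'))                 # // : all <a> text anywhere -> ['first', 'second']
    print(html.xpath('//div[@id="box"]/a/@href'))   # /, @ : href of direct <a> children -> ['/a', '/b']
    node = html.xpath('//a[@class="link"]')[0]
    print(node.xpath('./text()'))                   # .  : text of the current node -> ['first']
    print(node.xpath('..')[0].tag)                  # .. : parent of the current node -> 'div'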

    2.1 Example: scraping product data from JD.com

    import requests                          # requests
    from lxml import etree                   # XPath support
    import time                              # used to build the log_id timestamp
    from spider_jd.proxys import proxys      # custom proxy module
    from spider_jd.user_agent import user    # custom User-Agent module
    
    
    # Scrape the statically rendered part of the search results page
    def crawl_first(num):
        # set a random User-Agent
        user_agent = user()
        headers = {
            'User-Agent': user_agent,
            'authority': 'search.jd.com',
            'method': 'GET',
            'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page=4&s=74&scrolling=y&log_id=1562323559.30035&tpl=3_M&show_items=100005945610,5089273,5089235,100000822969,8790521,100003936976,47716923931,3133841,7652089,47847150993,100005819880,100000856861,46971161949,7437564,100003332220,8058010,100000972490,100003475378,100001247225,100000287133,100005150846,1861091,100003490442,7652091,100003336101,100002433196,100004544992,100003207541,100000766433,100005088618',
            'scheme': 'https',
            'cookie': 'shshshfpa=ea747a7e-66d3-d02e-43d0-ac87be9b0c90-1546772120; shshshfpb=smbRYXeZEqVbOIUQUwQguGQ%3D%3D; qrsc=3; __jdc=122270672; __jdv=122270672|baidu|-|organic|not set|1562307212304; __jdu=1561809008634351058752; areaId=19; ipLoc-djd=19-1607-3155-0; PCSYCityID=1607; xtest=9587.cf6b6759; rkv=V0600; __jda=122270672.1561809008634351058752.1561809008.1562307212.1562314874.3; 3AB9D23F7A4B3C9B=3YDZC2ANDPEZWOXX5SJRD2YMFTT3VIKVQPGFXB5HZCBN6OJ7H4TPWJ643OIP5NDNSX6UJ5YUUNM52HATIF66ZSSFPI; shshshfp=ffea12491b36e16b6aa589b093d49865',
            'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page=3&s=5&click=0',
            'x-requested-with': 'XMLHttpRequest'
        }
        # set a proxy
        proxies = proxys()
        # URL to crawl (the static page uses odd page numbers)
        url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&' \
              'enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page={}'.format(num*2-1)
        # send the request and get the response
        response = requests.get(url=url, headers=headers, proxies=proxies)
        response.encoding = 'utf-8'
        html = response.text
        # parse the HTML into an element tree for XPath queries
        tree = etree.HTML(html)
        # all product <li> nodes
        phone_li = tree.xpath('//*[@id="J_goodsList"]/ul/li')
        with open('JD_Phone.csv', 'a', newline='', encoding='utf-8') as fp:
            for phone in phone_li:
                try:
                    # price
                    p_price = phone.xpath('./div/div[3]/strong/i/text()')[0]
                    # title
                    p_title = phone.xpath('./div/div[4]/a/@title')[0]
                    # shop name
                    p_shop = phone.xpath('./div/div[7]/span/a/@title')
                except Exception as e:
                    print(e)
                    continue  # skip items that are missing a field
                try:
                    string = 'title:%s,price:%s,shop:%s' % (p_title, p_price, p_shop)
                    fp.write(string + '\n')
                    fp.flush()
                except Exception as s:
                    print(s)
    
    
    # Scrape the dynamically loaded part of the page (the items appended when scrolling)
    def crawl_last(num):
        # current timestamp, formatted for the log_id parameter of the AJAX URL
        a = time.time()
        b = "%.5f" % a
        # set a random User-Agent
        user_agent = user()
        headers = {
            'User-Agent': user_agent,
            'authority': 'search.jd.com',
            'method': 'GET',
            'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page=4&s=74&scrolling=y&log_id=1562323559.30035&tpl=3_M&show_items=100005945610,5089273,5089235,100000822969,8790521,100003936976,47716923931,3133841,7652089,47847150993,100005819880,100000856861,46971161949,7437564,100003332220,8058010,100000972490,100003475378,100001247225,100000287133,100005150846,1861091,100003490442,7652091,100003336101,100002433196,100004544992,100003207541,100000766433,100005088618',
            'scheme': 'https',
            'cookie': 'shshshfpa=ea747a7e-66d3-d02e-43d0-ac87be9b0c90-1546772120; shshshfpb=smbRYXeZEqVbOIUQUwQguGQ%3D%3D; qrsc=3; __jdc=122270672; __jdv=122270672|baidu|-|organic|not set|1562307212304; __jdu=1561809008634351058752; areaId=19; ipLoc-djd=19-1607-3155-0; PCSYCityID=1607; xtest=9587.cf6b6759; rkv=V0600; __jda=122270672.1561809008634351058752.1561809008.1562307212.1562314874.3; 3AB9D23F7A4B3C9B=3YDZC2ANDPEZWOXX5SJRD2YMFTT3VIKVQPGFXB5HZCBN6OJ7H4TPWJ643OIP5NDNSX6UJ5YUUNM52HATIF66ZSSFPI; shshshfp=ffea12491b36e16b6aa589b093d49865',
            'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page=3&s=5&click=0',
            'x-requested-with': 'XMLHttpRequest'
        }
        # set a proxy
        proxies = proxys()
        # URL of the AJAX endpoint that returns the lazily loaded items (even page numbers)
        url = 'https://search.jd.com/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page={}&s={}&scrolling=y&log_id={}'.format(2*num, 48*num-20, b)
        # send the request and get the response
        response = requests.get(url=url, headers=headers, proxies=proxies)
        print(response.url)
        response.encoding = 'utf-8'
        html = response.text
    
        # parse the HTML into an element tree for XPath queries
        tree = etree.HTML(html)
        # all product <li> nodes
        phone_li = tree.xpath('//*[@id="J_goodsList"]/ul/li')
        print(phone_li)
        with open('JD_Phone.csv', 'a', newline='', encoding='utf-8') as fp:
            for phone in phone_li:
                try:
                    # price
                    p_price = phone.xpath('./div/div[3]/strong/i/text()')[0]
                    # title
                    p_title = phone.xpath('./div/div[4]/a/@title')[0]
                    # shop name
                    p_shop = phone.xpath('./div/div[7]/span/a/@title')
                except Exception as e:
                    print(e)
                    continue  # skip items that are missing a field

                string = 'title:%s,price:%s,shop:%s' % (p_title, p_price, p_shop)
                print(string)
                fp.write(string + '\n')
                fp.flush()
    
    
    if __name__ == '__main__':
        # crawl the first results page; widen the range to crawl more pages
        for num in range(1, 2):
            print('Crawling the static part of the page')
            crawl_first(num)
            print('crawl_first: %s finished' % num)
            print('*' * 100)
            print('Crawling the dynamically loaded part of the page')
            crawl_last(num)
            print('crawl_last: %s finished' % num)
    
    
