4. Scraping web page data with requests + XPath

Author: 思绪太重_飘不动 | Published 2019-07-06 16:37

    1. requests is a third-party HTTP module; it is more powerful and more convenient to use than the standard library's urllib.request.

    Install requests: pip install requests
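
    A minimal sketch of basic usage (httpbin.org is used here only as an example URL):

    import requests

    response = requests.get("https://httpbin.org/get", params={"q": "test"}, timeout=10)
    print(response.status_code)   # 200 on success
    print(response.text)          # response body as a string

    With a random User-Agent, a proxy pool, and cookies, a GET request looks like this: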

    import requests
    from fake_useragent import UserAgent
    import random
        
    # Create a UserAgent object to generate random User-Agent strings
    us = UserAgent()
    headers = {
            "User-Agent": us.random 
        }
        
    # Build a proxy pool
    proxy_list = [
        # {"protocol": "proxy IP:port"}, for example:
        {'http': "42.59.86.21:1133"},
        {'https': "42.59.86.21:1133"},
        # ...
    ]
        
    # Pick one proxy at random
    proxy = random.choice(proxy_list)
    url = "<the URL to request>"
        
    # cookies obtained previously (e.g., copied from the browser)
    cookies = {
        "": "",
        "": "",
        # ...
    }
    # GET request with requests
    # a proxy can be set directly via the proxies parameter
    # cookies can also be passed in directly
    response = requests.get(url, headers=headers, proxies=proxy, cookies=cookies)
    # The response object supports the following operations
    print(response.__dict__)          # all attributes of the response object
    print(response.text)              # body as a string
    print(response.content)           # body as bytes
    print(response.content.decode())  # bytes decoded to a string, UTF-8 by default
    print(response.json())            # built-in JSON parsing
    print(response.status_code)       # HTTP status code
    print(response.cookies)           # response cookies
    print(response.url)               # final request URL
    print(response.headers)           # response headers
    --------------------------------------------------------    
        # Cookies can also be read from the response object
        # get the CookieJar
        res_cookies = response.cookies
        # convert the CookieJar into a dict
        print(requests.utils.dict_from_cookiejar(res_cookies))
     ---------------------------------------------------------   
        # A session can also be used to save and resend cookies automatically
        import requests
    
        # Log in to Biquge (biquge5200.cc)
        url = "https://www.biquge5200.cc/u/login.htm"
        data = {
            # username: niejeff, password: 123456 (sent as its MD5 hash)
            "name": "niejeff",
            "password": "E10ADC3949BA59ABBE56E057F20F883E",
            "autoLogin": "1",
            "autologin": "1"
        }
    
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
        }
    
        # Use a session: cookies are saved and submitted automatically
        session = requests.session()
    
        # log in
        response = session.post(url, data=data, headers=headers)
        print(response.text)
        -------------------------------------------------------
        # SSL verification in requests
        import requests
    
        # To verify a host's SSL certificate, use the verify parameter (verification is on by default, so it can be omitted)
        response = requests.get("https://www.baidu.com/", verify=True)
    
        # To skip verification, set verify=False
        response = requests.get("https://www.baidu.com/", verify=False)
    
        print(response.text)
    
        # If the error below occurs, skip SSL certificate verification:
        # SSLError: ("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",)
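
        # Note: with verify=False, urllib3 emits an InsecureRequestWarning for every request.
        # A common way to silence it (a sketch, not part of the original post):
        import urllib3
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)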
    
    
        # basic authentication
        # if the server responds with 'Requires authentication', pass the auth parameter
        auth=('test', '123456')
        response = requests.get('https://api.github.com/user', auth=auth)
        print(response.text)
    
    2. XPath is used to parse, search, and filter web page data.

    Installing XPath support (lxml): pip install lxml
    Common XPath syntax (a short sketch follows this list):

    /  : selects a child node; at the start of a path it selects from the root node.
    // : selects matching nodes anywhere in the document, regardless of their position.
    .  : selects the current node.
    .. : selects the parent of the current node.
    @  : selects an attribute.
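
    A minimal sketch of these selectors with lxml (the HTML snippet and variable names below are made up for illustration):

    from lxml import etree

    html = etree.HTML('<div id="box"><a class="link" href="/a">first</a><a href="/b">second</a></div>')
    print(html.xpath('//a/text()'))                 # // : all <a> text anywhere -> ['first', 'second']
    print(html.xpath('//div[@id="box"]/a/@href'))   # /, @ : href of direct <a> children -> ['/a', '/b']
    node = html.xpath('//a[@class="link"]')[0]
    print(node.xpath('./text()'))                   # .  : text of the current node -> ['first']
    print(node.xpath('..')[0].tag)                  # .. : parent of the current node -> 'div'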

    2.1 Example: scraping product data from JD.com

    import requests                          # requests
    from lxml import etree                   # XPath support
    import time                              # used to build the log_id timestamp
    from spider_jd.proxys import proxys      # custom proxy module
    from spider_jd.user_agent import user    # custom User-Agent module
    
    
    # Scrape the statically rendered part of the search results page
    def crawl_first(num):
        # set a random User-Agent
        user_agent = user()
        headers = {
            'User-Agent': user_agent,
            'authority': 'search.jd.com',
            'method': 'GET',
            'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page=4&s=74&scrolling=y&log_id=1562323559.30035&tpl=3_M&show_items=100005945610,5089273,5089235,100000822969,8790521,100003936976,47716923931,3133841,7652089,47847150993,100005819880,100000856861,46971161949,7437564,100003332220,8058010,100000972490,100003475378,100001247225,100000287133,100005150846,1861091,100003490442,7652091,100003336101,100002433196,100004544992,100003207541,100000766433,100005088618',
            'scheme': 'https',
            'cookie': 'shshshfpa=ea747a7e-66d3-d02e-43d0-ac87be9b0c90-1546772120; shshshfpb=smbRYXeZEqVbOIUQUwQguGQ%3D%3D; qrsc=3; __jdc=122270672; __jdv=122270672|baidu|-|organic|not set|1562307212304; __jdu=1561809008634351058752; areaId=19; ipLoc-djd=19-1607-3155-0; PCSYCityID=1607; xtest=9587.cf6b6759; rkv=V0600; __jda=122270672.1561809008634351058752.1561809008.1562307212.1562314874.3; 3AB9D23F7A4B3C9B=3YDZC2ANDPEZWOXX5SJRD2YMFTT3VIKVQPGFXB5HZCBN6OJ7H4TPWJ643OIP5NDNSX6UJ5YUUNM52HATIF66ZSSFPI; shshshfp=ffea12491b36e16b6aa589b093d49865',
            'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page=3&s=5&click=0',
            'x-requested-with': 'XMLHttpRequest'
        }
        # set a proxy
        proxies = proxys()
        # URL to crawl (the static page uses odd page numbers)
        url = 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&' \
              'enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page={}'.format(num*2-1)
        # send the request and get the response
        response = requests.get(url=url, headers=headers, proxies=proxies)
        response.encoding = 'utf-8'
        html = response.text
        # parse the HTML into an element tree for XPath queries
        tree = etree.HTML(html)
        # all product <li> nodes
        phone_li = tree.xpath('//*[@id="J_goodsList"]/ul/li')
        with open('JD_Phone.csv', 'a', newline='', encoding='utf-8') as fp:
            for phone in phone_li:
                try:
                    # price
                    p_price = phone.xpath('./div/div[3]/strong/i/text()')[0]
                    # title
                    p_title = phone.xpath('./div/div[4]/a/@title')[0]
                    # shop name
                    p_shop = phone.xpath('./div/div[7]/span/a/@title')
                except Exception as e:
                    print(e)
                    continue  # skip items that are missing a field
                try:
                    string = 'title:%s,price:%s,shop:%s' % (p_title, p_price, p_shop)
                    fp.write(string + '\n')
                    fp.flush()
                except Exception as s:
                    print(s)
    
    
    # Scrape the dynamically loaded part of the page (the items appended when scrolling)
    def crawl_last(num):
        # current timestamp, formatted for the log_id parameter of the AJAX URL
        a = time.time()
        b = "%.5f" % a
        # set a random User-Agent
        user_agent = user()
        headers = {
            'User-Agent': user_agent,
            'authority': 'search.jd.com',
            'method': 'GET',
            'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page=4&s=74&scrolling=y&log_id=1562323559.30035&tpl=3_M&show_items=100005945610,5089273,5089235,100000822969,8790521,100003936976,47716923931,3133841,7652089,47847150993,100005819880,100000856861,46971161949,7437564,100003332220,8058010,100000972490,100003475378,100001247225,100000287133,100005150846,1861091,100003490442,7652091,100003336101,100002433196,100004544992,100003207541,100000766433,100005088618',
            'scheme': 'https',
            'cookie': 'shshshfpa=ea747a7e-66d3-d02e-43d0-ac87be9b0c90-1546772120; shshshfpb=smbRYXeZEqVbOIUQUwQguGQ%3D%3D; qrsc=3; __jdc=122270672; __jdv=122270672|baidu|-|organic|not set|1562307212304; __jdu=1561809008634351058752; areaId=19; ipLoc-djd=19-1607-3155-0; PCSYCityID=1607; xtest=9587.cf6b6759; rkv=V0600; __jda=122270672.1561809008634351058752.1561809008.1562307212.1562314874.3; 3AB9D23F7A4B3C9B=3YDZC2ANDPEZWOXX5SJRD2YMFTT3VIKVQPGFXB5HZCBN6OJ7H4TPWJ643OIP5NDNSX6UJ5YUUNM52HATIF66ZSSFPI; shshshfp=ffea12491b36e16b6aa589b093d49865',
            'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page=3&s=5&click=0',
            'x-requested-with': 'XMLHttpRequest'
        }
        # set a proxy
        proxies = proxys()
        # URL of the AJAX endpoint that returns the lazily loaded items (even page numbers)
        url = 'https://search.jd.com/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&cid2=653&cid3=655&page={}&s={}&scrolling=y&log_id={}'.format(2*num, 48*num-20, b)
        # send the request and get the response
        response = requests.get(url=url, headers=headers, proxies=proxies)
        print(response.url)
        response.encoding = 'utf-8'
        html = response.text
    
        # parse the HTML into an element tree for XPath queries
        tree = etree.HTML(html)
        # all product <li> nodes
        phone_li = tree.xpath('//*[@id="J_goodsList"]/ul/li')
        print(phone_li)
        with open('JD_Phone.csv', 'a', newline='', encoding='utf-8') as fp:
            for phone in phone_li:
                try:
                    # price
                    p_price = phone.xpath('./div/div[3]/strong/i/text()')[0]
                    # title
                    p_title = phone.xpath('./div/div[4]/a/@title')[0]
                    # shop name
                    p_shop = phone.xpath('./div/div[7]/span/a/@title')
                except Exception as e:
                    print(e)
                    continue  # skip items that are missing a field

                string = 'title:%s,price:%s,shop:%s' % (p_title, p_price, p_shop)
                print(string)
                fp.write(string + '\n')
                fp.flush()
    
    
    if __name__ == '__main__':
        # crawl the first results page; widen the range to crawl more pages
        for num in range(1, 2):
            print('Crawling the static part of the page')
            crawl_first(num)
            print('crawl_first: %s finished' % num)
            print('*' * 100)
            print('Crawling the dynamically loaded part of the page')
            crawl_last(num)
            print('crawl_last: %s finished' % num)
    
    
