114. XPath Usage: Examples and Code

Author: 羽天驿 | Published 2020-02-12 20:47

    1. Using XPath

    XPath stands for XML Path Language.
    Install the lxml library (it supports HTML and XML parsing, as well as XPath expressions):
    pip install lxml
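
    As a quick sketch of how lxml and XPath work together (the HTML fragment and variable names below are invented purely for illustration):

    from lxml import etree

    # a tiny HTML fragment used only for demonstration
    html = '<div><a href="/a">first</a><a href="/b">second</a></div>'
    tree = etree.HTML(html)           # parse the string into an element tree
    print(tree.xpath('//a/text()'))   # ['first', 'second']
    print(tree.xpath('//a/@href'))    # ['/a', '/b']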

    2. Using BeautifulSoup

    pip3 install beautifulsoup4

    Parsers:
    Python standard library: BeautifulSoup(html, "html.parser") - average speed, good error tolerance
    lxml HTML parser: BeautifulSoup(html, "lxml") - fast, good error tolerance
    lxml XML parser: BeautifulSoup(markup, "xml") - fast, the only parser that supports XML
    html5lib: BeautifulSoup(markup, "html5lib") - best error tolerance, but slow

    Importing BeautifulSoup:
    from bs4 import BeautifulSoup

    Basic usage:
    soup = BeautifulSoup(html, "lxml")  # build a BeautifulSoup object with the lxml parser
    print(soup.prettify())              # pretty-print the document with indentation
    print(soup.title.string)            # get the text of the title tag
    print(soup.head)
    print(soup.p)

    Getting a node's name

    print(soup.title.name)

    Getting node attributes

    soup.img.attrs["src"]
    print(soup.p.attrs)
    print(soup.p.attrs["name"])
    print(soup.p["class"])

    Getting a node's text content

    print(soup.p.string)
    <p class="c1"><span>asdf</span>asdfasdfasdfasdfadsfad</p>
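
    When a tag contains more than one child, .string returns None, while get_text() still collects all the text. A minimal sketch (the fragment below is invented for illustration):

    from bs4 import BeautifulSoup

    html = '<p class="c1"><span>asdf</span>more text</p>'
    soup = BeautifulSoup(html, 'lxml')
    print(soup.p.string)      # None, because the p tag has several children
    print(soup.p.get_text())  # 'asdfmore text'
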
    Nested selection
    <head>
    <title>this is title</title>
    </head>

    Every node soup returns is of type bs4.element.Tag, so you can keep selecting on it:

    print(soup.head.title.string)


    Associative selection
    Some elements have no distinctive attribute to locate them directly. In that case, first select a node you can locate, then move from it to its children, parent, siblings, and so on.
    <p class="p1"></p>
    <p></p>
    <p></p>
    print(soup.p.contents)           # list of all direct children of the p node
    print(soup.p.descendants)        # generator over all descendants of the p node
    print(soup.a.parent)             # the parent node
    print(soup.a.parents)            # all ancestor nodes
    print(soup.a.next_sibling)       # the next sibling at the same level
    print(soup.a.previous_sibling)   # the previous sibling at the same level
    print(soup.a.next_siblings)      # all following siblings
    print(soup.a.previous_siblings)  # all preceding siblings
    print(list(soup.a.parents)[0].attrs['class'])


    Method selectors: find_all() looks nodes up by attribute and text.
    <ul><li></li><li></li></ul>
    <ul><li></li><li>jjj</li><li></li><li></li></ul>
    print(soup.find_all(name="ul"))
    for ul in soup.find_all(name="ul"):
        print(ul.find_all(name="li"))
        for li in ul.find_all(name="li"):
            print(li.string)
    soup.find_all(attrs={"id": "list-1"})
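
    A self-contained sketch of the method selectors above (the HTML fragment and id values are invented for illustration):

    from bs4 import BeautifulSoup

    html = '<ul id="list-1"><li>a</li><li>b</li></ul><ul id="list-2"><li>jjj</li></ul>'
    soup = BeautifulSoup(html, 'lxml')
    for ul in soup.find_all(name="ul"):
        for li in ul.find_all(name="li"):
            print(li.string)                      # a, b, jjj
    print(soup.find_all(attrs={"id": "list-1"}))  # only the first ul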


    CSS selectors
    <p id="p1" class="panel"></p><p class=""></p><p></p><p></p>
    soup.select('.panel .panel_heading')
    soup.select('ul li')
    soup.select('#id1 .element')
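
    A small runnable sketch of select() (the markup and class names are invented for illustration):

    from bs4 import BeautifulSoup

    html = '''
    <div class="panel">
      <div class="panel_heading"><a href="/1">first</a></div>
      <ul><li>one</li><li>two</li></ul>
    </div>
    '''
    soup = BeautifulSoup(html, 'lxml')
    print(soup.select('.panel .panel_heading'))   # descendant selection by class
    for li in soup.select('ul li'):               # descendant selection by tag
        print(li.string)                          # one, two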


    3. The basic crawling workflow

    First, we fetch the page with the requests library.

    import requests

    def get_page(page):
        url = 'https://maoyan.com/board/4?offset=%s' % str(page * 10)
        headers =  {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36" 
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # response.content returns bytes; decode turns them into a string
            return response.content.decode('utf-8')
        return None
    
    def main():
        get_page(0)  # get_page needs a page index; 0 requests the first page of the board
    
    if __name__ == '__main__':
        main()
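
    One practical refinement (not in the original code) is to add a timeout and catch network errors, so a single stalled request does not hang the crawl. A minimal sketch, with get_page_safe as a hypothetical variant of get_page:

    import requests

    def get_page_safe(page):
        # hypothetical variant of get_page with a timeout and basic error handling
        url = 'https://maoyan.com/board/4?offset=%s' % str(page * 10)
        headers = {"User-Agent": "Mozilla/5.0"}
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.content.decode('utf-8')
        except requests.RequestException as e:
            print('request failed:', e)
        return None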
    

    4. Once we have the page, we need to parse it:

    (1) Parsing a page with XPath
    1. Crawling the Douban group explore page:

    import requests
    from lxml import etree
    
    # fetch the page
    def get_page():
        url = 'https://www.douban.com/group/explore'
        headers =  {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36" 
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # response.content returns bytes; decode turns them into a string
            return response.content.decode('utf-8')
        return None
    
    # parse the page with XPath
    def parse_page(html):
        # convert the HTML text into an etree node object
        etree_html = etree.HTML(html)
        # print(etree_html)
        # print(type(etree_html))
    
        # // matches descendants at any depth; * matches nodes of any type; //* matches all descendant nodes of any type
        # results = etree_html.xpath('//*')
        # print(results)
        # print(len(results))
        
        # match all img tags
        # results = etree_html.xpath('//img')
        # print(results)
        # print(len(results))
    
        # get the text inside every a tag
        # results = etree_html.xpath('//a/text()')
        # print(results)
    
        # / selects direct child nodes
        # results = etree_html.xpath('//div/h3/a/text()')
        # print(results)
    
        # select nodes by attribute value: [@class="likes"]
        # results = etree_html.xpath('//div[@class="likes"]/text()')
        # print(results)
    
        # likes_list = []
        # for i in range(len(results)):
        #     if i % 2 == 0:
        #         likes_list.append(results[i])
        # print(likes_list)
    
        # use @attr to read an attribute value
        # results = etree_html.xpath('//div[@class="pic"]/div[@class="pic-wrap"]/img/@src')
        # print(results)
    
        # when an attribute holds several values and you only know one of them, use contains()
        # results = etree_html.xpath('//div[contains(@class, "grid-16-8")]//div[@class="likes"]/text()')
        # print(results)
    
        # results = etree_html.xpath('//div[@class="grid-16-8 clearfix"]//div[@class="likes"]/text()')
        # print(results)
    
        # functions like contains() can be combined with and / or conditions
        # results = etree_html.xpath('//span[@class="pubtime" and contains(text(), "昨天")]/text()') 
        # print(results)
    
        # .. refers to the parent node
        # results = etree_html.xpath('//span[@class="pubtime" and contains(text(), "昨天")]/../../h3/a/text()') 
        # print(results)
    
        # get the entries published yesterday between 16:00 and 18:00
        # results = etree_html.xpath('//span[@class="pubtime" and contains(text(), "昨天") and (starts-with(substring-after(text(),"昨天"), "16:") or starts-with(substring-after(text(),"昨天"), "17:"))]/text()') 
        # print(results)
    
        # select nodes by position; XPath indexing starts at 1, not 0
        # [1] [last()] [position() < 4]
        # get the 2nd title
        # results = etree_html.xpath('//div[@class="channel-item"][2]/div/h3/a/text()')[0]
        # print(results)
    
        # get the 3rd through 5th titles
        # results = etree_html.xpath('//div[@class="channel-item"][position() >=3 and position() <= 5]/div/h3/a/text()')
        # print(results)
    
        # following::* selects every node that comes after the closing tag of the current node
        # results = etree_html.xpath('//div[@class="channel-item"][2]/following::*')
        # print(results)
    
        # following-sibling::* selects all following siblings of the current node
        results = etree_html.xpath('//div[@class="channel-item"][2]/following-sibling::*')
        print(results)
        print(len(results))
    
    
    
    def main():
        html = get_page()
        # print(html)
        parse_page(html)
    
    if __name__ == '__main__':
        main()
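
    Pulling the expressions above together, a sketch of extracting each post's title, link and likes into a dict might look like the helper below (parse_items is a hypothetical name, and the exact page structure is an assumption based on the class names used in the comments):

    def parse_items(html):
        etree_html = etree.HTML(html)
        items = []
        for node in etree_html.xpath('//div[@class="channel-item"]'):
            title = node.xpath('./div/h3/a/text()')
            link = node.xpath('./div/h3/a/@href')
            likes = node.xpath('.//div[@class="likes"]/text()')
            items.append({
                'title': title[0].strip() if title else '',
                'link': link[0] if link else '',
                'likes': likes[0].strip() if likes else '',
            })
        return items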
    

    (2) Parsing a page with BeautifulSoup
    Using BeautifulSoup to crawl the headlines from Sina Sports (NBA section)

    import requests
    from bs4 import BeautifulSoup
    
    # fetch the page
    def get_page():
        url = 'http://sports.sina.com.cn/nba/'
        headers =  {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36" 
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # response.content returns bytes; decode turns them into a string
            return response.content.decode('utf-8')
        return None
    
    def parse_page(html):
        # html = '<div><span>坚持努力</span></div>'
        soup = BeautifulSoup(html, 'lxml')
        # pretty-printed output; the parser fills in some missing tags
        # print(soup.prettify())
        # print(soup.title.string)
        # print(soup.head)
        # print(soup.p)
        # print(soup.p.name) # the tag's name
        # print(soup.img.attrs) # attrs returns all attributes of the node
        # print(soup.img.attrs['src'])
        # print(soup.p.contents) # list of all direct children inside the first p tag
        # print(list(soup.a.parents))
    
        # select nodes with CSS selectors; a space matches any descendant (child, grandchild, and so on)
        # a_list = soup.select('.news-list-b .list a')
        # for item in a_list:
        #     print(item.string)
    
        # when class contains several names, join them with dots (remove the spaces) in the selector
        a_list = soup.select('div.-live-layout-container.row-fuild .news-list-b .list a')
        for item in a_list:
            print(item.string)
    
    
    def main():
        html = get_page()
        # print(html)
        parse_page(html)
    
    if __name__ == '__main__':
        main()
    

    (3) Using XPath and re to crawl epidemic data from the Sichuan Health Commission website.

    import requests
    import re
    from lxml import etree
    
    # fetch the page
    def get_page(url):
        headers =  {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36" 
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # response.content returns bytes; decode turns them into a string
            return response.content.decode('utf-8')
        return None
    
    # parse the detail page
    def parse_detail_page(html):
        etree_html = etree.HTML(html)
        result = etree_html.xpath('//div[@class="wy_contMain fontSt"]//span/text()')  
        # print(result)  
        # join the list into one big string
        result_str = ''.join(result)
        
        # get the notice title (which carries the data's date)
        titles = etree_html.xpath('//h1[@class="blue fontNb"]/text()')
        print(titles)
    
        # extract the number of newly confirmed cases for the day
        pattern = re.compile(r'新增.*?确诊病例(\d+)例', re.S)
        xzs = re.findall(pattern, result_str)
        print(xzs)
    
    # parse the page with XPath
    def parse_page(html):
        # convert the HTML text into an etree node object
        etree_html = etree.HTML(html)
        items = etree_html.xpath('//div[@class="wy_contMain fontSt"]/ul/li/a[starts-with(text(), "截至")]')
        for item in items:
            link = item.xpath('./@href')[0]
            title = item.xpath('./text()')[0]
            print(link)
            print(title)
            full_link = 'http://wsjkw.sc.gov.cn' + link
            # fetch the detail page
            detail_html = get_page(full_link)
            # parse the detail page
            parse_detail_page(detail_html) 
    
    def main():
        url = 'http://wsjkw.sc.gov.cn/scwsjkw/gggs/tygl.shtml'
        html = get_page(url)
        # print(html)
        parse_page(html)
    
    if __name__ == '__main__':
        main()
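
    To see the regular expression in isolation, a small sketch on an invented sentence (the wording mirrors what the notices typically contain, but the sample text is made up):

    import re

    sample = '我省新增新型冠状病毒肺炎确诊病例8例,新增治愈出院病例3例'
    pattern = re.compile(r'新增.*?确诊病例(\d+)例', re.S)
    print(re.findall(pattern, sample))   # ['8']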
    

    Below are the database connection helpers (using pymysql).

    import pymysql
    
    # get a database connection
    def get_connection():
        host = '127.0.0.1'
        port = 3306
        user = 'root'
        password = 'Vff12345678'
        database = 'maoyan'
        db = pymysql.connect(host=host, user=user, password=password, database=database, charset='utf8', port=port)
        return db
    
    # get a cursor
    def get_cursor(con):
        cursor = con.cursor()
        return cursor
    
    # close the database connection
    def close_connection(con):
        con.close()
    
    # insert a row of movie data
    def save_db(con, cursor, data_dict):
        sql = 'insert into movie (title, releasetime, actor, ranks, score, cover) values ("%s", "%s", "%s", "%s", "%s", "%s")' % (data_dict['title'], data_dict['releasetime'], data_dict['actor'], data_dict['rank'], data_dict['score'], data_dict['cover'])
        print(sql)
        cursor.execute(sql)
        con.commit()
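
    Building the SQL by string formatting, as above, is fragile: a quote inside a title breaks the statement, and it is open to SQL injection. A safer sketch, assuming the same movie table and data_dict keys, passes the values as query parameters (save_db_safe is a hypothetical variant):

    def save_db_safe(con, cursor, data_dict):
        sql = ('insert into movie (title, releasetime, actor, ranks, score, cover) '
               'values (%s, %s, %s, %s, %s, %s)')
        args = (data_dict['title'], data_dict['releasetime'], data_dict['actor'],
                data_dict['rank'], data_dict['score'], data_dict['cover'])
        cursor.execute(sql, args)  # pymysql escapes the values itself
        con.commit()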
    

    (4) Crawling JSON data (found under XHR in the browser's network panel)
    Crawling women's clothing listings from Mogujie

    import requests
    import json
    
    from day3.sqlachimy_hepler import *
    
    # fetch the page
    def get_page(page, action):
        # url = 'https://list.mogu.com/search?callback=jQuery21107629394841283899_1581471928849&_version=8193&ratio=3%3A4&cKey=15&page=' + str(page) + '&sort=pop&ad=0&fcid=50240&action=' + action + '&acm=3.mce.1_10_1ko4s.132244.0.9qYcxrQfkVICJ.pos_1-m_482170-sd_119&ptp=31.v5mL0b._head.0.ZS3jNSPn&_=1581471928851'
        url='https://list.mogu.com/search?&cKey=15&page='+str(page)+'&action='+action
        headers =  {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # response.content returns bytes; decode turns them into a string
            return response.content.decode('utf-8')
        return None
    
    # parse the JSON response
    def parse_page(html, action):
        # start_index = html.index('(')
        # html = html[start_index + 1:-2]
        # print(html)
        json_data = json.loads(html)
        # flag that marks whether this is the last page
        is_end = json_data['result']['wall']['isEnd']
        results = json_data['result']['wall']['docs']
        for item in results:
            # attach the category to the item
            item['category'] = action
            print(item['title'])
            # save to the database via SQLAlchemy
            save_goods(item)
    
        return is_end
    
    # crawl every page for every category
    def get_all_pages():
        # actions = ['clothing', 'skirt', 'trousers', 'shoes', 'bags', 'boyfriend', 'neiyi', 'baby', 'home']
        actions = ['neiyi']
    
        for action in actions:
            page = 1
            print(action)
            print('*' * 20)
            while True:
                print(page)
                html = get_page(page, action)
                is_end = parse_page(html, action)
                if is_end:
                    break
                page += 1
    
    def main():
        get_all_pages()
    
    if __name__ == '__main__':
        main()
    # Create the database and table (SQL):
    create database mogujie default character set=utf8;
    use mogujie;
    create table goods(
        id integer primary key auto_increment,
        title varchar(128),
        link varchar(1024),
        trade_item_id varchar(32),
        org_price varchar(32),
        price varchar(32),
        sale varchar(32),
        category varchar(128)
    );
    create index ix_goods_title on goods (title);
    # Store the crawled data with SQLAlchemy:
    from sqlalchemy.ext.declarative import declarative_base
    from sqlalchemy import Column, Integer, String, ForeignKey, UniqueConstraint, Index
    from sqlalchemy.orm import sessionmaker, relationship
    from sqlalchemy import create_engine
    
    engine = create_engine("mysql+pymysql://root:361394621@localhost/mogujie?charset=utf8", max_overflow=5,encoding='utf-8')
    
    Base = declarative_base()
    class Goods(Base):
        __tablename__ = 'goods'
        id = Column(Integer, primary_key=True, autoincrement=True)    # primary key, auto-increment
        trade_item_id = Column(String(32))
        title = Column(String(128))
        category = Column(String(128))
        link = Column(String(1024))
        org_price = Column(String(32))
        price = Column(String(32))
        sale = Column(String(32))
    
