美文网首页
XPath选择器运用

XPath选择器运用

作者: 錦魚 | 来源:发表于2018-11-22 19:06 被阅读0次
    lxml 可以自动修正 html 代码,例子里不仅补全了 li 标签,还添加了 body,html 标签。

    文件读取: 除了直接读取字符串,lxml还支持从文件里读取内容。我们新建一个hello.html文件

    • 利用 etree.parse() 方法来读取文件。

    初步使用 我们利用它来解析 HTML 代码,简单示例:
    # lxml_test.py

    # Use the etree module from lxml.
    from lxml import etree

    # Note: the last <li> below deliberately lacks its closing </li> tag,
    # so that lxml's automatic repair can be observed in the output.
    text = '''
    <div>
    <ul>
    <li class="item-0"><a href="link1.html">first item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-inactive"><a href="link3.html">third item</a></li>
    <li class="item-1"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a>
    </ul>
    </div>
    '''

    # etree.HTML() turns the string into an Element object;
    # Element objects provide the xpath() method.
    html_element = etree.HTML(text)

    # Serialize the HTML document. tostring() returns bytes by default,
    # so decode it to get readable multi-line output when printing.
    result = etree.tostring(html_element).decode('utf-8')

    print(result)
    输出结果:
    
    <html><body>
    <div>
    <ul>
    <li class="item-0"><a href="link1.html">first item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-inactive"><a href="link3.html">third item</a></li>
    <li class="item-1"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
    </div>
    </body></html>
    lxml 可以自动修正 html 代码,例子里不仅补全了 li 标签,还添加了 body,html 标签。
    文件读取: 除了直接读取字符串,lxml还支持从文件里读取内容。我们新建一个hello.html文件:
    
    <!-- hello.html -->
    
    <div>
    <ul>
    <li class="item-0"><a href="link1.html">first item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-inactive"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
    </div>
    利用 etree.parse() 方法来读取文件。
    
    # lxml_parse.py

    from lxml import etree

    # Read the external file hello.html with the HTML parser. etree.parse()
    # defaults to the XML parser, which does not repair markup or add the
    # <html>/<body> wrappers.
    parser = etree.HTMLParser()
    html = etree.parse('./hello.html', parser)

    # tostring() returns bytes by default; decode for readable printing.
    result = etree.tostring(html, pretty_print=True).decode('utf-8')

    print(result)
    输出结果与之前相同:
    
    <html><body>
    <div>
    <ul>
    <li class="item-0"><a href="link1.html">first item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-inactive"><a href="link3.html"><span class="bold">third item</span></a></li>
    <li class="item-1"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
    </div>
    </body></html>
    XPath实例测试
    
    1. 获取所有的 <li> 标签
    # xpath_li.py
    from lxml import etree

    html = etree.parse('hello.html')
    print(type(html))  # show the type returned by etree.parse()

    result = html.xpath('//li')

    print(result)  # the list of matched <li> elements
    print(len(result))
    print(type(result))
    print(type(result[0]))
    输出结果:
    
    <type 'lxml.etree._ElementTree'>
    [<Element li at 0x1014e0e18>, <Element li at 0x1014e0ef0>, <Element li at 0x1014e0f38>, <Element li at 0x1014e0f80>, <Element li at 0x1014e0fc8>]
    5
    <type 'list'>
    <type 'lxml.etree._Element'>
    
    2. 继续获取<li> 标签的所有 class属性
    # xpath_li.py

    from lxml import etree

    # Collect the class attribute of every <li> element in the document.
    tree = etree.parse('hello.html')
    class_values = tree.xpath('//li/@class')

    print(class_values)
    运行结果
    ['item-0', 'item-1', 'item-inactive', 'item-1', 'item-0']
    
    3. 继续获取 <li> 标签下 href 为 link1.html 的 <a> 标签
    
    # xpath_li.py

    from lxml import etree

    # Select the <a> children of <li> whose href equals "link1.html".
    tree = etree.parse('hello.html')
    matching_links = tree.xpath('//li/a[@href="link1.html"]')

    print(matching_links)
    运行结果
    [<Element a at 0x10ffaae18>]
    
    4. 获取 <li> 标签下的所有 <span> 标签
    
    # xpath_li.py

    from lxml import etree

    tree = etree.parse('hello.html')

    # Pitfall: tree.xpath('//li/span') is the wrong expression here.
    # A single slash selects direct children only, and <span> is not a
    # direct child of <li>, so the double slash (any descendant) is needed.
    spans = tree.xpath('//li//span')
    print(spans)
    运行结果
    [<Element span at 0x10d698e18>]
    
    5. 获取 <li> 标签下的 <a> 标签里的所有 class
    
    # xpath_li.py

    from lxml import etree

    # Gather every class attribute found at or below the <a> children of <li>.
    tree = etree.parse('hello.html')
    nested_classes = tree.xpath('//li/a//@class')

    print(nested_classes)
    运行结果
    ['bold']
    
    6. 获取最后一个 <li> 的 <a> 的 href
    # xpath_li.py
    from lxml import etree

    tree = etree.parse('hello.html')

    # The predicate [last()] picks the last <li>; then read its <a>'s href.
    last_href = tree.xpath('//li[last()]/a/@href')

    print(last_href)
    运行结果
    ['link5.html']
    
    7. 获取倒数第二个元素的内容
    # xpath_li.py

    from lxml import etree

    tree = etree.parse('hello.html')

    # [last()-1] addresses the second-to-last <li>; take its first <a>.
    penultimate_link = tree.xpath('//li[last()-1]/a')[0]

    # The .text attribute holds the element's text content.
    print(penultimate_link.text)
    运行结果
    fourth item
    
    8. 获取 class 值为 bold 的标签名
    # xpath_li.py
    from lxml import etree

    tree = etree.parse('hello.html')

    # Match any element, whatever its tag, whose class is exactly "bold".
    bold_element = tree.xpath('//*[@class="bold"]')[0]

    # The .tag attribute holds the element's tag name.
    print(bold_element.tag)
    运行结果
    span
    

    例子

    from lxml import etree
    from fake_useragent import UserAgent
    import requests
    
    
    def qidianSpider(start_url):
        """Crawler entry point: process the novel listing page at ``start_url``."""
        get_novel_list_by_url(req_url=start_url)
    
    def get_novel_list_by_url(req_url):
        """Download a novel listing page and print the details of its first novel.

        The page is requested with a random User-Agent. If the response status
        is not 200 nothing happens. Otherwise the fields of the first ``<li>``
        under ``ul.all-img-list.cf`` are printed and its chapter page is
        handed to ``novel_chapter``.

        Args:
            req_url: URL of the listing page to download.

        Raises:
            IndexError: If an expected node is missing from the first entry
                (each XPath result is indexed with ``[0]``).
        """
        req_header = {'User-Agent': UserAgent().random}
        response = requests.get(url=req_url, headers=req_header)

        if response.status_code != 200:
            return

        html_element = etree.HTML(response.text)
        novel_lis = html_element.xpath('//ul[@class="all-img-list cf"]/li')
        for novel_li in novel_lis:
            coverImage = novel_li.xpath('./div[@class="book-img-box"]/a/img/@src')[0]
            title = novel_li.xpath('./div[@class="book-mid-info"]/h4/a/text()')[0]
            author = novel_li.xpath('.//a[@class="name"]/text()')[0]
            # Sample markup seen here: <span class="jIVjQgqX">𘟄𘟆𘟁𘟉𘟈𘟆</span>
            # NOTE(review): the glyphs look font-obfuscated, so this value is
            # presumably not directly readable -- confirm against the site.
            nub = novel_li.xpath('.//span/text()')[0]
            # Named novel_type (not ``type``) to avoid shadowing the builtin.
            novel_type = novel_li.xpath('.//p/a[2]/text()')[0]
            type_small = novel_li.xpath('.//p/a[3]/text()')[0]
            tit_url = novel_li.xpath('.//h4/a/@href')[0]
            print(coverImage, title, author, novel_type, type_small, nub)
            novel_chapter(tit_url)
            # Only the first novel on the page is followed.
            break
    def novel_chapter(url):
        """Download a novel's catalog page and print its first chapter entry.

        Args:
            url: Link to the novel without the scheme; ``'https:'`` is
                prepended and ``'#Catalog'`` appended (presumably a
                protocol-relative ``//host/path`` link -- verify against
                the caller).
        """
        req_url = 'https:'+url+'#Catalog'
        print(req_url)

        # The page has to be downloaded first; the original passed the
        # non-existent ``requests.text`` to the misspelled ``etree.HTMl``.
        req_header = {'User-Agent': UserAgent().random}
        response = requests.get(url=req_url, headers=req_header)
        if response.status_code != 200:
            return

        html_chapter_element = etree.HTML(response.text)
        chapter_lis = html_chapter_element.xpath('//ul[@class="cf"]/li')
        if not chapter_lis:
            # No chapter entries matched; avoid an IndexError on [0].
            return
        print(chapter_lis[0])
    
    if __name__ == "__main__":
        # Start from page 1 of the qidian.com "all" listing.
        url = (
            'https://www.qidian.com/all'
            '?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=1'
        )
        qidianSpider(start_url=url)
    

    相关文章

      网友评论

          本文标题:XPath选择器运用

          本文链接:https://www.haomeiwen.com/subject/qakrqqtx.html