XPath爬取百度贴吧链接里的图片

作者: 博行天下 | 来源:发表于2017-11-11 19:52 被阅读41次
lxml 先将 HTML 文件解析成 XML 文档(元素树),然后用 XPath 表达式在其中查找 HTML 节点或元素;更多 XPath 用法请查看 XPath 学习文档。
# -*- coding:utf-8 -*-

import urllib2
import lxml.etree

class GetImage(object):
    """Download the post images of threads listed on Baidu Tieba forum pages.

    The forum is selected by the hard-coded ``kw`` query parameter in
    ``get_xpath``; the page range is read interactively from stdin.
    Downloaded files are written below the relative directory ``images/``.
    """

    def __init__(self):
        # Site root; the relative thread links found on listing pages are
        # appended to it.
        self.tieba = "https://tieba.baidu.com"
        # Step of the ``pn`` query offset between consecutive listing pages.
        self.count = 50

    def get_html(self, url):
        """Fetch *url* and return the raw response body.

        Args:
            url: Absolute URL to request.

        Returns:
            Whatever ``response.read()`` yields for the opened URL.

        Any exception raised by ``urllib2.urlopen`` or by the read itself
        propagates to the caller.
        """
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even when the read fails.
            response.close()

    def get_xpath(self):
        """Prompt for a page range and save every image found in that range.

        For each listing page the thread links are extracted with XPath,
        each thread page is fetched, its URL is printed, and every
        ``img.BDE_Image`` source is downloaded into ``images/``.

        Raises:
            ValueError: if either prompt answer is not an integer.
        """
        # First page to crawl.
        begin_page = int(raw_input("请输入起始页:"))
        # Last page to crawl (inclusive).
        end_page = int(raw_input("请输入结束页:"))
        for page_number in range(begin_page, end_page + 1):
            # Listing pages are addressed by item offset, not by page index.
            pn = (page_number - 1) * self.count
            list_url = self.tieba + "/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=" + str(pn)
            list_tree = lxml.etree.HTML(self.get_html(list_url))
            # Exact-match on the class attribute; the trailing space inside
            # the quotes is part of the value being matched.
            thread_links = list_tree.xpath('//a[@class="j_th_tit "]/@href')

            for thread_link in thread_links:
                thread_url = self.tieba + thread_link
                thread_tree = lxml.etree.HTML(self.get_html(thread_url))
                print(thread_url)
                image_urls = thread_tree.xpath('//img[@class="BDE_Image"]//@src')
                for image_url in image_urls:
                    self._save_image(image_url, self.get_html(image_url))

    def _save_image(self, image_url, data):
        """Write *data* to ``images/`` under the last 10 characters of *image_url*.

        Args:
            image_url: URL the data came from; only its tail is used as the
                file name.
            data: Raw image bytes to store.
        """
        # Local import: the file's import block does not provide ``os``.
        import os

        # Create the target directory on first use instead of failing.
        if not os.path.isdir("images"):
            os.makedirs("images")
        # "wb" rather than "a": image data is binary, and a re-run must
        # replace the file instead of appending a second copy to it.
        with open("images/" + image_url[-10:], "wb") as image_file:
            image_file.write(data)

if __name__ == "__main__":
    # Script entry point: build the downloader and start the interactive crawl.
    GetImage().get_xpath()

相关文章

网友评论

    本文标题:XPath爬取百度贴吧链接里的图片

    本文链接:https://www.haomeiwen.com/subject/iprgmxtx.html