XPath爬取百度贴吧链接里的图片

作者: 博行天下 | 来源:发表于2017-11-11 19:52 被阅读41次
    先用 lxml 将 HTML 文件解析成 XML 文档树,然后用 XPath 表达式在其中查找 HTML 节点或元素,更多 XPath 用法请查看 XPath 学习文档
    # -*- coding:utf-8 -*-
    
    import os
    import urllib2

    import lxml.etree
    
    class GetImage(object):
        """Crawl a Baidu Tieba forum listing and download the images in its threads.

        Attributes:
            tieba: Base URL that the relative thread links are appended to.
            count: Step of the ``pn`` offset between consecutive listing pages
                (presumably the number of threads shown per page -- confirm).
        """

        # Directory (relative to the current working directory) for downloads.
        IMAGE_DIR = "images"

        def __init__(self):
            self.tieba = "https://tieba.baidu.com"
            self.count = 50

        def get_html(self, url):
            """Fetch ``url`` and return the raw response body.

            Args:
                url: Absolute URL to request.

            Returns:
                The response body exactly as read from the connection.

            Raises:
                urllib2.URLError: Propagated from ``urllib2.urlopen`` on failure.
            """
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            # urllib2 responses are not context managers in Python 2, so close
            # explicitly to avoid leaking one socket per downloaded page/image.
            try:
                return response.read()
            finally:
                response.close()

        def _save_image(self, image_url, data):
            """Write downloaded image bytes below ``IMAGE_DIR`` and return the path.

            The file name is the last 10 characters of ``image_url``, as in the
            original script.
            """
            if not os.path.isdir(self.IMAGE_DIR):
                os.makedirs(self.IMAGE_DIR)
            target = os.path.join(self.IMAGE_DIR, image_url[-10:])
            # "wb" rather than "a": the payload is binary, and a repeated
            # download must overwrite the file instead of appending to it.
            with open(target, "wb") as out:
                out.write(data)
            return target

        def get_xpath(self):
            """Prompt for a page range, then download every thread image in it.

            Reads the first and last listing page (inclusive) from standard
            input, collects the thread links on each listing page, and saves
            every ``img.BDE_Image`` source found in each thread.
            """
            # First page of the range.
            begin_page = int(raw_input("请输入起始页:"))
            # Last page of the range (inclusive).
            end_page = int(raw_input("请输入结束页:"))
            for page_number in range(begin_page, end_page + 1):
                # The listing is paginated by item offset, not by page number.
                pn = (page_number - 1) * self.count
                list_url = self.tieba + "/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=" + str(pn)
                list_tree = lxml.etree.HTML(self.get_html(list_url))
                thread_links = list_tree.xpath('//a[@class="j_th_tit "]/@href')

                for link in thread_links:
                    thread_url = self.tieba + link
                    thread_tree = lxml.etree.HTML(self.get_html(thread_url))
                    print(thread_url)
                    image_urls = thread_tree.xpath('//img[@class="BDE_Image"]//@src')
                    for image_url in image_urls:
                        self._save_image(image_url, self.get_html(image_url))
    
    if __name__ == "__main__":
        # Script entry point: build the crawler and start the interactive run.
        crawler = GetImage()
        crawler.get_xpath()
    

    相关文章

      网友评论

        本文标题:XPath爬取百度贴吧链接里的图片

        本文链接:https://www.haomeiwen.com/subject/iprgmxtx.html