美文网首页
python爬取百度贴吧的图片2

python爬取百度贴吧的图片2

作者: lunabird | 来源:发表于2015-11-25 20:57 被阅读239次

    今天看了一下 BeautifulSoup 库的用法,把昨天《python爬取百度贴吧的图片1》的代码改成用 BeautifulSoup 库的函数来实现。用得还不太熟练,但感觉比正则表达式写起来容易一些。

    # -*- coding: utf-8 -*-
    import os
    import re
    import urllib
    import urllib2

    from bs4 import BeautifulSoup
    
    class imgTest:
        """Download the images posted in one Baidu Tieba thread.

        Pages are fetched with urllib2, parsed with BeautifulSoup's
        'html.parser', and every <img class="BDE_Image"> tag found on a page
        is treated as an image to save.
        """

        def __init__(self, baseUrl, seeLZ):
            """Store the thread URL and build the see_lz query fragment.

            baseUrl -- thread URL without a query string.
            seeLZ   -- value sent as the 'see_lz' query parameter.
            """
            self.baseUrl = baseUrl
            self.seeLZ = '?see_lz=' + str(seeLZ)

        def printToLog(self, mystr):
            """Append mystr followed by a newline to txt/log.txt."""
            # 'with' closes the file even when write() raises.
            with open('txt/log.txt', 'a') as logFile:
                logFile.write(mystr + "\n")

        def getPage(self, pageNum):
            """Return the raw HTML of page pageNum, or None on URLError.

            A failure is reported on stdout instead of being raised, so every
            caller has to cope with a None result.
            """
            url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
            try:
                response = urllib2.urlopen(urllib2.Request(url))
                try:
                    return response.read()
                finally:
                    # The response object is not closed automatically.
                    response.close()
            except urllib2.URLError as e:
                # Fall back to the exception itself when it has no 'reason',
                # so the failure is never swallowed silently.
                print("failed to connect baidutieba. %s"
                      % (getattr(e, "reason", e),))
                return None

        def _getSoup(self, pageNum):
            """Return page pageNum parsed, or None if it was not fetched."""
            page = self.getPage(pageNum)
            if page is None:
                return None
            return BeautifulSoup(page, 'html.parser')

        def getPageNum(self):
            """Return the text of the second <span class="red"> on page 1.

            The caller converts this text to the number of pages. Returns
            None when page 1 could not be fetched or has fewer than two such
            spans.
            """
            soup = self._getSoup(1)
            if soup is None:
                return None
            redSpans = soup.find_all("span", class_='red')
            if len(redSpans) < 2:
                return None
            return redSpans[1].string

        def getTitle(self):
            """Return the text of the first <h3> on page 1, or None."""
            soup = self._getSoup(1)
            if soup is None or soup.h3 is None:
                return None
            return soup.h3.string

        def getAllImageURLs(self, pageNum):
            """Return the 'src' of every <img class="BDE_Image"> on a page.

            Tags without a 'src' are skipped. An unfetchable page yields an
            empty list. The list is also printed to stdout.
            """
            imgURLs = []
            soup = self._getSoup(pageNum)
            if soup is not None:
                for tag in soup.find_all("img", class_="BDE_Image"):
                    src = tag.get('src')
                    # A None src would crash saveImgs() when it is split.
                    if src:
                        imgURLs.append(src)
            print(imgURLs)
            return imgURLs

        def saveImg(self, imageURL, filename):
            """Download imageURL and write its bytes to filename.

            The directory part of filename is created when it is missing.
            """
            directory = os.path.dirname(filename)
            if directory and not os.path.isdir(directory):
                os.makedirs(directory)
            u = urllib.urlopen(imageURL)
            try:
                data = u.read()
            finally:
                u.close()
            with open(filename, 'wb') as out:
                out.write(data)

        @staticmethod
        def _extensionOf(imageURL):
            """Return the file extension to use for imageURL.

            The query string and fragment are ignored. Anything that is not
            a 1-3 character alphanumeric suffix falls back to "jpg".
            """
            path = imageURL.split('#', 1)[0].split('?', 1)[0]
            tail = path.rsplit('.', 1)[-1]
            if not tail.isalnum() or len(tail) > 3:
                return "jpg"
            return tail

        def saveImgs(self, images, name, num):
            """Save each URL in images as <name>/<n>.<ext>.

            images -- iterable of image URLs.
            name   -- directory the files are written into.
            num    -- number used for the first file; it increases by one
                      for every following image.
            """
            number = num
            for imageURL in images:
                fileName = (name + "/" + str(number) + "."
                            + self._extensionOf(imageURL))
                self.saveImg(imageURL, fileName)
                number += 1
    
    
    # Driver: download every image of one thread into the "pic" directory,
    # numbering the files consecutively across pages.
    baseURL = 'http://tieba.baidu.com/p/3925387672'
    imgtest = imgTest(baseURL,1)
    # getPageNum() returns a BeautifulSoup .string, which is None when the tag
    # has no single string child; int(None) would raise TypeError.
    pageNumText = imgtest.getPageNum()
    if pageNumText is None:
        print("could not read the page count, nothing downloaded")
        totalnum = 0
    else:
        totalnum = int(pageNumText)

    imageCount = 0
    for i in range(1, totalnum+1):
        imageURLs = imgtest.getAllImageURLs(i)
        # imageCount is the number given to the first image of this page.
        imgtest.saveImgs(imageURLs,"pic",imageCount)
        imageCount += len(imageURLs)
        # Running total of images handled so far.
        print(imageCount)
    

    附上beautifulsoup的文档看看吧,就酱。

    相关文章

      网友评论

          本文标题:python爬取百度贴吧的图片2

          本文链接:https://www.haomeiwen.com/subject/fjfdhttx.html