美文网首页Python
爬取简书文章内图片

爬取简书文章内图片

作者: heheddff | 来源:发表于2018-11-29 19:23 被阅读0次

    第一次写爬虫,还是爬简书文章内的图片,小记一下,希望各位大神给一下意见,能更好的学习python,感谢~

    spider_image.py文件负责调度

    from html_parser import HtmlParser
    from download import Download
    from url_manager import UrlManger
    from save_results import SaveResult
    
    class SpiderImages():
        """Coordinates the crawl: downloads pages, parses out image URLs,
        and hands results to the saver."""

        def __init__(self):
            # Wire up the collaborating components once per spider.
            self.download = Download()
            self.htmlparser = HtmlParser()
            self.urlmanager = UrlManger()
            self.saveresult = SaveResult()

        def run(self, urls):
            """Crawl every article URL in *urls* and save each image it contains.

            Image URLs found on an article page are fed back into the URL
            manager, so the inner loop drains the article page first and then
            all of its images.
            """
            count = 1
            for url in urls:
                # The article's URL slug doubles as the per-article directory name.
                file_dir = url.split('/')[-1]
                self.urlmanager.add_new_url(url)
                while self.urlmanager.has_new_url():
                    new_url = self.urlmanager.get_new_url()
                    html_cont = self.download.download(new_url)
                    if html_cont is None:
                        # Download failed (Download returns None on error);
                        # skip instead of crashing inside the parser/saver.
                        continue
                    new_urls, name, html_cont, t = self.htmlparser.parser(html_cont)
                    self.urlmanager.add_new_urls(new_urls)
                    self.saveresult.save(html_cont, file_dir, name, t)
                    print("{} {}".format(count, new_url))
                    count += 1

        def main(self, url):
            """Entry point; *url* is a list of article URLs to crawl."""
            self.run(url)
            
    if __name__ == "__main__":
        # Guard the crawl behind __main__ so importing this module for reuse
        # (or testing) does not trigger network traffic as a side effect.
        url = ["https://www.jianshu.com/p/cafdb41e186a","https://www.jianshu.com/p/d2a1490c785c","https://www.jianshu.com/p/cce86949fc9a"]
        spider = SpiderImages()
        spider.main(url)
    

    download.py文件负责网页内容下载

    import requests
    import os
    
    class Download():
        """HTTP fetcher for both article pages and image files."""

        def download(self, url):
            """Fetch *url*.

            Returns a dict with keys 'contents' (decoded text), 'imagename'
            (last path segment of the URL, used as the output file name) and
            'binary' (raw bytes, used for image payloads), or None when the
            request fails.
            """
            imagename = url.split('/')[-1]
            # A browser-like User-Agent avoids being rejected as a bot.
            headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
            try:
                res = requests.get(url, headers=headers)
                res.raise_for_status()
            except requests.RequestException:
                # Narrowed from a bare except: only network/HTTP failures are
                # expected here, and the caller checks for None explicitly.
                print("download error")
                return None
            # Force UTF-8 decoding of the HTML body.
            res.encoding = "utf-8"
            return {'contents': res.text, 'imagename': imagename, 'binary': res.content}
    

    html_parser.py文件负责解析网页内容

    from bs4 import BeautifulSoup
    class HtmlParser():
        """Extracts the author name and image URLs from a jianshu article page."""

        def parser(self, html_content):
            """Parse *html_content* (the dict produced by Download.download).

            Returns (image_urls, author_name, html_content, 1) for an article
            page, or ('', '', html_content, 0) when parsing fails — which is
            also how binary image responses get routed to SaveResult.saveimage.
            """
            try:
                soup = BeautifulSoup(html_content['contents'], "html.parser")
                name = soup.find("div", class_="author").find_all(class_='info')[0].find('a').text
                images = soup.find("div", class_="show-content-free").find_all('img')
                new_images = ["http:" + image['data-original-src'] for image in images]
                return new_images, name, html_content, 1
            except Exception:
                # Deliberate best-effort fallback (was a bare except, which also
                # swallowed KeyboardInterrupt/SystemExit): any parse failure
                # means "treat this response as an image to be saved".
                return '', '', html_content, 0
    

    url_manager.py负责url管理

    class UrlManger():
        """Tracks URLs waiting to be crawled and URLs already handed out."""

        def __init__(self):
            # Sets give O(1) membership tests and free de-duplication.
            self.wait_urls = set()
            self.downloaded_urls = set()

        def add_new_url(self, url):
            """Queue *url* unless it is falsy, already queued, or already crawled."""
            if url and self.checkaddwaiturl(url) and self.checkaddurldownload(url):
                self.wait_urls.add(url)

        def add_new_urls(self, urls):
            """Queue every URL in *urls*; a falsy/empty argument is a no-op."""
            if urls:
                for url in urls:
                    self.add_new_url(url)

        def has_new_url(self):
            """Return True while at least one URL is waiting."""
            return len(self.wait_urls) != 0

        def get_new_url(self):
            """Pop an arbitrary waiting URL and record it as crawled."""
            download_url = self.wait_urls.pop()
            self.downloaded_urls.add(download_url)
            return download_url

        def checkaddwaiturl(self, url):
            """Return True if *url* is not already in the waiting set."""
            return url not in self.wait_urls

        def checkaddurldownload(self, url):
            """Return True if *url* has not been crawled yet."""
            return url not in self.downloaded_urls
    

    save_results.py负责保存图片到本地

    import os
    
    class SaveResult():
        """Writes downloaded images to disk under a per-article directory."""

        # Base directory for all downloads.
        root = './download'

        def __init__(self):
            # Initialize the target dir so saveimage() no longer raises
            # AttributeError if save() sees an image before createDir() ran.
            self.__root = self.root

        def saveimage(self, res):
            """Write the binary payload of *res* into the current target dir."""
            imagename = '/'.join([self.__root.strip('/'), res['imagename']])
            imagename = self.__checkfiletype(imagename)
            if self.__checkfile(imagename):
                print('{} is exists'.format(imagename))
                return
            try:
                # 'with' closes the file even on error; the original's explicit
                # f.close() inside the with-block was redundant.
                with open(imagename, "wb") as f:
                    f.write(res['binary'])
            except OSError:
                # Narrowed from a bare except: only filesystem errors expected.
                print("save image fail")

        def createDir(self, path):
            """Point the saver at root/path, creating the directory if missing."""
            self.__root = '/'.join([self.root, path])
            print(self.__root)
            # exist_ok removes the race between the old exists() check and makedirs.
            os.makedirs(self.__root, exist_ok=True)

        def __checkfile(self, filename):
            # True when the file already exists on disk (skip re-downloads).
            return os.path.exists(filename)

        def __checkfiletype(self, filename):
            # Append a default .jpg extension when the name has none.
            return filename if filename.rfind('.') > 0 else '.'.join([filename, 'jpg'])

        def save(self, contents, file_dir, author, t):
            """Dispatch on *t*: 1 = article page (create its dir), 0 = image (write it)."""
            if t == 1:
                self.createDir('/'.join([author, file_dir]))
            else:
                self.saveimage(contents)
    
    

    相关文章

      网友评论

        本文标题:爬取简书文章内图片

        本文链接:https://www.haomeiwen.com/subject/bntqcqtx.html