Simple image scraping with Python

Author: cpp加油站 | Published 2017-07-31 10:05

    Once, the images on a web page could not be copied, so I wrote a small program to download them. It only works for the page used in the example below; to download images from other pages, the parsing rule has to be adapted (a sketch of one possible adaptation follows the listing).

    #coding=utf-8
    from bs4 import BeautifulSoup
    import aiohttp
    import asyncio
    from urllib import request
    
    # Callback that prints download progress; passed to urlretrieve as its reporthook
    def callbackFunc(blocknum, blocksize, totalsize):
        '''Progress callback
        @blocknum: number of data blocks downloaded so far
        @blocksize: size of one data block
        @totalsize: size of the remote file (-1 if the server does not report it)
        '''
        if totalsize <= 0:
            # No Content-Length reported, so a percentage cannot be computed
            return
        percent = 100.0*blocknum*blocksize/totalsize
        if percent > 100:
            percent = 100
        print("%.2f%%" % percent)
    
    # Fetch the page's HTML source
    async def getPage(url, res_list):
        headers = {'User-Agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers = headers) as resp:
                assert resp.status == 200
                res_list.append(await resp.text())
    
    # Parse the HTML source and find the tags we need
    class parsePage():
        def __init__(self, page_str):
            self.page_str = page_str
        # __enter__ and __exit__ work together to support the with...as statement
        def __enter__(self):
            page_str = self.page_str
            # This extraction rule only fits the example page; each HTML page needs its own analysis
            page = BeautifulSoup(page_str, 'lxml')
            images = page.find_all('img')
            art_urls = []
            for img in images:
                src = img.get('src')
                if src:  # skip <img> tags that have no src attribute
                    art_urls.append(src)
            return art_urls
        def __exit__(self, exc_type, exc_val, exc_tb):
            pass
    
    class jianshuParsePage(parsePage):
        def __init__(self, page_str):
            super(jianshuParsePage, self).__init__(page_str)
        
    '''
    Collect all image URLs on the page and download the images
    '''
    if __name__ == '__main__':
        page_url_base = 'http://www.baiyinghui.com/gallery/show/id/MDAwMDAwMDAwMLGIgtuFtKF2?spm=a220o.1000855.0.0.iuLTnS'
        page_urls = [page_url_base]
        loop = asyncio.get_event_loop()
        ret_list = []
        tasks = [getPage(host,ret_list) for host in page_urls]
        loop.run_until_complete(asyncio.wait(tasks))
    
        articles_url = []
        x = 0
        for ret in ret_list:
            with jianshuParsePage(ret) as tmp:
                for jpg in tmp:
                    # Filter out URLs containing special characters or Chinese text ('副本'),
                    # since those would need extra encoding before they could be downloaded
                    if jpg.find('(') != -1 or jpg.find('副本') != -1 or jpg.find(')') != -1 or jpg.find(' ') != -1:
                        pass
                    elif jpg.find('http') != -1:
                        print(jpg)
                        # Download the image from the extracted URL
                        request.urlretrieve(jpg, '/home/pycode/tupian/%s.jpg' % x, callbackFunc)
                        x += 1
                        # Break out here to keep testing quick
                        break
        ret_list = []
    
        loop.close()
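
    As noted in the comment inside __enter__, the rule for extracting image URLs is specific to the example page. To reuse the script on another site, one straightforward change is to subclass parsePage and override __enter__ with a rule that matches that site's markup. The sketch below is only an illustration: the class name otherSiteParsePage and the CSS selector 'div.photo-list img' are made-up placeholders, not anything from the example page, and would need to be replaced after inspecting the actual HTML.

    # A minimal sketch (assuming the parsePage class above) of adapting the
    # extraction rule to a different page layout.
    class otherSiteParsePage(parsePage):
        def __enter__(self):
            page = BeautifulSoup(self.page_str, 'lxml')
            art_urls = []
            # select() accepts a CSS selector, so the rule can be narrowed to a
            # specific container instead of grabbing every <img> on the page;
            # 'div.photo-list img' is a placeholder selector
            for img in page.select('div.photo-list img'):
                src = img.get('src')
                if src:
                    art_urls.append(src)
            return art_urls

    It would then be used exactly like jianshuParsePage in the main loop, e.g. with otherSiteParsePage(ret) as tmp: ...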
    
