美文网首页
爬取百度贴吧图片

爬取百度贴吧图片

作者: yousa_ | 来源:发表于2019-09-26 13:41 被阅读0次
    import re
    # 正则表达式模块,用来匹配图片地址
    import urllib.request
    # 用来获取HTML源码
    import sys
    import os
    import re
    
    def geturls(path):
        """Read the list of page URLs stored in the text file at ``path``.

        The first three characters of every line are discarded before the
        rest is taken as the URL (presumably an index or marker prefix --
        confirm against the actual input file).

        :param path: path of a UTF-8 encoded text file, one entry per line
        :return: list of URL strings with surrounding whitespace removed;
            lines that are empty once the prefix is dropped are skipped
        """
        urls = []
        with open(path, 'r', encoding='utf-8') as f1:
            for line in f1:
                # Drop the prefix first, then strip the line terminator that
                # file iteration keeps; a URL ending in '\n' is not valid.
                url = line[3:].strip()
                if url:
                    urls.append(url)
        print('get urls list ready!')
        return urls
    
    def getHtml(urls):
        """Lazily download each URL and yield its raw response body.

        Nothing is fetched until the generator is iterated; the zero-based
        index of each URL is printed just before it is requested.

        :param urls: iterable of URL strings
        :return: generator yielding each response body as ``bytes``, in the
            same order as ``urls``
        """
        for num, url in enumerate(urls):
            print(num)
            # The context manager closes the response even if read() raises,
            # so connections are not leaked while crawling many pages.
            with urllib.request.urlopen(url) as page:
                html = page.read()
            yield html
        print('get htmls list ready!')
    
    def getpic(htmls, dir):
        """Download every matching JPEG referenced by the given HTML pages.

        Each page is scanned for ``src="https://imgsa....jpg"`` attributes and
        every match is saved as ``<dir>/<counter>.jpg``. The counter is shared
        across all pages and advances even when a download fails, so file
        names stay aligned with match order.

        :param htmls: iterable of raw HTML pages as ``bytes``
        :param dir: output directory; created if it does not exist
        :return: None
        """
        os.makedirs(dir, exist_ok=True)

        # Compiled once: the pattern does not change between pages.
        imgre = re.compile(r'src="(https://imgsa.*?\.jpg)"')

        imgName = 0
        # A plain for-loop visits every page exactly once and stops cleanly
        # when the iterator is exhausted (calling __next__ twice per pass
        # would skip pages and end with an uncaught StopIteration).
        for html in htmls:
            # The pattern is pure ASCII, so replacing undecodable bytes
            # cannot hide a match but avoids aborting on a malformed page.
            imList = imgre.findall(html.decode('utf-8', errors='replace'))
            print(imList)

            # Download the images.
            for imgPath in imList:
                # NOTE: downloads are sequential; a thread pool would be the
                # natural next step if throughput matters.
                try:
                    # Fetch first so a failed download does not leave an
                    # empty file behind.
                    with urllib.request.urlopen(imgPath) as response:
                        data = response.read()
                    with open(os.path.join(dir, str(imgName) + ".jpg"), 'wb') as f:
                        f.write(data)
                    print(imgPath)
                except Exception:
                    # Best effort: report the failure and continue.
                    print(imgPath + " error")
                imgName += 1
    
    def main():
        """Run the pipeline: read the URL list, fetch pages, save images."""
        # getHtml returns a lazy generator, so pages are downloaded one at a
        # time as getpic consumes them.
        page_stream = getHtml(geturls('_防诈骗.txt'))
        getpic(page_stream, 'pic')
    
    
    # Run the scraper only when this file is executed as a script, not when
    # it is imported as a module.
    if __name__ == '__main__':
        main()
    

    相关文章

      网友评论

          本文标题:爬取百度贴吧图片

          本文链接:https://www.haomeiwen.com/subject/gqpyyctx.html