美文网首页
threading模块多线程下载斗图图片

threading模块多线程下载斗图图片

作者: Kwen_9527 | 来源:发表于2017-08-02 11:53 被阅读0次

    涉及内容：
    lxml 模块的 CSS 选择器
    requests 库
    threading 多线程
    测试环境为 Python 2.7

    #coding=utf8
    
    import re
    import os
    import lxml.html
    import requests
    import threading
    import time
    
    def get_url(url):
        """Fetch one listing page and start downloads for the images on it.

        Downloads ``url``, selects every ``a.col-xs-6.col-sm-3`` element,
        extracts the image address from the ``data-original`` attribute found
        in each element's serialized HTML, and uses the element's text (with
        newlines and spaces removed) as the title. The collected addresses
        and titles are handed to ``start_save_img``.

        Elements that carry no ``data-original`` address are skipped instead
        of aborting the whole page.

        :param url: address of the listing page to scrape.
        """
        html = requests.get(url).content
        soup = lxml.html.fromstring(html)
        src_list = soup.cssselect('a.col-xs-6.col-sm-3')  # anchors wrapping each image
        # Compiled once per page instead of once per element. The scheme is
        # optional and may be http or https; only the part after "//" is
        # captured, so the address is returned without a scheme.
        img_pattern = re.compile(r'data-original="(?:https?:)?//(.*?)"')
        url_list = []
        title_list = []
        for src in src_list:
            match = img_pattern.search(lxml.html.tostring(src))
            if match is None:
                # No image address on this anchor; nothing to download.
                continue
            url_list.append(match.group(1))
            # Title is the anchor's text with newlines and spaces stripped out.
            title = src.text_content().replace('\n', '').replace(' ', '').strip()
            title_list.append(title)
        start_save_img(url_list, title_list)
    
    def save_img(img_url,title):
        """Download one image and write it to ``img/<title><extension>``.

        :param img_url: image address without a scheme; ``http://`` is
            prepended before the request is made.
        :param title: file name stem; it is encoded to UTF-8 before being
            placed in the path.
        """
        img_url = 'http://' + img_url
        # Every dot is escaped so that only a literal ".gif" (not e.g. "xgif")
        # is recognised as an extension.
        ext_match = re.search(r'\.(?:jpg|png|gif)', img_url)
        if ext_match is not None:
            img_format = ext_match.group()
        else:
            # Unknown type: fall back to whatever extension the address path
            # ends with (possibly none) instead of crashing on a missing match.
            img_format = os.path.splitext(img_url.split('?', 1)[0])[1]
        img_content = requests.get(img_url).content
        # Parenthesised form prints the same text on Python 2 and also parses
        # on Python 3.
        print('正在下载' + img_url)
        # This function is the target of many threads started together, so a
        # check-then-create sequence can race. Attempt the mkdir and re-raise
        # only if the directory still does not exist afterwards.
        try:
            os.mkdir('img')
        except OSError:
            if not os.path.isdir('img'):
                raise
        with open('img/{}{}'.format(title.encode('utf-8'), img_format), 'wb') as f:
            f.write(img_content)
    
    def start_save_img(url_list,title_list):
        """Launch one download thread per image address.

        Each thread runs ``save_img`` with the address at a given position in
        ``url_list`` and the title at the same position in ``title_list``.
        Threads are started immediately and are not joined here.

        :param url_list: image addresses to download.
        :param title_list: titles, indexed in parallel with ``url_list``.
        """
        for position, address in enumerate(url_list):
            worker = threading.Thread(
                target=save_img,
                args=(address, title_list[position]),
            )
            worker.start()
    
    def start(first_page=1, last_page=9, delay=3):
        """Scrape a range of listing pages, pausing after each one.

        With no arguments this visits pages 1 through 9 and sleeps three
        seconds after each page.

        :param first_page: number of the first listing page to fetch.
        :param last_page: number of the last listing page to fetch (inclusive).
        :param delay: seconds to sleep after each page has been processed.
        """
        for page in range(first_page, last_page + 1):
            url = 'https://www.doutula.com/photo/list/?page={}'.format(page)
            get_url(url)
            time.sleep(delay)
    
    # Run the crawl only when this file is executed as a script, not when it
    # is imported as a module.
    if __name__ == '__main__':
        start()
    

    相关文章

      网友评论

          本文标题:threading模块多线程下载斗图图片

          本文链接:https://www.haomeiwen.com/subject/lqmelxtx.html