
Fully Automated Image Scraping: Keep Only the NSFW Pics

Author: 慕幕沐 | Published 2018-05-06 20:27

    I'm not great at reinventing wheels, so I borrowed this blogger's code and improved it:

    http://blog.csdn.net/xiligey1/article/details/73321152
    

    The libraries you'll need are requests, nude, urllib, threading, and so on.

    The main one to watch is nude: installing it under pypy inexplicably failed for me, so I could only run this code with Python 2.
    First, the blogger's original code:

    # coding=utf-8
    """根据搜索词下载百度图片"""
    import re
    import sys
    import urllib
    
    import requests
    
    
    def get_onepage_urls(onepageurl):
        """获取单个翻页的所有图片的urls+当前翻页的下一翻页的url"""
        if not onepageurl:
            print('Reached the last page, stopping')
            return [], ''
        try:
            html = requests.get(onepageurl).text
        except Exception as e:
            print(e)
            pic_urls = []
            fanye_url = ''
            return pic_urls, fanye_url
        pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
        fanye_urls = re.findall(re.compile(r'<a href="(.*)" class="n">下一页</a>'), html, flags=0)  # "下一页" is the "next page" link in Baidu's HTML
        fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
        return pic_urls, fanye_url
    
    
    def down_pic(pic_urls):
        """给出图片链接列表, 下载所有图片"""
        for i, pic_url in enumerate(pic_urls):
            try:
                pic = requests.get(pic_url, timeout=15)
                string = str(i + 1) + '.jpg'
                with open(string, 'wb') as f:
                    f.write(pic.content)
                print('Downloaded image %s: %s' % (str(i + 1), str(pic_url)))
            except Exception as e:
                print('Failed to download image %s: %s' % (str(i + 1), str(pic_url)))
                print(e)
                continue
    
    
    if __name__ == '__main__':
        keyword = '苍老师'  # search keyword; change it to whatever you want, just like typing it into Baidu Images
        url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
        url_init = url_init_first + urllib.quote(keyword, safe='/')
        all_pic_urls = []
        onepage_urls, fanye_url = get_onepage_urls(url_init)
        all_pic_urls.extend(onepage_urls)
    
        fanye_count = 0  # cumulative page count
        while 1:
            onepage_urls, fanye_url = get_onepage_urls(fanye_url)
            fanye_count += 1
            print('Page %s' % fanye_count)
            if fanye_url == '' and onepage_urls == []:
                break
            all_pic_urls.extend(onepage_urls)
    
        down_pic(list(set(all_pic_urls)))
    
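    A side note on the code above: urllib.quote is the Python 2 API (urllib.parse.quote in Python 3); it percent-encodes the UTF-8 keyword before it is appended to the query string. For example:

    # Python 2: percent-encode a UTF-8 keyword for the Baidu query string
    import urllib

    print(urllib.quote('苍老师', safe='/'))  # -> %E8%8B%8D%E8%80%81%E5%B8%88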

    But doing it this way is far too slow: a few thousand images, downloaded one at a time.

    So I switched to multithreading; since I was worried about crashes and hangs, I used four threads.

    You can adjust the thread count to fit your own situation.

    After the changes:

    # coding=utf-8
    import re
    import threading
    import urllib

    import requests

    wrong = 0  # failure counter shared by the download threads


    def get_onepage_urls(onepageurl):
        """Get all image URLs on one result page, plus the URL of the next page."""
        if not onepageurl:
            print('Reached the last page, stopping')
            return [], ''
        try:
            html = requests.get(onepageurl).text
        except Exception as e:
            print(e)
            pic_urls = []
            fanye_url = ''
            return pic_urls, fanye_url
        pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
        fanye_urls = re.findall(re.compile(r'<a href="(.*)" class="n">下一页</a>'), html, flags=0)  # "下一页" is the "next page" link in Baidu's HTML
        fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
        return pic_urls, fanye_url
    
    
    def down_pic(pic_urls):
        """Given a list of [index, url] pairs, download every image."""
        global wrong
        for i, pic_url in pic_urls:
            try:
                pic = requests.get(pic_url, timeout=15)
                string = str(i) + '.jpg'  # i is already 1-based here
                with open(string, 'wb') as f:
                    f.write(pic.content)
                    print('Downloaded image %s: %s' % (str(i), str(pic_url)))
            except Exception as e:
                print('Failed to download image %s: %s' % (str(i), str(pic_url)))
                print(e)
                wrong = wrong + 1
                continue
    
    if __name__ == '__main__':
        keyword = '美女'  # search keyword; change it to whatever you want, just like typing it into Baidu Images
        url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
        url_init = url_init_first + urllib.quote(keyword, safe='/')
        all_pic_urls = []
        onepage_urls, fanye_url = get_onepage_urls(url_init)
        all_pic_urls.extend(onepage_urls)
    
        fanye_count = 0  # cumulative page count
        while 1:
            onepage_urls, fanye_url = get_onepage_urls(fanye_url)
            fanye_count += 1
            print('Page %s' % fanye_count)
            if fanye_url == '' and onepage_urls == []:
                break
            all_pic_urls.extend(onepage_urls)
        # Number the de-duplicated URLs from 1 so the threads write unique file names
        The_list = [[x + 1, url] for x, url in enumerate(set(all_pic_urls))]
        all_num = The_list[-1][0]
        num = all_num // 4
        print('%s images in total' % all_num)
        # One chunk per thread; the last slice runs to the end
        # (the original The_list[3*num:-1] silently dropped the final URL)
        list1 = The_list[0:num]
        list2 = The_list[num:2 * num]
        list3 = The_list[2 * num:3 * num]
        list4 = The_list[3 * num:]
        threads = []
        t1 = threading.Thread(target=down_pic, args=(list1,))
        threads.append(t1)
        t2 = threading.Thread(target=down_pic, args=(list2,))
        threads.append(t2)
        t3 = threading.Thread(target=down_pic, args=(list3,))
        threads.append(t3)
        t4 = threading.Thread(target=down_pic, args=(list4,))
        threads.append(t4)
        for t in threads:
            t.setDaemon(True)
            t.start()
        for t in threads:
            t.join()
        print "over"
        global wrong
        print "共"+str(wrong)+"张下载失败"
        #down_pic(list(set(all_pic_urls)))
    

    Some of my changes are a bit rough.
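    For example, the hand-rolled four-way slicing in the listing above is easy to get wrong: the post's original The_list[3*num:-1] silently dropped the last URL (corrected to The_list[3*num:] above). A generic splitter, my own hypothetical helper rather than anything from the original code, sidesteps that whole class of bug:

    def split_chunks(items, n):
        """Split items into near-equal chunks for up to n workers, keeping the tail."""
        size = (len(items) + n - 1) // n  # ceiling division
        return [items[i:i + size] for i in range(0, len(items), size)]

    With it, list1 through list4 could simply become split_chunks(The_list, 4).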

    Anyway, after running it,

    you have 1.jpg through 2000.jpg sitting locally (assuming 2000 images).

    Next comes the screening.

    import os
    import threading

    import nude
    

    A quick introduction to nude (https://github.com/hhatto/nude.py):

    it's a nudity-detection library; the basic idea is to find skin-colored regions in the image and analyze them (written by a Japanese developer).

    We can use it in reverse:

    if an image is flagged as explicit, keep it; otherwise delete it with os.remove.
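    For reference, the library's basic calls look like this (following the project's README as best I recall; '1.jpg' is just an example file name):

    # Quick nude.py sanity check; both forms come from the project's README
    import nude
    from nude import Nude

    print(nude.is_nude('1.jpg'))   # plain True / False

    n = Nude('1.jpg')
    n.parse()                      # run the skin-region analysis
    print(n.result, n.inspect())   # result flag plus a short report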

    def nude_yesorno(num):
        """Return True if num.jpg is flagged as explicit, False otherwise or on error."""
        try:
            res = nude.is_nude(str(num) + '.jpg')
            return res
        except:
            return False


    def panduan(first, bigest):
        """Check images first .. bigest-1, deleting the ones that are not flagged."""
        for i in range(first, bigest):
            res = nude_yesorno(i)
            if res == False:
                try:
                    os.remove(str(i) + ".jpg")
                except:
                    pass
            print('%s %s' % (i, res))
    

    But with two thousand images, checking them one by one is still slow,

    so multithreading once more:

    def thread_panduan(biggest):
        """Screen 1.jpg .. biggest.jpg using four threads."""
        quarter = biggest // 4
        # Cover the whole 1..biggest range; the last chunk takes the remainder
        # (the original divided by 5 and never screened the final fifth)
        ranges = [(1, quarter + 1),
                  (quarter + 1, 2 * quarter + 1),
                  (2 * quarter + 1, 3 * quarter + 1),
                  (3 * quarter + 1, biggest + 1)]
        threads = []
        for first, last in ranges:
            t = threading.Thread(target=panduan, args=(first, last))
            threads.append(t)
        for t in threads:
            t.start()  # the original version never started its threads
        for t in threads:
            t.join()
        print('over')
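    To run the whole screening pass you would call something like thread_panduan(2000), assuming the download step left 1.jpg through 2000.jpg in the current directory (2000 is just the example count from earlier).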
    

    OK.

    For stability I kept the two scripts as separate files; downloading and screening at the same time would probably be much less fault-tolerant.

    The same approach should also work for screening videos: grab frames, run the check on each. I'll write that up properly when I have time; a rough sketch of the idea follows.
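    To make that concrete, here is my own rough sketch of the frame-screening idea (not code from this post), assuming OpenCV (cv2) is available for frame grabbing; video_is_nude and frame_tmp.jpg are hypothetical names:

    # Hypothetical sketch: sample frames from a video and screen them with nude.py
    import cv2
    import nude


    def video_is_nude(video_path, step=100):
        """Sample every step-th frame; stop at the first flagged one."""
        cap = cv2.VideoCapture(video_path)
        index = 0
        found = False
        while True:
            ok, frame = cap.read()
            if not ok:  # end of stream or a read error
                break
            if index % step == 0:
                # nude.is_nude expects a file, so dump the frame to disk first
                cv2.imwrite('frame_tmp.jpg', frame)
                if nude.is_nude('frame_tmp.jpg'):
                    found = True
                    break
            index += 1
        cap.release()
        return found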

    GitHub: https://github.com/Muxxs/pic_nude
    blog: http://muxxs.com

    Feedback and discussion welcome.
