美文网首页工作生活
processing threading downloading

processing threading downloading

作者: 狼无雨雪 | 来源:发表于2019-07-05 12:56 被阅读0次

多进程多线程爬取图片

import multiprocessing
import os
import random
import threading
import time
import urllib
# `import urllib` alone does not bind the `request` submodule in Python 3;
# the explicit import below is required for urllib.request.urlretrieve.
import urllib.request
import warnings
from multiprocessing import Lock

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
warnings.filterwarnings("ignore")
# os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'

# chrome_options = Options()
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--headless")

images_all = set()
# browser = webdriver.Chrome(chrome_options = chrome_options)
# browser = webdriver.Chrome()
# browser = webdriver.PhantomJS(executable_path="phantomjs.exe")

origial_urls = []
original_urls_temp = []

Threads_number = 40
Processes_number = 6
num_cpu=multiprocessing.cpu_count()
path = "huaban_big_images"
if not os.path.exists(path):
    os.makedirs(path)

# print("numbers of Threads: ",Threads_number)
# print("numbers of Processes: ",Processes_number)
# print("numbers of cpu: ",num_cpu)

with open("huaban_images_all.txt",'r',encoding="utf8") as read_file:
        lines = read_file.readlines()
        for index, line in enumerate(lines):
            url = line.strip()
            original_urls_temp.append(url)
            if (index + 1) % Threads_number == 0 or (index + 1) == len(lines):
                origial_urls.append(original_urls_temp)
                original_urls_temp = []
# origial_urls


# def start_thread():
#     print("fuck")
def get_image_url(index,index2, url, epoch, batch, index3):
    try:
        print("index: %d, epoch: %d, batch: %d, index3: %d, index2: %d, line: %s"%(index, epoch, batch, index3, index2, "start" + url))
        
        try:
            lock = Lock()
            lock.acquire()
            time.sleep(random.randint(1,5))
            filename = path +"/" + str("index%depoch%dbatch%dindex3%dindex2%d"%(index, epoch, batch, index3, index2)) + str(url.split("/")[-1]) + ".png"
            print("now is loading %s"%url)
            urllib.request.urlretrieve(url, filename = filename)
        except Exception as e:
            print("failt to fetch : %s"%url)
            print(e)
        finally:
            lock.release()
            
        print("index: %d, epoch: %d, batch: %d, index3: %d, index2: %d, line: %s"%(index, epoch, batch, index3, index2, "end" + url))
        
    except Exception as e:
        pass
    finally:
        pass
        
def running_processing(urls, index, epoch, batch, index3):
    threads = []
    print("start process %d number %d"%(batch, index3))
    for index2, url in enumerate(urls) :
        t = threading.Thread(target= get_image_url, args=(index, index2, url, epoch, batch, index3))
        threads.append(t)
    for index_i, thread in enumerate(threads):
#         thread.setDaemon(True)
        thread.start()
    for index_j, thread in enumerate(threads):
        thread.join()
    

    

if __name__ == '__main__': 
    epoch = 0
    batch = 0
    len_original_urls = len(origial_urls)
    temp_urls_set = []
    for index, urls in enumerate(origial_urls):
        temp_urls_set.append(urls)
        epoch += 1
        if (index + 1) % Processes_number == 0 or (index + 1) == len_original_urls:
            batch += 1
            multiThreads = []
            for index3, urls in enumerate(temp_urls_set):
                mt = multiprocessing.Process(target=running_processing,args=(urls, index, epoch, batch, index3))
                mt.start()
                multiThreads.append(mt)

            for index_j, mthread in enumerate(multiThreads):
                mthread.join()
            temp_urls_set = []

    print('program end:%s' %time.ctime())

相关文章

网友评论

    本文标题:processing threading downloading

    本文链接:https://www.haomeiwen.com/subject/tttdhctx.html