美文网首页工作生活
processing threading downloading

processing threading downloading

作者: 狼无雨雪 | 来源:发表于2019-07-05 12:57 被阅读0次

添加设置 setDaemon和 join(timeout)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import os
from bs4 import BeautifulSoup
import random
import threading
import multiprocessing
import warnings
from multiprocessing import Lock
warnings.filterwarnings("ignore")
# os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'

# chrome_options = Options()
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--headless")
# browser = webdriver.Chrome(chrome_options = chrome_options)
images_all = set()

# browser = webdriver.Chrome()
# browser = webdriver.PhantomJS(executable_path="phantomjs.exe")

origial_urls = []
original_urls_temp = []

Threads_number = 40
Processes_number = 6
num_cpu=multiprocessing.cpu_count()

# print("numbers of Threads: ",Threads_number)
# print("numbers of Processes: ",Processes_number)
# print("numbers of cpu: ",num_cpu)

with open("huaban_pin_asserts_all.txt",'r',encoding="utf8") as read_file:
        lines = read_file.readlines()
        for index, line in enumerate(lines):
            url = "http://huaban.com" + line.strip()
            original_urls_temp.append(url)
            
            if (index + 1) % Threads_number == 0 or (index + 1) == len(lines):
                origial_urls.append(original_urls_temp)
                original_urls_temp = []
# origial_urls


# def start_thread():
#     print("fuck")
def get_image_url(index,index2, url, epoch, batch, index3):
    browser = webdriver.PhantomJS()
    browser.set_page_load_timeout(10000)
    browser.set_script_timeout(10000)
    try:
#         chrome_options = Options()
#         chrome_options.add_argument("--disable-gpu")
#         chrome_options.add_argument("--headless")
#         browser = webdriver.Chrome(chrome_options = chrome_options)
#         browser = webdriver.PhantomJS()
#         browser.set_page_load_timeout(10000)
#         browser.set_script_timeout(10000)
        time.sleep(random.randint(1,5))
        browser.get(url)
        line = None
        try:
            img1 = browser.find_element_by_xpath('//*[@id="baidu_image_holder"]/a/img')
            if img1 != None:
                images_all.add(img1.get_attribute('src'))
                line = img1.get_attribute('src')
        except Exception as e:
            pass


        try:
            img2 = browser.find_element_by_xpath('//*[@id="baidu_image_holder"]/img')
            if img2 != None:
                images_all.add(img2.get_attribute('src'))
                line = img2.get_attribute('src')
        except Exception as e:
            pass
        
        try:
            lock = Lock()
            lock.acquire()
            with open("processing_threading_huaban_big_images_all_urls.txt",'a',encoding="utf8") as write_temp_file:
                write_temp_file.write(line + "\n")
        except Exception as e:
            print("failt to fetch : %s"%url)
            print(e)
        finally:
            lock.release()
            
        
        print("index: %d, epoch: %d, batch: %d, index3: %d, index2: %d, line: %s"%(index, epoch, batch, index3, index2, line))
    except Exception as e:
        pass
    finally:
        browser.close()

        
def running_processing(urls, index, epoch, batch, index3):
#     print("start")
    threads = []
    print("start process %d number %d"%(batch, index3))
    for index2, url in enumerate(urls) :
        t = threading.Thread(target= get_image_url, args=(index, index2, url, epoch, batch, index3))
        threads.append(t)
    for index_i, thread in enumerate(threads):
        thread.setDaemon(True)
        thread.start()
    for index_j, thread in enumerate(threads):
        thread.join(5)
#     print("epoch %d finished in %s"%(epoch, time.ctime()))
    

    

if __name__ == '__main__': 
    epoch = 0
    batch = 0
    len_original_urls = len(origial_urls)
    temp_urls_set = []
    for index, urls in enumerate(origial_urls):
        temp_urls_set.append(urls)
        epoch += 1
        if (index + 1) % Processes_number == 0 or (index + 1) == len_original_urls:
            batch += 1
            multiThreads = []
            for index3, urls in enumerate(temp_urls_set):
#                 print(urls)
                mt = multiprocessing.Process(target=running_processing,args=(urls, index, epoch, batch, index3))
#                 mt = multiprocessing.Process(target=start_thread)
                mt.start()
                multiThreads.append(mt)
#             for index_i, mthread in enumerate(multiThreads):
#                 mthread.start()
            for index_j, mthread in enumerate(multiThreads):
                mthread.join()
            temp_urls_set = []
#             print("end of batch: ",batch)
        
#     with open("huaban_big_images_all_urls.txt",'w',encoding="utf8") as write_file:
#         for line in images_all:
#             write_file.write(str(line) + "\n")
#     print("images_all")
#     print(images_all)
    print('program end:%s' %time.ctime())

相关文章

网友评论

    本文标题:processing threading downloading

    本文链接:https://www.haomeiwen.com/subject/gntdhctx.html