# Adds setDaemon and join(timeout) handling (translated title; was a stray non-comment line)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import os
from bs4 import BeautifulSoup
import random
import threading
import multiprocessing
import warnings
from multiprocessing import Lock
warnings.filterwarnings("ignore")
# Left-over experiments with Chrome / headless Chrome drivers, kept for reference.
# os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'
# chrome_options = Options()
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--headless")
# browser = webdriver.Chrome(chrome_options = chrome_options)

# Per-process collection of every image src gathered so far.
# NOTE(review): child processes get their own copy — this set is NOT shared
# across multiprocessing workers; the output file is the real sink.
images_all = set()
# browser = webdriver.Chrome()
# browser = webdriver.PhantomJS(executable_path="phantomjs.exe")

# origial_urls: list of batches; each batch holds up to Threads_number
# absolute pin URLs (one worker thread per URL).  [sic: "origial"]
origial_urls = []
original_urls_temp = []

# Tuning knobs: worker threads per batch, and concurrent scraper processes.
Threads_number = 40
Processes_number = 6
num_cpu=multiprocessing.cpu_count()
# print("numbers of Threads: ",Threads_number)
# print("numbers of Processes: ",Processes_number)
# print("numbers of cpu: ",num_cpu)

# Read relative pin paths (one per line), prefix the site root, and chunk
# them into batches of Threads_number; the trailing partial batch is kept.
with open("huaban_pin_asserts_all.txt",'r',encoding="utf8") as read_file:
    lines = read_file.readlines()
    for index, line in enumerate(lines):
        url = "http://huaban.com" + line.strip()
        original_urls_temp.append(url)
        if (index + 1) % Threads_number == 0 or (index + 1) == len(lines):
            origial_urls.append(original_urls_temp)
            original_urls_temp = []
# origial_urls
# def start_thread():
#     print("fuck")
def get_image_url(index, index2, url, epoch, batch, index3,
                  _write_lock=threading.Lock()):
    """Scrape one huaban pin page with PhantomJS and record its image URL.

    The image src (when found) is added to the shared ``images_all`` set and
    appended to ``processing_threading_huaban_big_images_all_urls.txt``.

    Parameters
    ----------
    index, index2, index3, epoch, batch : int
        Progress counters, used only for the log line.
    url : str
        Pin-detail page URL to fetch.
    _write_lock : threading.Lock
        Deliberately-shared mutable default: one lock per process serializes
        the file append across all worker threads.  (Bug fix: the original
        created a new ``multiprocessing.Lock()`` on every call, which
        serialized nothing, and its ``finally: lock.release()`` could
        NameError if ``Lock()`` itself failed.)
    """
    browser = webdriver.PhantomJS()
    browser.set_page_load_timeout(10000)
    browser.set_script_timeout(10000)
    try:
        # Small random delay so 40 threads don't hit the site simultaneously.
        time.sleep(random.randint(1, 5))
        browser.get(url)
        line = None
        # The big image appears under one of two known layouts; try both.
        for xpath in ('//*[@id="baidu_image_holder"]/a/img',
                      '//*[@id="baidu_image_holder"]/img'):
            try:
                img = browser.find_element_by_xpath(xpath)
            except Exception:
                continue  # this layout not present on the page
            if img is not None:
                line = img.get_attribute('src')
                images_all.add(line)
        # Bug fix: only write when an image was actually found — previously
        # ``line + "\n"`` raised TypeError whenever line was still None.
        if line is not None:
            try:
                with _write_lock:
                    with open("processing_threading_huaban_big_images_all_urls.txt",
                              'a', encoding="utf8") as write_temp_file:
                        write_temp_file.write(line + "\n")
            except Exception as e:
                print("failed to fetch : %s" % url)
                print(e)
        print("index: %d, epoch: %d, batch: %d, index3: %d, index2: %d, line: %s"
              % (index, epoch, batch, index3, index2, line))
    except Exception:
        # Best-effort scrape: a failed page is simply skipped.
        pass
    finally:
        # quit() (not close()) terminates the PhantomJS process itself,
        # otherwise one ghost phantomjs process leaks per URL.
        browser.quit()
def running_processing(urls, index, epoch, batch, index3):
    """Fan a batch of URLs out to daemon worker threads, one per URL.

    Each thread runs ``get_image_url``.  Threads are daemonized so a hung
    PhantomJS instance cannot keep the process alive, and each join is
    capped at 5 seconds for the same reason.

    Parameters
    ----------
    urls : list[str]
        URLs for this batch (one thread each).
    index, epoch, batch, index3 : int
        Progress counters forwarded to the workers for logging.
    """
    print("start process %d number %d" % (batch, index3))
    threads = []
    for index2, url in enumerate(urls):
        t = threading.Thread(target=get_image_url,
                             args=(index, index2, url, epoch, batch, index3))
        # Fix: Thread.setDaemon() is deprecated since Python 3.10;
        # assign the attribute instead (same effect).
        t.daemon = True
        threads.append(t)
    for thread in threads:
        thread.start()
    for thread in threads:
        # Bounded join: don't wait forever on a wedged worker.
        thread.join(5)
    # print("epoch %d finished in %s"%(epoch, time.ctime()))
if __name__ == '__main__':
    # Drive the scrape: accumulate Processes_number URL-batches, launch one
    # process per batch, wait for the whole wave to finish, then continue
    # with the next wave until every batch has been handled.
    epoch = 0
    batch = 0
    total_batches = len(origial_urls)
    pending = []
    for index, urls in enumerate(origial_urls):
        pending.append(urls)
        epoch += 1
        wave_full = (index + 1) % Processes_number == 0
        last_batch = (index + 1) == total_batches
        if wave_full or last_batch:
            batch += 1
            workers = [
                multiprocessing.Process(
                    target=running_processing,
                    args=(chunk, index, epoch, batch, index3))
                for index3, chunk in enumerate(pending)
            ]
            for worker in workers:
                worker.start()
            for worker in workers:
                worker.join()
            pending = []
            # print("end of batch: ",batch)
    # with open("huaban_big_images_all_urls.txt",'w',encoding="utf8") as write_file:
    #     for line in images_all:
    #         write_file.write(str(line) + "\n")
    # print("images_all")
    # print(images_all)
    print('program end:%s' %time.ctime())
# 网友评论 ("reader comments" — page-scrape artifact; was a stray non-comment line)