from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import os
from bs4 import BeautifulSoup
import random
import threading
import multiprocessing
import warnings
from multiprocessing import Lock
warnings.filterwarnings("ignore")
# os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'
# chrome_options = Options()
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--headless")
# Every image URL discovered by this process.  NOTE(review): worker
# processes each get their own copy of this set, so the parent's copy is
# not filled by the workers -- the temp file appended to in
# get_image_url is the cross-process record.
images_all = set()
# browser = webdriver.Chrome(chrome_options = chrome_options)
# browser = webdriver.Chrome()
# browser = webdriver.PhantomJS(executable_path="phantomjs.exe")
# Pin URLs grouped into batches of Threads_number; one batch is handled
# per worker process.  ("origial" is a typo, kept because the rest of
# the file references this name.)
origial_urls = []
original_urls_temp = []  # scratch list used while building the batches
Threads_number = 20    # threads spawned per worker process
Processes_number = 4   # worker processes launched per wave
num_cpu=multiprocessing.cpu_count()  # informational only (see prints below)
# print("numbers of Threads: ",Threads_number)
# print("numbers of Processes: ",Processes_number)
# print("numbers of cpu: ",num_cpu)
# Read the pin paths and group them into batches of Threads_number URLs;
# each batch will later be scraped by one worker process (one thread per URL).
with open("huaban_pin_asserts_all.txt",'r',encoding="utf8") as read_file:
    pin_paths = [entry.strip() for entry in read_file]
for start in range(0, len(pin_paths), Threads_number):
    chunk = pin_paths[start:start + Threads_number]
    origial_urls.append(["http://huaban.com" + path for path in chunk])
# origial_urls
# def start_thread():
# print("fuck")
# One lock per process, shared by all of that process's threads, serializing
# appends to the temp file.  (The original created a fresh Lock() inside each
# call, which provided no mutual exclusion at all, and could call release()
# on a lock that was never acquired.)  NOTE(review): this does not serialize
# across processes; concurrent appends from different processes rely on
# append-mode semantics for short writes.
_write_lock = threading.Lock()

def get_image_url(index, index2, url, epoch, batch, index3):
    """Open ``url`` in PhantomJS, pull the big-image ``src`` out of the
    ``#baidu_image_holder`` element, record it in ``images_all`` and append
    it to the temp results file.

    index/index2/epoch/batch/index3 are progress counters used only for the
    log line.  Best-effort: any failure is swallowed and the pin is skipped.
    """
    browser = None
    line = None
    try:
        # PhantomJS is deprecated in recent Selenium releases; kept here
        # because the whole file targets the old Selenium API.
        browser = webdriver.PhantomJS()
        browser.set_page_load_timeout(10000)
        browser.set_script_timeout(10000)
        time.sleep(random.randint(1, 4))  # jitter so we don't hammer the site
        browser.get(url)
        # The image is either wrapped in an <a> or a direct child; try both
        # (when both match, the direct-child src wins, as in the original).
        for xpath in ('//*[@id="baidu_image_holder"]/a/img',
                      '//*[@id="baidu_image_holder"]/img'):
            try:
                img = browser.find_element_by_xpath(xpath)
                if img is not None:
                    line = img.get_attribute('src')
                    images_all.add(line)
            except Exception:
                pass  # element simply not present on this page
        if line is not None:
            with _write_lock:
                with open("huaban_big_images_all_urls_temp.txt",'a',encoding="utf8") as write_temp_file:
                    write_temp_file.write(line + "\n")
        print("index: %d, epoch: %d, batch: %d, index3: %d, index2: %d, line: %s"%(index, epoch, batch, index3, index2, line))
    except Exception:
        pass  # best-effort scrape: skip this pin on any failure
    finally:
        # quit() (not close()) tears down the PhantomJS driver process, so
        # no browser process leaks; guard against webdriver.PhantomJS()
        # itself having failed, which previously made this finally block
        # raise NameError.
        if browser is not None:
            browser.quit()
def running_processing(urls, index, epoch, batch, index3):
    """Worker-process body: scrape every URL in ``urls`` concurrently,
    one thread per URL, and block until all of them have finished.

    index/epoch/batch/index3 are progress counters passed through to
    get_image_url for its log line.
    """
    print("start process %d number %d"%(batch, index3))
    workers = [
        threading.Thread(
            target=get_image_url,
            args=(index, position, target_url, epoch, batch, index3),
        )
        for position, target_url in enumerate(urls)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
if __name__ == '__main__':
    # Launch the URL batches in waves of Processes_number worker processes,
    # joining each wave before starting the next.
    epoch = 0
    batch = 0
    len_original_urls = len(origial_urls)
    temp_urls_set = []
    for index, urls in enumerate(origial_urls):
        temp_urls_set.append(urls)
        epoch += 1
        if (index + 1) % Processes_number == 0 or (index + 1) == len_original_urls:
            batch += 1
            multiThreads = []
            for index3, urls in enumerate(temp_urls_set):
                mt = multiprocessing.Process(target=running_processing,args=(urls, index, epoch, batch, index3))
                mt.start()
                multiThreads.append(mt)
            for index_j, mthread in enumerate(multiThreads):
                mthread.join()
            temp_urls_set = []
    # BUG FIX: the worker processes each mutate their own copy of
    # images_all, so the parent's set is still empty here and the final
    # file used to come out empty.  Recover the URLs from the temp file
    # the workers appended to, deduplicating via the set.
    try:
        with open("huaban_big_images_all_urls_temp.txt",'r',encoding="utf8") as temp_file:
            for line in temp_file:
                line = line.strip()
                if line:
                    images_all.add(line)
    except FileNotFoundError:
        pass  # no worker found any image; write an empty result file
    with open("huaban_big_images_all_urls.txt",'w',encoding="utf8") as write_file:
        for line in images_all:
            write_file.write(str(line) + "\n")
    print('program end:%s' %time.ctime())
# 网友评论 ("netizen comments" -- stray page text pasted at the end of the
# file; commented out so the module parses)