Switched to crawling the static pages for the image URLs, using re regular-expression search to pull them out.
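The idea in one small example: each pin page embeds its image metadata as a JSON fragment, and two regex passes isolate the "file" object and then its "key" field. A minimal sketch against a made-up fragment (the key value is hypothetical, and the _fw658 suffix appears to ask the CDN for a fixed-width 658px variant):

import re

sample = '{"pin":{"file":{"bucket":"hbimg","key":"abc123def456","type":"image/jpeg"}}}'
block = re.search(r'"file":\{.*?\}', sample)              # non-greedy: stops at the first }
key = re.findall(r'"key":"(.*?)",', block.group(0))[0]    # extract the CDN key from the block
print("http://img.hb.aicdn.com/" + key + "_fw658")        # -> http://img.hb.aicdn.com/abc123def456_fw658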
import re
import requests
import time
import os
import random
import threading
import multiprocessing
import warnings
warnings.filterwarnings("ignore")
# Leftovers from the earlier dynamic-rendering approach (Selenium / PhantomJS);
# no longer needed now that the pages are fetched statically with requests.
# os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'
# chrome_options = Options()
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--headless")
# browser = webdriver.Chrome(chrome_options=chrome_options)
# browser = webdriver.PhantomJS(executable_path="phantomjs.exe")

images_all = set()            # de-duplicated image URLs (per process)
original_urls = []            # list of URL groups, Threads_number URLs per group
original_urls_temp = []
file_lock = threading.Lock()  # serialises file appends between threads in a process
Threads_number = 40       # threads per process (one URL per thread)
Processes_number = 6      # worker processes launched per batch
num_cpu = multiprocessing.cpu_count()
# print("number of threads: ", Threads_number)
# print("number of processes: ", Processes_number)
# print("number of cpus: ", num_cpu)
with open("huaban_pin_asserts_all.txt",'r',encoding="utf8") as read_file:
lines = read_file.readlines()
for index, line in enumerate(lines):
url = "http://huaban.com" + line.strip()
original_urls_temp.append(url)
if (index + 1) % Threads_number == 0 or (index + 1) == len(lines):
origial_urls.append(original_urls_temp)
original_urls_temp = []
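# Example of the grouping above: 100 input lines with Threads_number = 40
# produce three groups of 40, 40, and 20 URLs; each group is later consumed
# by one worker process, with one thread per URL.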
def get_image_url(index, index2, url, epoch, batch, index3):
    """Fetch one pin page, extract the image key, and record the image URL."""
    try:
        time.sleep(random.randint(1, 5))   # random delay to avoid hammering the site
        response = requests.get(url)
        content = response.text
        # The pin's image metadata is embedded in the page as a JSON fragment:
        # "file":{..., "key":"...", ...}
        pattern = r'"file":\{.*?\}'
        data = re.search(pattern, content)
        if data is None:
            print("no file block found in: %s" % url)
            return
        pattern_two = r'"key":"(.*?)",'
        keys = re.findall(pattern_two, data.group(0))
        line = "http://img.hb.aicdn.com/" + keys[0] + "_fw658"
        images_all.add(line)
        # Append each result immediately so progress survives a crash; the
        # module-level lock keeps lines from interleaving between threads.
        with file_lock:
            with open("processing_threading_huaban_big_images_all_urls_part3.txt", 'a', encoding="utf8") as write_temp_file:
                write_temp_file.write(line + "\n")
        print("index: %d, epoch: %d, batch: %d, index3: %d, index2: %d, line: %s"
              % (index, epoch, batch, index3, index2, line))
    except Exception as e:
        print("failed to fetch: %s" % url)
        print(e, url)
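# Note: requests.get() with no timeout can hang a worker thread forever if the
# server stalls. A more defensive call (timeout and header values illustrative):
#
#     response = requests.get(url, timeout=10,
#                             headers={"User-Agent": "Mozilla/5.0"})
#     response.raise_for_status()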
def running_processing(urls, index, epoch, batch, index3):
    """Run one group of URLs: one thread per URL, then wait for all of them."""
    threads = []
    print("start process %d number %d" % (batch, index3))
    for index2, url in enumerate(urls):
        t = threading.Thread(target=get_image_url,
                             args=(index, index2, url, epoch, batch, index3))
        threads.append(t)
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
if __name__ == '__main__':
    epoch = 0
    batch = 0
    len_original_urls = len(original_urls)
    temp_urls_set = []
    for index, urls in enumerate(original_urls):
        temp_urls_set.append(urls)
        epoch += 1
        # Every Processes_number groups (or at the end), launch one process per group
        if (index + 1) % Processes_number == 0 or (index + 1) == len_original_urls:
            batch += 1
            processes = []
            for index3, urls in enumerate(temp_urls_set):
                p = multiprocessing.Process(target=running_processing,
                                            args=(urls, index, epoch, batch, index3))
                p.start()
                processes.append(p)
            for p in processes:
                p.join()
            temp_urls_set = []
    # images_all in this (main) process stays empty: all fetching happens in
    # child processes, so rebuild the de-duplicated set from the append file.
    images_all = set()
    if os.path.exists("processing_threading_huaban_big_images_all_urls_part3.txt"):
        with open("processing_threading_huaban_big_images_all_urls_part3.txt", 'r', encoding="utf8") as read_back:
            images_all = {l.strip() for l in read_back if l.strip()}
    with open("huaban_big_images_all_urls_part3.txt", 'w', encoding="utf8") as write_file:
        for line in images_all:
            write_file.write(line + "\n")
    print('program end: %s' % time.ctime())
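# Usage: put one pin path per line in huaban_pin_asserts_all.txt (e.g. a line
# like /pins/123456789; the exact path format is whatever the earlier crawl
# produced), run the script, then collect the de-duplicated image URLs from
# huaban_big_images_all_urls_part3.txt.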