processing threading downloading

Author: 狼无雨雪 | Published 2019-07-05 12:57

Switched to scraping the static pages for the image urls, using re regex search to locate them.
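The pin pages embed the image metadata as a JSON-like "file":{...} object right in the HTML (inside app["page"]), so a plain requests.get plus two regexes is enough: one to grab that object, one to pull out the key used to build the CDN url. A minimal sketch of just that extraction step, run against a made-up fragment of page source (the sample string below is hypothetical; real pages wrap it in far more markup):

import re

# Hypothetical fragment of a pin page's source.
sample = '{"pin_id":123,"file":{"bucket":"hbimg","key":"abc123def","type":"image/jpeg"}}'

# Step 1: grab the "file":{...} object from the page source.
file_block = re.search(r'\"file\"\:\{.*?\}', sample)[0]
# Step 2: pull the "key" field out of it and build the full-size image url.
key = re.findall(r'\"key\":\"(.*?)\"\,', file_block)[0]
print("http://img.hb.aicdn.com/" + key + "_fw658")

The full script below does the same thing per url, spread across multiple processes and threads.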

import re
import requests
import time
import os
from bs4 import BeautifulSoup
import random
import threading
import multiprocessing
import warnings
from multiprocessing import Lock
warnings.filterwarnings("ignore")
# os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'

# chrome_options = Options()
# chrome_options.add_argument("--disable-gpu")
# chrome_options.add_argument("--headless")
# browser = webdriver.Chrome(chrome_options = chrome_options)
images_all = set()
# One module-level lock shared by all worker threads of a process (creating a
# fresh Lock() on every call would give no mutual exclusion when appending).
write_lock = Lock()

# browser = webdriver.Chrome()
# browser = webdriver.PhantomJS(executable_path="phantomjs.exe")

original_urls = []
original_urls_temp = []

Threads_number = 40
Processes_number = 6
num_cpu=multiprocessing.cpu_count()
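# Each worker process handles one batch of Threads_number urls (one thread per
# url); up to Processes_number processes run at the same time.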

# print("numbers of Threads: ",Threads_number)
# print("numbers of Processes: ",Processes_number)
# print("numbers of cpu: ",num_cpu)

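# Read the pin paths collected earlier and group the full urls into lists of
# Threads_number entries; each inner list becomes the workload of one process.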
with open("huaban_pin_asserts_all.txt", 'r', encoding="utf8") as read_file:
    lines = read_file.readlines()
    for index, line in enumerate(lines):
        url = "http://huaban.com" + line.strip()
        original_urls_temp.append(url)

        if (index + 1) % Threads_number == 0 or (index + 1) == len(lines):
            original_urls.append(original_urls_temp)
            original_urls_temp = []


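# Fetch one pin page, pull the image key out of the embedded "file":{...}
# object with two regexes, build the _fw658 image url, and append it to the
# incremental output file.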
def get_image_url(index,index2, url, epoch, batch, index3):
    try:
        time.sleep(random.randint(1,5))
        response = requests.get(url)
        content = response.text
        # print(content)

        pattern = r'\"file\"\:\{.*?\}'
        data = re.search(pattern=pattern, string=content)
        # print(data[0])
        # print(content)
        # pattern = r'app\["page"\] .*? \"frames\":(1|\"1\")\}'
        # data = re.search(pattern, content)
        # # print(data.)
        # # print(data)
        pattern_two = r'\"key\":\"(.*?)\"\,'
        data = re.findall(pattern_two, str(data[0]))
        data = "http://img.hb.aicdn.com/" + str(data[0]) + "_fw658"
#         print(data)
        line = data
        images_all.add(line)
        try:
            # Use the shared module-level lock so concurrent threads do not
            # interleave their appends to the output file.
            with write_lock:
                with open("processing_threading_huaban_big_images_all_urls_part3.txt", 'a', encoding="utf8") as write_temp_file:
                    write_temp_file.write(line + "\n")
        except Exception as e:
            print("failed to write url: %s" % url)
            print(e, url)
        print("index: %d, epoch: %d, batch: %d, index3: %d, index2: %d, line: %s"%(index, epoch, batch, index3, index2, line))
    except Exception as e:
        print(e, url)
    finally:
        pass

        
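# Body of each child process: spawn one thread per url in the batch and wait
# for all of them to finish.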
def running_processing(urls, index, epoch, batch, index3):
#     print("start")
    threads = []
    print("start process %d number %d"%(batch, index3))
    for index2, url in enumerate(urls) :
        t = threading.Thread(target= get_image_url, args=(index, index2, url, epoch, batch, index3))
        threads.append(t)
    for index_i, thread in enumerate(threads):
#         thread.setDaemon(True)
        thread.start()
    for index_j, thread in enumerate(threads):
        thread.join()
#     print("epoch %d finished in %s"%(epoch, time.ctime()))
    

    

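# Main driver: collect the url batches Processes_number at a time, start one
# process per batch, and join the whole group before starting the next one.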
if __name__ == '__main__': 
    epoch = 0
    batch = 0
    len_original_urls = len(original_urls)
    temp_urls_set = []
    for index, urls in enumerate(original_urls):
        temp_urls_set.append(urls)
        epoch += 1
        if (index + 1) % Processes_number == 0 or (index + 1) == len_original_urls:
            batch += 1
            multiThreads = []
            for index3, urls in enumerate(temp_urls_set):
#                 print(urls)
                mt = multiprocessing.Process(target=running_processing,args=(urls, index, epoch, batch, index3))
#                 mt = multiprocessing.Process(target=start_thread)
                mt.start()
                multiThreads.append(mt)
#             for index_i, mthread in enumerate(multiThreads):
#                 mthread.start()
            for index_j, mthread in enumerate(multiThreads):
                mthread.join()
            temp_urls_set = []
#             print("end of batch: ",batch)
        
    # Note: images_all is only filled inside the child processes, so the
    # parent's copy stays empty and this final file will too; the per-url
    # appends made above are the authoritative output.
    with open("huaban_big_images_all_urls_part3.txt", 'w', encoding="utf8") as write_file:
        for line in images_all:
            write_file.write(str(line) + "\n")
    print('program end: %s' % time.ctime())
