美文网首页工作生活
processing threading downloading

processing threading downloading

作者: 狼无雨雪 | 来源:发表于2019-07-05 12:57 被阅读0次

    切换为静态网页爬取urls, 用re正则search查找

    import re
    import requests
    import time
    import os
    from bs4 import BeautifulSoup
    import random
    import threading
    import multiprocessing
    import warnings
    from multiprocessing import Lock
    warnings.filterwarnings("ignore")
    # os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'
    
    # chrome_options = Options()
    # chrome_options.add_argument("--disable-gpu")
    # chrome_options.add_argument("--headless")
    # browser = webdriver.Chrome(chrome_options = chrome_options)
    images_all = set()
    
    # browser = webdriver.Chrome()
    # browser = webdriver.PhantomJS(executable_path="phantomjs.exe")
    
    origial_urls = []
    original_urls_temp = []
    
    Threads_number = 40
    Processes_number = 6
    num_cpu=multiprocessing.cpu_count()
    
    # print("numbers of Threads: ",Threads_number)
    # print("numbers of Processes: ",Processes_number)
    # print("numbers of cpu: ",num_cpu)
    
    with open("huaban_pin_asserts_all.txt",'r',encoding="utf8") as read_file:
            lines = read_file.readlines()
            for index, line in enumerate(lines):
                url = "http://huaban.com" + line.strip()
                original_urls_temp.append(url)
                
                if (index + 1) % Threads_number == 0 or (index + 1) == len(lines):
                    origial_urls.append(original_urls_temp)
                    original_urls_temp = []
    # origial_urls
    
    
    # def start_thread():
    #     print("fuck")
    def get_image_url(index,index2, url, epoch, batch, index3):
        try:
            time.sleep(random.randint(1,5))
            response = requests.get(url)
            content = response.text
            # print(content)
    
            pattern = r'\"file\"\:\{.*?\}'
            data = re.search(pattern=pattern, string=content)
            # print(data[0])
            # print(content)
            # pattern = r'app\["page"\] .*? \"frames\":(1|\"1\")\}'
            # data = re.search(pattern, content)
            # # print(data.)
            # # print(data)
            pattern_two = r'\"key\":\"(.*?)\"\,'
            data = re.findall(pattern_two, str(data[0]))
            data = "http://img.hb.aicdn.com/" + str(data[0]) + "_fw658"
    #         print(data)
            line = data
            images_all.add(line)
            try:
                lock = Lock()
                lock.acquire()
                with open("processing_threading_huaban_big_images_all_urls_part3.txt",'a',encoding="utf8") as write_temp_file:
                    write_temp_file.write(line + "\n")
            except Exception as e:
                print("failt to fetch : %s"%url)
                print(e,url)
            finally:
                lock.release()
            print("index: %d, epoch: %d, batch: %d, index3: %d, index2: %d, line: %s"%(index, epoch, batch, index3, index2, line))
        except Exception as e:
            print(e, url)
        finally:
            pass
    
            
    def running_processing(urls, index, epoch, batch, index3):
    #     print("start")
        threads = []
        print("start process %d number %d"%(batch, index3))
        for index2, url in enumerate(urls) :
            t = threading.Thread(target= get_image_url, args=(index, index2, url, epoch, batch, index3))
            threads.append(t)
        for index_i, thread in enumerate(threads):
    #         thread.setDaemon(True)
            thread.start()
        for index_j, thread in enumerate(threads):
            thread.join()
    #     print("epoch %d finished in %s"%(epoch, time.ctime()))
        
    
        
    
    if __name__ == '__main__': 
        epoch = 0
        batch = 0
        len_original_urls = len(origial_urls)
        temp_urls_set = []
        for index, urls in enumerate(origial_urls):
            temp_urls_set.append(urls)
            epoch += 1
            if (index + 1) % Processes_number == 0 or (index + 1) == len_original_urls:
                batch += 1
                multiThreads = []
                for index3, urls in enumerate(temp_urls_set):
    #                 print(urls)
                    mt = multiprocessing.Process(target=running_processing,args=(urls, index, epoch, batch, index3))
    #                 mt = multiprocessing.Process(target=start_thread)
                    mt.start()
                    multiThreads.append(mt)
    #             for index_i, mthread in enumerate(multiThreads):
    #                 mthread.start()
                for index_j, mthread in enumerate(multiThreads):
                    mthread.join()
                temp_urls_set = []
    #             print("end of batch: ",batch)
            
        with open("huaban_big_images_all_urls_part3.txt",'w',encoding="utf8") as write_file:
            for line in images_all:
                write_file.write(str(line) + "\n")
        print('program end:%s' %time.ctime())
    

    相关文章

      网友评论

        本文标题:processing threading downloading

        本文链接:https://www.haomeiwen.com/subject/eatdhctx.html