美文网首页工作生活
taking image urls from pinterest

taking image urls from pinterest

作者: 狼无雨雪 | 来源:发表于2019-07-05 12:57 被阅读0次
    
    
    """
    Fetches image URLs from the Pinterest board set in original_url (this note originally named https://artsandculture.google.com/entity/m0bwbv?categoryid=art-movement as the source).
    """
    import re
    from selenium import webdriver
    import time
    import os
    import sys
    import re
    from bs4 import BeautifulSoup
    import random
    from selenium.webdriver.chrome.options import Options
    
    # Scroll a Pinterest board in a headless Firefox session and collect the
    # URLs of every thumbnail image, rewritten to point at the "originals"
    # rendition instead of the "236x" thumbnail.
    #
    # Output files:
    #   temp_path - every match is appended as soon as it is seen (duplicates
    #               included), so partial results survive a crash.
    #   path      - the de-duplicated set, written once scrolling has finished.
    temp_path = "temp_chinese_pinterest_img_asserts_all2.txt"
    path = "chinese_pinterest_img_asserts_all2.txt"

    wikiart_path = 'chinese-painting'
    original_url = 'https://www.pinterest.jp/jimmyyeji/%E4%B8%AD%E5%9B%BD%E4%B9%A6%E7%94%BB-chinese-painting/'
    if not os.path.exists(wikiart_path):
        os.makedirs(wikiart_path)

    # NOTE(review): set_headless() and the firefox_options= keyword appear to be
    # deprecated/removed in newer Selenium releases - confirm against the
    # installed Selenium version before upgrading.
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.set_headless()
    browser = webdriver.Firefox(firefox_options=fireFoxOptions)

    # All distinct rewritten image URLs found so far.
    asserts_all = set()

    now_len = 0
    pre_len = 0
    # Number of consecutive scroll passes that found no new URL. The original
    # initialised a misspelled name (count__all), so a first pass without any
    # new image raised NameError on "count_all += 1".
    count_all = 0

    try:
        browser.get(original_url)
        while True:
            # Random pause so the page can lazy-load more images between scrolls.
            time.sleep(random.randint(1, 3))
            browser.execute_script("window.scrollBy(0,300)")
            soup = BeautifulSoup(browser.page_source, 'lxml')

            # Keep only images whose src contains the "236x" thumbnail marker
            # and rewrite that marker to "originals".
            found = []
            for img in soup.find_all('img'):
                src = img.get("src")
                if src and "236x" in src:
                    found.append(src.replace("236x", "originals"))

            if found:
                with open(temp_path, 'a', encoding="utf-8") as w_file:
                    for url in found:
                        print(url)
                        w_file.write(url + "\n")
                asserts_all.update(found)

            print(len(asserts_all))
            now_len = len(asserts_all)
            if now_len == pre_len:
                count_all += 1
            else:
                count_all = 0

            # Stop after 10 consecutive passes without a new URL.
            if count_all >= 10:
                break
            pre_len = now_len

        with open(path, 'w', encoding="utf8") as write_file:
            for line in asserts_all:
                write_file.write(str(line) + "\n")
    except Exception as e:
        # Top-level boundary of the script: report the failure and fall through
        # to the browser shutdown below.
        print("global", e)
    finally:
        # quit() ends the whole WebDriver session; close() would only close the
        # current window.
        browser.quit()
    
        
    
        
    
    

    相关文章

      网友评论

        本文标题:taking image urls from pinterest

        本文链接:https://www.haomeiwen.com/subject/ihxdhctx.html