Downloading image URLs from Baidu

Author: 狼无雨雪 | Published 2019-07-05 12:57
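
The script below drives a headless Firefox through Selenium, repeatedly scrolls a Baidu image-search results page so that more results lazy-load, extracts each `<li class="imgitem">` element's `data-objurl` attribute with BeautifulSoup, and writes the deduplicated image URLs to `Willow_baidu/baidu.txt`.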
    
    
    """
    really used in fetching url from google images
    """
    import re
    from selenium import webdriver
    import time
    import os
    import sys
    import re
    from bs4 import BeautifulSoup
    import random
    from selenium.webdriver.chrome.options import Options
    
    down_loading_urls = {
        
    }
    
    
    baidu_path = 'Willow_baidu'  # "wikiart"
    original_url = 'https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%C1%F8%CA%F7&fr=ala&ala=1&alatpl=adress&pos=0&hs=2&xthttps=111111'

    temp_path = baidu_path + "/" + "temp_baidu.txt"  # incremental log of every URL seen
    path = baidu_path + "/" + "baidu.txt"            # final deduplicated URL list

    # If geckodriver is not on PATH, add its directory first, e.g.:
    # os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'
    if not os.path.exists(baidu_path):
        os.makedirs(baidu_path)

    # Chrome works as well; swap in these lines for the Firefox setup below:
    # option = webdriver.ChromeOptions()
    # option.add_argument('--headless')
    # option.add_argument('--disable-gpu')
    # browser = webdriver.Chrome(options=option)
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.add_argument("-headless")  # set_headless() is deprecated
    browser = webdriver.Firefox(options=fireFoxOptions)

    asserts_all = set()  # deduplicated image URLs
    
    now_len = 0
    pre_len = 0
    count_all = 0  # consecutive scrolls that produced no new URLs
    
    try:
        browser.get(original_url)
        while True:
            time.sleep(random.randint(1, 3))  # random pause between scrolls
            browser.execute_script("window.scrollBy(0,1000)")  # scroll to lazy-load more results

            # Each result is an <li class="imgitem"> whose data-objurl
            # attribute holds the original image URL.
            pageSource = browser.page_source
            soup = BeautifulSoup(pageSource, 'lxml')
            asserts = soup.find_all('li', {"class": "imgitem"})
            with open(temp_path, 'a', encoding="utf-8") as w_file:
                for line in asserts:
                    obj_url = line.get("data-objurl")
                    if not obj_url:  # some items carry no data-objurl; skip them
                        continue
                    w_file.write(obj_url + "\n")
                    asserts_all.add(obj_url)
            print(len(asserts_all))

            # Stop once ten consecutive scrolls yield no new URLs.
            now_len = len(asserts_all)
            if now_len == pre_len:
                count_all += 1
            else:
                count_all = 0
            if count_all >= 10:
                break
            pre_len = now_len

    except Exception as e:
        print("global", e)
    finally:
        # Dump the deduplicated set even if the loop crashed mid-run.
        with open(path, 'w', encoding="utf-8") as write_file:
            for line in asserts_all:
                write_file.write(str(line) + "\n")
        browser.quit()  # quit() also shuts down the geckodriver process
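
Once the URL list exists, the images themselves still have to be fetched. Below is a minimal sketch of that step, not part of the original post: it reads `Willow_baidu/baidu.txt` and saves each image with requests; the `images/` output directory, the 10-second timeout, and the index-based filenames are my assumptions.

    import os

    import requests

    baidu_path = 'Willow_baidu'
    image_dir = os.path.join(baidu_path, 'images')  # assumed output directory
    os.makedirs(image_dir, exist_ok=True)

    # Read the deduplicated URL list produced by the script above.
    with open(os.path.join(baidu_path, 'baidu.txt'), encoding='utf-8') as f:
        urls = [line.strip() for line in f if line.strip()]

    for i, url in enumerate(urls):
        try:
            resp = requests.get(url, timeout=10)  # assumed timeout
            resp.raise_for_status()
            # Index-based filenames; many of these URLs lack a clean basename.
            with open(os.path.join(image_dir, '%d.jpg' % i), 'wb') as out:
                out.write(resp.content)
        except Exception as e:
            print('skip', url, e)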
