Downloading image URLs from Baidu

Author: 狼无雨雪 | Published 2019-07-05 12:57
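
The script below drives a headless Firefox through Selenium, repeatedly scrolls a Baidu image-search results page so that more results lazy-load, extracts each `<li class="imgitem">` element's `data-objurl` attribute with BeautifulSoup, and writes the deduplicated image URLs to `Willow_baidu/baidu.txt`.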
    
    
    """
    really used in fetching url from google images
    """
    import re
    from selenium import webdriver
    import time
    import os
    import sys
    import re
    from bs4 import BeautifulSoup
    import random
    from selenium.webdriver.chrome.options import Options
    
    down_loading_urls = {
        
    }
    
    
    baidu_path = 'Willow_baidu'  # "wikiart"
    original_url = 'https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%C1%F8%CA%F7&fr=ala&ala=1&alatpl=adress&pos=0&hs=2&xthttps=111111'

    temp_path = baidu_path + "/" + "temp_baidu.txt"  # incremental log of every URL seen
    path = baidu_path + "/" + "baidu.txt"            # final deduplicated URL list

    # If geckodriver is not on PATH, add its directory first, e.g.:
    # os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'
    if not os.path.exists(baidu_path):
        os.makedirs(baidu_path)

    # Chrome works as well; swap in these lines for the Firefox setup below:
    # option = webdriver.ChromeOptions()
    # option.add_argument('--headless')
    # option.add_argument('--disable-gpu')
    # browser = webdriver.Chrome(options=option)
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.add_argument("-headless")  # set_headless() is deprecated
    browser = webdriver.Firefox(options=fireFoxOptions)

    asserts_all = set()  # deduplicated image URLs
    
    now_len = 0
    pre_len = 0
    count_all = 0  # consecutive scrolls that produced no new URLs
    
    try:
        browser.get(original_url)
        while True:
            time.sleep(random.randint(1, 3))  # random pause between scrolls
            browser.execute_script("window.scrollBy(0,1000)")  # scroll to lazy-load more results

            # Each result is an <li class="imgitem"> whose data-objurl
            # attribute holds the original image URL.
            pageSource = browser.page_source
            soup = BeautifulSoup(pageSource, 'lxml')
            asserts = soup.find_all('li', {"class": "imgitem"})
            with open(temp_path, 'a', encoding="utf-8") as w_file:
                for line in asserts:
                    obj_url = line.get("data-objurl")
                    if not obj_url:  # some items carry no data-objurl; skip them
                        continue
                    w_file.write(obj_url + "\n")
                    asserts_all.add(obj_url)
            print(len(asserts_all))

            # Stop once ten consecutive scrolls yield no new URLs.
            now_len = len(asserts_all)
            if now_len == pre_len:
                count_all += 1
            else:
                count_all = 0
            if count_all >= 10:
                break
            pre_len = now_len

    except Exception as e:
        print("global", e)
    finally:
        # Dump the deduplicated set even if the loop crashed mid-run.
        with open(path, 'w', encoding="utf-8") as write_file:
            for line in asserts_all:
                write_file.write(str(line) + "\n")
        browser.quit()  # quit() also shuts down the geckodriver process
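
Once the URL list exists, the images themselves still have to be fetched. Below is a minimal sketch of that step, not part of the original post: it reads `Willow_baidu/baidu.txt` and saves each image with requests; the `images/` output directory, the 10-second timeout, and the index-based filenames are my assumptions.

    import os

    import requests

    baidu_path = 'Willow_baidu'
    image_dir = os.path.join(baidu_path, 'images')  # assumed output directory
    os.makedirs(image_dir, exist_ok=True)

    # Read the deduplicated URL list produced by the script above.
    with open(os.path.join(baidu_path, 'baidu.txt'), encoding='utf-8') as f:
        urls = [line.strip() for line in f if line.strip()]

    for i, url in enumerate(urls):
        try:
            resp = requests.get(url, timeout=10)  # assumed timeout
            resp.raise_for_status()
            # Index-based filenames; many of these URLs lack a clean basename.
            with open(os.path.join(image_dir, '%d.jpg' % i), 'wb') as out:
                out.write(resp.content)
        except Exception as e:
            print('skip', url, e)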
