"""
really used in fetching url from https://artsandculture.google.com/entity/m0bwbv?categoryid=art-movement
"""
import re
from selenium import webdriver
import time
import os
import sys
import re
from bs4 import BeautifulSoup
import random
from selenium.webdriver.chrome.options import Options
# Append-only log that receives harvested URLs while the crawl is running.
temp_path = "temp_chinese_pinterest_img_asserts_all2.txt"
# Final output: the de-duplicated URL set, one URL per line.
path = "chinese_pinterest_img_asserts_all2.txt"
# Directory created at start-up (earlier value: "wikiart").
wikiart_path = 'chinese-painting'
# Page that the browser opens and scrolls. Earlier target:
# 'https://www.wikiart.org/en/paintings-by-style/cubism?select=featured#!#filterName:featured,viewType:masonry'
original_url = 'https://www.pinterest.jp/jimmyyeji/%E4%B8%AD%E5%9B%BD%E4%B9%A6%E7%94%BB-chinese-painting/'
# os.environ["PATH"] += os.pathsep + 'D:\google-art-downloader-master'

# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(wikiart_path, exist_ok=True)
# option = webdriver.ChromeOptions()
# option.add_argument('--headless')
# option.add_argument('--disable-gpu')
# browser = webdriver.Chrome(chrome_options = option)
# Start a headless Firefox session used for all page loads below.
fireFoxOptions = webdriver.FirefoxOptions()
# `set_headless()` and the `firefox_options=` keyword are deprecated Selenium 3
# APIs that Selenium 4 no longer provides; passing the "-headless" argument
# and using `options=` works on both.
fireFoxOptions.add_argument("-headless")
browser = webdriver.Firefox(options=fireFoxOptions)
# Every rewritten image URL seen so far (a set, so duplicates collapse).
asserts_all = set()
# NOTE(review): mark_time and last_value are not referenced by the scraping
# loop in this file; kept in case other code reads them.
mark_time = 0
last_value = 0
# ------------------test start------------------------
# browser.get(original_url)
# Size of `asserts_all` after the current / previous scroll pass.
now_len = 0
pre_len = 0
# Number of consecutive scroll passes that found no new URL. The loop
# increments and tests `count_all`; this was previously misspelled
# `count__all`, leaving `count_all` unbound on the first `+= 1`.
count_all = 0
# Scroll the page in small steps and harvest <img> URLs after each step.
# The crawl stops once 10 consecutive passes add no new URL; the collected
# set is then written to `path`, one URL per line.
count_all = 0  # consecutive passes without a new URL; must be bound before `+= 1`
try:
    browser.get(original_url)
    # js="var q=document.documentElement.scrollTop=100000"
    # browser.execute_script(js)
    while True:
        # Random 1-3 s pause before each 300px scroll step.
        time.sleep(random.randint(1, 3))
        browser.execute_script("window.scrollBy(0,300)")
        soup = BeautifulSoup(browser.page_source, 'lxml')

        # Keep only images whose src contains "236x" (presumably the
        # thumbnail-size path segment - confirm) and rewrite that marker to
        # "originals". Only URLs not seen before are recorded.
        new_urls = []
        for img in soup.find_all('img'):
            src = img.get("src")
            if not src or "236x" not in src:
                continue
            url = src.replace("236x", "originals")
            if url not in asserts_all:
                asserts_all.add(url)
                new_urls.append(url)

        # Log fresh URLs once per pass rather than reopening the file (and
        # re-appending every already-known URL) for each image.
        if new_urls:
            with open(temp_path, 'a', encoding="utf-8") as w_file:
                for url in new_urls:
                    print(url)
                    w_file.write(url + "\n")

        print(len(asserts_all))
        now_len = len(asserts_all)
        # Count stalled passes; any growth resets the counter.
        if now_len == pre_len:
            count_all += 1
        else:
            count_all = 0
        if count_all >= 10:
            break
        pre_len = now_len

    with open(path, 'w', encoding="utf8") as write_file:
        write_file.writelines(str(url) + "\n" for url in asserts_all)
except Exception as e:
    # Top-level boundary: report the failure and still release the browser.
    print("global", e)
finally:
    # quit() ends the whole WebDriver session (driver process included);
    # close() would only close the current window.
    browser.quit()
# 网友评论 ("user comments") - apparently web-page residue, not part of the script.