之前做box相关的learning项目,需要好多box的图片,就写了这个爬虫。
功能相当于在Google Images中搜索相关关键字,并把前几页图片的URL批量保存到url.txt。
from selenium import webdriver
import time
import urllib
from selenium.webdriver.common.keys import Keys
import io

try:  # Python 3
    from urllib.parse import parse_qs, urlsplit
except ImportError:  # Python 2
    from urlparse import parse_qs, urlsplit

# Keywords to search images for; one Google Images query is run per entry.
keys = ['cupboard', 'glass box']
# Absolute path to the chromedriver executable. Raw string, so the
# backslashes can never be read as escape sequences.
CHROMEDRIVER_PATH = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
# Google Images landing page that hosts the search box.
SEARCH_PAGE = "https://www.google.co.jp/imghp?hl=zh-CN&tab=wi&ei=qJgtWINugqDSBJTzqqgI&ved=0EKouCBMoAQ"
# Anchors wrapping each thumbnail on the result page.
RESULT_LINK_XPATH = '//div[@id="rg_s"]/div/a'
# Extracted URLs are appended here, one per line.
URL_FILE = "url.txt"
SCROLL_ROUNDS = 5
SCROLL_STEP = 500
SCROLL_PAUSE_SECONDS = 3


def extract_image_url(href):
    """Return the decoded ``imgurl`` query parameter of a result link.

    Args:
        href: The ``href`` of a result anchor, e.g.
            ``.../imgres?imgurl=http%3A%2F%2Fhost%2Fa.jpg&imgrefurl=...``.
            May be ``None`` or empty.

    Returns:
        The image URL with every percent escape decoded (not only ``%3A``
        and ``%2F``), or ``None`` when ``href`` is empty or carries no
        ``imgurl`` parameter.
    """
    if not href:
        return None
    # parse_qs splits on the raw "&" first and decodes afterwards, so an
    # escaped "&" or "=" inside the image URL survives intact.
    values = parse_qs(urlsplit(href).query).get("imgurl")
    return values[0] if values else None


def collect_image_urls(driver, key):
    """Search Google Images for ``key`` and append the image URLs to url.txt.

    Args:
        driver: A started Selenium WebDriver instance.
        key: The search keyword typed into the search box.
    """
    driver.get(SEARCH_PAGE)
    search_box = driver.find_element_by_name("q")
    search_box.send_keys(key)
    search_box.send_keys(Keys.RETURN)
    driver.maximize_window()

    seen_links = set()  # hrefs already handled for this keyword
    scroll_by = 0
    # UTF-8 so a decoded URL containing non-ASCII characters can be written.
    with io.open(URL_FILE, "a", encoding="utf-8") as urlfile:
        for round_index in range(SCROLL_ROUNDS):
            print(round_index)
            # scrollBy is relative and the offset accumulates, so the five
            # rounds scroll by 0, 500, 1500, 3000 and 5000 px.
            scroll_by += round_index * SCROLL_STEP
            driver.execute_script("window.scrollBy(0,%d)" % scroll_by)
            # Presumably gives newly scrolled-in results time to load.
            time.sleep(SCROLL_PAUSE_SECONDS)
            for link in driver.find_elements_by_xpath(RESULT_LINK_XPATH):
                href = link.get_attribute('href')
                if href is None or href in seen_links:
                    continue
                seen_links.add(href)
                image_url = extract_image_url(href)
                if image_url is None:
                    # Not an image result link; nothing to record.
                    continue
                urlfile.write(image_url)
                urlfile.write(u"\n")


def main():
    """Run one search per entry in ``keys``, then shut the browser down."""
    driver = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        for key in keys:
            collect_image_urls(driver, key)
    finally:
        # quit() also ends the chromedriver process, even after an error.
        driver.quit()


if __name__ == "__main__":
    main()
网友评论