实践告诉我:爬取 JD 的图片时,必须先让页面滚动到底部,再慢慢向上滚动,才能爬取到所有图片!代码如下:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import time
from lxml import etree
# Browser configuration: run Chrome in headless mode.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# Shared driver plus an explicit-wait helper with a 5 second timeout;
# both are used by get_page().
browser = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(driver=browser, timeout=5)
def get_page(page, keyword='机器人', steps=16):
    """Return the rendered HTML of result page *page*, then jump to the next page.

    On the first page the JD home page is opened and *keyword* is searched.
    The view is then scrolled to the bottom and back up in *steps* increments
    before the page source is captured. Finally the pager at the bottom of the
    list is used to request page ``page + 1`` so that the next call finds it
    already loading.

    Args:
        page: 1-based number of the result page currently being captured.
        keyword: Search term typed into the JD search box (first page only).
        steps: Number of upward scroll increments, one second apart.

    Returns:
        The page source captured after scrolling, as returned by
        ``browser.page_source``.

    Raises:
        TimeoutException: If an expected element does not appear within the
            timeout configured on the shared ``wait`` object.
    """
    if page == 1:
        _submit_search(keyword)
    # Pause before scrolling on every call, not just the first one: the
    # previous call ends by clicking the pager, so the new result list needs
    # the same settling time as the initial search does.
    time.sleep(2)
    _scroll_bottom_to_top(steps)
    html = browser.page_source
    _jump_to_page(page + 1)
    return html


def _submit_search(keyword):
    """Open the JD home page and submit *keyword* through its search box."""
    browser.get('https://www.jd.com/')
    search_box = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#key')))
    search_box.clear()
    search_box.send_keys(keyword)
    search_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '#search button.button')))
    search_button.click()


def _scroll_bottom_to_top(steps):
    """Scroll to the bottom of the page, then back up in *steps* increments."""
    to_bottom_js = ('var scrollHeight = document.body.scrollHeight; '
                    'window.scrollTo(0, scrollHeight);')
    browser.execute_script(to_bottom_js)
    # Walk from steps/steps of the page height down to 1/steps, pausing one
    # second at each stop.
    for i in range(steps, 0, -1):
        step_js = ('var scrollHeight = document.body.scrollHeight / %d; '
                   'window.scrollTo(0, scrollHeight * %d);') % (steps, i)
        browser.execute_script(step_js)
        time.sleep(1)


def _jump_to_page(number):
    """Type *number* into the pager's page box and click its jump button."""
    page_box = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#J_bottomPage input.input-txt')))
    # Bring the pager into view before interacting with it.
    browser.execute_script('window.scrollTo(0, %d);' % page_box.location['y'])
    page_box.clear()
    # Send the page number as text rather than as a bare int.
    page_box.send_keys(str(number))
    jump_button = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, '#J_bottomPage .btn.btn-default')))
    jump_button.click()
# Parse one result page
def parse_page(html):
    """Print the title and the image URLs of every product listed in *html*.

    For each matching list item the joined title text is printed, followed by
    a line of 20 asterisks and the list of image ``src`` values.
    """
    tree = etree.HTML(html)
    products = tree.xpath('//div[@id="J_goodsList"]/ul/li[@class="gl-item"]')
    separator = '*' * 20
    for product in products:
        # The title is split across several text nodes; join them into one string.
        name_parts = product.xpath('.//div[@class="p-name p-name-type-2"]//em//text()')
        print(''.join(name_parts))
        print(separator)
        print(product.xpath('.//div[@class="p-img"]/a/img/@src'))
def main(max_pages=100):
    """Crawl result pages 1 through *max_pages* and print each page's products.

    Args:
        max_pages: Number of result pages to visit; defaults to 100.

    The shared browser is shut down when the crawl finishes or fails, so no
    headless Chrome process is left running. Any exception raised while
    crawling still propagates to the caller.
    """
    try:
        for page in range(1, max_pages + 1):
            print(page)
            html = get_page(page)
            parse_page(html)
    finally:
        # Always end the WebDriver session, even when a page fails to load.
        browser.quit()
# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()
网友评论