美文网首页
day04 selenium自动搜索爬取jd的100页图片

day04 selenium自动搜索爬取jd的100页图片

作者: LittleBear_6c91 | 来源:发表于2019-04-12 20:38 被阅读0次

实践告诉我爬取JD的图片必须先让界面滑到底部再向上慢慢滑动才能爬取所有图片!代码如下:

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import time
from lxml import etree

# Configure one headless Chrome instance that every page fetch reuses.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # no visible browser window
browser = webdriver.Chrome(options=options)

# Shared explicit-wait helper with a 5-second timeout.
wait = WebDriverWait(browser, 5)

def get_page(page, keyword='机器人'):
    """Fetch the HTML source of one jd.com search-result page.

    On page 1 this opens jd.com and performs the search for *keyword*.
    On every page it scrolls to the bottom and then back up in steps so
    lazily loaded product images have a chance to load, captures the page
    source, and finally jumps to the next page so the following call
    finds that page already open.

    :param page: 1-based number of the page being fetched.
    :param keyword: search term typed into the JD search box on page 1.
    :return: the current result page's HTML source as a string.
    """
    if page == 1:
        browser.get('https://www.jd.com/')
        # `search_input` (not `input`) to avoid shadowing the builtin.
        search_input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#key')))
        search_input.clear()
        search_input.send_keys(keyword)

        search_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#search button.button')))
        search_button.click()
        time.sleep(2)  # let the first result page render

    # First jump straight to the bottom of the page ...
    browser.execute_script(
        'window.scrollTo(0, document.body.scrollHeight);')

    # ... then scroll back toward the top step by step, pausing so the
    # lazily loaded images actually get fetched.
    steps = 16
    for i in range(steps, 0, -1):
        browser.execute_script(
            'window.scrollTo(0, document.body.scrollHeight / %d * %d);'
            % (steps, i))
        time.sleep(1)

    html = browser.page_source

    # Locate the page-number input box at the bottom of the page and
    # scroll it into view before interacting with it.
    input_page = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#J_bottomPage input.input-txt')))
    browser.execute_script('window.scrollTo(0, %d);' % input_page.location['y'])

    # Type the next page number (send_keys expects a string) and jump,
    # so the next call starts from that page.
    input_page.clear()
    input_page.send_keys(str(page + 1))

    jump_button = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, '#J_bottomPage .btn.btn-default')))
    jump_button.click()
    # Without this pause the next get_page() call could scrape the page
    # source before the jump finishes loading.
    time.sleep(2)

    return html

# 解析页面
def parse_page(html):
    etree_html = etree.HTML(html)
    items = etree_html.xpath('//div[@id="J_goodsList"]/ul/li[@class="gl-item"]')
    for item in items:
        titles = item.xpath('.//div[@class="p-name p-name-type-2"]//em//text()')
        # print(titles)
        title = ''.join(titles)
        print(title)
        print('*' * 20)

        images = item.xpath('.//div[@class="p-img"]/a/img/@src')
        print(images)

def main():
    """Crawl jd.com search-result pages 1 through 100 and parse each one."""
    for page_no in range(1, 101):
        print(page_no)
        parse_page(get_page(page_no))

# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

相关文章

网友评论

      本文标题:day04 selenium自动搜索爬取jd的100页图片

      本文链接:https://www.haomeiwen.com/subject/awvhwqtx.html