美文网首页
day04 selenium自动搜索爬取jd的100页图片

day04 selenium自动搜索爬取jd的100页图片

作者: LittleBear_6c91 | 来源:发表于2019-04-12 20:38 被阅读0次

    实践告诉我爬取JD的图片必须先让界面滑到底部再向上慢慢滑动才能爬取所有图片!代码如下:

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver import ActionChains
    import time
    from lxml import etree
    
    # Configure Chrome launch options before creating the driver.
    chrome_options = webdriver.ChromeOptions()
    # Run without a visible browser window (headless mode).
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)
    
    # Shared explicit wait: polls the DOM for up to 5 seconds per condition.
    wait = WebDriverWait(browser, 5)
    
    def get_page(page):
        """Return the HTML of the current JD search-result page, then queue
        navigation to the next page.

        On the first call (page == 1) this opens jd.com and submits the
        search keyword.  Every call then scrolls to the bottom and slowly
        back up so JD's lazy-loaded product images are actually fetched,
        captures the page source, and finally types ``page + 1`` into the
        bottom page-number box and clicks "go" to move to the next page.

        :param page: 1-based number of the page whose HTML is returned.
        :return: the page source (str) of the *current* result page.

        NOTE(review): there is no explicit wait for the next page to finish
        loading after the final click — the scroll loop's sleeps on the
        following call appear to absorb that; confirm before tightening.
        """
        if page == 1:
            # First call: open the JD home page and run the search.
            url = 'https://www.jd.com/'
            browser.get(url)
            # Renamed from `input` to avoid shadowing the builtin.
            search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#key')))
            search_box.clear()
            search_box.send_keys('机器人')

            button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#search button.button')))
            button.click()
            time.sleep(2)

        # Jump straight to the bottom so the page starts loading late content.
        str_js = 'var scrollHeight = document.body.scrollHeight;  window.scrollTo(0, scrollHeight);'
        browser.execute_script(str_js)

        # Then scroll back toward the top in steps, pausing each step so the
        # lazy-loaded images in every viewport actually get fetched.
        steps = 16
        for i in range(steps, 0, -1):
            str_js = 'var scrollHeight = document.body.scrollHeight / %d;   window.scrollTo(0, scrollHeight * %d);' % (steps, i)
            browser.execute_script(str_js)
            time.sleep(1)

        # Snapshot the fully-loaded page before navigating away.
        html = browser.page_source

        # Locate the page-number input box at the bottom of the results.
        input_page = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_bottomPage input.input-txt')))
        loc = input_page.location
        # Scroll the input into view before interacting with it.
        str_js = 'window.scrollTo(0, %d);' % loc['y']
        browser.execute_script(str_js)

        # Type the next page number; send_keys expects text, so convert
        # explicitly rather than relying on selenium's int coercion.
        input_page.clear()
        input_page.send_keys(str(page + 1))

        # Click the "go to page" button to start loading the next page.
        button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_bottomPage .btn.btn-default')))
        button.click()

        return html
    
    # Parse one search-result page: print each item's title and image URLs.
    def parse_page(html):
        """Extract every product on the page and print its title, a
        separator line, and the list of image ``src`` attributes."""
        doc = etree.HTML(html)
        goods = doc.xpath('//div[@id="J_goodsList"]/ul/li[@class="gl-item"]')
        for good in goods:
            # Title text is split across <em> children; join the fragments.
            name_parts = good.xpath('.//div[@class="p-name p-name-type-2"]//em//text()')
            print(''.join(name_parts))
            print('*' * 20)
            # Image URLs for this product (may be empty if not yet loaded).
            print(good.xpath('.//div[@class="p-img"]/a/img/@src'))
    
    def main():
        """Crawl 100 pages of JD search results, printing each page number,
        the product titles, and their image URLs."""
        try:
            for page in range(100):
                print(page + 1)
                html = get_page(page + 1)
                parse_page(html)
        finally:
            # Always shut down the headless Chrome process, even when a page
            # fails mid-crawl — otherwise the browser instance leaks.
            browser.quit()
    
    if __name__ == '__main__':
        main()
    

    相关文章

      网友评论

          本文标题:day04 selenium自动搜索爬取jd的100页图片

          本文链接:https://www.haomeiwen.com/subject/awvhwqtx.html