Python Web Scraping in Practice, Part 9: Advanced Selenium Operations and Scraping JD.com


Author: 27efec53a72d | Published 2018-08-13 00:27

    Series index: Python Web Scraping in Practice


    I. Advanced Selenium Operations

    1. Recap: opening a browser with a visible window in Selenium

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    
    browser = webdriver.Chrome()
    try:
        browser.get('https://www.baidu.com')
        input = browser.find_element_by_id('kw')  # locate the search box
        input.send_keys('Python')  # type the query
        input.send_keys(Keys.ENTER)  # submit with the Enter key
        wait = WebDriverWait(browser, 10)  # explicit wait of up to 10 seconds
        wait.until(EC.presence_of_element_located((By.ID, 'content_left')))  # block until the results container exists
        print(browser.current_url)
        print(browser.get_cookies())
        print(browser.page_source)
    finally:
        browser.close()
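
    Note: browser.close() only closes the current window. To end the whole session and shut down the chromedriver process, call browser.quit() instead.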
    

    2. Recap: opening a headless browser in Selenium

    from selenium import webdriver
    
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # enable headless mode
    chrome_options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(chrome_options=chrome_options)
    driver.get('https://www.baidu.com')
    print(driver.page_source)
    driver.close()
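
    Headless Chrome starts with a small default window, which can make pages render differently from a visible browser. A minimal sketch of a common tweak, assuming a desktop-sized viewport helps (the --window-size value is an assumption, not from the original article):

    from selenium import webdriver

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')  # assumed viewport size
    driver = webdriver.Chrome(chrome_options=chrome_options)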
    

    3. Page interaction: simulating a manual search on Taobao

    from selenium import webdriver
    import time
    
    browser = webdriver.Chrome()
    browser.get("http://www.taobao.com")
    input_str = browser.find_element_by_id('q')  # locate the search box
    input_str.send_keys("ipad")  # type a first keyword
    time.sleep(1)
    input_str.clear()  # clear the box and type a different keyword
    input_str.send_keys("macBook pro")
    button = browser.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]')  # the search button
    button.click()
    

    4. Action chains: simulating a manual drag-and-drop of a page element

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    
    browser = webdriver.Chrome()
    url = "http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable"
    browser.get(url)
    browser.switch_to.frame('iframeResult')
    source = browser.find_element_by_css_selector('#draggable')
    target = browser.find_element_by_css_selector('#droppable')
    actions = ActionChains(browser)
    actions.drag_and_drop(source, target)
    actions.perform()
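
    drag_and_drop is a convenience shorthand; the same gesture can be composed from primitive actions when the drag needs intermediate steps. A minimal sketch using the same source and target elements as above:

    actions = ActionChains(browser)
    actions.click_and_hold(source)   # press and hold on the draggable element
    actions.move_to_element(target)  # move the pointer onto the drop zone
    actions.release()                # let go of the mouse button
    actions.perform()                # execute the queued actions in order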
    

    5. Executing JavaScript: simulating scrolling to the bottom of a Zhihu page

    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get("http://www.zhihu.com/explore")
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    browser.execute_script('alert("To Bottom")')
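
    execute_script can also return values from the page, which is useful for checks such as whether scrolling has loaded new content. A minimal sketch:

    height = browser.execute_script('return document.body.scrollHeight')  # current page height in pixels
    print(height)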
    

    6. Getting node information: the "提问" (Ask a Question) button on the Zhihu homepage

    from selenium import webdriver
    
    browser = webdriver.Chrome()
    url = 'https://www.zhihu.com/explore'
    browser.get(url)
    input = browser.find_element_by_class_name('zu-top-add-question')
    print(input.id)        # WebDriver's internal id for the element
    print(input.location)  # x/y coordinates on the page
    print(input.tag_name)  # HTML tag name
    print(input.size)      # width and height in pixels
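
    Two more commonly used accessors on the same element: get_attribute reads any HTML attribute, and text returns the node's visible text.

    print(input.get_attribute('class'))  # value of the class attribute
    print(input.text)                    # visible text of the button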
    

    7. Implicit waits

    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.implicitly_wait(10)
    browser.get('https://www.zhihu.com/explore')
    input = browser.find_element_by_class_name('zu-top-add-question')
    print(input)
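
    An implicit wait is set once per session and applies to every subsequent find_element call: if a node is not immediately present, the driver keeps polling the DOM for up to the given number of seconds before raising NoSuchElementException.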
    

    8. Explicit waits

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com/')
    wait = WebDriverWait(browser, 10)
    input = wait.until(EC.presence_of_element_located((By.ID, 'q')))
    button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.btn-search')))
    print(input, button)
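
    expected_conditions ships many ready-made predicates besides the two used here, including title_is, title_contains, visibility_of_element_located, text_to_be_present_in_element, and alert_is_present; the wait polls the condition until it returns a truthy value or times out with a TimeoutException.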
    

    9. Switching frames

    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    
    browser = webdriver.Chrome()
    url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
    browser.get(url)
    browser.switch_to.frame('iframeResult')  # enter the child iframe
    try:
        logo = browser.find_element_by_class_name('logo')
    except NoSuchElementException:
        print('NO LOGO in iframeResult')
    browser.switch_to.parent_frame()  # climb back to the parent frame
    try:
        logo = browser.find_element_by_class_name('logo')
        print(logo)
        print(logo.text)
    except NoSuchElementException:
        print('NO LOGO in parent_frame')
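
    switch_to.parent_frame() climbs one level up; to jump straight back to the top-level document from a deeply nested frame, use browser.switch_to.default_content().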
    

    10. Back and forward navigation

    from selenium import webdriver
    import time
    
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com/')
    browser.get('https://www.taobao.com/')
    browser.get('https://www.zhihu.com/')
    browser.back()
    time.sleep(1)
    browser.forward()
    browser.close()
    

    11. Working with browser tabs

    from selenium import webdriver
    import time
    
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com')
    browser.execute_script('window.open()')  # open a new blank tab via JavaScript
    print(browser.window_handles)  # handles for all open tabs
    browser.switch_to.window(browser.window_handles[1])  # switch focus to the new tab
    browser.get('https://www.taobao.com')
    time.sleep(1)
    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.zhihu.com/')
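
    Closing a tab is the mirror image: close() only affects the current tab, so switch back to a surviving handle afterwards. A minimal sketch continuing the session above:

    browser.switch_to.window(browser.window_handles[1])  # focus the second tab
    browser.close()                                      # close it; the session itself stays alive
    browser.switch_to.window(browser.window_handles[0])  # return to the first tab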
    

    II. Scraping JD.com Product Reviews

    1. Scraping JD.com product reviews with a visible browser

    from selenium import webdriver
    from urllib.parse import quote
    
    driver = webdriver.Chrome()  # launch the browser
    key = '红酒'  # keyword to search for (red wine)
    url = 'https://search.jd.com/Search?keyword=' + quote(key) + '&enc=utf-8'  # build the search URL
    driver.get(url)  # open it
    driver.implicitly_wait(3)  # implicit wait
    links = driver.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li/div/div[3]/a')  # collect product links on the results page
    urls = [l.get_attribute('href') for l in links]
    url = urls[1]  # take the product link at index 1
    driver.get(url)  # open the product page
    driver.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]').click()  # click the reviews tab
    # extract the review texts
    comment_list = driver.find_elements_by_xpath('//*[@id="comment-0"]//div/div[2]/p')
    comment_text_list = [c.text for c in comment_list]
    driver.find_element_by_link_text('下一页').click()  # click through to the next page of reviews
    driver.close()
    

    2. Scraping JD.com product reviews with a headless browser

    from selenium import webdriver
    from urllib.parse import quote
    
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(chrome_options=chrome_options)
    key = '红酒'  # keyword to search for (red wine)
    url = 'https://search.jd.com/Search?keyword=' + quote(key) + '&enc=utf-8'  # build the search URL
    driver.get(url)  # open it
    driver.implicitly_wait(3)  # implicit wait
    links = driver.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li/div/div[3]/a')  # collect product links on the results page
    urls = [l.get_attribute('href') for l in links]
    url = urls[1]  # take the product link at index 1
    driver.get(url)  # open the product page
    driver.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]').click()  # click the reviews tab
    # extract the review texts
    comment_list = driver.find_elements_by_xpath('//*[@id="comment-0"]//div/div[2]/p')
    comment_text_list = [c.text for c in comment_list]
    # driver.find_element_by_link_text('下一页').click()  # TODO fails in headless mode with: Message: no such element: Unable to locate element: {"method":"link text","selector":"下一页"}
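
    The '下一页' link most likely cannot be found in headless mode because the review area renders lazily and the small default viewport never triggers it. A hedged sketch of two mitigations, not verified against the original run: enlarge the headless window, then scroll and wait explicitly before clicking.

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    # Assumed mitigation 1: create the driver with a desktop-sized viewport
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')
    driver = webdriver.Chrome(chrome_options=chrome_options)
    # ... open the product page and click the reviews tab as above, then:
    # Assumed mitigation 2: scroll to force lazy content, wait for the link
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    next_link = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.LINK_TEXT, '下一页')))
    next_link.click()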
    
    

    3. Scraping JD.com product reviews, with the logic wrapped in functions

    from selenium import webdriver
    from urllib.parse import quote
    import pandas as pd
    from selenium.common.exceptions import StaleElementReferenceException
    
    
    def get_page_comment(driver):
        # Collect the review texts on the current page, retrying once if the
        # DOM was refreshed mid-read and the elements went stale.
        try:
            content = driver.find_elements_by_xpath('//*[@id="comment-0"]//div/div[2]/p')
            content_list = [c.text for c in content]
        except StaleElementReferenceException as msg:
            print("get_page_comment raised: %s" % msg)
            print("retrying get_page_comment")
            content = driver.find_elements_by_xpath('//*[@id="comment-0"]//div/div[2]/p')
            content_list = [c.text for c in content]
        return content_list
    
    
    def get_page_all_comment(driver, i):
        all_content = get_page_comment(driver)
        while True:
            try:
                driver.find_element_by_link_text('下一页').click()
                all_content = all_content + get_page_comment(driver)
            except Exception:
                print("no more pages - " + str(i))  # TODO the next-page click can fail for other reasons too; a more robust variant is sketched after this listing
                break
        return all_content
    
    
    def get_all_comment(urls, driver, outpath='D:/DataguruPyhton/PythonSpider/images/'):
        i = 0
        for url in urls:
            i += 1
            driver.get(url)
            driver.find_element_by_xpath('//*[@id="detail"]/div[1]/ul/li[5]').click()  # click the reviews tab
            name = driver.find_element_by_xpath('/html/body/div[8]/div/div[2]/div[1]').text  # product name
            print("file %d - %s" % (i, name))
            comment = get_page_all_comment(driver, i)
            comment = pd.DataFrame(comment)
            comment.to_csv(outpath + str(i) + '.csv')  # one CSV of reviews per product
        return None
    
    
    def get_links(key, driver):
        url = 'https://search.jd.com/Search?keyword=' + quote(key) + '&enc=utf-8'  # build the search URL
        driver.get(url)  # open it
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')  # scroll to the bottom so lazily loaded items render
        driver.implicitly_wait(3)  # implicit wait
        links = driver.find_elements_by_xpath('//*[@id="J_goodsList"]/ul/li/div/div[1]/a')  # collect product links on the results page
        urls = [l.get_attribute('href') for l in links]
        return urls
    
    
    def main(key):
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')  # enable headless mode
        driver = webdriver.Chrome(chrome_options=chrome_options)
        urls = get_links(key, driver)
        get_all_comment(urls, driver, outpath='D:/DataguruPyhton/PythonSpider/images/')
    
    
    main('红酒')
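
    The pagination loop above treats any exception as "no more pages", which hides real failures. A hedged sketch of a more robust variant, reusing get_page_comment from above and assuming the same '下一页' link text: wait until the link is clickable and treat only a timeout as the end of the reviews.

    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC


    def get_page_all_comment_v2(driver, i, timeout=5):
        all_content = get_page_comment(driver)
        wait = WebDriverWait(driver, timeout)
        while True:
            try:
                next_link = wait.until(
                    EC.element_to_be_clickable((By.LINK_TEXT, '下一页')))
                next_link.click()
                all_content = all_content + get_page_comment(driver)
            except TimeoutException:
                print("no more pages - " + str(i))
                break
        return all_content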
    

    III. Runtime Environment for the Code in This Article

    • python 3.6.4
    • selenium 3.8.0
    • Google Chrome 68.0.3440.106 (official build, 64-bit)
    • chromedriver.exe
