美文网首页我爱编程
python Selenium 借助浏览器抓包

python Selenium 借助浏览器抓包

作者: proud2008 | 来源:发表于2018-03-01 15:51 被阅读1150次

    安装
    pip install selenium

    # Minimal example: launch a Chrome browser via Selenium and load a page.
    from selenium import webdriver
    # Requires chromedriver to be available on PATH (see the troubleshooting
    # section further down).
    browser = webdriver.Chrome()
    browser.get('https://qiang.taobao.com/')
    
    
    # Demo script: drive a real Chrome browser through a Baidu search and
    # scrape the titles/links of the result entries.
    from selenium import webdriver
    from selenium.webdriver.common.by import By


    # Create the browser object (a real Chrome window; the original comment
    # mentioned PhantomJS, but the code actually uses Chrome).
    browser = webdriver.Chrome()

    # The URL to visit.
    url = 'https://www.baidu.com'

    # Load the page.
    browser.get(url)

    # Implicit wait so JS-rendered elements have time to appear.
    browser.implicitly_wait(3)

    # Locate the search box. Selenium 4 removed find_element_by_id();
    # find_element(By.ID, ...) is the supported API.
    text = browser.find_element(By.ID, 'kw')

    # Clear any pre-filled text in the search box.
    text.clear()

    # Type the search query.
    text.send_keys('python')

    # Locate the submit button.
    button = browser.find_element(By.ID, 'su')

    # Submit the search form.
    button.submit()


    # Print the current page title.
    print(browser.title)

    # Save a screenshot of the rendered page.
    browser.save_screenshot('text.png')

    # Collect the result elements into a list.
    results = browser.find_elements(By.CLASS_NAME, 't')

    # Print each result's title and hyperlink.
    for result in results:
        print('标题:{} 超链接:{}'.format(result.text,
                                    result.find_element(By.TAG_NAME, 'a').get_attribute('href')))
    
    

    问题
    1、Error message: “'chromedriver' executable needs to be available in the path”
    https://sites.google.com/a/chromium.org/chromedriver/downloads下载chromedriver
    将chromedriver.exe 放到python脚本所在的文件夹下面
    或者webdriver.Chrome() 参数中指定全路径

    文档

    http://selenium-python-zh.readthedocs.io/en/latest/page-objects.html

    与scrapy配合使用
    https://github.com/clemfromspace/scrapy-selenium

    """This module contains the ``SeleniumMiddleware`` scrapy middleware"""
    
    from importlib import import_module
    from scrapy import signals
    from scrapy.exceptions import NotConfigured
    from scrapy.http import HtmlResponse
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    """This module contains the ``SeleniumRequest`` class"""
    
    from scrapy import Request
    
    
    class SeleniumRequest(Request):
        """Scrapy ``Request`` subclass providing additional arguments"""

        def __init__(self, url, wait_time=None, wait_until=None, screenshot=False, *args, **kwargs):
            """Initialize a new selenium request

            Parameters
            ----------
            url: str
                The url to request.
            wait_time: int
                The number of seconds to wait.
            wait_until: method
                One of the "selenium.webdriver.support.expected_conditions". The response
                will not be returned until the given condition is fulfilled.
            screenshot: bool
                If True, a screenshot of the page will be taken and the data of the screenshot
                will be returned in the response "meta" attribute.

            """

            # Store the selenium-specific options; SeleniumMiddleware reads
            # them in process_request.
            self.wait_time = wait_time
            self.wait_until = wait_until
            self.screenshot = screenshot

            super().__init__(url, *args, **kwargs)
    
    
    
    
    class SeleniumMiddleware:
        """Scrapy downloader middleware that fetches ``SeleniumRequest``s with
        a real Chrome browser instead of scrapy's default downloader."""

        def __init__(self):
            # One shared Chrome instance for the whole crawl; shut down in
            # spider_closed.
            self.driver = webdriver.Chrome()

        @classmethod
        def from_crawler(cls, crawler):
            """Initialize the middleware with the crawler settings"""
            middleware = cls()
            # Ensure the browser is closed when the spider finishes.
            crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
            return middleware

        def process_request(self, request, spider):
            """Process a request using the selenium driver if applicable"""

            if not isinstance(request, SeleniumRequest):
                # BUG FIX: the original returned ``request`` here. In scrapy,
                # returning a Request from process_request re-schedules it
                # through the middleware chain, looping forever on non-selenium
                # requests. Returning None lets normal downloading proceed.
                return None

            self.driver.get(request.url)

            # Replay the request's cookies inside the browser session.
            for cookie_name, cookie_value in request.cookies.items():
                self.driver.add_cookie(
                    {
                        'name': cookie_name,
                        'value': cookie_value
                    }
                )

            if request.wait_until:
                # NOTE(review): assumes wait_time is set whenever wait_until
                # is used; WebDriverWait(driver, None) would fail — confirm
                # callers always pass both.
                WebDriverWait(self.driver, request.wait_time).until(
                    request.wait_until
                )

            if request.screenshot:
                # PNG bytes of the rendered page, exposed to the spider
                # through response.meta['screenshot'].
                request.meta['screenshot'] = self.driver.get_screenshot_as_png()

            body = str.encode(self.driver.page_source)

            # Expose the driver via the "meta" attribute so the spider can
            # keep interacting with the live browser if needed.
            request.meta.update({'driver': self.driver})

            # Returning a Response short-circuits the download: scrapy uses
            # the browser-rendered HTML instead of fetching the url itself.
            return HtmlResponse(
                self.driver.current_url,
                body=body,
                encoding='utf-8',
                request=request
            )

        def spider_closed(self):
            """Shutdown the driver when spider is closed"""

            self.driver.quit()
    
    

    在spider中用 return SeleniumRequest(url) 发起请求,由该中间件接管下载
    settings.py DOWNLOADER_MIDDLEWARES中添加 该SeleniumMiddleware

    相关文章

      网友评论

        本文标题:python Selenium 借助浏览器抓包

        本文链接:https://www.haomeiwen.com/subject/fhjgxftx.html