美文网首页我爱编程
动态爬虫之QQ空间登录

动态爬虫之QQ空间登录

作者: deadcalm | 来源:发表于2017-07-24 17:57 被阅读0次

    准备:

    1、intellij idea
    2、python
    3、selenium
    4、phantomJs

    1、分析Qzone Html页面

    打开手机版qzone https://mobile.qzone.qq.com

    qzone_openhtml.jpg

    <span style="color:red">按照上面流程复制账号、密码和登录按钮的 XPath,粘贴到记事本中</span>

    2、超链

    1、构建浏览器并且设置请求头
    2、开始请求
    3、模仿用户输入
    4、输入验证码
    5、自动登录
    6、完整代码

    <a id='1'></a>

    3、编写爬虫代码

    首先创建一个浏览器对象和设置请求头

    # 导入驱动包
    from selenium import webdriver
    from selenium.webdriver import DesiredCapabilities
    
    class qzone_dlewares(object):
        """PhantomJS browser session prepared for the mobile Qzone site.

        The class-level ``headers`` are registered as
        ``phantomjs.page.customHeaders.*`` capabilities when the driver starts.
        """

        # Request headers handed to PhantomJS (the user agent is a mobile one).
        headers = {
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'User-Agent': 'Mozilla/5.0 (Linux; U; Android 2.3.6; zh-cn; GT-S5660 Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MicroMessenger/4.5.255',
            'Connection': 'keep-alive',
        }

        def __init__(self, userName='', password='', *args, **kwargs):
            """Remember the credentials and start the PhantomJS driver.

            Extra positional and keyword arguments are accepted but not used.
            """
            self.userName = userName
            self.password = password

            # Start from a copy of the stock PhantomJS capabilities and add
            # one customHeaders entry per request header.
            capabilities = DesiredCapabilities.PHANTOMJS.copy()
            capabilities.update(
                ('phantomjs.page.customHeaders.' + name, content)
                for name, content in self.headers.items()
            )
            self.driver = webdriver.PhantomJS(desired_capabilities=capabilities)
            # Set the browser window to 414 x 736.
            self.driver.set_window_size(414, 736)
    

    <a id='2' ></a>

    开始请求 截图

        def startQzoneRequest(self):
            """Load the mobile Qzone page and save a screenshot of it."""
            browser = self.driver
            # Request the mobile Qzone entry page.
            browser.get('https://mobile.qzone.qq.com')
            # Write the screenshot to screenshot.png (relative path).
            browser.save_screenshot('screenshot.png')
    

    截图成功会在 project 下面生成screenshot.png


    qzone_open_mobile.png

    <a id='3' ></a>

    模仿用户输入 登录关键性代码

    import time
    from selenium.webdriver import ActionChains
    
        def loginQzone(self):
            """Type the stored credentials into the login form and submit it.

            Looks up the account field (id ``u``), the password field (id ``p``)
            and the login button (id ``go``), replays the mouse/keyboard
            interaction as one action chain, waits one second and saves a
            screenshot to ``screenshotLoginQzone.png``.

            Raises whatever the driver raises when one of the three elements
            cannot be found.
            """
            u = self.driver.find_element_by_xpath('//*[@id="u"]')
            p = self.driver.find_element_by_xpath('//*[@id="p"]')
            go = self.driver.find_element_by_xpath('//*[@id="go"]')

            # Actions are only queued here; nothing happens until perform().
            action = ActionChains(self.driver)

            # Focus the account field and type the account.
            action.move_to_element(u)
            action.click(u)
            action.send_keys(self.userName)

            # Focus the password field and type the password.
            action.move_to_element(p)
            action.click(p)
            action.send_keys(self.password)

            # Hover and click the login button. move_by_offset() is relative
            # to the current pointer position, so passing the button's absolute
            # page coordinates pointed somewhere unrelated; target the element.
            action.move_to_element(go)
            action.click(go)

            # Replay the queued interaction.
            action.perform()
            # Sleep one second so the submit has time to take effect.
            time.sleep(1)
            # Screenshot is written relative to the working directory.
            self.driver.save_screenshot('screenshotLoginQzone.png')
    

    登录代码就写完了现在开始写个测试代码

    if __name__ == '__main__':
        # Ask for the credentials before any timer starts.
        userName = input("账号:")
        password = input("密码:")

        oldTime = time.time()
        browser = qzone_dlewares(userName=userName, password=password)
        initTime = time.time()

        # Load the page and take the first screenshot.
        browser.startQzoneRequest()
        requestTime = time.time()

        # Fill in and submit the login form.
        browser.loginQzone()
        currentTime = time.time()

        # Timing report: each template is printed with its value, in order.
        for template, value in (
            ('开始时间 %f', oldTime),
            ('结束时间 %f', currentTime),
            ('初始化时间 %f', initTime - oldTime),
            ('加载页面时间 %f', requestTime - initTime),
            ('模仿操作时间 %f', currentTime - requestTime),
            ('总运行时间 %f', currentTime - oldTime),
        ):
            print(template % value)
    

    运行测试结果

    qzon_runtime_date.png qzon_screenshot_verify_login_success.jpg

    运行几遍后发现每次都要登录一遍,然后腾讯验证码也出来了。。。
    先把验证码这块给处理了

    按照开始寻找图片的方法把验证码图片、验证码输入框、按钮找出来

    from selenium.webdriver import ActionChains
    def check_code(self):
        """Ask the user for the captcha text and submit it on the page.

        Looks up the captcha image (id ``cap_que_img``), its input box
        (id ``cap_input``) and the confirm button (id ``verify_btn``), saves
        the image through ``save_verify_code`` and prompts on stdin for the
        answer.
        """
        que_code = self.driver.find_element_by_xpath('//*[@id="cap_que_img"]')
        que_input = self.driver.find_element_by_xpath('//*[@id="cap_input"]')
        que_but = self.driver.find_element_by_xpath('//*[@id="verify_btn"]')

        # Save the captcha image so the user can look at it.
        self.save_verify_code(que_code)
        # Read the captcha answer from the user.
        input_verify_code = input("验证码:")

        # Queue the interaction: focus the input box, type, click confirm.
        action = ActionChains(self.driver)
        action.move_to_element(que_input)
        action.click()
        # Type the answer read above. The previous code called input() again
        # here, which prompted a second time and dropped the first answer.
        action.send_keys(input_verify_code)
        action.move_to_element(que_but)
        action.click()
        # Replay the queued interaction.
        action.perform()
    

    保存验证码

    import urllib
    def save_verify_code(self, element):
        """Download the image referenced by *element* to ``<element id>.jpg``.

        The URL is read from the element's ``src`` attribute and the file name
        is the element's ``id`` attribute with ``.jpg`` appended.
        """
        # ``import urllib`` alone does not load the ``request`` submodule, so
        # ``urllib.request`` is only reachable after importing it explicitly.
        import urllib.request

        url = element.get_attribute('src')
        fileName = element.get_attribute('id') + '.jpg'
        urllib.request.urlretrieve(url, fileName)
    

    运行测试,发现以下错误<p style="color:red;">Traceback (most recent call last):
    File "C:/Users/user/IdeaProjects/untitled/untitled/qzone.py", line 108, in <module>
    browser.check_code2()
    File "C:/Users/user/IdeaProjects/untitled/untitled/qzone.py", line 67, in check_code2
    que_code = self.driver.find_element_by_xpath('//[@id="cap_que_img"]')
    File "D:\python\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 313, in find_element_by_xpath
    return self.find_element(by=By.XPATH, value=xpath)
    File "D:\python\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 791, in find_element
    'value': value})['value']
    File "D:\python\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 256, in execute
    self.error_handler.check_response(response)
    File "D:\python\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 194, in check_response
    raise exception_class(message, screen, stacktrace)
    selenium.common.exceptions.NoSuchElementException: Message: {"errorMessage":"Unable to find element with xpath '//
    [@id="cap_que_img"]'","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"108","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:53613","User-Agent":"Python http auth"},"httpVersion":"1.1","method":"POST","post":"{"using": "xpath", "value": "//*[@id=\"cap_que_img\"]", "sessionId": "737e6b90-6929-11e7-8958-3b746283f061"}","url":"/element","urlParsed":{"anchor":"","query":"","file":"element","directory":"/","path":"/element","relative":"/element","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/element","queryKey":{},"chunks":["element"]},"urlOriginal":"/session/737e6b90-6929-11e7-8958-3b746283f061/element"}}
    Screenshot: available via screen</p>

    报错说没有找到这个节点,后来分析源码发现它是包在 iframe 中的。既然包在 iframe 里面,那我们切换到这个 iframe 就好了

    # 校验码
    from selenium.common.exceptions import NoSuchElementException
    
    def check_code(self):
        """Enter the captcha iframe when there is one and handle the captcha.

        Prints a notice and returns when the iframe is not on the page.
        """
        # Go back to the top-level document before searching.
        self.driver.switch_to.default_content()
        try:
            captcha_frame = self.driver.find_element_by_xpath('//*[@id="new_vcode"]/iframe[2]')
        except NoSuchElementException:
            # No captcha iframe was found, so there is nothing to type.
            print('无需输入验证码')
            return
        # The captcha elements are looked up inside this frame.
        self.driver.switch_to.frame(captcha_frame)
        self.verify_code()
    

    手动输入验证码,暂时还不能自动输入验证码 并且也不能验证验证码是否错误或者切换

    # 验证码
    def verify_code(self):
        """Show the captcha image to the user and submit the typed answer.

        Looks up the captcha image (id ``cap_que_img``), its input box
        (id ``cap_input``) and the confirm button (id ``verify_btn``) in the
        driver's current frame, saves the image through ``save_verify_code``,
        opens it with the platform's viewer and prompts on stdin for the text.
        """
        # ``os`` has no ``subprocess`` attribute; the module must be imported
        # on its own or the non-Windows branch fails with AttributeError.
        import subprocess

        que_code = self.driver.find_element_by_xpath('//*[@id="cap_que_img"]')
        que_input = self.driver.find_element_by_xpath('//*[@id="cap_input"]')
        que_but = self.driver.find_element_by_xpath('//*[@id="verify_btn"]')

        # Save the captcha image; the path below mirrors the name that
        # save_verify_code builds from the element id.
        self.save_verify_code(que_code)
        verify_path = que_code.get_attribute('id') + '.jpg'
        # Open the saved image so the user can read the captcha.
        if self.isWindows():
            os.startfile(verify_path)
        else:
            subprocess.call(["xdg-open", verify_path])

        input_verify_code = input("验证码:")
        # Queue the interaction: focus the input box, type, click confirm.
        action = ActionChains(self.driver)
        action.move_to_element(que_input)
        action.click()
        action.send_keys(input_verify_code)
        action.move_to_element(que_but)
        action.click()
        # Replay the queued interaction.
        action.perform()
    

    完美运行登录成功

    qzon_screenshot_verify_code.jpg qzon_screenshot_verify_login_success.jpg

    <a id='5'></a>
    <p></p>
    每次运行都需要手动登录太麻烦了 qzone 保存cookies 好像可以不需要手动登录了

    # 保存登录 cookies
        def save_cookies(self):
            """Pickle the driver's current cookies to the file named by hashCode()."""
            # Fetch the cookies before opening the file: 'wb' truncates on
            # open, so a failing get_cookies() would otherwise wipe the
            # cookies saved by an earlier run.
            cookies = self.driver.get_cookies()
            # The with-block closes the file; no explicit close() is needed.
            with open(self.hashCode(), 'wb') as f:
                pickle.dump(cookies, f)
    
        # 读取并设置 cookies
        def load_cookies(self):
            """Add the cookies pickled in the hashCode() file to the driver.

            Does nothing when ``file_exists`` reports the file as missing.
            An error raised while adding a cookie is printed and stops the
            remaining cookies from being added.
            """
            fileName = self.hashCode()
            # Nothing to restore when the cookie file is not there.
            if not self.file_exists(fileName):
                return

            # NOTE(review): pickle.load runs code embedded in the file; this
            # is only safe while the file is produced by save_cookies() alone.
            # The with-block closes the handle even if unpickling fails.
            with open(fileName, 'rb') as f:
                obj = pickle.load(file=f)

            # Hand every stored cookie to the driver.
            try:
                for cookie in obj:
                    self.driver.add_cookie(cookie)
            except Exception as e:
                print(e)
    
        # hashCode
        def hashCode(self):
            """Return the SHA-1 hex digest of ``b'qzone_cookies'``.

            The value is constant; save_cookies() and load_cookies() use it as
            the cookie file name.
            """
            # Use the public hashlib API rather than the private ``_sha1``
            # extension module, which is a CPython implementation detail.
            import hashlib

            return hashlib.sha1(b'qzone_cookies').hexdigest()
    
            # 判断文件是否存在
    
        def file_exists(self, filename):
            """Return True when *filename* can be opened for reading, else False."""
            try:
                handle = open(filename)
            except IOError:
                # Covers a missing file as well as any other open failure.
                return False
            handle.close()
            return True
    

    测试代码

    if __name__ == '__main__':
        # Ask for the credentials before any timer starts.
        userName = input("账号:")
        password = input("密码:")

        oldTime = time.time()
        browser = qzone_dlewares(userName=userName, password=password)
        # Feed previously pickled cookies, if any, to the driver.
        browser.load_cookies()
        initTime = time.time()

        # Load the page and take the first screenshot.
        browser.startQzoneRequest()
        requestTime = time.time()

        # Go through the login form only when the page still shows it.
        logged_in = browser.isLogin()
        if not logged_in:
            browser.loginQzone()
            # Handle the captcha if one was shown.
            browser.check_code()
        currentTime = time.time()

        # Parse the feed, take a final screenshot, then persist the cookies.
        browser.paresHtml()
        browser.driver.save_screenshot('screenshotLoginQzoneSuccess.png')
        browser.save_cookies()

        # Timing report: each template is printed with its value, in order.
        for template, value in (
            ('开始时间 %f', oldTime),
            ('结束时间 %f', currentTime),
            ('初始化时间 %f', initTime - oldTime),
            ('加载页面时间 %f', requestTime - initTime),
            ('模仿操作时间 %f', currentTime - requestTime),
            ('总运行时间 %f', currentTime - oldTime),
        ):
            print(template % value)
    

    <a id='6'></a>

    qzone 登录自动登录完整代码

    import json
    import os
    import pickle
    import platform
    import time
    import urllib
    from _sha1 import sha1
    
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver import DesiredCapabilities, ActionChains
    
    
    class qzone_dlewares(object):
        """Drive a PhantomJS browser through the mobile Qzone login.

        The instance keeps the credentials and one PhantomJS driver. Cookies
        are pickled to a file in the working directory whose name comes from
        hashCode(), so a later run can hand them back to the driver.
        """

        # Request headers handed to PhantomJS (the user agent is a mobile one).
        headers = {'Accept': '*/*',
                   'Accept-Language': 'en-US,en;q=0.8',
                   'Cache-Control': 'max-age=0',
                   'User-Agent': 'Mozilla/5.0 (Linux; U; Android 2.3.6; zh-cn; GT-S5660 Build/GINGERBREAD) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1 MicroMessenger/4.5.255',
                   'Connection': 'keep-alive', }

        def __init__(self, userName='', password='', *args, **kwargs):
            """Remember the credentials and start the PhantomJS driver.

            Extra positional and keyword arguments are accepted but not used.
            """
            self.userName = userName
            self.password = password
            desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
            # One customHeaders capability per request header.
            for key, value in self.headers.items():
                desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
            # Do not load images.
            desired_capabilities["phantomjs.page.settings.loadImages"] = False
            self.driver = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
            # Set the browser window to 414 x 736.
            self.driver.set_window_size(414, 736)

        def startQzoneRequest(self):
            """Load the mobile Qzone page and save a screenshot of it."""
            self.driver.get('https://mobile.qzone.qq.com')
            # Screenshot is written relative to the working directory.
            self.driver.save_screenshot('screenshot.png')

        def isLogin(self):
            """Return True when the login form is absent from the current page.

            The page counts as "not logged in" only when the account field,
            the password field and the login button are all present.
            """
            try:
                for xpath in ('//*[@id="u"]', '//*[@id="p"]', '//*[@id="go"]'):
                    self.driver.find_element_by_xpath(xpath)
            except NoSuchElementException:
                return True
            return False

        def loginQzone(self):
            """Type the stored credentials into the login form and submit it.

            Clears both fields, replays the mouse/keyboard interaction as one
            action chain, waits one second and saves a screenshot to
            ``screenshotLoginQzone.png``.
            """
            u = self.driver.find_element_by_xpath('//*[@id="u"]')
            p = self.driver.find_element_by_xpath('//*[@id="p"]')
            go = self.driver.find_element_by_xpath('//*[@id="go"]')

            # Empty both fields before typing. The password field was
            # previously only clicked, which left its content in place.
            u.clear()
            p.clear()

            # Actions are only queued here; nothing happens until perform().
            action = ActionChains(self.driver)

            # Focus the account field and type the account.
            action.move_to_element(u)
            action.click(u)
            action.send_keys(self.userName)

            # Focus the password field and type the password.
            action.move_to_element(p)
            action.click(p)
            action.send_keys(self.password)

            # Hover and click the login button. move_by_offset() is relative
            # to the current pointer position, so passing the button's absolute
            # page coordinates pointed somewhere unrelated; target the element.
            action.move_to_element(go)
            action.click(go)

            # Replay the queued interaction.
            action.perform()
            # Sleep one second so the submit has time to take effect.
            time.sleep(1)
            self.driver.save_screenshot('screenshotLoginQzone.png')

        def save_verify_code(self, element):
            """Download the image referenced by *element* to ``<element id>.jpg``.

            The URL is read from the element's ``src`` attribute.
            """
            # ``import urllib`` alone does not load the ``request`` submodule.
            import urllib.request

            url = element.get_attribute('src')
            fileName = element.get_attribute('id') + '.jpg'
            urllib.request.urlretrieve(url, fileName)

        def check_code(self):
            """Enter the captcha iframe when there is one and handle the captcha.

            Prints a notice and returns when the iframe is not on the page.
            """
            # Go back to the top-level document before searching.
            self.driver.switch_to.default_content()
            try:
                iframe = self.driver.find_element_by_xpath('//*[@id="new_vcode"]/iframe[2]')
            except NoSuchElementException:
                print('无需输入验证码')
                return
            # The captcha elements are looked up inside this frame.
            self.driver.switch_to.frame(iframe)
            self.verify_code()

        def verify_code(self):
            """Show the captcha image to the user and submit the typed answer.

            Saves the image, opens it with the platform's viewer and prompts
            on stdin for the captcha text.
            """
            # ``os`` has no ``subprocess`` attribute; import the module itself.
            import subprocess

            que_code = self.driver.find_element_by_xpath('//*[@id="cap_que_img"]')
            que_input = self.driver.find_element_by_xpath('//*[@id="cap_input"]')
            que_but = self.driver.find_element_by_xpath('//*[@id="verify_btn"]')

            # Save the captcha image; the path mirrors save_verify_code's name.
            self.save_verify_code(que_code)
            verify_path = que_code.get_attribute('id') + '.jpg'
            # Open the saved image so the user can read the captcha.
            if self.isWindows():
                os.startfile(verify_path)
            else:
                subprocess.call(["xdg-open", verify_path])

            input_verify_code = input("验证码:")
            # Queue the interaction: focus the input box, type, click confirm.
            action = ActionChains(self.driver)
            action.move_to_element(que_input)
            action.click()
            action.send_keys(input_verify_code)
            action.move_to_element(que_but)
            action.click()
            # Replay the queued interaction.
            action.perform()

        def paresHtml(self):
            """Placeholder for parsing the feed; currently does nothing."""
            pass

        def isWindows(self):
            """Return True when platform.system() reports "Windows"."""
            return platform.system() == "Windows"

        def save_cookies(self):
            """Pickle the driver's current cookies to the file named by hashCode()."""
            # Fetch first: 'wb' truncates on open, so a failing get_cookies()
            # would otherwise wipe the cookies saved by an earlier run.
            cookies = self.driver.get_cookies()
            with open(self.hashCode(), 'wb') as f:
                pickle.dump(cookies, f)

        def load_cookies(self):
            """Add the cookies pickled in the hashCode() file to the driver.

            Does nothing when the file cannot be opened. An error raised while
            adding a cookie is printed and stops the remaining cookies from
            being added.
            """
            fileName = self.hashCode()
            if not self.file_exists(fileName):
                return

            # NOTE(review): pickle.load runs code embedded in the file; this
            # is only safe while the file is produced by save_cookies() alone.
            # The with-block closes the handle even if unpickling fails.
            with open(fileName, 'rb') as f:
                obj = pickle.load(file=f)

            try:
                for cookie in obj:
                    self.driver.add_cookie(cookie)
            except Exception as e:
                print(e)

        def hashCode(self):
            """Return the SHA-1 hex digest of ``b'qzone_cookies'`` (cookie file name)."""
            # Public hashlib API instead of the private ``_sha1`` module.
            import hashlib

            return hashlib.sha1(b'qzone_cookies').hexdigest()

        def file_exists(self, filename):
            """Return True when *filename* can be opened for reading, else False."""
            try:
                with open(filename):
                    return True
            except IOError:
                return False

        def __del__(self):
            """Quit the browser when the object is collected."""
            # ``driver`` is missing when __init__ failed before creating it.
            driver = getattr(self, 'driver', None)
            if driver is not None:
                driver.quit()
    
    
    if __name__ == '__main__':
        # Ask for the credentials before any timer starts.
        userName = input("账号:")
        password = input("密码:")

        oldTime = time.time()
        browser = qzone_dlewares(userName=userName, password=password)
        # Feed previously pickled cookies, if any, to the driver.
        browser.load_cookies()
        initTime = time.time()

        # Load the page and take the first screenshot.
        browser.startQzoneRequest()
        requestTime = time.time()

        # Go through the login form only when the page still shows it.
        logged_in = browser.isLogin()
        if not logged_in:
            browser.loginQzone()
            # Handle the captcha if one was shown.
            browser.check_code()
        currentTime = time.time()

        # Parse the feed, take a final screenshot, then persist the cookies.
        browser.paresHtml()
        browser.driver.save_screenshot('screenshotLoginQzoneSuccess.png')
        browser.save_cookies()

        # Timing report: each template is printed with its value, in order.
        for template, value in (
            ('开始时间 %f', oldTime),
            ('结束时间 %f', currentTime),
            ('初始化时间 %f', initTime - oldTime),
            ('加载页面时间 %f', requestTime - initTime),
            ('模仿操作时间 %f', currentTime - requestTime),
            ('总运行时间 %f', currentTime - oldTime),
        ):
            print(template % value)
    

    总结

    1、QQ空间登录其实可以使用 js 来模仿用户操作直接输入,代码量也很少
    2、另外这里还有一个写入 cookies 的 bug:因为作用域不对会报错

    我只是一只小菜鸟,如果你看到代码有错误的地方请提出来

    相关文章

      网友评论

        本文标题:动态爬虫之QQ空间登录

        本文链接:https://www.haomeiwen.com/subject/iqobkxtx.html