爬虫自动化

作者: 财务自由_lang | 来源:发表于2019-02-22 10:44 被阅读0次

自动化对比

模块	UIAutomation	appium	selenium	pyppeteer	终极武器(chrome-devtools-protocol)
跨平台	否	Android，ios	是	chrome 浏览器就行	是
缺点	无法跨平台	adb驱动	太强大	chrome 协议	底层
driver	google原生	UIAutomation	#很多	chrome	all

很多：

selenium可支持的PC浏览器驱动包括：

safari driver【包含在selenium server中】

ie driver

chrome driver 【第三方】

opera driver【第三方】
selenium可支持的伪浏览器驱动：

PhantomJS Driver【第三方,停止更新】

HtmlUnit Driver【包含在selenium server中】
selenium可支持的移动端驱动：

Windows Phone driver

Selendroid-Selenium for Android【第三方】

ios-driver【第三方】

Appium 支持iphone、ipad、android、FirefoxOS【第三方】

驱动代码

UIAutomation

python 端安装 uiautomator2 client：
pip install uiautomator2
Android端安装 ATX server
安装方法
离线
    将init_machine.sh,及android_package.zip上传至/data/local/tmp目录下
    给手机安装termux，此为命令行工具，打开软件
    输入su,切换至管理员账户
    输入cd data/local/tmp,因为云手机不能打/符，所以用tab键来补齐。
    输入sh init_machine.sh,即可安装好软件和服务
    
usb线连
    在电脑上输入python -m uiautomator2 init即可
    ### 2、打开ATX
    打开ATX 点击"启动uiautomator"
    ### 3、打开QpythonL
    一定要先打开这个软件，不然你python脚本会上传不成功

import uiautomator2 as u2

# 0.0.0.0 is the local host (127.0.0.1 works as well); this form is for a script running on the phone itself.
d = u2.connect('http://0.0.0.0:7912')
# For remote control, use the IP address that ATX displays on the phone instead, e.g.:
# d = u2.connect('http://172.17.2.237:7912')
# The string is the package name of the app to drive.
sess = d.session("com.tencent.wework")

def my_click(sess, ele, timeout=3):
    """Tap the centre of ``ele`` using an explicit touch-down / touch-up pair.

    Args:
        sess: Session object; its ``touch.down(x, y)`` and ``touch.up()`` are called.
        ele: UI selector exposing ``wait(timeout=...)`` and ``center()``.
        timeout: Value passed to ``ele.wait`` (default 3).
    """
    # Local import: the surrounding script never imports ``time``.
    import time

    # Wait first: the control may not be on screen yet (UI animation, slow response).
    ele.wait(timeout=timeout)
    x, y = ele.center()  # centre coordinates of the control
    sess.touch.down(x, y)  # press at that point
    time.sleep(0.3)  # hold the press for 0.3 seconds
    sess.touch.up()  # release


def click_search_btn(sess):
    """Find the search control in ``sess`` and tap it via ``my_click``.

    The control is selected by resource id and class name; ``my_click`` is
    called with a timeout of 20.
    """
    selector = {
        "resourceId": package_name + ":id/e3g",  # resource id of the control
        "className": "android.widget.TextView",  # widget class of the control
        "instance": 0,  # if several controls match, take the first one
    }
    search_button = sess(**selector)
    my_click(sess, search_button, 20)


# Open the search box, type a phone number, run the search and read a result label.
# NOTE(review): `phone` is not defined and `time` is not imported anywhere in the
# visible script — confirm they are provided elsewhere before running.
click_search_btn(sess)
print('begin to input')
ele = sess(resourceId="com.tencent.wework:id/dpu", className="android.widget.EditText")
ele.set_text(phone) # fill in the phone number
print('end to input')
time.sleep(2)
# Tap "search".
sess(resourceId="com.tencent.wework:id/azq",
           className="android.widget.RelativeLayout").child(className="android.widget.TextView").click() # the target control has no id, so locate its parent first, then the child TextView; .click() triggers the tap directly
ele1 = sess(resourceId="com.tencent.wework:id/sa",
               className="android.widget.TextView")
ele1.get_text() # read the label text (the returned value is not stored here)

Selenium

安装：
pip install selenium
自行下载需要驱动的driver：如chromedriver
使用 executable_path 引入driver路径

from selenium import webdriver
import time
 
# Driver names; each one matches the suffix of a run_with_<name>() function below.
drivers = ['HtmlUnit', 'PhantomJS', 'Chrome', 'FF', 'IE']

# Per-driver value, initialised to 0 for every name in `drivers`.
# NOTE(review): the name looks like a typo of "drivers_time"; presumably meant to
# accumulate elapsed time per driver — nothing in the visible code updates it.
dervers_time = {
    'HtmlUnit' : 0,
    'PhantomJS' : 0,
    'Chrome' : 0,
    'FF' : 0,
    'IE' : 0,
}
# NOTE(review): presumably the number of repetitions per driver; unused in the visible code.
times = 50
def run_with_Chrome():
    """Run ``common_step`` against a newly created Chrome WebDriver."""
    chrome_browser = webdriver.Chrome()
    common_step(chrome_browser)
 
def run_with_FF():
    """Run ``common_step`` against a newly created Firefox WebDriver."""
    firefox_browser = webdriver.Firefox()
    common_step(firefox_browser)
    
def run_with_IE():
    """Run ``common_step`` against a newly created Internet Explorer WebDriver."""
    ie_browser = webdriver.Ie()
    common_step(ie_browser)
 
def run_with_PhantomJS():
    """Run ``common_step`` against PhantomJS started from a fixed executable path."""
    phantomjs_binary = r'C:\Python27\Scripts\phantomjs.exe'
    common_step(webdriver.PhantomJS(executable_path=phantomjs_binary))
    
def run_with_HtmlUnit():
    """Run ``common_step`` through a remote WebDriver hub using HtmlUnit capabilities."""
    hub_url = "http://localhost:4444/wd/hub"
    htmlunit_caps = webdriver.DesiredCapabilities.HTMLUNIT
    remote_browser = webdriver.Remote(hub_url, desired_capabilities=htmlunit_caps)
    common_step(remote_browser)
    
def common_step(driver):
    """Open Baidu, print the ``value`` attribute of element ``su``, then quit.

    Args:
        driver: A started WebDriver instance. It is always quit before this
            function returns, even when one of the steps raises.
    """
    try:
        driver.get('http://www.baidu.com')
        ele = driver.find_element_by_id('su')
        # Call form of print: valid on both Python 2 and Python 3.
        print(ele.get_attribute('value'))
    finally:
        # Release the browser even if navigation or the lookup failed.
        driver.quit()

Appium

1.node.js 安装
brew install node
npm install -g appium  # get appium     
npm install wd         # get appium client

直接用npm下载往往不成功，这时需要通过代理（镜像）来下载
具体方法如下：
npm i cnpm -g --registry=http://registry.npm.taobao.org
cnpm i appium -g       # get appium
cnpm i wd -g　　       # get appium client
cnpm i appium-doctor 　# get appium-doctor

2.检查Appium成功安装
appium-doctor
根据提示配置 xcode 和 Android SDK

3.下载appium客户端更加方便，下载地址见 github
4.python client 安装 pip install Appium_Python_Client
5.启动appium server .用客户端appium 启动即可

# encoding: utf-8
"""
--------------------------------------
@describe 自动化微信添加好友
@version: 1.0
@project: operator_spider
@file: app_chrome.py
@author: yuanlang 
@time: 2019-02-19 10:14
---------------------------------------
"""
import unittest
from time import sleep

from appium import webdriver


class MyTestCase(unittest.TestCase):
    """Appium test that drives the WeChat Android app through a search flow."""

    def setUp(self):
        """Open an Appium session on the WeChat launcher activity."""
        server_url = "http://127.0.0.1:4723/wd/hub"
        desired_caps = {
            'platformName': 'Android',
            'platformVersion': "5.1",
            'deviceName': "mx4",
            'appPackage': 'com.tencent.mm',
            'appActivity': '.ui.LauncherUI',
            'automationName': 'Uiautomator2',
            'unicodeKeyboard': True,
            'resetKeyboard': True,
            'noReset': True,
            'chromeOptions': {'androidProcess': 'com.tencent.mm:tools'},
        }
        self.driver = webdriver.Remote(server_url, desired_caps)
        self.driver.implicitly_wait(30)

        # Fixed pause after the session has been created.
        sleep(5)

    def test_chromeApp(self):
        """Tap through the UI, type a phone number and print the resulting name."""
        print("search")
        driver = self.driver

        # Tap the plus button.
        driver.find_element_by_id("com.tencent.mm:id/gd").click()
        sleep(5)

        # Tap the second entry of the list that appears.
        menu_entry_xpath = (
            "/hierarchy/android.widget.FrameLayout/android.widget.ListView/android.widget.LinearLayout[2]/android.widget.LinearLayout/android.widget.TextView"
        )
        driver.find_element_by_xpath(menu_entry_xpath).click()
        sleep(5)

        # Tap the input field, then look it up again to type the number.
        driver.find_element_by_id("com.tencent.mm:id/hx").click()
        sleep(5)
        driver.find_element_by_id("com.tencent.mm:id/hx").send_keys("15775691981")
        sleep(5)

        driver.find_element_by_id("com.tencent.mm:id/l4").click()
        sleep(5)

        print(driver.context)
        print(driver.find_element_by_id("com.tencent.mm:id/ang").text)

    def tearDown(self):
        """Close the app under test, then end the Appium session."""
        session = self.driver
        session.close_app()
        session.quit()


if __name__ == '__main__':
    unittest.main()

pyppeteer

原生是node.js 开发基于chrome-devtools-protocol + websocket
python也有对应改写版
原版 puppeteer 项目由google维护（pyppeteer 为非官方的 python 移植版）

pip install pyppeteer

# encoding: utf-8
"""
--------------------------------------
@describe 
@version: 1.0
@project: operator_spider
@file: dddd.py
@author: yuanlang 
@time: 2019-02-21 19:11
---------------------------------------
"""

import asyncio
import pyppeteer
from collections import namedtuple

# Record describing one page fetch; get_html() fills in every field.
# Note that the namedtuple's type name is "rs", not "Response".
Response = namedtuple("rs", "title url html cookies headers history status")


async def get_html(url, timeout=30):
    """Load ``url`` in a pyppeteer-launched browser and return a ``Response``.

    Args:
        url: Address to navigate to.
        timeout: Navigation timeout in seconds (default 30); converted to
            milliseconds for ``page.goto``.

    Returns:
        Response: page title, the requested ``url``, page HTML, cookies,
        response headers, ``history`` (always ``None``) and the status.
    """
    # Script registered with evaluateOnNewDocument below: it flags reads of
    # navigator.userAgent / navigator.platform in navigator.sniffed and makes
    # navigator.webdriver report false (anti automation-detection).
    js="""
    function sniffDetector() {
    const userAgent = window.navigator.userAgent;
    const platform = window.navigator.platform;

    window.navigator.__defineGetter__('userAgent', function() {
      window.navigator.sniffed = true;
      return userAgent;
    });

    window.navigator.__defineGetter__('platform', function() {
      window.navigator.sniffed = true;
      return platform;
    });
    //自动化反反爬虫，反自动化检测
    Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
    });
  }
    """
    # Pass executablePath="..." to launch() to use a specific browser binary.
    browser = await pyppeteer.launch(headless=False, args=['--no-sandbox'])
    try:
        page = await browser.newPage()
        await page.evaluateOnNewDocument(js)
        res = await page.goto(url, options={'timeout': int(timeout * 1000)})
        # Fixed pause after navigation before reading the page
        # (presumably to let client-side scripts finish — confirm).
        await asyncio.sleep(5)
        data = await page.content()
        title = await page.title()
        resp_cookies = await page.cookies()
        return Response(title=title, url=url,
                        html=data,
                        cookies=resp_cookies,
                        headers=res.headers,
                        history=None,
                        status=res.status)
    finally:
        # The browser handle never leaves this function, so close it here;
        # otherwise every call leaves a browser process running.
        await browser.close()


if __name__ == '__main__':
    # Fetch every URL on one event loop via asyncio.gather and print each result.
    url_list = ["https://jx.ac.10086.cn"]
    pending = [get_html(target) for target in url_list]

    event_loop = asyncio.get_event_loop()
    for page_result in event_loop.run_until_complete(asyncio.gather(*pending)):
        print(page_result)

chrome-devtools-protocol

底层采用websocket 协议控制浏览器。[协议链接](https://chromedevtools.github.io/devtools-protocol/)
安装
1.pip install pychrome
2./Chromium --remote-debugging-port=9222

import pychrome

# Attach to the browser listening on the remote-debugging port (9222) and open a new tab.
browser = pychrome.Browser(url="http://127.0.0.1:9222")
tab = browser.new_tab()

def request_will_be_sent(**kwargs):
    """Print the URL carried by a ``Network.requestWillBeSent`` event.

    Args:
        **kwargs: Event parameters; the URL is read from
            ``kwargs['request']['url']``. A missing or ``None`` ``request``
            entry prints ``None`` instead of raising ``AttributeError``.
    """
    request = kwargs.get('request') or {}
    print("loading: %s" % request.get('url'))


# Register the callback for Network.requestWillBeSent events on this tab.
tab.set_listener("Network.requestWillBeSent", request_will_be_sent)

# Start the tab, enable the Network domain, then navigate.
tab.start()
tab.call_method("Network.enable")
tab.call_method("Page.navigate", url="https://jx.ac.10086.cn", _timeout=5)
import time
# Fixed 5-second pause before reading the document (presumably to let the page load — confirm).
time.sleep(5)
# Evaluate an expression in the page to fetch the full document HTML, then print the raw result.
result=tab.call_method("Runtime.evaluate",expression="document.documentElement.outerHTML")
print(result)
tab.stop()

自动化反反爬虫

// Redefine navigator.webdriver so that its getter returns false.
Object.defineProperty(navigator, 'webdriver', {
        get: () => false,
});

'''
// overwrite the `languages` property to use a custom getter
Object.defineProperty(navigator, "languages", {
  get: function() {
    return ["zh-CN","zh","zh-TW","en-US","en"];
  }
});

// Overwrite the `plugins` property to use a custom getter.
Object.defineProperty(navigator, 'plugins', {
  get: () => [1, 2, 3, 4, 5],
});

// Pass the Webdriver test
Object.defineProperty(navigator, 'webdriver', {
  get: () => false,
});


// Pass the Chrome Test.
// We can mock this in as much depth as we need for the test.
window.navigator.chrome = {
  runtime: {},
  // etc.
};

// Pass the Permissions Test.
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) => (
  parameters.name === 'notifications' ?
    Promise.resolve({ state: Notification.permission }) :
    originalQuery(parameters)
);
'''

# 修改chromedriver
$ hexedit chromedriver 

        # 操作
        1. tab 跳转到string栏
        2. ctrl+S 查找 var key = '$cdc_asdjflasutopfhvcZLmcfl_'（对于2.40版本）
        3. 替换'$cdc_asdjflasutopfhvcZLmcfl_'为任意值
        4. ctrl+X 保存

# 移动chromedriver 到 /usr/bin
$ mv chromedriver /usr/bin

[详细链接](https://zhuanlan.zhihu.com/p/43581988?utm_source=wechat_session&utm_medium=social&utm_oi=32546582691840&from=groupmessage)

总结：selenium 是采用webdriver 协议 ,selenium server端也是使用chrome-devtools-protocol。只不过为了跨平台采用webdriver restful api 形式开发。chrome-devtools-protocol 可以使用 ./chromium --remote-debugging-port=9222 自动化测试

爬虫自动化

自动化对比

很多：

驱动代码

UIAutomation

Selenium

Appium

pyppeteer

chrome-devtools-protocol

自动化反反爬虫

相关文章

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读