在爬虫开发过程中,踩过无数的坑,与目标方斗智斗勇。总结来看终归是成本的博弈,不管是开发成本,时间成本,空间成本,抑或是其他。方案万千,权衡后低成本拿下才是王道,当成本超过预期也就放弃挣扎了。
在某些安全至上的行业,时间、空间成本通常只能往后靠。优先安全的情况下,完全模拟用户行为几乎是最为有效的。想想那些各种加密,接口鉴权,请求策略…,为了最后几根头发妥协吧!
本文是cdp第三种使用方式,目的都是为了嗅探接口数据(为啥嗅探?你去看看强鉴权的网站)。某些变态的网站,selenium cdp、chrome带启动参数cdp都会检测,导致登录或鉴权错误。走投无路的情况下,采用了chrome扩展,通过debugger来实现。
浏览器启动
为避免debugger弹出调试提示框,启动项增加--silent-debugger-extension-api配置,如下:
# 跨域、debugger api、指定嗅探扩展目录
"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe" --disable-web-security --user-data-dir="d:\aaa" --silent-debugger-extension-api --load-extension="d:\嗅探"
manifest.json
{
"background": {
"persistent": true,
"scripts": [ "background.js" ]
},
"browser_action": {
"default_icon": "icon_38.png"
},
"content_security_policy": "script-src 'self' 'unsafe-eval'; object-src 'self'",
"description": "网页嗅探",
"icons": {
"128": "icon.png",
"16": "icon.png",
"48": "icon.png"
},
"manifest_version": 2,
"name": "网页嗅探",
"permissions": [ "debugger", "storage", "notifications", "tts", "webRequest", "webRequestBlocking", "http://*/*","https://*/*", "tabs", "contextMenus", "webNavigation", "clipboardWrite", "clipboardRead" ],
"short_name": "网页嗅探",
"version": "2.7"
}
background.js
本来是准备用Native Message的,但是太复杂了。索性用nodejs简单搭了个web服务用着,脚手架也方便。
// Announce in the console that the sniffer background script has started.
console.log("开启嗅探");
// Repeating timer: call initRule every 1500 ms to fetch the sniffing rules.
// The handle is kept so that initRule's response handler can stop the polling
// with clearInterval(timename).
var timename = setInterval(initRule, 1500);
// Send a request to the local collector service with XMLHttpRequest.
// This is a plain asynchronous XHR (not JSONP).
//
// url      - target URL; a falsy value falls back to "http://127.0.0.1:8080/".
// method   - HTTP method; a falsy value falls back to "POST".
// data     - request body; objects are JSON-serialized, other truthy values are
//            sent as-is, falsy values send an empty string.
// callback - optional onload handler, installed directly as `onload` so `this`
//            is the request object; when omitted the response text is logged.
function xhr(url, method, data, callback) {
    var targetUrl = url || "http://127.0.0.1:8080/";
    var httpMethod = method || "POST";
    var body = "";
    if (data) {
        body = typeof data === "object" ? JSON.stringify(data) : data;
    }
    // Named `request` so the local no longer shadows this function's own name.
    var request = new XMLHttpRequest();
    request.open(httpMethod, targetUrl, true);
    request.setRequestHeader("Content-Type", "application/json");
    request.onload = callback || function () {
        console.log(this.responseText);
    };
    // Without this handler a refused/aborted connection fails silently.
    request.onerror = function () {
        console.log(httpMethod + " " + targetUrl + " request failed");
    };
    request.send(body);
}
// Fetch the sniffing rules from the local service and pass them to handleRule.
// Invoked repeatedly by the `timename` interval. Polling is stopped only after
// a response was received with a 2xx status and parsed into an object, so a
// malformed or error response is retried on the next tick. handleRule runs at
// most once, even when several overlapping poll requests are answered.
function initRule() {
    // Rules already applied: a late timer tick must not issue another request.
    if (initRule.loaded) {
        return;
    }
    var url = "http://127.0.0.1:8080/rule";
    var callback = function () {
        // An overlapping, slower poll may answer after the rules were applied.
        if (initRule.loaded) {
            return;
        }
        if (this.status < 200 || this.status >= 300) {
            console.log("rule request returned status " + this.status + ", will retry");
            return;
        }
        var rule;
        try {
            rule = JSON.parse(this.responseText);
        } catch (e) {
            console.log("rule response is not valid JSON, will retry: " + e.message);
            return;
        }
        if (!rule || typeof rule !== "object") {
            console.log("rule response is not an object, will retry");
            return;
        }
        initRule.loaded = true;
        // Stop the polling timer now that usable rules are available.
        clearInterval(timename);
        console.log(rule);
        handleRule(rule);
    };
    xhr(url, "GET", "", callback);
}
// Turn the fetched rule object into regular expressions and start listening.
//
// rule["tabDomain"]       - array of domain names whose tabs should be sniffed.
//                           Each name is matched literally: every regex
//                           metacharacter (not only ".") is escaped.
// rule["fetchUrlFilters"] - array of regex source strings; a request URL is
//                           captured when any of them matches.
//
// The input object is left untouched; new arrays are built instead of
// overwriting the caller's entries.
function handleRule(rule) {
    // Escape the names so they can be joined into one alternation safely.
    var escapedDomains = rule["tabDomain"].map(function (name) {
        return name.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    });
    var alternatives = escapedDomains.join("|");
    // Matches tab URLs whose host starts with one of the listed domains.
    var w_Domain = new RegExp("^https?:\\/\\/(" + alternatives + ")");
    // Matches http(s) tab URLs whose host does not start with a listed domain.
    var b_Domain = new RegExp("^https?:\\/\\/(?!" + alternatives + ")");
    // Compile the request-URL filters.
    var fu_Filters = rule["fetchUrlFilters"].map(function (pattern) {
        return new RegExp(pattern);
    });
    initListener(w_Domain, b_Domain, fu_Filters);
}
// Decide whether a sniffed request URL should be captured.
//
// fu_Filters - array of RegExp objects.
// url        - the request URL to check.
// Returns true when at least one filter matches the URL, false otherwise
// (including for an empty filter list).
function fetchUrlFilter(fu_Filters, url) {
    // Array iteration instead of `for (ft in ...)`, which leaked an implicit
    // global `ft` and throws a ReferenceError in strict mode.
    return fu_Filters.some(function (filter) {
        return filter.test(url);
    });
}
// Register the tab and debugger listeners that perform the sniffing.
//
// w_Domain   - RegExp; a tab whose URL matches gets the debugger attached.
// b_Domain   - RegExp; an attached tab navigating to a matching URL is detached.
// fu_Filters - array of RegExp passed to fetchUrlFilter to select which
//              requests / websockets are captured.
//
// Captured HTTP exchanges and websocket frames are POSTed through xhr() to its
// default URL.
//
// A single chrome.debugger.onEvent listener is registered and dispatches on
// source.tabId. Registering one listener per attach made every event run once
// per attached tab (duplicate POSTs, data filed under the wrong tab) and threw
// once a tab's cache entry had been removed.
function initListener(w_Domain, b_Domain, fu_Filters) {
    // Per-tab state keyed by tab id (as string):
    //   { fetch_urls: {requestId: record}, webSocket_urls: {requestId: url} }
    var tab_cache = {};

    // Forget everything cached for a tab.
    function dropTab(id) {
        delete tab_cache[id + ""];
    }

    // Ask for the response body of a finished request and POST the full record.
    function sendFinishedRequest(source, fetch_urls, requestId) {
        chrome.debugger.sendCommand(source, "Network.getResponseBody", {
            "requestId": requestId
        }, function (response) {
            var record = fetch_urls[requestId];
            if (!record) {
                // Already sent or evicted in the meantime.
                return;
            }
            if (chrome.runtime.lastError) {
                console.log(source.tabId + ":tab Network.getResponseBody failed: " +
                    chrome.runtime.lastError.message);
            }
            record["tabId"] = source.tabId;
            record["response"] = response;
            // Evict right away so a failed POST cannot leak the entry; xhr()
            // keeps its own reference to the record.
            delete fetch_urls[requestId];
            // Transmit the sniffed result.
            xhr(null, null, record, function () {
                console.log(this.responseText);
            });
        });
    }

    // Handle one DevTools-protocol event for whichever tab emitted it.
    function onDebuggerEvent(source, method, params) {
        var entry = tab_cache[source.tabId + ""];
        if (!entry || !params) {
            // Event for a tab that is not (or no longer) being sniffed.
            return;
        }
        console.log(source.tabId + ":tab debugger event fetch");
        var requestId = params.requestId;
        var fetch_urls = entry["fetch_urls"];
        var webSocket_urls = entry["webSocket_urls"];
        switch (method) {
            case "Network.requestWillBeSent":
                if (fetchUrlFilter(fu_Filters, params.request.url)) {
                    fetch_urls[requestId] = {
                        "request": params["request"]
                    };
                }
                break;
            case "Network.responseReceived":
                if (fetch_urls[requestId]) {
                    fetch_urls[requestId]["ResponseHeaders"] = params["response"]["headers"];
                }
                break;
            case "Network.loadingFinished":
                if (fetch_urls[requestId]) {
                    console.log(method + "\t" + fetch_urls[requestId].request.url);
                    sendFinishedRequest(source, fetch_urls, requestId);
                }
                break;
            case "Network.loadingFailed":
                // The request will never finish; do not keep its record around.
                delete fetch_urls[requestId];
                break;
            case "Network.webSocketCreated":
                if (fetchUrlFilter(fu_Filters, params.url)) {
                    webSocket_urls[requestId] = params.url;
                }
                break;
            case "Network.webSocketFrameReceived":
                if (webSocket_urls[requestId]) {
                    var data = params;
                    data["url"] = webSocket_urls[requestId];
                    data["tabId"] = source.tabId;
                    // A websocket is long-lived, so its cache entry is kept
                    // until the socket closes.
                    xhr(null, null, data);
                }
                break;
            case "Network.webSocketClosed":
                delete webSocket_urls[requestId];
                break;
            default:
                break;
        }
    }

    chrome.debugger.onEvent.addListener(onDebuggerEvent);

    // Attach / detach the debugger as tabs start loading a new page.
    chrome.tabs.onUpdated.addListener(function (id, info, tab) {
        console.log(id + ":tab 更新\t" + JSON.stringify(info));
        if (info.status !== "loading") {
            return;
        }
        var attached = Boolean(tab_cache[id + ""]);
        var target = {
            "tabId": id
        };
        if (attached && b_Domain.test(tab["url"])) {
            // The tab left the sniffed domains: release the debugger.
            chrome.debugger.detach(target, function () {
                if (chrome.runtime.lastError) {
                    console.log(id + ":tab debugger detach failed: " +
                        chrome.runtime.lastError.message);
                }
                console.log(id + ":tab debugger解绑");
                dropTab(id);
            });
        } else if (!attached && w_Domain.test(tab["url"])) {
            // The tab entered a sniffed domain: start capturing.
            tab_cache[id + ""] = {
                "fetch_urls": {},
                "webSocket_urls": {}
            };
            chrome.debugger.attach(target, "1.0", function () {
                if (chrome.runtime.lastError) {
                    // Roll back so a later navigation can retry the attach.
                    console.log(id + ":tab debugger attach failed: " +
                        chrome.runtime.lastError.message);
                    dropTab(id);
                    return;
                }
                console.log(id + ":tab debugger绑定");
                chrome.debugger.sendCommand(target, "Network.enable", {}, function () {
                    if (chrome.runtime.lastError) {
                        console.log(id + ":tab Network.enable failed: " +
                            chrome.runtime.lastError.message);
                        return;
                    }
                    console.log(id + ":tab Network.enable");
                });
            });
        }
    });

    // Clean up the cached state when a tab is closed.
    chrome.tabs.onRemoved.addListener(function (id) {
        console.log(id + ":tab 关闭");
        if (tab_cache[id + ""]) {
            dropTab(id);
        }
    });
}
rule.json 嗅探规则
每当浏览器启动时,扩展会每1.5秒请求一次rule规则,直到请求到为止。
{
"tabDomain": [
"xxxx.xxxx.com",
"xxxx.xxxx.com"
],
"fetchUrlFilters": [
"\\.json"
]
}
rule.json说明:
1、浏览器嗅探规则,扩展在启动后每隔1.5秒获取一次,请求成功则停止获取。如果想新规则生效,则重启浏览器。
2、浏览器启动参数附带--silent-debugger-extension-api,解决浏览器弹调试框问题。
3、规则:
tabDomain:要嗅探的域名,这里必须是完整的域名。对应浏览器的地址栏的url,进行域名过滤。tab刷新会实时监控,不用担心重复会丢失嗅探。
fetchUrlFilters:要抓取的目标请求地址,正则表达式,用来过滤出要抓取的内容。
最后
嗅探的结果会post到web服务,至于怎么处理嗅探结果,就具体分析了。
网友评论