爬取加速乐处理的网站

- 用 Postman 直接访问该网站会返回 521 状态码,响应体是一段加密(混淆)的 JS
- 使用 Java 自带的 ScriptEngineManager 脚本引擎执行这段 JS,即可拿到所需的 cookie
代码如下:
// Fetch a page protected by the "jsl" anti-crawler challenge: the first request is
// answered with HTTP 521 plus an obfuscated script; the cookie computed from that
// script is attached to a second, identical request.
CloseableHttpClient client = HttpClients.createDefault();
HttpGet get = new HttpGet(url);
// Imitate a browser. The Accept value must not repeat the header name.
get.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
get.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
get.setHeader("Accept-Encoding", "gzip, deflate");
get.setHeader("Accept-Language", "zh-cn,zh;q=0.5");
get.setHeader("Connection", "keep-alive");
get.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
CloseableHttpResponse response = client.execute(get);
// The challenge is signalled by status 521.
if (response.getStatusLine().getStatusCode() == 521) {
    String resHtml;
    try {
        HttpEntity entity = response.getEntity();
        resHtml = EntityUtils.toString(entity);
    } finally {
        // Release the challenge response before the request is re-executed.
        response.close();
    }
    // Run the returned script to obtain the __jsl_clearance cookie.
    String jsl_clearance = getJslClearance(resHtml);
    get.setHeader("Cookie", jsl_clearance);
    response = client.execute(get);
}
// Read the page that was actually requested.
// NOTE(review): response and client are left open here, as in the original snippet;
// close them once the surrounding code no longer needs them.
HttpEntity entity = response.getEntity();
String res = EntityUtils.toString(entity, "utf-8");
Document doc = Jsoup.parse(res);
/**
 * Derives the {@code __jsl_clearance} cookie from the script returned with an HTTP 521
 * challenge response.
 *
 * <p>The script tags are stripped and the outer {@code eval(...)} call is replaced by its
 * argument, so evaluating the script yields the unpacked JavaScript source as a string
 * instead of executing it. The cookie value is then assembled from two parts of that
 * source: the literal text following {@code __jsl_clearance=} and the result of
 * evaluating the trailing function expression, with {@code window} replaced by the
 * string literal {@code 'Chrome'}.
 *
 * <p>NOTE(review): this evaluates script text received from a remote server inside the
 * JVM's script engine. Only use it against hosts you are prepared to trust.
 *
 * @param body the body of the 521 response (the challenge script)
 * @return a cookie string of the form {@code __jsl_clearance=<value>;}, or an empty
 *         string when the script cannot be evaluated or does not have the expected shape
 */
private static String getJslClearance(String body) {
    String jsl_clearance = "";
    if (body == null) {
        return jsl_clearance;
    }
    ScriptEngineManager manager = new ScriptEngineManager();
    // getEngineByName returns null when no matching engine is installed
    // (for example on JDKs that no longer bundle a JavaScript engine).
    ScriptEngine engine = manager.getEngineByName("JavaScript");
    if (engine == null) {
        logger.error("No JavaScript script engine available; cannot compute __jsl_clearance");
        return jsl_clearance;
    }
    // Strip the script tags and turn the outer eval(...) into a plain expression so the
    // engine returns the unpacked source rather than running it.
    String js = body.trim().replace("<script>", "")
            .replace("</script>", "")
            .replace("eval(y.replace(/\\b\\w+\\b/g, function(y){return x[f(y,z)-1]||(\"_\"+y)}))",
                    "y.replace(/\\b\\w+\\b/g, function(y){return x[f(y,z)-1]||(\"_\"+y)})");
    try {
        // Unpacked JavaScript source.
        Object unpacked = engine.eval(js);
        if (unpacked == null) {
            logger.error("Challenge script evaluated to null; cannot compute __jsl_clearance");
            return jsl_clearance;
        }
        String result = unpacked.toString();

        int nameIdx = result.indexOf("__jsl_clearance=");
        int varIdx = result.indexOf("|'+(function(){var");
        int funcIdx = result.indexOf("|'+(function(){");
        int expiresIdx = result.indexOf("+';Expires=");
        // The fixed offsets below only make sense when every marker is present and
        // the resulting ranges are well-formed.
        boolean markersFound = nameIdx >= 0 && varIdx >= 0 && funcIdx >= 0 && expiresIdx >= 0;
        if (!markersFound
                || nameIdx + 16 > varIdx + 1
                || funcIdx > expiresIdx - 23) {
            logger.error("Unexpected challenge script layout; cannot compute __jsl_clearance");
            return jsl_clearance;
        }

        // Literal first part of the cookie value, up to and including the '|'.
        String jsl_pre = result.substring(nameIdx + 16, varIdx + 1);
        // Function expression producing the second part, stitched together from two
        // slices around the Expires marker, with the browser global stubbed out.
        String bac = (result.substring(funcIdx, expiresIdx - 23)
                + result.substring(expiresIdx - 16, expiresIdx - 4))
                .replace("|'+(function(){", "").replace("window", "'Chrome'");
        Object tail = engine.eval(bac);
        if (tail == null) {
            logger.error("Challenge function evaluated to null; cannot compute __jsl_clearance");
            return jsl_clearance;
        }
        String jsl_bac = tail.toString();
        jsl_clearance = "__jsl_clearance=" + jsl_pre + jsl_bac + ";";
        logger.debug(jsl_clearance);
    } catch (ScriptException e) {
        logger.error("Failed to evaluate challenge script", e);
    }
    return jsl_clearance;
}
随后访问成功。
网友评论