自己搭建 Puppeteer 抓取 SPA 单页应用,折腾了快一天。最终的问题是:在命令行环境下始终无法启动 Chrome 浏览器,而在桌面环境(CentOS 7)下可以正常运行。相比之下,还是使用 Docker 更简单:
拉取镜像
docker pull docker.io/alekzonder/puppeteer
编写脚本f.js
const puppeteer = require('puppeteer');
/**
 * Opens the Douyin share page in headless Chrome, waits until the video
 * element inside the player has been rendered, and prints the rendered
 * inner HTML of <body> to stdout.
 *
 * The browser is always closed, even when navigation or the selector wait
 * fails, so a failed run does not leave a Chrome process behind.
 *
 * @returns {Promise<void>} Resolves after the HTML was printed and the
 *   browser was closed.
 * @throws Rejects with the Puppeteer error if launching, navigating or
 *   waiting for the selector fails.
 */
async function getVideo() {
  // The sandbox is disabled via flags; this is commonly required when Chrome
  // runs as root inside a container.
  const browser = await puppeteer.launch({
    ignoreHTTPSErrors: true,
    timeout: 30000,
    args: ['--no-sandbox', '--disable-setuid-sandbox']
  });

  try {
    const page = await browser.newPage();
    await page.goto('https://v.douyin.com/eYc8Gcv/'); // Douyin video page

    // The page is a SPA: wait for the <video> element to exist before
    // reading the DOM, otherwise only the empty shell would be captured.
    const videoSelector = '.video-player video';
    await page.waitForSelector(videoSelector);

    // Serialise the markup inside the page and return it as a plain string,
    // so no JSHandle has to be created and disposed.
    const bodyHtml = await page.evaluate(() => document.body.innerHTML);
    console.log(bodyHtml);
  } finally {
    await browser.close();
  }
}

// Report failures explicitly instead of leaving a floating promise.
getVideo().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
测试
docker run -i --init --rm --cap-add=SYS_ADMIN --name puppeteer-chrome docker.io/alekzonder/puppeteer node -e "`cat f.js`"
//输出:
image.png
其他脚本
拦截响应:
const puppeteer = require('puppeteer');
/**
 * Opens a Kuaishou share link with request interception enabled, logs every
 * request sent to the GraphQL endpoint, prints the body of each response
 * from that endpoint, and saves a full-page screenshot to news.png.
 *
 * Errors are logged (best effort, as in the original script) and the
 * browser is closed on every path.
 */
(async () => {
  const GRAPHQL_URL = 'https://video.kuaishou.com/graphql';
  let browser;

  try {
    browser = await puppeteer.launch({
      ignoreHTTPSErrors: true,
      timeout: 30000,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
    const page = await browser.newPage();

    // Body reads still in flight; awaited before the browser is closed so
    // that response.text() is not cut off by browser.close().
    const pendingBodies = [];

    await page.setRequestInterception(true);

    page.on('request', (request) => {
      if (request.url() === GRAPHQL_URL) {
        console.log(request.url());
        console.log("拦截到了这条url然后就该请求了");
      }
      // With interception enabled every request must be resumed explicitly,
      // otherwise the page load stalls.
      request.continue();
    });

    // Registered exactly once, outside the request handler: attaching it per
    // matching request would add a new listener each time and print every
    // response body multiple times.
    page.on('response', (response) => {
      if (response.url() !== GRAPHQL_URL) {
        return;
      }
      pendingBodies.push(
        response.text().then(
          (body) => {
            console.log(body);
          },
          (err) => {
            console.log(err);
          }
        )
      );
    });

    await page.goto('https://v.kuaishou.com/dYaBzk');
    await page.screenshot({path: 'news.png', fullPage: true});
    await Promise.all(pendingBodies);
  } catch (e) {
    console.log(e);
  } finally {
    // browser stays undefined when launch() itself failed.
    if (browser) {
      await browser.close();
    }
  }
})();
网友评论