nodejs使用xpath抓取百度首页的内容

作者: 醉笙情丶浮生梦 | 来源:发表于2020-01-19 15:28 被阅读0次

nodejs使用xpath抓取百度首页的内容
Python爬虫学习笔记（1）：抓取静态网页
李亚涛：百度蜘蛛抓取异常见原因
爬虫篇之--xpath
Python学习第一天
Python爬虫(十三)_案例：使用XPath的爬虫
Python爬虫爬坑记录
puppeteer + nodejs 抓取网页内容
xpath , beautifulsoup4 , pyquery
scrapy +chrome 抓取考拉动态详情信息并把图片保存到

安装模块（npm install xpath xmldom htmlparser2）：
xpath
xmldom
htmlparser2

const xpath = require('xpath')
const DOMParser = require('xmldom').DOMParser
const htmlparser2 = require('htmlparser2')
// Shared no-op callback: warnings, errors and fatal errors raised while
// parsing are all intentionally discarded by this script.
const ignoreParseIssue = () => {}

// xmldom parser configured so that every error-handler hook does nothing.
const domParser = new DOMParser({
    errorHandler: {
        warning: ignoreParseIssue,
        error: ignoreParseIssue,
        fatalError: ignoreParseIssue
    }
})
/**
 * Download a page over HTTPS and resolve with its body decoded as UTF-8.
 *
 * The HTTP status code is not inspected: whatever body the server sends is
 * returned as-is.
 *
 * @param {string|URL|object} url - Target passed straight to `https.get`.
 * @returns {Promise<string>} Resolves with the full response body; rejects
 *     with the underlying error if the request or the response stream fails.
 */
function loadPage(url) {
    const https = require('https');
    return new Promise(function (resolve, reject) {
        https.get(url, function (res) {
            // Keep the raw bytes and decode once at the end. Decoding each
            // chunk separately corrupts multi-byte UTF-8 characters that
            // happen to be split across two chunks.
            const chunks = [];
            res.on('data', function (chunk) {
                chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
            });
            res.on('end', function () {
                resolve(Buffer.concat(chunks).toString('utf8'));
            });
            // A failure while streaming the body must reject as well.
            res.on('error', reject);
        }).on('error', reject);
    });
}
// Fetch the Baidu home page and print the anchor that links to home.baidu.com.
loadPage('https://www.baidu.com/').then(function (html) {
    // Feeding the raw HTML straight into xmldom sometimes raises errors, so it
    // is first parsed and re-serialized by htmlparser2.
    const outerHTML = htmlparser2.DomUtils.getOuterHTML(htmlparser2.parseDOM(html));

    const doc = domParser.parseFromString(outerHTML);

    const nodes = xpath.select("//a[@href='http://home.baidu.com']", doc);
    const link = nodes[0];
    if (!link) {
        // The page markup may not contain this link; report that instead of
        // crashing on an undefined node.
        console.error("No <a href='http://home.baidu.com'> element found");
        return;
    }

    console.log(link);

    // lastChild is null when the anchor has no children, so guard the read.
    const text = link.lastChild ? link.lastChild.data : '';
    console.log(link.localName + ": " + text);
    console.log("Node: " + link.toString());
}).catch(function (err) {
    // Without this handler a network or parsing failure would surface as an
    // unhandled promise rejection.
    console.error('Failed to fetch or parse the page:', err);
});

参考:使用node.js第三方库xpath进行html文档解析