Node.js 简单爬虫实践

作者: Axiba | 来源:发表于2016-12-12 14:21 被阅读41次

Node.js 简单爬虫实践
Node.js学习——爬虫
使用Node.js实现简单的爬虫
各语言简单爬虫
node.js爬虫入门（二）爬取动态页面(puppeteer)
Java简单的爬虫实践
Node.js 笔记四：简单爬虫
Java爬虫爬妹子图
Node.js Request+Cheerio实现一个小爬虫-基
Node.js Request+Cheerio实现一个小爬虫-番

1、简单抓取一个网页的数据信息

// Dependencies: Node's built-in HTTP client and the cheerio module used to query the fetched HTML.
var http = require('http'),
    cheerio = require('cheerio'),
    // Address of the single course page this script requests.
    url = 'http://www.imooc.com/learn/348';

/**
 * Extracts the chapter/video outline from the HTML of a course page.
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
 *     One entry per `.chapter` element, in document order. A video's `id` is
 *     `undefined` when its link is missing or does not contain `video/`.
 */
function filterChapter(html) {
    const $ = cheerio.load(html);
    const courseData = [];

    $('.chapter').each(function (chapterIndex, chapterElement) {
        const chapter = $(chapterElement);
        const chapterData = {
            chapterTitle: chapter.find('strong').text(),
            videos: []
        };

        chapter.find('.video').children('li').each(function (videoIndex, videoElement) {
            const video = $(videoElement).find('.J-media-item');
            const videoTitle = video.text()
                .replace(/[ ]/g, '')
                .replace(/[\r\n]/g, '')
                // Remove the button label as a whole phrase. A character class
                // ([开始学习]) would delete each of those characters wherever
                // it appears, mangling titles such as "学习目标".
                .replace(/开始学习/g, '');

            // A list item without the expected anchor yields no href; fall back
            // to an empty string so one malformed item does not abort the parse.
            const href = video.attr('href') || '';

            chapterData.videos.push({
                title: videoTitle,
                id: href.split('video/')[1]
            });
        });

        courseData.push(chapterData);
    });

    return courseData;
}

/**
 * Logs every video of every chapter as `【id】title` followed by a newline.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{id: string, title: string}>}>} courseData
 *     Chapter list; chapter titles are not printed.
 */
function printCourseInfo(courseData) {
    courseData.forEach(({ videos }) => {
        videos.forEach(({ id, title }) => {
            console.log(`【${id}】${title}\n`);
        });
    });
}

// Fetch the course page, then parse and print its outline once the whole body has arrived.
http.get(url, (res) => {
    // Decode at the stream level: appending raw Buffer chunks to a string
    // converts each chunk on its own, which corrupts any multi-byte UTF-8
    // character that happens to be split across two chunks.
    res.setEncoding('utf8');

    let html = '';
    res.on('data', (chunk) => {
        html += chunk;
    });

    res.on('end', () => {
        printCourseInfo(filterChapter(html));
    });
}).on('error', (err) => {
    console.log('获取课程信息出错');
    // Surface the cause instead of discarding it.
    console.error(err.message);
});

2、利用 Promise 同时异步请求多个页面

// Dependencies and crawl configuration.
// NOTE(review): the third declaration rebinds the name `Promise` to a required
// module. The specifier is capitalised, and module lookup is case-sensitive on
// some filesystems — confirm it resolves to the intended package.
var http = require('http'),
    cheerio = require('cheerio'),
    Promise = require('Promise'),
    // Base address of a course page and the course ids to crawl.
    baseUrl = 'http://www.imooc.com/learn/',
    videoIds = [348, 637];

/**
 * Extracts the course title and the chapter/video outline from a course page.
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {{title: string, videos: Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}}
 *     `title` is the text of `.course-infos h2`; `videos` holds one entry per
 *     `.chapter` element in document order. A video's `id` is `undefined` when
 *     its link is missing or does not contain `video/`.
 */
function filterChapter(html) {
    const $ = cheerio.load(html);

    const courseData = {
        title: $('.course-infos h2').text().trim().replace(/[\r\n]/g, ''),
        videos: []
    };

    $('.chapter').each(function (chapterIndex, chapterElement) {
        const chapter = $(chapterElement);
        const chapterData = {
            chapterTitle: chapter.find('strong').text()
                .replace(/[ ]/g, '')
                .replace(/[\r\n]/g, ''),
            videos: []
        };

        chapter.find('.video').children('li').each(function (videoIndex, videoElement) {
            const video = $(videoElement).find('.J-media-item');
            const videoTitle = video.text()
                .replace(/[ ]/g, '')
                .replace(/[\r\n]/g, '')
                // Remove the button label as a whole phrase. A character class
                // ([开始学习]) would delete each of those characters wherever
                // it appears, mangling titles such as "学习目标".
                .replace(/开始学习/g, '');

            // A list item without the expected anchor yields no href; fall back
            // to an empty string so one malformed item does not abort the parse.
            const href = video.attr('href') || '';

            chapterData.videos.push({
                title: videoTitle,
                id: href.split('video/')[1]
            });
        });

        courseData.videos.push(chapterData);
    });

    return courseData;
}

/**
 * Logs each course as a banner line, followed by every chapter title and the
 * chapter's videos formatted as `【id】title`.
 *
 * @param {Array<{title: string, videos: Array<{chapterTitle: string, videos: Array<{id: string, title: string}>}>}>} coursesData
 *     Parsed courses, printed in array order.
 */
function printCourseInfo(coursesData) {
    coursesData.forEach(({ title: courseTitle, videos: chapters }) => {
        console.log(`\n **${courseTitle}**\n `);

        chapters.forEach(({ chapterTitle, videos }) => {
            console.log(chapterTitle);
            videos.forEach(({ id, title }) => {
                console.log(`【${id}】${title}`);
            });
        });
    });
}

/**
 * Requests one page over HTTP and exposes the result as a promise.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @returns {Promise<string>} Resolves with the complete response body decoded
 *     as UTF-8; rejects with the underlying error when the request or the
 *     response stream fails.
 */
function asyncAllPages(url) {
    return new Promise(function (resolve, reject) {
        console.log('启动爬虫1号:' + url);

        http.get(url, (res) => {
            // Decode at the stream level so a multi-byte character split across
            // two chunks is not corrupted by per-chunk Buffer-to-string conversion.
            res.setEncoding('utf8');

            let html = '';
            res.on('data', (chunk) => {
                html += chunk;
            });

            res.on('end', () => {
                resolve(html);
            });

            res.on('error', reject);
        }).on('error', (err) => {
            console.log('获取课程信息出错');
            // The handler must receive the error it forwards; rejecting with an
            // undeclared name throws inside the listener and leaves the promise
            // pending forever.
            reject(err);
        });
    });
}

// Start one request per course id; the resulting promises keep the order of videoIds.
var fetchPageArray = videoIds.map(function (id) {
    return asyncAllPages(baseUrl + id);
});

// Wait for every page, then parse and print them. `pages` holds the response
// bodies in the same order as fetchPageArray.
Promise
    .all(fetchPageArray)
    .then(function (pages) {
        var coursesData = pages.map(function (html) {
            return filterChapter(html);
        });

        printCourseInfo(coursesData);
    })
    .catch(function (err) {
        // Without a rejection handler, a failed request or a parsing error
        // would only surface as an unhandled rejection.
        console.error('抓取课程页面失败:', err);
    });

3、模拟提交请求

var http=require('http')
var querystring = require('querystring')
// Form-encoded body of the comment to submit.
var postData = querystring.stringify({
    'content': 'node learning srart now ! let\'s go ！',
    'mid': 8837
});

// Request description for http.request: target endpoint plus the headers of
// the form submission being imitated.
var options = {
    hostname: 'www.imooc.com',
    port: 80,
    path: '/course/docomment',
    method: 'POST',
    headers: {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        // Content-Length counts bytes, not UTF-16 code units. The two agree for
        // the percent-encoded body above, but measuring bytes keeps the header
        // correct if the body is ever built differently.
        'Content-Length': Buffer.byteLength(postData),
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        // Placeholder: a real session cookie has to be supplied here.
        'Cookie': '...',
        'Host': 'www.imooc.com',
        'Origin': 'http://www.imooc.com',
        'Pragma': 'no-cache',
        'Referer': 'http://www.imooc.com/video/8837',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
};

// Send the request and report the status, headers and completion of the response.
var req = http.request(options, function (res) {
    console.log('status:' + res.statusCode);
    console.log('headers:' + JSON.stringify(res.headers));

    // The body is not parsed; each chunk is only inspected for its type.
    res.on('data', function (chunk) {
        console.log(Buffer.isBuffer(chunk));
        console.log(typeof chunk);
    });

    res.on('end', function () {
        console.log('评论完毕');
    });

    res.on('error', function (e) {
        console.log('Error:' + e.message);
    });
});

// Connection-level failures (DNS lookup, refused or reset connection) are
// emitted on the request object, not on the response. Without a listener the
// 'error' event is thrown as an uncaught exception.
req.on('error', function (e) {
    console.log('Error:' + e.message);
});

req.write(postData);
req.end();