美文网首页
node.js 小试爬虫

node.js 小试爬虫

作者: 小学生的博客 | 来源:发表于2017-10-27 17:55 被阅读7次

依赖模块:cheerio(解析 HTML)、request(下载图片)

封装 download.js

let http = require("http");

/**
 * Fetch a URL over HTTP and hand the complete response body to a callback.
 *
 * Chunks are collected as Buffers and decoded once at the end, so a multi-byte
 * UTF-8 character that happens to be split across two chunks is not corrupted
 * (appending each chunk to a string decodes the chunks independently).
 *
 * @param {string} url - Address to request; passed unchanged to http.get.
 * @param {function(?string, Error=): void} callback - Called exactly once:
 *   with the body text on success, or with null (plus the error as an
 *   optional second argument) when the request or response stream fails.
 */
function download(url, callback) {
    let settled = false;

    // A failing transfer can emit several events; only the first outcome is reported.
    function finish(...args) {
        if (settled) {
            return;
        }
        settled = true;
        callback(...args);
    }

    http.get(url, function (res) {
        const chunks = [];
        res.on('data', function (chunk) {
            // Normalise to Buffer in case an encoding was set on the stream.
            chunks.push(typeof chunk === 'string' ? Buffer.from(chunk) : chunk);
        });
        res.on('end', function () {
            finish(Buffer.concat(chunks).toString('utf8'));
        });
        res.on('error', function (err) {
            finish(null, err);
        });
    }).on('error', function (err) {
        finish(null, err);
    });
}

// Expose download() to modules that require this file.
exports.download = download;

index.js

let cheerio = require("cheerio");
let server = require("./download");
let request = require('request')
let fs = require('fs')
let url = "http://image.baidu.com/"

// Fetch the page at `url`, then save every matching <img> as ./image/<index>.jpg.
server.download(url, function (data) {
    if (!data) {
        console.log("error");
        return;
    }

    // createWriteStream does not create missing directories, so make sure the
    // target folder exists before any image is written.
    if (!fs.existsSync('./image')) {
        fs.mkdirSync('./image');
    }

    let $ = cheerio.load(data);
    $('div.img_pic_wrap_layer img').each(function (index, item) {
        let img = $(this).attr('src');
        // Some <img> tags carry no src; there is nothing to download for those.
        if (!img) {
            return;
        }

        let target = './image/' + index + '.jpg';
        try {
            request(img)
                .on('error', function (err) {
                    console.log('download failed: ' + img, err.message);
                })
                .pipe(fs.createWriteStream(target))
                .on('error', function (err) {
                    console.log('write failed: ' + target, err.message);
                });
        } catch (err) {
            // NOTE(review): request appears to throw synchronously for URLs it
            // cannot parse; keep one bad src from aborting the whole loop.
            console.log('download failed: ' + img, err.message);
        }
    });
});


over

使用 express 脚手架搭建的 cnode 爬虫

let cheerio = require('cheerio')
let superagent = require('superagent')
let cnodeUrl = 'https://cnodejs.org'
// GET /cnode: wait for every page promise in `arr`, then reply with the results.
router.get('/cnode', (req, res, next) => {
    Promise.all(arr)
        .then(data => {
            console.log(data)
            // Without a response the client would wait until its own timeout.
            // `data` holds one entry per promise in `arr`.
            res.send(data)
        })
        // Hand any failed page fetch to Express' error handling instead of
        // leaving the rejection unhandled and the request open.
        .catch(next)
});

/**
 * Fetch one topic-list page and extract a summary object for each topic row.
 *
 * @param {string} url - Address of the list page to request.
 * @returns {Promise<Array<Object>>} Resolves with one object per
 *   `#topic_list .cell` element (keys: avator, avatorImg, title, url, tab,
 *   last_time, clickNum). Rejects when the request fails or the response
 *   cannot be parsed.
 */
function getPageAsync(url) {
    return new Promise((resolve, reject) => {
        superagent.get(url).end((err, result) => {
            if (err) {
                reject(err)
                return
            }
            // This callback runs after the executor has returned, so an
            // exception thrown here would not reject the promise by itself.
            try {
                resolve(parseTopicList(result.text))
            } catch (parseErr) {
                reject(parseErr)
            }
        })
    })
}

// Turn the HTML of a topic-list page into an array of topic summaries.
function parseTopicList(html) {
    const $ = cheerio.load(html)

    // Trimmed attribute value, or '' when the element or attribute is absent,
    // so one incomplete row cannot abort the whole page.
    const trimmedAttr = ($node, name) => {
        const value = $node.attr(name)
        return typeof value === 'string' ? value.trim() : ''
    }

    const items = []
    $('#topic_list .cell').each((idx, ele) => {
        const $element = $(ele)
        const $avatar = $element.find('.user_avatar')
        const $titleLink = $element.find('.topic_title_wrapper a')
        items.push({
            avator: cnodeUrl + $avatar.attr('href'),
            // `.user_avatar` is itself the link (its href is read above), so
            // the image source has to come from the <img> nested inside it.
            avatorImg: $avatar.find('img').attr('src'),
            title: trimmedAttr($titleLink, 'title'),
            url: cnodeUrl + trimmedAttr($titleLink, 'href'),
            tab: $element.find('.topic_title_wrapper span').text().trim(),
            last_time: $element.find('.last_time span').text().trim(),
            clickNum: $element.find('.reply_count  span').text().trim(),
        })
    })
    return items
}

// Start the requests for list pages 1 through 9 right away; `arr` keeps the
// pending promises in page order.
let arr = Array.from({ length: 9 }, (_, offset) => {
    const pageUrl = `${cnodeUrl}?tab=all&page=${offset + 1}`
    console.log(pageUrl)
    return getPageAsync(pageUrl)
})

相关文章

网友评论

      本文标题:node.js 小试爬虫

      本文链接:https://www.haomeiwen.com/subject/hirdpxtx.html