美文网首页
爬虫案例---Nodejs

爬虫案例---Nodejs

作者: 蓝Renly | 来源:发表于2018-10-04 15:39 被阅读0次

    需求:爬取http://www.27270.com/ent/meinvtupian/网站的美女图片和图片标题;

    1.需要的第三方库

    1.1 iconv-lite

    1.2 cheerio

    详细使用方法可查看npm上对应库的文档。

    //获取第三方库,解决utf-8与gbk乱码问题
    let iconv = require('iconv-lite');
    //获取第三方库cheerio,解析html文件dom结构
    let cheerio = require('cheerio');
    
    2.爬虫代码
    /*
    * @author:  lansir
    * @date:    2018/10/3
    * @description  go~js!
    */
    
    let http = require('http');
    //使用第三方库 iconv-lite
    let iconv = require('iconv-lite');
    let cheerio = require('cheerio');
    let fs = require('fs');
    let path = require('path');
    
    const theUrl = "http://www.27270.com/ent/meinvtupian/";
    //1. Fetch the HTML of the listing page, then extract the image data and download it.
    //   Failures are logged instead of crashing the process with an unhandled 'error' event.
    http.get(theUrl, res => {
        if (res.statusCode !== 200) {
            console.error(`Request for ${theUrl} failed with status ${res.statusCode}`);
            //Drain the body so the underlying socket is released
            res.resume();
            return;
        }
        //Collect raw Buffers; decoding must wait until all bytes have arrived,
        //because a multi-byte character may be split across two chunks
        const chunks = [];
        res.on('data', chunk => {
            chunks.push(chunk);
        });
        res.on('error', err => {
            console.error(`Reading ${theUrl} failed: ${err.message}`);
        });
        res.on('end', () => {
            //2. Decode the bytes as GBK with iconv-lite to avoid garbled text
            const html = iconv.decode(Buffer.concat(chunks), 'gbk');
            //3. Pull each image's src and title out of the DOM and download the images
            const imageData = getSrcAndTitleFromHtml(html);
            download(imageData);
        });
    }).on('error', err => {
        console.error(`Request for ${theUrl} failed: ${err.message}`);
    });
    
    /**
     * Collect the `src` and `alt` attributes of every image matched by the
     * listing selector in the given HTML.
     *
     * @param {string} html - HTML text of the listing page.
     * @returns {Array<{src: (string|undefined), title: (string|undefined)}>}
     *     One entry per matched <img>, in document order; `title` holds the
     *     image's `alt` attribute.
     */
    function getSrcAndTitleFromHtml(html) {
        const $ = cheerio.load(html);
        const imgNodes = $('div.MeinvTuPianBox>ul>li>a>i>img').toArray();
        return imgNodes.map(node => {
            const img = $(node);
            const src = img.attr('src');
            const title = img.attr('alt');
            return {src, title};
        });
    }
    /**
     * Download every image described in `imageData` into the local `imgs`
     * directory, naming each file after its title plus the extension taken
     * from its URL.
     *
     * Entries without a usable `src` are skipped. Request, HTTP-status and
     * file-write failures are logged per image so that one bad entry does not
     * abort the remaining downloads.
     *
     * @param {Array<{src: string, title: string}>} imageData - Images to fetch.
     * @returns {void}
     */
    function download(imageData) {
        const targets = imageData.filter(item => item && typeof item.src === 'string' && item.src !== '');
        if (targets.length === 0) {
            return;
        }
        //createWriteStream does not create missing directories
        fs.mkdirSync('imgs', {recursive: true});
        targets.forEach(obj => {
            //Replace characters that are path separators or illegal in file names
            const safeTitle = String(obj.title).replace(/[\\\/:*?"<>|]/g, '_');
            //Ignore any query string or fragment when deriving the extension
            const extension = path.extname(obj.src.split(/[?#]/)[0]);
            const pathImag = path.join('imgs', safeTitle + extension);
            try {
                http.get(obj.src, res => {
                    if (res.statusCode !== 200) {
                        console.error(`Skipping ${obj.src}: status ${res.statusCode}`);
                        //Drain the body so the underlying socket is released
                        res.resume();
                        return;
                    }
                    const writer = fs.createWriteStream(pathImag);
                    writer.on('error', err => {
                        console.error(`Writing ${pathImag} failed: ${err.message}`);
                    });
                    res.pipe(writer);
                }).on('error', err => {
                    console.error(`Downloading ${obj.src} failed: ${err.message}`);
                });
            } catch (err) {
                //http.get throws synchronously for malformed or non-http URLs
                console.error(`Skipping ${obj.src}: ${err.message}`);
            }
        });
    }
    
    3.使用监听事件events优化爬虫程序
    /*
    * @author:  lansir
    * @date:    2018/10/4
    * @description  go~js!
    */
    
    //获取EventEmitter对象
    let EventEmitter = require('events');
    let http = require('http');
    let fs = require('fs');
    //获取第三方库,解决utf-8与gbk乱码问题
    let iconv = require('iconv-lite');
    //获取第三方库cheerio,解析html文件dom结构
    let cheerio = require('cheerio');
    let path = require('path');
    
    /**
     * Event-driven crawler. The three stages (fetch HTML, extract image data,
     * download images) are chained through events emitted on the instance:
     *   'getHtmlFinsh'   (html)  - emitted by getHtml() once the page is decoded
     *   'extraHtmlFinsh' (items) - emitted by getSrcAndTitleFromHtml() with the
     *                              array of {src, title} objects
     * Call init() to wire the stages together and start the crawl.
     */
    class MyEvent extends EventEmitter {

        /**
         * Step 1: fetch the listing page, decode it as GBK and emit
         * 'getHtmlFinsh' with the HTML text. Request failures and non-200
         * responses are logged and no event is emitted.
         */
        getHtml() {
            const pageUrl = "http://www.27270.com/ent/meinvtupian/";
            http.get(pageUrl, res => {
                if (res.statusCode !== 200) {
                    console.error(`Request for ${pageUrl} failed with status ${res.statusCode}`);
                    //Drain the body so the underlying socket is released
                    res.resume();
                    return;
                }
                //Collect raw Buffers; decode only after the whole body has arrived
                const chunks = [];
                res.on('data', (chunk) => {
                    chunks.push(chunk);
                });
                res.on('error', err => {
                    console.error(`Reading ${pageUrl} failed: ${err.message}`);
                });
                res.on('end', () => {
                    const html = iconv.decode(Buffer.concat(chunks), 'gbk');
                    //Notify listeners that the HTML is ready
                    this.emit('getHtmlFinsh', html);
                });
            }).on('error', err => {
                console.error(`Request for ${pageUrl} failed: ${err.message}`);
            });
        }

        /**
         * Step 2: parse the HTML, collect the `src` and `alt` attributes of
         * every image matched by the listing selector, and emit
         * 'extraHtmlFinsh' with an array of {src, title} objects.
         *
         * @param {string} html - HTML text of the listing page.
         */
        getSrcAndTitleFromHtml(html) {
            const $ = cheerio.load(html);
            const imgNodes = $('div.MeinvTuPianBox>ul>li>a>i>img').toArray();
            const imageSrcAndTitleArray = imgNodes.map(node => {
                const img = $(node);
                return {
                    src: img.attr('src'),
                    title: img.attr('alt'),
                };
            });
            //Notify listeners that the image data has been extracted
            this.emit('extraHtmlFinsh', imageSrcAndTitleArray);
        }

        /**
         * Step 3: download every image into the local `imgs` directory, naming
         * each file after its title plus the extension taken from its URL.
         * Entries without a usable `src` are skipped; per-image failures are
         * logged so one bad entry does not abort the remaining downloads.
         *
         * @param {Array<{src: string, title: string}>} imageSrcAndTitleArray
         */
        download(imageSrcAndTitleArray) {
            const targets = imageSrcAndTitleArray.filter(
                item => item && typeof item.src === 'string' && item.src !== ''
            );
            if (targets.length === 0) {
                return;
            }
            //createWriteStream does not create missing directories
            fs.mkdirSync('imgs', {recursive: true});
            targets.forEach(obj => {
                //Replace characters that are path separators or illegal in file names
                const safeTitle = String(obj.title).replace(/[\\\/:*?"<>|]/g, '_');
                //Ignore any query string or fragment when deriving the extension
                const extension = path.extname(obj.src.split(/[?#]/)[0]);
                const pathImage = path.join('imgs', safeTitle + extension);
                try {
                    http.get(obj.src, res => {
                        if (res.statusCode !== 200) {
                            console.error(`Skipping ${obj.src}: status ${res.statusCode}`);
                            res.resume();
                            return;
                        }
                        const writer = fs.createWriteStream(pathImage);
                        writer.on('error', err => {
                            console.error(`Writing ${pathImage} failed: ${err.message}`);
                        });
                        res.pipe(writer);
                    }).on('error', err => {
                        console.error(`Downloading ${obj.src} failed: ${err.message}`);
                    });
                } catch (err) {
                    //http.get throws synchronously for malformed or non-http URLs
                    console.error(`Skipping ${obj.src}: ${err.message}`);
                }
            });
        }

        /**
         * Step 4: register the listeners that chain the stages together, then
         * start the crawl. Listeners are registered before getHtml() is called
         * so the first event cannot be missed. Each call adds a new pair of
         * listeners, so call this once per instance.
         */
        init() {
            this.on('getHtmlFinsh', (html) => {
                this.getSrcAndTitleFromHtml(html);
            });
            this.on('extraHtmlFinsh', (imageSrcAndTitleArray) => {
                this.download(imageSrcAndTitleArray);
            });
            this.getHtml();
        }
    }
    //Start the crawler: create the event-driven crawler and kick off the pipeline
    let myEvent = new MyEvent();
    myEvent.init();
    

    展示:


    爬取结果展示.PNG

    相关文章

      网友评论

          本文标题:爬虫案例---Nodejs

          本文链接:https://www.haomeiwen.com/subject/xqmxaftx.html