美文网首页
node+phantomjs+cheerio 实现爬虫(爬取百度

node+phantomjs+cheerio 实现爬虫(爬取百度

作者: lovelydong | 来源:发表于2018-07-02 10:21 被阅读0次

    1.安装 phantomjs
    下载
    http://phantomjs.org/download.html
    解压后,将 phantomjs 可执行文件所在目录加入环境变量 PATH
    npm i phantom
    (注意:pa.js 中 require('phantom') 加载的是 npm 包 phantom,而不是 phantomjs)
    2.安装 cheerio
    使用管理员身份运行cmd
    npm install cheerio request
    (请在 pa.js 所在目录本地安装:默认情况下 -g 全局安装的包无法被 require 加载;代码同时依赖 request)
    3.node 代码
    pa.js

    const phantom = require('phantom')
    const cheerio = require('cheerio')
    const request = require('request')
    const fs = require('fs')
    /**
     * Pause for a number of seconds.
     *
     * @param {number} second - How long to wait, in seconds.
     * @returns {Promise<void>} Promise that resolves once the wait has elapsed.
     */
    function delay(second) {
        const milliseconds = second * 1000;
        return new Promise(function (done) {
            setTimeout(done, milliseconds);
        });
    }
    // Baidu image-search results page to crawl. The `word` query parameter holds
    // the search keywords, percent-encoded in the charset named by `ie=gb18030`.
    // Declared with `const` because the script never reassigns it.
    const url = 'http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%CF%C4%C4%BF%D3%D1%C8%CB%D5%CA%B1%DA%D6%BD1080&fr=ala&ala=1&alatpl=adress&pos=0&hs=2&xthttps=000000';
    /**
     * Download one image into the `./image` directory.
     *
     * The file is named after the current timestamp in milliseconds, followed by
     * the extension found on the last path segment of the URL. Query strings and
     * fragments are ignored when looking for the extension; when none can be
     * found, `jpg` is used.
     *
     * The download is fire-and-forget: the function returns as soon as the
     * transfer has been started. Failures are logged instead of thrown so one
     * bad image does not abort the whole crawl.
     *
     * @param {string} url - Absolute URL of the image to download.
     * @returns {void}
     */
    function save(url) {
        if (typeof url !== 'string' || url === '') {
            console.error('save: skipped, no image URL was provided');
            return;
        }
        // Drop "scheme://host" plus any query/fragment so that dots in the host
        // name or in query parameters are never mistaken for a file extension.
        const pathOnly = url
            .replace(/^[a-z][a-z\d+.-]*:\/\/[^/?#]*/i, '')
            .split(/[?#]/)[0];
        const lastSegment = pathOnly.split('/').pop();
        const dotIndex = lastSegment.lastIndexOf('.');
        const candidate = dotIndex >= 0 ? lastSegment.slice(dotIndex + 1) : '';
        const ext = /^[A-Za-z0-9]+$/.test(candidate) ? candidate : 'jpg';
        const target = `./image/${new Date().getTime()}.${ext}`;

        request(url)
            .on('error', (err) => {
                console.error(`save: download failed for ${url}: ${err.message}`);
            })
            .pipe(fs.createWriteStream(target))
            .on('error', (err) => {
                console.error(`save: could not write ${target}: ${err.message}`);
            });
    }
    /**
     * Entry point of the crawler.
     *
     * 1. Opens the search page `url` in PhantomJS.
     * 2. Scrolls down 1000px at a time (one step per second) until at least
     *    TARGET_COUNT `.imgbox` elements are present or MAX_SCROLLS is reached.
     * 3. Collects the detail-page link of every `.imgbox`.
     * 4. Opens each detail page in turn and hands the `src` of `#currentImg`
     *    to `save`.
     *
     * The PhantomJS process is always shut down, even when a step fails.
     */
    (async function() {
        const TARGET_COUNT = 200; // stop scrolling once this many thumbnails exist
        const MAX_SCROLLS = 100;  // safety cap so a short result list cannot loop forever
        let instance;
        try {
            instance = await phantom.create();
            const page = await instance.createPage();
            // Set the viewport before loading so the first layout already uses it.
            await page.property('viewportSize', {
                width: 1920,
                height: 1080
            });
            const status = await page.open(url);
            if (status !== 'success') {
                throw new Error(`Could not open the search page (status: ${status})`);
            }

            // Scroll step by step; each pass re-parses the current DOM snapshot.
            let $;
            for (let i = 0; i < MAX_SCROLLS; i++) {
                await delay(1);
                await page.property('scrollPosition', {
                    left: 0,
                    top: 1000 * i
                });
                $ = cheerio.load(await page.property('content'));
                if ($('.imgbox').length >= TARGET_COUNT) {
                    break;
                }
            }

            // Thumbnails link to detail pages through site-relative hrefs.
            const urlList = [];
            $('.imgbox').each(function() {
                const href = $(this).find('a').attr('href');
                if (href) {
                    urlList.push(`https://image.baidu.com${href}`);
                }
            });

            // Visit the detail pages one at a time; the single PhantomJS page is reused.
            for (const detailUrl of urlList) {
                const detailStatus = await page.open(detailUrl);
                if (detailStatus !== 'success') {
                    console.error(`Skipping ${detailUrl} (status: ${detailStatus})`);
                    continue;
                }
                await delay(1);
                const $detail = cheerio.load(await page.property('content'));
                const src = $detail('#currentImg').attr('src');
                if (src) {
                    save(src);
                } else {
                    console.error(`No #currentImg src found on ${detailUrl}`);
                }
            }
        } catch (err) {
            console.error('Crawl failed:', err);
            process.exitCode = 1;
        } finally {
            if (instance) {
                await instance.exit();
            }
        }
    }());
    
    

    4.在 pa.js 所在的执行目录下创建 image 文件夹(用于存放爬取到的图片)
    5.在该目录下打开 cmd,执行:
    node pa.js

    然后等着图片一个个出来吧 滑稽

    相关文章

      网友评论

          本文标题:node+phantomjs+cheerio 实现爬虫(爬取百度

          本文链接:https://www.haomeiwen.com/subject/wqpkuftx.html