美文网首页
nodejs 爬视频初探

nodejs 爬视频初探

作者: yes先生boss | 来源:发表于2021-05-08 16:27 被阅读0次

    直接上代码:先抓取首页列表,再逐个进入详情页收集视频地址,最后按并发上限下载视频。

    var cheerio = require("cheerio");
    var fs = require('fs');
    var async = require("async");
    const superagent = require('superagent')
    const request = require('request')
    const mkdirp = require('mkdirp')
    const path = require('path')
    // Crawler settings read by the functions in this file.
    const options = {
        uri: 'http://xxxxx', // video site to crawl
        dirfile: './output/', // directory downloads are saved under
        downLimit: 2 // maximum number of videos downloaded in parallel
    };
    // Entries scraped from the listing page ({ dir, title, url }).
    const prolist = [];
    // Video entries scraped from the detail pages ({ title, url }).
    const videolist = [];

    // Start the crawl. `down` is async, so attach a rejection handler instead
    // of leaving the returned promise floating.
    down(options.uri).catch((err) => {
        console.error(err);
    });
    /**
     * Crawl the site: read the listing page at `url`, visit every detail page
     * it links to, collect the videos found there, then download them.
     *
     * Scraped entries are appended to the module-level `prolist` and
     * `videolist` arrays. Fetch failures are logged rather than thrown: a
     * listing page that cannot be fetched ends the crawl, a detail page that
     * cannot be fetched is skipped.
     *
     * @param {string} url - Address of the listing page.
     * @returns {Promise<void>} Settles once crawling and downloading are done.
     */
    async function down(url) {
        // Listing page.
        let $list;
        try {
            $list = await fetchDocument(superagent.get(url));
        } catch (err) {
            console.error(`获取列表页失败:${url}`, err);
            return;
        }
        // This selector is specific to the target site; inspect the page's
        // elements in a browser to adapt it.
        $list(".comapny-card.bg-fff.div-animationone").each((i, obj) => {
            prolist.push({
                dir: $list(obj).find("h6").text(),
                title: $list(obj).find("a").attr("title"),
                url: $list(obj).find("a").attr("href")
            });
        });

        // Detail pages. Videos are grouped per project so each group is saved
        // into its own directory instead of all into the last project's.
        const batches = [];
        for (const project of prolist) {
            await mkdir(project.dir);
            let $detail;
            try {
                // The site only answers requests that look like a browser, so
                // the headers below imitate one.
                $detail = await fetchDocument(
                    superagent
                        .get(project.url)
                        .set("Connection", "keep-alive")
                        .set("Content-Length", 0)
                        .set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
                        .set("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Mobile Safari/537.36")
                );
            } catch (err) {
                console.error(`获取详情页失败:${project.url}`, err);
                continue;
            }
            // Each matching element carries the video name in `title` and the
            // video address in `rel`.
            const found = [];
            $detail("#List1_1 .video_name1").each((i, obj) => {
                const video = {
                    title: $detail(obj).attr("title"),
                    url: $detail(obj).attr("rel")
                };
                found.push(video);
                videolist.push(video);
            });
            batches.push({ dir: project.dir, videos: found });
        }
        await sleep(2000);

        // Download the videos, one project directory at a time.
        let downloadedAny = false;
        for (const batch of batches) {
            if (batch.videos.length === 0) {
                continue;
            }
            // downliu reports completion through a callback; adapt it so the
            // batches really run one after another.
            await new Promise((resolve) => {
                downliu(batch.dir, batch.videos, resolve);
            });
            downloadedAny = true;
        }
        if (downloadedAny) {
            console.log('下载结束');
        }
    }

    /**
     * Send a prepared superagent request and parse the response body.
     *
     * @param {object} req - A superagent request that has not been sent yet.
     * @returns {Promise<Function>} Resolves with the cheerio handle for the
     *     page; rejects on a request error or a status other than 200, so the
     *     caller is never left waiting on a promise that cannot settle.
     */
    function fetchDocument(req) {
        return new Promise((resolve, reject) => {
            req.end((error, response) => {
                if (error) {
                    reject(error);
                    return;
                }
                if (response.statusCode !== 200) {
                    reject(new Error(`unexpected HTTP status ${response.statusCode}`));
                    return;
                }
                resolve(cheerio.load(response.text, {
                    normalizeWhitespace: true,
                    decodeEntities: false
                }));
            });
        });
    }
    
    /**
     * Create the directory that videos are saved into.
     *
     * The directory is `options.dirfile + title`. It is created synchronously,
     * together with any missing parent directories, so it exists by the time
     * this function returns. A failure is logged instead of being thrown.
     *
     * @param {string} title - Directory name, appended to `options.dirfile`.
     */
    function mkdir(title) {
        console.log('创建目录:%s', title);
        const target = options.dirfile + title;
        if (fs.existsSync(target)) {
            return;
        }
        try {
            // recursive: true also creates missing parent directories.
            fs.mkdirSync(target, { recursive: true });
        } catch (err) {
            console.error(`目录:${title} 创建失败:${err.message}`);
            return;
        }
        console.log(`目录:${title} 创建成功`);
    }
    
    /**
     * Pause for a given time.
     *
     * @param {number} duration - Delay in milliseconds.
     * @returns {Promise<undefined>} Resolves once the timer has fired.
     */
    function sleep(duration) {
        const startTimer = (wake) => {
            setTimeout(wake, duration);
        };
        return new Promise(startTimer);
    }
    
    /**
     * Download a list of videos into `options.dirfile + dir`, running at most
     * `options.downLimit` downloads at the same time.
     *
     * Downloading is best-effort: a failed or invalid entry is logged and
     * skipped so the remaining videos are still fetched.
     *
     * @param {string} dir - Sub-directory of `options.dirfile` to save into.
     * @param {Array<{title: string, url: string}>} links - Videos to download.
     * @param {Function} callback - Handed to `async.eachLimit` as its
     *     completion callback.
     */
    function downliu(dir, links, callback) {
        console.log(`发现${links.length}个视频,准备开始下载...`);
        async.eachLimit(links, options.downLimit, function (video, next) {
            // Without a title there is no file name and without a url there is
            // nothing to fetch; skip the entry instead of throwing.
            if (typeof video.title !== 'string' || typeof video.url !== 'string') {
                console.error('跳过无效的视频条目:', video);
                next();
                return;
            }

            // Report completion of this item exactly once, even if both the
            // request and the file stream signal an outcome.
            let settled = false;
            const finish = () => {
                if (settled) {
                    return;
                }
                settled = true;
                next();
            };

            // basename drops any "/"-separated prefix; spaces are removed too.
            const fileName = path.basename(video.title).replace(/ /g, '');
            const toPath = path.join(options.dirfile + dir, fileName);
            console.log(`开始下载视频:${fileName},保存到:${dir}`);

            const source = request(encodeURI(video.url));
            const target = fs.createWriteStream(toPath + ".mp4");

            source.on('error', function (err) {
                console.error(`视频下载失败:${video.url}`, err.message);
                // Release the file handle; no more data will arrive.
                target.destroy();
                finish();
            });
            // An unhandled 'error' event on the write stream (for example a
            // missing directory) would otherwise crash the process.
            target.on('error', function (err) {
                console.error(`视频保存失败:${toPath}.mp4`, err.message);
                finish();
            });
            target.on('finish', () => {
                console.log(`视频下载成功:${video.url}`);
                finish();
            });
            source.pipe(target);
        }, callback);
    }

    相关文章

      网友评论

          本文标题:nodejs 爬视频初探

          本文链接:https://www.haomeiwen.com/subject/ksgudltx.html