美文网首页
nodejs简易爬虫支持代理ip

nodejs简易爬虫支持代理ip

作者: 北方蜘蛛 | 来源:发表于2018-06-14 22:52 被阅读22次

    nodejs 爬虫支持代理IP 原创雕虫小技

    欢迎一起交流学习,废话不说直接上代码

    const request = require("request")
    const iconv = require('iconv-lite')
    const cheerio = require("cheerio")
    const _ = require("lodash")
    const crypto = require('crypto')
    /**
     * Small crawler built on `request`, with optional HTTP proxy support.
     *
     * Public state that callers may set after construction:
     *  - debug      {boolean}  print progress messages through say()
     *  - proxy      {boolean}  route requests through a proxy (see set_proxy())
     *  - proxy_ip   {string|null} proxy currently in use, e.g. 'http://1.2.3.4:80'
     *  - temp_proxy {string[]} candidate proxies in 'ip:port' form
     *  - headers    {object}   request headers sent by so()
     *  - res_data   {string|string[]} key (or key path) of the item list inside a
     *               JSON response; when unset the parsed response itself is used
     *  - result_obj {object}   output field name -> source key (or key path array)
     *  - result_ext {object}   constant fields merged into every JSON item
     *  - result_list {array}   items produced by the last JSON handle() call
     */
    class spider {

      /**
       * @param {object} config
       * @param {string} [config.Host] stored on this.config; not sent as a header
       * @param {string} [config.doc_type='html'] 'html' or 'json'
       */
      constructor(config) {
        this.config = config
        this.doc_type = config.doc_type || 'html'
        // Placeholder mapping; callers are expected to replace it with
        // "output field -> source key" pairs before crawling JSON.
        this.result_obj = {
          id: Number,
          title: String,
          link: String,
          desc: String,
          from: String
        }
        this.result_ext = {}
        this.debug = false
        this.proxy = false
        this.proxy_ip = null
        this.temp_proxy = []
        // A plain object (not an array) because it is used as a name -> value map.
        // No Host header is preset: the original constructor overwrote it with an
        // empty list, and forcing config.Host could mismatch the requested URL.
        this.headers = {}
        this.res_data = null
        this.result_list = []
      }

      /**
       * Fetch a URL (through the current proxy when proxying is enabled) and hand
       * the decoded body to handle().
       *
       * @param {string} url
       * @param {string} method
       * @return {Promise<*>} whatever handle() returns, or false when no usable
       *         proxy was found or the request / parsing failed
       */
      async so(url = "baidu.com", method = 'get') {
        const options = {
          method: method,
          url: url,
          timeout: 8000,
          // Receive raw bytes so that GBK/GB2312 pages can be decoded correctly.
          encoding: null,
          headers: this.headers
        }
        console.log('开始爬取', url)

        const has_pool = Boolean(this.temp_proxy && this.temp_proxy.length)
        const use_proxy = Boolean(this.proxy && (this.proxy_ip || has_pool))
        if (use_proxy) {
          if (!this.proxy_ip) {
            this.say('找出可用的代理ip')
            this.proxy_ip = await this.check_ip()
            console.log('获取到了才到下一步')
          }
          if (!this.proxy_ip) {
            console.log('还没有的话。估计是不行了')
            return false
          }
          options.proxy = this.proxy_ip
        }

        try {
          const body = this._decode_body(await this._request(options))
          if (use_proxy) {
            this.say('这个IP果然牛逼!!!!', this.proxy_ip)
          }
          this.say('爬取完成了,丢出去html给下一个兄弟处理\n')

          if (this.doc_type === 'json') {
            const parsed = JSON.parse(body)
            const rows = this.res_data == null ? parsed : this._pick(parsed, this.res_data)
            return this.handle(rows)
          }
          this.say(this.config.doc_type)
          return this.handle(body)
        } catch (e) {
          if (use_proxy) {
            this.say(options.proxy + '爬取失败了,因为这原因代理ip换一下吧' + options.proxy, e)
            // Forget the failing proxy so the next call picks another candidate.
            const failed = this.proxy_ip
            this.temp_proxy = (this.temp_proxy || []).filter(ip => 'http://' + ip !== failed)
            this.proxy_ip = null
          } else {
            this.say('爬取失败了,因为这原因', e)
            console.log(e)
          }
          return false
        }
      }

      /**
       * Download the public proxy list.
       *
       * @returns {Promise<string[]>} unique 'ip:port' entries (possibly empty);
       *          rejects when the request itself fails
       */
      async get_proxy_list() {
        const api_url = 'http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=http%3A%2F%2Fwww.66ip.cn%2F%3Fsxb%3D%26tqsl%3D100%26ports%255B%255D2%3D%26ktip%3D%26sxa%3D%26radio%3Dradio%26submit%3D%25CC%25E1%2B%2B%25C8%25A1';
        const options = {
          method: 'GET',
          url: api_url,
          gzip: true,
          encoding: null,
          headers: {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
            'referer': 'http://www.66ip.cn/'
          }
        }
        const body = await this._request(options)
        return this._parse_proxy_list(this._decode_body(body))
      }

      /**
       * Enable proxy mode and make sure there is a pool of candidate proxies.
       * An existing non-empty pool is kept; an empty one is refilled from
       * get_proxy_list().
       */
      async set_proxy() {
        this.proxy = true
        this.say('好吧请稍等,代理IP稍后奉上....')
        if (!Array.isArray(this.temp_proxy) || this.temp_proxy.length === 0) {
          this.temp_proxy = (await this.get_proxy_list()) || []
          this.say('这次一用力获取到了' + this.temp_proxy.length + '个IP,请测试!')
        } else {
          this.say('原来的代理IP还有慢慢慢用')
        }
      }

      /**
       * Turn a response payload into the crawler's result.
       *
       * - 'json': maps every entry of `data` through result_obj, merges
       *   result_ext, adds a `key`, and returns the freshly built result_list.
       * - 'html': returns the cheerio handle for `data`.
       * - anything else: returns result_list untouched.
       *
       * @param {*} data list of raw items (json) or an HTML string (html)
       * @returns {array|Function}
       */
      handle(data) {
        if (this.doc_type === 'json') {
          // Start from a clean list so rows of a previous, longer response
          // do not leak into this one.
          this.result_list = []
          const rows = (data !== null && typeof data === 'object') ? data : {}
          for (const i of Object.keys(rows)) {
            const item = {}
            for (const field of Object.keys(this.result_obj)) {
              // A mapping is either one key or an array describing a nested path.
              item[field] = this._pick(rows[i], this.result_obj[field])
            }
            const new_item = Object.assign(item, this.result_ext)
            new_item['key'] = this.set_key(new_item.title)
            this.result_list[i] = new_item
          }
          if (this.debug) {
            this.say(rows[0])
            this.say(this.result_list[0])
          }
        }

        if (this.doc_type === 'html') {
          const $ = cheerio.load(data)
          this.say('这是一个html文档,具体是否可用要问一下下面的兄弟才知道')
          return $
        }
        return this.result_list
      }

      /**
       * Probe one candidate proxy.
       *
       * The proxy counts as usable when the probe request succeeds and its body
       * is shorter than 100 characters. When called without a candidate the pool
       * is refreshed through set_proxy() and false is returned.
       *
       * @param {string} proxy_ip candidate in 'ip:port' form
       * @returns {Promise<string|false>} 'http://ip:port' when usable, else false
       */
      async test_proxy(proxy_ip) {
        if (proxy_ip == null) {
          this.say('ip 已经没了,重新获取')
          this.temp_proxy = []
          await this.set_proxy()
          return false
        }

        // Change the URL below to the site you actually intend to crawl.
        const target_options = {
          method: 'GET',
          url: 'http://ip.chinaz.com/getip.aspx',
          timeout: 8000,
          proxy: 'http://' + proxy_ip
        }
        this.say(`开始测试这个IP ${proxy_ip}`)

        try {
          const body = String(await this._request(target_options))
          if (body.length < 100) {
            this.say(`兄弟这个可以拿去用==>> ${proxy_ip}`)
            return 'http://' + proxy_ip
          }
          this.say(`这个IP无效==>> ${proxy_ip}`)
          return false
        } catch (e) {
          // Any transport error simply means this proxy is not usable.
          return false
        }
      }

      /**
       * Print a progress message, only when debug is on.
       * @param {*} lang
       * @param {*} [lang1]
       */
      say(lang, lang1 = '') {
        if (this.debug) {
          console.log(lang, lang1)
        }
      }

      /**
       * Walk the candidate pool in order and return the first usable proxy.
       * @returns {Promise<string|false>}
       */
      async check_ip() {
        if (!this.proxy) {
          return false
        }
        for (const candidate of this.temp_proxy) {
          console.log(candidate)
          const true_ip = await this.test_proxy(candidate)
          if (true_ip) {
            return true_ip
          }
        }
        return false
      }

      /**
       * Build an md5 hex key from `str` plus a random salt in 0..100, so the same
       * input does not always give the same key.
       * @param {string} str
       * @returns {string} 32-char hex digest, or 'no str' for empty input
       */
      set_key(str) {
        if (!str) {
          return 'no str'
        }
        const rand = Math.ceil(Math.random() * 100)
        return crypto.createHash('md5').update(str + rand).digest('hex')
      }

      /** Promise wrapper around `request`; resolves with the response body. */
      _request(options) {
        return new Promise((resolve, reject) => {
          request(options, (error, response, body) => {
            if (error) {
              reject(error)
              return
            }
            resolve(body)
          })
        })
      }

      /** Convert a response body (Buffer or string) to text, honouring a GB2312/GBK meta charset. */
      _decode_body(body) {
        if (!Buffer.isBuffer(body)) {
          return body == null ? '' : String(body)
        }
        // latin1 maps bytes 1:1, so the ASCII charset declaration can be
        // sniffed before choosing the real decoder.
        if (/meta[^>]*charset=["']?(gb2312|gbk)/i.test(body.toString('latin1'))) {
          return iconv.decode(body, 'gbk')
        }
        return body.toString('utf8')
      }

      /** Extract unique 'ip:port' entries (ports of 1 to 5 digits) from text. */
      _parse_proxy_list(text) {
        const found = String(text).match(/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/g) || []
        return Array.from(new Set(found))
      }

      /** Read `source[path]`, or follow `path` as a list of nested keys; undefined when any step is missing. */
      _pick(source, path) {
        const keys = Array.isArray(path) ? path : [path]
        if (keys.length === 0) {
          return undefined
        }
        let current = source
        for (const key of keys) {
          if (current === undefined || current === null) {
            return undefined
          }
          current = current[key]
        }
        return current
      }
    }
    
    /**
     * API (JSON) example: query the juejin search endpoint and print the first
     * mapped item.
     */

    const api_spider = new spider({
      Host: 'juejin.com',
      doc_type: 'json'
    })
    // Key of the item list inside the JSON response.
    api_spider.res_data = 'd'
    // Output field -> key of the raw item it is read from.
    api_spider.result_obj = {
      id: 'objectId',
      title: 'title',
      link: 'originalUrl',
      desc: 'content',
      like: 'collectionCount',
      comments: 'commentsCount',
      createdAt: 'createdAt'
    }
    // Constant fields added to every item.
    api_spider.result_ext = {
      from: '掘金'
    }

    api_spider.so('https://search-merger-ms.juejin.im/v1/search?query=node&page=0&raw_result=false&src=web')
      .then((data) => {
        // so() yields a falsy value instead of a list when the crawl failed.
        if (!data) {
          console.error('api example: no data returned')
          return
        }
        console.log(data[0])
      })
      .catch((err) => {
        console.error(err)
      })
    
    
    
    /**
     * HTML example: search cnblogs for "node" and collect one record per
     * `div.searchItem` element into the spider's result_list.
     *
     * @returns {Promise<array>} the collected records; empty when the page could
     *          not be fetched
     */

    const so = async function () {

      const start = new spider({
        Host: 'zzk.cnblogs.com',
        doc_type: 'html'
      })
      start.headers['Cookie'] = 'GA1.2.182722864.1520590357; UM_distinctid=162105121121049-0b14491452f4e4-32667b04-1aeaa0-162105121136f2; _gid=GA1.2.1004931403.1527083606; __utma=59123430.182722864.1520590357.1527094058.1527094058.1; __utmc=59123430; __utmz=59123430.1527094058.1.1.utmcsr=cnblogs.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=59123430.3.10.1527094058'
      const $ = await start.so('http://zzk.cnblogs.com/s?t=b&w=node')
      // On failure so() does not return the cheerio handle (a function),
      // so there is nothing to query.
      if (typeof $ !== 'function') {
        return start.result_list
      }
      $('div.searchItem').each((i, elem) => {
        const row = $(elem)
        const item = row.children('.searchItemTitle')
        const info = row.children('.searchItemInfo')
        start.result_list[i] = {
          id: i + 1,
          title: item.text(),
          link: item.children('a').attr('href'),
          // Keep only the first 80 characters of the summary.
          desc: row.children('.searchCon').text().slice(0, 80),
          from: '博客园',
          createdAt: info.children('.searchItemInfo-publishDate').text(),
          comments: info.children('.searchItemInfo-comments').text(),
          views: info.children('.searchItemInfo-views').text(),
          key: start.set_key(item.text())
        }
      })
      return start.result_list
    }
    // Run the HTML example and print its first record; report failures instead
    // of leaving the rejection unhandled.
    so().then((data) => {
      console.log(data[0])
    }).catch((err) => {
      console.error(err)
    })
    module.exports = spider
    ``

    相关文章

      网友评论

          本文标题:nodejs简易爬虫支持代理ip

          本文链接:https://www.haomeiwen.com/subject/vogbjftx.html