美文网首页GIS加油站
【node爬虫】通过高德接口爬取地址的经纬度

【node爬虫】通过高德接口爬取地址的经纬度

作者: 牛老师讲webgis | 来源:发表于2021-11-15 17:54 被阅读0次

    概述

    通过地址获取经纬度在GIS中我们称之为地理编码,与之对应的就是通过经纬度获取地址,被称为逆地理编码。不论是地理编码还是逆地理编码,在我们实际的工作、学习中都会有很多的应用场景,本文讲述的是在node环境下,通过高德API实现经纬度数据的获取。

    效果

    爬虫中 爬虫结束 返回的结果 实现效果

    实现

    在本示例中,首先将地址数据导入到数据库中,同时将获取到的经纬度数据存储在数据库中。实现代码如下:

    const pgConfig = require('./model/pgConfig');
    const pg = require('pg');
    const pool = new pg.Pool(pgConfig);
    let request = require("./utils/request");
    const chalk = require('chalk');
    
    // API keys applied for on the AMap platform, indexed 0..2.
    // One of them is chosen at random for every geocoding request.
    const aKey = {
      0: '申请的key1',
      1: '申请的key2',
      2: '申请的key3'
    }
    // Endpoint that the geocoding requests are sent to.
    const aUrl = 'https://restapi.amap.com/v3/geocode/geo'
    
    /**
     * Returns a random integer.
     *
     * - `getRandom()`         -> integer in [80, 200] (the parameter defaults)
     * - `getRandom(max)`      -> integer in [1, max]
     * - `getRandom(min, max)` -> integer in [min, max]
     *
     * @param {number} [minNum=80]  lower bound (inclusive); when it is the only
     *                              argument passed it is treated as the upper
     *                              bound of a 1-based range instead
     * @param {number} [maxNum=200] upper bound (inclusive)
     * @return {number}
     */
    function getRandom(minNum = 80, maxNum = 200) {
      // A single explicit argument keeps its historical meaning: 1..minNum.
      if (arguments.length === 1) {
        // Math.floor instead of parseInt: parseInt stringifies its argument, so
        // tiny floats such as 5e-7 would be parsed as 5.
        return Math.floor(Math.random() * minNum + 1);
      }
      // Zero arguments fall through to here and use the declared defaults
      // rather than returning a constant 0.
      return Math.floor(Math.random() * (maxNum - minNum + 1) + minNum);
    }
    
    /**
     * Requests `url` through the shared `request` helper.
     *
     * The helper's promise is returned as-is (via `async`), so a failed request
     * rejects this promise instead of leaving it pending forever, and callers
     * can handle the failure with `try/catch` or `.catch()`.
     *
     * @param url address to request
     * @return {Promise<unknown>} settles with whatever `request(url)` settles with
     */
    async function getJson(url) {
      return request(url)
    }
    
    /**
     * Looks up an address through the geocoding endpoint.
     *
     * A key is picked at random from `aKey` for each call. The response is
     * parsed as JSON and the `location` field of the first entry in `geocodes`
     * is returned.
     *
     * @param address address text appended to the request as the `address` parameter
     * @return {Promise<*>} the first match's `location`, or `undefined` when
     *                      there is no match or anything in the lookup throws
     *                      (the error is only logged with `console.debug`)
     */
    async function getAddressLonLat(address) {
      try {
        const key = aKey[getRandom(0, 2)]
        const body = await getJson(`${aUrl}?address=${address}&key=${key}`)
        const { geocodes: hits } = JSON.parse(body.toString())
        // No usable match list: nothing to return.
        if (!hits || !(hits.length > 0)) return undefined
        return hits[0].location
      } catch (err) {
        console.debug(err)
      }
    }
    
    /**
     * Geocodes every row of `table` whose `lonlat` column is still empty and
     * writes the coordinates back to the table in batches.
     *
     * Flow:
     *   1. select the ids of all rows with `lonlat = ''`;
     *   2. for each id, wait a random 80-200 ms, read the row's `reg_location`
     *      and `company_name`, and geocode their concatenation;
     *   3. after every 20 processed rows (and after the last one) write the
     *      collected coordinates back with a single UPDATE;
     *   4. print the elapsed time and exit the process with code 0.
     *
     * Every row is processed exactly once; a row whose lookup or query fails is
     * skipped (its `lonlat` stays empty, so a later run picks it up again). A
     * batch whose UPDATE fails is kept and retried with the next batch.
     *
     * @param {string} table table name. It is interpolated into the SQL text
     *                       (identifiers cannot be bound as parameters), so it
     *                       must come from trusted code, never from user input.
     * @return {Promise<void>} resolves early, without exiting, if the initial
     *                         id query fails
     */
    async function startSpider(table) {
      const BATCH_SIZE = 20
      const sleep = ms => new Promise(resolve => setTimeout(resolve, ms))

      console.time(table)

      // Ids of all rows that still need coordinates.
      let rows
      try {
        const res = await pool.query(`select id from ${table} where lonlat  = '' order by id;`)
        rows = res.rows
      } catch (err) {
        console.error('数据库查询错误', err)
        return
      }
      const count = rows.length
      console.log(chalk.red(`----------开始处理${table},共${count}条记录----------`))

      const rowSql = `select reg_location as loc, company_name as name
                       from ${table} where id = $1;`

      // Writes one batch of { id, lonlat } pairs with a single parameterized
      // UPDATE ... CASE statement. Returns true when the batch is safely stored
      // (or empty), false when the UPDATE failed.
      const flush = async batch => {
        if (batch.length === 0) return true
        const whenClauses = batch
          .map((_, i) => `WHEN $${2 * i + 1} THEN $${2 * i + 2}::text`)
          .join(' ')
        const idList = batch.map((_, i) => `$${2 * i + 1}`).join(',')
        const values = []
        batch.forEach(item => values.push(item.id, item.lonlat))
        try {
          await pool.query(
            `UPDATE ${table} SET lonlat = CASE id ${whenClauses} END WHERE id IN (${idList});`,
            values
          )
          return true
        } catch (err) {
          console.error(`${table}----------数据库更新错误`, err)
          return false
        }
      }

      let pending = []
      for (let index = 0; index < count; index++) {
        // Random pause between lookups to stay under the API's rate limits.
        // The bounds are passed explicitly so the delay never collapses to 0.
        await sleep(getRandom(80, 200))

        const id = rows[index].id
        let row
        try {
          const res = await pool.query(rowSql, [id])
          row = res.rows[0]
        } catch (err) {
          // Skip this row but keep the crawl going.
          console.error('数据库查询错误', err)
        }

        if (row) {
          const location = await getAddressLonLat([row.loc, row.name].join(''))
          if (location) pending.push({ id, lonlat: location })
        }

        // Progress output and batch write-back.
        const processed = index + 1
        if (processed % BATCH_SIZE === 0 || processed === count) {
          console.log(`${table}:----------${processed}`)
          // Only drop the batch once it is stored; otherwise retry it later.
          if (await flush(pending)) pending = []
        }
      }

      console.timeEnd(table)
      console.log(chalk.green(`----------${table}结束处理----------`))
      process.exit(0) // normal termination
    }
    
    // Entry point: verify that the database is reachable, then start the crawl.
    pool.connect((isError, client, release) => {
      if (isError) {
        console.error(chalk.red('数据库连接错误'))
        // Without a working connection every later query would fail as well.
        return
      }
      // The client was only checked out to test connectivity; hand it back so it
      // does not occupy a pool slot for the lifetime of the process.
      release()
      startSpider('company_2006')
    })
    

    request代码如下:

    const request = require("request");
    
    /**
     * Sends an HTTP request through the `request` library and returns a promise
     * for the response body.
     *
     * @param {string} url target address; it is passed through `encodeURI`
     *                     before being sent
     * @param {Object} [options] extra `request` options, merged over the
     *                           defaults below (the object itself is not modified)
     * @return {Promise<*>} resolves with the response body when the status code
     *                      is 200; rejects with the transport error, or with an
     *                      Error for any other status
     * @throws {Error} synchronously when `url` is missing or empty
     */
    function handleRequestByPromise(url, options = {}) {
      // Validate before encoding: encodeURI(undefined) would yield "undefined".
      const encodedUrl = url === undefined || url === null ? "" : encodeURI(url);
      if (encodedUrl === "") {
        throw new Error("请求的url地址不正确");
      }

      const op = Object.assign(
        {
          method: "GET",
          encoding: null,
          // `request` reads the plural `headers` option; `header` is ignored.
          headers: {
            "User-Agent":
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
            Referer: "https://www.meituri.com"
          }
        },
        options,
        // Applied last so the encoded url always wins, without touching `options`.
        { url: encodedUrl }
      );

      return new Promise(function(resolve, reject) {
        request(op, (err, response, body) => {
          if (err) {
            reject(err);
            return;
          }
          if (response && response.statusCode === 200) {
            resolve(body);
          } else {
            reject(new Error(`请求${url}失败!`));
          }
        });
      });
    }
    
    module.exports = handleRequestByPromise
    

    pgConfig代码如下:

    // Connection settings passed to the pg connection pool.
    // The string values below look like placeholders to be replaced with real credentials.
    const config = {
      host: 'ip',
      user: 'user',
      database: 'database',
      password: 'password',
      port: 5432,
      // Extended (pool) options
      max: 40, // maximum number of connections in the pool
      idleTimeoutMillis: 3000, // maximum idle time of a connection: 3 s
    };
    
    module.exports = config;
    

    说明

    1. aKey设置了多个,是为了防止账户被封;
    2. 在获取经纬度数据的时候设置了80-200ms的随机延迟,也是为了防止账户被封;

    相关文章

      网友评论

        本文标题:【node爬虫】通过高德接口爬取地址的经纬度

        本文链接:https://www.haomeiwen.com/subject/ltmqtrtx.html