背景:需要座机区号,但是没有现成的json,遂决定去爬一个。
爬那种年久失修、被遗忘的网站,遇到的比较棘手的问题一般就是编码问题,比如 gbk、gb2312 这种。如果所选的网络请求库编码设置不对,有时候拿到的就是乱码,用 iconv 也解不出来。
然后咧,仔细查阅了一下superagent这个用起来比较顺手的库的文档。
其实配合 superagent-charset 这个插件,请求时就可以直接用 .charset() 指定编码方式,不需要自己傻傻地折腾 iconv。
在这里有写
感兴趣的自己去看 https://www.npmjs.com/package/superagent#browser
首先就是装依赖
npm install superagent
npm i superagent-charset --save
npm install cheerio
我就只取了想要的电话区号,还用了个特丑的正则- -
有知道的可以告诉我咋修改得好看一些
以下是完整代码
// HTTP client used to download the listing pages.
const superagent = require('superagent');
// Calling the plugin with the superagent module extends it in place; the
// `.charset()` call made when fetching a page relies on this line.
require('superagent-charset')(superagent);
// HTML parser used to query the downloaded markup with CSS selectors.
let cheerio = require('cheerio');
// Node file-system module, used to write the result file.
let fs = require('fs');
// Number of the page currently being processed (starts at 1). Shared state:
// the crawl loop increments it and the fetch helper prints it as progress.
let num = 1;
/**
 * Download one page and extract the area codes it lists.
 *
 * The response is decoded as gb2312 through `.charset()`. Inside every
 * `#contents` element, each run of 3-4 digits that is immediately followed by
 * an ASCII comma is collected (the comma itself is not part of the result).
 *
 * @param {string} href - URL of the page to fetch.
 * @returns {Promise<string[]>} Extracted codes in document order; may contain
 *   duplicates, and is empty when the page has no matching text.
 */
let task = function(href) {
  // The lookahead keeps the trailing comma out of the match, so the matched
  // strings can be used as-is without a follow-up replace().
  const codePattern = /\d{3,4}(?=,)/g;
  return superagent
    .get(href)
    .charset('gb2312')
    .then(res => {
      const $ = cheerio.load(res.text);
      // Progress log; `num` is the page counter declared outside this function.
      console.log(num, href);
      const data = [];
      $('#contents').each((index, element) => {
        // String#match returns null (not an empty array) when nothing matches;
        // fall back to [] so an element without codes does not throw.
        const codes = $(element).text().match(codePattern) || [];
        data.push(...codes);
      });
      return data;
    });
};
/**
 * Crawl pages 1-19 one after another, de-duplicate the extracted codes and
 * write them to ./res.js as an ES-module default export.
 *
 * The page counter `num` lives outside this function and is advanced here;
 * `task` reads the same counter for its progress log, so it must not be
 * replaced by a local loop variable.
 *
 * @returns {Promise<void>} Resolves once the file has been written; rejects
 *   if a page request or the file write fails.
 */
let main = async function() {
  const collected = [];
  while (num <= 19) {
    const pageCodes = await task(`https://doc.wendoc.com/b0ac7f2bf0ad4e6ccae1b80674d2fd6eb62f3e75b-${num}.html`);
    collected.push(...pageCodes);
    num++;
  }
  // A Set drops duplicates while keeping first-seen order.
  const uniqueCodes = Array.from(new Set(collected));
  // Adapt the callback API to a promise so a failed write rejects instead of
  // being swallowed by an empty callback.
  await new Promise((resolve, reject) => {
    fs.writeFile('./res.js', 'export default' + JSON.stringify(uniqueCodes), err => {
      if (err) {
        reject(err);
      } else {
        resolve();
      }
    });
  });
};
// Report any crawl or write failure and signal it through the exit code
// instead of leaving the promise unhandled.
main().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
爬取到的全国座机区号(即脚本生成的 res.js 格式化后的内容)如下:
export default [
'010',
'021',
'022',
'023',
'0551',
'0564',
'0563',
'0562',
'0561',
'0559',
'0558',
'0557',
'0556',
'0555',
'0554',
'0553',
'0552',
'0550',
'0566',
'0565',
'0591',
'0592',
'0593',
'0594',
'0595',
'0596',
'0597',
'0598',
'0599',
'0931',
'0930',
'0932',
'0933',
'0993',
'0934',
'0935',
'0936',
'0937',
'0938',
'0939',
'0943',
'9401',
'9402',
'9403',
'9404',
'9405',
'9406',
'9407',
'9411',
'9412',
'9413',
'9414',
'9415',
'9416',
'9417',
'9418',
'9421',
'9422',
'9423',
'9424',
'9425',
'9426',
'9441',
'9442',
'9443',
'9444',
'9445',
'9446',
'9447',
'9491',
'9492',
'9493',
'9494',
'9495',
'9496',
'9497',
'9498',
'020',
'0660',
'0661',
'0662',
'0663',
'0668',
'0750',
'0751',
'0752',
'0753',
'0754',
'0755',
'0756',
'0757',
'0758',
'0759',
'0760',
'0762',
'0763',
'0765',
'0766',
'0768',
'0769',
'0771',
'0770',
'0772',
'0773',
'0774',
'0775',
'0776',
'0777',
'0778',
'0779',
'0851',
'0852',
'0853',
'0854',
'0855',
'0856',
'0857',
'0858',
'0859',
'8631',
'8632',
'8633',
'8634',
'8635',
'8640',
'8641',
'8642',
'8643',
'8644',
'8645',
'8646',
'8647',
'8648',
'8649',
'8650',
'8651',
'8652',
'8653',
'8654',
'8655',
'8656',
'8657',
'8658',
'8659',
'8661',
'8662',
'8663',
'8664',
'8665',
'8666',
'8667',
'8668',
'8669',
'8670',
'8671',
'8672',
'8673',
'8674',
'8675',
'8676',
'8677',
'8680',
'8681',
'8682',
'8686',
'8687',
'8688',
'8689',
'0898',
'0899',
'0890',
'0311',
'0312',
'0313',
'0314',
'0315',
'0316',
'0317',
'0318',
'0319',
'0310',
'0335',
'0371',
'0370',
'0372',
'0373',
'0374',
'0375',
'0376',
'0377',
'0378',
'0379',
'0391',
'0392',
'0393',
'0394',
'0395',
'0396',
'0397',
'0398',
'0451',
'0452',
'0453',
'0454',
'0458',
'0456',
'0457',
'0459',
'027',
'0710',
'0711',
'0712',
'0713',
'0714',
'0715',
'0716',
'0717',
'0718',
'0719',
'0722',
'0727',
'0728',
'0731',
'0730',
'0732',
'0733',
'0734',
'0735',
'0736',
'0737',
'0738',
'0739',
'0743',
'0744',
'0745',
'0746',
'0431',
'0432',
'0433',
'0434',
'0435',
'0436',
'0437',
'0438',
'0439',
'0440',
'0448',
'025',
'0510',
'0511',
'0512',
'0513',
'0514',
'0515',
'0516',
'0517',
'0518',
'0519',
'0520',
'0523',
'0527',
'0791',
'0790',
'0792',
'0793',
'0794',
'0795',
'0796',
'0797',
'0798',
'0799',
'0701',
'024',
'0410',
'0411',
'0412',
'0413',
'0414',
'0415',
'0416',
'0417',
'0418',
'0419',
'0421',
'0427',
'0429',
'0471',
'0470',
'0472',
'0473',
'0474',
'0475',
'0476',
'0477',
'0478',
'0479',
'0482',
'4831',
'4887',
'4888',
'0951',
'0952',
'0953',
'0954',
'0971',
'0970',
'0972',
'0973',
'0974',
'0975',
'0976',
'0977',
'0978',
'0979',
'9820',
'9828',
'9831',
'9832',
'9833',
'9834',
'9835',
'9836',
'9837',
'9838',
'9839',
'9840',
'9841',
'9842',
'9843',
'9844',
'9846',
'9847',
'9848',
'9849',
'9851',
'9852',
'9853',
'9854',
'0531',
'0530',
'0532',
'0533',
'0534',
'0535',
'0536',
'0537',
'0538',
'0539',
'0543',
'0546',
'0631',
'0632',
'0633',
'0634',
'0635',
'0351',
'0349',
'0350',
'0352',
'0353',
'0354',
'0355',
'0356',
'0357',
'0358',
'0359',
'029',
'0913',
'0912',
'0911',
'0910',
'0914',
'0916',
'0915',
'9244',
'9243',
'9242',
'9240',
'9229',
'9228',
'9227',
'9226',
'9225',
'9224',
'9223',
'9222',
'9221',
'9220',
'0919',
'0917',
'028',
'0812',
'0813',
'0816',
'0817',
'0818',
'0825',
'0826',
'0830',
'0831',
'0832',
'0833',
'0834',
'0835',
'0836',
'0837',
'0838',
'0839',
'8225',
'8228',
'8229',
'8241',
'8247',
'8270',
'8277',
'8278',
'8279',
'8295',
'8296',
'8298',
'8407',
'8411',
'8417',
'8430',
'7437',
'8440',
'8444',
'8445',
'8447',
'8452',
'8453',
'8455',
'8456',
'8457',
'8458',
'8459',
'8461',
'8462',
'8463',
'8465',
'8466',
'8489',
'8493',
'0891',
'0892',
'0894',
'0895',
'8015',
'8016',
'8017',
'8018',
'8040',
'8049',
'8051',
'8054',
'8056',
'8057',
'8059',
'8061',
'8062',
'0896',
'8067',
'8069',
'8073',
'8078',
'8081',
'0893',
'0991',
'0901',
'0902',
'0903',
'0906',
'0908',
'0909',
'0990',
'0992',
'0994',
'0995',
'0996',
'0997',
'0998',
'0999',
'0871',
'0870',
'0872',
'0873',
'0874',
'0875',
'0876',
'0877',
'0878',
'0879',
'0881',
'0883',
'0886',
'0887',
'0888',
'0691',
'0692',
'0571',
'0570',
'0572',
'0573',
'0574',
'0575',
'0576',
'0577',
'0578',
'0579',
'0580',
'886',
'887',
'888',
'889',
'890',
'891',
'892',
'893',
'894',
'895',
'896',
'897',
'898',
'899',
'900',
'901',
'902',
'903',
'904',
'905',
'906',
'907',
'908',
'909',
'910',
'911',
'912',
'913'
];
网友评论