P1 工具：Node.js、cheerio、superagent、fs、async

利用 superagent 请求获取 html 内容

利用 cheerio 解析获取的 html 内容
利用 fs 读写文件
利用 async 异步流程控制

P2 分析赶集网租房页面html结构

1.分析页面结构

租房块 .f-list-item

对应的 html 代码

2. 分析结果：

租房.title = $('.f-list-item').find('.dd-item.title a').attr('title');
租房.address = $('.f-list-item').find('.area').text().replace(/\s+/g, '');  //去除字符串内的全部空格
租房.price = $('.f-list-item').find('.price').text();
租房.size = $('.f-list-item').find('.dd-item.size').text().replace(/\s+/g, '');

P3 利用 superagent 发出 GET 请求获取页面的 html

参考文档：superagent Github、通读superagent文档[整理稿] 、好用的 HTTP模块SuperAgent

1. 将要请求的页面链接放进一个数组，便于并发执行多个任务

// Number of result pages to crawl.
var pages = 2000;

/**
 * First waterfall step: builds the URL of every result page and hands the
 * list to the next step.
 *
 * Also calls readFile() so that previously saved records start loading.
 * NOTE(review): that call is not awaited here; cb fires as soon as it returns.
 *
 * @param {function(?Error, string[])} cb - receives (null, links).
 */
function getLinkArr(cb) {
  var links = [];
  var pageNo = 1;
  while (pageNo <= pages) {
    // Hangzhou listings. For Shanghai use 'http://sh.ganji.com/fang1/o<N>/' instead.
    links.push('http://hz.ganji.com/fang1/o' + pageNo + '/');
    pageNo += 1;
  }
  readFile();
  cb(null, links);
}

2.GET请求页面 html

/**
 * Second waterfall step: downloads every page in `links` and passes the HTML
 * of each successful response to getRoom().
 *
 * A page that cannot be fetched or parsed is logged and skipped so that one
 * bad page does not abort the whole crawl.
 *
 * @param {string[]} links - page URLs to request.
 * @param {function(?Error, Array=)} cb - receives (null, data) when every
 *   link has been handled, or (err) if the iteration itself reports an error.
 */
function start(links, cb) {
  // At most 5 requests are in flight at any time, to avoid hammering the server.
  async.eachLimit(links, 5, function (item, callback) {
    // Every task waits 1s before sending its request, to slow the crawl down.
    setTimeout(function () {
      request.get(item).end(function (err, res) {
        // On network-level failures `res` can be undefined, so it must be
        // checked before reading res.ok.
        if (!res || !res.ok) {
          console.log(item + ' failed...');
          callback();
          return;
        }
        console.log(item);
        try {
          getRoom(res.text);
          console.log(item + ' is processing...');
        } catch (parseErr) {
          // A page that cannot be parsed is reported and skipped.
          console.log(item + ' failed...');
          console.log(parseErr.stack);
        }
        callback();
      });
    }, 1000);
  }, function (err) {
    if (err) {
      console.log('A file failed to process');
      // Forward the error so the surrounding flow can finish instead of hanging.
      cb(err);
      return;
    }
    console.log('All files have been processed successfully');
    cb(null, data);
  });
}

P4 利用 cheerio 处理获取到的html

cheerio 文档

/**
 * Extracts every rental listing from one result page and appends the ones
 * not already present to the shared `data` array.
 *
 * Each record is built as a plain object rather than by concatenating a JSON
 * string and re-parsing it, so titles containing quotes or other special
 * characters no longer throw. Duplicate detection runs on the same normalized
 * record that gets stored.
 *
 * NOTE(review): stored values may now contain characters that JSON.stringify
 * escapes (for example "); whatever loads the saved file must parse it as
 * standard JSON.
 *
 * @param {string} html - HTML source of one listing page.
 */
function getRoom(html) {
  var $ = cheerio.load(html);
  var rooms = $('.f-list-item').toArray();
  console.log('room is：' + rooms.length);

  // Missing values become '' instead of the text "undefined"; the rest goes
  // through formatString so stored values keep the same normalization as before.
  function clean(value) {
    return value == null ? '' : formatString(String(value));
  }

  rooms.forEach(function (el) {
    var node = $(el);
    var record = {
      title: clean(node.find('.dd-item.title a').attr('title')),               // listing title
      address: clean(node.find('.area').text().replace(/\s+/g, '')),           // address, whitespace removed
      price: clean(node.find('.price').text()),                                // monthly price
      size: clean(node.find('.dd-item.size').text().replace(/\s+/g, ''))       // size / layout summary
    };
    if (findItem(data, record) === -1) {
      data.push(record);
    }
  });
}

P5 利用 fs 读写文件，在上一次爬取基础上增加数据

nodejs 数据读写详解

1.读文件

/**
 * Loads the previously saved records from `path` into the shared `data`
 * variable.
 *
 * Only a leading BOM is stripped before parsing. The file text is not passed
 * through a backslash rewrite, because that would corrupt valid JSON escape
 * sequences such as \" and make the parse fail.
 *
 * `data` is left untouched when the file is missing, empty, unparsable, or
 * does not hold an array.
 *
 * @param {function(?Error, Array=)} [cb] - optional; called once loading has
 *   finished, with (null, data) or (err). A missing file is not reported as
 *   an error.
 */
function readFile(cb) {
  var fileData = '';
  var settled = false;
  var readStream = fs.createReadStream(path);

  // Reports the outcome at most once, and only if a callback was supplied.
  function finish(err) {
    if (settled) {
      return;
    }
    settled = true;
    if (typeof cb === 'function') {
      if (err) {
        cb(err);
      } else {
        cb(null, data);
      }
    }
  }

  readStream.setEncoding('UTF8');
  readStream.on('data', function (chunk) {
    fileData += chunk;
  });
  readStream.on('end', function () {
    var text = fileData.replace(/^\ufeff/, '').trim();
    if (text === '') {
      // Nothing saved yet.
      finish(null);
      return;
    }
    var parsed;
    try {
      parsed = JSON.parse(text);
    } catch (err) {
      console.log(err.stack);
      finish(err);
      return;
    }
    if (!Array.isArray(parsed)) {
      var shapeErr = new Error('Saved data is not a JSON array');
      console.log(shapeErr.stack);
      finish(shapeErr);
      return;
    }
    data = parsed;
    console.log(data);
    finish(null);
  });
  readStream.on('error', function (err) {
    console.log(err.stack);
    // A file that does not exist yet means there is nothing to load.
    finish(err && err.code === 'ENOENT' ? null : err);
  });
}

2.写文件

/**
 * Final waterfall step: serializes `data` as JSON and saves it to `path`.
 *
 * cb is invoked only after fs.writeFile has finished, so completion is never
 * reported while the write is still in flight, and a write error is forwarded
 * to the caller instead of being followed by a success message.
 *
 * @param {Array} data - records to persist.
 * @param {function(?Error, string=)} cb - receives (err) on failure or
 *   (null, 'Job Is Done !') on success.
 */
function writeFile(data, cb) {
  console.log('最终爬取到：' + data.length + ' 条数据');
  console.log("开始写入数据...");
  fs.writeFile(path, JSON.stringify(data), function (err) {
    if (err) {
      console.error(err);
      cb(err);
      return;
    }
    console.log("数据写入成功！");
    cb(null, 'Job Is Done !');
  });
}

P6 总结

1.避免获取到重复数据

不知什么原因，爬了 4 天才爬到 4 万多条数据，平均每天只有约 1 万条新数据，与赶集网上显示的 12 万+ 条相差甚远。每次爬取都可能遇到已经保存过的房源，因此写入前需要先去重。

/**
 * Duplicate lookup: returns the index of the first record in `data` whose
 * title, address, price and size are all strictly equal to those of `item`,
 * or -1 when there is no such record.
 *
 * @param {Array<Object>} data - records already collected.
 * @param {Object} item - candidate record.
 * @returns {number} index of the matching record, or -1.
 */
function findItem(data, item) {
  var fields = ['title', 'address', 'price', 'size'];
  for (var pos = 0; pos < data.length; pos += 1) {
    var candidate = data[pos];
    var isSame = fields.every(function (field) {
      return item[field] === candidate[field];
    });
    if (isSame) {
      return pos;
    }
  }
  return -1;
}

// Alternative: serialize `data` to a single string and search it with indexOf().
// 'xxx' is a placeholder for the text that identifies the record being checked.
var str = JSON.stringify(data);
if (str.indexOf('xxx') < 0) {
  data.push(JSON.parse(formatString(item)));
}

2.读取JSON文本内容报错

报错： Unexpected token in JSON at position
原因：UTF-8 的 BOM 头导致 JSON 解析异常。部分编辑器（例如 Windows 记事本）在保存 UTF-8 编码的文件时，会在文件开头插入三个不可见的字节（0xEF 0xBB 0xBF，即 BOM），读入后表现为字符 U+FEFF。

/**
 * Prepares text for JSON parsing: removes the first BOM character (U+FEFF)
 * and replaces every backslash with a forward slash.
 *
 * null and undefined are returned unchanged.
 *
 * @param {?string} str - text to clean.
 * @returns {?string} the cleaned text, or the input itself when it is null/undefined.
 */
function formatString(str) {
  if (str == null) {
    return str;
  }
  var withoutBom = str.replace("\ufeff", "");
  return withoutBom.replace(/\\/g, "/");
}

3. 写文件的操作总是先于 superagent 获取并处理页面执行，使用 async 的 waterfall 解决

async 文档

// Run the stages strictly in order with async.waterfall: build the links,
// crawl the pages, then write the file.
async.waterfall([getLinkArr, start, writeFile], function (err, result) {
  if (!err) {
    console.log('任务完成！！！');
    return;
  }
  console.log('error: ' + err);
});

4.访问页面过快，被服务器识别出，需要填验证码，封IP

利用 async 的 eachLimit 限制同时执行的任务数，防止访问过快。注意 eachLimit 并不是分批执行：它始终保持最多 limit 个任务在运行，每完成一个任务就立即启动下一个。

// Minimal eachLimit demo: three items, at most two running at the same time.
async.eachLimit(["123", "456", "789"], 2, function (item, callback) {
  console.log(item);
  // callback() must be called, otherwise the next task is never started.
  callback();
}, function (error) {
  if (!error) {
    return;
  }
  console.error("error: " + error);
});