需求:爬取 http://www.27270.com/ent/meinvtupian/ 网站的美女图片和图片标题。
1.需要的第三方库
1.1 iconv-lite
1.2 cheerio
详细用法可在 npm 上查看。
//获取第三方库,解决utf-8与gbk乱码问题
let iconv = require('iconv-lite');
//获取第三方库cheerio,解析html文件dom结构
let cheerio = require('cheerio');
2.爬虫代码
/*
* @author: lansir
* @date: 2018/10/3
* @description go~js!
*/
let http = require('http');
//使用第三方库 iconv-lite
let iconv = require('iconv-lite');
let cheerio = require('cheerio');
let fs = require('fs');
let path = require('path');
const theUrl = "http://www.27270.com/ent/meinvtupian/";
// 1. Fetch the HTML of the target listing page.
http.get(theUrl, res => {
    // A non-200 reply does not carry the gallery markup; drain it so the
    // socket is released and stop here instead of parsing an error page.
    if (res.statusCode !== 200) {
        console.error(`Request for ${theUrl} failed with status ${res.statusCode}`);
        res.resume();
        return;
    }

    // Keep the raw Buffers and decode once at the end: the body is decoded as
    // GBK below, and decoding chunk by chunk could split a multi-byte character.
    const chunks = [];
    res.on('data', chunk => {
        chunks.push(chunk);
    });
    res.on('error', err => {
        console.error(`Reading the response from ${theUrl} failed: ${err.message}`);
    });
    res.on('end', () => {
        // 2. Decode the GBK bytes with iconv-lite to avoid garbled text.
        const html = iconv.decode(Buffer.concat(chunks), 'gbk');
        // 3. Pull each image's src and title out of the DOM, then download them.
        const imageData = getSrcAndTitleFromHtml(html);
        download(imageData);
    });
}).on('error', err => {
    // Without this listener a DNS/connection failure is an unhandled 'error'
    // event and terminates the process.
    console.error(`Request for ${theUrl} failed: ${err.message}`);
});
/**
 * Collect the `src` and `alt` attributes of every gallery thumbnail.
 *
 * @param {string} html - Markup of the listing page.
 * @returns {Array<{src: (string|undefined), title: (string|undefined)}>}
 *     One entry per <img> matched by the selector, in the order cheerio
 *     returns them; `title` holds the image's `alt` text.
 */
function getSrcAndTitleFromHtml(html) {
    const $ = cheerio.load(html);
    return $('div.MeinvTuPianBox>ul>li>a>i>img')
        .toArray()
        .map(node => {
            const img = $(node);
            return { src: img.attr('src'), title: img.attr('alt') };
        });
}
/**
 * Download every image described in `imageData` into the local `imgs` directory.
 *
 * Each file is named "<title><extension of src>". Failures are logged per
 * image and never abort the remaining downloads.
 *
 * @param {Array<{src: string, title: string}>} imageData - Images to fetch.
 *     Entries that are null or lack a non-empty string `src` are skipped
 *     with a warning.
 * @returns {undefined}
 */
function download(imageData) {
    const outputDir = 'imgs';

    // Build a file name that is safe on common file systems.
    const toFileName = ({ src, title }) => {
        // Ignore any query string / fragment when looking for the extension.
        const urlPath = src.split(/[?#]/)[0];
        const extension = path.extname(urlPath);
        const baseName = title
            ? String(title)
            : (path.basename(urlPath, extension) || 'image');
        // A title containing a path separator or a reserved character would
        // otherwise point outside `imgs` or be rejected by the file system.
        return baseName.replace(/[\\/:*?"<>|]/g, '_') + extension;
    };

    const jobs = imageData.filter(obj => {
        const usable = Boolean(obj) && typeof obj.src === 'string' && obj.src !== '';
        if (!usable) {
            console.warn('Skipping image entry without a usable src:', obj);
        }
        return usable;
    });
    if (jobs.length === 0) {
        return;
    }

    // createWriteStream does not create missing directories.
    if (!fs.existsSync(outputDir)) {
        fs.mkdirSync(outputDir);
    }

    jobs.forEach(obj => {
        const target = path.join(outputDir, toFileName(obj));
        let request;
        try {
            request = http.get(obj.src, res => {
                if (res.statusCode !== 200) {
                    console.error(`Skipping ${obj.src}: HTTP status ${res.statusCode}`);
                    res.resume(); // drain so the socket is released
                    return;
                }
                const writer = fs.createWriteStream(target);
                writer.on('error', err => {
                    console.error(`Writing ${target} failed: ${err.message}`);
                    res.unpipe(writer);
                    res.resume();
                });
                res.on('error', err => {
                    console.error(`Downloading ${obj.src} failed: ${err.message}`);
                    writer.destroy();
                });
                res.pipe(writer);
            });
        } catch (err) {
            // http.get throws synchronously for malformed URLs and for
            // protocols other than http:.
            console.error(`Cannot request ${obj.src}: ${err.message}`);
            return;
        }
        request.on('error', err => {
            console.error(`Request for ${obj.src} failed: ${err.message}`);
        });
    });
}
3.使用监听事件events优化爬虫程序
/*
* @author: lansir
* @date: 2018/10/4
* @description go~js!
*/
//获取EventEmitter对象
let EventEmitter = require('events');
let http = require('http');
let fs = require('fs');
//获取第三方库,解决utf-8与gbk乱码问题
let iconv = require('iconv-lite');
//获取第三方库cheerio,解析html文件dom结构
let cheerio = require('cheerio');
let path = require('path');
/**
 * Event-driven image crawler: each stage announces its result with an event
 * and the next stage is started by a listener registered in init().
 *
 * Events emitted:
 *   'getHtmlFinsh'   (html: string)               - listing page fetched and decoded
 *   'extraHtmlFinsh' (Array<{src, title}>)        - image descriptors extracted
 *
 * The event names keep their original spelling so existing listeners still match.
 */
class MyEvent extends EventEmitter {
    /**
     * 1. Fetch the listing page and emit 'getHtmlFinsh' with its HTML.
     *
     * @param {string} [url] - Page to fetch; defaults to the gallery listing page.
     */
    getHtml(url = "http://www.27270.com/ent/meinvtupian/") {
        const request = http.get(url, res => {
            // Do not parse error pages; drain the body so the socket is released.
            if (res.statusCode !== 200) {
                console.error(`Request for ${url} failed with status ${res.statusCode}`);
                res.resume();
                return;
            }
            // Keep raw Buffers and decode once: the body is decoded as GBK, and
            // decoding chunk by chunk could split a multi-byte character.
            const chunks = [];
            res.on('data', chunk => {
                chunks.push(chunk);
            });
            res.on('error', err => {
                console.error(`Reading the response from ${url} failed: ${err.message}`);
            });
            res.on('end', () => {
                const html = iconv.decode(Buffer.concat(chunks), 'gbk');
                // Announce that the HTML is ready.
                this.emit('getHtmlFinsh', html);
            });
        });
        // Without this listener a connection failure is an unhandled 'error'
        // event and terminates the process.
        request.on('error', err => {
            console.error(`Request for ${url} failed: ${err.message}`);
        });
    }

    /**
     * 2. Parse the HTML and emit 'extraHtmlFinsh' with one `{src, title}`
     * object per matched <img> (`title` is the image's `alt` text).
     *
     * @param {string} html - Markup of the listing page.
     */
    getSrcAndTitleFromHtml(html) {
        const $ = cheerio.load(html);
        const imageSrcAndTitleArray = $('div.MeinvTuPianBox>ul>li>a>i>img')
            .toArray()
            .map(node => {
                const img = $(node);
                return { src: img.attr('src'), title: img.attr('alt') };
            });
        // Announce that extraction is finished.
        this.emit('extraHtmlFinsh', imageSrcAndTitleArray);
    }

    /**
     * 3. Download every image into the local `imgs` directory as
     * "<title><extension of src>". Failures are logged per image and never
     * abort the remaining downloads.
     *
     * @param {Array<{src: string, title: string}>} imageSrcAndTitleArray -
     *     Images to fetch. Entries that are null or lack a non-empty string
     *     `src` are skipped with a warning.
     */
    download(imageSrcAndTitleArray) {
        const outputDir = 'imgs';

        // Build a file name that is safe on common file systems.
        const toFileName = ({ src, title }) => {
            // Ignore any query string / fragment when looking for the extension.
            const urlPath = src.split(/[?#]/)[0];
            const extension = path.extname(urlPath);
            const baseName = title
                ? String(title)
                : (path.basename(urlPath, extension) || 'image');
            // A title containing a path separator or a reserved character would
            // otherwise point outside `imgs` or be rejected by the file system.
            return baseName.replace(/[\\/:*?"<>|]/g, '_') + extension;
        };

        const jobs = imageSrcAndTitleArray.filter(obj => {
            const usable = Boolean(obj) && typeof obj.src === 'string' && obj.src !== '';
            if (!usable) {
                console.warn('Skipping image entry without a usable src:', obj);
            }
            return usable;
        });
        if (jobs.length === 0) {
            return;
        }

        // createWriteStream does not create missing directories.
        if (!fs.existsSync(outputDir)) {
            fs.mkdirSync(outputDir);
        }

        jobs.forEach(obj => {
            const pathImage = path.join(outputDir, toFileName(obj));
            let request;
            try {
                request = http.get(obj.src, res => {
                    if (res.statusCode !== 200) {
                        console.error(`Skipping ${obj.src}: HTTP status ${res.statusCode}`);
                        res.resume(); // drain so the socket is released
                        return;
                    }
                    // Stream the body straight to disk.
                    const writer = fs.createWriteStream(pathImage);
                    writer.on('error', err => {
                        console.error(`Writing ${pathImage} failed: ${err.message}`);
                        res.unpipe(writer);
                        res.resume();
                    });
                    res.on('error', err => {
                        console.error(`Downloading ${obj.src} failed: ${err.message}`);
                        writer.destroy();
                    });
                    res.pipe(writer);
                });
            } catch (err) {
                // http.get throws synchronously for malformed URLs and for
                // protocols other than http:.
                console.error(`Cannot request ${obj.src}: ${err.message}`);
                return;
            }
            request.on('error', err => {
                console.error(`Request for ${obj.src} failed: ${err.message}`);
            });
        });
    }

    /**
     * 4. Register the listeners that chain the stages together, then start
     * the crawl.
     */
    init() {
        // Listeners are attached before getHtml() runs so that no event can
        // be emitted while nobody is listening.
        this.on('getHtmlFinsh', (html) => {
            this.getSrcAndTitleFromHtml(html);
        });
        this.on('extraHtmlFinsh', (imageSrcAndTitleArray) => {
            this.download(imageSrcAndTitleArray);
        });
        this.getHtml();
    }
}
// Start the crawler: create the emitter, then init() registers the stage
// listeners and kicks off the page fetch.
let myEvent = new MyEvent();
myEvent.init();
展示:
爬取结果展示.PNG
网友评论