简介:
最近写了一个小工具,用来抓取内涵段子、糗事百科等各种笑话网站的段子和图片,最后保存文本,并发布在微信公众号上。使用谷歌的 V8 做了一个脚本引擎,使用 C++ 实现了笑话的统计和发布功能,用 JS 实现了网页爬取分析的功能。这样 C++ 调用 V8 引擎,加载 JS 脚本,就会爬取一系列的内容。
以下是网页爬取分析的内容,当然js的实现只是思路,用其他语言也是一样能实现。抓取的内容有:文章、图片地址、点赞数。
内涵段子网页抓取分析代码:
// NeiHanDuanzi scraper script.
// Site: http://neihanshequ.com/

// Page requested by the "get web context" step of getJoyContextList.
var webUrl = 'http://neihanshequ.com/';
// Page requested by the "get image" step of getJoyContextList.
var imageUrl = 'http://neihanshequ.com/pic/';

// NOTE(review): index / endIndex are not read by the NeiHanDuanzi functions
// visible in this script; presumably kept for parity with the paged scrapers.
var index = 1;
var endIndex = 5;

// Result object shared by the functions below; getJoyContextList returns it
// serialized with JSON.stringify.
var retVal = { success: false, items: [] };
/**
 * Scans one NeiHanDuanzi HTML page front to back and appends one record per
 * joke to the script-level `retVal.items`.
 *
 * The page text is consumed sequentially: every successful marker lookup
 * discards everything up to and including that marker, so the fields are
 * expected in the order is_gif, data-group-id, digg, data-text, data-pic.
 *
 * Differences from the previous implementation:
 *  - scanning stops as soon as no further `data-group-id` is found, so no
 *    trailing record with an empty webid is pushed for every page;
 *  - a marker located at offset 0 is treated as found (indexOf returns -1,
 *    not 0, on a miss);
 *  - a non-string `htmlData` is treated as an empty page instead of throwing.
 *
 * @param {string} htmlData - Raw HTML of the page to scan.
 * @param {Object} requestParams - Unused; kept for caller compatibility.
 * @returns {undefined} Results are delivered through `retVal.items`.
 */
function getJoyFromOnePage(htmlData, requestParams) {
    // Unparsed tail of the page; shrinks every time a marker is consumed.
    var remaining = typeof htmlData === 'string' ? htmlData : '';

    // Finds `keyWords`, drops everything up to and including it, and returns
    // the text before the next `terminator`. Returns null when `keyWords` is
    // absent (nothing is consumed in that case). A missing terminator yields
    // '' because substring() clamps the -1 from indexOf to 0.
    function takeField(keyWords, terminator) {
        var startIndex = remaining.indexOf(keyWords);
        if (startIndex < 0) {
            return null;
        }
        remaining = remaining.substring(startIndex + keyWords.length);
        return remaining.substring(0, remaining.indexOf(terminator));
    }

    while (true) {
        var result = {
            webname: 'NeiHanDuanzi',
            webid: '',
            type: '',
            context: '',
            pic_url: '',
            read_count: '',
            publish_time: '',
            best_comment: ''
        };

        // A gif entry is not recorded: restart the scan from the text that
        // follows its is_gif flag. Number() applies the same string-to-number
        // conversion the former loose `gif == 1` comparison did.
        var gif = takeField('"is_gif":"', '"');
        if (gif !== null && Number(gif) === 1) {
            alert('NeiHanDuanzi:url is a gif:' + result.pic_url);
            continue;
        }

        // No further joke id means the page is exhausted.
        var webid = takeField('data-group-id="', '"');
        if (webid === null) {
            alert('NeiHanDuanzi:webid not find, page end.');
            break;
        }
        result.webid = webid;
        alert('NeiHanDuanzi:webid-' + result.webid);

        // The remaining fields are optional; a missing one stays ''.
        var readCount = takeField('<span class="digg">', '</span>');
        if (readCount !== null) {
            result.read_count = readCount;
        }

        var context = takeField('data-text="', '"');
        if (context !== null) {
            result.context = context;
        }

        var picUrl = takeField('data-pic="', '"');
        if (picUrl !== null) {
            result.pic_url = picUrl;
        }

        retVal.items.push(result);
    }
}
/**
 * Fetches the NeiHanDuanzi text page (`webUrl`) and picture page (`imageUrl`),
 * feeds both to getJoyFromOnePage and returns the script-level `retVal`
 * serialized as JSON.
 *
 * Differences from the previous implementation:
 *  - the Cookie header's `utmctr` value is restored to its percent-encoded
 *    form; the published text had every '%' replaced by a
 *    "{b75a474a...}" placeholder token;
 *  - a host response that cannot be parsed (for example an empty string) is
 *    reported through the existing failure path instead of throwing.
 *
 * @param {string} url - Unused; the page addresses come from webUrl / imageUrl.
 * @param {string} parametersString - Host-supplied parameters as a JS/JSON
 *        literal; forwarded to the host as `scriptParamaters`.
 * @returns {string} JSON.stringify(retVal). `success` is true only when both
 *          pages answered with status 200.
 */
function getJoyContextList(url, parametersString) {
    // NOTE(review): eval executes whatever text the host passes in. If that
    // text is guaranteed to be strict JSON, JSON.parse is the safer choice;
    // eval is kept because it also accepts non-JSON object literals.
    var parameters = eval("(" + parametersString + ")");

    // NOTE(review): these look like values captured from one browser session
    // (session id, csrf token, analytics ids) - presumably stale; confirm
    // whether the site needs them at all.
    var cookie = [
        'uuid="w:0ef44d961a6d43c99dd81ecb51596731"',
        'sessionid=57f633c63c5de5d0bc03cddb0c6ee166',
        'tt_webid=5286193655',
        '__utmt=1',
        'csrftoken=d760789fbe1fc31edae4ac6c11c5a700',
        'Hm_lvt_773f1a5aa45c642cf87eef671e4d3f6a=1438825221,1438939411,1440988068,1440996740',
        'Hm_lpvt_773f1a5aa45c642cf87eef671e4d3f6a=1440996782',
        '__utma=101886750.2017161997.1438825217.1440995644.1440996740.6',
        '__utmb=101886750.5.10.1440996740',
        '__utmc=101886750',
        '__utmz=101886750.1440996740.6.4.utmcsr=haosou.com|utmccn=(organic)|utmcmd=organic|utmctr=%E5%86%85%E6%B6%B5%E7%A4%BE%E5%8C%BA'
    ].join('; ');

    var requestParams = {
        method: 'GET',
        version: 'HTTP/1.1',
        headers: {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': cookie,
            'Host': 'neihanshequ.com',
            'Pragma': 'no-cache',
            'Referer': 'http://neihanshequ.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
        },
        scriptParamaters: parameters
    };

    // Requests one page through the host and returns its body, or null after
    // logging when the response is missing, unparsable or not status 200.
    // `label` is the variable name shown in the log line.
    function fetchPage(pageUrl, label) {
        var httpRspString = syncHttpRequest(pageUrl, JSON.stringify(requestParams));
        var httpRsp = null;
        try {
            // NOTE(review): same eval concern as for parametersString above.
            httpRsp = eval("(" + httpRspString + ")");
        } catch (parseError) {
            // Treated like a missing response; reported by the check below.
            httpRsp = null;
        }
        if (!httpRsp || httpRsp.statusCode != 200) {
            alert('NeiHanDuanzi: Request ' + label + '(' + pageUrl + ') failed return code:' + (httpRsp ? httpRsp.statusCode : 'undefined'));
            return null;
        }
        return httpRsp.data;
    }

    // Text jokes first, then picture jokes; the first failure aborts the run
    // and returns whatever was collected so far with success still false.
    var pages = [
        { label: 'webUrl', address: webUrl },
        { label: 'imageUrl', address: imageUrl }
    ];
    for (var i = 0; i < pages.length; i++) {
        var htmlData = fetchPage(pages[i].address, pages[i].label);
        if (htmlData === null) {
            return JSON.stringify(retVal);
        }
        getJoyFromOnePage(htmlData, requestParams);
    }

    retVal.success = true;
    return JSON.stringify(retVal);
}
糗事百科网页抓取分析代码:
// QiuShiBaiKe scraper script.
// Page address pattern: http://www.qiushibaike.com/hot/page/{index}

// Prefix of the listing address; the page number is appended to it.
var webUrl = 'http://www.qiushibaike.com/hot/page/';

// First and last page number (inclusive) walked by getJoyContextList.
var index = 1;
var endIndex = 5;

// Result object shared by the functions below; getJoyContextList returns it
// serialized with JSON.stringify.
var retVal = { success: false, items: [] };
/**
 * Scans one QiuShiBaiKe HTML page front to back and appends one record per
 * joke to the script-level `retVal.items`.
 *
 * The page text is consumed sequentially: every successful marker lookup
 * discards everything up to and including that marker, so the fields are
 * expected in the order qiushi_tag_, content, article link/image, vote count.
 *
 * Differences from the previous implementation:
 *  - scanning stops as soon as no further `qiushi_tag_` is found, so no
 *    trailing record with an empty webid is pushed for every page;
 *  - a marker located at offset 0 is treated as found (indexOf returns -1,
 *    not 0, on a miss);
 *  - a non-string `htmlData` is treated as an empty page instead of throwing.
 *
 * @param {string} htmlData - Raw HTML of the page to scan.
 * @returns {undefined} Results are delivered through `retVal.items`.
 */
function getJoyFromOnePage(htmlData) {
    // Unparsed tail of the page; shrinks every time a marker is consumed.
    var remaining = typeof htmlData === 'string' ? htmlData : '';

    // Drops everything up to and including `keyWords`; returns false (and
    // consumes nothing) when `keyWords` is absent.
    function advancePast(keyWords) {
        var startIndex = remaining.indexOf(keyWords);
        if (startIndex < 0) {
            return false;
        }
        remaining = remaining.substring(startIndex + keyWords.length);
        return true;
    }

    // Returns the text before the next `terminator` without consuming it.
    // A missing terminator yields '' because substring() clamps -1 to 0.
    function readUntil(terminator) {
        return remaining.substring(0, remaining.indexOf(terminator));
    }

    while (true) {
        var result = {
            webname: 'QiuShiBaiKe',
            webid: '',
            type: '',
            context: '',
            pic_url: '',
            read_count: '',
            publish_time: '',
            best_comment: ''
        };

        // No further joke id means the page is exhausted.
        if (!advancePast('qiushi_tag_')) {
            alert('QiuShiBaiKe:webid not find, page end.');
            break;
        }
        result.webid = readUntil("'>");
        alert('QiuShiBaiKe:webid-' + result.webid);

        // The remaining fields are optional; a missing one stays ''.
        if (advancePast('<div class="content">')) {
            result.context = readUntil("<!");
        }

        // The picture is looked up only after the link to this joke's own
        // article page has been found.
        if (advancePast('<a href="/article/' + result.webid + '" target="_blank">')) {
            if (advancePast('<img src="')) {
                result.pic_url = readUntil('" alt="');
            }
        }

        if (advancePast('<span class="stats-vote"><i class="number">')) {
            result.read_count = readUntil("</i>");
        }

        retVal.items.push(result);
    }
}
/**
 * Fetches the QiuShiBaiKe "hot" pages numbered `index` .. `endIndex`
 * (inclusive), feeds each to getJoyFromOnePage and returns the script-level
 * `retVal` serialized as JSON.
 *
 * Differences from the previous implementation:
 *  - the loop walks a local page counter; it used to increment the
 *    script-level `index` itself, so a second call fetched nothing and still
 *    reported success;
 *  - a host response that cannot be parsed (for example an empty string) is
 *    reported through the existing failure path instead of throwing.
 *
 * @param {string} url - Unused; page addresses are built from `webUrl`.
 * @param {string} parametersString - Host-supplied parameters as a JS/JSON
 *        literal; forwarded to the host as `scriptParamaters`.
 * @returns {string} JSON.stringify(retVal). `success` is true only when every
 *          page answered with status 200.
 */
function getJoyContextList(url, parametersString) {
    // NOTE(review): eval executes whatever text the host passes in. If that
    // text is guaranteed to be strict JSON, JSON.parse is the safer choice;
    // eval is kept because it also accepts non-JSON object literals.
    var parameters = eval("(" + parametersString + ")");

    var requestParams = {
        method: 'GET',
        version: 'HTTP/1.1',
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            'Host': 'www.qiushibaike.com',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        },
        scriptParamaters: parameters
    };

    for (var page = index; page <= endIndex; page++) {
        var trueUrl = webUrl + page;
        var httpRspString = syncHttpRequest(trueUrl, JSON.stringify(requestParams));

        var httpRsp = null;
        try {
            // NOTE(review): same eval concern as for parametersString above.
            httpRsp = eval("(" + httpRspString + ")");
        } catch (parseError) {
            // Treated like a missing response; reported by the check below.
            httpRsp = null;
        }

        // The first failing page aborts the run and returns whatever was
        // collected so far with success still false.
        if (!httpRsp || httpRsp.statusCode != 200) {
            alert('QiuShiBaiKe: Request trueUrl(' + trueUrl + ') failed return code:' + (httpRsp ? httpRsp.statusCode : 'undefined'));
            return JSON.stringify(retVal);
        }

        getJoyFromOnePage(httpRsp.data);
    }

    retVal.success = true;
    return JSON.stringify(retVal);
}
本文摘录于海阔天空的博客,作者: zjg555543,发布时间: 2015-09-17
网友评论