美文网首页
内涵段子、糗事百科网页抓取分析

内涵段子、糗事百科网页抓取分析

作者: 海阔天空的博客 | 来源:发表于2021-12-06 08:17 被阅读0次

    简介:

    最近写了一个小工具,用来抓取内涵段子、糗事百科等各种笑话网站的段子和图片,最后保存文本,并发布在微信公众号上。使用谷歌的 v8 做了一个脚本引擎,用 c++ 实现了笑话的统计和发布功能,用 js 实现了网页爬取分析的功能。这样 c++ 调用 v8 引擎,加载 js 脚本,就会爬取一系列的内容。

    以下是网页爬取分析的内容,当然js的实现只是思路,用其他语言也是一样能实现。抓取的内容有:文章、图片地址、点赞数。

    内涵段子网页抓取分析代码:

    // NeiHanDuanzi scraper settings.
    // Site: http://neihanshequ.com/

    // Result envelope: page parsing appends to `items`, and the entry point
    // serialises the whole object with JSON.stringify.
    var retVal = { success: false, items: [] };

    // Text-joke landing page and picture-joke listing page.
    var webUrl = 'http://neihanshequ.com/';
    var imageUrl = 'http://neihanshequ.com/pic/';

    // Page range; kept as globals although the functions of this script
    // do not read them.
    var index = 1;
    var endIndex = 5;
     
    /**
     * Scans one NeiHanDuanzi HTML page and appends every post it finds to the
     * global `retVal.items`.
     *
     * The page text is consumed front to back: each marker search starts where
     * the previous one stopped, so fields are expected in the order
     * is_gif -> data-group-id -> digg -> data-text -> data-pic.
     * Scanning stops as soon as no further `data-group-id="` marker exists;
     * nothing is pushed for that final, empty pass.
     *
     * @param {string} htmlData       Raw HTML of the page; a non-string is
     *                                treated as an empty page.
     * @param {Object} requestParams  Unused; kept so existing callers keep working.
     */
    function getJoyFromOnePage(htmlData, requestParams)
    {
        var remaining = (typeof htmlData === 'string') ? htmlData : '';

        // Moves the cursor just past the next occurrence of `marker`.
        // Returns false (cursor untouched) when the marker is absent.
        // A match at offset 0 is a valid match.
        function skipPast(marker)
        {
            var at = remaining.indexOf(marker);
            if( at < 0 )
            {
                return false;
            }
            remaining = remaining.substring(at + marker.length);
            return true;
        }

        // Returns the text between the cursor and `terminator` without moving
        // the cursor. A missing terminator yields '' (substring(0, -1)).
        function readUntil(terminator)
        {
            return remaining.substring(0, remaining.indexOf(terminator));
        }

        while( true )
        {
            //check gif
            if( skipPast('"is_gif":"') )
            {
                // The flag is text; Number() applies the same string-to-number
                // coercion a loose comparison with 1 would.
                if( Number(readUntil('"')) === 1 )
                {
                    // No picture URL has been parsed at this point, so the
                    // message ends right after the colon.
                    alert('NeiHanDuanzi:url is a gif:');
                    continue;
                }
            }

            //webid
            if( !skipPast('data-group-id="') )
            {
                alert('NeiHanDuanzi:webid not find, page end.');
                break;
            }

            var result =
            {
                webname: 'NeiHanDuanzi',
                webid: readUntil('"'),
                type: '',
                context: '',
                pic_url: '',
                read_count: '',
                publish_time: '',
                best_comment: ''
            };
            alert('NeiHanDuanzi:webid-' + result.webid);

            //read_count
            if( skipPast('<span class="digg">') )
            {
                result.read_count = readUntil('</span>');
            }

            //context
            if( skipPast('data-text="') )
            {
                result.context = readUntil('"');
            }

            //pic_url
            if( skipPast('data-pic="') )
            {
                result.pic_url = readUntil('"');
            }

            retVal.items.push(result);
        }

        return;
    }
     
    /**
     * Entry point of the NeiHanDuanzi script: downloads the text page
     * (`webUrl`) and the picture page (`imageUrl`), feeds both to
     * getJoyFromOnePage and returns the global `retVal` as a JSON string.
     *
     * `retVal.success` becomes true only when both requests returned
     * status 200; on the first failure the items gathered so far are
     * returned with `success` still false.
     *
     * @param {string} url               Unused; the global URLs are requested.
     * @param {string} parametersString  Object-literal text, forwarded to the
     *                                   host as `scriptParamaters`.
     * @returns {string} JSON.stringify(retVal)
     */
    function getJoyContextList( url, parametersString )
    {
        // SECURITY(review): eval executes whatever the caller passes in.
        // Switch to JSON.parse once the host is confirmed to send strict JSON.
        var parameters = eval("(" + parametersString + ")");
        var requestParams =
        {
            method: 'GET',
            version: 'HTTP/1.1',
            headers:
            {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
                'Cache-Control': 'no-cache',
                'Connection': 'keep-alive',
                // NOTE(review): hard-coded cookie with session identifiers,
                // presumably captured from a browser - confirm it is still needed.
                // The utmctr value is percent-encoded UTF-8.
                'Cookie': 'uuid="w:0ef44d961a6d43c99dd81ecb51596731"; sessionid=57f633c63c5de5d0bc03cddb0c6ee166; tt_webid=5286193655; __utmt=1; csrftoken=d760789fbe1fc31edae4ac6c11c5a700; Hm_lvt_773f1a5aa45c642cf87eef671e4d3f6a=1438825221,1438939411,1440988068,1440996740; Hm_lpvt_773f1a5aa45c642cf87eef671e4d3f6a=1440996782; __utma=101886750.2017161997.1438825217.1440995644.1440996740.6; __utmb=101886750.5.10.1440996740; __utmc=101886750; __utmz=101886750.1440996740.6.4.utmcsr=haosou.com|utmccn=(organic)|utmcmd=organic|utmctr=%E5%86%85%E6%B6%B5%E7%A4%BE%E5%8C%BA',
                'Host': 'neihanshequ.com',
                'Pragma': 'no-cache',
                'Referer': 'http://neihanshequ.com/',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
            },
            scriptParamaters: parameters
        };

        // Requests one page and hands its body to getJoyFromOnePage.
        // `label` is the name used for the URL in the failure message.
        // Returns true on HTTP 200, false otherwise.
        function scrapePage(targetUrl, label)
        {
            var httpRspString = syncHttpRequest(targetUrl, JSON.stringify(requestParams));
            var httpRsp = null;
            try
            {
                // SECURITY(review): the response text is evaluated as code.
                // Prefer JSON.parse if the host response is strict JSON.
                httpRsp = eval("(" + httpRspString + ")");
            }
            catch( e )
            {
                // An empty or malformed response counts as a failed request.
                alert('NeiHanDuanzi: Request ' + label + '(' + targetUrl + ') returned an unparsable response: ' + e);
            }

            // Loose comparison kept on purpose: statusCode may be a number or a string.
            if( !httpRsp || httpRsp.statusCode != 200 )
            {
                alert('NeiHanDuanzi: Request ' + label + '(' + targetUrl + ') failed return code:' + (httpRsp ? httpRsp.statusCode : 'undefined'));
                return false;
            }

            getJoyFromOnePage(httpRsp.data, requestParams);
            return true;
        }

        //get web context, then get image
        if( scrapePage(webUrl, 'webUrl') && scrapePage(imageUrl, 'imageUrl') )
        {
            retVal.success = true;
        }

        return JSON.stringify(retVal);
    }
    

    糗事百科网页抓取分析代码:

    // QiuShiBaiKe scraper settings.
    // Page URL pattern: http://www.qiushibaike.com/hot/page/{index}

    // Result envelope: page parsing appends to `items`, and the entry point
    // serialises the whole object with JSON.stringify.
    var retVal = { success: false, items: [] };

    // URL prefix; the page number is appended to it for each request.
    var webUrl = 'http://www.qiushibaike.com/hot/page/';

    // First and last page number to request (inclusive).
    var index = 1;
    var endIndex = 5;
     
    /**
     * Scans one QiuShiBaiKe HTML page and appends every post it finds to the
     * global `retVal.items`.
     *
     * The page text is consumed front to back: each marker search starts where
     * the previous one stopped, so fields are expected in the order
     * qiushi_tag_ -> content div -> article link/image -> vote count.
     * Scanning stops as soon as no further `qiushi_tag_` marker exists;
     * nothing is pushed for that final, empty pass.
     *
     * @param {string} htmlData  Raw HTML of the page; a non-string is treated
     *                           as an empty page.
     */
    function getJoyFromOnePage(htmlData)
    {
        var remaining = (typeof htmlData === 'string') ? htmlData : '';

        // Moves the cursor just past the next occurrence of `marker`.
        // Returns false (cursor untouched) when the marker is absent.
        // A match at offset 0 is a valid match.
        function skipPast(marker)
        {
            var at = remaining.indexOf(marker);
            if( at < 0 )
            {
                return false;
            }
            remaining = remaining.substring(at + marker.length);
            return true;
        }

        // Returns the text between the cursor and `terminator` without moving
        // the cursor. A missing terminator yields '' (substring(0, -1)).
        function readUntil(terminator)
        {
            return remaining.substring(0, remaining.indexOf(terminator));
        }

        while( true )
        {
            //webid
            if( !skipPast('qiushi_tag_') )
            {
                alert('QiuShiBaiKe:webid not find, page end.');
                break;
            }

            var result =
            {
                webname: 'QiuShiBaiKe',
                webid: readUntil("'>"),
                type: '',
                context: '',
                pic_url: '',
                read_count: '',
                publish_time: '',
                best_comment: ''
            };
            alert('QiuShiBaiKe:webid-' + result.webid);

            //context
            if( skipPast('<div class="content">') )
            {
                result.context = readUntil("<!");
            }

            //pic_url: the image is looked up only after this post's own
            //article link has been found.
            if( skipPast('<a href="/article/' + result.webid + '" target="_blank">') )
            {
                if( skipPast('<img src="') )
                {
                    result.pic_url = readUntil('" alt="');
                }
            }

            //read_count
            if( skipPast('<span class="stats-vote"><i class="number">') )
            {
                result.read_count = readUntil("</i>");
            }

            retVal.items.push(result);
        }

        return;
    }
     
    /**
     * Entry point of the QiuShiBaiKe script: downloads pages `index` through
     * `endIndex` of the hot list (`webUrl` + page number), feeds each to
     * getJoyFromOnePage and returns the global `retVal` as a JSON string.
     *
     * `retVal.success` becomes true only when every page returned status 200;
     * on the first failure the items gathered so far are returned with
     * `success` still false.
     *
     * @param {string} url               Unused; the global `webUrl` is requested.
     * @param {string} parametersString  Object-literal text, forwarded to the
     *                                   host as `scriptParamaters`.
     * @returns {string} JSON.stringify(retVal)
     */
    function getJoyContextList( url, parametersString )
    {
        // SECURITY(review): eval executes whatever the caller passes in.
        // Switch to JSON.parse once the host is confirmed to send strict JSON.
        var parameters = eval("(" + parametersString + ")");
        var requestParams =
        {
            method: 'GET',
            version: 'HTTP/1.1',
            headers:
            {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
                'Host': 'www.qiushibaike.com',
                'Connection': 'keep-alive',
                'Accept-Encoding': 'gzip, deflate, sdch',
                'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
            },
            scriptParamaters: parameters
        };

        // Iterate with a local counter so the global `index` is left
        // untouched and a later call starts from the first page again.
        for( var page = index; page <= endIndex; page++ )
        {
            var trueUrl = webUrl + page;
            var httpRspString = syncHttpRequest(trueUrl, JSON.stringify(requestParams));
            var httpRsp = null;
            try
            {
                // SECURITY(review): the response text is evaluated as code.
                // Prefer JSON.parse if the host response is strict JSON.
                httpRsp = eval("(" + httpRspString + ")");
            }
            catch( e )
            {
                // An empty or malformed response counts as a failed request.
                alert('QiuShiBaiKe: Request trueUrl(' + trueUrl + ') returned an unparsable response: ' + e);
            }

            // Loose comparison kept on purpose: statusCode may be a number or a string.
            if( !httpRsp || httpRsp.statusCode != 200 )
            {
                alert('QiuShiBaiKe: Request trueUrl(' + trueUrl + ') failed return code:' + (httpRsp ? httpRsp.statusCode : 'undefined'));
                return JSON.stringify(retVal);
            }

            getJoyFromOnePage(httpRsp.data);
        }

        retVal.success = true;
        return JSON.stringify(retVal);
    }
    

    本文摘录于海阔天空的博客,作者: zjg555543,发布时间: 2015-09-17

    相关文章

      网友评论

          本文标题:内涵段子、糗事百科网页抓取分析

          本文链接:https://www.haomeiwen.com/subject/cmudaltx.html