美文网首页python
使用Python脚本拉取2014 CSDN博客之星投票情况

使用Python脚本拉取2014 CSDN博客之星投票情况

作者: 张明云 | 来源:发表于2015-01-07 16:32 被阅读164次

    前言

      最近在自学Python,正好2014 CSDN博客之星投票搞得如火如荼,拿来练练手。

    • 环境:Win7 64位 Python 2.7;

    • 用到了正则表达式、函数、写文件、urllib2;

    • 没有用到线程;

    • 程序也不怎么规范,但终归是能够达到目的了,哈哈。

    源码

    # -*- coding: utf-8 -*-
    
    import urllib2;
    import re;
    import os;
    import thread;
    
    
    def loadBlogSort(url):
        """Scrape every listing page of the 2014 CSDN blog-star vote into a text file.

        The page count is obtained from getPageCount(url). Each listing page is
        then downloaded, the candidates' nickname, vote count and three
        statistics are extracted with regular expressions, and the blog address
        of every candidate is resolved through getBlogUrl(). One line per
        candidate is written to 'csdn_blog_star_vote.txt' (relative to the
        current working directory); an existing file is overwritten.

        Args:
            url: Address of a listing page; only used to look up the page count.

        Note:
            The per-page result lists are combined with zip(), so when the
            patterns match a different number of entries only the candidates
            present in every list are written.
        """
        pageCount = getPageCount(url)
        print('pageCount ==  %s' % pageCount)
        baseUrl = 'http://vote.blog.csdn.net/Blogstar2014/Selection?PageIndex='
        urlSuffix = '#content'
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        headers = {'User-Agent': user_agent}
        blankSize = 20

        # Compile the loop-invariant patterns once. re.S lets '.' span newlines.
        # Groups: (href, title, nickname)
        namePattern = re.compile(r'<div\sclass="star-con"><span\sclass="star-name"><a\shref=(.+?)\starget="_blank"\stitle=(.+?)>(.+?)</a></span>', re.S)
        # Groups: (detail page href, img attributes)
        detailPattern = re.compile(r'<dt><a\shref="(.+?)"\s\starget="_blank"><img\ssrc=(.+?)></a></dt>', re.S)
        # Groups: (label, span id, vote count)
        votePattern = re.compile(r'<p><b>(.+?)</b><span\sid=(.+?)>(.+?)</span></p>', re.S)
        # Three <span> groups, written out as page views, post count, comment count
        infoPattern = re.compile(r'<div\sclass="star-post1"><span>(.+?)</span><span>(.+?)</span><span>(.+?)</span></div>', re.S)

        filepath = 'csdn_blog_star_vote.txt'
        # Mode 'w' truncates an existing file, so no explicit delete is needed;
        # the with-block closes the file even if a request fails mid-run.
        with open(filepath, 'w') as f:
            for pageIndex in range(1, int(pageCount) + 1):
                contentUrl = baseUrl + str(pageIndex) + urlSuffix
                print('pageIndex ==  %s  contentUrl ==  %s' % (pageIndex, contentUrl))
                request = urllib2.Request(contentUrl, headers=headers)
                response = urllib2.urlopen(request)
                try:
                    result = response.read()
                finally:
                    response.close()

                nameList = [match[2] for match in namePattern.findall(result)]
                # One extra HTTP request per candidate to resolve the blog address.
                blogUrlList = [getBlogUrl(match[0]) for match in detailPattern.findall(result)]
                voteList = [match[2] for match in votePattern.findall(result)]
                infos = infoPattern.findall(result)

                for name, vote, info, blogUrl in zip(nameList, voteList, infos, blogUrlList):
                    f.write(_formatCandidateLine(name, vote, info, blogUrl, blankSize))

        print('写文件完毕!')


    def _formatCandidateLine(name, vote, info, blogUrl, width):
        """Build one output line: each labelled value is space-padded to `width`.

        Values already longer than `width` are written unpadded. `info` is the
        3-tuple of statistics captured from a listing entry.
        """
        return ('昵称:' + name.ljust(width)
                + '得票:' + vote.ljust(width)
                + '博文浏览量: ' + info[0].ljust(width)
                + '博文数:' + info[1].ljust(width)
                + '评论数:' + info[2].ljust(width)
                + '博客地址:' + blogUrl + '\n')
    
    def getBlogUrl(detailUrl):
        """Return the blog address linked from a candidate's vote-detail page.

        Args:
            detailUrl: Path of the detail page; it is appended to
                'http://vote.blog.csdn.net/' to form the request URL.

        Returns:
            The href of the first matching link on the detail page, as a string.

        Raises:
            IndexError: If the page contains no link matching the expected markup.
        """
        url = 'http://vote.blog.csdn.net/' + detailUrl
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            result = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        # Groups: (text before the link, href, link text)
        blogUrls = re.findall(r'<p>(.+?)<a\shref="(.+?)"\s\starget="_blank">(.+?)</a></p>', result, re.S)
        if not blogUrls:
            # Same exception type as indexing the empty list, but with context.
            raise IndexError('no blog link found on ' + url)
        blogUrl = str(blogUrls[0][1])
        print('blogUrl ==  ' + url + '\n' + blogUrl)
        return blogUrl
    
    def getPageCount(url):
        """Return the total number of listing pages announced by the vote page.

        Args:
            url: Address of a listing page to download.

        Returns:
            The text inside the hidden <div id="PageCount"> element, as a string
            (callers convert it to int themselves).

        Raises:
            IndexError: If the page has no PageCount element in the expected markup.
        """
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            result = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        pageCount = re.findall(r'<div\sid="PageCount"\sstyle="\sdisplay:none">(.+?)</div>', result, re.S)
        if not pageCount:
            # Same exception type as indexing the empty list, but with context.
            raise IndexError('PageCount element not found on ' + url)
        return pageCount[0]
    
    # Script entry: start from the first listing page and dump all results.
    url = 'http://vote.blog.csdn.net/Blogstar2014/Selection?PageIndex=1#content'
    loadBlogSort(url)
    

    效果

    2014_csdn_blog_star_vote

    相关文章

      网友评论

        本文标题:使用Python脚本拉取2014 CSDN博客之星投票情况

        本文链接:https://www.haomeiwen.com/subject/zrfxxttx.html