
python: simple scraping of the first 100 pages of SegmentFault data, with tag statistics

Author: _Construct | Published 2017-12-14 09:46

    Uses urllib, BeautifulSoup, and pymysql.
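
    The scraper writes into a MySQL table named segment, but the post never shows its DDL. A minimal sketch of a schema that would satisfy the INSERT below (column names taken from the code; types and lengths are assumptions):

        import pymysql

        # Assumed schema: the original post does not include a CREATE TABLE.
        DDL = """
        CREATE TABLE IF NOT EXISTS segment (
            id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255) NOT NULL,
            tags VARCHAR(255) NOT NULL
        ) DEFAULT CHARSET = utf8
        """

        conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                               password='root', db='python_test', charset='utf8')
        try:
            with conn.cursor() as cursor:
                cursor.execute(DDL)
            conn.commit()
        finally:
            conn.close()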

    Scraper code

        from urllib.request import urlopen
        from bs4 import BeautifulSoup
        import pymysql

        datas = []

        # MySQL connection settings. config is a dict, so it is unpacked
        # into keyword arguments with ** when passed to pymysql.connect().
        config = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'db': 'python_test',
            'charset': 'utf8',
            'cursorclass': pymysql.cursors.DictCursor,
        }

        def getHtml(url):
            page = urlopen(url)
            if page.getcode() != 200:
                return None
            return page.read().decode(encoding='utf-8')

        def parser(html_doc):
            soup = BeautifulSoup(html_doc, 'html.parser')
            summs = soup.findAll('div', class_="summary")
            for summ in summs:
                # res_data must be created inside the loop, not outside it.
                # A dict declared once (info = {}) occupies one address in
                # memory, and assigning keys (info['name'] = 'github') only
                # mutates that same object, so reusing a single dict would
                # overwrite every entry already appended to datas (see the
                # demonstration after this listing).
                res_data = {}
                res_data['title'] = summ.find('h2', class_="title").find('a').get_text()
                tags = summ.findAll('li', class_="tagPopup")
                tags_tag = set()
                for tag in tags:
                    tags_tag.add(tag.find('a', class_='tag').get_text())
                res_data['tags'] = tags_tag
                datas.append(res_data)
                const = pymysql.connect(**config)
                try:
                    cursor = const.cursor()
                    sql = 'insert into segment(title,tags) values(%s,%s)'
                    cursor.execute(sql, (res_data['title'], str(res_data['tags'])))
                    # autocommit is off by default, so commit explicitly to
                    # persist the executed statement
                    const.commit()
                except Exception as e:
                    print(e)
                finally:
                    cursor.close()
                    const.close()

        if __name__ == '__main__':
            url = 'https://segmentfault.com/t/javascript?type=newest&page='
            for count in range(1, 101):  # the original while loop stopped at page 99
                new_url = url + str(count)
                print('Scraping page ' + str(count))
                html_dom = getHtml(new_url)
                if html_dom is not None:  # skip pages that did not return 200
                    parser(html_dom)
            print('Done')
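
    The comment about res_data in parser() deserves a standalone demonstration. Appending the same dict twice stores two references to one object, so the second round of assignments silently rewrites the first; creating a fresh dict each iteration avoids the aliasing:

        datas = []
        info = {}                      # one dict object, one address in memory
        info['name'] = 'github'
        datas.append(info)
        info['name'] = 'segmentfault'  # mutates the same object
        datas.append(info)
        print(datas)   # [{'name': 'segmentfault'}, {'name': 'segmentfault'}]

        datas = []
        for name in ('github', 'segmentfault'):
            info = {}                  # a new object every iteration
            info['name'] = name
            datas.append(info)
        print(datas)   # [{'name': 'github'}, {'name': 'segmentfault'}]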
    

    Statistics code

        import pymysql

        config = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'db': 'python_test',
            'charset': 'utf8',
            'cursorclass': pymysql.cursors.DictCursor,
        }

        def get_tags():
            connect = pymysql.connect(**config)
            try:
                cursor = connect.cursor()
                sql = "select tags from segment"
                cursor.execute(sql)
                result = cursor.fetchall()
                fout = open('output1.html', 'w', encoding='utf-8')
                fout.write('<html>')
                fout.write('<body>')
                fout.write('<table border="1" cellspacing="0" cellpadding="0">')
                fout.write('<tr>')
                fout.write('<td>tag</td>')
                fout.write('<td>count</td>')
                fout.write('</tr>')
                tags_box = {}
                for tag in result:
                    # tags were stored as str(set), e.g. "{'vue.js', 'node.js'}",
                    # so eval() rebuilds the set; acceptable only because we
                    # wrote these strings ourselves (a Counter-based alternative
                    # is sketched after this listing)
                    for item in eval(tag['tags']):
                        tags_box[item] = tags_box.get(item, 0) + 1
                # sort by occurrence count, most frequent tag first
                new_tag = sorted(tags_box.items(), key=lambda x: x[1], reverse=True)
                for item in new_tag:
                    fout.write('<tr>')
                    for i in item:
                        fout.write("<td> %s </td>" % i)
                    fout.write('</tr>')
                fout.write('</table>')
                fout.write('</body>')
                fout.write('</html>')
                fout.close()
            except Exception as e:
                print(e)
            finally:
                connect.close()

        if __name__ == '__main__':
            get_tags()
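
    The manual tags_box bookkeeping can be replaced by collections.Counter from the standard library, which folds the get-or-initialize logic and the sort into two calls. A minimal sketch using the same str(set) storage format (the sample rows are made up for illustration):

        from ast import literal_eval
        from collections import Counter

        rows = [{'tags': "{'javascript', 'vue.js'}"},    # sample rows in the
                {'tags': "{'javascript', 'node.js'}"}]   # stored str(set) format
        counter = Counter()
        for row in rows:
            # literal_eval is safer than eval(), though note it rejects the
            # string 'set()' that an empty tag set would produce
            counter.update(literal_eval(row['tags']))
        print(counter.most_common())
        # [('javascript', 2), ('vue.js', 1), ('node.js', 1)]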
    
