Text Clustering of Sina Blog Posts


Author: 木一晟 | Published 2017-03-02 18:12

    Preface

    This is a set of programs I wrote before the Chinese New Year to help a QQ friend finish a thesis report. It still needs improvement.



    Below are the programs used for this report, all written in Python. The following four programs were written as needed.

    1. article_base_info.py crawls the basic information of each article: title, link, author, and publication date
    2. article_content_gevent.py crawls the article content
    3. text_category.py classifies the articles
    4. format_data.py formats the data

    The program code follows.

    # coding: utf-8
    '''
    Program: article_base_info.py
    1. Crawls the basic information of Sina blog articles (title, link, author,
       publication date) for the given range of list pages.
    2. Saves the data to MongoDB.
    '''
    import re
    import concurrent.futures
    import requests
    from bs4 import BeautifulSoup as bs
    from pymongo import MongoClient
    
    
    def fetch(url):
        res = requests.get(url)
        res.encoding = 'gbk'
        content = bs(res.text, 'lxml')
        return content
    
    
    def base_info(html):
        pattern = re.compile(r'http://blog.sina.com.cn/s/blog_.*\.html')
        links = re.findall(pattern, str(html))
        date_ = re.findall(r'\((\d{2,}.*)\)', str(html))
        tle_auth = html.select('li')
        authes = (auth.text.split(' ')[0] for auth in tle_auth)
        titles = (title.text.split(' ')[-1] for title in tle_auth)
        for infos in zip(links, titles, authes, date_):
            yield infos
    
    
    def save(url):
        html = fetch(url)
        data = base_info(html)
        client = MongoClient('localhost', 27017)
        db = client.infos
        coll = db.coll
        for num, d in enumerate(data, 1):
            datum = {
                'links': d[0],
                'title': d[1],
                'author': d[2],
                'date': d[3]
            }

            # insert only if this link has not been stored already
            count = coll.find({'links': d[0]}).count()
            if count == 0:
                coll.insert_one(datum)
        print('{} is grabbed'.format(url))
    
    
    if __name__ == '__main__':
        url = 'http://roll.blog.sina.com.cn/list/other/index_{}.shtml'
    
        start = input('Enter the start page (default 1) >> ')
        start = int(start) if start else 1

        end = input('Enter the end page (default 100) >> ')
        end = int(end) if end else 100
    
        pages = range(start, end + 1)
        urls = [url.format(page) for page in pages]
    
        with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
            executor.map(save, urls)
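
    The next program reads its article links from 筛选后所有博客数据.csv, which, according to its docstring, is generated from the data collected by article_base_info.py (the filtering implied by the file name was presumably done by hand). The export step itself is not included in the post; the following is only a minimal sketch of one way to dump the MongoDB collection to CSV, assuming the same local collection as above and an output file name chosen purely for illustration.

    # export_links.py -- illustrative sketch, not part of the original report
    import csv
    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)
    coll = client.infos.coll  # the collection written by article_base_info.py

    with open('所有博客数据.csv', 'w', newline='') as f:  # output file name is an assumption
        writer = csv.writer(f)
        for doc in coll.find():
            # one row per article: link, title, author, date
            writer.writerow([doc['links'], doc['title'], doc['author'], doc['date']])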
    
    

    # -*- coding: utf-8 -*-
    '''
    Program: article_content_gevent.py
    1. Crawls the content of the Sina blog articles.
    2. Article links are read from 筛选后所有博客数据.csv, which is generated from the
       data crawled by article_base_info.py.
    3. Because authors delete posts (or for other reasons), some articles no longer
       exist even though their links are still listed.
    '''
    
    import os
    import csv
    import logging

    # patch blocking socket I/O so requests yields to gevent (needed for real concurrency)
    from gevent import monkey
    monkey.patch_all()

    import gevent
    import requests
    from bs4 import BeautifulSoup as bs
    
    
    def fetch(url):
        res = requests.get(url)
        res.encoding = 'utf-8'
        content = bs(res.text, 'lxml')
        if not content:
            logging.warning('The blog has been deleted!')
        return content
    
    
    def content_get(html):
        try:
            artical = html.select('#sina_keyword_ad_area2')[0].text.strip()
        except IndexError as e:
            print(e)
            logging.warning('the page is None')
            artical = ' '
        return artical
    
    
    def links_get(filename):
        # read the article links from the first column of the CSV file
        with open(filename, 'r') as csvfile:
            logging.info('read the file {}'.format(filename))
            reader = csv.reader(csvfile)
            urls = [row[0] for row in reader]
        return urls
    
    
    def download(url):
        html = fetch(url)
        artical = content_get(html)
        with open('/home/mouse/Documents/artical/{}.txt'
                  .format(url[-12:-5]), 'w') as f:
            f.write(artical)
        logging.info('writing the {}'.format(url))
    
    
    if __name__ == '__main__':
        logging.basicConfig(format='%(asctime)s %(message)s',
                            level=logging.WARNING)
        filename = '/home/mouse/我的坚果云/董姐的论文所需/筛选后所有博客数据.csv'
        urls = links_get(filename)
        if not os.path.isdir('/home/mouse/Documents/artical/'):
            os.makedirs('/home/mouse/Documents/artical/')
        threads = [gevent.spawn(download, url) for url in urls]
        gevent.joinall(threads)
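
    gevent.spawn in the __main__ block above launches one greenlet per link all at once; with a large link file that can mean thousands of simultaneous requests. A minimal sketch of capping concurrency with gevent's Pool, which could replace the last two lines of the block (the pool size of 20 is an arbitrary assumption):

    from gevent.pool import Pool

    pool = Pool(20)  # at most 20 downloads in flight at any time
    jobs = [pool.spawn(download, url) for url in urls]
    gevent.joinall(jobs)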
    
    

    # coding=utf-8
    '''
    Program: text_category.py
    1. Automatically classifies the articles crawled from Sina blogs.
    2. The classification library comes from https://github.com/2shou/TextGrocery
    3. Workflow: read the crawled article titles -> classify them -> write out the
       classified titles.
    '''
    import os
    import csv
    from tgrocery import Grocery
    from train_txt import train_src
    
    
    def category(title_lst, cates=None):  # classify the titles
        # new_grocery is the trained TextGrocery model created in the __main__ block below
        if cates is None:
            cates = []
        for title in title_lst:
            cate = new_grocery.predict(title)
            cates.append(cate.predicted_y)
        return cates
    
    
    def get_artical_title(filename, title_lst=None):  # read the article titles
        if title_lst is None:
            title_lst = []
    
        with open(filename, 'r') as f1:
            f1_csv = csv.reader(f1)
            title_lst = [row[1] for row in f1_csv]
    
        return title_lst
    
    
    def write_cated_info(filename, new_filename):  # write out the classified rows
        titles = get_artical_title(filename)
        categ = category(titles)
        # open the output file once instead of re-opening it for every row
        with open(filename, 'r') as read_file, open(new_filename, 'a+') as write_file:
            reader = csv.reader(read_file)
            writer = csv.writer(write_file)
            for i, row in enumerate(reader):
                row.append(categ[i])  # append the predicted category to the row
                writer.writerow(row)
                print('writing the {} item'.format(i))
        print('Done....................')
    
    
    if __name__ == "__main__":
        # filename and new_filename are the input and output file paths.
        # Change the paths to classify a different data set, provided the file
        # follows the same format.
        filename = '/home/mouse/我的坚果云/董姐的论文所需/female7.csv'
        new_filename = '/home/mouse/我的坚果云/董姐的论文所需/female7_2.csv'
        if os.path.isfile(new_filename):
            os.remove(new_filename)
        grocery = Grocery('sample')
        grocery.train(train_src)
        grocery.save()
        new_grocery = Grocery('sample')
        new_grocery.load()
        write_cated_info(filename, new_filename)
    

    # -*- coding: utf-8 -*-
    '''
    Program: format_data.py
    A helper program that formats 标题整理数据.xlsx for training; the spreadsheet is
    converted to CSV format by hand first.
    '''
    
    import csv
    from collections import namedtuple
    cate = ['社会冲突和问题', '毛泽东思想与政策', '政党与政府设置', '民主与法治', '民族和国际关系',
            '媒体与言论自由', '资本主义与市场经济', '全球化和对外开放', '民生与福利',
            '家庭冲突与伦理', '传统文化', '性与个人自由', '环境污染', '生态保护', ]
    Category = namedtuple(
        'Category', 'social mao govm demcy nation media capi glob live home tran sex env eco')
    
    filename = '/home/mouse/我的坚果云/董姐的论文所需/标题整理数据2.csv'
    
    
    def train_text(filename, train_src=None):
        if train_src is None:
            train_src = []
    
        def format_cate():
            # pair each category label with the corresponding column of the row;
            # the field order of Category matches the order of the cate list above
            with open(filename, 'r') as f:
                for emp in map(Category._make, csv.reader(f)):
                    yield tuple(zip(cate, emp))
    
        for cat in format_cate():
            train_src.extend(list(cat))
    
        return train_src
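
    text_category.py imports train_src from a module called train_txt, which is not shown in the post. Assuming that module simply runs the helper above on the CSV file named in it, a minimal sketch of what it might contain (the module body is an assumption; only the file path and the import name come from the code above):

    # train_txt.py -- illustrative sketch; the original module is not included in the post
    from format_data import train_text

    # train_src becomes a list of (category label, cell text) pairs,
    # the format passed to grocery.train() in text_category.py
    train_src = train_text('/home/mouse/我的坚果云/董姐的论文所需/标题整理数据2.csv')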
    

    All of the above programs were written by me and ran successfully on my own computer, but they have not been tested on other machines or platforms. Given the various dependency and compatibility issues, and my own limited skill, I cannot guarantee that they will run for everyone else.
